2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
29 #include "alloc-util.h"
30 #include "dev-setup.h"
32 #include "loopback-setup.h"
35 #include "mount-util.h"
36 #include "namespace.h"
37 #include "path-util.h"
38 #include "selinux-util.h"
39 #include "socket-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
43 #include "umask-util.h"
44 #include "user-util.h"
47 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
49 typedef enum MountMode
{
50 /* This is ordered by priority! */
59 typedef struct BindMount
{
66 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
71 STRV_FOREACH(i
, strv
) {
76 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
81 if (!path_is_absolute(*i
))
92 static int mount_path_compare(const void *a
, const void *b
) {
93 const BindMount
*p
= a
, *q
= b
;
96 d
= path_compare(p
->path
, q
->path
);
99 /* If the paths are equal, check the mode */
100 if (p
->mode
< q
->mode
)
103 if (p
->mode
> q
->mode
)
109 /* If the paths are not equal, then order prefixes first */
113 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
114 BindMount
*f
, *t
, *previous
;
119 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
121 /* The first one wins */
122 if (previous
&& path_equal(f
->path
, previous
->path
))
135 static int mount_dev(BindMount
*m
) {
136 static const char devnodes
[] =
144 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
145 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
146 _cleanup_umask_ mode_t u
;
153 if (!mkdtemp(temporary_mount
))
156 dev
= strjoina(temporary_mount
, "/dev");
157 (void) mkdir(dev
, 0755);
158 if (mount("tmpfs", dev
, "tmpfs", DEV_MOUNT_OPTIONS
, "mode=755") < 0) {
163 devpts
= strjoina(temporary_mount
, "/dev/pts");
164 (void) mkdir(devpts
, 0755);
165 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
170 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
171 if (symlink("pts/ptmx", devptmx
) < 0) {
176 devshm
= strjoina(temporary_mount
, "/dev/shm");
177 (void) mkdir(devshm
, 01777);
178 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
184 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
185 (void) mkdir(devmqueue
, 0755);
186 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
188 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
189 (void) mkdir(devhugepages
, 0755);
190 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
192 devlog
= strjoina(temporary_mount
, "/dev/log");
193 (void) symlink("/run/systemd/journal/dev-log", devlog
);
195 NULSTR_FOREACH(d
, devnodes
) {
196 _cleanup_free_
char *dn
= NULL
;
209 if (!S_ISBLK(st
.st_mode
) &&
210 !S_ISCHR(st
.st_mode
)) {
218 dn
= strappend(temporary_mount
, d
);
224 mac_selinux_create_file_prepare(d
, st
.st_mode
);
225 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
226 mac_selinux_create_file_clear();
234 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
236 /* Create the /dev directory if missing. It is more likely to be
237 * missing when the service is started with RootDirectory. This is
238 * consistent with mount units creating the mount points when missing.
240 (void) mkdir_p_label(m
->path
, 0755);
242 /* Unmount everything in old /dev */
243 umount_recursive(m
->path
, 0);
244 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
250 rmdir(temporary_mount
);
262 umount(devhugepages
);
269 rmdir(temporary_mount
);
274 static int apply_mount(
277 const char *var_tmp_dir
) {
289 /* First, get rid of everything that is below if there
290 * is anything... Then, overmount it with an
291 * inaccessible path. */
292 umount_recursive(m
->path
, 0);
294 r
= lstat(m
->path
, &target
);
296 if (m
->ignore
&& errno
== ENOENT
)
301 what
= mode_to_inaccessible_node(target
.st_mode
);
303 log_debug("File type not supported. Note that symlinks are not allowed");
309 /* Nothing to mount here, we just later toggle the
310 * MS_RDONLY bit for the mount point */
317 case PRIVATE_VAR_TMP
:
325 assert_not_reached("Unknown mode");
330 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
332 log_debug("Successfully mounted %s to %s", what
, m
->path
);
336 if (m
->ignore
&& errno
== ENOENT
)
338 log_debug("Failed mounting %s to %s: %s", what
, m
->path
, strerror(errno
));
343 static int make_read_only(BindMount
*m
) {
348 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
349 r
= bind_remount_recursive(m
->path
, true);
350 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
)) {
351 r
= bind_remount_recursive(m
->path
, false);
352 if (r
== 0 && m
->mode
== PRIVATE_DEV
) /* can be readonly but the submounts can't*/
353 r
= mount(NULL
, m
->path
, NULL
, MS_REMOUNT
|DEV_MOUNT_OPTIONS
|MS_RDONLY
, NULL
);
357 if (m
->ignore
&& r
== -ENOENT
)
364 const char* root_directory
,
365 char** read_write_paths
,
366 char** read_only_paths
,
367 char** inaccessible_paths
,
369 const char* var_tmp_dir
,
371 ProtectHome protect_home
,
372 ProtectSystem protect_system
,
373 unsigned long mount_flags
) {
375 BindMount
*m
, *mounts
= NULL
;
379 if (mount_flags
== 0)
380 mount_flags
= MS_SHARED
;
382 if (unshare(CLONE_NEWNS
) < 0)
385 n
= !!tmp_dir
+ !!var_tmp_dir
+
386 strv_length(read_write_paths
) +
387 strv_length(read_only_paths
) +
388 strv_length(inaccessible_paths
) +
390 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
391 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
392 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
395 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
396 r
= append_mounts(&m
, read_write_paths
, READWRITE
);
400 r
= append_mounts(&m
, read_only_paths
, READONLY
);
404 r
= append_mounts(&m
, inaccessible_paths
, INACCESSIBLE
);
409 m
->path
= prefix_roota(root_directory
, "/tmp");
410 m
->mode
= PRIVATE_TMP
;
415 m
->path
= prefix_roota(root_directory
, "/var/tmp");
416 m
->mode
= PRIVATE_VAR_TMP
;
421 m
->path
= prefix_roota(root_directory
, "/dev");
422 m
->mode
= PRIVATE_DEV
;
426 if (protect_home
!= PROTECT_HOME_NO
) {
427 const char *home_dir
, *run_user_dir
, *root_dir
;
429 home_dir
= prefix_roota(root_directory
, "/home");
430 home_dir
= strjoina("-", home_dir
);
431 run_user_dir
= prefix_roota(root_directory
, "/run/user");
432 run_user_dir
= strjoina("-", run_user_dir
);
433 root_dir
= prefix_roota(root_directory
, "/root");
434 root_dir
= strjoina("-", root_dir
);
436 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
437 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
442 if (protect_system
!= PROTECT_SYSTEM_NO
) {
443 const char *usr_dir
, *boot_dir
, *etc_dir
;
445 usr_dir
= prefix_roota(root_directory
, "/usr");
446 boot_dir
= prefix_roota(root_directory
, "/boot");
447 boot_dir
= strjoina("-", boot_dir
);
448 etc_dir
= prefix_roota(root_directory
, "/etc");
450 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
451 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
452 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
457 assert(mounts
+ n
== m
);
459 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
460 drop_duplicates(mounts
, &n
);
463 if (n
> 0 || root_directory
) {
464 /* Remount / as SLAVE so that nothing now mounted in the namespace
465 shows up in the parent */
466 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
470 if (root_directory
) {
471 /* Turn directory into bind mount */
472 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
477 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
478 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
483 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
484 r
= make_read_only(m
);
490 if (root_directory
) {
491 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
492 r
= mount_move_root(root_directory
);
494 /* at this point, we cannot rollback */
499 /* Remount / as the desired mode. Note that this will not
500 * reestablish propagation from our side to the host, since
501 * what's disconnected is disconnected. */
502 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
503 /* at this point, we cannot rollback */
510 for (m
= mounts
; m
< mounts
+ n
; ++m
)
512 (void) umount2(m
->path
, MNT_DETACH
);
518 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
519 _cleanup_free_
char *x
= NULL
;
520 char bid
[SD_ID128_STRING_MAX
];
528 /* We include the boot id in the directory so that after a
529 * reboot we can easily identify obsolete directories. */
531 r
= sd_id128_get_boot(&boot_id
);
535 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
543 RUN_WITH_UMASK(0000) {
546 y
= strjoina(x
, "/tmp");
548 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
558 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
566 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
570 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
574 t
= strjoina(a
, "/tmp");
588 int setup_netns(int netns_storage_socket
[2]) {
589 _cleanup_close_
int netns
= -1;
592 assert(netns_storage_socket
);
593 assert(netns_storage_socket
[0] >= 0);
594 assert(netns_storage_socket
[1] >= 0);
596 /* We use the passed socketpair as a storage buffer for our
597 * namespace reference fd. Whatever process runs this first
598 * shall create a new namespace; all others should just join
599 * it. To serialize that, we use a file lock on the first socket of the pair.
602 * It's a bit crazy, but hey, works great! */
604 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
607 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
608 if (netns
== -EAGAIN
) {
609 /* Nothing stored yet, so let's create a new namespace */
611 if (unshare(CLONE_NEWNET
) < 0) {
618 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
626 } else if (netns
< 0) {
631 /* Yay, found something, so let's join the namespace */
632 if (setns(netns
, CLONE_NEWNET
) < 0) {
640 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
647 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
651 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
652 [PROTECT_HOME_NO
] = "no",
653 [PROTECT_HOME_YES
] = "yes",
654 [PROTECT_HOME_READ_ONLY
] = "read-only",
657 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
659 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
660 [PROTECT_SYSTEM_NO
] = "no",
661 [PROTECT_SYSTEM_YES
] = "yes",
662 [PROTECT_SYSTEM_FULL
] = "full",
665 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);