2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
29 #include "alloc-util.h"
30 #include "dev-setup.h"
32 #include "loopback-setup.h"
35 #include "mount-util.h"
36 #include "namespace.h"
37 #include "path-util.h"
38 #include "selinux-util.h"
39 #include "socket-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
43 #include "umask-util.h"
44 #include "user-util.h"
47 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
49 typedef enum MountMode
{
50 /* This is ordered by priority! */
59 typedef struct BindMount
{
66 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
71 STRV_FOREACH(i
, strv
) {
76 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
81 if (!path_is_absolute(*i
))
92 static int mount_path_compare(const void *a
, const void *b
) {
93 const BindMount
*p
= a
, *q
= b
;
96 /* If the paths are not equal, then order prefixes first */
97 d
= path_compare(p
->path
, q
->path
);
101 /* If the paths are equal, check the mode */
102 if (p
->mode
< q
->mode
)
105 if (p
->mode
> q
->mode
)
111 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
112 BindMount
*f
, *t
, *previous
;
117 /* Drops duplicate entries. Expects that the array is properly ordered already. */
119 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
121 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
123 if (previous
&& path_equal(f
->path
, previous
->path
)) {
124 log_debug("%s is duplicate.", f
->path
);
136 static void drop_inaccessible(BindMount
*m
, unsigned *n
) {
138 const char *clear
= NULL
;
143 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
144 * ordered already. */
146 for (f
= m
, t
= m
; f
< m
+*n
; f
++) {
148 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
149 * it, as inaccessible paths really should drop the entire subtree. */
150 if (clear
&& path_startswith(f
->path
, clear
)) {
151 log_debug("%s is masked by %s.", f
->path
, clear
);
155 clear
= f
->mode
== INACCESSIBLE
? f
->path
: NULL
;
164 static void drop_nop(BindMount
*m
, unsigned *n
) {
170 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
171 * list is ordered by prefixes. */
173 for (f
= m
, t
= m
; f
< m
+*n
; f
++) {
175 /* Only suppress such subtrees for READONLY and READWRITE entries */
176 if (IN_SET(f
->mode
, READONLY
, READWRITE
)) {
180 /* Now let's find the first parent of the entry we are looking at. */
181 for (p
= t
-1; p
>= m
; p
--) {
182 if (path_startswith(f
->path
, p
->path
)) {
188 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
189 if (found
&& p
->mode
== f
->mode
) {
190 log_debug("%s is redundant by %s", f
->path
, p
->path
);
202 static int mount_dev(BindMount
*m
) {
203 static const char devnodes
[] =
211 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
212 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
213 _cleanup_umask_ mode_t u
;
220 if (!mkdtemp(temporary_mount
))
223 dev
= strjoina(temporary_mount
, "/dev");
224 (void) mkdir(dev
, 0755);
225 if (mount("tmpfs", dev
, "tmpfs", DEV_MOUNT_OPTIONS
, "mode=755") < 0) {
230 devpts
= strjoina(temporary_mount
, "/dev/pts");
231 (void) mkdir(devpts
, 0755);
232 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
237 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
238 if (symlink("pts/ptmx", devptmx
) < 0) {
243 devshm
= strjoina(temporary_mount
, "/dev/shm");
244 (void) mkdir(devshm
, 01777);
245 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
251 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
252 (void) mkdir(devmqueue
, 0755);
253 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
255 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
256 (void) mkdir(devhugepages
, 0755);
257 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
259 devlog
= strjoina(temporary_mount
, "/dev/log");
260 (void) symlink("/run/systemd/journal/dev-log", devlog
);
262 NULSTR_FOREACH(d
, devnodes
) {
263 _cleanup_free_
char *dn
= NULL
;
276 if (!S_ISBLK(st
.st_mode
) &&
277 !S_ISCHR(st
.st_mode
)) {
285 dn
= strappend(temporary_mount
, d
);
291 mac_selinux_create_file_prepare(d
, st
.st_mode
);
292 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
293 mac_selinux_create_file_clear();
301 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
303 /* Create the /dev directory if missing. It is more likely to be
304 * missing when the service is started with RootDirectory. This is
305 * consistent with mount units creating the mount points when missing.
307 (void) mkdir_p_label(m
->path
, 0755);
309 /* Unmount everything in old /dev */
310 umount_recursive(m
->path
, 0);
311 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
317 rmdir(temporary_mount
);
329 umount(devhugepages
);
336 rmdir(temporary_mount
);
341 static int apply_mount(
344 const char *var_tmp_dir
) {
351 log_debug("Applying namespace mount on %s", m
->path
);
358 /* First, get rid of everything that is below if there
359 * is anything... Then, overmount it with an
360 * inaccessible path. */
361 (void) umount_recursive(m
->path
, 0);
363 if (lstat(m
->path
, &target
) < 0) {
364 if (m
->ignore
&& errno
== ENOENT
)
366 return log_debug_errno(errno
, "Failed to lstat() %s to determine what to mount over it: %m", m
->path
);
369 what
= mode_to_inaccessible_node(target
.st_mode
);
371 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
380 r
= path_is_mount_point(m
->path
, 0);
382 if (m
->ignore
&& errno
== ENOENT
)
384 return log_debug_errno(r
, "Failed to determine whether %s is already a mount point: %m", m
->path
);
386 if (r
> 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
389 /* This isn't a mount point yet, let's make it one. */
397 case PRIVATE_VAR_TMP
:
405 assert_not_reached("Unknown mode");
410 if (mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
411 if (m
->ignore
&& errno
== ENOENT
)
414 return log_debug_errno(errno
, "Failed to mount %s to %s: %m", what
, m
->path
);
417 log_debug("Successfully mounted %s to %s", what
, m
->path
);
421 static int make_read_only(BindMount
*m
, char **blacklist
) {
426 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
427 r
= bind_remount_recursive(m
->path
, true, blacklist
);
428 else if (m
->mode
== PRIVATE_DEV
) { /* Can be readonly but the submounts can't*/
429 if (mount(NULL
, m
->path
, NULL
, MS_REMOUNT
|DEV_MOUNT_OPTIONS
|MS_RDONLY
, NULL
) < 0)
434 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
435 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
436 * read-only mounts already applied. */
438 if (m
->ignore
&& r
== -ENOENT
)
445 const char* root_directory
,
446 char** read_write_paths
,
447 char** read_only_paths
,
448 char** inaccessible_paths
,
450 const char* var_tmp_dir
,
453 bool protect_cgroups
,
454 ProtectHome protect_home
,
455 ProtectSystem protect_system
,
456 unsigned long mount_flags
) {
458 BindMount
*m
, *mounts
= NULL
;
462 if (mount_flags
== 0)
463 mount_flags
= MS_SHARED
;
465 if (unshare(CLONE_NEWNS
) < 0)
468 n
= !!tmp_dir
+ !!var_tmp_dir
+
469 strv_length(read_write_paths
) +
470 strv_length(read_only_paths
) +
471 strv_length(inaccessible_paths
) +
473 (protect_sysctl
? 3 : 0) +
474 (protect_cgroups
!= protect_sysctl
) +
475 (protect_home
!= PROTECT_HOME_NO
|| protect_system
== PROTECT_SYSTEM_STRICT
? 3 : 0) +
476 (protect_system
== PROTECT_SYSTEM_STRICT
?
477 (2 + !private_dev
+ !protect_sysctl
) :
478 ((protect_system
!= PROTECT_SYSTEM_NO
? 3 : 0) +
479 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0)));
482 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
483 r
= append_mounts(&m
, read_write_paths
, READWRITE
);
487 r
= append_mounts(&m
, read_only_paths
, READONLY
);
491 r
= append_mounts(&m
, inaccessible_paths
, INACCESSIBLE
);
496 m
->path
= prefix_roota(root_directory
, "/tmp");
497 m
->mode
= PRIVATE_TMP
;
502 m
->path
= prefix_roota(root_directory
, "/var/tmp");
503 m
->mode
= PRIVATE_VAR_TMP
;
508 m
->path
= prefix_roota(root_directory
, "/dev");
509 m
->mode
= PRIVATE_DEV
;
513 if (protect_sysctl
) {
514 m
->path
= prefix_roota(root_directory
, "/proc/sys");
518 m
->path
= prefix_roota(root_directory
, "/proc/sysrq-trigger");
520 m
->ignore
= true; /* Not always compiled into the kernel */
523 m
->path
= prefix_roota(root_directory
, "/sys");
528 if (protect_cgroups
!= protect_sysctl
) {
529 m
->path
= prefix_roota(root_directory
, "/sys/fs/cgroup");
530 m
->mode
= protect_cgroups
? READONLY
: READWRITE
;
534 if (protect_home
!= PROTECT_HOME_NO
|| protect_system
== PROTECT_SYSTEM_STRICT
) {
535 const char *home_dir
, *run_user_dir
, *root_dir
;
537 /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
538 * strict system protection mode, then also add entries for these directories, but mark them
539 * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
541 home_dir
= prefix_roota(root_directory
, "/home");
542 home_dir
= strjoina("-", home_dir
);
543 run_user_dir
= prefix_roota(root_directory
, "/run/user");
544 run_user_dir
= strjoina("-", run_user_dir
);
545 root_dir
= prefix_roota(root_directory
, "/root");
546 root_dir
= strjoina("-", root_dir
);
548 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
549 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
:
550 protect_home
== PROTECT_HOME_YES
? INACCESSIBLE
: READWRITE
);
555 if (protect_system
== PROTECT_SYSTEM_STRICT
) {
556 /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
557 * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
558 * protect those, and these options should be fully orthogonal. (And of course /home and
559 * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
562 m
->path
= prefix_roota(root_directory
, "/");
566 m
->path
= prefix_roota(root_directory
, "/proc");
571 m
->path
= prefix_roota(root_directory
, "/dev");
575 if (!protect_sysctl
) {
576 m
->path
= prefix_roota(root_directory
, "/sys");
581 } else if (protect_system
!= PROTECT_SYSTEM_NO
) {
582 const char *usr_dir
, *boot_dir
, *efi_dir
, *etc_dir
;
584 /* In any other mode we simply mark the relevant three directories (four in "full" mode, which adds /etc) read-only. */
586 usr_dir
= prefix_roota(root_directory
, "/usr");
587 boot_dir
= prefix_roota(root_directory
, "/boot");
588 boot_dir
= strjoina("-", boot_dir
);
589 efi_dir
= prefix_roota(root_directory
, "/efi");
590 efi_dir
= strjoina("-", efi_dir
);
591 etc_dir
= prefix_roota(root_directory
, "/etc");
593 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
594 ? STRV_MAKE(usr_dir
, boot_dir
, efi_dir
, etc_dir
)
595 : STRV_MAKE(usr_dir
, boot_dir
, efi_dir
), READONLY
);
600 assert(mounts
+ n
== m
);
602 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
604 drop_duplicates(mounts
, &n
);
605 drop_inaccessible(mounts
, &n
);
606 drop_nop(mounts
, &n
);
609 if (n
> 0 || root_directory
) {
610 /* Remount / as SLAVE so that nothing now mounted in the namespace
611 shows up in the parent */
612 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
616 if (root_directory
) {
617 /* Turn directory into bind mount */
618 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
626 /* First round, add in all special mounts we need */
627 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
628 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
633 /* Create a blacklist we can pass to bind_remount_recursive() */
634 blacklist
= newa(char*, n
+1);
635 for (j
= 0; j
< n
; j
++)
636 blacklist
[j
] = (char*) mounts
[j
].path
;
639 /* Second round, flip the ro bits if necessary. */
640 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
641 r
= make_read_only(m
, blacklist
);
647 if (root_directory
) {
648 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
649 r
= mount_move_root(root_directory
);
650 if (r
< 0) /* at this point, we cannot rollback */
654 /* Remount / as the desired mode. Note that this will not
655 * reestablish propagation from our side to the host, since
656 * what's disconnected is disconnected. */
657 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
658 return -errno
; /* at this point, we cannot rollback */
664 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
668 (void) umount2(m
->path
, MNT_DETACH
);
675 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
676 _cleanup_free_
char *x
= NULL
;
677 char bid
[SD_ID128_STRING_MAX
];
685 /* We include the boot id in the directory so that after a
686 * reboot we can easily identify obsolete directories. */
688 r
= sd_id128_get_boot(&boot_id
);
692 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
700 RUN_WITH_UMASK(0000) {
703 y
= strjoina(x
, "/tmp");
705 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
715 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
723 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
727 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
731 t
= strjoina(a
, "/tmp");
745 int setup_netns(int netns_storage_socket
[2]) {
746 _cleanup_close_
int netns
= -1;
749 assert(netns_storage_socket
);
750 assert(netns_storage_socket
[0] >= 0);
751 assert(netns_storage_socket
[1] >= 0);
753 /* We use the passed socketpair as a storage buffer for our
754 * namespace reference fd. Whatever process runs this first
755 * shall create a new namespace, all others should just join
756 * it. To serialize that we use a file lock on the socket
759 * It's a bit crazy, but hey, works great! */
761 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
764 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
765 if (netns
== -EAGAIN
) {
766 /* Nothing stored yet, so let's create a new namespace */
768 if (unshare(CLONE_NEWNET
) < 0) {
775 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
783 } else if (netns
< 0) {
788 /* Yay, found something, so let's join the namespace */
789 if (setns(netns
, CLONE_NEWNET
) < 0) {
797 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
804 (void) lockf(netns_storage_socket
[0], F_ULOCK
, 0);
808 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
809 [PROTECT_HOME_NO
] = "no",
810 [PROTECT_HOME_YES
] = "yes",
811 [PROTECT_HOME_READ_ONLY
] = "read-only",
814 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
816 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
817 [PROTECT_SYSTEM_NO
] = "no",
818 [PROTECT_SYSTEM_YES
] = "yes",
819 [PROTECT_SYSTEM_FULL
] = "full",
820 [PROTECT_SYSTEM_STRICT
] = "strict",
823 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);