2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
29 #include "alloc-util.h"
30 #include "base-filesystem.h"
31 #include "dev-setup.h"
35 #include "loop-util.h"
36 #include "loopback-setup.h"
39 #include "mount-util.h"
40 #include "namespace.h"
41 #include "path-util.h"
42 #include "selinux-util.h"
43 #include "socket-util.h"
44 #include "string-table.h"
45 #include "string-util.h"
47 #include "umask-util.h"
48 #include "user-util.h"
/* Mount flags used in this file both for the tmpfs that backs a private /dev (mount_private_dev()) and for its
 * later MS_REMOUNT in make_read_only(). */
51 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
/* Kind of mount a MountEntry describes.
 * NOTE(review): the enumerator list is not visible in this view. */
53 typedef enum MountMode
{
54 /* This is ordered by priority! mount_path_compare() compares the numeric mode values of entries whose paths are equal. */
/* One entry of the mount table assembled for a namespace. The *_const pointers reference memory the entry does not
 * own; the *_malloc pointers are released in mount_entry_done() and take precedence over their *_const counterpart
 * when set (see mount_entry_path() and mount_entry_source()).
 * NOTE(review): members used elsewhere in this file (->mode, ->source_malloc) are not visible in this view. */
69 typedef struct MountEntry
{
70 const char *path_const
; /* Memory allocated on stack or static */
72 bool ignore
:1; /* Ignore if path does not exist? */
73 bool has_prefix
:1; /* Already is prefixed by the root dir? */
74 bool read_only
:1; /* Shall this mount point be read-only? */
75 char *path_malloc
; /* Use this instead of 'path_const' if we had to allocate memory */
76 const char *source_const
; /* The source path, for bind mounts */
80 /* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into it, but only as a fallback if the user hasn't mounted
81 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
82 static const MountEntry apivfs_table
[] = {
83 { "/proc", PROCFS
, false },
84 { "/dev", BIND_DEV
, false },
85 { "/sys", SYSFS
, false },
/* Static entries appended by the namespace setup when ns_info->protect_kernel_tunables is set.
 * NOTE(review): the boolean closing each row is presumably the ignore-if-missing flag of MountEntry; confirm
 * against the full struct layout, which is not visible in this view. */
88 /* ProtectKernelTunables= option and the related filesystem APIs */
89 static const MountEntry protect_kernel_tunables_table
[] = {
90 { "/proc/sys", READONLY
, false },
91 { "/proc/sysrq-trigger", READONLY
, true },
92 { "/proc/latency_stats", READONLY
, true },
93 { "/proc/mtrr", READONLY
, true },
94 { "/proc/apm", READONLY
, true }, /* Obsolete API, there's no point in permitting access to this, ever */
95 { "/proc/acpi", READONLY
, true },
96 { "/proc/timer_stats", READONLY
, true },
97 { "/proc/asound", READONLY
, true },
98 { "/proc/bus", READONLY
, true },
99 { "/proc/fs", READONLY
, true },
100 { "/proc/irq", READONLY
, true },
101 { "/sys", READONLY
, false },
102 { "/sys/kernel/debug", READONLY
, true },
103 { "/sys/kernel/tracing", READONLY
, true },
104 { "/sys/fs/cgroup", READWRITE
, false }, /* READONLY is set by ProtectControlGroups= option */
105 { "/sys/fs/selinux", READWRITE
, true },
108 /* ProtectKernelModules= option: both module directories listed here are entered with mode INACCESSIBLE.
 * The table is appended when ns_info->protect_kernel_modules is set. */
109 static const MountEntry protect_kernel_modules_table
[] = {
111 { "/lib/modules", INACCESSIBLE
, true },
113 { "/usr/lib/modules", INACCESSIBLE
, true },
/* Entries appended by append_protect_home() for PROTECT_HOME_READ_ONLY: the three paths below get mode READONLY.
 * NOTE(review): the opening and closing lines of the original comment that follows are missing in this view. */
117 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
118 * system should be protected by ProtectSystem=
120 static const MountEntry protect_home_read_only_table
[] = {
121 { "/home", READONLY
, true },
122 { "/run/user", READONLY
, true },
123 { "/root", READONLY
, true },
126 /* ProtectHome=yes table: same paths as the read-only variant, but entered with mode INACCESSIBLE.
 * Appended by append_protect_home() for PROTECT_HOME_YES. */
127 static const MountEntry protect_home_yes_table
[] = {
128 { "/home", INACCESSIBLE
, true },
129 { "/run/user", INACCESSIBLE
, true },
130 { "/root", INACCESSIBLE
, true },
133 /* ProtectSystem=yes table: /usr, /boot and /efi are entered with mode READONLY.
 * Appended by append_protect_system() for PROTECT_SYSTEM_YES. */
134 static const MountEntry protect_system_yes_table
[] = {
135 { "/usr", READONLY
, false },
136 { "/boot", READONLY
, true },
137 { "/efi", READONLY
, true },
140 /* ProtectSystem=full includes ProtectSystem=yes, and additionally lists /etc as READONLY.
 * Appended by append_protect_system() for PROTECT_SYSTEM_FULL. */
141 static const MountEntry protect_system_full_table
[] = {
142 { "/usr", READONLY
, false },
143 { "/boot", READONLY
, true },
144 { "/efi", READONLY
, true },
145 { "/etc", READONLY
, false },
/* Entries appended by append_protect_system() for PROTECT_SYSTEM_STRICT: the root is READONLY while the paths
 * owned by other options are listed as READWRITE.
 * NOTE(review): the opening and closing lines of the original comment that follows are missing in this view. */
149 * ProtectSystem=strict table. In this strict mode, we mount everything
150 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
151 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
152 * protect those, and these options should be fully orthogonal.
153 * (And of course /home and friends are also left writable, as ProtectHome=
154 * shall manage those, orthogonally).
156 static const MountEntry protect_system_strict_table
[] = {
157 { "/", READONLY
, false },
158 { "/proc", READWRITE
, false }, /* ProtectKernelTunables= */
159 { "/sys", READWRITE
, false }, /* ProtectKernelTunables= */
160 { "/dev", READWRITE
, false }, /* PrivateDevices= */
161 { "/home", READWRITE
, true }, /* ProtectHome= */
162 { "/run/user", READWRITE
, true }, /* ProtectHome= */
163 { "/root", READWRITE
, true }, /* ProtectHome= */
166 static const char *mount_entry_path(const MountEntry
*p
) {
169 /* Returns the path of this mount entry. If the malloc()-allocated ->path_malloc field is set we return that,
170 * otherwise the stack/static ->path_const field is returned. */
172 return p
->path_malloc
?: p
->path_const
;
/* Tells whether the entry has to end up read-only: true if its read_only flag is set, or if its mode is
 * READONLY or INACCESSIBLE. */
175 static bool mount_entry_read_only(const MountEntry
*p
) {
178 return p
->read_only
|| IN_SET(p
->mode
, READONLY
, INACCESSIBLE
);
/* Returns the source path of the entry: ->source_malloc if it is set, otherwise ->source_const. */
181 static const char *mount_entry_source(const MountEntry
*p
) {
184 return p
->source_malloc
?: p
->source_const
;
/* Releases the two heap-allocated strings of the entry (path_malloc and source_malloc) through mfree() and stores
 * the value mfree() returns back into each member. */
187 static void mount_entry_done(MountEntry
*p
) {
190 p
->path_malloc
= mfree(p
->path_malloc
);
191 p
->source_malloc
= mfree(p
->source_malloc
);
/* Walks strv and writes one MountEntry per string at *p, advancing *p for each. A leading minus and a leading plus
 * on the string are both tested for, and the remaining path is checked with path_is_absolute(); has_prefix is set
 * to the negation of needs_prefix.
 * NOTE(review): the bodies of the prefix checks and the error path are not visible in this view. */
194 static int append_access_mounts(MountEntry
**p
, char **strv
, MountMode mode
) {
199 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
201 STRV_FOREACH(i
, strv
) {
202 bool ignore
= false, needs_prefix
= false;
205 /* Look for any prefixes */
206 if (startswith(e
, "-")) {
210 if (startswith(e
, "+")) {
215 if (!path_is_absolute(e
))
218 *((*p
)++) = (MountEntry
) {
222 .has_prefix
= !needs_prefix
,
/* Walks strv and writes one MountEntry per string at *p, advancing *p for each.
 * NOTE(review): the initializer of the appended entry is not visible in this view. */
229 static int append_empty_dir_mounts(MountEntry
**p
, char **strv
) {
234 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
235 * "/private/" boundary directories for DynamicUser=1. */
237 STRV_FOREACH(i
, strv
) {
239 *((*p
)++) = (MountEntry
) {
/* Writes one MountEntry per element of binds[0..n) at *p, advancing *p for each. The destination becomes the
 * entry path and the source becomes source_const; neither string is copied here. */
251 static int append_bind_mounts(MountEntry
**p
, const BindMount
*binds
, unsigned n
) {
256 for (i
= 0; i
< n
; i
++) {
257 const BindMount
*b
= binds
+ i
;
259 *((*p
)++) = (MountEntry
) {
260 .path_const
= b
->destination
,
/* recursive bind mounts get their own mode */
261 .mode
= b
->recursive
? BIND_MOUNT_RECURSIVE
: BIND_MOUNT
,
262 .read_only
= b
->read_only
,
263 .source_const
= b
->source
,
/* Copies n entries of a static table to *p, advancing *p for each. Path and mode are taken over as they are; the
 * ignore flag of the copy is the table's flag OR-ed with ignore_protect. */
270 static int append_static_mounts(MountEntry
**p
, const MountEntry
*mounts
, unsigned n
, bool ignore_protect
) {
276 /* Adds a list of static pre-defined entries */
278 for (i
= 0; i
< n
; i
++)
279 *((*p
)++) = (MountEntry
) {
280 .path_const
= mount_entry_path(mounts
+i
),
281 .mode
= mounts
[i
].mode
,
282 .ignore
= mounts
[i
].ignore
|| ignore_protect
,
/* Appends the static table that corresponds to the ProtectHome= setting via append_static_mounts(). Any value
 * without a case label trips assert_not_reached().
 * NOTE(review): the statement under the PROTECT_HOME_NO label is not visible in this view. */
288 static int append_protect_home(MountEntry
**p
, ProtectHome protect_home
, bool ignore_protect
) {
291 switch (protect_home
) {
293 case PROTECT_HOME_NO
:
296 case PROTECT_HOME_READ_ONLY
:
297 return append_static_mounts(p
, protect_home_read_only_table
, ELEMENTSOF(protect_home_read_only_table
), ignore_protect
);
299 case PROTECT_HOME_YES
:
300 return append_static_mounts(p
, protect_home_yes_table
, ELEMENTSOF(protect_home_yes_table
), ignore_protect
);
303 assert_not_reached("Unexpected ProtectHome= value");
/* Appends the static table that corresponds to the ProtectSystem= setting via append_static_mounts(). Any value
 * without a case label trips assert_not_reached().
 * NOTE(review): the statement under the PROTECT_SYSTEM_NO label is not visible in this view. */
307 static int append_protect_system(MountEntry
**p
, ProtectSystem protect_system
, bool ignore_protect
) {
310 switch (protect_system
) {
312 case PROTECT_SYSTEM_NO
:
315 case PROTECT_SYSTEM_STRICT
:
316 return append_static_mounts(p
, protect_system_strict_table
, ELEMENTSOF(protect_system_strict_table
), ignore_protect
);
318 case PROTECT_SYSTEM_YES
:
319 return append_static_mounts(p
, protect_system_yes_table
, ELEMENTSOF(protect_system_yes_table
), ignore_protect
);
321 case PROTECT_SYSTEM_FULL
:
322 return append_static_mounts(p
, protect_system_full_table
, ELEMENTSOF(protect_system_full_table
), ignore_protect
);
325 assert_not_reached("Unexpected ProtectSystem= value");
/* Comparison callback handed to qsort() for the MountEntry array. Entries are compared by path through
 * path_compare() first and by numeric mode second.
 * NOTE(review): the return statements are not visible in this view. */
329 static int mount_path_compare(const void *a
, const void *b
) {
330 const MountEntry
*p
= a
, *q
= b
;
333 /* If the paths are not equal, then order prefixes first */
334 d
= path_compare(mount_entry_path(p
), mount_entry_path(q
));
338 /* If the paths are equal, check the mode */
339 if (p
->mode
< q
->mode
)
342 if (p
->mode
> q
->mode
)
/* For entries of m[0..n), builds a new path with prefix_root(root_directory, path), stores it in path_malloc
 * (freeing the previous value) and marks the entry as has_prefix.
 * NOTE(review): the condition selecting which entries are rewritten, and the end of the comment inside the body,
 * are not visible in this view. */
348 static int prefix_where_needed(MountEntry
*m
, unsigned n
, const char *root_directory
) {
351 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
357 for (i
= 0; i
< n
; i
++) {
363 s
= prefix_root(root_directory
, mount_entry_path(m
+i
));
367 free(m
[i
].path_malloc
);
368 m
[i
].path_malloc
= s
;
370 m
[i
].has_prefix
= true;
/* Scans the array m[0..*n) with a read pointer f and a write pointer t. When an entry has the same path as the
 * previously kept one it is logged as duplicate, and the kept entry's read_only flag is OR-ed with
 * mount_entry_read_only() of the duplicate.
 * NOTE(review): the compaction statements and the update of *n are not visible in this view. */
376 static void drop_duplicates(MountEntry
*m
, unsigned *n
) {
377 MountEntry
*f
, *t
, *previous
;
382 /* Drops duplicate entries. Expects that the array is properly ordered already. */
384 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+ *n
; f
++) {
386 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
388 if (previous
&& path_equal(mount_entry_path(f
), mount_entry_path(previous
))) {
389 log_debug("%s is duplicate.", mount_entry_path(f
));
390 previous
->read_only
= previous
->read_only
|| mount_entry_read_only(f
); /* Propagate the read-only flag to the remaining entry */
/* Scans the array m[0..*n) and tracks in the local variable clear the path of the last entry seen with mode
 * INACCESSIBLE (NULL otherwise). Entries located below that path are logged as masked.
 * NOTE(review): the compaction statements and the update of *n are not visible in this view. */
403 static void drop_inaccessible(MountEntry
*m
, unsigned *n
) {
405 const char *clear
= NULL
;
410 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
411 * ordered already. */
413 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
415 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
416 * it, as inaccessible paths really should drop the entire subtree. */
417 if (clear
&& path_startswith(mount_entry_path(f
), clear
)) {
418 log_debug("%s is masked by %s.", mount_entry_path(f
), clear
);
/* remember this entry as the new masking path only if it is itself INACCESSIBLE */
423 clear
= f
->mode
== INACCESSIBLE
? mount_entry_path(f
) : NULL
;
/* Scans the array m[0..*n). For READONLY and READWRITE entries it searches backwards, starting at the entry before
 * the write pointer t, for the closest already-kept entry whose path is a prefix of the current one; if that entry
 * has the same mode the current one is logged as redundant.
 * NOTE(review): the compaction statements and the update of *n are not visible in this view. */
432 static void drop_nop(MountEntry
*m
, unsigned *n
) {
438 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
439 * list is ordered by prefixes. */
441 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
443 /* Only suppress such subtrees for READONLY and READWRITE entries */
444 if (IN_SET(f
->mode
, READONLY
, READWRITE
)) {
448 /* Now let's find the first parent of the entry we are looking at. */
449 for (p
= t
-1; p
>= m
; p
--) {
450 if (path_startswith(mount_entry_path(f
), mount_entry_path(p
))) {
456 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
457 if (found
&& p
->mode
== f
->mode
) {
458 log_debug("%s is redundant by %s", mount_entry_path(f
), mount_entry_path(p
));
/* Scans the array m[0..*n) and logs every entry whose path does not start with root_directory.
 * NOTE(review): the handling of a NULL root_directory, the compaction statements and the update of *n are not
 * visible in this view. */
471 static void drop_outside_root(const char *root_directory
, MountEntry
*m
, unsigned *n
) {
481 /* Drops all mounts that are outside of the root directory. */
483 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
485 if (!path_startswith(mount_entry_path(f
), root_directory
)) {
486 log_debug("%s is outside of root directory.", mount_entry_path(f
));
/* Assembles a private /dev in a temporary directory created with mkdtemp() and then moves it onto the path of the
 * entry m with MS_MOVE. The visible steps are: a tmpfs mounted with DEV_MOUNT_OPTIONS, bind mounts of /dev/pts,
 * /dev/shm, /dev/mqueue and /dev/hugepages, symlinks for ptmx and log, and one mknod() per name in the devnodes
 * list for nodes that are block or character devices.
 * NOTE(review): the devnodes list, the error handling and the cleanup labels are not visible in this view. */
498 static int mount_private_dev(MountEntry
*m
) {
499 static const char devnodes
[] =
507 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
508 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
509 _cleanup_umask_ mode_t u
;
516 if (!mkdtemp(temporary_mount
))
519 dev
= strjoina(temporary_mount
, "/dev");
520 (void) mkdir(dev
, 0755);
521 if (mount("tmpfs", dev
, "tmpfs", DEV_MOUNT_OPTIONS
, "mode=755") < 0) {
526 devpts
= strjoina(temporary_mount
, "/dev/pts");
527 (void) mkdir(devpts
, 0755);
528 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
533 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
534 if (symlink("pts/ptmx", devptmx
) < 0) {
539 devshm
= strjoina(temporary_mount
, "/dev/shm");
540 (void) mkdir(devshm
, 01777);
541 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
547 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
548 (void) mkdir(devmqueue
, 0755);
549 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
551 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
552 (void) mkdir(devhugepages
, 0755);
553 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
555 devlog
= strjoina(temporary_mount
, "/dev/log");
556 (void) symlink("/run/systemd/journal/dev-log", devlog
);
558 NULSTR_FOREACH(d
, devnodes
) {
559 _cleanup_free_
char *dn
= NULL
;
572 if (!S_ISBLK(st
.st_mode
) &&
573 !S_ISCHR(st
.st_mode
)) {
581 dn
= strappend(temporary_mount
, d
);
587 mac_selinux_create_file_prepare(d
, st
.st_mode
);
588 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
589 mac_selinux_create_file_clear();
597 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
599 /* Create the /dev directory if missing. It is more likely to be
600 * missing when the service is started with RootDirectory. This is
601 * consistent with mount units creating the mount points when missing.
603 (void) mkdir_p_label(mount_entry_path(m
), 0755);
605 /* Unmount everything in old /dev */
606 umount_recursive(mount_entry_path(m
), 0);
607 if (mount(dev
, mount_entry_path(m
), NULL
, MS_MOVE
, NULL
) < 0) {
613 rmdir(temporary_mount
);
625 umount(devhugepages
);
632 rmdir(temporary_mount
);
/* Creates the entry path if needed (best effort), checks with path_is_mount_point() whether something is mounted
 * there already, and otherwise bind mounts /dev onto it recursively (MS_BIND|MS_REC).
 * NOTE(review): the return statements of the success paths are not visible in this view. */
637 static int mount_bind_dev(MountEntry
*m
) {
642 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
643 * /dev. This is only used when RootDirectory= is set. */
645 (void) mkdir_p_label(mount_entry_path(m
), 0755);
647 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
649 return log_debug_errno(r
, "Unable to determine whether /dev is already mounted: %m");
650 if (r
> 0) /* make this a NOP if /dev is already a mount point */
653 if (mount("/dev", mount_entry_path(m
), NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
654 return log_debug_errno(errno
, "Failed to bind mount %s: %m", mount_entry_path(m
));
/* Creates the entry path if needed (best effort), checks with path_is_mount_point() whether something is mounted
 * there already, and otherwise bind mounts /sys onto it recursively (MS_BIND|MS_REC).
 * NOTE(review): the return statements of the success paths are not visible in this view. */
659 static int mount_sysfs(MountEntry
*m
) {
664 (void) mkdir_p_label(mount_entry_path(m
), 0755);
666 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
668 return log_debug_errno(r
, "Unable to determine whether /sys is already mounted: %m");
669 if (r
> 0) /* make this a NOP if /sys is already a mount point */
672 /* Bind mount the host's version so that we get all child mounts of it, too. */
673 if (mount("/sys", mount_entry_path(m
), NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
674 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
/* Creates the entry path if needed (best effort), checks with path_is_mount_point() whether something is mounted
 * there already, and otherwise mounts a fresh proc file system on it with MS_NOSUID|MS_NOEXEC|MS_NODEV.
 * NOTE(review): the return statements of the success paths are not visible in this view. */
679 static int mount_procfs(MountEntry
*m
) {
684 (void) mkdir_p_label(mount_entry_path(m
), 0755);
686 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
688 return log_debug_errno(r
, "Unable to determine whether /proc is already mounted: %m");
689 if (r
> 0) /* make this a NOP if /proc is already a mount point */
692 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
693 if (mount("proc", mount_entry_path(m
), "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) < 0)
694 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
/* Mounts a tmpfs with mode=755 and MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME on the entry path. Directory
 * creation and the preceding recursive unmount are best effort; their results are discarded. */
699 static int mount_empty_dir(MountEntry
*m
) {
702 /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
704 (void) mkdir_p_label(mount_entry_path(m
), 0755);
705 (void) umount_recursive(mount_entry_path(m
), 0);
707 if (mount("tmpfs", mount_entry_path(m
), "tmpfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, "mode=755") < 0)
708 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
/* Resolves symlinks in a path of the entry m relative to root_directory using chase_symlinks(). For the modes
 * listed in the IN_SET() below the CHASE_NONEXISTENT flag is passed. An -ENOENT result is only logged when the
 * entry has its ignore flag set.
 * NOTE(review): most of the parameter list and the code storing the result are not visible in this view. */
713 static int mount_entry_chase(
714 const char *root_directory
,
724 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
725 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
726 * that applies). The result is stored in "location". */
728 r
= chase_symlinks(path
, root_directory
,
729 IN_SET(m
->mode
, BIND_MOUNT
, BIND_MOUNT_RECURSIVE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
, BIND_DEV
, EMPTY_DIR
, SYSFS
, PROCFS
) ? CHASE_NONEXISTENT
: 0,
731 if (r
== -ENOENT
&& m
->ignore
) {
732 log_debug_errno(r
, "Path %s does not exist, ignoring.", path
);
736 return log_debug_errno(r
, "Failed to follow symlinks on %s: %m", path
);
738 log_debug("Followed symlinks %s → %s.", path
, chased
);
/* Applies one mount table entry. The destination is first resolved with mount_entry_chase(). Depending on the mode
 * the work is either delegated to a helper (mount_empty_dir(), mount_private_dev(), mount_bind_dev(),
 * mount_sysfs(), mount_procfs()) or ends in a bind mount of the local variable what onto the entry path, recursive
 * unless rbind was cleared. If that bind mount fails with -ENOENT and make is set, the destination is created as a
 * directory or file matching the source and the mount is attempted once more.
 * NOTE(review): part of the parameter list, most case labels and several statements are not visible in this
 * view. */
746 static int apply_mount(
747 const char *root_directory
,
750 const char *var_tmp_dir
) {
752 bool rbind
= true, make
= false;
758 r
= mount_entry_chase(root_directory
, m
, mount_entry_path(m
), &m
->path_malloc
);
762 log_debug("Applying namespace mount on %s", mount_entry_path(m
));
769 /* First, get rid of everything that is below if there
770 * is anything... Then, overmount it with an
771 * inaccessible path. */
772 (void) umount_recursive(mount_entry_path(m
), 0);
774 if (lstat(mount_entry_path(m
), &target
) < 0)
775 return log_debug_errno(errno
, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m
));
777 what
= mode_to_inaccessible_node(target
.st_mode
);
779 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
787 r
= path_is_mount_point(mount_entry_path(m
), root_directory
, 0);
789 return log_debug_errno(r
, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m
));
790 if (r
> 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
792 /* This isn't a mount point yet, let's make it one. */
793 what
= mount_entry_path(m
);
800 case BIND_MOUNT_RECURSIVE
:
801 /* Also chase the source mount */
803 r
= mount_entry_chase(root_directory
, m
, mount_entry_source(m
), &m
->source_malloc
);
807 what
= mount_entry_source(m
);
812 return mount_empty_dir(m
);
819 case PRIVATE_VAR_TMP
:
825 return mount_private_dev(m
);
828 return mount_bind_dev(m
);
831 return mount_sysfs(m
);
834 return mount_procfs(m
);
837 assert_not_reached("Unknown mode");
842 if (mount(what
, mount_entry_path(m
), NULL
, MS_BIND
|(rbind
? MS_REC
: 0), NULL
) < 0) {
843 bool try_again
= false;
846 if (r
== -ENOENT
&& make
) {
849 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
851 if (stat(what
, &st
) >= 0) {
853 (void) mkdir_parents(mount_entry_path(m
), 0755);
855 if (S_ISDIR(st
.st_mode
))
856 try_again
= mkdir(mount_entry_path(m
), 0755) >= 0;
858 try_again
= touch(mount_entry_path(m
)) >= 0;
863 if (mount(what
, mount_entry_path(m
), NULL
, MS_BIND
|(rbind
? MS_REC
: 0), NULL
) < 0)
870 return log_debug_errno(r
, "Failed to mount %s to %s: %m", what
, mount_entry_path(m
));
873 log_debug("Successfully mounted %s to %s", what
, mount_entry_path(m
));
/* Turns on the read-only bit for one mount table entry where required. Entries for which mount_entry_read_only()
 * is true are remounted through bind_remount_recursive_with_mountinfo(), which receives the blacklist and the
 * already opened mountinfo stream. A PRIVATE_DEV entry that is not read-only by itself only gets its own mount
 * remounted with MS_RDONLY. An -ENOENT result is treated specially when the entry has its ignore flag set.
 * NOTE(review): the return statements are not visible in this view. */
877 static int make_read_only(MountEntry
*m
, char **blacklist
, FILE *proc_self_mountinfo
) {
881 assert(proc_self_mountinfo
);
883 if (mount_entry_read_only(m
))
884 r
= bind_remount_recursive_with_mountinfo(mount_entry_path(m
), true, blacklist
, proc_self_mountinfo
);
885 else if (m
->mode
== PRIVATE_DEV
) { /* Superblock can be readonly but the submounts can't */
886 if (mount(NULL
, mount_entry_path(m
), NULL
, MS_REMOUNT
|DEV_MOUNT_OPTIONS
|MS_RDONLY
, NULL
) < 0)
891 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
892 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
893 * read-only mounts already applied. */
895 if (r
== -ENOENT
&& m
->ignore
)
/* Returns true when a root directory is given and at least one of mount_apivfs, protect_control_groups or
 * protect_kernel_tunables is set in ns_info.
 * NOTE(review): the opening and closing lines of the first original comment in the body are missing in this
 * view. */
901 static bool namespace_info_mount_apivfs(const char *root_directory
, const NameSpaceInfo
*ns_info
) {
905 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
906 * since to protect the API VFS mounts, they need to be around in the
907 * first place... and RootDirectory= or RootImage= need to be set.
910 /* root_directory should point to a mount point */
911 return root_directory
&&
912 (ns_info
->mount_apivfs
||
913 ns_info
->protect_control_groups
||
914 ns_info
->protect_kernel_tunables
);
/* Computes the number of MountEntry slots needed for a configuration, by adding up the lengths of the user-supplied
 * path lists, one slot per enabled single-entry option and the element counts of every static table that the
 * selected options pull in.
 * NOTE(review): some parameters and summands are not visible in this view. */
917 static unsigned namespace_calculate_mounts(
918 const char* root_directory
,
919 const NameSpaceInfo
*ns_info
,
920 char** read_write_paths
,
921 char** read_only_paths
,
922 char** inaccessible_paths
,
923 char** empty_directories
,
924 const BindMount
*bind_mounts
,
925 unsigned n_bind_mounts
,
927 const char* var_tmp_dir
,
928 ProtectHome protect_home
,
929 ProtectSystem protect_system
) {
931 unsigned protect_home_cnt
;
/* size of the ProtectSystem= table that append_protect_system() will append, 0 for any other value */
932 unsigned protect_system_cnt
=
933 (protect_system
== PROTECT_SYSTEM_STRICT
?
934 ELEMENTSOF(protect_system_strict_table
) :
935 ((protect_system
== PROTECT_SYSTEM_FULL
) ?
936 ELEMENTSOF(protect_system_full_table
) :
937 ((protect_system
== PROTECT_SYSTEM_YES
) ?
938 ELEMENTSOF(protect_system_yes_table
) : 0)));
941 (protect_home
== PROTECT_HOME_YES
?
942 ELEMENTSOF(protect_home_yes_table
) :
943 ((protect_home
== PROTECT_HOME_READ_ONLY
) ?
944 ELEMENTSOF(protect_home_read_only_table
) : 0));
946 return !!tmp_dir
+ !!var_tmp_dir
+
947 strv_length(read_write_paths
) +
948 strv_length(read_only_paths
) +
949 strv_length(inaccessible_paths
) +
950 strv_length(empty_directories
) +
952 ns_info
->private_dev
+
953 (ns_info
->protect_kernel_tunables
? ELEMENTSOF(protect_kernel_tunables_table
) : 0) +
954 (ns_info
->protect_control_groups
? 1 : 0) +
955 (ns_info
->protect_kernel_modules
? ELEMENTSOF(protect_kernel_modules_table
) : 0) +
956 protect_home_cnt
+ protect_system_cnt
+
957 (namespace_info_mount_apivfs(root_directory
, ns_info
) ? ELEMENTSOF(apivfs_table
) : 0);
/* Builds and applies the mount namespace described by the arguments. Visible steps, in order: optional loop device
 * setup, dissection and decryption of root_image; selection of the root directory to work in; sizing of the mount
 * table with namespace_calculate_mounts() and filling it through the append_*() helpers; prefixing, sorting and
 * pruning of the table; unshare(CLONE_NEWNS) and remounting / as MS_SLAVE; mounting or bind mounting the root;
 * a first pass applying every entry and a second pass flipping read-only bits; mount_move_root() and finally
 * remounting / with mount_flags. All entries are released with mount_entry_done() at the end.
 * NOTE(review): the line carrying the function name is not visible in this view (presumably setup_namespace();
 * confirm), and many statements between the visible ones, including all error handling, are missing. */
961 const char* root_directory
,
962 const char* root_image
,
963 const NameSpaceInfo
*ns_info
,
964 char** read_write_paths
,
965 char** read_only_paths
,
966 char** inaccessible_paths
,
967 char** empty_directories
,
968 const BindMount
*bind_mounts
,
969 unsigned n_bind_mounts
,
971 const char* var_tmp_dir
,
972 ProtectHome protect_home
,
973 ProtectSystem protect_system
,
974 unsigned long mount_flags
,
975 DissectImageFlags dissect_image_flags
) {
977 _cleanup_(loop_device_unrefp
) LoopDevice
*loop_device
= NULL
;
978 _cleanup_(decrypted_image_unrefp
) DecryptedImage
*decrypted_image
= NULL
;
979 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
980 _cleanup_free_
void *root_hash
= NULL
;
981 MountEntry
*m
, *mounts
= NULL
;
982 size_t root_hash_size
= 0;
983 bool make_slave
= false;
/* a mount_flags value of 0 selects MS_SHARED */
990 if (mount_flags
== 0)
991 mount_flags
= MS_SHARED
;
994 dissect_image_flags
|= DISSECT_IMAGE_REQUIRE_ROOT
;
996 if (protect_system
== PROTECT_SYSTEM_STRICT
&& strv_isempty(read_write_paths
))
997 dissect_image_flags
|= DISSECT_IMAGE_READ_ONLY
;
999 r
= loop_device_make_by_path(root_image
,
1000 dissect_image_flags
& DISSECT_IMAGE_READ_ONLY
? O_RDONLY
: O_RDWR
,
1005 r
= root_hash_load(root_image
, &root_hash
, &root_hash_size
);
1009 r
= dissect_image(loop_device
->fd
, root_hash
, root_hash_size
, dissect_image_flags
, &dissected_image
);
1013 r
= dissected_image_decrypt(dissected_image
, NULL
, root_hash
, root_hash_size
, dissect_image_flags
, &decrypted_image
);
1019 root
= root_directory
;
1020 else if (root_image
|| n_bind_mounts
> 0) {
1022 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1023 * the same mount point for all images, which is safe, since they all live in their own namespaces
1024 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1025 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1026 * while we are applying them. */
1028 root
= "/run/systemd/unit-root";
1029 (void) mkdir_label(root
, 0700);
1033 n_mounts
= namespace_calculate_mounts(
1040 bind_mounts
, n_bind_mounts
,
1041 tmp_dir
, var_tmp_dir
,
1042 protect_home
, protect_system
);
1044 /* Set mount slave mode */
1045 if (root
|| n_mounts
> 0)
/* the table lives on the stack and is sized by the count computed above */
1049 m
= mounts
= (MountEntry
*) alloca0(n_mounts
* sizeof(MountEntry
));
1050 r
= append_access_mounts(&m
, read_write_paths
, READWRITE
);
1054 r
= append_access_mounts(&m
, read_only_paths
, READONLY
);
1058 r
= append_access_mounts(&m
, inaccessible_paths
, INACCESSIBLE
);
1062 r
= append_empty_dir_mounts(&m
, empty_directories
);
1066 r
= append_bind_mounts(&m
, bind_mounts
, n_bind_mounts
);
1071 *(m
++) = (MountEntry
) {
1072 .path_const
= "/tmp",
1073 .mode
= PRIVATE_TMP
,
1078 *(m
++) = (MountEntry
) {
1079 .path_const
= "/var/tmp",
1080 .mode
= PRIVATE_VAR_TMP
,
1084 if (ns_info
->private_dev
) {
1085 *(m
++) = (MountEntry
) {
1086 .path_const
= "/dev",
1087 .mode
= PRIVATE_DEV
,
1091 if (ns_info
->protect_kernel_tunables
) {
1092 r
= append_static_mounts(&m
, protect_kernel_tunables_table
, ELEMENTSOF(protect_kernel_tunables_table
), ns_info
->ignore_protect_paths
);
1097 if (ns_info
->protect_kernel_modules
) {
1098 r
= append_static_mounts(&m
, protect_kernel_modules_table
, ELEMENTSOF(protect_kernel_modules_table
), ns_info
->ignore_protect_paths
);
1103 if (ns_info
->protect_control_groups
) {
1104 *(m
++) = (MountEntry
) {
1105 .path_const
= "/sys/fs/cgroup",
1110 r
= append_protect_home(&m
, protect_home
, ns_info
->ignore_protect_paths
);
1114 r
= append_protect_system(&m
, protect_system
, false);
1118 if (namespace_info_mount_apivfs(root
, ns_info
)) {
1119 r
= append_static_mounts(&m
, apivfs_table
, ELEMENTSOF(apivfs_table
), ns_info
->ignore_protect_paths
);
/* the number of appended entries must match the precomputed count exactly */
1124 assert(mounts
+ n_mounts
== m
);
1126 /* Prepend the root directory where that's necessary */
1127 r
= prefix_where_needed(mounts
, n_mounts
, root
);
1131 qsort(mounts
, n_mounts
, sizeof(MountEntry
), mount_path_compare
);
1133 drop_duplicates(mounts
, &n_mounts
);
1134 drop_outside_root(root
, mounts
, &n_mounts
);
1135 drop_inaccessible(mounts
, &n_mounts
);
1136 drop_nop(mounts
, &n_mounts
);
1139 if (unshare(CLONE_NEWNS
) < 0) {
1145 /* Remount / as SLAVE so that nothing now mounted in the namespace
1146 shows up in the parent */
1147 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0) {
1153 /* Try to set up the new root directory before mounting anything there */
1155 (void) base_filesystem_create(root
, UID_INVALID
, GID_INVALID
);
1158 /* A root image is specified, mount it to the right place */
1159 r
= dissected_image_mount(dissected_image
, root
, dissect_image_flags
);
1163 if (decrypted_image
) {
1164 r
= decrypted_image_relinquish(decrypted_image
);
1169 loop_device_relinquish(loop_device
);
1171 } else if (root_directory
) {
1173 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1174 r
= path_is_mount_point(root
, NULL
, AT_SYMLINK_FOLLOW
);
1178 if (mount(root
, root
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1186 /* Let's mount the main root directory to the root directory to use */
1187 if (mount("/", root
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1194 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
1198 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1199 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1200 proc_self_mountinfo
= fopen("/proc/self/mountinfo", "re");
1201 if (!proc_self_mountinfo
) {
1206 /* First round, add in all special mounts we need */
1207 for (m
= mounts
; m
< mounts
+ n_mounts
; ++m
) {
1208 r
= apply_mount(root
, m
, tmp_dir
, var_tmp_dir
);
1213 /* Create a NULL-terminated blacklist of all entry paths; make_read_only() hands it on to bind_remount_recursive_with_mountinfo() */
1214 blacklist
= newa(char*, n_mounts
+1);
1215 for (j
= 0; j
< n_mounts
; j
++)
1216 blacklist
[j
] = (char*) mount_entry_path(mounts
+j
);
1217 blacklist
[j
] = NULL
;
1219 /* Second round, flip the ro bits if necessary. */
1220 for (m
= mounts
; m
< mounts
+ n_mounts
; ++m
) {
1221 r
= make_read_only(m
, blacklist
, proc_self_mountinfo
);
1228 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1229 r
= mount_move_root(root
);
1234 /* Remount / as the desired mode. Note that this will not
1235 * reestablish propagation from our side to the host, since
1236 * what's disconnected is disconnected. */
1237 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0) {
1245 for (m
= mounts
; m
< mounts
+ n_mounts
; m
++)
1246 mount_entry_done(m
);
/* Releases the strings held by the n elements of the array b. A NULL array is accepted only together with n == 0.
 * NOTE(review): only the free() of the destination member is visible in this view; the remaining free() calls
 * are missing. */
1251 void bind_mount_free_many(BindMount
*b
, unsigned n
) {
1254 assert(b
|| n
== 0);
1256 for (i
= 0; i
< n
; i
++) {
1258 free(b
[i
].destination
);
/* Appends a copy of item to the array *b holding *n elements. Source and destination are duplicated with strdup(),
 * the array is grown by one element through realloc_multiply(), and *n is incremented when the new element is
 * stored.
 * NOTE(review): the allocation failure handling and the update of *b are not visible in this view. */
1264 int bind_mount_add(BindMount
**b
, unsigned *n
, const BindMount
*item
) {
1265 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
1272 s
= strdup(item
->source
);
1276 d
= strdup(item
->destination
);
1280 c
= realloc_multiply(*b
, sizeof(BindMount
), *n
+ 1);
1286 c
[(*n
) ++] = (BindMount
) {
1289 .read_only
= item
->read_only
,
1290 .recursive
= item
->recursive
,
1291 .ignore_enoent
= item
->ignore_enoent
,
/* Prepares one private temporary directory below prefix. The directory name template is built from the boot ID and
 * the given id and ends in -XXXXXX; a tmp subdirectory is then created inside it with mode 0777 plus the sticky bit
 * while the umask is 0000.
 * NOTE(review): the statement run under umask 0077 and the code storing the result in *path are not visible in
 * this view. */
1298 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
1299 _cleanup_free_
char *x
= NULL
;
1300 char bid
[SD_ID128_STRING_MAX
];
1308 /* We include the boot id in the directory so that after a
1309 * reboot we can easily identify obsolete directories. */
1311 r
= sd_id128_get_boot(&boot_id
);
1315 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX");
1319 RUN_WITH_UMASK(0077)
1323 RUN_WITH_UMASK(0000) {
1326 y
= strjoina(x
, "/tmp");
1328 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
/* Creates the pair of private temporary directories for id by calling setup_one_tmp_dir() once with the /tmp prefix
 * and once with the /var/tmp prefix.
 * NOTE(review): the cleanup performed when the second call fails and the assignment of the output parameters are
 * not visible in this view. */
1338 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
1344 assert(var_tmp_dir
);
1346 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
1350 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
1354 t
= strjoina(a
, "/tmp");
/* Joins, or creates and publishes, the network namespace shared through netns_storage_socket. Under a lockf() lock
 * on the first socket a namespace fd is read without blocking: if none is stored (-EAGAIN) a new namespace is
 * created with unshare(CLONE_NEWNET) and /proc/self/ns/net is opened, otherwise the received namespace is entered
 * with setns(). The fd is then sent into the second socket and the lock is released.
 * NOTE(review): the error paths and the return value are not visible in this view. */
1368 int setup_netns(int netns_storage_socket
[2]) {
1369 _cleanup_close_
int netns
= -1;
1372 assert(netns_storage_socket
);
1373 assert(netns_storage_socket
[0] >= 0);
1374 assert(netns_storage_socket
[1] >= 0);
1376 /* We use the passed socketpair as a storage buffer for our
1377 * namespace reference fd. Whatever process runs this first
1378 * shall create a new namespace, all others should just join
1379 * it. To serialize that we use a file lock on the socket
1382 * It's a bit crazy, but hey, works great! */
1384 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
1387 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
1388 if (netns
== -EAGAIN
) {
1389 /* Nothing stored yet, so let's create a new namespace */
1391 if (unshare(CLONE_NEWNET
) < 0) {
1398 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
1406 } else if (netns
< 0) {
1411 /* Yay, found something, so let's join the namespace */
1412 if (setns(netns
, CLONE_NEWNET
) < 0) {
/* put the fd back so that the next caller finds it */
1420 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
1427 (void) lockf(netns_storage_socket
[0], F_ULOCK
, 0);
/* String names of the ProtectHome values, indexed by enum value, followed by the macro invocation that generates
 * the lookup helpers for the protect_home prefix and the ProtectHome type. */
1431 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
1432 [PROTECT_HOME_NO
] = "no",
1433 [PROTECT_HOME_YES
] = "yes",
1434 [PROTECT_HOME_READ_ONLY
] = "read-only",
1437 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
/* String names of the ProtectSystem values, indexed by enum value, followed by the macro invocation that generates
 * the lookup helpers for the protect_system prefix and the ProtectSystem type. */
1439 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
1440 [PROTECT_SYSTEM_NO
] = "no",
1441 [PROTECT_SYSTEM_YES
] = "yes",
1442 [PROTECT_SYSTEM_FULL
] = "full",
1443 [PROTECT_SYSTEM_STRICT
] = "strict",
1446 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);