2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
29 #include "alloc-util.h"
30 #include "dev-setup.h"
33 #include "loop-util.h"
34 #include "loopback-setup.h"
37 #include "mount-util.h"
38 #include "namespace.h"
39 #include "path-util.h"
40 #include "selinux-util.h"
41 #include "socket-util.h"
42 #include "string-table.h"
43 #include "string-util.h"
45 #include "umask-util.h"
46 #include "user-util.h"
49 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
51 typedef enum MountMode
{
52 /* This is ordered by priority! */
66 typedef struct MountEntry
{
67 const char *path_const
; /* Memory allocated on stack or static */
69 bool ignore
:1; /* Ignore if path does not exist? */
70 bool has_prefix
:1; /* Already is prefixed by the root dir? */
71 bool read_only
:1; /* Shall this mount point be read-only? */
72 char *path_malloc
; /* Use this instead of 'path' if we had to allocate memory */
73 const char *source_const
; /* The source path, for bind mounts */
77 /* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into it, but only as a fallback if the user hasn't mounted
78 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
79 static const MountEntry apivfs_table
[] = {
80 { "/proc", PROCFS
, false },
81 { "/dev", BIND_DEV
, false },
82 { "/sys", SYSFS
, false },
85 /* ProtectKernelTunables= option and the related filesystem APIs */
86 static const MountEntry protect_kernel_tunables_table
[] = {
87 { "/proc/sys", READONLY
, false },
88 { "/proc/sysrq-trigger", READONLY
, true },
89 { "/proc/latency_stats", READONLY
, true },
90 { "/proc/mtrr", READONLY
, true },
91 { "/proc/apm", READONLY
, true }, /* Obsolete API, there's no point in permitting access to this, ever */
92 { "/proc/acpi", READONLY
, true },
93 { "/proc/timer_stats", READONLY
, true },
94 { "/proc/asound", READONLY
, true },
95 { "/proc/bus", READONLY
, true },
96 { "/proc/fs", READONLY
, true },
97 { "/proc/irq", READONLY
, true },
98 { "/sys", READONLY
, false },
99 { "/sys/kernel/debug", READONLY
, true },
100 { "/sys/kernel/tracing", READONLY
, true },
101 { "/sys/fs/cgroup", READWRITE
, false }, /* READONLY is set by ProtectControlGroups= option */
104 /* ProtectKernelModules= option */
105 static const MountEntry protect_kernel_modules_table
[] = {
106 #ifdef HAVE_SPLIT_USR
107 { "/lib/modules", INACCESSIBLE
, true },
109 { "/usr/lib/modules", INACCESSIBLE
, true },
113 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
114 * system should be protected by ProtectSystem=
116 static const MountEntry protect_home_read_only_table
[] = {
117 { "/home", READONLY
, true },
118 { "/run/user", READONLY
, true },
119 { "/root", READONLY
, true },
122 /* ProtectHome=yes table */
123 static const MountEntry protect_home_yes_table
[] = {
124 { "/home", INACCESSIBLE
, true },
125 { "/run/user", INACCESSIBLE
, true },
126 { "/root", INACCESSIBLE
, true },
129 /* ProtectSystem=yes table */
130 static const MountEntry protect_system_yes_table
[] = {
131 { "/usr", READONLY
, false },
132 { "/boot", READONLY
, true },
133 { "/efi", READONLY
, true },
136 /* ProtectSystem=full includes ProtectSystem=yes */
137 static const MountEntry protect_system_full_table
[] = {
138 { "/usr", READONLY
, false },
139 { "/boot", READONLY
, true },
140 { "/efi", READONLY
, true },
141 { "/etc", READONLY
, false },
145 * ProtectSystem=strict table. In this strict mode, we mount everything
146 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
147 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
148 * protect those, and these options should be fully orthogonal.
149 * (And of course /home and friends are also left writable, as ProtectHome=
150 * shall manage those, orthogonally).
152 static const MountEntry protect_system_strict_table
[] = {
153 { "/", READONLY
, false },
154 { "/proc", READWRITE
, false }, /* ProtectKernelTunables= */
155 { "/sys", READWRITE
, false }, /* ProtectKernelTunables= */
156 { "/dev", READWRITE
, false }, /* PrivateDevices= */
157 { "/home", READWRITE
, true }, /* ProtectHome= */
158 { "/run/user", READWRITE
, true }, /* ProtectHome= */
159 { "/root", READWRITE
, true }, /* ProtectHome= */
162 static const char *mount_entry_path(const MountEntry
*p
) {
165 /* Returns the path of this mount entry. If the malloc()-allocated ->path_malloc field is set we return that,
166 * otherwise the stack/static ->path_const field is returned. */
168 return p
->path_malloc
?: p
->path_const
;
/* Returns true if the entry has its ->read_only flag set, or if its ->mode is READONLY or INACCESSIBLE. */
171 static bool mount_entry_read_only(const MountEntry
*p
) {
174 return p
->read_only
|| IN_SET(p
->mode
, READONLY
, INACCESSIBLE
);
/* Returns the source path of this entry: ->source_malloc if it is set, otherwise ->source_const. */
177 static const char *mount_entry_source(const MountEntry
*p
) {
180 return p
->source_malloc
?: p
->source_const
;
/* Hands ->path_malloc and ->source_malloc to mfree() and assigns the result back to each field. The *_const
 * fields are left untouched. NOTE(review): mfree() is defined elsewhere; presumably it frees its argument and
 * returns NULL -- confirm. */
183 static void mount_entry_done(MountEntry
*p
) {
186 p
->path_malloc
= mfree(p
->path_malloc
);
187 p
->source_malloc
= mfree(p
->source_malloc
);
190 static int append_access_mounts(MountEntry
**p
, char **strv
, MountMode mode
) {
195 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
197 STRV_FOREACH(i
, strv
) {
198 bool ignore
= false, needs_prefix
= false;
201 /* Look for any prefixes */
202 if (startswith(e
, "-")) {
206 if (startswith(e
, "+")) {
211 if (!path_is_absolute(e
))
214 *((*p
)++) = (MountEntry
) {
218 .has_prefix
= !needs_prefix
,
225 static int append_bind_mounts(MountEntry
**p
, const BindMount
*binds
, unsigned n
) {
230 for (i
= 0; i
< n
; i
++) {
231 const BindMount
*b
= binds
+ i
;
233 *((*p
)++) = (MountEntry
) {
234 .path_const
= b
->destination
,
235 .mode
= b
->recursive
? BIND_MOUNT_RECURSIVE
: BIND_MOUNT
,
236 .read_only
= b
->read_only
,
237 .source_const
= b
->source
,
244 static int append_static_mounts(MountEntry
**p
, const MountEntry
*mounts
, unsigned n
, bool ignore_protect
) {
250 /* Adds a list of static pre-defined entries */
252 for (i
= 0; i
< n
; i
++)
253 *((*p
)++) = (MountEntry
) {
254 .path_const
= mount_entry_path(mounts
+i
),
255 .mode
= mounts
[i
].mode
,
256 .ignore
= mounts
[i
].ignore
|| ignore_protect
,
262 static int append_protect_home(MountEntry
**p
, ProtectHome protect_home
, bool ignore_protect
) {
265 switch (protect_home
) {
267 case PROTECT_HOME_NO
:
270 case PROTECT_HOME_READ_ONLY
:
271 return append_static_mounts(p
, protect_home_read_only_table
, ELEMENTSOF(protect_home_read_only_table
), ignore_protect
);
273 case PROTECT_HOME_YES
:
274 return append_static_mounts(p
, protect_home_yes_table
, ELEMENTSOF(protect_home_yes_table
), ignore_protect
);
277 assert_not_reached("Unexpected ProtectHome= value");
281 static int append_protect_system(MountEntry
**p
, ProtectSystem protect_system
, bool ignore_protect
) {
284 switch (protect_system
) {
286 case PROTECT_SYSTEM_NO
:
289 case PROTECT_SYSTEM_STRICT
:
290 return append_static_mounts(p
, protect_system_strict_table
, ELEMENTSOF(protect_system_strict_table
), ignore_protect
);
292 case PROTECT_SYSTEM_YES
:
293 return append_static_mounts(p
, protect_system_yes_table
, ELEMENTSOF(protect_system_yes_table
), ignore_protect
);
295 case PROTECT_SYSTEM_FULL
:
296 return append_static_mounts(p
, protect_system_full_table
, ELEMENTSOF(protect_system_full_table
), ignore_protect
);
299 assert_not_reached("Unexpected ProtectSystem= value");
303 static int mount_path_compare(const void *a
, const void *b
) {
304 const MountEntry
*p
= a
, *q
= b
;
307 /* If the paths are not equal, then order prefixes first */
308 d
= path_compare(mount_entry_path(p
), mount_entry_path(q
));
312 /* If the paths are equal, check the mode */
313 if (p
->mode
< q
->mode
)
316 if (p
->mode
> q
->mode
)
322 static int prefix_where_needed(MountEntry
*m
, unsigned n
, const char *root_directory
) {
325 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
331 for (i
= 0; i
< n
; i
++) {
337 s
= prefix_root(root_directory
, mount_entry_path(m
+i
));
341 free(m
[i
].path_malloc
);
342 m
[i
].path_malloc
= s
;
344 m
[i
].has_prefix
= true;
350 static void drop_duplicates(MountEntry
*m
, unsigned *n
) {
351 MountEntry
*f
, *t
, *previous
;
356 /* Drops duplicate entries. Expects that the array is properly ordered already. */
358 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+ *n
; f
++) {
360 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
362 if (previous
&& path_equal(mount_entry_path(f
), mount_entry_path(previous
))) {
363 log_debug("%s is duplicate.", mount_entry_path(f
));
364 previous
->read_only
= previous
->read_only
|| mount_entry_read_only(f
); /* Propagate the read-only flag to the remaining entry */
377 static void drop_inaccessible(MountEntry
*m
, unsigned *n
) {
379 const char *clear
= NULL
;
384 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
385 * ordered already. */
387 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
389 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
390 * it, as inaccessible paths really should drop the entire subtree. */
391 if (clear
&& path_startswith(mount_entry_path(f
), clear
)) {
392 log_debug("%s is masked by %s.", mount_entry_path(f
), clear
);
397 clear
= f
->mode
== INACCESSIBLE
? mount_entry_path(f
) : NULL
;
406 static void drop_nop(MountEntry
*m
, unsigned *n
) {
412 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
413 * list is ordered by prefixes. */
415 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
417 /* Only suppress such subtrees for READONLY and READWRITE entries */
418 if (IN_SET(f
->mode
, READONLY
, READWRITE
)) {
422 /* Now let's find the first parent of the entry we are looking at. */
423 for (p
= t
-1; p
>= m
; p
--) {
424 if (path_startswith(mount_entry_path(f
), mount_entry_path(p
))) {
430 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
431 if (found
&& p
->mode
== f
->mode
) {
432 log_debug("%s is redundant by %s", mount_entry_path(f
), mount_entry_path(p
));
445 static void drop_outside_root(const char *root_directory
, MountEntry
*m
, unsigned *n
) {
455 /* Drops all mounts that are outside of the root directory. */
457 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
459 if (!path_startswith(mount_entry_path(f
), root_directory
)) {
460 log_debug("%s is outside of root directory.", mount_entry_path(f
));
472 static int mount_private_dev(MountEntry
*m
) {
473 static const char devnodes
[] =
481 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
482 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
483 _cleanup_umask_ mode_t u
;
490 if (!mkdtemp(temporary_mount
))
493 dev
= strjoina(temporary_mount
, "/dev");
494 (void) mkdir(dev
, 0755);
495 if (mount("tmpfs", dev
, "tmpfs", DEV_MOUNT_OPTIONS
, "mode=755") < 0) {
500 devpts
= strjoina(temporary_mount
, "/dev/pts");
501 (void) mkdir(devpts
, 0755);
502 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
507 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
508 if (symlink("pts/ptmx", devptmx
) < 0) {
513 devshm
= strjoina(temporary_mount
, "/dev/shm");
514 (void) mkdir(devshm
, 01777);
515 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
521 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
522 (void) mkdir(devmqueue
, 0755);
523 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
525 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
526 (void) mkdir(devhugepages
, 0755);
527 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
529 devlog
= strjoina(temporary_mount
, "/dev/log");
530 (void) symlink("/run/systemd/journal/dev-log", devlog
);
532 NULSTR_FOREACH(d
, devnodes
) {
533 _cleanup_free_
char *dn
= NULL
;
546 if (!S_ISBLK(st
.st_mode
) &&
547 !S_ISCHR(st
.st_mode
)) {
555 dn
= strappend(temporary_mount
, d
);
561 mac_selinux_create_file_prepare(d
, st
.st_mode
);
562 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
563 mac_selinux_create_file_clear();
571 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
573 /* Create the /dev directory if missing. It is more likely to be
574 * missing when the service is started with RootDirectory. This is
575 * consistent with mount units creating the mount points when missing.
577 (void) mkdir_p_label(mount_entry_path(m
), 0755);
579 /* Unmount everything in old /dev */
580 umount_recursive(mount_entry_path(m
), 0);
581 if (mount(dev
, mount_entry_path(m
), NULL
, MS_MOVE
, NULL
) < 0) {
587 rmdir(temporary_mount
);
599 umount(devhugepages
);
606 rmdir(temporary_mount
);
611 static int mount_bind_dev(MountEntry
*m
) {
616 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
617 * /dev. This is only used when RootDirectory= is set. */
619 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
621 return log_debug_errno(r
, "Unable to determine whether /dev is already mounted: %m");
622 if (r
> 0) /* make this a NOP if /dev is already a mount point */
625 if (mount("/dev", mount_entry_path(m
), NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
626 return log_debug_errno(errno
, "Failed to bind mount %s: %m", mount_entry_path(m
));
631 static int mount_sysfs(MountEntry
*m
) {
636 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
638 return log_debug_errno(r
, "Unable to determine whether /sys is already mounted: %m");
639 if (r
> 0) /* make this a NOP if /sys is already a mount point */
642 /* Bind mount the host's version so that we get all child mounts of it, too. */
643 if (mount("/sys", mount_entry_path(m
), NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
644 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
649 static int mount_procfs(MountEntry
*m
) {
654 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
656 return log_debug_errno(r
, "Unable to determine whether /proc is already mounted: %m");
657 if (r
> 0) /* make this a NOP if /proc is already a mount point */
660 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
661 if (mount("proc", mount_entry_path(m
), "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) < 0)
662 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
667 static int mount_entry_chase(
668 const char *root_directory
,
678 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
679 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
680 * that applies). The result is stored in "location". */
682 r
= chase_symlinks(path
, root_directory
, 0, &chased
);
683 if (r
== -ENOENT
&& m
->ignore
) {
684 log_debug_errno(r
, "Path %s does not exist, ignoring.", path
);
688 return log_debug_errno(r
, "Failed to follow symlinks on %s: %m", path
);
690 log_debug("Followed symlinks %s → %s.", path
, chased
);
698 static int apply_mount(
699 const char *root_directory
,
702 const char *var_tmp_dir
) {
710 r
= mount_entry_chase(root_directory
, m
, mount_entry_path(m
), &m
->path_malloc
);
714 log_debug("Applying namespace mount on %s", mount_entry_path(m
));
721 /* First, get rid of everything that is below if there
722 * is anything... Then, overmount it with an
723 * inaccessible path. */
724 (void) umount_recursive(mount_entry_path(m
), 0);
726 if (lstat(mount_entry_path(m
), &target
) < 0)
727 return log_debug_errno(errno
, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m
));
729 what
= mode_to_inaccessible_node(target
.st_mode
);
731 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
739 r
= path_is_mount_point(mount_entry_path(m
), root_directory
, 0);
741 return log_debug_errno(r
, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m
));
742 if (r
> 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
744 /* This isn't a mount point yet, let's make it one. */
745 what
= mount_entry_path(m
);
752 case BIND_MOUNT_RECURSIVE
:
753 /* Also chase the source mount */
755 r
= mount_entry_chase(root_directory
, m
, mount_entry_source(m
), &m
->source_malloc
);
759 what
= mount_entry_source(m
);
766 case PRIVATE_VAR_TMP
:
771 return mount_private_dev(m
);
774 return mount_bind_dev(m
);
777 return mount_sysfs(m
);
780 return mount_procfs(m
);
783 assert_not_reached("Unknown mode");
788 if (mount(what
, mount_entry_path(m
), NULL
, MS_BIND
|(rbind
? MS_REC
: 0), NULL
) < 0)
789 return log_debug_errno(errno
, "Failed to mount %s to %s: %m", what
, mount_entry_path(m
));
791 log_debug("Successfully mounted %s to %s", what
, mount_entry_path(m
));
795 static int make_read_only(MountEntry
*m
, char **blacklist
) {
800 if (mount_entry_read_only(m
))
801 r
= bind_remount_recursive(mount_entry_path(m
), true, blacklist
);
802 else if (m
->mode
== PRIVATE_DEV
) { /* Superblock can be readonly but the submounts can't*/
803 if (mount(NULL
, mount_entry_path(m
), NULL
, MS_REMOUNT
|DEV_MOUNT_OPTIONS
|MS_RDONLY
, NULL
) < 0)
808 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
809 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
810 * read-only mounts already applied. */
812 if (r
== -ENOENT
&& m
->ignore
)
/* Tells whether the API VFS mount table should be applied: true if any of the three NameSpaceInfo flags
 * tested below is set. */
818 static bool namespace_info_mount_apivfs(const NameSpaceInfo
*ns_info
) {
821 /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts,
822 * they need to be around in the first place... */
824 return ns_info
->mount_apivfs
||
825 ns_info
->protect_control_groups
||
826 ns_info
->protect_kernel_tunables
;
829 static unsigned namespace_calculate_mounts(
830 const NameSpaceInfo
*ns_info
,
831 char** read_write_paths
,
832 char** read_only_paths
,
833 char** inaccessible_paths
,
834 const BindMount
*bind_mounts
,
835 unsigned n_bind_mounts
,
837 const char* var_tmp_dir
,
838 ProtectHome protect_home
,
839 ProtectSystem protect_system
) {
841 unsigned protect_home_cnt
;
842 unsigned protect_system_cnt
=
843 (protect_system
== PROTECT_SYSTEM_STRICT
?
844 ELEMENTSOF(protect_system_strict_table
) :
845 ((protect_system
== PROTECT_SYSTEM_FULL
) ?
846 ELEMENTSOF(protect_system_full_table
) :
847 ((protect_system
== PROTECT_SYSTEM_YES
) ?
848 ELEMENTSOF(protect_system_yes_table
) : 0)));
851 (protect_home
== PROTECT_HOME_YES
?
852 ELEMENTSOF(protect_home_yes_table
) :
853 ((protect_home
== PROTECT_HOME_READ_ONLY
) ?
854 ELEMENTSOF(protect_home_read_only_table
) : 0));
856 return !!tmp_dir
+ !!var_tmp_dir
+
857 strv_length(read_write_paths
) +
858 strv_length(read_only_paths
) +
859 strv_length(inaccessible_paths
) +
861 ns_info
->private_dev
+
862 (ns_info
->protect_kernel_tunables
? ELEMENTSOF(protect_kernel_tunables_table
) : 0) +
863 (ns_info
->protect_control_groups
? 1 : 0) +
864 (ns_info
->protect_kernel_modules
? ELEMENTSOF(protect_kernel_modules_table
) : 0) +
865 protect_home_cnt
+ protect_system_cnt
+
866 (namespace_info_mount_apivfs(ns_info
) ? ELEMENTSOF(apivfs_table
) : 0);
870 const char* root_directory
,
871 const char* root_image
,
872 const NameSpaceInfo
*ns_info
,
873 char** read_write_paths
,
874 char** read_only_paths
,
875 char** inaccessible_paths
,
876 const BindMount
*bind_mounts
,
877 unsigned n_bind_mounts
,
879 const char* var_tmp_dir
,
880 ProtectHome protect_home
,
881 ProtectSystem protect_system
,
882 unsigned long mount_flags
,
883 DissectImageFlags dissect_image_flags
) {
885 _cleanup_(loop_device_unrefp
) LoopDevice
*loop_device
= NULL
;
886 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
887 MountEntry
*m
, *mounts
= NULL
;
888 bool make_slave
= false;
894 if (mount_flags
== 0)
895 mount_flags
= MS_SHARED
;
898 dissect_image_flags
|= DISSECT_IMAGE_REQUIRE_ROOT
;
900 if (protect_system
== PROTECT_SYSTEM_STRICT
&& strv_isempty(read_write_paths
))
901 dissect_image_flags
|= DISSECT_IMAGE_READ_ONLY
;
903 r
= loop_device_make_by_path(root_image
,
904 dissect_image_flags
& DISSECT_IMAGE_READ_ONLY
? O_RDONLY
: O_RDWR
,
909 r
= dissect_image(loop_device
->fd
, NULL
, 0, dissect_image_flags
, &dissected_image
);
913 if (!root_directory
) {
914 /* Create a mount point for the image, if it's still missing. We use the same mount point for
915 * all images, which is safe, since they all live in their own namespaces after all, and hence
916 * won't see each other. */
917 root_directory
= "/run/systemd/unit-root";
918 (void) mkdir(root_directory
, 0700);
922 n_mounts
= namespace_calculate_mounts(
927 bind_mounts
, n_bind_mounts
,
928 tmp_dir
, var_tmp_dir
,
929 protect_home
, protect_system
);
931 /* Set mount slave mode */
932 if (root_directory
|| n_mounts
> 0)
936 m
= mounts
= (MountEntry
*) alloca0(n_mounts
* sizeof(MountEntry
));
937 r
= append_access_mounts(&m
, read_write_paths
, READWRITE
);
941 r
= append_access_mounts(&m
, read_only_paths
, READONLY
);
945 r
= append_access_mounts(&m
, inaccessible_paths
, INACCESSIBLE
);
949 r
= append_bind_mounts(&m
, bind_mounts
, n_bind_mounts
);
954 *(m
++) = (MountEntry
) {
955 .path_const
= "/tmp",
961 *(m
++) = (MountEntry
) {
962 .path_const
= "/var/tmp",
963 .mode
= PRIVATE_VAR_TMP
,
967 if (ns_info
->private_dev
) {
968 *(m
++) = (MountEntry
) {
969 .path_const
= "/dev",
974 if (ns_info
->protect_kernel_tunables
) {
975 r
= append_static_mounts(&m
, protect_kernel_tunables_table
, ELEMENTSOF(protect_kernel_tunables_table
), ns_info
->ignore_protect_paths
);
980 if (ns_info
->protect_kernel_modules
) {
981 r
= append_static_mounts(&m
, protect_kernel_modules_table
, ELEMENTSOF(protect_kernel_modules_table
), ns_info
->ignore_protect_paths
);
986 if (ns_info
->protect_control_groups
) {
987 *(m
++) = (MountEntry
) {
988 .path_const
= "/sys/fs/cgroup",
993 r
= append_protect_home(&m
, protect_home
, ns_info
->ignore_protect_paths
);
997 r
= append_protect_system(&m
, protect_system
, false);
1001 if (namespace_info_mount_apivfs(ns_info
)) {
1002 r
= append_static_mounts(&m
, apivfs_table
, ELEMENTSOF(apivfs_table
), ns_info
->ignore_protect_paths
);
1007 assert(mounts
+ n_mounts
== m
);
1009 /* Prepend the root directory where that's necessary */
1010 r
= prefix_where_needed(mounts
, n_mounts
, root_directory
);
1014 qsort(mounts
, n_mounts
, sizeof(MountEntry
), mount_path_compare
);
1016 drop_duplicates(mounts
, &n_mounts
);
1017 drop_outside_root(root_directory
, mounts
, &n_mounts
);
1018 drop_inaccessible(mounts
, &n_mounts
);
1019 drop_nop(mounts
, &n_mounts
);
1022 if (unshare(CLONE_NEWNS
) < 0) {
1028 /* Remount / as SLAVE so that nothing now mounted in the namespace
1029 shows up in the parent */
1030 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0) {
1037 r
= dissected_image_mount(dissected_image
, root_directory
, dissect_image_flags
);
1041 loop_device_relinquish(loop_device
);
1043 } else if (root_directory
) {
1045 /* Turn directory into bind mount, if it isn't one yet */
1046 r
= path_is_mount_point(root_directory
, NULL
, AT_SYMLINK_FOLLOW
);
1050 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1061 /* First round, add in all special mounts we need */
1062 for (m
= mounts
; m
< mounts
+ n_mounts
; ++m
) {
1063 r
= apply_mount(root_directory
, m
, tmp_dir
, var_tmp_dir
);
1068 /* Create a blacklist we can pass to bind_remount_recursive() */
1069 blacklist
= newa(char*, n_mounts
+1);
1070 for (j
= 0; j
< n_mounts
; j
++)
1071 blacklist
[j
] = (char*) mount_entry_path(mounts
+j
);
1072 blacklist
[j
] = NULL
;
1074 /* Second round, flip the ro bits if necessary. */
1075 for (m
= mounts
; m
< mounts
+ n_mounts
; ++m
) {
1076 r
= make_read_only(m
, blacklist
);
1082 if (root_directory
) {
1083 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1084 r
= mount_move_root(root_directory
);
1089 /* Remount / as the desired mode. Note that this will not
1090 * reestablish propagation from our side to the host, since
1091 * what's disconnected is disconnected. */
1092 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0) {
1100 for (m
= mounts
; m
< mounts
+ n_mounts
; m
++)
1101 mount_entry_done(m
);
1106 void bind_mount_free_many(BindMount
*b
, unsigned n
) {
1109 assert(b
|| n
== 0);
1111 for (i
= 0; i
< n
; i
++) {
1113 free(b
[i
].destination
);
1119 int bind_mount_add(BindMount
**b
, unsigned *n
, const BindMount
*item
) {
1120 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
1127 s
= strdup(item
->source
);
1131 d
= strdup(item
->destination
);
1135 c
= realloc_multiply(*b
, sizeof(BindMount
), *n
+ 1);
1141 c
[(*n
) ++] = (BindMount
) {
1144 .read_only
= item
->read_only
,
1145 .recursive
= item
->recursive
,
1146 .ignore_enoent
= item
->ignore_enoent
,
1153 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
1154 _cleanup_free_
char *x
= NULL
;
1155 char bid
[SD_ID128_STRING_MAX
];
1163 /* We include the boot id in the directory so that after a
1164 * reboot we can easily identify obsolete directories. */
1166 r
= sd_id128_get_boot(&boot_id
);
1170 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX");
1174 RUN_WITH_UMASK(0077)
1178 RUN_WITH_UMASK(0000) {
1181 y
= strjoina(x
, "/tmp");
1183 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
1193 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
1199 assert(var_tmp_dir
);
1201 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
1205 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
1209 t
= strjoina(a
, "/tmp");
1223 int setup_netns(int netns_storage_socket
[2]) {
1224 _cleanup_close_
int netns
= -1;
1227 assert(netns_storage_socket
);
1228 assert(netns_storage_socket
[0] >= 0);
1229 assert(netns_storage_socket
[1] >= 0);
1231 /* We use the passed socketpair as a storage buffer for our
1232 * namespace reference fd. Whatever process runs this first
1233 * shall create a new namespace, all others should just join
1234 * it. To serialize that we use a file lock on the socket
1237 * It's a bit crazy, but hey, works great! */
1239 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
1242 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
1243 if (netns
== -EAGAIN
) {
1244 /* Nothing stored yet, so let's create a new namespace */
1246 if (unshare(CLONE_NEWNET
) < 0) {
1253 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
1261 } else if (netns
< 0) {
1266 /* Yay, found something, so let's join the namespace */
1267 if (setns(netns
, CLONE_NEWNET
) < 0) {
1275 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
1282 (void) lockf(netns_storage_socket
[0], F_ULOCK
, 0);
1286 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
1287 [PROTECT_HOME_NO
] = "no",
1288 [PROTECT_HOME_YES
] = "yes",
1289 [PROTECT_HOME_READ_ONLY
] = "read-only",
1292 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
1294 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
1295 [PROTECT_SYSTEM_NO
] = "no",
1296 [PROTECT_SYSTEM_YES
] = "yes",
1297 [PROTECT_SYSTEM_FULL
] = "full",
1298 [PROTECT_SYSTEM_STRICT
] = "strict",
1301 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);