1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/mount.h>
30 #include "alloc-util.h"
31 #include "base-filesystem.h"
32 #include "dev-setup.h"
36 #include "loop-util.h"
37 #include "loopback-setup.h"
40 #include "mount-util.h"
41 #include "namespace.h"
42 #include "path-util.h"
43 #include "selinux-util.h"
44 #include "socket-util.h"
45 #include "stat-util.h"
46 #include "string-table.h"
47 #include "string-util.h"
49 #include "umask-util.h"
50 #include "user-util.h"
53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
55 typedef enum MountMode
{
56 /* This is ordered by priority! */
71 typedef struct MountEntry
{
72 const char *path_const
; /* Memory allocated on stack or static */
74 bool ignore
:1; /* Ignore if path does not exist? */
75 bool has_prefix
:1; /* Already is prefixed by the root dir? */
76 bool read_only
:1; /* Shall this mount point be read-only? */
77 char *path_malloc
; /* Use this instead of 'path_const' if we had to allocate memory */
78 const char *source_const
; /* The source path, for bind mounts */
80 const char *options_const
;/* Mount options for tmpfs */
82 unsigned long flags
; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
85 /* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into it, but only as a fallback if the user hasn't mounted
86 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
87 static const MountEntry apivfs_table
[] = {
88 { "/proc", PROCFS
, false },
89 { "/dev", BIND_DEV
, false },
90 { "/sys", SYSFS
, false },
93 /* ProtectKernelTunables= option and the related filesystem APIs */
94 static const MountEntry protect_kernel_tunables_table
[] = {
95 { "/proc/sys", READONLY
, false },
96 { "/proc/sysrq-trigger", READONLY
, true },
97 { "/proc/latency_stats", READONLY
, true },
98 { "/proc/mtrr", READONLY
, true },
99 { "/proc/apm", READONLY
, true }, /* Obsolete API, there's no point in permitting access to this, ever */
100 { "/proc/acpi", READONLY
, true },
101 { "/proc/timer_stats", READONLY
, true },
102 { "/proc/asound", READONLY
, true },
103 { "/proc/bus", READONLY
, true },
104 { "/proc/fs", READONLY
, true },
105 { "/proc/irq", READONLY
, true },
106 { "/sys", READONLY
, false },
107 { "/sys/kernel/debug", READONLY
, true },
108 { "/sys/kernel/tracing", READONLY
, true },
109 { "/sys/fs/bpf", READONLY
, true },
110 { "/sys/fs/cgroup", READWRITE
, false }, /* READONLY is set by ProtectControlGroups= option */
111 { "/sys/fs/selinux", READWRITE
, true },
114 /* ProtectKernelModules= option */
115 static const MountEntry protect_kernel_modules_table
[] = {
117 { "/lib/modules", INACCESSIBLE
, true },
119 { "/usr/lib/modules", INACCESSIBLE
, true },
123 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
124 * system should be protected by ProtectSystem=
126 static const MountEntry protect_home_read_only_table
[] = {
127 { "/home", READONLY
, true },
128 { "/run/user", READONLY
, true },
129 { "/root", READONLY
, true },
132 /* ProtectHome=tmpfs table */
133 static const MountEntry protect_home_tmpfs_table
[] = {
134 { "/home", TMPFS
, true, .read_only
= true, .options_const
= "mode=0755", .flags
= MS_NODEV
|MS_STRICTATIME
},
135 { "/run/user", TMPFS
, true, .read_only
= true, .options_const
= "mode=0755", .flags
= MS_NODEV
|MS_STRICTATIME
},
136 { "/root", TMPFS
, true, .read_only
= true, .options_const
= "mode=0700", .flags
= MS_NODEV
|MS_STRICTATIME
},
139 /* ProtectHome=yes table */
140 static const MountEntry protect_home_yes_table
[] = {
141 { "/home", INACCESSIBLE
, true },
142 { "/run/user", INACCESSIBLE
, true },
143 { "/root", INACCESSIBLE
, true },
146 /* ProtectSystem=yes table */
147 static const MountEntry protect_system_yes_table
[] = {
148 { "/usr", READONLY
, false },
149 { "/boot", READONLY
, true },
150 { "/efi", READONLY
, true },
153 /* ProtectSystem=full includes ProtectSystem=yes */
154 static const MountEntry protect_system_full_table
[] = {
155 { "/usr", READONLY
, false },
156 { "/boot", READONLY
, true },
157 { "/efi", READONLY
, true },
158 { "/etc", READONLY
, false },
162 * ProtectSystem=strict table. In this strict mode, we mount everything
163 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
164 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
165 * protect those, and these options should be fully orthogonal.
166 * (And of course /home and friends are also left writable, as ProtectHome=
167 * shall manage those, orthogonally).
169 static const MountEntry protect_system_strict_table
[] = {
170 { "/", READONLY
, false },
171 { "/proc", READWRITE
, false }, /* ProtectKernelTunables= */
172 { "/sys", READWRITE
, false }, /* ProtectKernelTunables= */
173 { "/dev", READWRITE
, false }, /* PrivateDevices= */
174 { "/home", READWRITE
, true }, /* ProtectHome= */
175 { "/run/user", READWRITE
, true }, /* ProtectHome= */
176 { "/root", READWRITE
, true }, /* ProtectHome= */
179 static const char *mount_entry_path(const MountEntry
*p
) {
182 /* Returns the path of this mount entry. If the malloc()-allocated ->path_malloc field is set we return that,
183 * otherwise the stack/static ->path_const field is returned. */
185 return p
->path_malloc
?: p
->path_const
;
188 static bool mount_entry_read_only(const MountEntry
*p
) {
191 return p
->read_only
|| IN_SET(p
->mode
, READONLY
, INACCESSIBLE
);
194 static const char *mount_entry_source(const MountEntry
*p
) {
197 return p
->source_malloc
?: p
->source_const
;
200 static const char *mount_entry_options(const MountEntry
*p
) {
203 return p
->options_malloc
?: p
->options_const
;
206 static void mount_entry_done(MountEntry
*p
) {
209 p
->path_malloc
= mfree(p
->path_malloc
);
210 p
->source_malloc
= mfree(p
->source_malloc
);
211 p
->options_malloc
= mfree(p
->options_malloc
);
214 static int append_access_mounts(MountEntry
**p
, char **strv
, MountMode mode
, bool forcibly_require_prefix
) {
219 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
221 STRV_FOREACH(i
, strv
) {
222 bool ignore
= false, needs_prefix
= false;
225 /* Look for any prefixes */
226 if (startswith(e
, "-")) {
230 if (startswith(e
, "+")) {
235 if (!path_is_absolute(e
))
238 *((*p
)++) = (MountEntry
) {
242 .has_prefix
= !needs_prefix
&& !forcibly_require_prefix
,
249 static int append_empty_dir_mounts(MountEntry
**p
, char **strv
) {
254 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
255 * "/private/" boundary directories for DynamicUser=1. */
257 STRV_FOREACH(i
, strv
) {
259 *((*p
)++) = (MountEntry
) {
265 .options_const
= "mode=755",
266 .flags
= MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
,
273 static int append_bind_mounts(MountEntry
**p
, const BindMount
*binds
, unsigned n
) {
278 for (i
= 0; i
< n
; i
++) {
279 const BindMount
*b
= binds
+ i
;
281 *((*p
)++) = (MountEntry
) {
282 .path_const
= b
->destination
,
283 .mode
= b
->recursive
? BIND_MOUNT_RECURSIVE
: BIND_MOUNT
,
284 .read_only
= b
->read_only
,
285 .source_const
= b
->source
,
286 .ignore
= b
->ignore_enoent
,
293 static int append_tmpfs_mounts(MountEntry
**p
, const TemporaryFileSystem
*tmpfs
, unsigned n
) {
299 for (i
= 0; i
< n
; i
++) {
300 const TemporaryFileSystem
*t
= tmpfs
+ i
;
301 _cleanup_free_
char *o
= NULL
, *str
= NULL
;
302 unsigned long flags
= MS_NODEV
|MS_STRICTATIME
;
305 if (!path_is_absolute(t
->path
))
308 if (!isempty(t
->options
)) {
309 str
= strjoin("mode=0755,", t
->options
);
313 r
= mount_option_mangle(str
, MS_NODEV
|MS_STRICTATIME
, &flags
, &o
);
317 ro
= !!(flags
& MS_RDONLY
);
322 *((*p
)++) = (MountEntry
) {
323 .path_const
= t
->path
,
336 static int append_static_mounts(MountEntry
**p
, const MountEntry
*mounts
, unsigned n
, bool ignore_protect
) {
342 /* Adds a list of static pre-defined entries */
344 for (i
= 0; i
< n
; i
++)
345 *((*p
)++) = (MountEntry
) {
346 .path_const
= mount_entry_path(mounts
+i
),
347 .mode
= mounts
[i
].mode
,
348 .ignore
= mounts
[i
].ignore
|| ignore_protect
,
354 static int append_protect_home(MountEntry
**p
, ProtectHome protect_home
, bool ignore_protect
) {
357 switch (protect_home
) {
359 case PROTECT_HOME_NO
:
362 case PROTECT_HOME_READ_ONLY
:
363 return append_static_mounts(p
, protect_home_read_only_table
, ELEMENTSOF(protect_home_read_only_table
), ignore_protect
);
365 case PROTECT_HOME_TMPFS
:
366 return append_static_mounts(p
, protect_home_tmpfs_table
, ELEMENTSOF(protect_home_tmpfs_table
), ignore_protect
);
368 case PROTECT_HOME_YES
:
369 return append_static_mounts(p
, protect_home_yes_table
, ELEMENTSOF(protect_home_yes_table
), ignore_protect
);
372 assert_not_reached("Unexpected ProtectHome= value");
376 static int append_protect_system(MountEntry
**p
, ProtectSystem protect_system
, bool ignore_protect
) {
379 switch (protect_system
) {
381 case PROTECT_SYSTEM_NO
:
384 case PROTECT_SYSTEM_STRICT
:
385 return append_static_mounts(p
, protect_system_strict_table
, ELEMENTSOF(protect_system_strict_table
), ignore_protect
);
387 case PROTECT_SYSTEM_YES
:
388 return append_static_mounts(p
, protect_system_yes_table
, ELEMENTSOF(protect_system_yes_table
), ignore_protect
);
390 case PROTECT_SYSTEM_FULL
:
391 return append_static_mounts(p
, protect_system_full_table
, ELEMENTSOF(protect_system_full_table
), ignore_protect
);
394 assert_not_reached("Unexpected ProtectSystem= value");
398 static int mount_path_compare(const void *a
, const void *b
) {
399 const MountEntry
*p
= a
, *q
= b
;
402 /* If the paths are not equal, then order prefixes first */
403 d
= path_compare(mount_entry_path(p
), mount_entry_path(q
));
407 /* If the paths are equal, check the mode */
408 if (p
->mode
< q
->mode
)
411 if (p
->mode
> q
->mode
)
417 static int prefix_where_needed(MountEntry
*m
, unsigned n
, const char *root_directory
) {
420 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
426 for (i
= 0; i
< n
; i
++) {
432 s
= prefix_root(root_directory
, mount_entry_path(m
+i
));
436 free_and_replace(m
[i
].path_malloc
, s
);
437 m
[i
].has_prefix
= true;
443 static void drop_duplicates(MountEntry
*m
, unsigned *n
) {
444 MountEntry
*f
, *t
, *previous
;
449 /* Drops duplicate entries. Expects that the array is properly ordered already. */
451 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+ *n
; f
++) {
453 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
455 if (previous
&& path_equal(mount_entry_path(f
), mount_entry_path(previous
))) {
456 log_debug("%s is duplicate.", mount_entry_path(f
));
457 previous
->read_only
= previous
->read_only
|| mount_entry_read_only(f
); /* Propagate the read-only flag to the remaining entry */
470 static void drop_inaccessible(MountEntry
*m
, unsigned *n
) {
472 const char *clear
= NULL
;
477 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
478 * ordered already. */
480 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
482 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
483 * it, as inaccessible paths really should drop the entire subtree. */
484 if (clear
&& path_startswith(mount_entry_path(f
), clear
)) {
485 log_debug("%s is masked by %s.", mount_entry_path(f
), clear
);
490 clear
= f
->mode
== INACCESSIBLE
? mount_entry_path(f
) : NULL
;
499 static void drop_nop(MountEntry
*m
, unsigned *n
) {
505 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
506 * list is ordered by prefixes. */
508 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
510 /* Only suppress such subtrees for READONLY and READWRITE entries */
511 if (IN_SET(f
->mode
, READONLY
, READWRITE
)) {
515 /* Now let's find the first parent of the entry we are looking at. */
516 for (p
= t
-1; p
>= m
; p
--) {
517 if (path_startswith(mount_entry_path(f
), mount_entry_path(p
))) {
523 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
524 if (found
&& p
->mode
== f
->mode
) {
525 log_debug("%s is redundant by %s", mount_entry_path(f
), mount_entry_path(p
));
538 static void drop_outside_root(const char *root_directory
, MountEntry
*m
, unsigned *n
) {
548 /* Drops all mounts that are outside of the root directory. */
550 for (f
= m
, t
= m
; f
< m
+ *n
; f
++) {
552 if (!path_startswith(mount_entry_path(f
), root_directory
)) {
553 log_debug("%s is outside of root directory.", mount_entry_path(f
));
565 static int clone_device_node(const char *d
, const char *temporary_mount
) {
570 if (stat(d
, &st
) < 0) {
576 if (!S_ISBLK(st
.st_mode
) &&
577 !S_ISCHR(st
.st_mode
))
583 dn
= strjoina(temporary_mount
, d
);
585 mac_selinux_create_file_prepare(d
, st
.st_mode
);
586 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
587 mac_selinux_create_file_clear();
589 return log_debug_errno(errno
, "mknod failed for %s: %m", d
);
594 static int mount_private_dev(MountEntry
*m
) {
595 static const char devnodes
[] =
603 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
604 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
605 _cleanup_umask_ mode_t u
;
612 if (!mkdtemp(temporary_mount
))
615 dev
= strjoina(temporary_mount
, "/dev");
616 (void) mkdir(dev
, 0755);
617 if (mount("tmpfs", dev
, "tmpfs", DEV_MOUNT_OPTIONS
, "mode=755") < 0) {
622 devpts
= strjoina(temporary_mount
, "/dev/pts");
623 (void) mkdir(devpts
, 0755);
624 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
629 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
630 * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions, making it inaccessible;
631 * thus, in that case, make a clone.
633 * In nspawn and other containers it will be a symlink; in that case, make it a symlink.
635 r
= is_symlink("/dev/ptmx");
639 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
640 if (symlink("pts/ptmx", devptmx
) < 0) {
645 r
= clone_device_node("/dev/ptmx", temporary_mount
);
654 devshm
= strjoina(temporary_mount
, "/dev/shm");
655 (void) mkdir(devshm
, 0755);
656 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
662 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
663 (void) mkdir(devmqueue
, 0755);
664 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
666 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
667 (void) mkdir(devhugepages
, 0755);
668 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
670 devlog
= strjoina(temporary_mount
, "/dev/log");
671 (void) symlink("/run/systemd/journal/dev-log", devlog
);
673 NULSTR_FOREACH(d
, devnodes
) {
674 r
= clone_device_node(d
, temporary_mount
);
679 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
681 /* Create the /dev directory if missing. It is more likely to be
682 * missing when the service is started with RootDirectory. This is
683 * consistent with mount units creating the mount points when missing.
685 (void) mkdir_p_label(mount_entry_path(m
), 0755);
687 /* Unmount everything in old /dev */
688 umount_recursive(mount_entry_path(m
), 0);
689 if (mount(dev
, mount_entry_path(m
), NULL
, MS_MOVE
, NULL
) < 0) {
695 rmdir(temporary_mount
);
707 umount(devhugepages
);
714 rmdir(temporary_mount
);
719 static int mount_bind_dev(const MountEntry
*m
) {
724 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
725 * /dev. This is only used when RootDirectory= is set. */
727 (void) mkdir_p_label(mount_entry_path(m
), 0755);
729 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
731 return log_debug_errno(r
, "Unable to determine whether /dev is already mounted: %m");
732 if (r
> 0) /* make this a NOP if /dev is already a mount point */
735 if (mount("/dev", mount_entry_path(m
), NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
736 return log_debug_errno(errno
, "Failed to bind mount %s: %m", mount_entry_path(m
));
741 static int mount_sysfs(const MountEntry
*m
) {
746 (void) mkdir_p_label(mount_entry_path(m
), 0755);
748 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
750 return log_debug_errno(r
, "Unable to determine whether /sys is already mounted: %m");
751 if (r
> 0) /* make this a NOP if /sys is already a mount point */
754 /* Bind mount the host's version so that we get all child mounts of it, too. */
755 if (mount("/sys", mount_entry_path(m
), NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
756 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
761 static int mount_procfs(const MountEntry
*m
) {
766 (void) mkdir_p_label(mount_entry_path(m
), 0755);
768 r
= path_is_mount_point(mount_entry_path(m
), NULL
, 0);
770 return log_debug_errno(r
, "Unable to determine whether /proc is already mounted: %m");
771 if (r
> 0) /* make this a NOP if /proc is already a mount point */
774 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
775 if (mount("proc", mount_entry_path(m
), "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) < 0)
776 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
781 static int mount_tmpfs(const MountEntry
*m
) {
784 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
786 (void) mkdir_p_label(mount_entry_path(m
), 0755);
787 (void) umount_recursive(mount_entry_path(m
), 0);
789 if (mount("tmpfs", mount_entry_path(m
), "tmpfs", m
->flags
, mount_entry_options(m
)) < 0)
790 return log_debug_errno(errno
, "Failed to mount %s: %m", mount_entry_path(m
));
795 static int mount_entry_chase(
796 const char *root_directory
,
799 bool chase_nonexistent
,
807 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
808 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
809 * that applies). The result is stored in "location". */
811 r
= chase_symlinks(path
, root_directory
, chase_nonexistent
? CHASE_NONEXISTENT
: 0, &chased
);
812 if (r
== -ENOENT
&& m
->ignore
) {
813 log_debug_errno(r
, "Path %s does not exist, ignoring.", path
);
817 return log_debug_errno(r
, "Failed to follow symlinks on %s: %m", path
);
819 log_debug("Followed symlinks %s → %s.", path
, chased
);
827 static int apply_mount(
828 const char *root_directory
,
831 bool rbind
= true, make
= false;
837 r
= mount_entry_chase(root_directory
, m
, mount_entry_path(m
), !IN_SET(m
->mode
, INACCESSIBLE
, READONLY
, READWRITE
), &m
->path_malloc
);
841 log_debug("Applying namespace mount on %s", mount_entry_path(m
));
848 /* First, get rid of everything that is below if there
849 * is anything... Then, overmount it with an
850 * inaccessible path. */
851 (void) umount_recursive(mount_entry_path(m
), 0);
853 if (lstat(mount_entry_path(m
), &target
) < 0)
854 return log_debug_errno(errno
, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m
));
856 what
= mode_to_inaccessible_node(target
.st_mode
);
858 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
866 r
= path_is_mount_point(mount_entry_path(m
), root_directory
, 0);
868 return log_debug_errno(r
, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m
));
869 if (r
> 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
871 /* This isn't a mount point yet, let's make it one. */
872 what
= mount_entry_path(m
);
879 case BIND_MOUNT_RECURSIVE
:
880 /* Also chase the source mount */
882 r
= mount_entry_chase(root_directory
, m
, mount_entry_source(m
), false, &m
->source_malloc
);
886 what
= mount_entry_source(m
);
892 return mount_tmpfs(m
);
895 what
= mount_entry_source(m
);
900 return mount_private_dev(m
);
903 return mount_bind_dev(m
);
906 return mount_sysfs(m
);
909 return mount_procfs(m
);
912 assert_not_reached("Unknown mode");
917 if (mount(what
, mount_entry_path(m
), NULL
, MS_BIND
|(rbind
? MS_REC
: 0), NULL
) < 0) {
918 bool try_again
= false;
921 if (r
== -ENOENT
&& make
) {
924 /* Hmm, either the source or the destination is missing. Let's see if we can create the destination, then try again */
926 if (stat(what
, &st
) >= 0) {
928 (void) mkdir_parents(mount_entry_path(m
), 0755);
930 if (S_ISDIR(st
.st_mode
))
931 try_again
= mkdir(mount_entry_path(m
), 0755) >= 0;
933 try_again
= touch(mount_entry_path(m
)) >= 0;
938 if (mount(what
, mount_entry_path(m
), NULL
, MS_BIND
|(rbind
? MS_REC
: 0), NULL
) < 0)
945 return log_debug_errno(r
, "Failed to mount %s to %s: %m", what
, mount_entry_path(m
));
948 log_debug("Successfully mounted %s to %s", what
, mount_entry_path(m
));
952 static int make_read_only(const MountEntry
*m
, char **blacklist
, FILE *proc_self_mountinfo
) {
956 assert(proc_self_mountinfo
);
958 if (mount_entry_read_only(m
)) {
959 if (IN_SET(m
->mode
, EMPTY_DIR
, TMPFS
)) {
960 /* Make superblock readonly */
961 if (mount(NULL
, mount_entry_path(m
), NULL
, MS_REMOUNT
| MS_RDONLY
| m
->flags
, mount_entry_options(m
)) < 0)
964 r
= bind_remount_recursive_with_mountinfo(mount_entry_path(m
), true, blacklist
, proc_self_mountinfo
);
965 } else if (m
->mode
== PRIVATE_DEV
) {
966 /* Superblock can be readonly but the submounts can't */
967 if (mount(NULL
, mount_entry_path(m
), NULL
, MS_REMOUNT
|DEV_MOUNT_OPTIONS
|MS_RDONLY
, NULL
) < 0)
972 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
973 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
974 * read-only mounts already applied. */
976 if (r
== -ENOENT
&& m
->ignore
)
982 static bool namespace_info_mount_apivfs(const char *root_directory
, const NamespaceInfo
*ns_info
) {
986 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
987 * since to protect the API VFS mounts, they need to be around in the
988 * first place... and RootDirectory= or RootImage= need to be set.
991 /* root_directory should point to a mount point */
992 return root_directory
&&
993 (ns_info
->mount_apivfs
||
994 ns_info
->protect_control_groups
||
995 ns_info
->protect_kernel_tunables
);
998 static unsigned namespace_calculate_mounts(
999 const char* root_directory
,
1000 const NamespaceInfo
*ns_info
,
1001 char** read_write_paths
,
1002 char** read_only_paths
,
1003 char** inaccessible_paths
,
1004 char** empty_directories
,
1005 unsigned n_bind_mounts
,
1006 unsigned n_temporary_filesystems
,
1007 const char* tmp_dir
,
1008 const char* var_tmp_dir
,
1009 ProtectHome protect_home
,
1010 ProtectSystem protect_system
) {
1012 unsigned protect_home_cnt
;
1013 unsigned protect_system_cnt
=
1014 (protect_system
== PROTECT_SYSTEM_STRICT
?
1015 ELEMENTSOF(protect_system_strict_table
) :
1016 ((protect_system
== PROTECT_SYSTEM_FULL
) ?
1017 ELEMENTSOF(protect_system_full_table
) :
1018 ((protect_system
== PROTECT_SYSTEM_YES
) ?
1019 ELEMENTSOF(protect_system_yes_table
) : 0)));
1022 (protect_home
== PROTECT_HOME_YES
?
1023 ELEMENTSOF(protect_home_yes_table
) :
1024 ((protect_home
== PROTECT_HOME_READ_ONLY
) ?
1025 ELEMENTSOF(protect_home_read_only_table
) :
1026 ((protect_home
== PROTECT_HOME_TMPFS
) ?
1027 ELEMENTSOF(protect_home_tmpfs_table
) : 0)));
1029 return !!tmp_dir
+ !!var_tmp_dir
+
1030 strv_length(read_write_paths
) +
1031 strv_length(read_only_paths
) +
1032 strv_length(inaccessible_paths
) +
1033 strv_length(empty_directories
) +
1035 n_temporary_filesystems
+
1036 ns_info
->private_dev
+
1037 (ns_info
->protect_kernel_tunables
? ELEMENTSOF(protect_kernel_tunables_table
) : 0) +
1038 (ns_info
->protect_control_groups
? 1 : 0) +
1039 (ns_info
->protect_kernel_modules
? ELEMENTSOF(protect_kernel_modules_table
) : 0) +
1040 protect_home_cnt
+ protect_system_cnt
+
1041 (namespace_info_mount_apivfs(root_directory
, ns_info
) ? ELEMENTSOF(apivfs_table
) : 0);
1044 int setup_namespace(
1045 const char* root_directory
,
1046 const char* root_image
,
1047 const NamespaceInfo
*ns_info
,
1048 char** read_write_paths
,
1049 char** read_only_paths
,
1050 char** inaccessible_paths
,
1051 char** empty_directories
,
1052 const BindMount
*bind_mounts
,
1053 unsigned n_bind_mounts
,
1054 const TemporaryFileSystem
*temporary_filesystems
,
1055 unsigned n_temporary_filesystems
,
1056 const char* tmp_dir
,
1057 const char* var_tmp_dir
,
1058 ProtectHome protect_home
,
1059 ProtectSystem protect_system
,
1060 unsigned long mount_flags
,
1061 DissectImageFlags dissect_image_flags
) {
1063 _cleanup_(loop_device_unrefp
) LoopDevice
*loop_device
= NULL
;
1064 _cleanup_(decrypted_image_unrefp
) DecryptedImage
*decrypted_image
= NULL
;
1065 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
1066 _cleanup_free_
void *root_hash
= NULL
;
1067 MountEntry
*m
, *mounts
= NULL
;
1068 size_t root_hash_size
= 0;
1069 bool make_slave
= false;
1072 bool require_prefix
= false;
1077 if (mount_flags
== 0)
1078 mount_flags
= MS_SHARED
;
1081 dissect_image_flags
|= DISSECT_IMAGE_REQUIRE_ROOT
;
1083 if (protect_system
== PROTECT_SYSTEM_STRICT
&& strv_isempty(read_write_paths
))
1084 dissect_image_flags
|= DISSECT_IMAGE_READ_ONLY
;
1086 r
= loop_device_make_by_path(root_image
,
1087 dissect_image_flags
& DISSECT_IMAGE_READ_ONLY
? O_RDONLY
: O_RDWR
,
1092 r
= root_hash_load(root_image
, &root_hash
, &root_hash_size
);
1096 r
= dissect_image(loop_device
->fd
, root_hash
, root_hash_size
, dissect_image_flags
, &dissected_image
);
1100 r
= dissected_image_decrypt(dissected_image
, NULL
, root_hash
, root_hash_size
, dissect_image_flags
, &decrypted_image
);
1106 root
= root_directory
;
1107 else if (root_image
|| n_bind_mounts
> 0 || n_temporary_filesystems
> 0) {
1109 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1110 * the same mount point for all images, which is safe, since they all live in their own namespaces
1111 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1112 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1113 * while we are applying them. */
1115 root
= "/run/systemd/unit-root";
1116 (void) mkdir_label(root
, 0700);
1117 require_prefix
= true;
1121 n_mounts
= namespace_calculate_mounts(
1129 n_temporary_filesystems
,
1130 tmp_dir
, var_tmp_dir
,
1131 protect_home
, protect_system
);
1133 /* Set mount slave mode */
1134 if (root
|| n_mounts
> 0)
1138 m
= mounts
= (MountEntry
*) alloca0(n_mounts
* sizeof(MountEntry
));
1139 r
= append_access_mounts(&m
, read_write_paths
, READWRITE
, require_prefix
);
1143 r
= append_access_mounts(&m
, read_only_paths
, READONLY
, require_prefix
);
1147 r
= append_access_mounts(&m
, inaccessible_paths
, INACCESSIBLE
, require_prefix
);
1151 r
= append_empty_dir_mounts(&m
, empty_directories
);
1155 r
= append_bind_mounts(&m
, bind_mounts
, n_bind_mounts
);
1159 r
= append_tmpfs_mounts(&m
, temporary_filesystems
, n_temporary_filesystems
);
1164 *(m
++) = (MountEntry
) {
1165 .path_const
= "/tmp",
1166 .mode
= PRIVATE_TMP
,
1167 .source_const
= tmp_dir
,
1172 *(m
++) = (MountEntry
) {
1173 .path_const
= "/var/tmp",
1174 .mode
= PRIVATE_TMP
,
1175 .source_const
= var_tmp_dir
,
1179 if (ns_info
->private_dev
) {
1180 *(m
++) = (MountEntry
) {
1181 .path_const
= "/dev",
1182 .mode
= PRIVATE_DEV
,
1186 if (ns_info
->protect_kernel_tunables
) {
1187 r
= append_static_mounts(&m
, protect_kernel_tunables_table
, ELEMENTSOF(protect_kernel_tunables_table
), ns_info
->ignore_protect_paths
);
1192 if (ns_info
->protect_kernel_modules
) {
1193 r
= append_static_mounts(&m
, protect_kernel_modules_table
, ELEMENTSOF(protect_kernel_modules_table
), ns_info
->ignore_protect_paths
);
1198 if (ns_info
->protect_control_groups
) {
1199 *(m
++) = (MountEntry
) {
1200 .path_const
= "/sys/fs/cgroup",
1205 r
= append_protect_home(&m
, protect_home
, ns_info
->ignore_protect_paths
);
1209 r
= append_protect_system(&m
, protect_system
, false);
1213 if (namespace_info_mount_apivfs(root
, ns_info
)) {
1214 r
= append_static_mounts(&m
, apivfs_table
, ELEMENTSOF(apivfs_table
), ns_info
->ignore_protect_paths
);
1219 assert(mounts
+ n_mounts
== m
);
1221 /* Prepend the root directory where that's necessary */
1222 r
= prefix_where_needed(mounts
, n_mounts
, root
);
1226 qsort(mounts
, n_mounts
, sizeof(MountEntry
), mount_path_compare
);
1228 drop_duplicates(mounts
, &n_mounts
);
1229 drop_outside_root(root
, mounts
, &n_mounts
);
1230 drop_inaccessible(mounts
, &n_mounts
);
1231 drop_nop(mounts
, &n_mounts
);
1234 if (unshare(CLONE_NEWNS
) < 0) {
1240 /* Remount / as SLAVE so that nothing now mounted in the namespace
1241 shows up in the parent */
1242 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0) {
1249 /* A root image is specified, mount it to the right place */
1250 r
= dissected_image_mount(dissected_image
, root
, UID_INVALID
, dissect_image_flags
);
1254 if (decrypted_image
) {
1255 r
= decrypted_image_relinquish(decrypted_image
);
1260 loop_device_relinquish(loop_device
);
1262 } else if (root_directory
) {
1264 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1265 r
= path_is_mount_point(root
, NULL
, AT_SYMLINK_FOLLOW
);
1269 if (mount(root
, root
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1277 /* Let's mount the main root directory to the root directory to use */
1278 if (mount("/", root
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1284 /* Try to set up the new root directory before mounting anything else there. */
1285 if (root_image
|| root_directory
)
1286 (void) base_filesystem_create(root
, UID_INVALID
, GID_INVALID
);
1289 _cleanup_fclose_
FILE *proc_self_mountinfo
= NULL
;
1293 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1294 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1295 proc_self_mountinfo
= fopen("/proc/self/mountinfo", "re");
1296 if (!proc_self_mountinfo
) {
1301 /* First round, add in all special mounts we need */
1302 for (m
= mounts
; m
< mounts
+ n_mounts
; ++m
) {
1303 r
= apply_mount(root
, m
);
1308 /* Create a blacklist we can pass to bind_remount_recursive_with_mountinfo() (via make_read_only()) */
1309 blacklist
= newa(char*, n_mounts
+1);
1310 for (j
= 0; j
< n_mounts
; j
++)
1311 blacklist
[j
] = (char*) mount_entry_path(mounts
+j
);
1312 blacklist
[j
] = NULL
;
1314 /* Second round, flip the ro bits if necessary. */
1315 for (m
= mounts
; m
< mounts
+ n_mounts
; ++m
) {
1316 r
= make_read_only(m
, blacklist
, proc_self_mountinfo
);
1323 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1324 r
= mount_move_root(root
);
1329 /* Remount / as the desired mode. Note that this will not
1330 * reestablish propagation from our side to the host, since
1331 * what's disconnected is disconnected. */
1332 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0) {
1340 for (m
= mounts
; m
< mounts
+ n_mounts
; m
++)
1341 mount_entry_done(m
);
1346 void bind_mount_free_many(BindMount
*b
, unsigned n
) {
1349 assert(b
|| n
== 0);
1351 for (i
= 0; i
< n
; i
++) {
1353 free(b
[i
].destination
);
/* Appends one element, built from 'item', to the BindMount array at '*b'.
 *
 * The array is resized to hold '*n + 1' elements and '*n' is incremented as the
 * new element is written. The source and destination strings of 'item' are
 * duplicated with strdup(); the read_only, recursive and ignore_enoent flags are
 * copied by value.
 *
 * NOTE(review): the error paths and the return value are not visible in this
 * excerpt; confirm against the full function. */
1359 int bind_mount_add(BindMount
**b
, unsigned *n
, const BindMount
*item
) {
/* Private copies of the two strings; _cleanup_free_ presumably releases them
 * automatically when the function exits early, TODO confirm. */
1360 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
1367 s
= strdup(item
->source
);
1371 d
= strdup(item
->destination
);
/* Grow the array by one slot; the resized array is held in 'c'. */
1375 c
= realloc_multiply(*b
, sizeof(BindMount
), *n
+ 1);
/* Write the new element into the slot at index '*n', then bump the count. */
1381 c
[(*n
) ++] = (BindMount
) {
1384 .read_only
= item
->read_only
,
1385 .recursive
= item
->recursive
,
1386 .ignore_enoent
= item
->ignore_enoent
,
/* Walks the first 'n' elements of the TemporaryFileSystem array 't'.
 * 't' may be NULL only if 'n' is 0.
 *
 * NOTE(review): the per-element work inside the loop is not visible in this
 * excerpt; presumably it frees the strings of each entry, TODO confirm. */
1393 void temporary_filesystem_free_many(TemporaryFileSystem
*t
, unsigned n
) {
/* A NULL array is tolerated only when it is declared to be empty. */
1396 assert(t
|| n
== 0);
1398 for (i
= 0; i
< n
; i
++) {
/* Appends one element to the TemporaryFileSystem array at '*t'.
 *
 * 'options' is optional: it is duplicated with strdup() only when isempty()
 * reports it as non-empty. The array is resized to hold '*n + 1' elements and
 * '*n' is incremented as the new element is written.
 *
 * NOTE(review): not every parameter, nor the error handling and return value,
 * is visible in this excerpt; confirm against the full function. */
1406 int temporary_filesystem_add(
1407 TemporaryFileSystem
**t
,
1410 const char *options
) {
/* Private string copies; _cleanup_free_ presumably releases them automatically
 * on early exit, TODO confirm. */
1412 _cleanup_free_
char *p
= NULL
, *o
= NULL
;
1413 TemporaryFileSystem
*c
;
/* Only copy the options when there are any; otherwise 'o' stays NULL. */
1423 if (!isempty(options
)) {
1424 o
= strdup(options
);
/* Grow the array by one slot; the resized array is held in 'c'. */
1429 c
= realloc_multiply(*t
, sizeof(TemporaryFileSystem
), *n
+ 1);
/* Write the new element into the slot at index '*n', then bump the count. */
1435 c
[(*n
) ++] = (TemporaryFileSystem
) {
/* Prepares one private temporary directory below 'prefix'.
 *
 * The directory name is built as
 *     <prefix>/systemd-private-<boot id>-<id>-XXXXXX
 * and a "tmp" subdirectory is created inside it with mode 0777 plus the sticky
 * bit.
 *
 * NOTE(review): the creation of the outer directory, the error returns and the
 * assignment to '*path' are not visible in this excerpt. The "XXXXXX" suffix
 * suggests a mkdtemp()-style template, TODO confirm. */
1444 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
1445 _cleanup_free_
char *x
= NULL
;
/* Buffer for the textual form of the boot ID. */
1446 char bid
[SD_ID128_STRING_MAX
];
1454 /* We include the boot id in the directory so that after a
1455 * reboot we can easily identify obsolete directories. */
1457 r
= sd_id128_get_boot(&boot_id
);
/* Assemble the directory name template from prefix, boot ID and unit ID. */
1461 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX");
/* Presumably runs the following statement with umask 0077, i.e. with no
 * access for group/others, TODO confirm macro semantics. */
1465 RUN_WITH_UMASK(0077)
/* Presumably runs the following block with umask 0000 so that the mode passed
 * to mkdir() below is applied unmodified, TODO confirm macro semantics. */
1469 RUN_WITH_UMASK(0000) {
1472 y
= strjoina(x
, "/tmp");
/* Inner "tmp" directory: writable by all, with the sticky bit set. */
1474 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
/* Sets up the private temporary directories for 'id' by calling
 * setup_one_tmp_dir() once with the prefix "/tmp" and once with "/var/tmp".
 *
 * NOTE(review): how the results are handed back through 'tmp_dir' and
 * 'var_tmp_dir', and what happens when the second call fails, is not visible
 * in this excerpt. */
1484 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
1490 assert(var_tmp_dir
);
/* First directory, below /tmp; the result is stored in 'a'. */
1492 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
/* Second directory, below /var/tmp; the result is stored in 'b'. */
1496 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
/* Path of the "tmp" subdirectory inside the first directory; presumably used
 * for cleanup when the second call fails, TODO confirm. */
1500 t
= strjoina(a
, "/tmp");
/* Creates or joins a shared network namespace, using the socket pair
 * 'netns_storage_socket' to hold a file descriptor that references it.
 *
 * Under a lockf() lock on netns_storage_socket[0], one fd is read from the
 * socket without blocking:
 *   - -EAGAIN (nothing stored): a new network namespace is created with
 *     unshare(CLONE_NEWNET) and /proc/self/ns/net is opened to reference it;
 *   - any other negative value: handled as an error;
 *   - otherwise: the received fd is used to join the namespace with setns().
 * An fd is then sent into netns_storage_socket[1] and the lock is released.
 *
 * Both descriptors of the pair must be valid (>= 0).
 *
 * NOTE(review): the error paths and return values are not visible in this
 * excerpt. */
1514 int setup_netns(int netns_storage_socket
[2]) {
1515 _cleanup_close_
int netns
= -1;
1518 assert(netns_storage_socket
);
1519 assert(netns_storage_socket
[0] >= 0);
1520 assert(netns_storage_socket
[1] >= 0);
1522 /* We use the passed socketpair as a storage buffer for our
1523 * namespace reference fd. Whatever process runs this first
1524 * shall create a new namespace, all others should just join
1525 * it. To serialize that we use a file lock on the socket pair.
1528 * It's a bit crazy, but hey, works great! */
/* Take the lock that serializes all users of this socket pair. */
1530 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
/* Non-blocking read: -EAGAIN means no fd has been stored yet. */
1533 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
1534 if (netns
== -EAGAIN
) {
1535 /* Nothing stored yet, so let's create a new namespace */
1537 if (unshare(CLONE_NEWNET
) < 0) {
/* Obtain an fd referring to the namespace we are now in. */
1544 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
1552 } else if (netns
< 0) {
1557 /* Yay, found something, so let's join the namespace */
1558 if (setns(netns
, CLONE_NEWNET
) < 0) {
/* Put the namespace fd into the socket pair for the next caller. */
1566 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
/* Release the lock; the result is deliberately ignored. */
1573 (void) lockf(netns_storage_socket
[0], F_ULOCK
, 0);
/* Tells whether the given namespace type is available to this process.
 *
 * The type is translated to its name with namespace_type_to_string() and the
 * result is whether "/proc/self/ns/<name>" exists, tested with
 * access(..., F_OK). */
1577 bool ns_type_supported(NamespaceType type
) {
1578 const char *t
, *ns_proc
;
1580 t
= namespace_type_to_string(type
);
1581 if (!t
) /* Don't know how to translate this? Then it's not supported */
/* Build the path of the namespace entry and probe for its existence only. */
1584 ns_proc
= strjoina("/proc/self/ns/", t
);
1585 return access(ns_proc
, F_OK
) == 0;
/* String names of the ProtectHome values, indexed by the enum value. */
1588 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
1589 [PROTECT_HOME_NO
] = "no",
1590 [PROTECT_HOME_YES
] = "yes",
1591 [PROTECT_HOME_READ_ONLY
] = "read-only",
1592 [PROTECT_HOME_TMPFS
] = "tmpfs",
/* Presumably generates the string <-> ProtectHome conversion helpers (such as
 * protect_home_from_string()) from the table above; confirm against the macro
 * definition. */
1595 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
/* Parses 's' into a ProtectHome value, accepting boolean spellings as well.
 *
 * 's' is first passed to parse_boolean(); depending on that result the function
 * returns PROTECT_HOME_YES or PROTECT_HOME_NO, and otherwise it falls back to
 * protect_home_from_string().
 *
 * NOTE(review): the conditions guarding the two boolean returns are not visible
 * in this excerpt; presumably true maps to YES and false to NO, TODO confirm. */
1597 ProtectHome
parse_protect_home_or_bool(const char *s
) {
1600 r
= parse_boolean(s
);
1602 return PROTECT_HOME_YES
;
1604 return PROTECT_HOME_NO
;
/* Not handled as a boolean: look the string up among the ProtectHome names. */
1606 return protect_home_from_string(s
);
/* String names of the ProtectSystem values, indexed by the enum value. */
1609 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
1610 [PROTECT_SYSTEM_NO
] = "no",
1611 [PROTECT_SYSTEM_YES
] = "yes",
1612 [PROTECT_SYSTEM_FULL
] = "full",
1613 [PROTECT_SYSTEM_STRICT
] = "strict",
/* Presumably generates the string <-> ProtectSystem conversion helpers (such as
 * protect_system_from_string()) from the table above; confirm against the macro
 * definition. */
1616 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);
/* Parses 's' into a ProtectSystem value, accepting boolean spellings as well.
 *
 * 's' is first passed to parse_boolean(); depending on that result the function
 * returns PROTECT_SYSTEM_YES or PROTECT_SYSTEM_NO, and otherwise it falls back
 * to protect_system_from_string().
 *
 * NOTE(review): the conditions guarding the two boolean returns are not visible
 * in this excerpt; presumably true maps to YES and false to NO, TODO confirm. */
1618 ProtectSystem
parse_protect_system_or_bool(const char *s
) {
1621 r
= parse_boolean(s
);
1623 return PROTECT_SYSTEM_YES
;
1625 return PROTECT_SYSTEM_NO
;
/* Not handled as a boolean: look the string up among the ProtectSystem names. */
1627 return protect_system_from_string(s
);
/* Short names of the NamespaceType values, indexed by the enum value.
 * ns_type_supported() appends these names to "/proc/self/ns/" to form the path
 * it probes. */
1630 static const char* const namespace_type_table
[] = {
1631 [NAMESPACE_MOUNT
] = "mnt",
1632 [NAMESPACE_CGROUP
] = "cgroup",
1633 [NAMESPACE_UTS
] = "uts",
1634 [NAMESPACE_IPC
] = "ipc",
1635 [NAMESPACE_USER
] = "user",
1636 [NAMESPACE_PID
] = "pid",
1637 [NAMESPACE_NET
] = "net",
/* Presumably generates the string <-> NamespaceType conversion helpers (such as
 * namespace_type_to_string()) from the table above; confirm against the macro
 * definition. */
1640 DEFINE_STRING_TABLE_LOOKUP(namespace_type
, NamespaceType
);