1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
27 #include "alloc-util.h"
29 #include "cgroup-util.h"
30 #include "dev-setup.h"
39 #include "mount-setup.h"
40 #include "mount-util.h"
41 #include "path-util.h"
43 #include "smack-util.h"
45 #include "user-util.h"
/* Flag bits stored in the mode member of a mount table entry.
 *
 * NOTE(review): only two enumerators are visible in this excerpt. MNT_NONE
 * and MNT_FATAL are used further down in this file, but their definitions
 * (and the end of this typedef) are not shown here. */
49 typedef enum MountMode
{
/* Entries without this bit are tested against detect_container() in mount_one(). */
52 MNT_IN_CONTAINER
= 1 << 1,
/* Makes mount_one() check with access(W_OK) that the freshly mounted
 * directory is writable, and undo the mount if it is not. */
53 MNT_CHECK_WRITABLE
= 1 << 2,
/* Describes one file system to mount; the element type of mount_table.
 *
 * NOTE(review): only the condition_fn member is visible in this excerpt.
 * Code below additionally accesses the members where, type, flags and mode. */
56 typedef struct MountPoint
{
/* Optional predicate taking no arguments. mount_one() only calls it when it
 * is non-NULL and tests for a false result. */
62 bool (*condition_fn
)(void);
66 /* The first three entries we might need before SELinux is up. The
67 * fourth (securityfs) is needed by IMA to load a custom policy. The
68 * other ones we can delay until SELinux and IMA are loaded. When
69 * SMACK is enabled we need smackfs, too, so it's a fifth one. */
/* N_EARLY_MOUNT is the number of leading mount_table entries that
 * mount_setup_early() passes to mount_points_setup().
 *
 * NOTE(review): both definitions appear back to back here; the preprocessor
 * conditional that selects between 5 and 4 is not visible in this excerpt
 * (presumably keyed on SMACK support, per the comment above -- confirm). */
71 #define N_EARLY_MOUNT 5
73 #define N_EARLY_MOUNT 4
/* Table of the file systems this file mounts. mount_points_setup() walks it
 * in order, and mount_point_is_api() compares paths against each entry's
 * where member.
 *
 * Each initializer appears to be, in order:
 *   { source, where, type, options, mount flags, condition_fn, mode }
 * (presumably matching the member order of MountPoint -- TODO confirm, the
 * struct definition is only partly visible in this excerpt).
 *
 * Several mount points are listed more than once with different options or
 * condition functions (e.g. /dev/shm and /run with and without the
 * smackfsroot option, cgroup2 with and without "nsdelegate"). Presumably the
 * later entry acts as a fallback when the earlier one was skipped or failed
 * -- verify against mount_one().
 *
 * NOTE(review): the securityfs, pstore and bpf entries are cut off after
 * their flags in this excerpt, and the end of the table is not visible. */
76 static const MountPoint mount_table
[] = {
77 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
78 NULL
, MNT_FATAL
|MNT_IN_CONTAINER
},
79 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
80 NULL
, MNT_FATAL
|MNT_IN_CONTAINER
},
81 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
,
82 NULL
, MNT_FATAL
|MNT_IN_CONTAINER
},
83 { "securityfs", "/sys/kernel/security", "securityfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
/* NOTE(review): the remainder of the securityfs entry is not visible here. */
86 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
87 mac_smack_use
, MNT_FATAL
},
88 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
89 mac_smack_use
, MNT_FATAL
},
91 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
92 NULL
, MNT_FATAL
|MNT_IN_CONTAINER
},
93 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID
), MS_NOSUID
|MS_NOEXEC
,
94 NULL
, MNT_IN_CONTAINER
},
96 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
97 mac_smack_use
, MNT_FATAL
},
99 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
100 NULL
, MNT_FATAL
|MNT_IN_CONTAINER
},
/* cgroup hierarchy: which of the following entries apply is decided by the
 * cg_is_unified_wanted / cg_is_legacy_wanted / cg_is_hybrid_wanted predicates. */
101 { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
102 cg_is_unified_wanted
, MNT_IN_CONTAINER
|MNT_CHECK_WRITABLE
},
103 { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
104 cg_is_unified_wanted
, MNT_IN_CONTAINER
|MNT_CHECK_WRITABLE
},
105 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
,
106 cg_is_legacy_wanted
, MNT_FATAL
|MNT_IN_CONTAINER
},
107 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
108 cg_is_hybrid_wanted
, MNT_IN_CONTAINER
|MNT_CHECK_WRITABLE
},
109 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
110 cg_is_hybrid_wanted
, MNT_IN_CONTAINER
|MNT_CHECK_WRITABLE
},
111 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
112 cg_is_legacy_wanted
, MNT_IN_CONTAINER
},
113 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
114 cg_is_legacy_wanted
, MNT_FATAL
|MNT_IN_CONTAINER
},
115 { "pstore", "/sys/fs/pstore", "pstore", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
/* NOTE(review): the remainder of the pstore entry is not visible here. */
118 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
119 is_efi_boot
, MNT_NONE
},
121 { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
/* NOTE(review): the remainder of the bpf entry and the end of the table are
 * not visible in this excerpt. */
125 /* These are API file systems that might be mounted by other software,
126 * we just list them here so that we know that we should ignore them */
/* mount_point_ignore() iterates this list with NULSTR_FOREACH.
 *
 * NOTE(review): only the two group comments survive in this excerpt; the
 * path strings that make up the initializer are not visible. */
128 static const char ignore_paths
[] =
129 /* SELinux file systems */
131 /* Container bind mounts */
/* Tells whether path is one of the API mount points handled by this file.
 *
 * The path is compared with path_equal() against the where member of every
 * mount_table entry. If the loop does not return, the result is that of
 * path_startswith(path, "/sys/fs/cgroup/") converted to bool.
 *
 * NOTE(review): the declaration of i, the statement executed on a match and
 * the closing brace are not visible in this excerpt. */
136 bool mount_point_is_api(const char *path
) {
139 /* Checks if this mount point is considered "API", and hence
140 * should be ignored */
142 for (i
= 0; i
< ELEMENTSOF(mount_table
); i
++)
143 if (path_equal(path
, mount_table
[i
].where
))
/* Not an exact table match: fall back to a prefix test on the cgroup tree. */
146 return path_startswith(path
, "/sys/fs/cgroup/");
/* Tests path against every entry of ignore_paths, using path_equal() on each
 * string produced by NULSTR_FOREACH.
 *
 * NOTE(review): the declaration of i, the statement executed on a match, the
 * final return and the closing brace are not visible in this excerpt. */
149 bool mount_point_ignore(const char *path
) {
152 NULSTR_FOREACH(i
, ignore_paths
)
153 if (path_equal(path
, i
))
/* Mounts the single file system described by p.
 *
 * Visible sequence: evaluate p->condition_fn (if set), relabel p->where,
 * check whether p->where already is a mount point, test for a container when
 * the entry lacks MNT_IN_CONTAINER, create the directory, mount, relabel
 * again and, for MNT_CHECK_WRITABLE entries, verify that the directory is
 * writable (unmounting and removing it again otherwise).
 *
 * Failures are logged at LOG_ERR for MNT_FATAL entries and at LOG_DEBUG for
 * all others. Every visible error path returns the negative error code only
 * when MNT_FATAL is set and 0 otherwise.
 *
 * NOTE(review): many lines of this function are missing from this excerpt
 * (local declarations, the statements guarded by several of the if
 * conditions, the mount() call itself, the use of the relabel parameter, the
 * success return and the closing braces). The comments below describe only
 * what is visible. */
159 static int mount_one(const MountPoint
*p
, bool relabel
) {
/* Log level used for all failures of this entry. */
164 priority
= (p
->mode
& MNT_FATAL
) ? LOG_ERR
: LOG_DEBUG
;
/* A NULL condition_fn means the entry is unconditional. The statement taken
 * when the predicate returns false is not visible here. */
166 if (p
->condition_fn
&& !p
->condition_fn())
169 /* Relabel first, just in case */
171 (void) label_fix(p
->where
, true, true);
/* -ENOENT is excluded from the error branch below; any other negative
 * result is logged and returned for MNT_FATAL entries. */
173 r
= path_is_mount_point(p
->where
, NULL
, AT_SYMLINK_FOLLOW
);
174 if (r
< 0 && r
!= -ENOENT
) {
175 log_full_errno(priority
, r
, "Failed to determine whether %s is a mount point: %m", p
->where
);
176 return (p
->mode
& MNT_FATAL
) ? r
: 0;
181 /* Skip entries not marked MNT_IN_CONTAINER (e.g. securityfs) in a container */
182 if (!(p
->mode
& MNT_IN_CONTAINER
) && detect_container() > 0)
185 /* The access mode here doesn't really matter too much, since
186 * the mounted file system will take precedence anyway. */
/* NOTE(review): both mkdir variants appear here; the condition choosing
 * between them (presumably the relabel parameter) is not visible. */
188 (void) mkdir_p_label(p
->where
, 0755);
190 (void) mkdir_p(p
->where
, 0755);
192 log_debug("Mounting %s to %s of type %s with options %s.",
/* NOTE(review): errno is read a second time after log_full_errno(); this
 * relies on the logging call leaving errno unchanged -- confirm. */
203 log_full_errno(priority
, errno
, "Failed to mount %s at %s: %m", p
->type
, p
->where
);
204 return (p
->mode
& MNT_FATAL
) ? -errno
: 0;
207 /* Relabel again, since we now mounted something fresh here */
209 (void) label_fix(p
->where
, false, false);
211 if (p
->mode
& MNT_CHECK_WRITABLE
) {
212 if (access(p
->where
, W_OK
) < 0) {
/* Undo the mount on a best-effort basis (results deliberately ignored).
 * r is presumably set from errno on a line not visible here -- confirm. */
215 (void) umount(p
->where
);
216 (void) rmdir(p
->where
);
218 log_full_errno(priority
, r
, "Mount point %s not writable after mounting: %m", p
->where
);
219 return (p
->mode
& MNT_FATAL
) ? r
: 0;
/* Calls mount_one() on the first n entries of mount_table, in table order,
 * passing loaded_policy as the relabel argument of each call.
 *
 * NOTE(review): the local declarations, the statement guarded by
 * "j != 0 && r >= 0" and the final return are not visible in this excerpt.
 * Presumably the first non-zero result is kept in r and returned -- confirm. */
226 static int mount_points_setup(unsigned n
, bool loaded_policy
) {
230 for (i
= 0; i
< n
; i
++) {
233 j
= mount_one(mount_table
+ i
, loaded_policy
);
234 if (j
!= 0 && r
>= 0)
/* Mounts only the first N_EARLY_MOUNT entries of mount_table and returns the
 * result of mount_points_setup(). The relabel argument is passed as false.
 * The assert_cc() checks that N_EARLY_MOUNT does not exceed the number of
 * table entries. */
241 int mount_setup_early(void) {
242 assert_cc(N_EARLY_MOUNT
<= ELEMENTSOF(mount_table
));
244 /* Do a minimal mount of /proc and friends to enable the most
245 * basic stuff, such as SELinux */
246 return mount_points_setup(N_EARLY_MOUNT
, false);
/* Mounts the kernel's cgroup controllers below /sys/fs/cgroup/.
 *
 * join_controllers is walked as an array of string vectors terminated by a
 * NULL entry (the loop runs while *k is non-NULL); each vector names
 * controllers that share one mount. A default grouping of "cpu"+"cpuacct"
 * and "net_cls"+"net_prio" is assigned below; presumably this happens only
 * when the caller passed NULL (see has_argument) -- confirm, the guarding
 * condition is not visible.
 *
 * For each controller taken from the set returned by cg_kernel_controllers(),
 * the mount options are either the comma-joined group or the controller name
 * itself, the mount point is "/sys/fs/cgroup/" followed by those options, and
 * the mount is done through mount_one(). For a joined group, one symlink per
 * member name is then created, pointing at the joined directory.
 *
 * Visible error returns carry the negative errno-style value passed to
 * log_error_errno().
 *
 * NOTE(review): many lines are missing from this excerpt (the statement
 * guarded by the cg_is_legacy_wanted() check, loop headers, several error
 * checks, the declaration of p, k, i and j, and closing braces). */
249 int mount_cgroup_controllers(char ***join_controllers
) {
250 _cleanup_set_free_free_ Set
*controllers
= NULL
;
/* Records whether the caller supplied its own grouping. */
251 bool has_argument
= !!join_controllers
;
254 if (!cg_is_legacy_wanted())
257 /* Mount all available cgroup controllers that are built into the kernel. */
/* NOTE(review): the three lines below are the tail of a comment whose
 * opening and closing lines are missing from this excerpt. */
261 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
263 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
264 * work for groups with no initialized attributes.
266 join_controllers
= (char**[]) {
267 STRV_MAKE("cpu", "cpuacct"),
268 STRV_MAKE("net_cls", "net_prio"),
272 r
= cg_kernel_controllers(&controllers
);
274 return log_error_errno(r
, "Failed to enumerate cgroup controllers: %m");
277 _cleanup_free_
char *options
= NULL
, *controller
= NULL
, *where
= NULL
;
/* Designated initializers of a MountPoint built for this controller; the
 * remaining members of the initializer are not visible here. */
281 .flags
= MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
282 .mode
= MNT_IN_CONTAINER
,
286 controller
= set_steal_first(controllers
);
/* Look for the group that contains this controller. */
290 for (k
= join_controllers
; *k
; k
++)
291 if (strv_find(*k
, controller
))
/* Walk the members of the matching group; every member other than the
 * current controller is removed from the pending set. */
297 for (i
= *k
, j
= *k
; *i
; i
++) {
299 if (!streq(*i
, controller
)) {
300 _cleanup_free_
char *t
;
302 t
= set_remove(controllers
, *i
);
/* Joined group: the option string is the comma-separated member list. */
315 options
= strv_join(*k
, ",");
/* Otherwise the controller name itself serves as the option string. */
319 options
= controller
;
323 where
= strappend("/sys/fs/cgroup/", options
);
330 r
= mount_one(&p
, true);
/* Symlinks are only created when mount_one() returned a positive value and
 * a group was matched. */
334 if (r
> 0 && k
&& *k
) {
337 for (i
= *k
; *i
; i
++) {
338 _cleanup_free_
char *t
= NULL
;
340 t
= strappend("/sys/fs/cgroup/", *i
);
/* The link target is the relative name of the joined directory. */
344 r
= symlink(options
, t
);
346 #ifdef SMACK_RUN_LABEL
347 _cleanup_free_
char *src
;
348 src
= strappend("/sys/fs/cgroup/", options
);
/* -EOPNOTSUPP from the label copy is tolerated; other errors abort. */
351 r
= mac_smack_copy(t
, src
);
352 if (r
< 0 && r
!= -EOPNOTSUPP
)
353 return log_error_errno(r
, "Failed to copy smack label from %s to %s: %m", src
, t
);
/* An already existing symlink (EEXIST) is not treated as an error. */
355 } else if (errno
!= EEXIST
)
356 return log_error_errno(errno
, "Failed to create symlink %s: %m", t
);
361 /* Now that we mounted everything, let's make the tmpfs the
362 * cgroup file systems are mounted into read-only. */
363 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755");
/* File tree walk callback; mount_setup() passes nftw_cb to nftw() with
 * FTW_ACTIONRETVAL set. It is only built when HAVE_SELINUX or ENABLE_SMACK
 * is enabled.
 *
 * Visible behaviour: tests for the walk root (level 0), calls label_fix() on
 * fpath, and returns FTW_SKIP_SUBTREE for "/run/initramfs" at level 1.
 *
 * NOTE(review): the line carrying the function name and the fpath parameter,
 * the statement guarded by the level 0 test, the final return and the
 * closing brace are not visible in this excerpt. */
368 #if HAVE_SELINUX || ENABLE_SMACK
371 const struct stat
*sb
,
373 struct FTW
*ftwbuf
) {
375 /* No need to label /dev twice in a row... */
376 if (_unlikely_(ftwbuf
->level
== 0))
379 label_fix(fpath
, false, false);
381 /* /run/initramfs is static data and big, no need to
382 * dynamically relabel its contents at boot... */
383 if (_unlikely_(ftwbuf
->level
== 1 &&
385 streq(fpath
, "/run/initramfs")))
386 return FTW_SKIP_SUBTREE
;
/* Full mount setup: mounts every entry of mount_table, passing loaded_policy
 * on to mount_points_setup(). Under HAVE_SELINUX or ENABLE_SMACK it then
 * walks /dev, /dev/shm, /run and /sys/fs/cgroup with nftw_cb and logs how
 * long that took. Afterwards it calls dev_setup(), marks "/" as a recursively
 * shared mount when detect_container() <= 0, and creates /run/systemd,
 * /run/systemd/system and the /run/systemd/inaccessible/ nodes.
 *
 * NOTE(review): the local declarations, the checks on r, the conditions
 * around the relabel and cgroup remount sections, the #endif, the final
 * return and the closing braces are not visible in this excerpt. */
392 int mount_setup(bool loaded_policy
) {
395 r
= mount_points_setup(ELEMENTSOF(mount_table
), loaded_policy
);
399 #if HAVE_SELINUX || ENABLE_SMACK
400 /* Nodes in devtmpfs and /run need to be manually updated for
401 * the appropriate labels, after mounting. The other virtual
402 * API file systems like /sys and /proc do not need that, they
403 * use the same label for all their files. */
405 usec_t before_relabel
, after_relabel
;
406 char timespan
[FORMAT_TIMESPAN_MAX
];
408 before_relabel
= now(CLOCK_MONOTONIC
);
/* Return values of the tree walks are not checked. */
410 nftw("/dev", nftw_cb
, 64, FTW_MOUNT
|FTW_PHYS
|FTW_ACTIONRETVAL
);
411 nftw("/dev/shm", nftw_cb
, 64, FTW_MOUNT
|FTW_PHYS
|FTW_ACTIONRETVAL
);
412 nftw("/run", nftw_cb
, 64, FTW_MOUNT
|FTW_PHYS
|FTW_ACTIONRETVAL
);
414 /* Temporarily remount the root cgroup filesystem to give it a proper label. */
415 r
= cg_all_unified();
/* Remount writable, relabel the tree, then remount read-only again; the
 * condition on r guarding this sequence is not visible here. */
417 (void) mount(NULL
, "/sys/fs/cgroup", NULL
, MS_REMOUNT
, NULL
);
418 label_fix("/sys/fs/cgroup", false, false);
419 nftw("/sys/fs/cgroup", nftw_cb
, 64, FTW_MOUNT
|FTW_PHYS
|FTW_ACTIONRETVAL
);
420 (void) mount(NULL
, "/sys/fs/cgroup", NULL
, MS_REMOUNT
|MS_RDONLY
, NULL
);
422 return log_error_errno(r
, "Failed to determine whether we are in all unified mode: %m");
424 after_relabel
= now(CLOCK_MONOTONIC
);
426 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
427 format_timespan(timespan
, sizeof(timespan
), after_relabel
- before_relabel
, 0));
/* NOTE(review): the comment that follows has lost its closing line in this
 * excerpt (it stops mid-sentence), so as written it runs on over the
 * dev_setup() call until the end of the next comment. */
431 /* Create a few default symlinks, which are normally created
432 * by udevd, but some scripts might need them before we start
434 dev_setup(NULL
, UID_INVALID
, GID_INVALID
);
436 /* Mark the root directory as shared with regard to mount propagation. The kernel defaults to "private", but we
437 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
438 * the box. If specific setups need other settings they can reset the propagation mode to private if
439 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
440 * container manager we assume the container manager knows what it is doing (for example, because it set up
441 * some directories with different propagation modes). */
442 if (detect_container() <= 0)
443 if (mount(NULL
, "/", NULL
, MS_REC
|MS_SHARED
, NULL
) < 0)
444 log_warning_errno(errno
, "Failed to set up the root directory for shared mount propagation: %m");
446 /* Create a few directories we always want around. Note that sd_booted() checks for /run/systemd/system, so
447 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
448 * misdetect systemd. */
449 (void) mkdir_label("/run/systemd", 0755);
450 (void) mkdir_label("/run/systemd/system", 0755);
452 /* Set up inaccessible items */
/* One node of each file type, all created with mode 0000; failures are
 * deliberately ignored. */
453 (void) mkdir_label("/run/systemd/inaccessible", 0000);
454 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG
| 0000, 0);
455 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
456 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR
| 0000, makedev(0, 0));
457 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK
| 0000, makedev(0, 0));
458 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
459 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK
| 0000, 0);