]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/mount-setup.c
df1ab1e21589ab4b9abdca67b96e3ae55824d87a
[thirdparty/systemd.git] / src / core / mount-setup.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 Copyright 2010 Lennart Poettering
4 ***/
5
6 #include <errno.h>
7 #include <ftw.h>
8 #include <stdlib.h>
9 #include <sys/mount.h>
10 #include <sys/statvfs.h>
11 #include <unistd.h>
12
13 #include "alloc-util.h"
14 #include "bus-util.h"
15 #include "cgroup-util.h"
16 #include "dev-setup.h"
17 #include "efivars.h"
18 #include "fileio.h"
19 #include "fs-util.h"
20 #include "label.h"
21 #include "log.h"
22 #include "macro.h"
23 #include "missing.h"
24 #include "mkdir.h"
25 #include "mount-setup.h"
26 #include "mount-util.h"
27 #include "path-util.h"
28 #include "set.h"
29 #include "smack-util.h"
30 #include "strv.h"
31 #include "user-util.h"
32 #include "util.h"
33 #include "virt.h"
34
/* Per-entry behaviour flags evaluated by mount_one(). */
typedef enum MountMode {
        MNT_NONE           = 0,      /* no special handling */
        MNT_FATAL          = 1 << 0, /* failures are logged at LOG_ERR and returned to the caller instead of being swallowed */
        MNT_IN_CONTAINER   = 1 << 1, /* entry is also mounted when detect_container() reports a container */
        MNT_CHECK_WRITABLE = 1 << 2, /* after mounting, verify W_OK access; unmount again if not writable */
} MountMode;
41
/* Description of a single file system to mount. The first five fields are handed
 * unchanged to mount(2) by mount_one(). */
typedef struct MountPoint {
        const char *what;           /* mount source */
        const char *where;          /* mount target directory */
        const char *type;           /* file system type */
        const char *options;        /* file system specific option string, may be NULL */
        unsigned long flags;        /* MS_* mount flags */
        bool (*condition_fn)(void); /* optional; when set and it returns false the entry is skipped */
        MountMode mode;             /* MNT_* behaviour flags */
} MountPoint;
51
/* The first three entries we might need before SELinux is up. The
 * fourth (securityfs) is needed by IMA to load a custom policy. The
 * other ones we can delay until SELinux and IMA are loaded. When
 * SMACK is enabled we need smackfs, too, so it's a fifth one.
 *
 * N_EARLY_MOUNT is the length of the mount_table[] prefix that
 * mount_setup_early() processes, so it has to be kept in sync with
 * the order of the entries in that table. */
#if ENABLE_SMACK
#define N_EARLY_MOUNT 5
#else
#define N_EARLY_MOUNT 4
#endif
61
/* Table of API file systems, processed in order by mount_points_setup().
 *
 * Columns: what, where, type, options, flags, condition_fn, mode.
 *
 * The order matters: the first N_EARLY_MOUNT entries are the ones mounted by
 * mount_setup_early(). Several mount points are listed more than once, with the
 * preferred variant first and a fallback after it. Since mount_one() does nothing
 * when the target already is a mount point, a later entry for the same path is
 * only attempted if no earlier one got mounted. */
static const MountPoint mount_table[] = {
        { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL, MNT_FATAL|MNT_IN_CONTAINER },
        { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL, MNT_FATAL|MNT_IN_CONTAINER },
        { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
          NULL, MNT_FATAL|MNT_IN_CONTAINER },
        { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL, MNT_NONE },
#if ENABLE_SMACK
        { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
          mac_smack_use, MNT_FATAL },
        /* SMACK variant of /dev/shm; the plain entry below is used when this one is skipped */
        { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          mac_smack_use, MNT_FATAL },
#endif
        { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          NULL, MNT_FATAL|MNT_IN_CONTAINER },
        { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
          NULL, MNT_IN_CONTAINER },
#if ENABLE_SMACK
        /* SMACK variant of /run; the plain entry below is used when this one is skipped */
        { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          mac_smack_use, MNT_FATAL },
#endif
        { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          NULL, MNT_FATAL|MNT_IN_CONTAINER },
        /* Unified hierarchy: try with "nsdelegate" first, then without it */
        { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        /* Legacy hierarchy: a tmpfs that the individual controller mounts go into */
        { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
        /* Hybrid hierarchy: try with "nsdelegate" first, then without it */
        { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        /* Named "systemd" hierarchy: the "xattr" attempt is non-fatal, the fallback without it is fatal */
        { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_legacy_wanted, MNT_IN_CONTAINER },
        { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
        { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL, MNT_NONE },
#if ENABLE_EFI
        { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
          is_efi_boot, MNT_NONE },
#endif
        { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL, MNT_NONE, },
};
110
/* These are API file systems that might be mounted by other software,
 * we just list them here so that we know that we should ignore them.
 *
 * The list is stored as a sequence of NUL-terminated strings packed into a
 * single array; mount_point_ignore() walks it with NULSTR_FOREACH(). */

static const char ignore_paths[] =
        /* SELinux file systems */
        "/sys/fs/selinux\0"
        /* Container bind mounts */
        "/proc/sys\0"
        "/dev/console\0"
        "/proc/kmsg\0";
121
122 bool mount_point_is_api(const char *path) {
123 unsigned i;
124
125 /* Checks if this mount point is considered "API", and hence
126 * should be ignored */
127
128 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
129 if (path_equal(path, mount_table[i].where))
130 return true;
131
132 return path_startswith(path, "/sys/fs/cgroup/");
133 }
134
135 bool mount_point_ignore(const char *path) {
136 const char *i;
137
138 NULSTR_FOREACH(i, ignore_paths)
139 if (path_equal(path, i))
140 return true;
141
142 return false;
143 }
144
145 static int mount_one(const MountPoint *p, bool relabel) {
146 int r, priority;
147
148 assert(p);
149
150 priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
151
152 if (p->condition_fn && !p->condition_fn())
153 return 0;
154
155 /* Relabel first, just in case */
156 if (relabel)
157 (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS);
158
159 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
160 if (r < 0 && r != -ENOENT) {
161 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
162 return (p->mode & MNT_FATAL) ? r : 0;
163 }
164 if (r > 0)
165 return 0;
166
167 /* Skip securityfs in a container */
168 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
169 return 0;
170
171 /* The access mode here doesn't really matter too much, since
172 * the mounted file system will take precedence anyway. */
173 if (relabel)
174 (void) mkdir_p_label(p->where, 0755);
175 else
176 (void) mkdir_p(p->where, 0755);
177
178 log_debug("Mounting %s to %s of type %s with options %s.",
179 p->what,
180 p->where,
181 p->type,
182 strna(p->options));
183
184 if (mount(p->what,
185 p->where,
186 p->type,
187 p->flags,
188 p->options) < 0) {
189 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
190 return (p->mode & MNT_FATAL) ? -errno : 0;
191 }
192
193 /* Relabel again, since we now mounted something fresh here */
194 if (relabel)
195 (void) label_fix(p->where, 0);
196
197 if (p->mode & MNT_CHECK_WRITABLE) {
198 if (access(p->where, W_OK) < 0) {
199 r = -errno;
200
201 (void) umount(p->where);
202 (void) rmdir(p->where);
203
204 log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
205 return (p->mode & MNT_FATAL) ? r : 0;
206 }
207 }
208
209 return 1;
210 }
211
212 static int mount_points_setup(unsigned n, bool loaded_policy) {
213 unsigned i;
214 int r = 0;
215
216 for (i = 0; i < n; i ++) {
217 int j;
218
219 j = mount_one(mount_table + i, loaded_policy);
220 if (j != 0 && r >= 0)
221 r = j;
222 }
223
224 return r;
225 }
226
227 int mount_setup_early(void) {
228 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
229
230 /* Do a minimal mount of /proc and friends to enable the most
231 * basic stuff, such as SELinux */
232 return mount_points_setup(N_EARLY_MOUNT, false);
233 }
234
235 int mount_cgroup_controllers(char ***join_controllers) {
236 _cleanup_set_free_free_ Set *controllers = NULL;
237 bool has_argument = !!join_controllers;
238 int r;
239
240 if (!cg_is_legacy_wanted())
241 return 0;
242
243 /* Mount all available cgroup controllers that are built into the kernel. */
244
245 if (!has_argument)
246 /* The defaults:
247 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
248 *
249 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
250 * work for groups with no initialized attributes.
251 */
252 join_controllers = (char**[]) {
253 STRV_MAKE("cpu", "cpuacct"),
254 STRV_MAKE("net_cls", "net_prio"),
255 NULL,
256 };
257
258 r = cg_kernel_controllers(&controllers);
259 if (r < 0)
260 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
261
262 for (;;) {
263 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
264 MountPoint p = {
265 .what = "cgroup",
266 .type = "cgroup",
267 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
268 .mode = MNT_IN_CONTAINER,
269 };
270 char ***k = NULL;
271
272 controller = set_steal_first(controllers);
273 if (!controller)
274 break;
275
276 for (k = join_controllers; *k; k++)
277 if (strv_find(*k, controller))
278 break;
279
280 if (*k) {
281 char **i, **j;
282
283 for (i = *k, j = *k; *i; i++) {
284
285 if (!streq(*i, controller)) {
286 _cleanup_free_ char *t;
287
288 t = set_remove(controllers, *i);
289 if (!t) {
290 if (has_argument)
291 free(*i);
292 continue;
293 }
294 }
295
296 *(j++) = *i;
297 }
298
299 *j = NULL;
300
301 options = strv_join(*k, ",");
302 if (!options)
303 return log_oom();
304 } else
305 options = TAKE_PTR(controller);
306
307 where = strappend("/sys/fs/cgroup/", options);
308 if (!where)
309 return log_oom();
310
311 p.where = where;
312 p.options = options;
313
314 r = mount_one(&p, true);
315 if (r < 0)
316 return r;
317
318 if (r > 0 && *k) {
319 char **i;
320
321 for (i = *k; *i; i++) {
322 _cleanup_free_ char *t = NULL;
323
324 t = strappend("/sys/fs/cgroup/", *i);
325 if (!t)
326 return log_oom();
327
328 r = symlink(options, t);
329 if (r >= 0) {
330 #ifdef SMACK_RUN_LABEL
331 _cleanup_free_ char *src;
332 src = strappend("/sys/fs/cgroup/", options);
333 if (!src)
334 return log_oom();
335 r = mac_smack_copy(t, src);
336 if (r < 0 && r != -EOPNOTSUPP)
337 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
338 #endif
339 } else if (errno != EEXIST)
340 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
341 }
342 }
343 }
344
345 /* Now that we mounted everything, let's make the tmpfs the
346 * cgroup file systems are mounted into read-only. */
347 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
348
349 return 0;
350 }
351
352 #if HAVE_SELINUX || ENABLE_SMACK
/* nftw() callback used for relabelling a file system tree: fixes the label of every
 * visited entry except the root of the walk.
 *
 * fpath:  path of the entry currently visited.
 * sb:     stat data of the entry (unused).
 * tflag:  nftw() type flag of the entry.
 * ftwbuf: nftw() walk state; only the nesting level is looked at.
 *
 * Returns FTW_SKIP_SUBTREE for the /run/initramfs directory and FTW_CONTINUE for
 * everything else, hence it has to be used with FTW_ACTIONRETVAL. */
static int nftw_cb(
                const char *fpath,
                const struct stat *sb,
                int tflag,
                struct FTW *ftwbuf) {

        /* No need to label /dev twice in a row... */
        if (_unlikely_(ftwbuf->level == 0))
                return FTW_CONTINUE;

        (void) label_fix(fpath, 0);

        /* /run/initramfs is static data and big, no need to
         * dynamically relabel its contents at boot... */
        if (_unlikely_(ftwbuf->level == 1 &&
                       tflag == FTW_D &&
                       streq(fpath, "/run/initramfs")))
                return FTW_SKIP_SUBTREE;

        return FTW_CONTINUE;
}
374
375 static int relabel_cgroup_filesystems(void) {
376 int r;
377 struct statfs st;
378
379 r = cg_all_unified();
380 if (r == 0) {
381 /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this
382 only when the filesystem has been already populated by a previous instance of systemd
383 running from initrd. Otherwise don't remount anything and leave the filesystem read-write
384 for the cgroup filesystems to be mounted inside. */
385 if (statfs("/sys/fs/cgroup", &st) < 0)
386 return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m");
387
388 if (st.f_flags & ST_RDONLY)
389 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
390
391 (void) label_fix("/sys/fs/cgroup", 0);
392 (void) nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
393
394 if (st.f_flags & ST_RDONLY)
395 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
396
397 } else if (r < 0)
398 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
399
400 return 0;
401 }
402 #endif
403
404 int mount_setup(bool loaded_policy) {
405 int r = 0;
406
407 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
408 if (r < 0)
409 return r;
410
411 #if HAVE_SELINUX || ENABLE_SMACK
412 /* Nodes in devtmpfs and /run need to be manually updated for
413 * the appropriate labels, after mounting. The other virtual
414 * API file systems like /sys and /proc do not need that, they
415 * use the same label for all their files. */
416 if (loaded_policy) {
417 usec_t before_relabel, after_relabel;
418 char timespan[FORMAT_TIMESPAN_MAX];
419
420 before_relabel = now(CLOCK_MONOTONIC);
421
422 (void) nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
423 (void) nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
424 (void) nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
425
426 r = relabel_cgroup_filesystems();
427 if (r < 0)
428 return r;
429
430 after_relabel = now(CLOCK_MONOTONIC);
431
432 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
433 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
434 }
435 #endif
436
437 /* Create a few default symlinks, which are normally created
438 * by udevd, but some scripts might need them before we start
439 * udevd. */
440 dev_setup(NULL, UID_INVALID, GID_INVALID);
441
442 /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
443 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
444 * the box. If specific setups need other settings they can reset the propagation mode to private if
445 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
446 * container manager we assume the container manager knows what it is doing (for example, because it set up
447 * some directories with different propagation modes). */
448 if (detect_container() <= 0)
449 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
450 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
451
452 /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
453 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
454 * misdetect systemd. */
455 (void) mkdir_label("/run/systemd", 0755);
456 (void) mkdir_label("/run/systemd/system", 0755);
457
458 /* Set up inaccessible (and empty) file nodes of all types */
459 (void) mkdir_label("/run/systemd/inaccessible", 0000);
460 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
461 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
462 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
463 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);
464
465 /* The following two are likely to fail if we lack the privs for it (for example in an userns environment, if
466 * CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0 device nodes to be
467 * created). But that's entirely fine. Consumers of these files should carry fallback to use a different node
468 * then, for example /run/systemd/inaccessible/sock, which is close enough in behaviour and semantics for most
469 * uses. */
470 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
471 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
472
473 return 0;
474 }