]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/mount-setup.c
Merge pull request #8149 from poettering/fake-root-cgroup
[thirdparty/systemd.git] / src / core / mount-setup.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <ftw.h>
23 #include <stdlib.h>
24 #include <sys/mount.h>
25 #include <unistd.h>
26
27 #include "alloc-util.h"
28 #include "bus-util.h"
29 #include "cgroup-util.h"
30 #include "dev-setup.h"
31 #include "efivars.h"
32 #include "fileio.h"
33 #include "fs-util.h"
34 #include "label.h"
35 #include "log.h"
36 #include "macro.h"
37 #include "missing.h"
38 #include "mkdir.h"
39 #include "mount-setup.h"
40 #include "mount-util.h"
41 #include "path-util.h"
42 #include "set.h"
43 #include "smack-util.h"
44 #include "strv.h"
45 #include "user-util.h"
46 #include "util.h"
47 #include "virt.h"
48
/* Per-entry behaviour flags consumed by mount_one(). */
typedef enum MountMode {
        /* No special handling. */
        MNT_NONE           = 0,
        /* A failure is logged at LOG_ERR and propagated to the caller
         * instead of being logged at LOG_DEBUG and swallowed. */
        MNT_FATAL          = 1 << 0,
        /* The entry is also mounted when running inside a container;
         * entries without this flag are skipped there. */
        MNT_IN_CONTAINER   = 1 << 1,
        /* After mounting, verify the mount point is writable and undo
         * the mount if it is not. */
        MNT_CHECK_WRITABLE = 1 << 2,
} MountMode;
55
56 typedef struct MountPoint {
57 const char *what;
58 const char *where;
59 const char *type;
60 const char *options;
61 unsigned long flags;
62 bool (*condition_fn)(void);
63 MountMode mode;
64 } MountPoint;
65
/* Number of leading mount_table[] entries that mount_setup_early()
 * establishes. The first three may be required before SELinux is up,
 * and the fourth (securityfs) is what IMA needs in order to load a
 * custom policy. Everything after that can wait until SELinux and IMA
 * are loaded. With SMACK enabled smackfs is needed early as well,
 * which makes it five. */
#if ENABLE_SMACK
#define N_EARLY_MOUNT 5
#else
#define N_EARLY_MOUNT 4
#endif
75
76 static const MountPoint mount_table[] = {
77 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
78 NULL, MNT_FATAL|MNT_IN_CONTAINER },
79 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
80 NULL, MNT_FATAL|MNT_IN_CONTAINER },
81 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
82 NULL, MNT_FATAL|MNT_IN_CONTAINER },
83 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
84 NULL, MNT_NONE },
85 #if ENABLE_SMACK
86 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
87 mac_smack_use, MNT_FATAL },
88 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
89 mac_smack_use, MNT_FATAL },
90 #endif
91 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
92 NULL, MNT_FATAL|MNT_IN_CONTAINER },
93 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
94 NULL, MNT_IN_CONTAINER },
95 #if ENABLE_SMACK
96 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
97 mac_smack_use, MNT_FATAL },
98 #endif
99 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
100 NULL, MNT_FATAL|MNT_IN_CONTAINER },
101 { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
102 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
103 { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
104 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
105 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
106 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
107 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
108 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
109 { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
110 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
111 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
112 cg_is_legacy_wanted, MNT_IN_CONTAINER },
113 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
114 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
115 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
116 NULL, MNT_NONE },
117 #if ENABLE_EFI
118 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
119 is_efi_boot, MNT_NONE },
120 #endif
121 { "bpf", "/sys/fs/bpf", "bpf", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
122 NULL, MNT_NONE, },
123 };
124
/* API file systems that other software may mount. They are only listed
 * so that mount_point_ignore() can recognise them. The list is stored as
 * a sequence of NUL-terminated strings, ended by an empty string. */
static const char ignore_paths[] =
        /* SELinux file systems */
        "/sys/fs/selinux\0"

        /* Container bind mounts */
        "/proc/sys\0"
        "/dev/console\0"
        "/proc/kmsg\0";
135
136 bool mount_point_is_api(const char *path) {
137 unsigned i;
138
139 /* Checks if this mount point is considered "API", and hence
140 * should be ignored */
141
142 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
143 if (path_equal(path, mount_table[i].where))
144 return true;
145
146 return path_startswith(path, "/sys/fs/cgroup/");
147 }
148
149 bool mount_point_ignore(const char *path) {
150 const char *i;
151
152 NULSTR_FOREACH(i, ignore_paths)
153 if (path_equal(path, i))
154 return true;
155
156 return false;
157 }
158
159 static int mount_one(const MountPoint *p, bool relabel) {
160 int r, priority;
161
162 assert(p);
163
164 priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
165
166 if (p->condition_fn && !p->condition_fn())
167 return 0;
168
169 /* Relabel first, just in case */
170 if (relabel)
171 (void) label_fix(p->where, true, true);
172
173 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
174 if (r < 0 && r != -ENOENT) {
175 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
176 return (p->mode & MNT_FATAL) ? r : 0;
177 }
178 if (r > 0)
179 return 0;
180
181 /* Skip securityfs in a container */
182 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
183 return 0;
184
185 /* The access mode here doesn't really matter too much, since
186 * the mounted file system will take precedence anyway. */
187 if (relabel)
188 (void) mkdir_p_label(p->where, 0755);
189 else
190 (void) mkdir_p(p->where, 0755);
191
192 log_debug("Mounting %s to %s of type %s with options %s.",
193 p->what,
194 p->where,
195 p->type,
196 strna(p->options));
197
198 if (mount(p->what,
199 p->where,
200 p->type,
201 p->flags,
202 p->options) < 0) {
203 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
204 return (p->mode & MNT_FATAL) ? -errno : 0;
205 }
206
207 /* Relabel again, since we now mounted something fresh here */
208 if (relabel)
209 (void) label_fix(p->where, false, false);
210
211 if (p->mode & MNT_CHECK_WRITABLE) {
212 if (access(p->where, W_OK) < 0) {
213 r = -errno;
214
215 (void) umount(p->where);
216 (void) rmdir(p->where);
217
218 log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
219 return (p->mode & MNT_FATAL) ? r : 0;
220 }
221 }
222
223 return 1;
224 }
225
226 static int mount_points_setup(unsigned n, bool loaded_policy) {
227 unsigned i;
228 int r = 0;
229
230 for (i = 0; i < n; i ++) {
231 int j;
232
233 j = mount_one(mount_table + i, loaded_policy);
234 if (j != 0 && r >= 0)
235 r = j;
236 }
237
238 return r;
239 }
240
241 int mount_setup_early(void) {
242 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
243
244 /* Do a minimal mount of /proc and friends to enable the most
245 * basic stuff, such as SELinux */
246 return mount_points_setup(N_EARLY_MOUNT, false);
247 }
248
249 int mount_cgroup_controllers(char ***join_controllers) {
250 _cleanup_set_free_free_ Set *controllers = NULL;
251 int r;
252
253 if (!cg_is_legacy_wanted())
254 return 0;
255
256 /* Mount all available cgroup controllers that are built into the kernel. */
257
258 if (!join_controllers)
259 /* The defaults:
260 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
261 *
262 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
263 * work for groups with no initialized attributes.
264 */
265 join_controllers = (char**[]) {
266 STRV_MAKE("cpu", "cpuacct"),
267 STRV_MAKE("net_cls", "net_prio"),
268 NULL,
269 };
270
271 r = cg_kernel_controllers(&controllers);
272 if (r < 0)
273 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
274
275 for (;;) {
276 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
277 MountPoint p = {
278 .what = "cgroup",
279 .type = "cgroup",
280 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
281 .mode = MNT_IN_CONTAINER,
282 };
283 char ***k = NULL;
284
285 controller = set_steal_first(controllers);
286 if (!controller)
287 break;
288
289 for (k = join_controllers; *k; k++)
290 if (strv_find(*k, controller))
291 break;
292
293 if (k && *k) {
294 char **i, **j;
295
296 for (i = *k, j = *k; *i; i++) {
297
298 if (!streq(*i, controller)) {
299 _cleanup_free_ char *t;
300
301 t = set_remove(controllers, *i);
302 if (!t) {
303 free(*i);
304 continue;
305 }
306 }
307
308 *(j++) = *i;
309 }
310
311 *j = NULL;
312
313 options = strv_join(*k, ",");
314 if (!options)
315 return log_oom();
316 } else {
317 options = controller;
318 controller = NULL;
319 }
320
321 where = strappend("/sys/fs/cgroup/", options);
322 if (!where)
323 return log_oom();
324
325 p.where = where;
326 p.options = options;
327
328 r = mount_one(&p, true);
329 if (r < 0)
330 return r;
331
332 if (r > 0 && k && *k) {
333 char **i;
334
335 for (i = *k; *i; i++) {
336 _cleanup_free_ char *t = NULL;
337
338 t = strappend("/sys/fs/cgroup/", *i);
339 if (!t)
340 return log_oom();
341
342 r = symlink(options, t);
343 if (r >= 0) {
344 #ifdef SMACK_RUN_LABEL
345 _cleanup_free_ char *src;
346 src = strappend("/sys/fs/cgroup/", options);
347 if (!src)
348 return log_oom();
349 r = mac_smack_copy(t, src);
350 if (r < 0 && r != -EOPNOTSUPP)
351 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
352 #endif
353 } else if (errno != EEXIST)
354 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
355 }
356 }
357 }
358
359 /* Now that we mounted everything, let's make the tmpfs the
360 * cgroup file systems are mounted into read-only. */
361 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
362
363 return 0;
364 }
365
#if HAVE_SELINUX || ENABLE_SMACK
/* nftw() callback that relabels every visited entry except the root of
 * the walk, and does not descend into /run/initramfs. The return values
 * are FTW_* action codes, so the walk has to be started with
 * FTW_ACTIONRETVAL. The sb argument is not used. */
static int nftw_cb(
                const char *fpath,
                const struct stat *sb,
                int tflag,
                struct FTW *ftwbuf) {

        /* No need to label /dev twice in a row... */
        if (_unlikely_(ftwbuf->level == 0))
                return FTW_CONTINUE;

        /* Best effort, a failure to relabel one entry does not stop the walk */
        (void) label_fix(fpath, false, false);

        /* /run/initramfs is static data and big, no need to
         * dynamically relabel its contents at boot... */
        if (_unlikely_(ftwbuf->level == 1 &&
                       tflag == FTW_D &&
                       streq(fpath, "/run/initramfs")))
                return FTW_SKIP_SUBTREE;

        return FTW_CONTINUE;
}
#endif
389
390 int mount_setup(bool loaded_policy) {
391 int r = 0;
392
393 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
394 if (r < 0)
395 return r;
396
397 #if HAVE_SELINUX || ENABLE_SMACK
398 /* Nodes in devtmpfs and /run need to be manually updated for
399 * the appropriate labels, after mounting. The other virtual
400 * API file systems like /sys and /proc do not need that, they
401 * use the same label for all their files. */
402 if (loaded_policy) {
403 usec_t before_relabel, after_relabel;
404 char timespan[FORMAT_TIMESPAN_MAX];
405
406 before_relabel = now(CLOCK_MONOTONIC);
407
408 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
409 nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
410 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
411
412 /* Temporarily remount the root cgroup filesystem to give it a proper label. */
413 r = cg_all_unified();
414 if (r == 0) {
415 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
416 label_fix("/sys/fs/cgroup", false, false);
417 nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
418 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
419 } else if (r < 0)
420 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
421
422 after_relabel = now(CLOCK_MONOTONIC);
423
424 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
425 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
426 }
427 #endif
428
429 /* Create a few default symlinks, which are normally created
430 * by udevd, but some scripts might need them before we start
431 * udevd. */
432 dev_setup(NULL, UID_INVALID, GID_INVALID);
433
434 /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
435 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
436 * the box. If specific setups need other settings they can reset the propagation mode to private if
437 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
438 * container manager we assume the container manager knows what it is doing (for example, because it set up
439 * some directories with different propagation modes). */
440 if (detect_container() <= 0)
441 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
442 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
443
444 /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
445 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
446 * misdetect systemd. */
447 (void) mkdir_label("/run/systemd", 0755);
448 (void) mkdir_label("/run/systemd/system", 0755);
449
450 /* Set up inaccessible items */
451 (void) mkdir_label("/run/systemd/inaccessible", 0000);
452 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
453 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
454 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
455 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
456 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
457 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);
458
459 return 0;
460 }