]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/core/mount-setup.c
bpf-program: optionally take fd of program to detach
[thirdparty/systemd.git] / src / core / mount-setup.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1+ */
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <errno.h>
22#include <ftw.h>
23#include <stdlib.h>
24#include <sys/mount.h>
25#include <unistd.h>
26
27#include "alloc-util.h"
28#include "bus-util.h"
29#include "cgroup-util.h"
30#include "dev-setup.h"
31#include "efivars.h"
32#include "fileio.h"
33#include "fs-util.h"
34#include "label.h"
35#include "log.h"
36#include "macro.h"
37#include "missing.h"
38#include "mkdir.h"
39#include "mount-setup.h"
40#include "mount-util.h"
41#include "path-util.h"
42#include "set.h"
43#include "smack-util.h"
44#include "strv.h"
45#include "user-util.h"
46#include "util.h"
47#include "virt.h"
48
/* Per-entry behaviour flags for MountPoint.mode; combine with bitwise OR. */
typedef enum MountMode {
        MNT_NONE           = 0,      /* no special treatment */
        MNT_FATAL          = 1 << 0, /* failures are logged at error level and propagated to the caller */
        MNT_IN_CONTAINER   = 1 << 1, /* mount this also when running inside a container */
        MNT_CHECK_WRITABLE = 1 << 2, /* verify write access after mounting, undo the mount if it is missing */
} MountMode;
55
56typedef struct MountPoint {
57 const char *what;
58 const char *where;
59 const char *type;
60 const char *options;
61 unsigned long flags;
62 bool (*condition_fn)(void);
63 MountMode mode;
64} MountPoint;
65
/* Number of leading mount_table[] entries that mount_setup_early()
 * processes. The first three might be needed before SELinux is up,
 * the fourth (securityfs) is needed by IMA to load a custom policy.
 * When SMACK is enabled smackfs is needed that early as well, which
 * makes five. All remaining entries can be delayed until SELinux and
 * IMA are loaded. */
#if !ENABLE_SMACK
#define N_EARLY_MOUNT 4
#else
#define N_EARLY_MOUNT 5
#endif
75
76static const MountPoint mount_table[] = {
77 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
78 NULL, MNT_FATAL|MNT_IN_CONTAINER },
79 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
80 NULL, MNT_FATAL|MNT_IN_CONTAINER },
81 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
82 NULL, MNT_FATAL|MNT_IN_CONTAINER },
83 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
84 NULL, MNT_NONE },
85#if ENABLE_SMACK
86 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
87 mac_smack_use, MNT_FATAL },
88 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
89 mac_smack_use, MNT_FATAL },
90#endif
91 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
92 NULL, MNT_FATAL|MNT_IN_CONTAINER },
93 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
94 NULL, MNT_IN_CONTAINER },
95#if ENABLE_SMACK
96 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
97 mac_smack_use, MNT_FATAL },
98#endif
99 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
100 NULL, MNT_FATAL|MNT_IN_CONTAINER },
101 { "cgroup", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
102 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
103 { "cgroup", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
104 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
105 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
106 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
107 { "cgroup", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
108 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
109 { "cgroup", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
110 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
111 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
112 cg_is_legacy_wanted, MNT_IN_CONTAINER },
113 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
114 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
115 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
116 NULL, MNT_NONE },
117#if ENABLE_EFI
118 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
119 is_efi_boot, MNT_NONE },
120#endif
121};
122
/* Paths that other software (or a container manager) might have mounted.
 * They are only listed here so that mount_point_ignore() can tell callers
 * to leave them alone. The list is a sequence of NUL-terminated strings,
 * ended by an empty string. */

static const char ignore_paths[] =
        "/sys/fs/selinux\0"     /* SELinux file system */
        "/proc/sys\0"           /* container bind mount */
        "/dev/console\0"        /* container bind mount */
        "/proc/kmsg\0";         /* container bind mount */
133
134bool mount_point_is_api(const char *path) {
135 unsigned i;
136
137 /* Checks if this mount point is considered "API", and hence
138 * should be ignored */
139
140 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
141 if (path_equal(path, mount_table[i].where))
142 return true;
143
144 return path_startswith(path, "/sys/fs/cgroup/");
145}
146
147bool mount_point_ignore(const char *path) {
148 const char *i;
149
150 NULSTR_FOREACH(i, ignore_paths)
151 if (path_equal(path, i))
152 return true;
153
154 return false;
155}
156
157static int mount_one(const MountPoint *p, bool relabel) {
158 int r, priority;
159
160 assert(p);
161
162 priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
163
164 if (p->condition_fn && !p->condition_fn())
165 return 0;
166
167 /* Relabel first, just in case */
168 if (relabel)
169 (void) label_fix(p->where, true, true);
170
171 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
172 if (r < 0 && r != -ENOENT) {
173 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
174 return (p->mode & MNT_FATAL) ? r : 0;
175 }
176 if (r > 0)
177 return 0;
178
179 /* Skip securityfs in a container */
180 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
181 return 0;
182
183 /* The access mode here doesn't really matter too much, since
184 * the mounted file system will take precedence anyway. */
185 if (relabel)
186 (void) mkdir_p_label(p->where, 0755);
187 else
188 (void) mkdir_p(p->where, 0755);
189
190 log_debug("Mounting %s to %s of type %s with options %s.",
191 p->what,
192 p->where,
193 p->type,
194 strna(p->options));
195
196 if (mount(p->what,
197 p->where,
198 p->type,
199 p->flags,
200 p->options) < 0) {
201 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
202 return (p->mode & MNT_FATAL) ? -errno : 0;
203 }
204
205 /* Relabel again, since we now mounted something fresh here */
206 if (relabel)
207 (void) label_fix(p->where, false, false);
208
209 if (p->mode & MNT_CHECK_WRITABLE) {
210 if (access(p->where, W_OK) < 0) {
211 r = -errno;
212
213 (void) umount(p->where);
214 (void) rmdir(p->where);
215
216 log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
217 return (p->mode & MNT_FATAL) ? r : 0;
218 }
219 }
220
221 return 1;
222}
223
224static int mount_points_setup(unsigned n, bool loaded_policy) {
225 unsigned i;
226 int r = 0;
227
228 for (i = 0; i < n; i ++) {
229 int j;
230
231 j = mount_one(mount_table + i, loaded_policy);
232 if (j != 0 && r >= 0)
233 r = j;
234 }
235
236 return r;
237}
238
239int mount_setup_early(void) {
240 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
241
242 /* Do a minimal mount of /proc and friends to enable the most
243 * basic stuff, such as SELinux */
244 return mount_points_setup(N_EARLY_MOUNT, false);
245}
246
247int mount_cgroup_controllers(char ***join_controllers) {
248 _cleanup_set_free_free_ Set *controllers = NULL;
249 int r;
250
251 if (!cg_is_legacy_wanted())
252 return 0;
253
254 /* Mount all available cgroup controllers that are built into the kernel. */
255
256 if (!join_controllers)
257 /* The defaults:
258 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
259 *
260 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
261 * work for groups with no initialized attributes.
262 */
263 join_controllers = (char**[]) {
264 STRV_MAKE("cpu", "cpuacct"),
265 STRV_MAKE("net_cls", "net_prio"),
266 NULL,
267 };
268
269 r = cg_kernel_controllers(&controllers);
270 if (r < 0)
271 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
272
273 for (;;) {
274 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
275 MountPoint p = {
276 .what = "cgroup",
277 .type = "cgroup",
278 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
279 .mode = MNT_IN_CONTAINER,
280 };
281 char ***k = NULL;
282
283 controller = set_steal_first(controllers);
284 if (!controller)
285 break;
286
287 for (k = join_controllers; *k; k++)
288 if (strv_find(*k, controller))
289 break;
290
291 if (k && *k) {
292 char **i, **j;
293
294 for (i = *k, j = *k; *i; i++) {
295
296 if (!streq(*i, controller)) {
297 _cleanup_free_ char *t;
298
299 t = set_remove(controllers, *i);
300 if (!t) {
301 free(*i);
302 continue;
303 }
304 }
305
306 *(j++) = *i;
307 }
308
309 *j = NULL;
310
311 options = strv_join(*k, ",");
312 if (!options)
313 return log_oom();
314 } else {
315 options = controller;
316 controller = NULL;
317 }
318
319 where = strappend("/sys/fs/cgroup/", options);
320 if (!where)
321 return log_oom();
322
323 p.where = where;
324 p.options = options;
325
326 r = mount_one(&p, true);
327 if (r < 0)
328 return r;
329
330 if (r > 0 && k && *k) {
331 char **i;
332
333 for (i = *k; *i; i++) {
334 _cleanup_free_ char *t = NULL;
335
336 t = strappend("/sys/fs/cgroup/", *i);
337 if (!t)
338 return log_oom();
339
340 r = symlink(options, t);
341 if (r >= 0) {
342#ifdef SMACK_RUN_LABEL
343 _cleanup_free_ char *src;
344 src = strappend("/sys/fs/cgroup/", options);
345 if (!src)
346 return log_oom();
347 r = mac_smack_copy(t, src);
348 if (r < 0 && r != -EOPNOTSUPP)
349 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
350#endif
351 } else if (errno != EEXIST)
352 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
353 }
354 }
355 }
356
357 /* Now that we mounted everything, let's make the tmpfs the
358 * cgroup file systems are mounted into read-only. */
359 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
360
361 return 0;
362}
363
#if HAVE_SELINUX || ENABLE_SMACK
/* nftw() callback that passes every visited path below the walk root to
 * label_fix(). The walk root itself (level 0) is left alone, and the
 * contents of /run/initramfs are skipped; this relies on nftw() being
 * called with FTW_ACTIONRETVAL. Always lets the walk go on. */
static int nftw_cb(
                const char *fpath,
                const struct stat *sb,
                int tflag,
                struct FTW *ftwbuf) {

        /* No need to label /dev twice in a row... */
        if (_unlikely_(ftwbuf->level == 0))
                return FTW_CONTINUE;

        /* Best effort: a failure to label one node must not stop the walk */
        (void) label_fix(fpath, false, false);

        /* /run/initramfs is static data and big, no need to
         * dynamically relabel its contents at boot... */
        if (_unlikely_(ftwbuf->level == 1 &&
                       tflag == FTW_D &&
                       streq(fpath, "/run/initramfs")))
                return FTW_SKIP_SUBTREE;

        return FTW_CONTINUE;
}
#endif
387
388int mount_setup(bool loaded_policy) {
389 int r = 0;
390
391 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
392 if (r < 0)
393 return r;
394
395#if HAVE_SELINUX || ENABLE_SMACK
396 /* Nodes in devtmpfs and /run need to be manually updated for
397 * the appropriate labels, after mounting. The other virtual
398 * API file systems like /sys and /proc do not need that, they
399 * use the same label for all their files. */
400 if (loaded_policy) {
401 usec_t before_relabel, after_relabel;
402 char timespan[FORMAT_TIMESPAN_MAX];
403
404 before_relabel = now(CLOCK_MONOTONIC);
405
406 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
407 nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
408 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
409
410 /* Temporarily remount the root cgroup filesystem to give it a proper label. */
411 r = cg_all_unified();
412 if (r == 0) {
413 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
414 label_fix("/sys/fs/cgroup", false, false);
415 nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
416 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
417 } else if (r < 0)
418 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
419
420 after_relabel = now(CLOCK_MONOTONIC);
421
422 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
423 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
424 }
425#endif
426
427 /* Create a few default symlinks, which are normally created
428 * by udevd, but some scripts might need them before we start
429 * udevd. */
430 dev_setup(NULL, UID_INVALID, GID_INVALID);
431
432 /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
433 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
434 * the box. If specific setups need other settings they can reset the propagation mode to private if
435 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
436 * container manager we assume the container manager knows what it is doing (for example, because it set up
437 * some directories with different propagation modes). */
438 if (detect_container() <= 0)
439 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
440 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
441
442 /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
443 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
444 * misdetect systemd. */
445 (void) mkdir_label("/run/systemd", 0755);
446 (void) mkdir_label("/run/systemd/system", 0755);
447
448 /* Set up inaccessible items */
449 (void) mkdir_label("/run/systemd/inaccessible", 0000);
450 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
451 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
452 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
453 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
454 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
455 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);
456
457 return 0;
458}