]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/mount-setup.c
bpf-program: optionally take fd of program to detach
[thirdparty/systemd.git] / src / core / mount-setup.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
8e274523
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
8e274523 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
8e274523 21#include <errno.h>
cf0fbc49 22#include <ftw.h>
8e274523 23#include <stdlib.h>
cf0fbc49 24#include <sys/mount.h>
5c0532d1 25#include <unistd.h>
8e274523 26
b5efdb8a 27#include "alloc-util.h"
64824462 28#include "bus-util.h"
4349cd7c
LP
29#include "cgroup-util.h"
30#include "dev-setup.h"
31#include "efivars.h"
e07aefbd 32#include "fileio.h"
c4b41707 33#include "fs-util.h"
4349cd7c 34#include "label.h"
8e274523 35#include "log.h"
c9af1080 36#include "macro.h"
4349cd7c 37#include "missing.h"
49e942b2 38#include "mkdir.h"
4349cd7c
LP
39#include "mount-setup.h"
40#include "mount-util.h"
9eb977db 41#include "path-util.h"
4349cd7c 42#include "set.h"
8552b176 43#include "smack-util.h"
4349cd7c 44#include "strv.h"
ee104e11 45#include "user-util.h"
4349cd7c
LP
46#include "util.h"
47#include "virt.h"
bef2733f 48
/* Per-entry flags that steer how mount_one() treats a mount table entry. */
typedef enum MountMode {
        /* No special treatment: failures are logged at debug level and ignored,
         * and the entry is skipped when running in a container. */
        MNT_NONE           = 0x0,
        /* Failures are logged at error level and returned to the caller. */
        MNT_FATAL          = 0x1,
        /* The entry is also mounted when running inside a container. */
        MNT_IN_CONTAINER   = 0x2,
        /* After mounting, the mount point must be writable; if it is not, the
         * mount is undone again. */
        MNT_CHECK_WRITABLE = 0x4,
} MountMode;
55
ca714c0e
LP
56typedef struct MountPoint {
57 const char *what;
58 const char *where;
59 const char *type;
60 const char *options;
61 unsigned long flags;
6aa220e0
KS
62 bool (*condition_fn)(void);
63 MountMode mode;
ca714c0e
LP
64} MountPoint;
65
/* Number of leading mount_table[] entries that are set up by
 * mount_setup_early(). The first three might be needed before SELinux is up,
 * and the fourth (securityfs) is needed by IMA to load a custom policy. With
 * SMACK enabled smackfs is needed as well, which makes it five. All remaining
 * entries can be delayed until SELinux and IMA are loaded. */
#if ENABLE_SMACK
#  define N_EARLY_MOUNT 5
#else
#  define N_EARLY_MOUNT 4
#endif
4ef31082 75
ca714c0e 76static const MountPoint mount_table[] = {
68d4c452
LP
77 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
78 NULL, MNT_FATAL|MNT_IN_CONTAINER },
79 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
80 NULL, MNT_FATAL|MNT_IN_CONTAINER },
81 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
82 NULL, MNT_FATAL|MNT_IN_CONTAINER },
83 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
84 NULL, MNT_NONE },
f9fa32f0 85#if ENABLE_SMACK
68d4c452
LP
86 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
87 mac_smack_use, MNT_FATAL },
88 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
89 mac_smack_use, MNT_FATAL },
d407c940 90#endif
68d4c452
LP
91 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
92 NULL, MNT_FATAL|MNT_IN_CONTAINER },
93 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
94 NULL, MNT_IN_CONTAINER },
f9fa32f0 95#if ENABLE_SMACK
68d4c452
LP
96 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
97 mac_smack_use, MNT_FATAL },
d407c940 98#endif
68d4c452
LP
99 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
100 NULL, MNT_FATAL|MNT_IN_CONTAINER },
4095205e 101 { "cgroup", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
2d56b80a 102 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
09961995 103 { "cgroup", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
2d56b80a 104 cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
68d4c452 105 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
efdb0237 106 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
4095205e 107 { "cgroup", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
e07aefbd 108 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
2977724b 109 { "cgroup", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
e07aefbd 110 cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
68d4c452 111 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
2977724b 112 cg_is_legacy_wanted, MNT_IN_CONTAINER },
68d4c452 113 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
2977724b 114 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
68d4c452
LP
115 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
116 NULL, MNT_NONE },
349cc4a5 117#if ENABLE_EFI
68d4c452
LP
118 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
119 is_efi_boot, MNT_NONE },
c06bf414 120#endif
63cc4c31
DM
121};
122
/* API file systems that other software might mount. They are only listed here
 * so that we know to ignore them. The list is a sequence of NUL-separated
 * paths, ended by the string literal's own terminating NUL. */
static const char ignore_paths[] =
        /* SELinux file systems */
        "/sys/fs/selinux\0"

        /* Container bind mounts */
        "/proc/sys\0"
        "/dev/console\0"
        "/proc/kmsg\0";
949c6510 133
dad08730
LP
134bool mount_point_is_api(const char *path) {
135 unsigned i;
136
137 /* Checks if this mount point is considered "API", and hence
138 * should be ignored */
139
ca714c0e 140 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
449ddb2d 141 if (path_equal(path, mount_table[i].where))
dad08730
LP
142 return true;
143
57f2a956
KS
144 return path_startswith(path, "/sys/fs/cgroup/");
145}
146
147bool mount_point_ignore(const char *path) {
eaeb18db 148 const char *i;
57f2a956 149
eaeb18db
LP
150 NULSTR_FOREACH(i, ignore_paths)
151 if (path_equal(path, i))
949c6510
LP
152 return true;
153
57f2a956 154 return false;
dad08730
LP
155}
156
4ef31082 157static int mount_one(const MountPoint *p, bool relabel) {
713a8875 158 int r, priority;
8e274523 159
ca714c0e 160 assert(p);
8e274523 161
713a8875
LP
162 priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
163
6aa220e0
KS
164 if (p->condition_fn && !p->condition_fn())
165 return 0;
166
51b4af2c 167 /* Relabel first, just in case */
4ef31082 168 if (relabel)
1411b094 169 (void) label_fix(p->where, true, true);
51b4af2c 170
e1873695 171 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
1411b094 172 if (r < 0 && r != -ENOENT) {
713a8875 173 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
1411b094
LP
174 return (p->mode & MNT_FATAL) ? r : 0;
175 }
8e274523 176 if (r > 0)
51b4af2c 177 return 0;
8e274523 178
c481f78b 179 /* Skip securityfs in a container */
75f86906 180 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
c481f78b
LP
181 return 0;
182
a04f58d6
LP
183 /* The access mode here doesn't really matter too much, since
184 * the mounted file system will take precedence anyway. */
c4bfd169 185 if (relabel)
1411b094 186 (void) mkdir_p_label(p->where, 0755);
c4bfd169 187 else
1411b094 188 (void) mkdir_p(p->where, 0755);
a04f58d6 189
8e274523 190 log_debug("Mounting %s to %s of type %s with options %s.",
ca714c0e
LP
191 p->what,
192 p->where,
193 p->type,
194 strna(p->options));
195
196 if (mount(p->what,
197 p->where,
198 p->type,
199 p->flags,
200 p->options) < 0) {
713a8875 201 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
6aa220e0 202 return (p->mode & MNT_FATAL) ? -errno : 0;
8e274523
LP
203 }
204
51b4af2c 205 /* Relabel again, since we now mounted something fresh here */
4ef31082 206 if (relabel)
1411b094 207 (void) label_fix(p->where, false, false);
5275d3c1 208
e07aefbd 209 if (p->mode & MNT_CHECK_WRITABLE) {
713a8875
LP
210 if (access(p->where, W_OK) < 0) {
211 r = -errno;
212
e07aefbd 213 (void) umount(p->where);
1ff654e2 214 (void) rmdir(p->where);
713a8875
LP
215
216 log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
e07aefbd
CB
217 return (p->mode & MNT_FATAL) ? r : 0;
218 }
219 }
220
0c85a4f3 221 return 1;
8e274523
LP
222}
223
400fac06 224static int mount_points_setup(unsigned n, bool loaded_policy) {
4ef31082
LP
225 unsigned i;
226 int r = 0;
227
400fac06 228 for (i = 0; i < n; i ++) {
4ef31082
LP
229 int j;
230
400fac06 231 j = mount_one(mount_table + i, loaded_policy);
7ff307bc 232 if (j != 0 && r >= 0)
4ef31082
LP
233 r = j;
234 }
235
236 return r;
237}
238
400fac06
AK
239int mount_setup_early(void) {
240 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
241
242 /* Do a minimal mount of /proc and friends to enable the most
243 * basic stuff, such as SELinux */
244 return mount_points_setup(N_EARLY_MOUNT, false);
245}
246
0c85a4f3 247int mount_cgroup_controllers(char ***join_controllers) {
a6b26d90 248 _cleanup_set_free_free_ Set *controllers = NULL;
a641dcd9 249 int r;
2076ca54 250
efdb0237
LP
251 if (!cg_is_legacy_wanted())
252 return 0;
253
670802d4 254 /* Mount all available cgroup controllers that are built into the kernel. */
2076ca54 255
56c8d744
ZJS
256 if (!join_controllers)
257 /* The defaults:
258 * mount "cpu" + "cpuacct" together, and "net_cls" + "net_prio".
259 *
260 * We'd like to add "cpuset" to the mix, but "cpuset" doesn't really
261 * work for groups with no initialized attributes.
262 */
263 join_controllers = (char**[]) {
264 STRV_MAKE("cpu", "cpuacct"),
265 STRV_MAKE("net_cls", "net_prio"),
266 NULL,
267 };
268
6925a0de 269 r = cg_kernel_controllers(&controllers);
b12afc8c
LP
270 if (r < 0)
271 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
0c85a4f3
LP
272
273 for (;;) {
a641dcd9 274 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
a6b26d90
ZJS
275 MountPoint p = {
276 .what = "cgroup",
277 .type = "cgroup",
278 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
279 .mode = MNT_IN_CONTAINER,
280 };
0c85a4f3
LP
281 char ***k = NULL;
282
283 controller = set_steal_first(controllers);
284 if (!controller)
285 break;
286
56c8d744
ZJS
287 for (k = join_controllers; *k; k++)
288 if (strv_find(*k, controller))
289 break;
0c85a4f3
LP
290
291 if (k && *k) {
292 char **i, **j;
293
294 for (i = *k, j = *k; *i; i++) {
295
296 if (!streq(*i, controller)) {
a641dcd9 297 _cleanup_free_ char *t;
0c85a4f3
LP
298
299 t = set_remove(controllers, *i);
300 if (!t) {
301 free(*i);
302 continue;
303 }
0c85a4f3
LP
304 }
305
306 *(j++) = *i;
307 }
308
309 *j = NULL;
310
311 options = strv_join(*k, ",");
a6b26d90
ZJS
312 if (!options)
313 return log_oom();
0c85a4f3
LP
314 } else {
315 options = controller;
316 controller = NULL;
317 }
318
a641dcd9
LP
319 where = strappend("/sys/fs/cgroup/", options);
320 if (!where)
321 return log_oom();
322
323 p.where = where;
0c85a4f3 324 p.options = options;
2076ca54 325
4ef31082 326 r = mount_one(&p, true);
a6b26d90
ZJS
327 if (r < 0)
328 return r;
0c85a4f3
LP
329
330 if (r > 0 && k && *k) {
331 char **i;
332
333 for (i = *k; *i; i++) {
a641dcd9
LP
334 _cleanup_free_ char *t = NULL;
335
336 t = strappend("/sys/fs/cgroup/", *i);
337 if (!t)
338 return log_oom();
0c85a4f3
LP
339
340 r = symlink(options, t);
ea2b93a8 341 if (r >= 0) {
f8c1a81c 342#ifdef SMACK_RUN_LABEL
ea2b93a8
PO
343 _cleanup_free_ char *src;
344 src = strappend("/sys/fs/cgroup/", options);
345 if (!src)
346 return log_oom();
347 r = mac_smack_copy(t, src);
348 if (r < 0 && r != -EOPNOTSUPP)
349 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
f8c1a81c 350#endif
ea2b93a8
PO
351 } else if (errno != EEXIST)
352 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
0c85a4f3
LP
353 }
354 }
2076ca54
LP
355 }
356
679142ce
LP
357 /* Now that we mounted everything, let's make the tmpfs the
358 * cgroup file systems are mounted into read-only. */
b12afc8c 359 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
679142ce 360
a6b26d90 361 return 0;
2076ca54
LP
362}
363
#if HAVE_SELINUX || ENABLE_SMACK
/* nftw() callback that relabels every visited path below the root of the walk.
 * Meant to be used with FTW_ACTIONRETVAL, since it returns FTW_* action codes.
 *
 * fpath:  path of the visited entry.
 * sb:     stat data of the entry (unused).
 * tflag:  type flag of the entry as passed in by nftw().
 * ftwbuf: walk state; only its level is looked at.
 *
 * Returns FTW_SKIP_SUBTREE for the directory /run/initramfs and FTW_CONTINUE
 * for everything else. */
static int nftw_cb(
                const char *fpath,
                const struct stat *sb,
                int tflag,
                struct FTW *ftwbuf) {

        /* The root of the walk is not touched here: no need to label /dev
         * (and the other walk roots) twice in a row... */
        if (_unlikely_(ftwbuf->level == 0))
                return FTW_CONTINUE;

        /* Best effort, a failure to relabel one node must not stop the walk */
        (void) label_fix(fpath, false, false);

        /* /run/initramfs is static data and big, no need to
         * dynamically relabel its contents at boot... */
        if (_unlikely_(ftwbuf->level == 1 &&
                       tflag == FTW_D &&
                       streq(fpath, "/run/initramfs")))
                return FTW_SKIP_SUBTREE;

        return FTW_CONTINUE;
}
#endif
1829dc9d 387
0b3325e7 388int mount_setup(bool loaded_policy) {
68d4c452 389 int r = 0;
8e274523 390
400fac06 391 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
68d4c452
LP
392 if (r < 0)
393 return r;
394
f9fa32f0 395#if HAVE_SELINUX || ENABLE_SMACK
f1d19aa4
LP
396 /* Nodes in devtmpfs and /run need to be manually updated for
397 * the appropriate labels, after mounting. The other virtual
398 * API file systems like /sys and /proc do not need that, they
399 * use the same label for all their files. */
0b3325e7
LP
400 if (loaded_policy) {
401 usec_t before_relabel, after_relabel;
402 char timespan[FORMAT_TIMESPAN_MAX];
403
404 before_relabel = now(CLOCK_MONOTONIC);
405
edb49778 406 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
cacf980e 407 nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
edb49778 408 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
0b3325e7 409
8739f23e
KN
410 /* Temporarily remount the root cgroup filesystem to give it a proper label. */
411 r = cg_all_unified();
412 if (r == 0) {
413 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
414 label_fix("/sys/fs/cgroup", false, false);
415 nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
416 (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
417 } else if (r < 0)
418 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
419
0b3325e7
LP
420 after_relabel = now(CLOCK_MONOTONIC);
421
8739f23e 422 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
2fa4092c 423 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
3bbecb2f 424 }
0fff82e5 425#endif
1829dc9d 426
5c0532d1 427 /* Create a few default symlinks, which are normally created
f1d19aa4 428 * by udevd, but some scripts might need them before we start
5c0532d1 429 * udevd. */
03cfe0d5 430 dev_setup(NULL, UID_INVALID, GID_INVALID);
5c0532d1 431
dee22f39
LP
432 /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
433 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
434 * the box. If specific setups need other settings they can reset the propagation mode to private if
435 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
436 * container manager we assume the container manager knows what it is doing (for example, because it set up
437 * some directories with different propagation modes). */
75f86906 438 if (detect_container() <= 0)
c481f78b 439 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
56f64d95 440 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
b3ac5f8c 441
dee22f39
LP
442 /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
443 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
444 * misdetect systemd. */
c4b41707
AP
445 (void) mkdir_label("/run/systemd", 0755);
446 (void) mkdir_label("/run/systemd/system", 0755);
dee22f39 447
c4b41707 448 /* Set up inaccessible items */
dee22f39 449 (void) mkdir_label("/run/systemd/inaccessible", 0000);
c4b41707
AP
450 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
451 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
452 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
453 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
454 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
455 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);
b925e726 456
0c85a4f3 457 return 0;
8e274523 458}