]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
Merge pull request #14039 from keszybz/systemd-man
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <sched.h>
5 #include <stdio.h>
6 #include <sys/mount.h>
7 #include <unistd.h>
8 #include <linux/fs.h>
9
10 #include "alloc-util.h"
11 #include "base-filesystem.h"
12 #include "dev-setup.h"
13 #include "fd-util.h"
14 #include "fs-util.h"
15 #include "label.h"
16 #include "loop-util.h"
17 #include "loopback-setup.h"
18 #include "mkdir.h"
19 #include "mount-util.h"
20 #include "mountpoint-util.h"
21 #include "namespace-util.h"
22 #include "namespace.h"
23 #include "nulstr-util.h"
24 #include "path-util.h"
25 #include "selinux-util.h"
26 #include "socket-util.h"
27 #include "sort-util.h"
28 #include "stat-util.h"
29 #include "string-table.h"
30 #include "string-util.h"
31 #include "strv.h"
32 #include "umask-util.h"
33 #include "user-util.h"
34
/* Mount flags passed to mount() for the tmpfs instance that mount_private_dev() sets up as the private /dev. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
36
/* The kind of operation a MountEntry asks for; apply_mount() dispatches on this value. */
typedef enum MountMode {
        /* This is ordered by priority! For entries with equal paths mount_path_compare() sorts the
         * numerically lower mode first, and drop_duplicates() then keeps that first entry. */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
        READWRITE_IMPLICIT, /* Should have the lowest priority. */
        _MOUNT_MODE_MAX,    /* Number of modes; sizes mount_mode_table[] */
} MountMode;
54
/* One entry of the mount table. The string members come in _const/_malloc pairs: the _malloc variant is
 * released by mount_entry_done() and, when set, is preferred over the _const variant by the
 * mount_entry_path()/mount_entry_source()/mount_entry_options() accessors. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        bool nosuid:1;            /* Shall set MS_NOSUID on the mount itself */
        bool applied:1;           /* Already applied */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Allocated variant of 'source_const'; preferred when set */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Allocated variant of 'options_const'; preferred when set */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
        unsigned n_followed;      /* Number of symlink steps follow_symlink() resolved for this entry so far */
} MountEntry;
71
72 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
73 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
74 static const MountEntry apivfs_table[] = {
75 { "/proc", PROCFS, false },
76 { "/dev", BIND_DEV, false },
77 { "/sys", SYSFS, false },
78 };
79
80 /* ProtectKernelTunables= option and the related filesystem APIs */
81 static const MountEntry protect_kernel_tunables_table[] = {
82 { "/proc/acpi", READONLY, true },
83 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
84 { "/proc/asound", READONLY, true },
85 { "/proc/bus", READONLY, true },
86 { "/proc/fs", READONLY, true },
87 { "/proc/irq", READONLY, true },
88 { "/proc/kallsyms", INACCESSIBLE, true },
89 { "/proc/kcore", INACCESSIBLE, true },
90 { "/proc/latency_stats", READONLY, true },
91 { "/proc/mtrr", READONLY, true },
92 { "/proc/scsi", READONLY, true },
93 { "/proc/sys", READONLY, false },
94 { "/proc/sysrq-trigger", READONLY, true },
95 { "/proc/timer_stats", READONLY, true },
96 { "/sys", READONLY, false },
97 { "/sys/fs/bpf", READONLY, true },
98 { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
99 { "/sys/fs/selinux", READWRITE_IMPLICIT, true },
100 { "/sys/kernel/debug", READONLY, true },
101 { "/sys/kernel/tracing", READONLY, true },
102 };
103
104 /* ProtectKernelModules= option */
105 static const MountEntry protect_kernel_modules_table[] = {
106 #if HAVE_SPLIT_USR
107 { "/lib/modules", INACCESSIBLE, true },
108 #endif
109 { "/usr/lib/modules", INACCESSIBLE, true },
110 };
111
112 /* ProtectKernelLogs= option */
113 static const MountEntry protect_kernel_logs_table[] = {
114 { "/proc/kmsg", INACCESSIBLE, true },
115 { "/dev/kmsg", INACCESSIBLE, true },
116 };
117
118 /*
119 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
120 * system should be protected by ProtectSystem=
121 */
122 static const MountEntry protect_home_read_only_table[] = {
123 { "/home", READONLY, true },
124 { "/run/user", READONLY, true },
125 { "/root", READONLY, true },
126 };
127
128 /* ProtectHome=tmpfs table */
129 static const MountEntry protect_home_tmpfs_table[] = {
130 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
131 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
132 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
133 };
134
135 /* ProtectHome=yes table */
136 static const MountEntry protect_home_yes_table[] = {
137 { "/home", INACCESSIBLE, true },
138 { "/run/user", INACCESSIBLE, true },
139 { "/root", INACCESSIBLE, true },
140 };
141
142 /* ProtectSystem=yes table */
143 static const MountEntry protect_system_yes_table[] = {
144 { "/usr", READONLY, false },
145 { "/boot", READONLY, true },
146 { "/efi", READONLY, true },
147 #if HAVE_SPLIT_USR
148 { "/lib", READONLY, true },
149 { "/lib64", READONLY, true },
150 { "/bin", READONLY, true },
151 # if HAVE_SPLIT_BIN
152 { "/sbin", READONLY, true },
153 # endif
154 #endif
155 };
156
157 /* ProtectSystem=full includes ProtectSystem=yes */
158 static const MountEntry protect_system_full_table[] = {
159 { "/usr", READONLY, false },
160 { "/boot", READONLY, true },
161 { "/efi", READONLY, true },
162 { "/etc", READONLY, false },
163 #if HAVE_SPLIT_USR
164 { "/lib", READONLY, true },
165 { "/lib64", READONLY, true },
166 { "/bin", READONLY, true },
167 # if HAVE_SPLIT_BIN
168 { "/sbin", READONLY, true },
169 # endif
170 #endif
171 };
172
173 /*
174 * ProtectSystem=strict table. In this strict mode, we mount everything
175 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
176 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
177 * protect those, and these options should be fully orthogonal.
178 * (And of course /home and friends are also left writable, as ProtectHome=
179 * shall manage those, orthogonally).
180 */
181 static const MountEntry protect_system_strict_table[] = {
182 { "/", READONLY, false },
183 { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
184 { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
185 { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */
186 { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */
187 { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */
188 { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */
189 };
190
/* Human-readable name of each MountMode, indexed by the enum value. The names show up in debug log
 * messages via mount_mode_to_string(). */
static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
        [INACCESSIBLE]         = "inaccessible",
        [BIND_MOUNT]           = "bind",
        [BIND_MOUNT_RECURSIVE] = "rbind",
        [PRIVATE_TMP]          = "private-tmp",
        [PRIVATE_DEV]          = "private-dev",
        [BIND_DEV]             = "bind-dev",
        [EMPTY_DIR]            = "empty",
        [SYSFS]                = "sysfs",
        [PROCFS]               = "procfs",
        [READONLY]             = "read-only",
        [READWRITE]            = "read-write",
        [TMPFS]                = "tmpfs",
        [READWRITE_IMPLICIT]   = "rw-implicit",
};

/* NOTE(review): presumably this macro is what defines the file-local mount_mode_to_string() used by the
 * log calls in this file — confirm against string-table.h. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
208
209 static const char *mount_entry_path(const MountEntry *p) {
210 assert(p);
211
212 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
213 * otherwise the stack/static ->path field is returned. */
214
215 return p->path_malloc ?: p->path_const;
216 }
217
218 static bool mount_entry_read_only(const MountEntry *p) {
219 assert(p);
220
221 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
222 }
223
224 static const char *mount_entry_source(const MountEntry *p) {
225 assert(p);
226
227 return p->source_malloc ?: p->source_const;
228 }
229
230 static const char *mount_entry_options(const MountEntry *p) {
231 assert(p);
232
233 return p->options_malloc ?: p->options_const;
234 }
235
236 static void mount_entry_done(MountEntry *p) {
237 assert(p);
238
239 p->path_malloc = mfree(p->path_malloc);
240 p->source_malloc = mfree(p->source_malloc);
241 p->options_malloc = mfree(p->options_malloc);
242 }
243
244 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
245 char **i;
246
247 assert(p);
248
249 /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
250
251 STRV_FOREACH(i, strv) {
252 bool ignore = false, needs_prefix = false;
253 const char *e = *i;
254
255 /* Look for any prefixes */
256 if (startswith(e, "-")) {
257 e++;
258 ignore = true;
259 }
260 if (startswith(e, "+")) {
261 e++;
262 needs_prefix = true;
263 }
264
265 if (!path_is_absolute(e))
266 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
267 "Path is not absolute: %s", e);
268
269 *((*p)++) = (MountEntry) {
270 .path_const = e,
271 .mode = mode,
272 .ignore = ignore,
273 .has_prefix = !needs_prefix && !forcibly_require_prefix,
274 };
275 }
276
277 return 0;
278 }
279
280 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
281 char **i;
282
283 assert(p);
284
285 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
286 * "/private/" boundary directories for DynamicUser=1. */
287
288 STRV_FOREACH(i, strv) {
289
290 *((*p)++) = (MountEntry) {
291 .path_const = *i,
292 .mode = EMPTY_DIR,
293 .ignore = false,
294 .read_only = true,
295 .options_const = "mode=755",
296 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
297 };
298 }
299
300 return 0;
301 }
302
303 static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
304 size_t i;
305
306 assert(p);
307
308 for (i = 0; i < n; i++) {
309 const BindMount *b = binds + i;
310
311 *((*p)++) = (MountEntry) {
312 .path_const = b->destination,
313 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
314 .read_only = b->read_only,
315 .nosuid = b->nosuid,
316 .source_const = b->source,
317 .ignore = b->ignore_enoent,
318 };
319 }
320
321 return 0;
322 }
323
324 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
325 size_t i;
326 int r;
327
328 assert(p);
329
330 for (i = 0; i < n; i++) {
331 const TemporaryFileSystem *t = tmpfs + i;
332 _cleanup_free_ char *o = NULL, *str = NULL;
333 unsigned long flags;
334 bool ro = false;
335
336 if (!path_is_absolute(t->path))
337 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
338 "Path is not absolute: %s",
339 t->path);
340
341 str = strjoin("mode=0755,", t->options);
342 if (!str)
343 return -ENOMEM;
344
345 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
346 if (r < 0)
347 return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
348
349 ro = flags & MS_RDONLY;
350 if (ro)
351 flags ^= MS_RDONLY;
352
353 *((*p)++) = (MountEntry) {
354 .path_const = t->path,
355 .mode = TMPFS,
356 .read_only = ro,
357 .options_malloc = TAKE_PTR(o),
358 .flags = flags,
359 };
360 }
361
362 return 0;
363 }
364
365 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
366 size_t i;
367
368 assert(p);
369 assert(mounts);
370
371 /* Adds a list of static pre-defined entries */
372
373 for (i = 0; i < n; i++)
374 *((*p)++) = (MountEntry) {
375 .path_const = mount_entry_path(mounts+i),
376 .mode = mounts[i].mode,
377 .ignore = mounts[i].ignore || ignore_protect,
378 };
379
380 return 0;
381 }
382
383 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
384 assert(p);
385
386 switch (protect_home) {
387
388 case PROTECT_HOME_NO:
389 return 0;
390
391 case PROTECT_HOME_READ_ONLY:
392 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
393
394 case PROTECT_HOME_TMPFS:
395 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
396
397 case PROTECT_HOME_YES:
398 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
399
400 default:
401 assert_not_reached("Unexpected ProtectHome= value");
402 }
403 }
404
405 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
406 assert(p);
407
408 switch (protect_system) {
409
410 case PROTECT_SYSTEM_NO:
411 return 0;
412
413 case PROTECT_SYSTEM_STRICT:
414 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
415
416 case PROTECT_SYSTEM_YES:
417 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
418
419 case PROTECT_SYSTEM_FULL:
420 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
421
422 default:
423 assert_not_reached("Unexpected ProtectSystem= value");
424 }
425 }
426
427 static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
428 int d;
429
430 /* If the paths are not equal, then order prefixes first */
431 d = path_compare(mount_entry_path(a), mount_entry_path(b));
432 if (d != 0)
433 return d;
434
435 /* If the paths are equal, check the mode */
436 return CMP((int) a->mode, (int) b->mode);
437 }
438
439 static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
440 size_t i;
441
442 /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
443
444 for (i = 0; i < n; i++) {
445 char *s;
446
447 if (m[i].has_prefix)
448 continue;
449
450 s = path_join(root_directory, mount_entry_path(m+i));
451 if (!s)
452 return -ENOMEM;
453
454 free_and_replace(m[i].path_malloc, s);
455 m[i].has_prefix = true;
456 }
457
458 return 0;
459 }
460
461 static void drop_duplicates(MountEntry *m, size_t *n) {
462 MountEntry *f, *t, *previous;
463
464 assert(m);
465 assert(n);
466
467 /* Drops duplicate entries. Expects that the array is properly ordered already. */
468
469 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
470
471 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
472 * above. Note that we only drop duplicates that haven't been mounted yet. */
473 if (previous &&
474 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
475 !f->applied && !previous->applied) {
476 log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
477 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
478 mount_entry_done(f);
479 continue;
480 }
481
482 *t = *f;
483 previous = t;
484 t++;
485 }
486
487 *n = t - m;
488 }
489
490 static void drop_inaccessible(MountEntry *m, size_t *n) {
491 MountEntry *f, *t;
492 const char *clear = NULL;
493
494 assert(m);
495 assert(n);
496
497 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
498 * ordered already. */
499
500 for (f = m, t = m; f < m + *n; f++) {
501
502 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
503 * it, as inaccessible paths really should drop the entire subtree. */
504 if (clear && path_startswith(mount_entry_path(f), clear)) {
505 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
506 mount_entry_done(f);
507 continue;
508 }
509
510 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
511
512 *t = *f;
513 t++;
514 }
515
516 *n = t - m;
517 }
518
519 static void drop_nop(MountEntry *m, size_t *n) {
520 MountEntry *f, *t;
521
522 assert(m);
523 assert(n);
524
525 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
526 * list is ordered by prefixes. */
527
528 for (f = m, t = m; f < m + *n; f++) {
529
530 /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
531 if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
532 MountEntry *p;
533 bool found = false;
534
535 /* Now let's find the first parent of the entry we are looking at. */
536 for (p = t-1; p >= m; p--) {
537 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
538 found = true;
539 break;
540 }
541 }
542
543 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
544 if (found && p->mode == f->mode) {
545 log_debug("%s (%s) is made redundant by %s (%s)",
546 mount_entry_path(f), mount_mode_to_string(f->mode),
547 mount_entry_path(p), mount_mode_to_string(p->mode));
548 mount_entry_done(f);
549 continue;
550 }
551 }
552
553 *t = *f;
554 t++;
555 }
556
557 *n = t - m;
558 }
559
560 static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
561 MountEntry *f, *t;
562
563 assert(m);
564 assert(n);
565
566 /* Nothing to do */
567 if (!root_directory)
568 return;
569
570 /* Drops all mounts that are outside of the root directory. */
571
572 for (f = m, t = m; f < m + *n; f++) {
573
574 if (!path_startswith(mount_entry_path(f), root_directory)) {
575 log_debug("%s is outside of root directory.", mount_entry_path(f));
576 mount_entry_done(f);
577 continue;
578 }
579
580 *t = *f;
581 t++;
582 }
583
584 *n = t - m;
585 }
586
587 static int clone_device_node(
588 const char *d,
589 const char *temporary_mount,
590 bool *make_devnode) {
591
592 _cleanup_free_ char *sl = NULL;
593 const char *dn, *bn, *t;
594 struct stat st;
595 int r;
596
597 if (stat(d, &st) < 0) {
598 if (errno == ENOENT) {
599 log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
600 return -ENXIO;
601 }
602
603 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
604 }
605
606 if (!S_ISBLK(st.st_mode) &&
607 !S_ISCHR(st.st_mode))
608 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
609 "Device node '%s' to clone is not a device node, ignoring.",
610 d);
611
612 dn = strjoina(temporary_mount, d);
613
614 /* First, try to create device node properly */
615 if (*make_devnode) {
616 mac_selinux_create_file_prepare(d, st.st_mode);
617 r = mknod(dn, st.st_mode, st.st_rdev);
618 mac_selinux_create_file_clear();
619 if (r >= 0)
620 goto add_symlink;
621 if (errno != EPERM)
622 return log_debug_errno(errno, "mknod failed for %s: %m", d);
623
624 /* This didn't work, let's not try this again for the next iterations. */
625 *make_devnode = false;
626 }
627
628 /* We're about to fallback to bind-mounting the device
629 * node. So create a dummy bind-mount target. */
630 mac_selinux_create_file_prepare(d, 0);
631 r = mknod(dn, S_IFREG, 0);
632 mac_selinux_create_file_clear();
633 if (r < 0 && errno != EEXIST)
634 return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
635
636 /* Fallback to bind-mounting:
637 * The assumption here is that all used device nodes carry standard
638 * properties. Specifically, the devices nodes we bind-mount should
639 * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
640 * and should not carry ACLs. */
641 if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
642 return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d);
643
644 add_symlink:
645 bn = path_startswith(d, "/dev/");
646 if (!bn)
647 return 0;
648
649 /* Create symlinks like /dev/char/1:9 → ../urandom */
650 if (asprintf(&sl, "%s/dev/%s/%u:%u", temporary_mount, S_ISCHR(st.st_mode) ? "char" : "block", major(st.st_rdev), minor(st.st_rdev)) < 0)
651 return log_oom();
652
653 (void) mkdir_parents(sl, 0755);
654
655 t = strjoina("../", bn);
656
657 if (symlink(t, sl) < 0)
658 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
659
660 return 0;
661 }
662
/* Builds a minimal /dev on a tmpfs below a temporary directory (bind mounts of the host's /dev/pts,
 * /dev/shm, /dev/mqueue and /dev/hugepages, a /dev/log symlink, /dev/ptmx and the device nodes listed in
 * 'devnodes'), and then moves that mount onto the path of 'm'. Returns 0 on success, negative errno on
 * failure; on failure whatever was set up in the temporary directory is unmounted and removed again. */
static int mount_private_dev(MountEntry *m) {
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        bool can_mknod = true; /* cleared by clone_device_node() once mknod() failed with EPERM */
        _cleanup_umask_ mode_t u;
        int r;

        assert(m);

        /* Clear the umask, so that the modes passed to mkdir() and friends below are applied verbatim */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts);
                goto fail;
        }

        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
         * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
         * Thus, in that case make a clone.
         * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
        r = is_symlink("/dev/ptmx");
        if (r < 0) {
                log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
                goto fail;
        } else if (r > 0) {
                devptmx = strjoina(temporary_mount, "/dev/ptmx");
                if (symlink("pts/ptmx", devptmx) < 0) {
                        r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
                        goto fail;
                }
        } else {
                r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
                if (r < 0)
                        goto fail;
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 0755);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm);
                goto fail;
        }

        /* The remaining bind mounts and the /dev/log symlink are best-effort: failures are only logged */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0)
                log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0)
                log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages);

        devlog = strjoina(temporary_mount, "/dev/log");
        if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
                log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);

        NULSTR_FOREACH(d, devnodes) {
                r = clone_device_node(d, temporary_mount, &can_mknod);
                /* clone_device_node() returns -ENXIO if the *source* node does not exist, skip creation in that case */
                if (r < 0 && r != -ENXIO)
                        goto fail;
        }

        r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
        if (r < 0)
                log_debug_errno(r, "Failed to setup basic device tree at '%s', ignoring: %m", temporary_mount);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(mount_entry_path(m), 0755);

        /* Unmount everything in old /dev */
        r = umount_recursive(mount_entry_path(m), 0);
        if (r < 0)
                log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));

        /* Move the fully prepared tmpfs, including its submounts, onto the final location */
        if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
                r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m));
                goto fail;
        }

        /* The mount has been moved away, only the empty temporary directories are left to remove */
        (void) rmdir(dev);
        (void) rmdir(temporary_mount);

        return 0;

fail:
        /* Tear down in reverse: first the submounts that were attempted so far (their path variable is
         * only set once the respective step was reached), then the tmpfs itself and the directories. */
        if (devpts)
                (void) umount(devpts);

        if (devshm)
                (void) umount(devshm);

        if (devhugepages)
                (void) umount(devhugepages);

        if (devmqueue)
                (void) umount(devmqueue);

        (void) umount(dev);
        (void) rmdir(dev);
        (void) rmdir(temporary_mount);

        return r;
}
792
793 static int mount_bind_dev(const MountEntry *m) {
794 int r;
795
796 assert(m);
797
798 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
799 * /dev. This is only used when RootDirectory= is set. */
800
801 (void) mkdir_p_label(mount_entry_path(m), 0755);
802
803 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
804 if (r < 0)
805 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
806 if (r > 0) /* make this a NOP if /dev is already a mount point */
807 return 0;
808
809 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
810 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
811
812 return 1;
813 }
814
815 static int mount_sysfs(const MountEntry *m) {
816 int r;
817
818 assert(m);
819
820 (void) mkdir_p_label(mount_entry_path(m), 0755);
821
822 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
823 if (r < 0)
824 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
825 if (r > 0) /* make this a NOP if /sys is already a mount point */
826 return 0;
827
828 /* Bind mount the host's version so that we get all child mounts of it, too. */
829 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
830 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
831
832 return 1;
833 }
834
835 static int mount_procfs(const MountEntry *m) {
836 int r;
837
838 assert(m);
839
840 (void) mkdir_p_label(mount_entry_path(m), 0755);
841
842 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
843 if (r < 0)
844 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
845 if (r > 0) /* make this a NOP if /proc is already a mount point */
846 return 0;
847
848 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
849 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
850 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
851
852 return 1;
853 }
854
855 static int mount_tmpfs(const MountEntry *m) {
856 assert(m);
857
858 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
859
860 (void) mkdir_p_label(mount_entry_path(m), 0755);
861 (void) umount_recursive(mount_entry_path(m), 0);
862
863 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
864 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
865
866 return 1;
867 }
868
869 static int follow_symlink(
870 const char *root_directory,
871 MountEntry *m) {
872
873 _cleanup_free_ char *target = NULL;
874 int r;
875
876 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
877 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
878 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
879 * end and already have a fully normalized name. */
880
881 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
882 if (r < 0)
883 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
884 if (r > 0) /* Reached the end, nothing more to resolve */
885 return 1;
886
887 if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
888 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
889 "Symlink loop on '%s'.",
890 mount_entry_path(m));
891
892 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
893
894 free_and_replace(m->path_malloc, target);
895 m->has_prefix = true;
896
897 m->n_followed ++;
898
899 return 0;
900 }
901
/* Establishes the mount described by 'm'. Modes with a dedicated helper (tmpfs, private/bind /dev, sysfs,
 * procfs) return that helper's result directly; all other modes pick a source path ('what') and end in a
 * bind mount of it onto the entry's path. Returns >= 0 on success (including when a missing path is
 * ignored or nothing needed to be done), negative errno on failure. */
static int apply_mount(
                const char *root_directory,
                MountEntry *m) {

        bool rbind = true,  /* bind mount recursively (MS_REC)? Only plain BIND_MOUNT turns this off */
             make = false;  /* on ENOENT, try to create the destination and mount again? */
        const char *what;
        int r;

        assert(m);

        log_debug("Applying namespace mount on %s", mount_entry_path(m));

        switch (m->mode) {

        case INACCESSIBLE: {
                struct stat target;

                /* First, get rid of everything that is below if there
                 * is anything... Then, overmount it with an
                 * inaccessible path. */
                (void) umount_recursive(mount_entry_path(m), 0);

                if (lstat(mount_entry_path(m), &target) < 0) {
                        if (errno == ENOENT && m->ignore)
                                return 0;

                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
                }

                /* Pick the node to mount over the target, based on the target's file type */
                what = mode_to_inaccessible_node(target.st_mode);
                if (!what)
                        return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
                                               "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
                break;
        }

        case READONLY:
        case READWRITE:
        case READWRITE_IMPLICIT:
                r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
                if (r == -ENOENT && m->ignore)
                        return 0;
                if (r < 0)
                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
                        return 0;
                /* This isn't a mount point yet, let's make it one. */
                what = mount_entry_path(m);
                break;

        case BIND_MOUNT:
                rbind = false;

                _fallthrough_;
        case BIND_MOUNT_RECURSIVE: {
                _cleanup_free_ char *chased = NULL;

                /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
                 * mount source paths are always relative to the host root, hence we pass NULL as root directory to
                 * chase_symlinks() here. */

                r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
                if (r == -ENOENT && m->ignore) {
                        log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
                        return 0;
                }
                if (r < 0)
                        return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));

                log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);

                /* The entry takes ownership of the resolved source path */
                free_and_replace(m->source_malloc, chased);

                what = mount_entry_source(m);
                make = true;
                break;
        }

        case EMPTY_DIR:
        case TMPFS:
                return mount_tmpfs(m);

        case PRIVATE_TMP:
                what = mount_entry_source(m);
                make = true;
                break;

        case PRIVATE_DEV:
                return mount_private_dev(m);

        case BIND_DEV:
                return mount_bind_dev(m);

        case SYSFS:
                return mount_sysfs(m);

        case PROCFS:
                return mount_procfs(m);

        default:
                assert_not_reached("Unknown mode");
        }

        assert(what);

        if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
                bool try_again = false;
                r = -errno;

                if (r == -ENOENT && make) {
                        struct stat st;

                        /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */

                        if (stat(what, &st) < 0)
                                log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
                        else {
                                int q;

                                (void) mkdir_parents(mount_entry_path(m), 0755);

                                /* Create a destination node of the same kind as the source: a directory
                                 * for directories, a plain file for everything else */
                                if (S_ISDIR(st.st_mode))
                                        q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
                                else
                                        q = touch(mount_entry_path(m));

                                if (q < 0)
                                        log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
                                else
                                        try_again = true;
                        }
                }

                if (try_again) {
                        if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
                                r = -errno;
                        else
                                r = 0;
                }

                /* Without a successful retry 'r' still carries the error of the first mount() attempt */
                if (r < 0)
                        return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
        }

        log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
        return 0;
}
1049
1050 /* Change per-mount flags on an existing mount */
1051 static int bind_remount_one(const char *path, unsigned long orig_flags, unsigned long new_flags, unsigned long flags_mask) {
1052 if (mount(NULL, path, NULL, (orig_flags & ~flags_mask) | MS_REMOUNT | MS_BIND | new_flags, NULL) < 0)
1053 return -errno;
1054
1055 return 0;
1056 }
1057
1058 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
1059 unsigned long new_flags = 0, flags_mask = 0;
1060 bool submounts = false;
1061 int r = 0;
1062
1063 assert(m);
1064 assert(proc_self_mountinfo);
1065
1066 if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
1067 new_flags |= MS_RDONLY;
1068 flags_mask |= MS_RDONLY;
1069 }
1070
1071 if (m->nosuid) {
1072 new_flags |= MS_NOSUID;
1073 flags_mask |= MS_NOSUID;
1074 }
1075
1076 if (flags_mask == 0) /* No Change? */
1077 return 0;
1078
1079 /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1080 * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1081 * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
1082 * and running Linux <= 4.17. */
1083 submounts =
1084 mount_entry_read_only(m) &&
1085 !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1086 if (submounts)
1087 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, blacklist, proc_self_mountinfo);
1088 else
1089 r = bind_remount_one(mount_entry_path(m), m->flags, new_flags, flags_mask);
1090
1091 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1092 * read-only already stays this way. This improves compatibility with container managers, where we
1093 * won't attempt to undo read-only mounts already applied. */
1094
1095 if (r == -ENOENT && m->ignore)
1096 return 0;
1097 if (r < 0)
1098 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1099 submounts ? " and its submounts" : "");
1100 return 0;
1101 }
1102
1103 static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
1104 assert(ns_info);
1105
1106 /*
1107 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1108 * since to protect the API VFS mounts, they need to be around in the
1109 * first place...
1110 */
1111
1112 return ns_info->mount_apivfs ||
1113 ns_info->protect_control_groups ||
1114 ns_info->protect_kernel_tunables;
1115 }
1116
1117 static size_t namespace_calculate_mounts(
1118 const NamespaceInfo *ns_info,
1119 char** read_write_paths,
1120 char** read_only_paths,
1121 char** inaccessible_paths,
1122 char** empty_directories,
1123 size_t n_bind_mounts,
1124 size_t n_temporary_filesystems,
1125 const char* tmp_dir,
1126 const char* var_tmp_dir,
1127 ProtectHome protect_home,
1128 ProtectSystem protect_system) {
1129
1130 size_t protect_home_cnt;
1131 size_t protect_system_cnt =
1132 (protect_system == PROTECT_SYSTEM_STRICT ?
1133 ELEMENTSOF(protect_system_strict_table) :
1134 ((protect_system == PROTECT_SYSTEM_FULL) ?
1135 ELEMENTSOF(protect_system_full_table) :
1136 ((protect_system == PROTECT_SYSTEM_YES) ?
1137 ELEMENTSOF(protect_system_yes_table) : 0)));
1138
1139 protect_home_cnt =
1140 (protect_home == PROTECT_HOME_YES ?
1141 ELEMENTSOF(protect_home_yes_table) :
1142 ((protect_home == PROTECT_HOME_READ_ONLY) ?
1143 ELEMENTSOF(protect_home_read_only_table) :
1144 ((protect_home == PROTECT_HOME_TMPFS) ?
1145 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1146
1147 return !!tmp_dir + !!var_tmp_dir +
1148 strv_length(read_write_paths) +
1149 strv_length(read_only_paths) +
1150 strv_length(inaccessible_paths) +
1151 strv_length(empty_directories) +
1152 n_bind_mounts +
1153 n_temporary_filesystems +
1154 ns_info->private_dev +
1155 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1156 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1157 (ns_info->protect_kernel_logs ? ELEMENTSOF(protect_kernel_logs_table) : 0) +
1158 (ns_info->protect_control_groups ? 1 : 0) +
1159 protect_home_cnt + protect_system_cnt +
1160 (ns_info->protect_hostname ? 2 : 0) +
1161 (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1162 }
1163
1164 static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
1165 assert(root_directory);
1166 assert(n_mounts);
1167 assert(mounts || *n_mounts == 0);
1168
1169 typesafe_qsort(mounts, *n_mounts, mount_path_compare);
1170
1171 drop_duplicates(mounts, n_mounts);
1172 drop_outside_root(root_directory, mounts, n_mounts);
1173 drop_inaccessible(mounts, n_mounts);
1174 drop_nop(mounts, n_mounts);
1175 }
1176
/* Sets up a private mount namespace for the calling process according to the given configuration.
 *
 * Order of operations, as implemented below:
 *   1. If root_image is set: attach it to a loop device, load its root hash, dissect and decrypt it.
 *   2. Pick the directory the new root is assembled in: root_directory if given, otherwise the fixed
 *      path "/run/systemd/unit-root".
 *   3. Build the array of MountEntry items from all arguments, prefix them with the root where needed
 *      and normalize the list.
 *   4. unshare(CLONE_NEWNS), then mark "/" as MS_SLAVE|MS_REC.
 *   5. Mount the root image, or turn root_directory into a bind mount, or bind mount "/" onto the root.
 *   6. Apply all mount entries, then apply the read-only/nosuid flags in a second pass.
 *   7. Move the new root into place via mount_move_root() and remount "/" with mount_flags|MS_REC.
 *
 * mount_flags == 0 is replaced by MS_SHARED.
 *
 * error_path: if non-NULL, it receives a strdup()ed path when opening /proc/self/mountinfo fails or when
 * handling an individual mount entry fails. The strdup() result is not checked, so it may be NULL on
 * allocation failure. It is not written on any other path.
 *
 * Returns 0 on success and a negative errno-style value on failure. -ENOANO is returned when unshare()
 * fails with EACCES, EPERM, EOPNOTSUPP or ENOSYS, so that callers can recognize that specific case. */
1177 int setup_namespace(
1178 const char* root_directory,
1179 const char* root_image,
1180 const NamespaceInfo *ns_info,
1181 char** read_write_paths,
1182 char** read_only_paths,
1183 char** inaccessible_paths,
1184 char** empty_directories,
1185 const BindMount *bind_mounts,
1186 size_t n_bind_mounts,
1187 const TemporaryFileSystem *temporary_filesystems,
1188 size_t n_temporary_filesystems,
1189 const char* tmp_dir,
1190 const char* var_tmp_dir,
1191 ProtectHome protect_home,
1192 ProtectSystem protect_system,
1193 unsigned long mount_flags,
1194 DissectImageFlags dissect_image_flags,
1195 char **error_path) {
1196
1197 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1198 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1199 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1200 _cleanup_free_ void *root_hash = NULL;
1201 MountEntry *m = NULL, *mounts = NULL;
1202 size_t n_mounts, root_hash_size = 0;
1203 bool require_prefix = false;
1204 const char *root;
1205 int r = 0;
1206
1207 assert(ns_info);
1208
1209 if (mount_flags == 0)
1210 mount_flags = MS_SHARED;
1211
/* Everything in this block returns directly on failure: 'mounts' has not been allocated yet, so there
 * is nothing for the 'finish' label to release. */
1212 if (root_image) {
1213 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1214
/* The image is only opened read-only if ProtectSystem=strict is set, ProtectHome= is enabled in some
 * form and no read-write paths are configured. */
1215 if (protect_system == PROTECT_SYSTEM_STRICT &&
1216 protect_home != PROTECT_HOME_NO &&
1217 strv_isempty(read_write_paths))
1218 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1219
1220 r = loop_device_make_by_path(root_image,
1221 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1222 &loop_device);
1223 if (r < 0)
1224 return log_debug_errno(r, "Failed to create loop device for root image: %m");
1225
1226 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1227 if (r < 0)
1228 return log_debug_errno(r, "Failed to load root hash: %m");
1229
1230 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1231 if (r < 0)
1232 return log_debug_errno(r, "Failed to dissect image: %m");
1233
1234 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1235 if (r < 0)
1236 return log_debug_errno(r, "Failed to decrypt dissected image: %m");
1237 }
1238
1239 if (root_directory)
1240 root = root_directory;
1241 else {
1242 /* Always create the mount namespace in a temporary directory, instead of operating
1243 * directly in the root. The temporary directory prevents any mounts from being
1244 * potentially obscured by other mounts we already applied.
1245 * We use the same mount point for all images, which is safe, since they all live
1246 * in their own namespaces after all, and hence won't see each other. */
1247
1248 root = "/run/systemd/unit-root";
1249 (void) mkdir_label(root, 0700);
1250 require_prefix = true;
1251 }
1252
/* The count computed here must match the number of entries appended below exactly; this is asserted
 * once the array has been filled. */
1253 n_mounts = namespace_calculate_mounts(
1254 ns_info,
1255 read_write_paths,
1256 read_only_paths,
1257 inaccessible_paths,
1258 empty_directories,
1259 n_bind_mounts,
1260 n_temporary_filesystems,
1261 tmp_dir, var_tmp_dir,
1262 protect_home, protect_system);
1263
1264 if (n_mounts > 0) {
/* 'mounts' is the start of the array, 'm' is the write cursor advanced by each append below */
1265 m = mounts = new0(MountEntry, n_mounts);
1266 if (!mounts)
1267 return -ENOMEM;
1268
1269 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1270 if (r < 0)
1271 goto finish;
1272
1273 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1274 if (r < 0)
1275 goto finish;
1276
1277 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1278 if (r < 0)
1279 goto finish;
1280
1281 r = append_empty_dir_mounts(&m, empty_directories);
1282 if (r < 0)
1283 goto finish;
1284
1285 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1286 if (r < 0)
1287 goto finish;
1288
1289 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1290 if (r < 0)
1291 goto finish;
1292
1293 if (tmp_dir) {
1294 *(m++) = (MountEntry) {
1295 .path_const = "/tmp",
1296 .mode = PRIVATE_TMP,
1297 .source_const = tmp_dir,
1298 };
1299 }
1300
1301 if (var_tmp_dir) {
1302 *(m++) = (MountEntry) {
1303 .path_const = "/var/tmp",
1304 .mode = PRIVATE_TMP,
1305 .source_const = var_tmp_dir,
1306 };
1307 }
1308
1309 if (ns_info->private_dev) {
1310 *(m++) = (MountEntry) {
1311 .path_const = "/dev",
1312 .mode = PRIVATE_DEV,
1313 .flags = DEV_MOUNT_OPTIONS,
1314 };
1315 }
1316
1317 if (ns_info->protect_kernel_tunables) {
1318 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1319 if (r < 0)
1320 goto finish;
1321 }
1322
1323 if (ns_info->protect_kernel_modules) {
1324 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1325 if (r < 0)
1326 goto finish;
1327 }
1328
1329 if (ns_info->protect_kernel_logs) {
1330 r = append_static_mounts(&m, protect_kernel_logs_table, ELEMENTSOF(protect_kernel_logs_table), ns_info->ignore_protect_paths);
1331 if (r < 0)
1332 goto finish;
1333 }
1334
1335 if (ns_info->protect_control_groups) {
1336 *(m++) = (MountEntry) {
1337 .path_const = "/sys/fs/cgroup",
1338 .mode = READONLY,
1339 };
1340 }
1341
1342 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1343 if (r < 0)
1344 goto finish;
1345
1346 r = append_protect_system(&m, protect_system, false);
1347 if (r < 0)
1348 goto finish;
1349
1350 if (namespace_info_mount_apivfs(ns_info)) {
1351 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1352 if (r < 0)
1353 goto finish;
1354 }
1355
1356 if (ns_info->protect_hostname) {
1357 *(m++) = (MountEntry) {
1358 .path_const = "/proc/sys/kernel/hostname",
1359 .mode = READONLY,
1360 };
1361 *(m++) = (MountEntry) {
1362 .path_const = "/proc/sys/kernel/domainname",
1363 .mode = READONLY,
1364 };
1365 }
1366
/* Sanity check: the cursor must now sit exactly at the end of the allocated array */
1367 assert(mounts + n_mounts == m);
1368
1369 /* Prepend the root directory where that's necessary */
1370 r = prefix_where_needed(mounts, n_mounts, root);
1371 if (r < 0)
1372 goto finish;
1373
1374 normalize_mounts(root, mounts, &n_mounts);
1375 }
1376
1377 /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
1378
1379 if (unshare(CLONE_NEWNS) < 0) {
1380 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
1381 if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
1382 /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place
1383 * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable
1384 * error back, which the caller can use to detect this case (and only this) and optionally
1385 * continue without namespacing applied. */
1386 r = -ENOANO;
1387
1388 goto finish;
1389 }
1390
1391 /* Remount / as SLAVE so that nothing now mounted in the namespace
1392 * shows up in the parent */
1393 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1394 r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
1395 goto finish;
1396 }
1397
1398 if (root_image) {
1399 /* A root image is specified, mount it to the right place */
1400 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1401 if (r < 0) {
1402 log_debug_errno(r, "Failed to mount root image: %m");
1403 goto finish;
1404 }
1405
1406 if (decrypted_image) {
1407 r = decrypted_image_relinquish(decrypted_image);
1408 if (r < 0) {
1409 log_debug_errno(r, "Failed to relinquish decrypted image: %m");
1410 goto finish;
1411 }
1412 }
1413
1414 loop_device_relinquish(loop_device);
1415
1416 } else if (root_directory) {
1417
1418 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1419 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1420 if (r < 0) {
1421 log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
1422 goto finish;
1423 }
1424 if (r == 0) {
1425 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1426 r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root);
1427 goto finish;
1428 }
1429 }
1430
1431 } else {
1432
1433 /* Let's mount the main root directory to the root directory to use */
1434 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1435 r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root);
1436 goto finish;
1437 }
1438 }
1439
1440 /* Try to set up the new root directory before mounting anything else there. */
1441 if (root_image || root_directory)
1442 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1443
1444 if (n_mounts > 0) {
1445 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
/* Only the array itself is freed on cleanup: the strings it holds are borrowed from the mount entries */
1446 _cleanup_free_ char **blacklist = NULL;
1447 size_t j;
1448
1449 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1450 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1451 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1452 if (!proc_self_mountinfo) {
1453 r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
1454 if (error_path)
1455 *error_path = strdup("/proc/self/mountinfo");
1456 goto finish;
1457 }
1458
1459 /* First round, establish all mounts we need */
/* The outer loop restarts the pass whenever follow_symlink() rewrote an entry; entries already marked
 * 'applied' are skipped on later passes. */
1460 for (;;) {
1461 bool again = false;
1462
1463 for (m = mounts; m < mounts + n_mounts; ++m) {
1464
1465 if (m->applied)
1466 continue;
1467
1468 r = follow_symlink(root, m);
1469 if (r < 0) {
1470 if (error_path && mount_entry_path(m))
1471 *error_path = strdup(mount_entry_path(m));
1472 goto finish;
1473 }
1474 if (r == 0) {
1475 /* We hit a symlinked mount point. The entry got rewritten and might point to a
1476 * very different place now. Let's normalize the changed list, and start from
1477 * the beginning. After all to mount the entry at the new location we might
1478 * need some other mounts first */
1479 again = true;
1480 break;
1481 }
1482
1483 r = apply_mount(root, m);
1484 if (r < 0) {
1485 if (error_path && mount_entry_path(m))
1486 *error_path = strdup(mount_entry_path(m));
1487 goto finish;
1488 }
1489
1490 m->applied = true;
1491 }
1492
1493 if (!again)
1494 break;
1495
1496 normalize_mounts(root, mounts, &n_mounts);
1497 }
1498
1499 /* Create a blacklist we can pass to bind_mount_recursive() */
1500 blacklist = new(char*, n_mounts+1);
1501 if (!blacklist) {
1502 r = -ENOMEM;
1503 goto finish;
1504 }
1505 for (j = 0; j < n_mounts; j++)
1506 blacklist[j] = (char*) mount_entry_path(mounts+j);
1507 blacklist[j] = NULL;
1508
1509 /* Second round, flip the ro bits if necessary. */
1510 for (m = mounts; m < mounts + n_mounts; ++m) {
1511 r = make_read_only(m, blacklist, proc_self_mountinfo);
1512 if (r < 0) {
1513 if (error_path && mount_entry_path(m))
1514 *error_path = strdup(mount_entry_path(m));
1515 goto finish;
1516 }
1517 }
1518 }
1519
1520 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1521 r = mount_move_root(root);
1522 if (r < 0) {
1523 log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
1524 goto finish;
1525 }
1526
1527 /* Remount / as the desired mode. Note that this will not
1528 * reestablish propagation from our side to the host, since
1529 * what's disconnected is disconnected. */
1530 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1531 r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
1532 goto finish;
1533 }
1534
1535 r = 0;
1536
/* Common exit path for success and failure: release every mount entry, then the array itself */
1537 finish:
1538 for (m = mounts; m < mounts + n_mounts; m++)
1539 mount_entry_done(m);
1540
1541 free(mounts);
1542
1543 return r;
1544 }
1545
1546 void bind_mount_free_many(BindMount *b, size_t n) {
1547 size_t i;
1548
1549 assert(b || n == 0);
1550
1551 for (i = 0; i < n; i++) {
1552 free(b[i].source);
1553 free(b[i].destination);
1554 }
1555
1556 free(b);
1557 }
1558
1559 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
1560 _cleanup_free_ char *s = NULL, *d = NULL;
1561 BindMount *c;
1562
1563 assert(b);
1564 assert(n);
1565 assert(item);
1566
1567 s = strdup(item->source);
1568 if (!s)
1569 return -ENOMEM;
1570
1571 d = strdup(item->destination);
1572 if (!d)
1573 return -ENOMEM;
1574
1575 c = reallocarray(*b, *n + 1, sizeof(BindMount));
1576 if (!c)
1577 return -ENOMEM;
1578
1579 *b = c;
1580
1581 c[(*n) ++] = (BindMount) {
1582 .source = TAKE_PTR(s),
1583 .destination = TAKE_PTR(d),
1584 .read_only = item->read_only,
1585 .nosuid = item->nosuid,
1586 .recursive = item->recursive,
1587 .ignore_enoent = item->ignore_enoent,
1588 };
1589
1590 return 0;
1591 }
1592
1593 void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
1594 size_t i;
1595
1596 assert(t || n == 0);
1597
1598 for (i = 0; i < n; i++) {
1599 free(t[i].path);
1600 free(t[i].options);
1601 }
1602
1603 free(t);
1604 }
1605
1606 int temporary_filesystem_add(
1607 TemporaryFileSystem **t,
1608 size_t *n,
1609 const char *path,
1610 const char *options) {
1611
1612 _cleanup_free_ char *p = NULL, *o = NULL;
1613 TemporaryFileSystem *c;
1614
1615 assert(t);
1616 assert(n);
1617 assert(path);
1618
1619 p = strdup(path);
1620 if (!p)
1621 return -ENOMEM;
1622
1623 if (!isempty(options)) {
1624 o = strdup(options);
1625 if (!o)
1626 return -ENOMEM;
1627 }
1628
1629 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1630 if (!c)
1631 return -ENOMEM;
1632
1633 *t = c;
1634
1635 c[(*n) ++] = (TemporaryFileSystem) {
1636 .path = TAKE_PTR(p),
1637 .options = TAKE_PTR(o),
1638 };
1639
1640 return 0;
1641 }
1642
1643 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1644 _cleanup_free_ char *x = NULL;
1645 char bid[SD_ID128_STRING_MAX];
1646 sd_id128_t boot_id;
1647 int r;
1648
1649 assert(id);
1650 assert(prefix);
1651 assert(path);
1652
1653 /* We include the boot id in the directory so that after a
1654 * reboot we can easily identify obsolete directories. */
1655
1656 r = sd_id128_get_boot(&boot_id);
1657 if (r < 0)
1658 return r;
1659
1660 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1661 if (!x)
1662 return -ENOMEM;
1663
1664 RUN_WITH_UMASK(0077)
1665 if (!mkdtemp(x))
1666 return -errno;
1667
1668 RUN_WITH_UMASK(0000) {
1669 char *y;
1670
1671 y = strjoina(x, "/tmp");
1672
1673 if (mkdir(y, 0777 | S_ISVTX) < 0)
1674 return -errno;
1675 }
1676
1677 *path = TAKE_PTR(x);
1678
1679 return 0;
1680 }
1681
/* Creates the pair of private temporary directories for unit 'id', one below /tmp and one below
 * /var/tmp. On success both paths are handed to the caller via *tmp_dir and *var_tmp_dir and 0 is
 * returned. On failure neither output is written and a negative errno-style value is returned. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* The second directory could not be created: undo the first one, inner directory first */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1711
1712 int setup_netns(const int netns_storage_socket[static 2]) {
1713 _cleanup_close_ int netns = -1;
1714 int r, q;
1715
1716 assert(netns_storage_socket);
1717 assert(netns_storage_socket[0] >= 0);
1718 assert(netns_storage_socket[1] >= 0);
1719
1720 /* We use the passed socketpair as a storage buffer for our
1721 * namespace reference fd. Whatever process runs this first
1722 * shall create a new namespace, all others should just join
1723 * it. To serialize that we use a file lock on the socket
1724 * pair.
1725 *
1726 * It's a bit crazy, but hey, works great! */
1727
1728 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1729 return -errno;
1730
1731 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1732 if (netns == -EAGAIN) {
1733 /* Nothing stored yet, so let's create a new namespace. */
1734
1735 if (unshare(CLONE_NEWNET) < 0) {
1736 r = -errno;
1737 goto fail;
1738 }
1739
1740 (void) loopback_setup();
1741
1742 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1743 if (netns < 0) {
1744 r = -errno;
1745 goto fail;
1746 }
1747
1748 r = 1;
1749
1750 } else if (netns < 0) {
1751 r = netns;
1752 goto fail;
1753
1754 } else {
1755 /* Yay, found something, so let's join the namespace */
1756 if (setns(netns, CLONE_NEWNET) < 0) {
1757 r = -errno;
1758 goto fail;
1759 }
1760
1761 r = 0;
1762 }
1763
1764 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1765 if (q < 0) {
1766 r = q;
1767 goto fail;
1768 }
1769
1770 fail:
1771 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1772 return r;
1773 }
1774
1775 int open_netns_path(const int netns_storage_socket[static 2], const char *path) {
1776 _cleanup_close_ int netns = -1;
1777 int q, r;
1778
1779 assert(netns_storage_socket);
1780 assert(netns_storage_socket[0] >= 0);
1781 assert(netns_storage_socket[1] >= 0);
1782 assert(path);
1783
1784 /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in
1785 * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a
1786 * new anonymous netns if needed. */
1787
1788 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1789 return -errno;
1790
1791 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1792 if (netns == -EAGAIN) {
1793 /* Nothing stored yet. Open the file from the file system. */
1794
1795 netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
1796 if (netns < 0) {
1797 r = -errno;
1798 goto fail;
1799 }
1800
1801 r = fd_is_network_ns(netns);
1802 if (r == 0) { /* Not a netns? Refuse early. */
1803 r = -EINVAL;
1804 goto fail;
1805 }
1806 if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
1807 goto fail;
1808
1809 r = 1;
1810
1811 } else if (netns < 0) {
1812 r = netns;
1813 goto fail;
1814 } else
1815 r = 0; /* Already allocated */
1816
1817 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1818 if (q < 0) {
1819 r = q;
1820 goto fail;
1821 }
1822
1823 fail:
1824 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1825 return r;
1826 }
1827
1828 bool ns_type_supported(NamespaceType type) {
1829 const char *t, *ns_proc;
1830
1831 t = namespace_type_to_string(type);
1832 if (!t) /* Don't know how to translate this? Then it's not supported */
1833 return false;
1834
1835 ns_proc = strjoina("/proc/self/ns/", t);
1836 return access(ns_proc, F_OK) == 0;
1837 }
1838
/* String names of the ProtectHome values, indexed by enum value */
1839 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1840 [PROTECT_HOME_NO] = "no",
1841 [PROTECT_HOME_YES] = "yes",
1842 [PROTECT_HOME_READ_ONLY] = "read-only",
1843 [PROTECT_HOME_TMPFS] = "tmpfs",
1844 };
1845
/* NOTE(review): presumably generates the protect_home string conversion helpers on top of the table
 * above, with PROTECT_HOME_YES as the value a boolean "true" maps to — confirm against the macro
 * definition in string-table.h. */
1846 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
1847
/* String names of the ProtectSystem values, indexed by enum value */
1848 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1849 [PROTECT_SYSTEM_NO] = "no",
1850 [PROTECT_SYSTEM_YES] = "yes",
1851 [PROTECT_SYSTEM_FULL] = "full",
1852 [PROTECT_SYSTEM_STRICT] = "strict",
1853 };
1854
/* NOTE(review): presumably generates the protect_system string conversion helpers on top of the table
 * above, with PROTECT_SYSTEM_YES as the value a boolean "true" maps to — confirm against the macro
 * definition in string-table.h. */
1855 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
1856
/* Short names of the namespace types, indexed by NamespaceType. ns_type_supported() appends the result
 * of namespace_type_to_string() to "/proc/self/ns/" to form the path it probes. */
1857 static const char* const namespace_type_table[] = {
1858 [NAMESPACE_MOUNT] = "mnt",
1859 [NAMESPACE_CGROUP] = "cgroup",
1860 [NAMESPACE_UTS] = "uts",
1861 [NAMESPACE_IPC] = "ipc",
1862 [NAMESPACE_USER] = "user",
1863 [NAMESPACE_PID] = "pid",
1864 [NAMESPACE_NET] = "net",
1865 };
1866
/* NOTE(review): presumably generates namespace_type_to_string() (called by ns_type_supported()) and its
 * from-string counterpart — confirm against the macro definition in string-table.h. */
1867 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);