]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
namespace: always use a root directory when setting up namespace
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6 ***/
7
8 #include <errno.h>
9 #include <sched.h>
10 #include <stdio.h>
11 #include <string.h>
12 #include <sys/mount.h>
13 #include <sys/stat.h>
14 #include <unistd.h>
15 #include <linux/fs.h>
16
17 #include "alloc-util.h"
18 #include "base-filesystem.h"
19 #include "dev-setup.h"
20 #include "fd-util.h"
21 #include "fs-util.h"
22 #include "label.h"
23 #include "loop-util.h"
24 #include "loopback-setup.h"
25 #include "missing.h"
26 #include "mkdir.h"
27 #include "mount-util.h"
28 #include "namespace.h"
29 #include "path-util.h"
30 #include "selinux-util.h"
31 #include "socket-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "umask-util.h"
37 #include "user-util.h"
38 #include "util.h"
39
/* Mount flags for the tmpfs that backs a private /dev; used both when mounting it and when remounting it. */
#define DEV_MOUNT_OPTIONS (MS_NOEXEC|MS_NOSUID|MS_STRICTATIME)
41
/* Kind of mount a MountEntry describes.
 *
 * The numeric order is significant: for two entries on the same path mount_path_compare() sorts the smaller
 * value first, and drop_duplicates() keeps only that first entry. Do not reorder. */
typedef enum MountMode {
        INACCESSIBLE = 0,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
} MountMode;
57
58 typedef struct MountEntry {
59 const char *path_const; /* Memory allocated on stack or static */
60 MountMode mode:5;
61 bool ignore:1; /* Ignore if path does not exist? */
62 bool has_prefix:1; /* Already is prefixed by the root dir? */
63 bool read_only:1; /* Shall this mount point be read-only? */
64 bool applied:1; /* Already applied */
65 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
66 const char *source_const; /* The source path, for bind mounts */
67 char *source_malloc;
68 const char *options_const;/* Mount options for tmpfs */
69 char *options_malloc;
70 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
71 unsigned n_followed;
72 } MountEntry;
73
74 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
75 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
76 static const MountEntry apivfs_table[] = {
77 { "/proc", PROCFS, false },
78 { "/dev", BIND_DEV, false },
79 { "/sys", SYSFS, false },
80 };
81
82 /* ProtectKernelTunables= option and the related filesystem APIs */
83 static const MountEntry protect_kernel_tunables_table[] = {
84 { "/proc/acpi", READONLY, true },
85 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
86 { "/proc/asound", READONLY, true },
87 { "/proc/bus", READONLY, true },
88 { "/proc/fs", READONLY, true },
89 { "/proc/irq", READONLY, true },
90 { "/proc/kallsyms", INACCESSIBLE, true },
91 { "/proc/kcore", INACCESSIBLE, true },
92 { "/proc/latency_stats", READONLY, true },
93 { "/proc/mtrr", READONLY, true },
94 { "/proc/scsi", READONLY, true },
95 { "/proc/sys", READONLY, false },
96 { "/proc/sysrq-trigger", READONLY, true },
97 { "/proc/timer_stats", READONLY, true },
98 { "/sys", READONLY, false },
99 { "/sys/fs/bpf", READONLY, true },
100 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
101 { "/sys/fs/selinux", READWRITE, true },
102 { "/sys/kernel/debug", READONLY, true },
103 { "/sys/kernel/tracing", READONLY, true },
104 };
105
106 /* ProtectKernelModules= option */
107 static const MountEntry protect_kernel_modules_table[] = {
108 #if HAVE_SPLIT_USR
109 { "/lib/modules", INACCESSIBLE, true },
110 #endif
111 { "/usr/lib/modules", INACCESSIBLE, true },
112 };
113
114 /*
115 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
116 * system should be protected by ProtectSystem=
117 */
118 static const MountEntry protect_home_read_only_table[] = {
119 { "/home", READONLY, true },
120 { "/run/user", READONLY, true },
121 { "/root", READONLY, true },
122 };
123
124 /* ProtectHome=tmpfs table */
125 static const MountEntry protect_home_tmpfs_table[] = {
126 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
127 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
128 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
129 };
130
131 /* ProtectHome=yes table */
132 static const MountEntry protect_home_yes_table[] = {
133 { "/home", INACCESSIBLE, true },
134 { "/run/user", INACCESSIBLE, true },
135 { "/root", INACCESSIBLE, true },
136 };
137
138 /* ProtectSystem=yes table */
139 static const MountEntry protect_system_yes_table[] = {
140 { "/usr", READONLY, false },
141 { "/boot", READONLY, true },
142 { "/efi", READONLY, true },
143 #if HAVE_SPLIT_USR
144 { "/lib", READONLY, true },
145 { "/lib64", READONLY, true },
146 { "/bin", READONLY, true },
147 # if HAVE_SPLIT_BIN
148 { "/sbin", READONLY, true },
149 # endif
150 #endif
151 };
152
153 /* ProtectSystem=full includes ProtectSystem=yes */
154 static const MountEntry protect_system_full_table[] = {
155 { "/usr", READONLY, false },
156 { "/boot", READONLY, true },
157 { "/efi", READONLY, true },
158 { "/etc", READONLY, false },
159 #if HAVE_SPLIT_USR
160 { "/lib", READONLY, true },
161 { "/lib64", READONLY, true },
162 { "/bin", READONLY, true },
163 # if HAVE_SPLIT_BIN
164 { "/sbin", READONLY, true },
165 # endif
166 #endif
167 };
168
169 /*
170 * ProtectSystem=strict table. In this strict mode, we mount everything
171 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
172 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
173 * protect those, and these options should be fully orthogonal.
174 * (And of course /home and friends are also left writable, as ProtectHome=
175 * shall manage those, orthogonally).
176 */
177 static const MountEntry protect_system_strict_table[] = {
178 { "/", READONLY, false },
179 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
180 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
181 { "/dev", READWRITE, false }, /* PrivateDevices= */
182 { "/home", READWRITE, true }, /* ProtectHome= */
183 { "/run/user", READWRITE, true }, /* ProtectHome= */
184 { "/root", READWRITE, true }, /* ProtectHome= */
185 };
186
187 static const char *mount_entry_path(const MountEntry *p) {
188 assert(p);
189
190 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
191 * otherwise the stack/static ->path field is returned. */
192
193 return p->path_malloc ?: p->path_const;
194 }
195
196 static bool mount_entry_read_only(const MountEntry *p) {
197 assert(p);
198
199 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
200 }
201
202 static const char *mount_entry_source(const MountEntry *p) {
203 assert(p);
204
205 return p->source_malloc ?: p->source_const;
206 }
207
208 static const char *mount_entry_options(const MountEntry *p) {
209 assert(p);
210
211 return p->options_malloc ?: p->options_const;
212 }
213
214 static void mount_entry_done(MountEntry *p) {
215 assert(p);
216
217 p->path_malloc = mfree(p->path_malloc);
218 p->source_malloc = mfree(p->source_malloc);
219 p->options_malloc = mfree(p->options_malloc);
220 }
221
222 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
223 char **i;
224
225 assert(p);
226
227 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
228
229 STRV_FOREACH(i, strv) {
230 bool ignore = false, needs_prefix = false;
231 const char *e = *i;
232
233 /* Look for any prefixes */
234 if (startswith(e, "-")) {
235 e++;
236 ignore = true;
237 }
238 if (startswith(e, "+")) {
239 e++;
240 needs_prefix = true;
241 }
242
243 if (!path_is_absolute(e))
244 return -EINVAL;
245
246 *((*p)++) = (MountEntry) {
247 .path_const = e,
248 .mode = mode,
249 .ignore = ignore,
250 .has_prefix = !needs_prefix && !forcibly_require_prefix,
251 };
252 }
253
254 return 0;
255 }
256
257 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
258 char **i;
259
260 assert(p);
261
262 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
263 * "/private/" boundary directories for DynamicUser=1. */
264
265 STRV_FOREACH(i, strv) {
266
267 *((*p)++) = (MountEntry) {
268 .path_const = *i,
269 .mode = EMPTY_DIR,
270 .ignore = false,
271 .has_prefix = false,
272 .read_only = true,
273 .options_const = "mode=755",
274 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
275 };
276 }
277
278 return 0;
279 }
280
281 static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
282 size_t i;
283
284 assert(p);
285
286 for (i = 0; i < n; i++) {
287 const BindMount *b = binds + i;
288
289 *((*p)++) = (MountEntry) {
290 .path_const = b->destination,
291 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
292 .read_only = b->read_only,
293 .source_const = b->source,
294 .ignore = b->ignore_enoent,
295 };
296 }
297
298 return 0;
299 }
300
301 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
302 size_t i;
303 int r;
304
305 assert(p);
306
307 for (i = 0; i < n; i++) {
308 const TemporaryFileSystem *t = tmpfs + i;
309 _cleanup_free_ char *o = NULL, *str = NULL;
310 unsigned long flags = MS_NODEV|MS_STRICTATIME;
311 bool ro = false;
312
313 if (!path_is_absolute(t->path))
314 return -EINVAL;
315
316 if (!isempty(t->options)) {
317 str = strjoin("mode=0755,", t->options);
318 if (!str)
319 return -ENOMEM;
320
321 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
322 if (r < 0)
323 return r;
324
325 ro = flags & MS_RDONLY;
326 if (ro)
327 flags ^= MS_RDONLY;
328 }
329
330 *((*p)++) = (MountEntry) {
331 .path_const = t->path,
332 .mode = TMPFS,
333 .read_only = ro,
334 .options_malloc = o,
335 .flags = flags,
336 };
337
338 o = NULL;
339 }
340
341 return 0;
342 }
343
344 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
345 size_t i;
346
347 assert(p);
348 assert(mounts);
349
350 /* Adds a list of static pre-defined entries */
351
352 for (i = 0; i < n; i++)
353 *((*p)++) = (MountEntry) {
354 .path_const = mount_entry_path(mounts+i),
355 .mode = mounts[i].mode,
356 .ignore = mounts[i].ignore || ignore_protect,
357 };
358
359 return 0;
360 }
361
362 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
363 assert(p);
364
365 switch (protect_home) {
366
367 case PROTECT_HOME_NO:
368 return 0;
369
370 case PROTECT_HOME_READ_ONLY:
371 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
372
373 case PROTECT_HOME_TMPFS:
374 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
375
376 case PROTECT_HOME_YES:
377 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
378
379 default:
380 assert_not_reached("Unexpected ProtectHome= value");
381 }
382 }
383
384 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
385 assert(p);
386
387 switch (protect_system) {
388
389 case PROTECT_SYSTEM_NO:
390 return 0;
391
392 case PROTECT_SYSTEM_STRICT:
393 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
394
395 case PROTECT_SYSTEM_YES:
396 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
397
398 case PROTECT_SYSTEM_FULL:
399 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
400
401 default:
402 assert_not_reached("Unexpected ProtectSystem= value");
403 }
404 }
405
406 static int mount_path_compare(const void *a, const void *b) {
407 const MountEntry *p = a, *q = b;
408 int d;
409
410 /* If the paths are not equal, then order prefixes first */
411 d = path_compare(mount_entry_path(p), mount_entry_path(q));
412 if (d != 0)
413 return d;
414
415 /* If the paths are equal, check the mode */
416 if (p->mode < q->mode)
417 return -1;
418 if (p->mode > q->mode)
419 return 1;
420
421 return 0;
422 }
423
424 static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
425 size_t i;
426
427 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
428 * that. */
429
430 if (!root_directory)
431 return 0;
432
433 for (i = 0; i < n; i++) {
434 char *s;
435
436 if (m[i].has_prefix)
437 continue;
438
439 s = prefix_root(root_directory, mount_entry_path(m+i));
440 if (!s)
441 return -ENOMEM;
442
443 free_and_replace(m[i].path_malloc, s);
444 m[i].has_prefix = true;
445 }
446
447 return 0;
448 }
449
450 static void drop_duplicates(MountEntry *m, size_t *n) {
451 MountEntry *f, *t, *previous;
452
453 assert(m);
454 assert(n);
455
456 /* Drops duplicate entries. Expects that the array is properly ordered already. */
457
458 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
459
460 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
461 * above. Note that we only drop duplicates that haven't been mounted yet. */
462 if (previous &&
463 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
464 !f->applied && !previous->applied) {
465 log_debug("%s is duplicate.", mount_entry_path(f));
466 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
467 mount_entry_done(f);
468 continue;
469 }
470
471 *t = *f;
472 previous = t;
473 t++;
474 }
475
476 *n = t - m;
477 }
478
479 static void drop_inaccessible(MountEntry *m, size_t *n) {
480 MountEntry *f, *t;
481 const char *clear = NULL;
482
483 assert(m);
484 assert(n);
485
486 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
487 * ordered already. */
488
489 for (f = m, t = m; f < m + *n; f++) {
490
491 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
492 * it, as inaccessible paths really should drop the entire subtree. */
493 if (clear && path_startswith(mount_entry_path(f), clear)) {
494 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
495 mount_entry_done(f);
496 continue;
497 }
498
499 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
500
501 *t = *f;
502 t++;
503 }
504
505 *n = t - m;
506 }
507
508 static void drop_nop(MountEntry *m, size_t *n) {
509 MountEntry *f, *t;
510
511 assert(m);
512 assert(n);
513
514 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
515 * list is ordered by prefixes. */
516
517 for (f = m, t = m; f < m + *n; f++) {
518
519 /* Only suppress such subtrees for READONLY and READWRITE entries */
520 if (IN_SET(f->mode, READONLY, READWRITE)) {
521 MountEntry *p;
522 bool found = false;
523
524 /* Now let's find the first parent of the entry we are looking at. */
525 for (p = t-1; p >= m; p--) {
526 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
527 found = true;
528 break;
529 }
530 }
531
532 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
533 if (found && p->mode == f->mode) {
534 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
535 mount_entry_done(f);
536 continue;
537 }
538 }
539
540 *t = *f;
541 t++;
542 }
543
544 *n = t - m;
545 }
546
547 static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
548 MountEntry *f, *t;
549
550 assert(m);
551 assert(n);
552
553 /* Nothing to do */
554 if (!root_directory)
555 return;
556
557 /* Drops all mounts that are outside of the root directory. */
558
559 for (f = m, t = m; f < m + *n; f++) {
560
561 if (!path_startswith(mount_entry_path(f), root_directory)) {
562 log_debug("%s is outside of root directory.", mount_entry_path(f));
563 mount_entry_done(f);
564 continue;
565 }
566
567 *t = *f;
568 t++;
569 }
570
571 *n = t - m;
572 }
573
574 static int clone_device_node(const char *d, const char *temporary_mount, bool *make_devnode) {
575 const char *dn;
576 struct stat st;
577 int r;
578
579 if (stat(d, &st) < 0) {
580 if (errno == ENOENT)
581 return -ENXIO;
582 return -errno;
583 }
584
585 if (!S_ISBLK(st.st_mode) &&
586 !S_ISCHR(st.st_mode))
587 return -EINVAL;
588
589 if (st.st_rdev == 0)
590 return -ENXIO;
591
592 dn = strjoina(temporary_mount, d);
593
594 if (*make_devnode) {
595 mac_selinux_create_file_prepare(d, st.st_mode);
596 r = mknod(dn, st.st_mode, st.st_rdev);
597 mac_selinux_create_file_clear();
598
599 if (r == 0)
600 return 0;
601 if (errno != EPERM)
602 return log_debug_errno(errno, "mknod failed for %s: %m", d);
603
604 *make_devnode = false;
605 }
606
607 /* We're about to fallback to bind-mounting the device
608 * node. So create a dummy bind-mount target. */
609 mac_selinux_create_file_prepare(d, 0);
610 r = mknod(dn, S_IFREG, 0);
611 mac_selinux_create_file_clear();
612
613 if (r < 0 && errno != EEXIST)
614 return log_debug_errno(errno, "mknod fallback failed for %s: %m", d);
615
616 /* Fallback to bind-mounting:
617 * The assumption here is that all used device nodes carry standard
618 * properties. Specifically, the devices nodes we bind-mount should
619 * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
620 * and should not carry ACLs. */
621 if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
622 return log_debug_errno(errno, "mount failed for %s: %m", d);
623
624 return 0;
625 }
626
627 static int mount_private_dev(MountEntry *m) {
628 static const char devnodes[] =
629 "/dev/null\0"
630 "/dev/zero\0"
631 "/dev/full\0"
632 "/dev/random\0"
633 "/dev/urandom\0"
634 "/dev/tty\0";
635
636 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
637 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
638 bool can_mknod = true;
639 _cleanup_umask_ mode_t u;
640 int r;
641
642 assert(m);
643
644 u = umask(0000);
645
646 if (!mkdtemp(temporary_mount))
647 return -errno;
648
649 dev = strjoina(temporary_mount, "/dev");
650 (void) mkdir(dev, 0755);
651 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
652 r = -errno;
653 goto fail;
654 }
655
656 devpts = strjoina(temporary_mount, "/dev/pts");
657 (void) mkdir(devpts, 0755);
658 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
659 r = -errno;
660 goto fail;
661 }
662
663 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
664 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
665 * thus, in that case make a clone
666 *
667 * in nspawn and other containers it will be a symlink, in that case make it a symlink
668 */
669 r = is_symlink("/dev/ptmx");
670 if (r < 0)
671 goto fail;
672 if (r > 0) {
673 devptmx = strjoina(temporary_mount, "/dev/ptmx");
674 if (symlink("pts/ptmx", devptmx) < 0) {
675 r = -errno;
676 goto fail;
677 }
678 } else {
679 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
680 if (r < 0)
681 goto fail;
682 }
683
684 devshm = strjoina(temporary_mount, "/dev/shm");
685 (void) mkdir(devshm, 0755);
686 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
687 if (r < 0) {
688 r = -errno;
689 goto fail;
690 }
691
692 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
693 (void) mkdir(devmqueue, 0755);
694 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
695
696 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
697 (void) mkdir(devhugepages, 0755);
698 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
699
700 devlog = strjoina(temporary_mount, "/dev/log");
701 (void) symlink("/run/systemd/journal/dev-log", devlog);
702
703 NULSTR_FOREACH(d, devnodes) {
704 r = clone_device_node(d, temporary_mount, &can_mknod);
705 /* ENXIO means the the *source* is not a device file, skip creation in that case */
706 if (r < 0 && r != -ENXIO)
707 goto fail;
708 }
709
710 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
711
712 /* Create the /dev directory if missing. It is more likely to be
713 * missing when the service is started with RootDirectory. This is
714 * consistent with mount units creating the mount points when missing.
715 */
716 (void) mkdir_p_label(mount_entry_path(m), 0755);
717
718 /* Unmount everything in old /dev */
719 umount_recursive(mount_entry_path(m), 0);
720 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
721 r = -errno;
722 goto fail;
723 }
724
725 rmdir(dev);
726 rmdir(temporary_mount);
727
728 return 0;
729
730 fail:
731 if (devpts)
732 umount(devpts);
733
734 if (devshm)
735 umount(devshm);
736
737 if (devhugepages)
738 umount(devhugepages);
739
740 if (devmqueue)
741 umount(devmqueue);
742
743 umount(dev);
744 rmdir(dev);
745 rmdir(temporary_mount);
746
747 return r;
748 }
749
750 static int mount_bind_dev(const MountEntry *m) {
751 int r;
752
753 assert(m);
754
755 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
756 * /dev. This is only used when RootDirectory= is set. */
757
758 (void) mkdir_p_label(mount_entry_path(m), 0755);
759
760 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
761 if (r < 0)
762 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
763 if (r > 0) /* make this a NOP if /dev is already a mount point */
764 return 0;
765
766 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
767 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
768
769 return 1;
770 }
771
772 static int mount_sysfs(const MountEntry *m) {
773 int r;
774
775 assert(m);
776
777 (void) mkdir_p_label(mount_entry_path(m), 0755);
778
779 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
780 if (r < 0)
781 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
782 if (r > 0) /* make this a NOP if /sys is already a mount point */
783 return 0;
784
785 /* Bind mount the host's version so that we get all child mounts of it, too. */
786 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
787 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
788
789 return 1;
790 }
791
792 static int mount_procfs(const MountEntry *m) {
793 int r;
794
795 assert(m);
796
797 (void) mkdir_p_label(mount_entry_path(m), 0755);
798
799 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
800 if (r < 0)
801 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
802 if (r > 0) /* make this a NOP if /proc is already a mount point */
803 return 0;
804
805 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
806 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
807 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
808
809 return 1;
810 }
811
812 static int mount_tmpfs(const MountEntry *m) {
813 assert(m);
814
815 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
816
817 (void) mkdir_p_label(mount_entry_path(m), 0755);
818 (void) umount_recursive(mount_entry_path(m), 0);
819
820 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
821 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
822
823 return 1;
824 }
825
826 static int follow_symlink(
827 const char *root_directory,
828 MountEntry *m) {
829
830 _cleanup_free_ char *target = NULL;
831 int r;
832
833 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
834 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
835 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
836 * end and already have a fully normalized name. */
837
838 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target);
839 if (r < 0)
840 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
841 if (r > 0) /* Reached the end, nothing more to resolve */
842 return 1;
843
844 if (m->n_followed >= CHASE_SYMLINKS_MAX) { /* put a boundary on things */
845 log_debug("Symlink loop on '%s'.", mount_entry_path(m));
846 return -ELOOP;
847 }
848
849 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
850
851 free_and_replace(m->path_malloc, target);
852 m->has_prefix = true;
853
854 m->n_followed ++;
855
856 return 0;
857 }
858
859 static int apply_mount(
860 const char *root_directory,
861 MountEntry *m) {
862
863 bool rbind = true, make = false;
864 const char *what;
865 int r;
866
867 assert(m);
868
869 log_debug("Applying namespace mount on %s", mount_entry_path(m));
870
871 switch (m->mode) {
872
873 case INACCESSIBLE: {
874 struct stat target;
875
876 /* First, get rid of everything that is below if there
877 * is anything... Then, overmount it with an
878 * inaccessible path. */
879 (void) umount_recursive(mount_entry_path(m), 0);
880
881 if (lstat(mount_entry_path(m), &target) < 0) {
882 if (errno == ENOENT && m->ignore)
883 return 0;
884
885 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
886 }
887
888 what = mode_to_inaccessible_node(target.st_mode);
889 if (!what) {
890 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
891 return -ELOOP;
892 }
893 break;
894 }
895
896 case READONLY:
897 case READWRITE:
898 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
899 if (r == -ENOENT && m->ignore)
900 return 0;
901 if (r < 0)
902 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
903 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
904 return 0;
905 /* This isn't a mount point yet, let's make it one. */
906 what = mount_entry_path(m);
907 break;
908
909 case BIND_MOUNT:
910 rbind = false;
911
912 _fallthrough_;
913 case BIND_MOUNT_RECURSIVE: {
914 _cleanup_free_ char *chased = NULL;
915
916 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
917 * mount source paths are always relative to the host root, hence we pass NULL as root directory to
918 * chase_symlinks() here. */
919
920 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
921 if (r == -ENOENT && m->ignore) {
922 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
923 return 0;
924 }
925 if (r < 0)
926 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
927
928 log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
929
930 free_and_replace(m->source_malloc, chased);
931
932 what = mount_entry_source(m);
933 make = true;
934 break;
935 }
936
937 case EMPTY_DIR:
938 case TMPFS:
939 return mount_tmpfs(m);
940
941 case PRIVATE_TMP:
942 what = mount_entry_source(m);
943 make = true;
944 break;
945
946 case PRIVATE_DEV:
947 return mount_private_dev(m);
948
949 case BIND_DEV:
950 return mount_bind_dev(m);
951
952 case SYSFS:
953 return mount_sysfs(m);
954
955 case PROCFS:
956 return mount_procfs(m);
957
958 default:
959 assert_not_reached("Unknown mode");
960 }
961
962 assert(what);
963
964 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
965 bool try_again = false;
966 r = -errno;
967
968 if (r == -ENOENT && make) {
969 struct stat st;
970
971 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
972
973 if (stat(what, &st) < 0)
974 log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
975 else {
976 int q;
977
978 (void) mkdir_parents(mount_entry_path(m), 0755);
979
980 if (S_ISDIR(st.st_mode))
981 q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
982 else
983 q = touch(mount_entry_path(m));
984
985 if (q < 0)
986 log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
987 else
988 try_again = true;
989 }
990 }
991
992 if (try_again) {
993 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
994 r = -errno;
995 else
996 r = 0;
997 }
998
999 if (r < 0)
1000 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
1001 }
1002
1003 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1004 return 0;
1005 }
1006
1007 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
1008 int r = 0;
1009
1010 assert(m);
1011 assert(proc_self_mountinfo);
1012
1013 if (mount_entry_read_only(m)) {
1014 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
1015 /* Make superblock readonly */
1016 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
1017 r = -errno;
1018 } else
1019 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
1020 } else if (m->mode == PRIVATE_DEV) {
1021 /* Superblock can be readonly but the submounts can't */
1022 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
1023 r = -errno;
1024 } else
1025 return 0;
1026
1027 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
1028 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
1029 * read-only mounts already applied. */
1030
1031 if (r == -ENOENT && m->ignore)
1032 r = 0;
1033
1034 return r;
1035 }
1036
1037 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
1038 assert(ns_info);
1039
1040 /*
1041 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1042 * since to protect the API VFS mounts, they need to be around in the
1043 * first place... and RootDirectory= or RootImage= need to be set.
1044 */
1045
1046 /* root_directory should point to a mount point */
1047 return root_directory &&
1048 (ns_info->mount_apivfs ||
1049 ns_info->protect_control_groups ||
1050 ns_info->protect_kernel_tunables);
1051 }
1052
1053 static size_t namespace_calculate_mounts(
1054 const char* root_directory,
1055 const NamespaceInfo *ns_info,
1056 char** read_write_paths,
1057 char** read_only_paths,
1058 char** inaccessible_paths,
1059 char** empty_directories,
1060 size_t n_bind_mounts,
1061 size_t n_temporary_filesystems,
1062 const char* tmp_dir,
1063 const char* var_tmp_dir,
1064 ProtectHome protect_home,
1065 ProtectSystem protect_system) {
1066
1067 size_t protect_home_cnt;
1068 size_t protect_system_cnt =
1069 (protect_system == PROTECT_SYSTEM_STRICT ?
1070 ELEMENTSOF(protect_system_strict_table) :
1071 ((protect_system == PROTECT_SYSTEM_FULL) ?
1072 ELEMENTSOF(protect_system_full_table) :
1073 ((protect_system == PROTECT_SYSTEM_YES) ?
1074 ELEMENTSOF(protect_system_yes_table) : 0)));
1075
1076 protect_home_cnt =
1077 (protect_home == PROTECT_HOME_YES ?
1078 ELEMENTSOF(protect_home_yes_table) :
1079 ((protect_home == PROTECT_HOME_READ_ONLY) ?
1080 ELEMENTSOF(protect_home_read_only_table) :
1081 ((protect_home == PROTECT_HOME_TMPFS) ?
1082 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1083
1084 return !!tmp_dir + !!var_tmp_dir +
1085 strv_length(read_write_paths) +
1086 strv_length(read_only_paths) +
1087 strv_length(inaccessible_paths) +
1088 strv_length(empty_directories) +
1089 n_bind_mounts +
1090 n_temporary_filesystems +
1091 ns_info->private_dev +
1092 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1093 (ns_info->protect_control_groups ? 1 : 0) +
1094 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1095 protect_home_cnt + protect_system_cnt +
1096 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1097 }
1098
1099 static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
1100 assert(n_mounts);
1101 assert(mounts || *n_mounts == 0);
1102
1103 qsort_safe(mounts, *n_mounts, sizeof(MountEntry), mount_path_compare);
1104
1105 drop_duplicates(mounts, n_mounts);
1106 drop_outside_root(root_directory, mounts, n_mounts);
1107 drop_inaccessible(mounts, n_mounts);
1108 drop_nop(mounts, n_mounts);
1109 }
1110
1111 int setup_namespace(
1112 const char* root_directory,
1113 const char* root_image,
1114 const NamespaceInfo *ns_info,
1115 char** read_write_paths,
1116 char** read_only_paths,
1117 char** inaccessible_paths,
1118 char** empty_directories,
1119 const BindMount *bind_mounts,
1120 size_t n_bind_mounts,
1121 const TemporaryFileSystem *temporary_filesystems,
1122 size_t n_temporary_filesystems,
1123 const char* tmp_dir,
1124 const char* var_tmp_dir,
1125 ProtectHome protect_home,
1126 ProtectSystem protect_system,
1127 unsigned long mount_flags,
1128 DissectImageFlags dissect_image_flags) {
1129
1130 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1131 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1132 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1133 _cleanup_free_ void *root_hash = NULL;
1134 MountEntry *m, *mounts = NULL;
1135 size_t root_hash_size = 0;
1136 const char *root;
1137 size_t n_mounts;
1138 bool make_slave;
1139 bool require_prefix = false;
1140 int r = 0;
1141
1142 assert(ns_info);
1143
1144 if (mount_flags == 0)
1145 mount_flags = MS_SHARED;
1146
1147 if (root_image) {
1148 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1149
1150 if (protect_system == PROTECT_SYSTEM_STRICT &&
1151 protect_home != PROTECT_HOME_NO &&
1152 strv_isempty(read_write_paths))
1153 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1154
1155 r = loop_device_make_by_path(root_image,
1156 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1157 &loop_device);
1158 if (r < 0)
1159 return r;
1160
1161 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1162 if (r < 0)
1163 return r;
1164
1165 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1166 if (r < 0)
1167 return r;
1168
1169 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1170 if (r < 0)
1171 return r;
1172 }
1173
1174 if (root_directory)
1175 root = root_directory;
1176 else {
1177 /* Always create the mount namespace in a temporary directory, instead of operating
1178 * directly in the root. The temporary directory prevents any mounts from being
1179 * potentially obscured my other mounts we already applied.
1180 * We use the same mount point for all images, which is safe, since they all live
1181 * in their own namespaces after all, and hence won't see each other. */
1182
1183 root = "/run/systemd/unit-root";
1184 (void) mkdir_label(root, 0700);
1185 require_prefix = true;
1186 }
1187
1188 n_mounts = namespace_calculate_mounts(
1189 root,
1190 ns_info,
1191 read_write_paths,
1192 read_only_paths,
1193 inaccessible_paths,
1194 empty_directories,
1195 n_bind_mounts,
1196 n_temporary_filesystems,
1197 tmp_dir, var_tmp_dir,
1198 protect_home, protect_system);
1199
1200 /* Set mount slave mode */
1201 make_slave = root || n_mounts > 0 || ns_info->private_mounts;
1202
1203 if (n_mounts > 0) {
1204 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1205 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1206 if (r < 0)
1207 goto finish;
1208
1209 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1210 if (r < 0)
1211 goto finish;
1212
1213 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1214 if (r < 0)
1215 goto finish;
1216
1217 r = append_empty_dir_mounts(&m, empty_directories);
1218 if (r < 0)
1219 goto finish;
1220
1221 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1222 if (r < 0)
1223 goto finish;
1224
1225 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1226 if (r < 0)
1227 goto finish;
1228
1229 if (tmp_dir) {
1230 *(m++) = (MountEntry) {
1231 .path_const = "/tmp",
1232 .mode = PRIVATE_TMP,
1233 .source_const = tmp_dir,
1234 };
1235 }
1236
1237 if (var_tmp_dir) {
1238 *(m++) = (MountEntry) {
1239 .path_const = "/var/tmp",
1240 .mode = PRIVATE_TMP,
1241 .source_const = var_tmp_dir,
1242 };
1243 }
1244
1245 if (ns_info->private_dev) {
1246 *(m++) = (MountEntry) {
1247 .path_const = "/dev",
1248 .mode = PRIVATE_DEV,
1249 };
1250 }
1251
1252 if (ns_info->protect_kernel_tunables) {
1253 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1254 if (r < 0)
1255 goto finish;
1256 }
1257
1258 if (ns_info->protect_kernel_modules) {
1259 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1260 if (r < 0)
1261 goto finish;
1262 }
1263
1264 if (ns_info->protect_control_groups) {
1265 *(m++) = (MountEntry) {
1266 .path_const = "/sys/fs/cgroup",
1267 .mode = READONLY,
1268 };
1269 }
1270
1271 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1272 if (r < 0)
1273 goto finish;
1274
1275 r = append_protect_system(&m, protect_system, false);
1276 if (r < 0)
1277 goto finish;
1278
1279 if (namespace_info_mount_apivfs(root, ns_info)) {
1280 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1281 if (r < 0)
1282 goto finish;
1283 }
1284
1285 assert(mounts + n_mounts == m);
1286
1287 /* Prepend the root directory where that's necessary */
1288 r = prefix_where_needed(mounts, n_mounts, root);
1289 if (r < 0)
1290 goto finish;
1291
1292 normalize_mounts(root_directory, mounts, &n_mounts);
1293 }
1294
1295 if (unshare(CLONE_NEWNS) < 0) {
1296 r = -errno;
1297 goto finish;
1298 }
1299
1300 if (make_slave) {
1301 /* Remount / as SLAVE so that nothing now mounted in the namespace
1302 shows up in the parent */
1303 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1304 r = -errno;
1305 goto finish;
1306 }
1307 }
1308
1309 if (root_image) {
1310 /* A root image is specified, mount it to the right place */
1311 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1312 if (r < 0)
1313 goto finish;
1314
1315 if (decrypted_image) {
1316 r = decrypted_image_relinquish(decrypted_image);
1317 if (r < 0)
1318 goto finish;
1319 }
1320
1321 loop_device_relinquish(loop_device);
1322
1323 } else if (root_directory) {
1324
1325 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1326 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1327 if (r < 0)
1328 goto finish;
1329 if (r == 0) {
1330 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1331 r = -errno;
1332 goto finish;
1333 }
1334 }
1335
1336 } else if (root) {
1337
1338 /* Let's mount the main root directory to the root directory to use */
1339 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1340 r = -errno;
1341 goto finish;
1342 }
1343 }
1344
1345 /* Try to set up the new root directory before mounting anything else there. */
1346 if (root_image || root_directory)
1347 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1348
1349 if (n_mounts > 0) {
1350 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1351 char **blacklist;
1352 size_t j;
1353
1354 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1355 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1356 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1357 if (!proc_self_mountinfo) {
1358 r = -errno;
1359 goto finish;
1360 }
1361
1362 /* First round, establish all mounts we need */
1363 for (;;) {
1364 bool again = false;
1365
1366 for (m = mounts; m < mounts + n_mounts; ++m) {
1367
1368 if (m->applied)
1369 continue;
1370
1371 r = follow_symlink(root, m);
1372 if (r < 0)
1373 goto finish;
1374 if (r == 0) {
1375 /* We hit a symlinked mount point. The entry got rewritten and might point to a
1376 * very different place now. Let's normalize the changed list, and start from
1377 * the beginning. After all to mount the entry at the new location we might
1378 * need some other mounts first */
1379 again = true;
1380 break;
1381 }
1382
1383 r = apply_mount(root, m);
1384 if (r < 0)
1385 goto finish;
1386
1387 m->applied = true;
1388 }
1389
1390 if (!again)
1391 break;
1392
1393 normalize_mounts(root_directory, mounts, &n_mounts);
1394 }
1395
1396 /* Create a blacklist we can pass to bind_mount_recursive() */
1397 blacklist = newa(char*, n_mounts+1);
1398 for (j = 0; j < n_mounts; j++)
1399 blacklist[j] = (char*) mount_entry_path(mounts+j);
1400 blacklist[j] = NULL;
1401
1402 /* Second round, flip the ro bits if necessary. */
1403 for (m = mounts; m < mounts + n_mounts; ++m) {
1404 r = make_read_only(m, blacklist, proc_self_mountinfo);
1405 if (r < 0)
1406 goto finish;
1407 }
1408 }
1409
1410 if (root) {
1411 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1412 r = mount_move_root(root);
1413 if (r < 0)
1414 goto finish;
1415 }
1416
1417 /* Remount / as the desired mode. Note that this will not
1418 * reestablish propagation from our side to the host, since
1419 * what's disconnected is disconnected. */
1420 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1421 r = -errno;
1422 goto finish;
1423 }
1424
1425 r = 0;
1426
1427 finish:
1428 for (m = mounts; m < mounts + n_mounts; m++)
1429 mount_entry_done(m);
1430
1431 return r;
1432 }
1433
1434 void bind_mount_free_many(BindMount *b, size_t n) {
1435 size_t i;
1436
1437 assert(b || n == 0);
1438
1439 for (i = 0; i < n; i++) {
1440 free(b[i].source);
1441 free(b[i].destination);
1442 }
1443
1444 free(b);
1445 }
1446
1447 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
1448 _cleanup_free_ char *s = NULL, *d = NULL;
1449 BindMount *c;
1450
1451 assert(b);
1452 assert(n);
1453 assert(item);
1454
1455 s = strdup(item->source);
1456 if (!s)
1457 return -ENOMEM;
1458
1459 d = strdup(item->destination);
1460 if (!d)
1461 return -ENOMEM;
1462
1463 c = reallocarray(*b, *n + 1, sizeof(BindMount));
1464 if (!c)
1465 return -ENOMEM;
1466
1467 *b = c;
1468
1469 c[(*n) ++] = (BindMount) {
1470 .source = TAKE_PTR(s),
1471 .destination = TAKE_PTR(d),
1472 .read_only = item->read_only,
1473 .recursive = item->recursive,
1474 .ignore_enoent = item->ignore_enoent,
1475 };
1476
1477 return 0;
1478 }
1479
1480 void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
1481 size_t i;
1482
1483 assert(t || n == 0);
1484
1485 for (i = 0; i < n; i++) {
1486 free(t[i].path);
1487 free(t[i].options);
1488 }
1489
1490 free(t);
1491 }
1492
1493 int temporary_filesystem_add(
1494 TemporaryFileSystem **t,
1495 size_t *n,
1496 const char *path,
1497 const char *options) {
1498
1499 _cleanup_free_ char *p = NULL, *o = NULL;
1500 TemporaryFileSystem *c;
1501
1502 assert(t);
1503 assert(n);
1504 assert(path);
1505
1506 p = strdup(path);
1507 if (!p)
1508 return -ENOMEM;
1509
1510 if (!isempty(options)) {
1511 o = strdup(options);
1512 if (!o)
1513 return -ENOMEM;
1514 }
1515
1516 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1517 if (!c)
1518 return -ENOMEM;
1519
1520 *t = c;
1521
1522 c[(*n) ++] = (TemporaryFileSystem) {
1523 .path = TAKE_PTR(p),
1524 .options = TAKE_PTR(o),
1525 };
1526
1527 return 0;
1528 }
1529
1530 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1531 _cleanup_free_ char *x = NULL;
1532 char bid[SD_ID128_STRING_MAX];
1533 sd_id128_t boot_id;
1534 int r;
1535
1536 assert(id);
1537 assert(prefix);
1538 assert(path);
1539
1540 /* We include the boot id in the directory so that after a
1541 * reboot we can easily identify obsolete directories. */
1542
1543 r = sd_id128_get_boot(&boot_id);
1544 if (r < 0)
1545 return r;
1546
1547 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1548 if (!x)
1549 return -ENOMEM;
1550
1551 RUN_WITH_UMASK(0077)
1552 if (!mkdtemp(x))
1553 return -errno;
1554
1555 RUN_WITH_UMASK(0000) {
1556 char *y;
1557
1558 y = strjoina(x, "/tmp");
1559
1560 if (mkdir(y, 0777 | S_ISVTX) < 0)
1561 return -errno;
1562 }
1563
1564 *path = TAKE_PTR(x);
1565
1566 return 0;
1567 }
1568
/* Creates the pair of private temporary directories for 'id', one below /tmp and one below
 * /var/tmp, using setup_one_tmp_dir().
 *
 * On success the two paths are returned in *tmp_dir and *var_tmp_dir. If the second directory
 * cannot be created, the first one (including its "tmp" sub-directory) is removed again and its
 * path string is freed; the output parameters are left untouched in that case.
 *
 * Returns 0 on success, a negative errno-style error code on failure. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second directory failed: roll back the first one, innermost directory first */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1598
1599 int setup_netns(int netns_storage_socket[2]) {
1600 _cleanup_close_ int netns = -1;
1601 int r, q;
1602
1603 assert(netns_storage_socket);
1604 assert(netns_storage_socket[0] >= 0);
1605 assert(netns_storage_socket[1] >= 0);
1606
1607 /* We use the passed socketpair as a storage buffer for our
1608 * namespace reference fd. Whatever process runs this first
1609 * shall create a new namespace, all others should just join
1610 * it. To serialize that we use a file lock on the socket
1611 * pair.
1612 *
1613 * It's a bit crazy, but hey, works great! */
1614
1615 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1616 return -errno;
1617
1618 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1619 if (netns == -EAGAIN) {
1620 /* Nothing stored yet, so let's create a new namespace */
1621
1622 if (unshare(CLONE_NEWNET) < 0) {
1623 r = -errno;
1624 goto fail;
1625 }
1626
1627 loopback_setup();
1628
1629 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1630 if (netns < 0) {
1631 r = -errno;
1632 goto fail;
1633 }
1634
1635 r = 1;
1636
1637 } else if (netns < 0) {
1638 r = netns;
1639 goto fail;
1640
1641 } else {
1642 /* Yay, found something, so let's join the namespace */
1643 if (setns(netns, CLONE_NEWNET) < 0) {
1644 r = -errno;
1645 goto fail;
1646 }
1647
1648 r = 0;
1649 }
1650
1651 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1652 if (q < 0) {
1653 r = q;
1654 goto fail;
1655 }
1656
1657 fail:
1658 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1659 return r;
1660 }
1661
1662 bool ns_type_supported(NamespaceType type) {
1663 const char *t, *ns_proc;
1664
1665 t = namespace_type_to_string(type);
1666 if (!t) /* Don't know how to translate this? Then it's not supported */
1667 return false;
1668
1669 ns_proc = strjoina("/proc/self/ns/", t);
1670 return access(ns_proc, F_OK) == 0;
1671 }
1672
1673 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1674 [PROTECT_HOME_NO] = "no",
1675 [PROTECT_HOME_YES] = "yes",
1676 [PROTECT_HOME_READ_ONLY] = "read-only",
1677 [PROTECT_HOME_TMPFS] = "tmpfs",
1678 };
1679
1680 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1681
1682 ProtectHome protect_home_or_bool_from_string(const char *s) {
1683 int r;
1684
1685 r = parse_boolean(s);
1686 if (r > 0)
1687 return PROTECT_HOME_YES;
1688 if (r == 0)
1689 return PROTECT_HOME_NO;
1690
1691 return protect_home_from_string(s);
1692 }
1693
1694 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1695 [PROTECT_SYSTEM_NO] = "no",
1696 [PROTECT_SYSTEM_YES] = "yes",
1697 [PROTECT_SYSTEM_FULL] = "full",
1698 [PROTECT_SYSTEM_STRICT] = "strict",
1699 };
1700
1701 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1702
1703 ProtectSystem protect_system_or_bool_from_string(const char *s) {
1704 int r;
1705
1706 r = parse_boolean(s);
1707 if (r > 0)
1708 return PROTECT_SYSTEM_YES;
1709 if (r == 0)
1710 return PROTECT_SYSTEM_NO;
1711
1712 return protect_system_from_string(s);
1713 }
1714
1715 static const char* const namespace_type_table[] = {
1716 [NAMESPACE_MOUNT] = "mnt",
1717 [NAMESPACE_CGROUP] = "cgroup",
1718 [NAMESPACE_UTS] = "uts",
1719 [NAMESPACE_IPC] = "ipc",
1720 [NAMESPACE_USER] = "user",
1721 [NAMESPACE_PID] = "pid",
1722 [NAMESPACE_NET] = "net",
1723 };
1724
1725 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);