]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
Merge pull request #8754 from poettering/sysusers-fix
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6 ***/
7
8 #include <errno.h>
9 #include <sched.h>
10 #include <stdio.h>
11 #include <string.h>
12 #include <sys/mount.h>
13 #include <sys/stat.h>
14 #include <unistd.h>
15 #include <linux/fs.h>
16
17 #include "alloc-util.h"
18 #include "base-filesystem.h"
19 #include "dev-setup.h"
20 #include "fd-util.h"
21 #include "fs-util.h"
22 #include "label.h"
23 #include "loop-util.h"
24 #include "loopback-setup.h"
25 #include "missing.h"
26 #include "mkdir.h"
27 #include "mount-util.h"
28 #include "namespace.h"
29 #include "path-util.h"
30 #include "selinux-util.h"
31 #include "socket-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "umask-util.h"
37 #include "user-util.h"
38 #include "util.h"
39
/* Mount flags used for the private /dev tmpfs, both when mounting it and when remounting it read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
41
/* The kinds of mount operations a MountEntry can request. */
typedef enum MountMode {
        /* Keep the declaration order: mount_path_compare() uses the numeric value as the priority when two entries
         * share the same path, the smaller value sorting first. */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
} MountMode;
57
58 typedef struct MountEntry {
59 const char *path_const; /* Memory allocated on stack or static */
60 MountMode mode:5;
61 bool ignore:1; /* Ignore if path does not exist? */
62 bool has_prefix:1; /* Already is prefixed by the root dir? */
63 bool read_only:1; /* Shall this mount point be read-only? */
64 bool applied:1; /* Already applied */
65 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
66 const char *source_const; /* The source path, for bind mounts */
67 char *source_malloc;
68 const char *options_const;/* Mount options for tmpfs */
69 char *options_malloc;
70 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
71 unsigned n_followed;
72 } MountEntry;
73
74 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
75 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
76 static const MountEntry apivfs_table[] = {
77 { "/proc", PROCFS, false },
78 { "/dev", BIND_DEV, false },
79 { "/sys", SYSFS, false },
80 };
81
82 /* ProtectKernelTunables= option and the related filesystem APIs */
83 static const MountEntry protect_kernel_tunables_table[] = {
84 { "/proc/sys", READONLY, false },
85 { "/proc/sysrq-trigger", READONLY, true },
86 { "/proc/latency_stats", READONLY, true },
87 { "/proc/mtrr", READONLY, true },
88 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
89 { "/proc/acpi", READONLY, true },
90 { "/proc/timer_stats", READONLY, true },
91 { "/proc/asound", READONLY, true },
92 { "/proc/bus", READONLY, true },
93 { "/proc/fs", READONLY, true },
94 { "/proc/irq", READONLY, true },
95 { "/sys", READONLY, false },
96 { "/sys/kernel/debug", READONLY, true },
97 { "/sys/kernel/tracing", READONLY, true },
98 { "/sys/fs/bpf", READONLY, true },
99 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
100 { "/sys/fs/selinux", READWRITE, true },
101 };
102
103 /* ProtectKernelModules= option */
104 static const MountEntry protect_kernel_modules_table[] = {
105 #if HAVE_SPLIT_USR
106 { "/lib/modules", INACCESSIBLE, true },
107 #endif
108 { "/usr/lib/modules", INACCESSIBLE, true },
109 };
110
111 /*
112 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
113 * system should be protected by ProtectSystem=
114 */
115 static const MountEntry protect_home_read_only_table[] = {
116 { "/home", READONLY, true },
117 { "/run/user", READONLY, true },
118 { "/root", READONLY, true },
119 };
120
121 /* ProtectHome=tmpfs table */
122 static const MountEntry protect_home_tmpfs_table[] = {
123 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
124 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
125 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
126 };
127
128 /* ProtectHome=yes table */
129 static const MountEntry protect_home_yes_table[] = {
130 { "/home", INACCESSIBLE, true },
131 { "/run/user", INACCESSIBLE, true },
132 { "/root", INACCESSIBLE, true },
133 };
134
135 /* ProtectSystem=yes table */
136 static const MountEntry protect_system_yes_table[] = {
137 { "/usr", READONLY, false },
138 { "/boot", READONLY, true },
139 { "/efi", READONLY, true },
140 #if HAVE_SPLIT_USR
141 { "/lib", READONLY, true },
142 { "/lib64", READONLY, true },
143 { "/bin", READONLY, true },
144 # if HAVE_SPLIT_BIN
145 { "/sbin", READONLY, true },
146 # endif
147 #endif
148 };
149
150 /* ProtectSystem=full includes ProtectSystem=yes */
151 static const MountEntry protect_system_full_table[] = {
152 { "/usr", READONLY, false },
153 { "/boot", READONLY, true },
154 { "/efi", READONLY, true },
155 { "/etc", READONLY, false },
156 #if HAVE_SPLIT_USR
157 { "/lib", READONLY, true },
158 { "/lib64", READONLY, true },
159 { "/bin", READONLY, true },
160 # if HAVE_SPLIT_BIN
161 { "/sbin", READONLY, true },
162 # endif
163 #endif
164 };
165
166 /*
167 * ProtectSystem=strict table. In this strict mode, we mount everything
168 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
169 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
170 * protect those, and these options should be fully orthogonal.
171 * (And of course /home and friends are also left writable, as ProtectHome=
172 * shall manage those, orthogonally).
173 */
174 static const MountEntry protect_system_strict_table[] = {
175 { "/", READONLY, false },
176 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
177 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
178 { "/dev", READWRITE, false }, /* PrivateDevices= */
179 { "/home", READWRITE, true }, /* ProtectHome= */
180 { "/run/user", READWRITE, true }, /* ProtectHome= */
181 { "/root", READWRITE, true }, /* ProtectHome= */
182 };
183
184 static const char *mount_entry_path(const MountEntry *p) {
185 assert(p);
186
187 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
188 * otherwise the stack/static ->path field is returned. */
189
190 return p->path_malloc ?: p->path_const;
191 }
192
193 static bool mount_entry_read_only(const MountEntry *p) {
194 assert(p);
195
196 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
197 }
198
199 static const char *mount_entry_source(const MountEntry *p) {
200 assert(p);
201
202 return p->source_malloc ?: p->source_const;
203 }
204
205 static const char *mount_entry_options(const MountEntry *p) {
206 assert(p);
207
208 return p->options_malloc ?: p->options_const;
209 }
210
211 static void mount_entry_done(MountEntry *p) {
212 assert(p);
213
214 p->path_malloc = mfree(p->path_malloc);
215 p->source_malloc = mfree(p->source_malloc);
216 p->options_malloc = mfree(p->options_malloc);
217 }
218
219 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
220 char **i;
221
222 assert(p);
223
224 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
225
226 STRV_FOREACH(i, strv) {
227 bool ignore = false, needs_prefix = false;
228 const char *e = *i;
229
230 /* Look for any prefixes */
231 if (startswith(e, "-")) {
232 e++;
233 ignore = true;
234 }
235 if (startswith(e, "+")) {
236 e++;
237 needs_prefix = true;
238 }
239
240 if (!path_is_absolute(e))
241 return -EINVAL;
242
243 *((*p)++) = (MountEntry) {
244 .path_const = e,
245 .mode = mode,
246 .ignore = ignore,
247 .has_prefix = !needs_prefix && !forcibly_require_prefix,
248 };
249 }
250
251 return 0;
252 }
253
254 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
255 char **i;
256
257 assert(p);
258
259 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
260 * "/private/" boundary directories for DynamicUser=1. */
261
262 STRV_FOREACH(i, strv) {
263
264 *((*p)++) = (MountEntry) {
265 .path_const = *i,
266 .mode = EMPTY_DIR,
267 .ignore = false,
268 .has_prefix = false,
269 .read_only = true,
270 .options_const = "mode=755",
271 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
272 };
273 }
274
275 return 0;
276 }
277
278 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
279 unsigned i;
280
281 assert(p);
282
283 for (i = 0; i < n; i++) {
284 const BindMount *b = binds + i;
285
286 *((*p)++) = (MountEntry) {
287 .path_const = b->destination,
288 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
289 .read_only = b->read_only,
290 .source_const = b->source,
291 .ignore = b->ignore_enoent,
292 };
293 }
294
295 return 0;
296 }
297
298 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
299 unsigned i;
300 int r;
301
302 assert(p);
303
304 for (i = 0; i < n; i++) {
305 const TemporaryFileSystem *t = tmpfs + i;
306 _cleanup_free_ char *o = NULL, *str = NULL;
307 unsigned long flags = MS_NODEV|MS_STRICTATIME;
308 bool ro = false;
309
310 if (!path_is_absolute(t->path))
311 return -EINVAL;
312
313 if (!isempty(t->options)) {
314 str = strjoin("mode=0755,", t->options);
315 if (!str)
316 return -ENOMEM;
317
318 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
319 if (r < 0)
320 return r;
321
322 ro = !!(flags & MS_RDONLY);
323 if (ro)
324 flags ^= MS_RDONLY;
325 }
326
327 *((*p)++) = (MountEntry) {
328 .path_const = t->path,
329 .mode = TMPFS,
330 .read_only = ro,
331 .options_malloc = o,
332 .flags = flags,
333 };
334
335 o = NULL;
336 }
337
338 return 0;
339 }
340
341 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
342 unsigned i;
343
344 assert(p);
345 assert(mounts);
346
347 /* Adds a list of static pre-defined entries */
348
349 for (i = 0; i < n; i++)
350 *((*p)++) = (MountEntry) {
351 .path_const = mount_entry_path(mounts+i),
352 .mode = mounts[i].mode,
353 .ignore = mounts[i].ignore || ignore_protect,
354 };
355
356 return 0;
357 }
358
359 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
360 assert(p);
361
362 switch (protect_home) {
363
364 case PROTECT_HOME_NO:
365 return 0;
366
367 case PROTECT_HOME_READ_ONLY:
368 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
369
370 case PROTECT_HOME_TMPFS:
371 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
372
373 case PROTECT_HOME_YES:
374 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
375
376 default:
377 assert_not_reached("Unexpected ProtectHome= value");
378 }
379 }
380
381 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
382 assert(p);
383
384 switch (protect_system) {
385
386 case PROTECT_SYSTEM_NO:
387 return 0;
388
389 case PROTECT_SYSTEM_STRICT:
390 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
391
392 case PROTECT_SYSTEM_YES:
393 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
394
395 case PROTECT_SYSTEM_FULL:
396 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
397
398 default:
399 assert_not_reached("Unexpected ProtectSystem= value");
400 }
401 }
402
403 static int mount_path_compare(const void *a, const void *b) {
404 const MountEntry *p = a, *q = b;
405 int d;
406
407 /* If the paths are not equal, then order prefixes first */
408 d = path_compare(mount_entry_path(p), mount_entry_path(q));
409 if (d != 0)
410 return d;
411
412 /* If the paths are equal, check the mode */
413 if (p->mode < q->mode)
414 return -1;
415 if (p->mode > q->mode)
416 return 1;
417
418 return 0;
419 }
420
421 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
422 unsigned i;
423
424 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
425 * that. */
426
427 if (!root_directory)
428 return 0;
429
430 for (i = 0; i < n; i++) {
431 char *s;
432
433 if (m[i].has_prefix)
434 continue;
435
436 s = prefix_root(root_directory, mount_entry_path(m+i));
437 if (!s)
438 return -ENOMEM;
439
440 free_and_replace(m[i].path_malloc, s);
441 m[i].has_prefix = true;
442 }
443
444 return 0;
445 }
446
447 static void drop_duplicates(MountEntry *m, unsigned *n) {
448 MountEntry *f, *t, *previous;
449
450 assert(m);
451 assert(n);
452
453 /* Drops duplicate entries. Expects that the array is properly ordered already. */
454
455 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
456
457 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
458 * above. Note that we only drop duplicates that haven't been mounted yet. */
459 if (previous &&
460 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
461 !f->applied && !previous->applied) {
462 log_debug("%s is duplicate.", mount_entry_path(f));
463 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
464 mount_entry_done(f);
465 continue;
466 }
467
468 *t = *f;
469 previous = t;
470 t++;
471 }
472
473 *n = t - m;
474 }
475
476 static void drop_inaccessible(MountEntry *m, unsigned *n) {
477 MountEntry *f, *t;
478 const char *clear = NULL;
479
480 assert(m);
481 assert(n);
482
483 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
484 * ordered already. */
485
486 for (f = m, t = m; f < m + *n; f++) {
487
488 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
489 * it, as inaccessible paths really should drop the entire subtree. */
490 if (clear && path_startswith(mount_entry_path(f), clear)) {
491 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
492 mount_entry_done(f);
493 continue;
494 }
495
496 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
497
498 *t = *f;
499 t++;
500 }
501
502 *n = t - m;
503 }
504
505 static void drop_nop(MountEntry *m, unsigned *n) {
506 MountEntry *f, *t;
507
508 assert(m);
509 assert(n);
510
511 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
512 * list is ordered by prefixes. */
513
514 for (f = m, t = m; f < m + *n; f++) {
515
516 /* Only suppress such subtrees for READONLY and READWRITE entries */
517 if (IN_SET(f->mode, READONLY, READWRITE)) {
518 MountEntry *p;
519 bool found = false;
520
521 /* Now let's find the first parent of the entry we are looking at. */
522 for (p = t-1; p >= m; p--) {
523 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
524 found = true;
525 break;
526 }
527 }
528
529 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
530 if (found && p->mode == f->mode) {
531 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
532 mount_entry_done(f);
533 continue;
534 }
535 }
536
537 *t = *f;
538 t++;
539 }
540
541 *n = t - m;
542 }
543
544 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
545 MountEntry *f, *t;
546
547 assert(m);
548 assert(n);
549
550 /* Nothing to do */
551 if (!root_directory)
552 return;
553
554 /* Drops all mounts that are outside of the root directory. */
555
556 for (f = m, t = m; f < m + *n; f++) {
557
558 if (!path_startswith(mount_entry_path(f), root_directory)) {
559 log_debug("%s is outside of root directory.", mount_entry_path(f));
560 mount_entry_done(f);
561 continue;
562 }
563
564 *t = *f;
565 t++;
566 }
567
568 *n = t - m;
569 }
570
571 static int clone_device_node(const char *d, const char *temporary_mount, bool *make_devnode) {
572 const char *dn;
573 struct stat st;
574 int r;
575
576 if (stat(d, &st) < 0) {
577 if (errno == ENOENT)
578 return -ENXIO;
579 return -errno;
580 }
581
582 if (!S_ISBLK(st.st_mode) &&
583 !S_ISCHR(st.st_mode))
584 return -EINVAL;
585
586 if (st.st_rdev == 0)
587 return -ENXIO;
588
589 dn = strjoina(temporary_mount, d);
590
591 if (*make_devnode) {
592 mac_selinux_create_file_prepare(d, st.st_mode);
593 r = mknod(dn, st.st_mode, st.st_rdev);
594 mac_selinux_create_file_clear();
595
596 if (r == 0)
597 return 0;
598 if (errno != EPERM)
599 return log_debug_errno(errno, "mknod failed for %s: %m", d);
600
601 *make_devnode = false;
602 }
603
604 /* We're about to fallback to bind-mounting the device
605 * node. So create a dummy bind-mount target. */
606 mac_selinux_create_file_prepare(d, 0);
607 r = mknod(dn, S_IFREG, 0);
608 mac_selinux_create_file_clear();
609
610 if (r < 0 && errno != EEXIST)
611 return log_debug_errno(errno, "mknod fallback failed for %s: %m", d);
612
613 /* Fallback to bind-mounting:
614 * The assumption here is that all used device nodes carry standard
615 * properties. Specifically, the devices nodes we bind-mount should
616 * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
617 * and should not carry ACLs. */
618 if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
619 return log_debug_errno(errno, "mount failed for %s: %m", d);
620
621 return 0;
622 }
623
624 static int mount_private_dev(MountEntry *m) {
625 static const char devnodes[] =
626 "/dev/null\0"
627 "/dev/zero\0"
628 "/dev/full\0"
629 "/dev/random\0"
630 "/dev/urandom\0"
631 "/dev/tty\0";
632
633 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
634 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
635 bool can_mknod = true;
636 _cleanup_umask_ mode_t u;
637 int r;
638
639 assert(m);
640
641 u = umask(0000);
642
643 if (!mkdtemp(temporary_mount))
644 return -errno;
645
646 dev = strjoina(temporary_mount, "/dev");
647 (void) mkdir(dev, 0755);
648 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
649 r = -errno;
650 goto fail;
651 }
652
653 devpts = strjoina(temporary_mount, "/dev/pts");
654 (void) mkdir(devpts, 0755);
655 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
656 r = -errno;
657 goto fail;
658 }
659
660 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
661 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
662 * thus, in that case make a clone
663 *
664 * in nspawn and other containers it will be a symlink, in that case make it a symlink
665 */
666 r = is_symlink("/dev/ptmx");
667 if (r < 0)
668 goto fail;
669 if (r > 0) {
670 devptmx = strjoina(temporary_mount, "/dev/ptmx");
671 if (symlink("pts/ptmx", devptmx) < 0) {
672 r = -errno;
673 goto fail;
674 }
675 } else {
676 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
677 if (r < 0)
678 goto fail;
679 }
680
681 devshm = strjoina(temporary_mount, "/dev/shm");
682 (void) mkdir(devshm, 0755);
683 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
684 if (r < 0) {
685 r = -errno;
686 goto fail;
687 }
688
689 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
690 (void) mkdir(devmqueue, 0755);
691 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
692
693 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
694 (void) mkdir(devhugepages, 0755);
695 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
696
697 devlog = strjoina(temporary_mount, "/dev/log");
698 (void) symlink("/run/systemd/journal/dev-log", devlog);
699
700 NULSTR_FOREACH(d, devnodes) {
701 r = clone_device_node(d, temporary_mount, &can_mknod);
702 /* ENXIO means the the *source* is not a device file, skip creation in that case */
703 if (r < 0 && r != -ENXIO)
704 goto fail;
705 }
706
707 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
708
709 /* Create the /dev directory if missing. It is more likely to be
710 * missing when the service is started with RootDirectory. This is
711 * consistent with mount units creating the mount points when missing.
712 */
713 (void) mkdir_p_label(mount_entry_path(m), 0755);
714
715 /* Unmount everything in old /dev */
716 umount_recursive(mount_entry_path(m), 0);
717 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
718 r = -errno;
719 goto fail;
720 }
721
722 rmdir(dev);
723 rmdir(temporary_mount);
724
725 return 0;
726
727 fail:
728 if (devpts)
729 umount(devpts);
730
731 if (devshm)
732 umount(devshm);
733
734 if (devhugepages)
735 umount(devhugepages);
736
737 if (devmqueue)
738 umount(devmqueue);
739
740 umount(dev);
741 rmdir(dev);
742 rmdir(temporary_mount);
743
744 return r;
745 }
746
747 static int mount_bind_dev(const MountEntry *m) {
748 int r;
749
750 assert(m);
751
752 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
753 * /dev. This is only used when RootDirectory= is set. */
754
755 (void) mkdir_p_label(mount_entry_path(m), 0755);
756
757 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
758 if (r < 0)
759 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
760 if (r > 0) /* make this a NOP if /dev is already a mount point */
761 return 0;
762
763 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
764 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
765
766 return 1;
767 }
768
769 static int mount_sysfs(const MountEntry *m) {
770 int r;
771
772 assert(m);
773
774 (void) mkdir_p_label(mount_entry_path(m), 0755);
775
776 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
777 if (r < 0)
778 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
779 if (r > 0) /* make this a NOP if /sys is already a mount point */
780 return 0;
781
782 /* Bind mount the host's version so that we get all child mounts of it, too. */
783 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
784 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
785
786 return 1;
787 }
788
789 static int mount_procfs(const MountEntry *m) {
790 int r;
791
792 assert(m);
793
794 (void) mkdir_p_label(mount_entry_path(m), 0755);
795
796 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
797 if (r < 0)
798 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
799 if (r > 0) /* make this a NOP if /proc is already a mount point */
800 return 0;
801
802 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
803 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
804 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
805
806 return 1;
807 }
808
809 static int mount_tmpfs(const MountEntry *m) {
810 assert(m);
811
812 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
813
814 (void) mkdir_p_label(mount_entry_path(m), 0755);
815 (void) umount_recursive(mount_entry_path(m), 0);
816
817 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
818 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
819
820 return 1;
821 }
822
823 static int follow_symlink(
824 const char *root_directory,
825 MountEntry *m) {
826
827 _cleanup_free_ char *target = NULL;
828 int r;
829
830 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
831 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
832 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
833 * end and already have a fully normalized name. */
834
835 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target);
836 if (r < 0)
837 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
838 if (r > 0) /* Reached the end, nothing more to resolve */
839 return 1;
840
841 if (m->n_followed >= CHASE_SYMLINKS_MAX) { /* put a boundary on things */
842 log_debug("Symlink loop on '%s'.", mount_entry_path(m));
843 return -ELOOP;
844 }
845
846 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
847
848 free_and_replace(m->path_malloc, target);
849 m->has_prefix = true;
850
851 m->n_followed ++;
852
853 return 0;
854 }
855
856 static int apply_mount(
857 const char *root_directory,
858 MountEntry *m) {
859
860 bool rbind = true, make = false;
861 const char *what;
862 int r;
863
864 assert(m);
865
866 log_debug("Applying namespace mount on %s", mount_entry_path(m));
867
868 switch (m->mode) {
869
870 case INACCESSIBLE: {
871 struct stat target;
872
873 /* First, get rid of everything that is below if there
874 * is anything... Then, overmount it with an
875 * inaccessible path. */
876 (void) umount_recursive(mount_entry_path(m), 0);
877
878 if (lstat(mount_entry_path(m), &target) < 0) {
879 if (errno == ENOENT && m->ignore)
880 return 0;
881
882 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
883 }
884
885 what = mode_to_inaccessible_node(target.st_mode);
886 if (!what) {
887 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
888 return -ELOOP;
889 }
890 break;
891 }
892
893 case READONLY:
894 case READWRITE:
895 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
896 if (r == -ENOENT && m->ignore)
897 return 0;
898 if (r < 0)
899 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
900 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
901 return 0;
902 /* This isn't a mount point yet, let's make it one. */
903 what = mount_entry_path(m);
904 break;
905
906 case BIND_MOUNT:
907 rbind = false;
908
909 _fallthrough_;
910 case BIND_MOUNT_RECURSIVE: {
911 _cleanup_free_ char *chased = NULL;
912
913 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
914 * mount source paths are always relative to the host root, hence we pass NULL as root directory to
915 * chase_symlinks() here. */
916
917 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
918 if (r == -ENOENT && m->ignore) {
919 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
920 return 0;
921 }
922 if (r < 0)
923 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
924
925 log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
926
927 free_and_replace(m->source_malloc, chased);
928
929 what = mount_entry_source(m);
930 make = true;
931 break;
932 }
933
934 case EMPTY_DIR:
935 case TMPFS:
936 return mount_tmpfs(m);
937
938 case PRIVATE_TMP:
939 what = mount_entry_source(m);
940 make = true;
941 break;
942
943 case PRIVATE_DEV:
944 return mount_private_dev(m);
945
946 case BIND_DEV:
947 return mount_bind_dev(m);
948
949 case SYSFS:
950 return mount_sysfs(m);
951
952 case PROCFS:
953 return mount_procfs(m);
954
955 default:
956 assert_not_reached("Unknown mode");
957 }
958
959 assert(what);
960
961 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
962 bool try_again = false;
963 r = -errno;
964
965 if (r == -ENOENT && make) {
966 struct stat st;
967
968 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
969
970 if (stat(what, &st) < 0)
971 log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
972 else {
973 int q;
974
975 (void) mkdir_parents(mount_entry_path(m), 0755);
976
977 if (S_ISDIR(st.st_mode))
978 q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
979 else
980 q = touch(mount_entry_path(m));
981
982 if (q < 0)
983 log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
984 else
985 try_again = true;
986 }
987 }
988
989 if (try_again) {
990 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
991 r = -errno;
992 else
993 r = 0;
994 }
995
996 if (r < 0)
997 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
998 }
999
1000 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1001 return 0;
1002 }
1003
1004 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
1005 int r = 0;
1006
1007 assert(m);
1008 assert(proc_self_mountinfo);
1009
1010 if (mount_entry_read_only(m)) {
1011 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
1012 /* Make superblock readonly */
1013 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
1014 r = -errno;
1015 } else
1016 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
1017 } else if (m->mode == PRIVATE_DEV) {
1018 /* Superblock can be readonly but the submounts can't */
1019 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
1020 r = -errno;
1021 } else
1022 return 0;
1023
1024 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
1025 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
1026 * read-only mounts already applied. */
1027
1028 if (r == -ENOENT && m->ignore)
1029 r = 0;
1030
1031 return r;
1032 }
1033
1034 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
1035 assert(ns_info);
1036
1037 /*
1038 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1039 * since to protect the API VFS mounts, they need to be around in the
1040 * first place... and RootDirectory= or RootImage= need to be set.
1041 */
1042
1043 /* root_directory should point to a mount point */
1044 return root_directory &&
1045 (ns_info->mount_apivfs ||
1046 ns_info->protect_control_groups ||
1047 ns_info->protect_kernel_tunables);
1048 }
1049
1050 static unsigned namespace_calculate_mounts(
1051 const char* root_directory,
1052 const NamespaceInfo *ns_info,
1053 char** read_write_paths,
1054 char** read_only_paths,
1055 char** inaccessible_paths,
1056 char** empty_directories,
1057 unsigned n_bind_mounts,
1058 unsigned n_temporary_filesystems,
1059 const char* tmp_dir,
1060 const char* var_tmp_dir,
1061 ProtectHome protect_home,
1062 ProtectSystem protect_system) {
1063
1064 unsigned protect_home_cnt;
1065 unsigned protect_system_cnt =
1066 (protect_system == PROTECT_SYSTEM_STRICT ?
1067 ELEMENTSOF(protect_system_strict_table) :
1068 ((protect_system == PROTECT_SYSTEM_FULL) ?
1069 ELEMENTSOF(protect_system_full_table) :
1070 ((protect_system == PROTECT_SYSTEM_YES) ?
1071 ELEMENTSOF(protect_system_yes_table) : 0)));
1072
1073 protect_home_cnt =
1074 (protect_home == PROTECT_HOME_YES ?
1075 ELEMENTSOF(protect_home_yes_table) :
1076 ((protect_home == PROTECT_HOME_READ_ONLY) ?
1077 ELEMENTSOF(protect_home_read_only_table) :
1078 ((protect_home == PROTECT_HOME_TMPFS) ?
1079 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1080
1081 return !!tmp_dir + !!var_tmp_dir +
1082 strv_length(read_write_paths) +
1083 strv_length(read_only_paths) +
1084 strv_length(inaccessible_paths) +
1085 strv_length(empty_directories) +
1086 n_bind_mounts +
1087 n_temporary_filesystems +
1088 ns_info->private_dev +
1089 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1090 (ns_info->protect_control_groups ? 1 : 0) +
1091 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1092 protect_home_cnt + protect_system_cnt +
1093 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1094 }
1095
1096 static void normalize_mounts(const char *root_directory, MountEntry *mounts, unsigned *n_mounts) {
1097 assert(n_mounts);
1098 assert(mounts || *n_mounts == 0);
1099
1100 qsort_safe(mounts, *n_mounts, sizeof(MountEntry), mount_path_compare);
1101
1102 drop_duplicates(mounts, n_mounts);
1103 drop_outside_root(root_directory, mounts, n_mounts);
1104 drop_inaccessible(mounts, n_mounts);
1105 drop_nop(mounts, n_mounts);
1106 }
1107
/* Sets up a private mount namespace for the calling process and applies the configured sandboxing mounts.
 *
 * Outline of what the code below does, in order:
 *   1. If root_image is set: attach it to a loop device, load its root hash, dissect and decrypt it.
 *   2. Pick the effective root: root_directory if given, otherwise "/run/systemd/unit-root" when an image,
 *      bind mounts or temporary file systems are configured, otherwise none (NULL).
 *   3. Build the MountEntry list on the stack from all parameters, prefix it with the root and normalize it.
 *   4. unshare(CLONE_NEWNS), optionally remount "/" as MS_SLAVE, then populate the root (image mount, bind
 *      mount of root_directory onto itself, or bind mount of "/" onto the effective root).
 *   5. Apply all mounts (first round), then flip read-only bits (second round).
 *   6. Move the root into place if one is used, and finally remount "/" with mount_flags | MS_REC.
 *
 * mount_flags == 0 is treated as MS_SHARED. Returns 0 on success, a negative errno-style value on failure. */
1108 int setup_namespace(
1109 const char* root_directory,
1110 const char* root_image,
1111 const NamespaceInfo *ns_info,
1112 char** read_write_paths,
1113 char** read_only_paths,
1114 char** inaccessible_paths,
1115 char** empty_directories,
1116 const BindMount *bind_mounts,
1117 unsigned n_bind_mounts,
1118 const TemporaryFileSystem *temporary_filesystems,
1119 unsigned n_temporary_filesystems,
1120 const char* tmp_dir,
1121 const char* var_tmp_dir,
1122 ProtectHome protect_home,
1123 ProtectSystem protect_system,
1124 unsigned long mount_flags,
1125 DissectImageFlags dissect_image_flags) {
1126
1127 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1128 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1129 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1130 _cleanup_free_ void *root_hash = NULL;
1131 MountEntry *m, *mounts = NULL;
1132 size_t root_hash_size = 0;
1133 bool make_slave = false;
1134 const char *root;
1135 unsigned n_mounts;
1136 bool require_prefix = false;
1137 int r = 0;
1138
1139 assert(ns_info);
1140
1141 if (mount_flags == 0)
1142 mount_flags = MS_SHARED;
1143
1144 if (root_image) {
1145 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1146
/* The image is only opened read-only if nothing in the configuration asks for a writable path. */
1147 if (protect_system == PROTECT_SYSTEM_STRICT &&
1148 protect_home != PROTECT_HOME_NO &&
1149 strv_isempty(read_write_paths))
1150 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1151
/* Failures in this block use a plain return rather than "goto finish": n_mounts has not been assigned
 * yet, so the loop at the finish label must not run. */
1152 r = loop_device_make_by_path(root_image,
1153 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1154 &loop_device);
1155 if (r < 0)
1156 return r;
1157
1158 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1159 if (r < 0)
1160 return r;
1161
1162 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1163 if (r < 0)
1164 return r;
1165
1166 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1167 if (r < 0)
1168 return r;
1169 }
1170
1171 if (root_directory)
1172 root = root_directory;
1173 else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
1174
1175 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1176 * the same mount point for all images, which is safe, since they all live in their own namespaces
1177 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1178 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1179 * while we are applying them. */
1180
1181 root = "/run/systemd/unit-root";
1182 (void) mkdir_label(root, 0700);
1183 require_prefix = true;
1184 } else
1185 root = NULL;
1186
1187 n_mounts = namespace_calculate_mounts(
1188 root,
1189 ns_info,
1190 read_write_paths,
1191 read_only_paths,
1192 inaccessible_paths,
1193 empty_directories,
1194 n_bind_mounts,
1195 n_temporary_filesystems,
1196 tmp_dir, var_tmp_dir,
1197 protect_home, protect_system);
1198
1199 /* Set mount slave mode */
1200 if (root || n_mounts > 0)
1201 make_slave = true;
1202
1203 if (n_mounts > 0) {
/* The array lives on the stack and starts out zeroed. "m" is the write cursor: the append_*() helpers
 * receive it by reference, and the literal assignments below advance it by one entry each. */
1204 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1205 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1206 if (r < 0)
1207 goto finish;
1208
1209 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1210 if (r < 0)
1211 goto finish;
1212
1213 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1214 if (r < 0)
1215 goto finish;
1216
1217 r = append_empty_dir_mounts(&m, empty_directories);
1218 if (r < 0)
1219 goto finish;
1220
1221 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1222 if (r < 0)
1223 goto finish;
1224
1225 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1226 if (r < 0)
1227 goto finish;
1228
1229 if (tmp_dir) {
1230 *(m++) = (MountEntry) {
1231 .path_const = "/tmp",
1232 .mode = PRIVATE_TMP,
1233 .source_const = tmp_dir,
1234 };
1235 }
1236
1237 if (var_tmp_dir) {
1238 *(m++) = (MountEntry) {
1239 .path_const = "/var/tmp",
1240 .mode = PRIVATE_TMP,
1241 .source_const = var_tmp_dir,
1242 };
1243 }
1244
1245 if (ns_info->private_dev) {
1246 *(m++) = (MountEntry) {
1247 .path_const = "/dev",
1248 .mode = PRIVATE_DEV,
1249 };
1250 }
1251
1252 if (ns_info->protect_kernel_tunables) {
1253 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1254 if (r < 0)
1255 goto finish;
1256 }
1257
1258 if (ns_info->protect_kernel_modules) {
1259 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1260 if (r < 0)
1261 goto finish;
1262 }
1263
1264 if (ns_info->protect_control_groups) {
1265 *(m++) = (MountEntry) {
1266 .path_const = "/sys/fs/cgroup",
1267 .mode = READONLY,
1268 };
1269 }
1270
1271 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1272 if (r < 0)
1273 goto finish;
1274
1275 r = append_protect_system(&m, protect_system, false);
1276 if (r < 0)
1277 goto finish;
1278
1279 if (namespace_info_mount_apivfs(root, ns_info)) {
1280 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1281 if (r < 0)
1282 goto finish;
1283 }
1284
/* The number of entries appended above has to match what namespace_calculate_mounts() predicted. */
1285 assert(mounts + n_mounts == m);
1286
1287 /* Prepend the root directory where that's necessary */
1288 r = prefix_where_needed(mounts, n_mounts, root);
1289 if (r < 0)
1290 goto finish;
1291
/* Note that this receives root_directory (the caller's argument), not the effective "root" computed
 * above. The call may lower n_mounts. */
1292 normalize_mounts(root_directory, mounts, &n_mounts);
1293 }
1294
1295 if (unshare(CLONE_NEWNS) < 0) {
1296 r = -errno;
1297 goto finish;
1298 }
1299
1300 if (make_slave) {
1301 /* Remount / as SLAVE so that nothing now mounted in the namespace
1302 shows up in the parent */
1303 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1304 r = -errno;
1305 goto finish;
1306 }
1307 }
1308
1309 if (root_image) {
1310 /* A root image is specified, mount it to the right place */
1311 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1312 if (r < 0)
1313 goto finish;
1314
1315 if (decrypted_image) {
1316 r = decrypted_image_relinquish(decrypted_image);
1317 if (r < 0)
1318 goto finish;
1319 }
1320
1321 loop_device_relinquish(loop_device);
1322
1323 } else if (root_directory) {
1324
1325 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1326 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1327 if (r < 0)
1328 goto finish;
1329 if (r == 0) {
1330 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1331 r = -errno;
1332 goto finish;
1333 }
1334 }
1335
1336 } else if (root) {
1337
1338 /* Let's mount the main root directory to the root directory to use */
1339 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1340 r = -errno;
1341 goto finish;
1342 }
1343 }
1344
1345 /* Try to set up the new root directory before mounting anything else there. */
1346 if (root_image || root_directory)
1347 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1348
1349 if (n_mounts > 0) {
1350 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1351 char **blacklist;
1352 unsigned j;
1353
1354 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1355 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1356 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1357 if (!proc_self_mountinfo) {
1358 r = -errno;
1359 goto finish;
1360 }
1361
1362 /* First round, establish all mounts we need */
1363 for (;;) {
1364 bool again = false;
1365
1366 for (m = mounts; m < mounts + n_mounts; ++m) {
1367
/* Entries handled in an earlier pass of the outer loop are skipped. */
1368 if (m->applied)
1369 continue;
1370
1371 r = follow_symlink(root, m);
1372 if (r < 0)
1373 goto finish;
1374 if (r == 0) {
1375 /* We hit a symlinked mount point. The entry got rewritten and might point to a
1376 * very different place now. Let's normalize the changed list, and start from
1377 * the beginning. After all to mount the entry at the new location we might
1378 * need some other mounts first */
1379 again = true;
1380 break;
1381 }
1382
1383 r = apply_mount(root, m);
1384 if (r < 0)
1385 goto finish;
1386
1387 m->applied = true;
1388 }
1389
1390 if (!again)
1391 break;
1392
1393 normalize_mounts(root_directory, mounts, &n_mounts);
1394 }
1395
1396 /* Create a blacklist we can pass to bind_mount_recursive() */
1397 blacklist = newa(char*, n_mounts+1);
1398 for (j = 0; j < n_mounts; j++)
1399 blacklist[j] = (char*) mount_entry_path(mounts+j);
1400 blacklist[j] = NULL;
1401
1402 /* Second round, flip the ro bits if necessary. */
1403 for (m = mounts; m < mounts + n_mounts; ++m) {
1404 r = make_read_only(m, blacklist, proc_self_mountinfo);
1405 if (r < 0)
1406 goto finish;
1407 }
1408 }
1409
1410 if (root) {
1411 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1412 r = mount_move_root(root);
1413 if (r < 0)
1414 goto finish;
1415 }
1416
1417 /* Remount / as the desired mode. Note that this will not
1418 * reestablish propagation from our side to the host, since
1419 * what's disconnected is disconnected. */
1420 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1421 r = -errno;
1422 goto finish;
1423 }
1424
1425 r = 0;
1426
/* Common exit for success and for every failure after n_mounts was computed: each remaining entry is
 * passed to mount_entry_done(). The array itself was obtained with alloca0() and is not freed here. */
1427 finish:
1428 for (m = mounts; m < mounts + n_mounts; m++)
1429 mount_entry_done(m);
1430
1431 return r;
1432 }
1433
1434 void bind_mount_free_many(BindMount *b, unsigned n) {
1435 unsigned i;
1436
1437 assert(b || n == 0);
1438
1439 for (i = 0; i < n; i++) {
1440 free(b[i].source);
1441 free(b[i].destination);
1442 }
1443
1444 free(b);
1445 }
1446
1447 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1448 _cleanup_free_ char *s = NULL, *d = NULL;
1449 BindMount *c;
1450
1451 assert(b);
1452 assert(n);
1453 assert(item);
1454
1455 s = strdup(item->source);
1456 if (!s)
1457 return -ENOMEM;
1458
1459 d = strdup(item->destination);
1460 if (!d)
1461 return -ENOMEM;
1462
1463 c = reallocarray(*b, *n + 1, sizeof(BindMount));
1464 if (!c)
1465 return -ENOMEM;
1466
1467 *b = c;
1468
1469 c[(*n) ++] = (BindMount) {
1470 .source = TAKE_PTR(s),
1471 .destination = TAKE_PTR(d),
1472 .read_only = item->read_only,
1473 .recursive = item->recursive,
1474 .ignore_enoent = item->ignore_enoent,
1475 };
1476
1477 return 0;
1478 }
1479
1480 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1481 unsigned i;
1482
1483 assert(t || n == 0);
1484
1485 for (i = 0; i < n; i++) {
1486 free(t[i].path);
1487 free(t[i].options);
1488 }
1489
1490 free(t);
1491 }
1492
1493 int temporary_filesystem_add(
1494 TemporaryFileSystem **t,
1495 unsigned *n,
1496 const char *path,
1497 const char *options) {
1498
1499 _cleanup_free_ char *p = NULL, *o = NULL;
1500 TemporaryFileSystem *c;
1501
1502 assert(t);
1503 assert(n);
1504 assert(path);
1505
1506 p = strdup(path);
1507 if (!p)
1508 return -ENOMEM;
1509
1510 if (!isempty(options)) {
1511 o = strdup(options);
1512 if (!o)
1513 return -ENOMEM;
1514 }
1515
1516 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1517 if (!c)
1518 return -ENOMEM;
1519
1520 *t = c;
1521
1522 c[(*n) ++] = (TemporaryFileSystem) {
1523 .path = TAKE_PTR(p),
1524 .options = TAKE_PTR(o),
1525 };
1526
1527 return 0;
1528 }
1529
1530 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1531 _cleanup_free_ char *x = NULL;
1532 char bid[SD_ID128_STRING_MAX];
1533 sd_id128_t boot_id;
1534 int r;
1535
1536 assert(id);
1537 assert(prefix);
1538 assert(path);
1539
1540 /* We include the boot id in the directory so that after a
1541 * reboot we can easily identify obsolete directories. */
1542
1543 r = sd_id128_get_boot(&boot_id);
1544 if (r < 0)
1545 return r;
1546
1547 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1548 if (!x)
1549 return -ENOMEM;
1550
1551 RUN_WITH_UMASK(0077)
1552 if (!mkdtemp(x))
1553 return -errno;
1554
1555 RUN_WITH_UMASK(0000) {
1556 char *y;
1557
1558 y = strjoina(x, "/tmp");
1559
1560 if (mkdir(y, 0777 | S_ISVTX) < 0)
1561 return -errno;
1562 }
1563
1564 *path = TAKE_PTR(x);
1565
1566 return 0;
1567 }
1568
/* Creates the pair of private temporary directories for the unit identified by id, one below /tmp and one
 * below /var/tmp. On success 0 is returned and both output parameters are set to newly allocated paths. If
 * the second directory cannot be created the first one is removed again, and neither output parameter is
 * touched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                /* Both directories exist, pass them to the caller */
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: the inner "tmp" first, then the directory itself */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1598
1599 int setup_netns(int netns_storage_socket[2]) {
1600 _cleanup_close_ int netns = -1;
1601 int r, q;
1602
1603 assert(netns_storage_socket);
1604 assert(netns_storage_socket[0] >= 0);
1605 assert(netns_storage_socket[1] >= 0);
1606
1607 /* We use the passed socketpair as a storage buffer for our
1608 * namespace reference fd. Whatever process runs this first
1609 * shall create a new namespace, all others should just join
1610 * it. To serialize that we use a file lock on the socket
1611 * pair.
1612 *
1613 * It's a bit crazy, but hey, works great! */
1614
1615 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1616 return -errno;
1617
1618 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1619 if (netns == -EAGAIN) {
1620 /* Nothing stored yet, so let's create a new namespace */
1621
1622 if (unshare(CLONE_NEWNET) < 0) {
1623 r = -errno;
1624 goto fail;
1625 }
1626
1627 loopback_setup();
1628
1629 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1630 if (netns < 0) {
1631 r = -errno;
1632 goto fail;
1633 }
1634
1635 r = 1;
1636
1637 } else if (netns < 0) {
1638 r = netns;
1639 goto fail;
1640
1641 } else {
1642 /* Yay, found something, so let's join the namespace */
1643 if (setns(netns, CLONE_NEWNET) < 0) {
1644 r = -errno;
1645 goto fail;
1646 }
1647
1648 r = 0;
1649 }
1650
1651 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1652 if (q < 0) {
1653 r = q;
1654 goto fail;
1655 }
1656
1657 fail:
1658 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1659 return r;
1660 }
1661
1662 bool ns_type_supported(NamespaceType type) {
1663 const char *t, *ns_proc;
1664
1665 t = namespace_type_to_string(type);
1666 if (!t) /* Don't know how to translate this? Then it's not supported */
1667 return false;
1668
1669 ns_proc = strjoina("/proc/self/ns/", t);
1670 return access(ns_proc, F_OK) == 0;
1671 }
1672
1673 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1674 [PROTECT_HOME_NO] = "no",
1675 [PROTECT_HOME_YES] = "yes",
1676 [PROTECT_HOME_READ_ONLY] = "read-only",
1677 [PROTECT_HOME_TMPFS] = "tmpfs",
1678 };
1679
1680 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1681
1682 ProtectHome parse_protect_home_or_bool(const char *s) {
1683 int r;
1684
1685 r = parse_boolean(s);
1686 if (r > 0)
1687 return PROTECT_HOME_YES;
1688 if (r == 0)
1689 return PROTECT_HOME_NO;
1690
1691 return protect_home_from_string(s);
1692 }
1693
1694 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1695 [PROTECT_SYSTEM_NO] = "no",
1696 [PROTECT_SYSTEM_YES] = "yes",
1697 [PROTECT_SYSTEM_FULL] = "full",
1698 [PROTECT_SYSTEM_STRICT] = "strict",
1699 };
1700
1701 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1702
1703 ProtectSystem parse_protect_system_or_bool(const char *s) {
1704 int r;
1705
1706 r = parse_boolean(s);
1707 if (r > 0)
1708 return PROTECT_SYSTEM_YES;
1709 if (r == 0)
1710 return PROTECT_SYSTEM_NO;
1711
1712 return protect_system_from_string(s);
1713 }
1714
1715 static const char* const namespace_type_table[] = {
1716 [NAMESPACE_MOUNT] = "mnt",
1717 [NAMESPACE_CGROUP] = "cgroup",
1718 [NAMESPACE_UTS] = "uts",
1719 [NAMESPACE_IPC] = "ipc",
1720 [NAMESPACE_USER] = "user",
1721 [NAMESPACE_PID] = "pid",
1722 [NAMESPACE_NET] = "net",
1723 };
1724
1725 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);