]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
core: add new PrivateMounts= unit setting
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6 ***/
7
8 #include <errno.h>
9 #include <sched.h>
10 #include <stdio.h>
11 #include <string.h>
12 #include <sys/mount.h>
13 #include <sys/stat.h>
14 #include <unistd.h>
15 #include <linux/fs.h>
16
17 #include "alloc-util.h"
18 #include "base-filesystem.h"
19 #include "dev-setup.h"
20 #include "fd-util.h"
21 #include "fs-util.h"
22 #include "label.h"
23 #include "loop-util.h"
24 #include "loopback-setup.h"
25 #include "missing.h"
26 #include "mkdir.h"
27 #include "mount-util.h"
28 #include "namespace.h"
29 #include "path-util.h"
30 #include "selinux-util.h"
31 #include "socket-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "umask-util.h"
37 #include "user-util.h"
38 #include "util.h"
39
/* Mount flags used for the private /dev tmpfs set up by mount_private_dev(), and again when that mount is
 * remounted by make_read_only(). */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
41
/* The kind of operation a MountEntry requests; apply_mount() dispatches on this value. */
typedef enum MountMode {
        /* This is ordered by priority! mount_path_compare() sorts entries that share a path by this value, and
         * drop_duplicates() then keeps the first (i.e. numerically lowest) one. Do not reorder. */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
} MountMode;
57
/* One entry of the mount table that is assembled, sorted, pruned and then applied. Strings come in pairs: the
 * *_const member borrows memory owned elsewhere, the *_malloc member is owned by the entry and released by
 * mount_entry_done(). Keep the first three members (path_const, mode, ignore) in this order: positional
 * initializers of this type, where used, rely on it. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        bool applied:1;           /* Already applied */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Use this instead of 'source_const' if we had to allocate memory */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Use this instead of 'options_const' if we had to allocate memory */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
        unsigned n_followed;      /* Number of symlink steps follow_symlink() already resolved for this entry */
} MountEntry;
73
74 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
75 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
76 static const MountEntry apivfs_table[] = {
77 { "/proc", PROCFS, false },
78 { "/dev", BIND_DEV, false },
79 { "/sys", SYSFS, false },
80 };
81
82 /* ProtectKernelTunables= option and the related filesystem APIs */
83 static const MountEntry protect_kernel_tunables_table[] = {
84 { "/proc/acpi", READONLY, true },
85 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
86 { "/proc/asound", READONLY, true },
87 { "/proc/bus", READONLY, true },
88 { "/proc/fs", READONLY, true },
89 { "/proc/irq", READONLY, true },
90 { "/proc/kallsyms", INACCESSIBLE, true },
91 { "/proc/kcore", INACCESSIBLE, true },
92 { "/proc/latency_stats", READONLY, true },
93 { "/proc/mtrr", READONLY, true },
94 { "/proc/scsi", READONLY, true },
95 { "/proc/sys", READONLY, false },
96 { "/proc/sysrq-trigger", READONLY, true },
97 { "/proc/timer_stats", READONLY, true },
98 { "/sys", READONLY, false },
99 { "/sys/fs/bpf", READONLY, true },
100 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
101 { "/sys/fs/selinux", READWRITE, true },
102 { "/sys/kernel/debug", READONLY, true },
103 { "/sys/kernel/tracing", READONLY, true },
104 };
105
106 /* ProtectKernelModules= option */
107 static const MountEntry protect_kernel_modules_table[] = {
108 #if HAVE_SPLIT_USR
109 { "/lib/modules", INACCESSIBLE, true },
110 #endif
111 { "/usr/lib/modules", INACCESSIBLE, true },
112 };
113
114 /*
115 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
116 * system should be protected by ProtectSystem=
117 */
118 static const MountEntry protect_home_read_only_table[] = {
119 { "/home", READONLY, true },
120 { "/run/user", READONLY, true },
121 { "/root", READONLY, true },
122 };
123
124 /* ProtectHome=tmpfs table */
125 static const MountEntry protect_home_tmpfs_table[] = {
126 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
127 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
128 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
129 };
130
131 /* ProtectHome=yes table */
132 static const MountEntry protect_home_yes_table[] = {
133 { "/home", INACCESSIBLE, true },
134 { "/run/user", INACCESSIBLE, true },
135 { "/root", INACCESSIBLE, true },
136 };
137
138 /* ProtectSystem=yes table */
139 static const MountEntry protect_system_yes_table[] = {
140 { "/usr", READONLY, false },
141 { "/boot", READONLY, true },
142 { "/efi", READONLY, true },
143 #if HAVE_SPLIT_USR
144 { "/lib", READONLY, true },
145 { "/lib64", READONLY, true },
146 { "/bin", READONLY, true },
147 # if HAVE_SPLIT_BIN
148 { "/sbin", READONLY, true },
149 # endif
150 #endif
151 };
152
153 /* ProtectSystem=full includes ProtectSystem=yes */
154 static const MountEntry protect_system_full_table[] = {
155 { "/usr", READONLY, false },
156 { "/boot", READONLY, true },
157 { "/efi", READONLY, true },
158 { "/etc", READONLY, false },
159 #if HAVE_SPLIT_USR
160 { "/lib", READONLY, true },
161 { "/lib64", READONLY, true },
162 { "/bin", READONLY, true },
163 # if HAVE_SPLIT_BIN
164 { "/sbin", READONLY, true },
165 # endif
166 #endif
167 };
168
169 /*
170 * ProtectSystem=strict table. In this strict mode, we mount everything
171 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
172 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
173 * protect those, and these options should be fully orthogonal.
174 * (And of course /home and friends are also left writable, as ProtectHome=
175 * shall manage those, orthogonally).
176 */
177 static const MountEntry protect_system_strict_table[] = {
178 { "/", READONLY, false },
179 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
180 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
181 { "/dev", READWRITE, false }, /* PrivateDevices= */
182 { "/home", READWRITE, true }, /* ProtectHome= */
183 { "/run/user", READWRITE, true }, /* ProtectHome= */
184 { "/root", READWRITE, true }, /* ProtectHome= */
185 };
186
187 static const char *mount_entry_path(const MountEntry *p) {
188 assert(p);
189
190 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
191 * otherwise the stack/static ->path field is returned. */
192
193 return p->path_malloc ?: p->path_const;
194 }
195
196 static bool mount_entry_read_only(const MountEntry *p) {
197 assert(p);
198
199 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
200 }
201
202 static const char *mount_entry_source(const MountEntry *p) {
203 assert(p);
204
205 return p->source_malloc ?: p->source_const;
206 }
207
208 static const char *mount_entry_options(const MountEntry *p) {
209 assert(p);
210
211 return p->options_malloc ?: p->options_const;
212 }
213
214 static void mount_entry_done(MountEntry *p) {
215 assert(p);
216
217 p->path_malloc = mfree(p->path_malloc);
218 p->source_malloc = mfree(p->source_malloc);
219 p->options_malloc = mfree(p->options_malloc);
220 }
221
222 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
223 char **i;
224
225 assert(p);
226
227 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
228
229 STRV_FOREACH(i, strv) {
230 bool ignore = false, needs_prefix = false;
231 const char *e = *i;
232
233 /* Look for any prefixes */
234 if (startswith(e, "-")) {
235 e++;
236 ignore = true;
237 }
238 if (startswith(e, "+")) {
239 e++;
240 needs_prefix = true;
241 }
242
243 if (!path_is_absolute(e))
244 return -EINVAL;
245
246 *((*p)++) = (MountEntry) {
247 .path_const = e,
248 .mode = mode,
249 .ignore = ignore,
250 .has_prefix = !needs_prefix && !forcibly_require_prefix,
251 };
252 }
253
254 return 0;
255 }
256
257 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
258 char **i;
259
260 assert(p);
261
262 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
263 * "/private/" boundary directories for DynamicUser=1. */
264
265 STRV_FOREACH(i, strv) {
266
267 *((*p)++) = (MountEntry) {
268 .path_const = *i,
269 .mode = EMPTY_DIR,
270 .ignore = false,
271 .has_prefix = false,
272 .read_only = true,
273 .options_const = "mode=755",
274 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
275 };
276 }
277
278 return 0;
279 }
280
281 static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
282 size_t i;
283
284 assert(p);
285
286 for (i = 0; i < n; i++) {
287 const BindMount *b = binds + i;
288
289 *((*p)++) = (MountEntry) {
290 .path_const = b->destination,
291 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
292 .read_only = b->read_only,
293 .source_const = b->source,
294 .ignore = b->ignore_enoent,
295 };
296 }
297
298 return 0;
299 }
300
301 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
302 size_t i;
303 int r;
304
305 assert(p);
306
307 for (i = 0; i < n; i++) {
308 const TemporaryFileSystem *t = tmpfs + i;
309 _cleanup_free_ char *o = NULL, *str = NULL;
310 unsigned long flags = MS_NODEV|MS_STRICTATIME;
311 bool ro = false;
312
313 if (!path_is_absolute(t->path))
314 return -EINVAL;
315
316 if (!isempty(t->options)) {
317 str = strjoin("mode=0755,", t->options);
318 if (!str)
319 return -ENOMEM;
320
321 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
322 if (r < 0)
323 return r;
324
325 ro = !!(flags & MS_RDONLY);
326 if (ro)
327 flags ^= MS_RDONLY;
328 }
329
330 *((*p)++) = (MountEntry) {
331 .path_const = t->path,
332 .mode = TMPFS,
333 .read_only = ro,
334 .options_malloc = o,
335 .flags = flags,
336 };
337
338 o = NULL;
339 }
340
341 return 0;
342 }
343
344 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
345 size_t i;
346
347 assert(p);
348 assert(mounts);
349
350 /* Adds a list of static pre-defined entries */
351
352 for (i = 0; i < n; i++)
353 *((*p)++) = (MountEntry) {
354 .path_const = mount_entry_path(mounts+i),
355 .mode = mounts[i].mode,
356 .ignore = mounts[i].ignore || ignore_protect,
357 };
358
359 return 0;
360 }
361
362 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
363 assert(p);
364
365 switch (protect_home) {
366
367 case PROTECT_HOME_NO:
368 return 0;
369
370 case PROTECT_HOME_READ_ONLY:
371 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
372
373 case PROTECT_HOME_TMPFS:
374 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
375
376 case PROTECT_HOME_YES:
377 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
378
379 default:
380 assert_not_reached("Unexpected ProtectHome= value");
381 }
382 }
383
384 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
385 assert(p);
386
387 switch (protect_system) {
388
389 case PROTECT_SYSTEM_NO:
390 return 0;
391
392 case PROTECT_SYSTEM_STRICT:
393 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
394
395 case PROTECT_SYSTEM_YES:
396 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
397
398 case PROTECT_SYSTEM_FULL:
399 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
400
401 default:
402 assert_not_reached("Unexpected ProtectSystem= value");
403 }
404 }
405
406 static int mount_path_compare(const void *a, const void *b) {
407 const MountEntry *p = a, *q = b;
408 int d;
409
410 /* If the paths are not equal, then order prefixes first */
411 d = path_compare(mount_entry_path(p), mount_entry_path(q));
412 if (d != 0)
413 return d;
414
415 /* If the paths are equal, check the mode */
416 if (p->mode < q->mode)
417 return -1;
418 if (p->mode > q->mode)
419 return 1;
420
421 return 0;
422 }
423
424 static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
425 size_t i;
426
427 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
428 * that. */
429
430 if (!root_directory)
431 return 0;
432
433 for (i = 0; i < n; i++) {
434 char *s;
435
436 if (m[i].has_prefix)
437 continue;
438
439 s = prefix_root(root_directory, mount_entry_path(m+i));
440 if (!s)
441 return -ENOMEM;
442
443 free_and_replace(m[i].path_malloc, s);
444 m[i].has_prefix = true;
445 }
446
447 return 0;
448 }
449
450 static void drop_duplicates(MountEntry *m, size_t *n) {
451 MountEntry *f, *t, *previous;
452
453 assert(m);
454 assert(n);
455
456 /* Drops duplicate entries. Expects that the array is properly ordered already. */
457
458 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
459
460 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
461 * above. Note that we only drop duplicates that haven't been mounted yet. */
462 if (previous &&
463 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
464 !f->applied && !previous->applied) {
465 log_debug("%s is duplicate.", mount_entry_path(f));
466 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
467 mount_entry_done(f);
468 continue;
469 }
470
471 *t = *f;
472 previous = t;
473 t++;
474 }
475
476 *n = t - m;
477 }
478
479 static void drop_inaccessible(MountEntry *m, size_t *n) {
480 MountEntry *f, *t;
481 const char *clear = NULL;
482
483 assert(m);
484 assert(n);
485
486 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
487 * ordered already. */
488
489 for (f = m, t = m; f < m + *n; f++) {
490
491 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
492 * it, as inaccessible paths really should drop the entire subtree. */
493 if (clear && path_startswith(mount_entry_path(f), clear)) {
494 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
495 mount_entry_done(f);
496 continue;
497 }
498
499 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
500
501 *t = *f;
502 t++;
503 }
504
505 *n = t - m;
506 }
507
508 static void drop_nop(MountEntry *m, size_t *n) {
509 MountEntry *f, *t;
510
511 assert(m);
512 assert(n);
513
514 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
515 * list is ordered by prefixes. */
516
517 for (f = m, t = m; f < m + *n; f++) {
518
519 /* Only suppress such subtrees for READONLY and READWRITE entries */
520 if (IN_SET(f->mode, READONLY, READWRITE)) {
521 MountEntry *p;
522 bool found = false;
523
524 /* Now let's find the first parent of the entry we are looking at. */
525 for (p = t-1; p >= m; p--) {
526 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
527 found = true;
528 break;
529 }
530 }
531
532 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
533 if (found && p->mode == f->mode) {
534 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
535 mount_entry_done(f);
536 continue;
537 }
538 }
539
540 *t = *f;
541 t++;
542 }
543
544 *n = t - m;
545 }
546
547 static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
548 MountEntry *f, *t;
549
550 assert(m);
551 assert(n);
552
553 /* Nothing to do */
554 if (!root_directory)
555 return;
556
557 /* Drops all mounts that are outside of the root directory. */
558
559 for (f = m, t = m; f < m + *n; f++) {
560
561 if (!path_startswith(mount_entry_path(f), root_directory)) {
562 log_debug("%s is outside of root directory.", mount_entry_path(f));
563 mount_entry_done(f);
564 continue;
565 }
566
567 *t = *f;
568 t++;
569 }
570
571 *n = t - m;
572 }
573
/* Makes the device node 'd' available at the same path below 'temporary_mount'. As long as *make_devnode is
 * true this is attempted with mknod(); if that fails with EPERM, *make_devnode is reset to false (so that later
 * calls skip the attempt) and the node is bind mounted onto a freshly created regular file instead.
 *
 * Returns 0 on success, -ENXIO if 'd' does not exist or has a zero device number, -EINVAL if it is neither a
 * block nor a character device, and a negative errno-style error otherwise. */
static int clone_device_node(const char *d, const char *temporary_mount, bool *make_devnode) {
        const char *dn;
        struct stat st;
        int r;

        if (stat(d, &st) < 0) {
                /* A missing source is reported as ENXIO, which mount_private_dev() tolerates for the optional
                 * device nodes */
                if (errno == ENOENT)
                        return -ENXIO;
                return -errno;
        }

        if (!S_ISBLK(st.st_mode) &&
            !S_ISCHR(st.st_mode))
                return -EINVAL;

        if (st.st_rdev == 0)
                return -ENXIO;

        /* The target path is the source path, prefixed with the temporary mount directory */
        dn = strjoina(temporary_mount, d);

        if (*make_devnode) {
                mac_selinux_create_file_prepare(d, st.st_mode);
                r = mknod(dn, st.st_mode, st.st_rdev);
                mac_selinux_create_file_clear();

                if (r == 0)
                        return 0;
                if (errno != EPERM)
                        return log_debug_errno(errno, "mknod failed for %s: %m", d);

                /* Not permitted to create device nodes: remember that, and use the fallback from now on */
                *make_devnode = false;
        }

        /* We're about to fallback to bind-mounting the device
         * node. So create a dummy bind-mount target. */
        mac_selinux_create_file_prepare(d, 0);
        r = mknod(dn, S_IFREG, 0);
        mac_selinux_create_file_clear();

        if (r < 0 && errno != EEXIST)
                return log_debug_errno(errno, "mknod fallback failed for %s: %m", d);

        /* Fallback to bind-mounting:
         * The assumption here is that all used device nodes carry standard
         * properties. Specifically, the devices nodes we bind-mount should
         * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
         * and should not carry ACLs. */
        if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
                return log_debug_errno(errno, "mount failed for %s: %m", d);

        return 0;
}
626
/* Sets up a private /dev: a fresh tmpfs is populated below a temporary directory in /tmp (bind mounts of the
 * host's /dev/pts, /dev/shm, /dev/mqueue and /dev/hugepages, a /dev/ptmx clone or symlink, a /dev/log symlink
 * and clones of a fixed set of device nodes), and is finally moved onto the entry's path after unmounting what
 * was there before.
 *
 * Returns 0 on success, a negative errno-style error otherwise. On failure the mounts established so far are
 * unmounted again and the temporary directories removed, on a best-effort basis. */
static int mount_private_dev(MountEntry *m) {
        /* NUL-separated list of the device nodes to clone */
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        bool can_mknod = true;
        _cleanup_umask_ mode_t u;
        int r;

        assert(m);

        /* Clear the umask while we create things; presumably _cleanup_umask_ restores the previous value
         * when 'u' goes out of scope — confirm in umask-util.h */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return -errno;

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                r = -errno;
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
         * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions making it inaccessible,
         * thus, in that case make a clone.
         *
         * In nspawn and other containers it will be a symlink, in that case make it a symlink.
         */
        r = is_symlink("/dev/ptmx");
        if (r < 0)
                goto fail;
        if (r > 0) {
                devptmx = strjoina(temporary_mount, "/dev/ptmx");
                if (symlink("pts/ptmx", devptmx) < 0) {
                        r = -errno;
                        goto fail;
                }
        } else {
                r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
                if (r < 0)
                        goto fail;
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 0755);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        /* The following mounts and the /dev/log symlink are optional: failures are ignored */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);

        devlog = strjoina(temporary_mount, "/dev/log");
        (void) symlink("/run/systemd/journal/dev-log", devlog);

        NULSTR_FOREACH(d, devnodes) {
                r = clone_device_node(d, temporary_mount, &can_mknod);
                /* ENXIO means the *source* is not a device file, skip creation in that case */
                if (r < 0 && r != -ENXIO)
                        goto fail;
        }

        dev_setup(temporary_mount, UID_INVALID, GID_INVALID);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(mount_entry_path(m), 0755);

        /* Unmount everything in old /dev */
        umount_recursive(mount_entry_path(m), 0);
        if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* The tmpfs was moved away, drop the now empty temporary directories */
        rmdir(dev);
        rmdir(temporary_mount);

        return 0;

fail:
        /* Best-effort teardown: each pointer is only non-NULL once its path was computed, 'dev' always is */
        if (devpts)
                umount(devpts);

        if (devshm)
                umount(devshm);

        if (devhugepages)
                umount(devhugepages);

        if (devmqueue)
                umount(devmqueue);

        umount(dev);
        rmdir(dev);
        rmdir(temporary_mount);

        return r;
}
749
750 static int mount_bind_dev(const MountEntry *m) {
751 int r;
752
753 assert(m);
754
755 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
756 * /dev. This is only used when RootDirectory= is set. */
757
758 (void) mkdir_p_label(mount_entry_path(m), 0755);
759
760 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
761 if (r < 0)
762 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
763 if (r > 0) /* make this a NOP if /dev is already a mount point */
764 return 0;
765
766 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
767 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
768
769 return 1;
770 }
771
772 static int mount_sysfs(const MountEntry *m) {
773 int r;
774
775 assert(m);
776
777 (void) mkdir_p_label(mount_entry_path(m), 0755);
778
779 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
780 if (r < 0)
781 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
782 if (r > 0) /* make this a NOP if /sys is already a mount point */
783 return 0;
784
785 /* Bind mount the host's version so that we get all child mounts of it, too. */
786 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
787 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
788
789 return 1;
790 }
791
792 static int mount_procfs(const MountEntry *m) {
793 int r;
794
795 assert(m);
796
797 (void) mkdir_p_label(mount_entry_path(m), 0755);
798
799 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
800 if (r < 0)
801 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
802 if (r > 0) /* make this a NOP if /proc is already a mount point */
803 return 0;
804
805 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
806 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
807 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
808
809 return 1;
810 }
811
812 static int mount_tmpfs(const MountEntry *m) {
813 assert(m);
814
815 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
816
817 (void) mkdir_p_label(mount_entry_path(m), 0755);
818 (void) umount_recursive(mount_entry_path(m), 0);
819
820 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
821 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
822
823 return 1;
824 }
825
826 static int follow_symlink(
827 const char *root_directory,
828 MountEntry *m) {
829
830 _cleanup_free_ char *target = NULL;
831 int r;
832
833 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
834 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
835 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
836 * end and already have a fully normalized name. */
837
838 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target);
839 if (r < 0)
840 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
841 if (r > 0) /* Reached the end, nothing more to resolve */
842 return 1;
843
844 if (m->n_followed >= CHASE_SYMLINKS_MAX) { /* put a boundary on things */
845 log_debug("Symlink loop on '%s'.", mount_entry_path(m));
846 return -ELOOP;
847 }
848
849 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
850
851 free_and_replace(m->path_malloc, target);
852 m->has_prefix = true;
853
854 m->n_followed ++;
855
856 return 0;
857 }
858
/* Establishes the mount described by a single table entry. Modes that have a dedicated helper (tmpfs, private
 * /dev, bind /dev, sysfs, procfs) are delegated to it and its return value is passed through. For all other
 * modes a source path ('what') is determined first, which is then bind mounted onto the entry's path.
 *
 * Returns 0 if the generic bind mount succeeded or if there was nothing to do, the helper's return value for
 * the delegated modes, and a negative errno-style error on failure. */
static int apply_mount(
                const char *root_directory,
                MountEntry *m) {

        /* rbind: bind recursively? make: may we create a missing destination node and retry? */
        bool rbind = true, make = false;
        const char *what;
        int r;

        assert(m);

        log_debug("Applying namespace mount on %s", mount_entry_path(m));

        switch (m->mode) {

        case INACCESSIBLE: {
                struct stat target;

                /* First, get rid of everything that is below if there
                 * is anything... Then, overmount it with an
                 * inaccessible path. */
                (void) umount_recursive(mount_entry_path(m), 0);

                if (lstat(mount_entry_path(m), &target) < 0) {
                        if (errno == ENOENT && m->ignore)
                                return 0;

                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
                }

                /* Pick the node to mount over the path, depending on the file type of the path */
                what = mode_to_inaccessible_node(target.st_mode);
                if (!what) {
                        log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
                        return -ELOOP;
                }
                break;
        }

        case READONLY:
        case READWRITE:
                r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
                if (r == -ENOENT && m->ignore)
                        return 0;
                if (r < 0)
                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
                        return 0;
                /* This isn't a mount point yet, let's make it one, by bind mounting the path onto itself. */
                what = mount_entry_path(m);
                break;

        case BIND_MOUNT:
                rbind = false;

                _fallthrough_;
        case BIND_MOUNT_RECURSIVE: {
                _cleanup_free_ char *chased = NULL;

                /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
                 * mount source paths are always relative to the host root, hence we pass NULL as root directory to
                 * chase_symlinks() here. */

                r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
                if (r == -ENOENT && m->ignore) {
                        log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
                        return 0;
                }
                if (r < 0)
                        return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));

                log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);

                /* From now on the resolved path is the source of this entry */
                free_and_replace(m->source_malloc, chased);

                what = mount_entry_source(m);
                make = true;
                break;
        }

        case EMPTY_DIR:
        case TMPFS:
                return mount_tmpfs(m);

        case PRIVATE_TMP:
                what = mount_entry_source(m);
                make = true;
                break;

        case PRIVATE_DEV:
                return mount_private_dev(m);

        case BIND_DEV:
                return mount_bind_dev(m);

        case SYSFS:
                return mount_sysfs(m);

        case PROCFS:
                return mount_procfs(m);

        default:
                assert_not_reached("Unknown mode");
        }

        assert(what);

        if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
                bool try_again = false;
                r = -errno;

                if (r == -ENOENT && make) {
                        struct stat st;

                        /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */

                        if (stat(what, &st) < 0)
                                log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
                        else {
                                int q;

                                (void) mkdir_parents(mount_entry_path(m), 0755);

                                /* Create a destination node of the same kind as the source: a directory for
                                 * a directory, a file for everything else */
                                if (S_ISDIR(st.st_mode))
                                        q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
                                else
                                        q = touch(mount_entry_path(m));

                                if (q < 0)
                                        log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
                                else
                                        try_again = true;
                        }
                }

                if (try_again) {
                        if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
                                r = -errno;
                        else
                                r = 0;
                }

                /* If we did not retry, 'r' still carries the error of the first attempt */
                if (r < 0)
                        return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
        }

        log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
        return 0;
}
1006
1007 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
1008 int r = 0;
1009
1010 assert(m);
1011 assert(proc_self_mountinfo);
1012
1013 if (mount_entry_read_only(m)) {
1014 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
1015 /* Make superblock readonly */
1016 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
1017 r = -errno;
1018 } else
1019 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
1020 } else if (m->mode == PRIVATE_DEV) {
1021 /* Superblock can be readonly but the submounts can't */
1022 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
1023 r = -errno;
1024 } else
1025 return 0;
1026
1027 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
1028 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
1029 * read-only mounts already applied. */
1030
1031 if (r == -ENOENT && m->ignore)
1032 r = 0;
1033
1034 return r;
1035 }
1036
1037 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
1038 assert(ns_info);
1039
1040 /*
1041 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1042 * since to protect the API VFS mounts, they need to be around in the
1043 * first place... and RootDirectory= or RootImage= need to be set.
1044 */
1045
1046 /* root_directory should point to a mount point */
1047 return root_directory &&
1048 (ns_info->mount_apivfs ||
1049 ns_info->protect_control_groups ||
1050 ns_info->protect_kernel_tunables);
1051 }
1052
1053 static size_t namespace_calculate_mounts(
1054 const char* root_directory,
1055 const NamespaceInfo *ns_info,
1056 char** read_write_paths,
1057 char** read_only_paths,
1058 char** inaccessible_paths,
1059 char** empty_directories,
1060 size_t n_bind_mounts,
1061 size_t n_temporary_filesystems,
1062 const char* tmp_dir,
1063 const char* var_tmp_dir,
1064 ProtectHome protect_home,
1065 ProtectSystem protect_system) {
1066
1067 size_t protect_home_cnt;
1068 size_t protect_system_cnt =
1069 (protect_system == PROTECT_SYSTEM_STRICT ?
1070 ELEMENTSOF(protect_system_strict_table) :
1071 ((protect_system == PROTECT_SYSTEM_FULL) ?
1072 ELEMENTSOF(protect_system_full_table) :
1073 ((protect_system == PROTECT_SYSTEM_YES) ?
1074 ELEMENTSOF(protect_system_yes_table) : 0)));
1075
1076 protect_home_cnt =
1077 (protect_home == PROTECT_HOME_YES ?
1078 ELEMENTSOF(protect_home_yes_table) :
1079 ((protect_home == PROTECT_HOME_READ_ONLY) ?
1080 ELEMENTSOF(protect_home_read_only_table) :
1081 ((protect_home == PROTECT_HOME_TMPFS) ?
1082 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1083
1084 return !!tmp_dir + !!var_tmp_dir +
1085 strv_length(read_write_paths) +
1086 strv_length(read_only_paths) +
1087 strv_length(inaccessible_paths) +
1088 strv_length(empty_directories) +
1089 n_bind_mounts +
1090 n_temporary_filesystems +
1091 ns_info->private_dev +
1092 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1093 (ns_info->protect_control_groups ? 1 : 0) +
1094 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1095 protect_home_cnt + protect_system_cnt +
1096 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1097 }
1098
1099 static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
1100 assert(n_mounts);
1101 assert(mounts || *n_mounts == 0);
1102
1103 qsort_safe(mounts, *n_mounts, sizeof(MountEntry), mount_path_compare);
1104
1105 drop_duplicates(mounts, n_mounts);
1106 drop_outside_root(root_directory, mounts, n_mounts);
1107 drop_inaccessible(mounts, n_mounts);
1108 drop_nop(mounts, n_mounts);
1109 }
1110
1111 int setup_namespace(
1112 const char* root_directory,
1113 const char* root_image,
1114 const NamespaceInfo *ns_info,
1115 char** read_write_paths,
1116 char** read_only_paths,
1117 char** inaccessible_paths,
1118 char** empty_directories,
1119 const BindMount *bind_mounts,
1120 size_t n_bind_mounts,
1121 const TemporaryFileSystem *temporary_filesystems,
1122 size_t n_temporary_filesystems,
1123 const char* tmp_dir,
1124 const char* var_tmp_dir,
1125 ProtectHome protect_home,
1126 ProtectSystem protect_system,
1127 unsigned long mount_flags,
1128 DissectImageFlags dissect_image_flags) {
1129
1130 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1131 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1132 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1133 _cleanup_free_ void *root_hash = NULL;
1134 MountEntry *m, *mounts = NULL;
1135 size_t root_hash_size = 0;
1136 const char *root;
1137 size_t n_mounts;
1138 bool make_slave;
1139 bool require_prefix = false;
1140 int r = 0;
1141
1142 assert(ns_info);
1143
1144 if (mount_flags == 0)
1145 mount_flags = MS_SHARED;
1146
1147 if (root_image) {
1148 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1149
1150 if (protect_system == PROTECT_SYSTEM_STRICT &&
1151 protect_home != PROTECT_HOME_NO &&
1152 strv_isempty(read_write_paths))
1153 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1154
1155 r = loop_device_make_by_path(root_image,
1156 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1157 &loop_device);
1158 if (r < 0)
1159 return r;
1160
1161 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1162 if (r < 0)
1163 return r;
1164
1165 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1166 if (r < 0)
1167 return r;
1168
1169 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1170 if (r < 0)
1171 return r;
1172 }
1173
1174 if (root_directory)
1175 root = root_directory;
1176 else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
1177
1178 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1179 * the same mount point for all images, which is safe, since they all live in their own namespaces
1180 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1181 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1182 * while we are applying them. */
1183
1184 root = "/run/systemd/unit-root";
1185 (void) mkdir_label(root, 0700);
1186 require_prefix = true;
1187 } else
1188 root = NULL;
1189
1190 n_mounts = namespace_calculate_mounts(
1191 root,
1192 ns_info,
1193 read_write_paths,
1194 read_only_paths,
1195 inaccessible_paths,
1196 empty_directories,
1197 n_bind_mounts,
1198 n_temporary_filesystems,
1199 tmp_dir, var_tmp_dir,
1200 protect_home, protect_system);
1201
1202 /* Set mount slave mode */
1203 make_slave = root || n_mounts > 0 || ns_info->private_mounts;
1204
1205 if (n_mounts > 0) {
1206 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1207 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1208 if (r < 0)
1209 goto finish;
1210
1211 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1212 if (r < 0)
1213 goto finish;
1214
1215 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1216 if (r < 0)
1217 goto finish;
1218
1219 r = append_empty_dir_mounts(&m, empty_directories);
1220 if (r < 0)
1221 goto finish;
1222
1223 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1224 if (r < 0)
1225 goto finish;
1226
1227 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1228 if (r < 0)
1229 goto finish;
1230
1231 if (tmp_dir) {
1232 *(m++) = (MountEntry) {
1233 .path_const = "/tmp",
1234 .mode = PRIVATE_TMP,
1235 .source_const = tmp_dir,
1236 };
1237 }
1238
1239 if (var_tmp_dir) {
1240 *(m++) = (MountEntry) {
1241 .path_const = "/var/tmp",
1242 .mode = PRIVATE_TMP,
1243 .source_const = var_tmp_dir,
1244 };
1245 }
1246
1247 if (ns_info->private_dev) {
1248 *(m++) = (MountEntry) {
1249 .path_const = "/dev",
1250 .mode = PRIVATE_DEV,
1251 };
1252 }
1253
1254 if (ns_info->protect_kernel_tunables) {
1255 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1256 if (r < 0)
1257 goto finish;
1258 }
1259
1260 if (ns_info->protect_kernel_modules) {
1261 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1262 if (r < 0)
1263 goto finish;
1264 }
1265
1266 if (ns_info->protect_control_groups) {
1267 *(m++) = (MountEntry) {
1268 .path_const = "/sys/fs/cgroup",
1269 .mode = READONLY,
1270 };
1271 }
1272
1273 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1274 if (r < 0)
1275 goto finish;
1276
1277 r = append_protect_system(&m, protect_system, false);
1278 if (r < 0)
1279 goto finish;
1280
1281 if (namespace_info_mount_apivfs(root, ns_info)) {
1282 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1283 if (r < 0)
1284 goto finish;
1285 }
1286
1287 assert(mounts + n_mounts == m);
1288
1289 /* Prepend the root directory where that's necessary */
1290 r = prefix_where_needed(mounts, n_mounts, root);
1291 if (r < 0)
1292 goto finish;
1293
1294 normalize_mounts(root_directory, mounts, &n_mounts);
1295 }
1296
1297 if (unshare(CLONE_NEWNS) < 0) {
1298 r = -errno;
1299 goto finish;
1300 }
1301
1302 if (make_slave) {
1303 /* Remount / as SLAVE so that nothing now mounted in the namespace
1304 shows up in the parent */
1305 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1306 r = -errno;
1307 goto finish;
1308 }
1309 }
1310
1311 if (root_image) {
1312 /* A root image is specified, mount it to the right place */
1313 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1314 if (r < 0)
1315 goto finish;
1316
1317 if (decrypted_image) {
1318 r = decrypted_image_relinquish(decrypted_image);
1319 if (r < 0)
1320 goto finish;
1321 }
1322
1323 loop_device_relinquish(loop_device);
1324
1325 } else if (root_directory) {
1326
1327 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1328 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1329 if (r < 0)
1330 goto finish;
1331 if (r == 0) {
1332 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1333 r = -errno;
1334 goto finish;
1335 }
1336 }
1337
1338 } else if (root) {
1339
1340 /* Let's mount the main root directory to the root directory to use */
1341 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1342 r = -errno;
1343 goto finish;
1344 }
1345 }
1346
1347 /* Try to set up the new root directory before mounting anything else there. */
1348 if (root_image || root_directory)
1349 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1350
1351 if (n_mounts > 0) {
1352 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1353 char **blacklist;
1354 size_t j;
1355
1356 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1357 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1358 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1359 if (!proc_self_mountinfo) {
1360 r = -errno;
1361 goto finish;
1362 }
1363
1364 /* First round, establish all mounts we need */
1365 for (;;) {
1366 bool again = false;
1367
1368 for (m = mounts; m < mounts + n_mounts; ++m) {
1369
1370 if (m->applied)
1371 continue;
1372
1373 r = follow_symlink(root, m);
1374 if (r < 0)
1375 goto finish;
1376 if (r == 0) {
1377 /* We hit a symlinked mount point. The entry got rewritten and might point to a
1378 * very different place now. Let's normalize the changed list, and start from
1379 * the beginning. After all to mount the entry at the new location we might
1380 * need some other mounts first */
1381 again = true;
1382 break;
1383 }
1384
1385 r = apply_mount(root, m);
1386 if (r < 0)
1387 goto finish;
1388
1389 m->applied = true;
1390 }
1391
1392 if (!again)
1393 break;
1394
1395 normalize_mounts(root_directory, mounts, &n_mounts);
1396 }
1397
1398 /* Create a blacklist we can pass to bind_mount_recursive() */
1399 blacklist = newa(char*, n_mounts+1);
1400 for (j = 0; j < n_mounts; j++)
1401 blacklist[j] = (char*) mount_entry_path(mounts+j);
1402 blacklist[j] = NULL;
1403
1404 /* Second round, flip the ro bits if necessary. */
1405 for (m = mounts; m < mounts + n_mounts; ++m) {
1406 r = make_read_only(m, blacklist, proc_self_mountinfo);
1407 if (r < 0)
1408 goto finish;
1409 }
1410 }
1411
1412 if (root) {
1413 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1414 r = mount_move_root(root);
1415 if (r < 0)
1416 goto finish;
1417 }
1418
1419 /* Remount / as the desired mode. Note that this will not
1420 * reestablish propagation from our side to the host, since
1421 * what's disconnected is disconnected. */
1422 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1423 r = -errno;
1424 goto finish;
1425 }
1426
1427 r = 0;
1428
1429 finish:
1430 for (m = mounts; m < mounts + n_mounts; m++)
1431 mount_entry_done(m);
1432
1433 return r;
1434 }
1435
1436 void bind_mount_free_many(BindMount *b, size_t n) {
1437 size_t i;
1438
1439 assert(b || n == 0);
1440
1441 for (i = 0; i < n; i++) {
1442 free(b[i].source);
1443 free(b[i].destination);
1444 }
1445
1446 free(b);
1447 }
1448
1449 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
1450 _cleanup_free_ char *s = NULL, *d = NULL;
1451 BindMount *c;
1452
1453 assert(b);
1454 assert(n);
1455 assert(item);
1456
1457 s = strdup(item->source);
1458 if (!s)
1459 return -ENOMEM;
1460
1461 d = strdup(item->destination);
1462 if (!d)
1463 return -ENOMEM;
1464
1465 c = reallocarray(*b, *n + 1, sizeof(BindMount));
1466 if (!c)
1467 return -ENOMEM;
1468
1469 *b = c;
1470
1471 c[(*n) ++] = (BindMount) {
1472 .source = TAKE_PTR(s),
1473 .destination = TAKE_PTR(d),
1474 .read_only = item->read_only,
1475 .recursive = item->recursive,
1476 .ignore_enoent = item->ignore_enoent,
1477 };
1478
1479 return 0;
1480 }
1481
1482 void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
1483 size_t i;
1484
1485 assert(t || n == 0);
1486
1487 for (i = 0; i < n; i++) {
1488 free(t[i].path);
1489 free(t[i].options);
1490 }
1491
1492 free(t);
1493 }
1494
1495 int temporary_filesystem_add(
1496 TemporaryFileSystem **t,
1497 size_t *n,
1498 const char *path,
1499 const char *options) {
1500
1501 _cleanup_free_ char *p = NULL, *o = NULL;
1502 TemporaryFileSystem *c;
1503
1504 assert(t);
1505 assert(n);
1506 assert(path);
1507
1508 p = strdup(path);
1509 if (!p)
1510 return -ENOMEM;
1511
1512 if (!isempty(options)) {
1513 o = strdup(options);
1514 if (!o)
1515 return -ENOMEM;
1516 }
1517
1518 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1519 if (!c)
1520 return -ENOMEM;
1521
1522 *t = c;
1523
1524 c[(*n) ++] = (TemporaryFileSystem) {
1525 .path = TAKE_PTR(p),
1526 .options = TAKE_PTR(o),
1527 };
1528
1529 return 0;
1530 }
1531
1532 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1533 _cleanup_free_ char *x = NULL;
1534 char bid[SD_ID128_STRING_MAX];
1535 sd_id128_t boot_id;
1536 int r;
1537
1538 assert(id);
1539 assert(prefix);
1540 assert(path);
1541
1542 /* We include the boot id in the directory so that after a
1543 * reboot we can easily identify obsolete directories. */
1544
1545 r = sd_id128_get_boot(&boot_id);
1546 if (r < 0)
1547 return r;
1548
1549 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1550 if (!x)
1551 return -ENOMEM;
1552
1553 RUN_WITH_UMASK(0077)
1554 if (!mkdtemp(x))
1555 return -errno;
1556
1557 RUN_WITH_UMASK(0000) {
1558 char *y;
1559
1560 y = strjoina(x, "/tmp");
1561
1562 if (mkdir(y, 0777 | S_ISVTX) < 0)
1563 return -errno;
1564 }
1565
1566 *path = TAKE_PTR(x);
1567
1568 return 0;
1569 }
1570
/* Creates the pair of private temporary directories for the unit identified by id, one below /tmp and one
 * below /var/tmp. On success 0 is returned and both paths are stored in *tmp_dir and *var_tmp_dir as newly
 * allocated strings. If the second directory cannot be set up, the first one is removed again, and the error
 * is returned; the output parameters are not touched on failure. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: the inner "tmp" directory first, then the outer one */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1600
1601 int setup_netns(int netns_storage_socket[2]) {
1602 _cleanup_close_ int netns = -1;
1603 int r, q;
1604
1605 assert(netns_storage_socket);
1606 assert(netns_storage_socket[0] >= 0);
1607 assert(netns_storage_socket[1] >= 0);
1608
1609 /* We use the passed socketpair as a storage buffer for our
1610 * namespace reference fd. Whatever process runs this first
1611 * shall create a new namespace, all others should just join
1612 * it. To serialize that we use a file lock on the socket
1613 * pair.
1614 *
1615 * It's a bit crazy, but hey, works great! */
1616
1617 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1618 return -errno;
1619
1620 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1621 if (netns == -EAGAIN) {
1622 /* Nothing stored yet, so let's create a new namespace */
1623
1624 if (unshare(CLONE_NEWNET) < 0) {
1625 r = -errno;
1626 goto fail;
1627 }
1628
1629 loopback_setup();
1630
1631 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1632 if (netns < 0) {
1633 r = -errno;
1634 goto fail;
1635 }
1636
1637 r = 1;
1638
1639 } else if (netns < 0) {
1640 r = netns;
1641 goto fail;
1642
1643 } else {
1644 /* Yay, found something, so let's join the namespace */
1645 if (setns(netns, CLONE_NEWNET) < 0) {
1646 r = -errno;
1647 goto fail;
1648 }
1649
1650 r = 0;
1651 }
1652
1653 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1654 if (q < 0) {
1655 r = q;
1656 goto fail;
1657 }
1658
1659 fail:
1660 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1661 return r;
1662 }
1663
1664 bool ns_type_supported(NamespaceType type) {
1665 const char *t, *ns_proc;
1666
1667 t = namespace_type_to_string(type);
1668 if (!t) /* Don't know how to translate this? Then it's not supported */
1669 return false;
1670
1671 ns_proc = strjoina("/proc/self/ns/", t);
1672 return access(ns_proc, F_OK) == 0;
1673 }
1674
1675 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1676 [PROTECT_HOME_NO] = "no",
1677 [PROTECT_HOME_YES] = "yes",
1678 [PROTECT_HOME_READ_ONLY] = "read-only",
1679 [PROTECT_HOME_TMPFS] = "tmpfs",
1680 };
1681
1682 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1683
1684 ProtectHome protect_home_or_bool_from_string(const char *s) {
1685 int r;
1686
1687 r = parse_boolean(s);
1688 if (r > 0)
1689 return PROTECT_HOME_YES;
1690 if (r == 0)
1691 return PROTECT_HOME_NO;
1692
1693 return protect_home_from_string(s);
1694 }
1695
1696 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1697 [PROTECT_SYSTEM_NO] = "no",
1698 [PROTECT_SYSTEM_YES] = "yes",
1699 [PROTECT_SYSTEM_FULL] = "full",
1700 [PROTECT_SYSTEM_STRICT] = "strict",
1701 };
1702
1703 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1704
1705 ProtectSystem protect_system_or_bool_from_string(const char *s) {
1706 int r;
1707
1708 r = parse_boolean(s);
1709 if (r > 0)
1710 return PROTECT_SYSTEM_YES;
1711 if (r == 0)
1712 return PROTECT_SYSTEM_NO;
1713
1714 return protect_system_from_string(s);
1715 }
1716
1717 static const char* const namespace_type_table[] = {
1718 [NAMESPACE_MOUNT] = "mnt",
1719 [NAMESPACE_CGROUP] = "cgroup",
1720 [NAMESPACE_UTS] = "uts",
1721 [NAMESPACE_IPC] = "ipc",
1722 [NAMESPACE_USER] = "user",
1723 [NAMESPACE_PID] = "pid",
1724 [NAMESPACE_NET] = "net",
1725 };
1726
1727 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);