]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
tree-wide: use TAKE_PTR() and TAKE_FD() macros
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <sched.h>
23 #include <stdio.h>
24 #include <string.h>
25 #include <sys/mount.h>
26 #include <sys/stat.h>
27 #include <unistd.h>
28 #include <linux/fs.h>
29
30 #include "alloc-util.h"
31 #include "base-filesystem.h"
32 #include "dev-setup.h"
33 #include "fd-util.h"
34 #include "fs-util.h"
35 #include "label.h"
36 #include "loop-util.h"
37 #include "loopback-setup.h"
38 #include "missing.h"
39 #include "mkdir.h"
40 #include "mount-util.h"
41 #include "namespace.h"
42 #include "path-util.h"
43 #include "selinux-util.h"
44 #include "socket-util.h"
45 #include "stat-util.h"
46 #include "string-table.h"
47 #include "string-util.h"
48 #include "strv.h"
49 #include "umask-util.h"
50 #include "user-util.h"
51 #include "util.h"
52
/* Mount flags used for the private tmpfs instance that mount_private_dev() sets up as /dev. The same flags are
 * passed again (together with MS_REMOUNT|MS_RDONLY) when make_read_only() remounts a PRIVATE_DEV entry. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
54
/* The kind of operation a MountEntry requests; apply_mount() dispatches on this value.
 *
 * The numeric order matters: mount_path_compare() uses it as tie-breaker for entries with equal paths, and
 * drop_duplicates() then keeps only the first (i.e. lowest-valued) entry for each path. Do not reorder. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
} MountMode;
70
/* One entry of the mount table that is assembled for a namespace.
 *
 * Several fields come as a "_const"/"_malloc" pair: the "_const" member points to memory owned by someone else,
 * the "_malloc" member is owned by the entry and released by mount_entry_done(). The mount_entry_path(),
 * mount_entry_source() and mount_entry_options() accessors return the "_malloc" member if set and the "_const"
 * member otherwise.
 *
 * The static tables in this file initialize the first three members positionally (path_const, mode, ignore), hence
 * the member order must not be changed. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
} MountEntry;
84
85 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
86 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
87 static const MountEntry apivfs_table[] = {
88 { "/proc", PROCFS, false },
89 { "/dev", BIND_DEV, false },
90 { "/sys", SYSFS, false },
91 };
92
93 /* ProtectKernelTunables= option and the related filesystem APIs */
94 static const MountEntry protect_kernel_tunables_table[] = {
95 { "/proc/sys", READONLY, false },
96 { "/proc/sysrq-trigger", READONLY, true },
97 { "/proc/latency_stats", READONLY, true },
98 { "/proc/mtrr", READONLY, true },
99 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
100 { "/proc/acpi", READONLY, true },
101 { "/proc/timer_stats", READONLY, true },
102 { "/proc/asound", READONLY, true },
103 { "/proc/bus", READONLY, true },
104 { "/proc/fs", READONLY, true },
105 { "/proc/irq", READONLY, true },
106 { "/sys", READONLY, false },
107 { "/sys/kernel/debug", READONLY, true },
108 { "/sys/kernel/tracing", READONLY, true },
109 { "/sys/fs/bpf", READONLY, true },
110 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
111 { "/sys/fs/selinux", READWRITE, true },
112 };
113
114 /* ProtectKernelModules= option */
115 static const MountEntry protect_kernel_modules_table[] = {
116 #if HAVE_SPLIT_USR
117 { "/lib/modules", INACCESSIBLE, true },
118 #endif
119 { "/usr/lib/modules", INACCESSIBLE, true },
120 };
121
122 /*
123 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
124 * system should be protected by ProtectSystem=
125 */
126 static const MountEntry protect_home_read_only_table[] = {
127 { "/home", READONLY, true },
128 { "/run/user", READONLY, true },
129 { "/root", READONLY, true },
130 };
131
132 /* ProtectHome=tmpfs table */
133 static const MountEntry protect_home_tmpfs_table[] = {
134 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
135 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
136 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
137 };
138
139 /* ProtectHome=yes table */
140 static const MountEntry protect_home_yes_table[] = {
141 { "/home", INACCESSIBLE, true },
142 { "/run/user", INACCESSIBLE, true },
143 { "/root", INACCESSIBLE, true },
144 };
145
146 /* ProtectSystem=yes table */
147 static const MountEntry protect_system_yes_table[] = {
148 { "/usr", READONLY, false },
149 { "/boot", READONLY, true },
150 { "/efi", READONLY, true },
151 #if HAVE_SPLIT_USR
152 { "/lib", READONLY, true },
153 { "/lib64", READONLY, true },
154 { "/bin", READONLY, true },
155 # if HAVE_SPLIT_BIN
156 { "/sbin", READONLY, true },
157 # endif
158 #endif
159 };
160
161 /* ProtectSystem=full includes ProtectSystem=yes */
162 static const MountEntry protect_system_full_table[] = {
163 { "/usr", READONLY, false },
164 { "/boot", READONLY, true },
165 { "/efi", READONLY, true },
166 { "/etc", READONLY, false },
167 #if HAVE_SPLIT_USR
168 { "/lib", READONLY, true },
169 { "/lib64", READONLY, true },
170 { "/bin", READONLY, true },
171 # if HAVE_SPLIT_BIN
172 { "/sbin", READONLY, true },
173 # endif
174 #endif
175 };
176
177 /*
178 * ProtectSystem=strict table. In this strict mode, we mount everything
179 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
180 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
181 * protect those, and these options should be fully orthogonal.
182 * (And of course /home and friends are also left writable, as ProtectHome=
183 * shall manage those, orthogonally).
184 */
185 static const MountEntry protect_system_strict_table[] = {
186 { "/", READONLY, false },
187 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
188 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
189 { "/dev", READWRITE, false }, /* PrivateDevices= */
190 { "/home", READWRITE, true }, /* ProtectHome= */
191 { "/run/user", READWRITE, true }, /* ProtectHome= */
192 { "/root", READWRITE, true }, /* ProtectHome= */
193 };
194
195 static const char *mount_entry_path(const MountEntry *p) {
196 assert(p);
197
198 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
199 * otherwise the stack/static ->path field is returned. */
200
201 return p->path_malloc ?: p->path_const;
202 }
203
204 static bool mount_entry_read_only(const MountEntry *p) {
205 assert(p);
206
207 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
208 }
209
210 static const char *mount_entry_source(const MountEntry *p) {
211 assert(p);
212
213 return p->source_malloc ?: p->source_const;
214 }
215
216 static const char *mount_entry_options(const MountEntry *p) {
217 assert(p);
218
219 return p->options_malloc ?: p->options_const;
220 }
221
222 static void mount_entry_done(MountEntry *p) {
223 assert(p);
224
225 p->path_malloc = mfree(p->path_malloc);
226 p->source_malloc = mfree(p->source_malloc);
227 p->options_malloc = mfree(p->options_malloc);
228 }
229
230 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
231 char **i;
232
233 assert(p);
234
235 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
236
237 STRV_FOREACH(i, strv) {
238 bool ignore = false, needs_prefix = false;
239 const char *e = *i;
240
241 /* Look for any prefixes */
242 if (startswith(e, "-")) {
243 e++;
244 ignore = true;
245 }
246 if (startswith(e, "+")) {
247 e++;
248 needs_prefix = true;
249 }
250
251 if (!path_is_absolute(e))
252 return -EINVAL;
253
254 *((*p)++) = (MountEntry) {
255 .path_const = e,
256 .mode = mode,
257 .ignore = ignore,
258 .has_prefix = !needs_prefix && !forcibly_require_prefix,
259 };
260 }
261
262 return 0;
263 }
264
265 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
266 char **i;
267
268 assert(p);
269
270 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
271 * "/private/" boundary directories for DynamicUser=1. */
272
273 STRV_FOREACH(i, strv) {
274
275 *((*p)++) = (MountEntry) {
276 .path_const = *i,
277 .mode = EMPTY_DIR,
278 .ignore = false,
279 .has_prefix = false,
280 .read_only = true,
281 .options_const = "mode=755",
282 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
283 };
284 }
285
286 return 0;
287 }
288
289 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
290 unsigned i;
291
292 assert(p);
293
294 for (i = 0; i < n; i++) {
295 const BindMount *b = binds + i;
296
297 *((*p)++) = (MountEntry) {
298 .path_const = b->destination,
299 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
300 .read_only = b->read_only,
301 .source_const = b->source,
302 .ignore = b->ignore_enoent,
303 };
304 }
305
306 return 0;
307 }
308
309 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
310 unsigned i;
311 int r;
312
313 assert(p);
314
315 for (i = 0; i < n; i++) {
316 const TemporaryFileSystem *t = tmpfs + i;
317 _cleanup_free_ char *o = NULL, *str = NULL;
318 unsigned long flags = MS_NODEV|MS_STRICTATIME;
319 bool ro = false;
320
321 if (!path_is_absolute(t->path))
322 return -EINVAL;
323
324 if (!isempty(t->options)) {
325 str = strjoin("mode=0755,", t->options);
326 if (!str)
327 return -ENOMEM;
328
329 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
330 if (r < 0)
331 return r;
332
333 ro = !!(flags & MS_RDONLY);
334 if (ro)
335 flags ^= MS_RDONLY;
336 }
337
338 *((*p)++) = (MountEntry) {
339 .path_const = t->path,
340 .mode = TMPFS,
341 .read_only = ro,
342 .options_malloc = o,
343 .flags = flags,
344 };
345
346 o = NULL;
347 }
348
349 return 0;
350 }
351
352 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
353 unsigned i;
354
355 assert(p);
356 assert(mounts);
357
358 /* Adds a list of static pre-defined entries */
359
360 for (i = 0; i < n; i++)
361 *((*p)++) = (MountEntry) {
362 .path_const = mount_entry_path(mounts+i),
363 .mode = mounts[i].mode,
364 .ignore = mounts[i].ignore || ignore_protect,
365 };
366
367 return 0;
368 }
369
370 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
371 assert(p);
372
373 switch (protect_home) {
374
375 case PROTECT_HOME_NO:
376 return 0;
377
378 case PROTECT_HOME_READ_ONLY:
379 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
380
381 case PROTECT_HOME_TMPFS:
382 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
383
384 case PROTECT_HOME_YES:
385 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
386
387 default:
388 assert_not_reached("Unexpected ProtectHome= value");
389 }
390 }
391
392 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
393 assert(p);
394
395 switch (protect_system) {
396
397 case PROTECT_SYSTEM_NO:
398 return 0;
399
400 case PROTECT_SYSTEM_STRICT:
401 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
402
403 case PROTECT_SYSTEM_YES:
404 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
405
406 case PROTECT_SYSTEM_FULL:
407 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
408
409 default:
410 assert_not_reached("Unexpected ProtectSystem= value");
411 }
412 }
413
414 static int mount_path_compare(const void *a, const void *b) {
415 const MountEntry *p = a, *q = b;
416 int d;
417
418 /* If the paths are not equal, then order prefixes first */
419 d = path_compare(mount_entry_path(p), mount_entry_path(q));
420 if (d != 0)
421 return d;
422
423 /* If the paths are equal, check the mode */
424 if (p->mode < q->mode)
425 return -1;
426
427 if (p->mode > q->mode)
428 return 1;
429
430 return 0;
431 }
432
433 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
434 unsigned i;
435
436 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
437 * that. */
438
439 if (!root_directory)
440 return 0;
441
442 for (i = 0; i < n; i++) {
443 char *s;
444
445 if (m[i].has_prefix)
446 continue;
447
448 s = prefix_root(root_directory, mount_entry_path(m+i));
449 if (!s)
450 return -ENOMEM;
451
452 free_and_replace(m[i].path_malloc, s);
453 m[i].has_prefix = true;
454 }
455
456 return 0;
457 }
458
459 static void drop_duplicates(MountEntry *m, unsigned *n) {
460 MountEntry *f, *t, *previous;
461
462 assert(m);
463 assert(n);
464
465 /* Drops duplicate entries. Expects that the array is properly ordered already. */
466
467 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
468
469 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
470 * above. */
471 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
472 log_debug("%s is duplicate.", mount_entry_path(f));
473 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
474 mount_entry_done(f);
475 continue;
476 }
477
478 *t = *f;
479 previous = t;
480 t++;
481 }
482
483 *n = t - m;
484 }
485
486 static void drop_inaccessible(MountEntry *m, unsigned *n) {
487 MountEntry *f, *t;
488 const char *clear = NULL;
489
490 assert(m);
491 assert(n);
492
493 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
494 * ordered already. */
495
496 for (f = m, t = m; f < m + *n; f++) {
497
498 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
499 * it, as inaccessible paths really should drop the entire subtree. */
500 if (clear && path_startswith(mount_entry_path(f), clear)) {
501 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
502 mount_entry_done(f);
503 continue;
504 }
505
506 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
507
508 *t = *f;
509 t++;
510 }
511
512 *n = t - m;
513 }
514
515 static void drop_nop(MountEntry *m, unsigned *n) {
516 MountEntry *f, *t;
517
518 assert(m);
519 assert(n);
520
521 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
522 * list is ordered by prefixes. */
523
524 for (f = m, t = m; f < m + *n; f++) {
525
526 /* Only suppress such subtrees for READONLY and READWRITE entries */
527 if (IN_SET(f->mode, READONLY, READWRITE)) {
528 MountEntry *p;
529 bool found = false;
530
531 /* Now let's find the first parent of the entry we are looking at. */
532 for (p = t-1; p >= m; p--) {
533 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
534 found = true;
535 break;
536 }
537 }
538
539 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
540 if (found && p->mode == f->mode) {
541 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
542 mount_entry_done(f);
543 continue;
544 }
545 }
546
547 *t = *f;
548 t++;
549 }
550
551 *n = t - m;
552 }
553
554 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
555 MountEntry *f, *t;
556
557 assert(m);
558 assert(n);
559
560 /* Nothing to do */
561 if (!root_directory)
562 return;
563
564 /* Drops all mounts that are outside of the root directory. */
565
566 for (f = m, t = m; f < m + *n; f++) {
567
568 if (!path_startswith(mount_entry_path(f), root_directory)) {
569 log_debug("%s is outside of root directory.", mount_entry_path(f));
570 mount_entry_done(f);
571 continue;
572 }
573
574 *t = *f;
575 t++;
576 }
577
578 *n = t - m;
579 }
580
static int clone_device_node(const char *d, const char *temporary_mount) {
        const char *dn;
        struct stat st;
        int r;

        /* Creates a device node below 'temporary_mount' that has the same type, access mode and device number
         * as the existing node 'd' (an absolute path, which is appended to 'temporary_mount' as-is).
         *
         * Returns 1 if the node was created, 0 if 'd' does not exist or has no device number (nothing is
         * created then), -EINVAL if 'd' is neither a block nor a character device, and a negative errno-style
         * error if stat() or mknod() fail. */

        if (stat(d, &st) < 0) {
                if (errno == ENOENT)
                        return 0;
                return -errno;
        }

        if (!S_ISBLK(st.st_mode) &&
            !S_ISCHR(st.st_mode))
                return -EINVAL;

        if (st.st_rdev == 0)
                return 0;

        dn = strjoina(temporary_mount, d);

        mac_selinux_create_file_prepare(d, st.st_mode);
        /* Pick up errno right away: the SELinux cleanup call below runs before we get to look at the result
         * and might overwrite it. */
        r = mknod(dn, st.st_mode, st.st_rdev) < 0 ? -errno : 0;
        mac_selinux_create_file_clear();
        if (r < 0)
                return log_debug_errno(r, "mknod failed for %s: %m", d);

        return 1;
}
609
610 static int mount_private_dev(MountEntry *m) {
611 static const char devnodes[] =
612 "/dev/null\0"
613 "/dev/zero\0"
614 "/dev/full\0"
615 "/dev/random\0"
616 "/dev/urandom\0"
617 "/dev/tty\0";
618
619 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
620 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
621 _cleanup_umask_ mode_t u;
622 int r;
623
624 assert(m);
625
626 u = umask(0000);
627
628 if (!mkdtemp(temporary_mount))
629 return -errno;
630
631 dev = strjoina(temporary_mount, "/dev");
632 (void) mkdir(dev, 0755);
633 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
634 r = -errno;
635 goto fail;
636 }
637
638 devpts = strjoina(temporary_mount, "/dev/pts");
639 (void) mkdir(devpts, 0755);
640 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
641 r = -errno;
642 goto fail;
643 }
644
645 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
646 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
647 * thus, in that case make a clone
648 *
649 * in nspawn and other containers it will be a symlink, in that case make it a symlink
650 */
651 r = is_symlink("/dev/ptmx");
652 if (r < 0)
653 goto fail;
654 if (r > 0) {
655 devptmx = strjoina(temporary_mount, "/dev/ptmx");
656 if (symlink("pts/ptmx", devptmx) < 0) {
657 r = -errno;
658 goto fail;
659 }
660 } else {
661 r = clone_device_node("/dev/ptmx", temporary_mount);
662 if (r < 0)
663 goto fail;
664 if (r == 0) {
665 r = -ENXIO;
666 goto fail;
667 }
668 }
669
670 devshm = strjoina(temporary_mount, "/dev/shm");
671 (void) mkdir(devshm, 0755);
672 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
673 if (r < 0) {
674 r = -errno;
675 goto fail;
676 }
677
678 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
679 (void) mkdir(devmqueue, 0755);
680 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
681
682 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
683 (void) mkdir(devhugepages, 0755);
684 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
685
686 devlog = strjoina(temporary_mount, "/dev/log");
687 (void) symlink("/run/systemd/journal/dev-log", devlog);
688
689 NULSTR_FOREACH(d, devnodes) {
690 r = clone_device_node(d, temporary_mount);
691 if (r < 0)
692 goto fail;
693 }
694
695 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
696
697 /* Create the /dev directory if missing. It is more likely to be
698 * missing when the service is started with RootDirectory. This is
699 * consistent with mount units creating the mount points when missing.
700 */
701 (void) mkdir_p_label(mount_entry_path(m), 0755);
702
703 /* Unmount everything in old /dev */
704 umount_recursive(mount_entry_path(m), 0);
705 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
706 r = -errno;
707 goto fail;
708 }
709
710 rmdir(dev);
711 rmdir(temporary_mount);
712
713 return 0;
714
715 fail:
716 if (devpts)
717 umount(devpts);
718
719 if (devshm)
720 umount(devshm);
721
722 if (devhugepages)
723 umount(devhugepages);
724
725 if (devmqueue)
726 umount(devmqueue);
727
728 umount(dev);
729 rmdir(dev);
730 rmdir(temporary_mount);
731
732 return r;
733 }
734
735 static int mount_bind_dev(const MountEntry *m) {
736 int r;
737
738 assert(m);
739
740 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
741 * /dev. This is only used when RootDirectory= is set. */
742
743 (void) mkdir_p_label(mount_entry_path(m), 0755);
744
745 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
746 if (r < 0)
747 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
748 if (r > 0) /* make this a NOP if /dev is already a mount point */
749 return 0;
750
751 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
752 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
753
754 return 1;
755 }
756
757 static int mount_sysfs(const MountEntry *m) {
758 int r;
759
760 assert(m);
761
762 (void) mkdir_p_label(mount_entry_path(m), 0755);
763
764 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
765 if (r < 0)
766 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
767 if (r > 0) /* make this a NOP if /sys is already a mount point */
768 return 0;
769
770 /* Bind mount the host's version so that we get all child mounts of it, too. */
771 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
772 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
773
774 return 1;
775 }
776
777 static int mount_procfs(const MountEntry *m) {
778 int r;
779
780 assert(m);
781
782 (void) mkdir_p_label(mount_entry_path(m), 0755);
783
784 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
785 if (r < 0)
786 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
787 if (r > 0) /* make this a NOP if /proc is already a mount point */
788 return 0;
789
790 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
791 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
792 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
793
794 return 1;
795 }
796
797 static int mount_tmpfs(const MountEntry *m) {
798 assert(m);
799
800 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
801
802 (void) mkdir_p_label(mount_entry_path(m), 0755);
803 (void) umount_recursive(mount_entry_path(m), 0);
804
805 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
806 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
807
808 return 1;
809 }
810
811 static int mount_entry_chase(
812 const char *root_directory,
813 const MountEntry *m,
814 const char *path,
815 bool chase_nonexistent,
816 char **location) {
817
818 char *chased;
819 int r;
820
821 assert(m);
822
823 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
824 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
825 * that applies). The result is stored in "location". */
826
827 r = chase_symlinks(path, root_directory, CHASE_TRAIL_SLASH | (chase_nonexistent ? CHASE_NONEXISTENT : 0), &chased);
828 if (r == -ENOENT && m->ignore) {
829 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
830 return 0;
831 }
832 if (r < 0)
833 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
834
835 log_debug("Followed symlinks %s → %s.", path, chased);
836
837 free(*location);
838 *location = chased;
839
840 return 1;
841 }
842
843 static int apply_mount(
844 const char *root_directory,
845 MountEntry *m) {
846
847 bool rbind = true, make = false;
848 const char *what;
849 int r;
850
851 assert(m);
852
853 r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
854 if (r <= 0)
855 return r;
856
857 log_debug("Applying namespace mount on %s", mount_entry_path(m));
858
859 switch (m->mode) {
860
861 case INACCESSIBLE: {
862 struct stat target;
863
864 /* First, get rid of everything that is below if there
865 * is anything... Then, overmount it with an
866 * inaccessible path. */
867 (void) umount_recursive(mount_entry_path(m), 0);
868
869 if (lstat(mount_entry_path(m), &target) < 0)
870 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
871
872 what = mode_to_inaccessible_node(target.st_mode);
873 if (!what) {
874 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
875 return -ELOOP;
876 }
877 break;
878 }
879
880 case READONLY:
881 case READWRITE:
882 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
883 if (r < 0)
884 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
885 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
886 return 0;
887 /* This isn't a mount point yet, let's make it one. */
888 what = mount_entry_path(m);
889 break;
890
891 case BIND_MOUNT:
892 rbind = false;
893
894 _fallthrough_;
895 case BIND_MOUNT_RECURSIVE:
896 /* Also chase the source mount */
897
898 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
899 if (r <= 0)
900 return r;
901
902 what = mount_entry_source(m);
903 make = true;
904 break;
905
906 case EMPTY_DIR:
907 case TMPFS:
908 return mount_tmpfs(m);
909
910 case PRIVATE_TMP:
911 what = mount_entry_source(m);
912 make = true;
913 break;
914
915 case PRIVATE_DEV:
916 return mount_private_dev(m);
917
918 case BIND_DEV:
919 return mount_bind_dev(m);
920
921 case SYSFS:
922 return mount_sysfs(m);
923
924 case PROCFS:
925 return mount_procfs(m);
926
927 default:
928 assert_not_reached("Unknown mode");
929 }
930
931 assert(what);
932
933 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
934 bool try_again = false;
935 r = -errno;
936
937 if (r == -ENOENT && make) {
938 struct stat st;
939
940 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
941
942 if (stat(what, &st) >= 0) {
943
944 (void) mkdir_parents(mount_entry_path(m), 0755);
945
946 if (S_ISDIR(st.st_mode))
947 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
948 else
949 try_again = touch(mount_entry_path(m)) >= 0;
950 }
951 }
952
953 if (try_again) {
954 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
955 r = -errno;
956 else
957 r = 0;
958 }
959
960 if (r < 0)
961 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
962 }
963
964 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
965 return 0;
966 }
967
968 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
969 int r = 0;
970
971 assert(m);
972 assert(proc_self_mountinfo);
973
974 if (mount_entry_read_only(m)) {
975 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
976 /* Make superblock readonly */
977 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
978 r = -errno;
979 } else
980 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
981 } else if (m->mode == PRIVATE_DEV) {
982 /* Superblock can be readonly but the submounts can't */
983 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
984 r = -errno;
985 } else
986 return 0;
987
988 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
989 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
990 * read-only mounts already applied. */
991
992 if (r == -ENOENT && m->ignore)
993 r = 0;
994
995 return r;
996 }
997
998 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
999 assert(ns_info);
1000
1001 /*
1002 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1003 * since to protect the API VFS mounts, they need to be around in the
1004 * first place... and RootDirectory= or RootImage= need to be set.
1005 */
1006
1007 /* root_directory should point to a mount point */
1008 return root_directory &&
1009 (ns_info->mount_apivfs ||
1010 ns_info->protect_control_groups ||
1011 ns_info->protect_kernel_tunables);
1012 }
1013
1014 static unsigned namespace_calculate_mounts(
1015 const char* root_directory,
1016 const NamespaceInfo *ns_info,
1017 char** read_write_paths,
1018 char** read_only_paths,
1019 char** inaccessible_paths,
1020 char** empty_directories,
1021 unsigned n_bind_mounts,
1022 unsigned n_temporary_filesystems,
1023 const char* tmp_dir,
1024 const char* var_tmp_dir,
1025 ProtectHome protect_home,
1026 ProtectSystem protect_system) {
1027
1028 unsigned protect_home_cnt;
1029 unsigned protect_system_cnt =
1030 (protect_system == PROTECT_SYSTEM_STRICT ?
1031 ELEMENTSOF(protect_system_strict_table) :
1032 ((protect_system == PROTECT_SYSTEM_FULL) ?
1033 ELEMENTSOF(protect_system_full_table) :
1034 ((protect_system == PROTECT_SYSTEM_YES) ?
1035 ELEMENTSOF(protect_system_yes_table) : 0)));
1036
1037 protect_home_cnt =
1038 (protect_home == PROTECT_HOME_YES ?
1039 ELEMENTSOF(protect_home_yes_table) :
1040 ((protect_home == PROTECT_HOME_READ_ONLY) ?
1041 ELEMENTSOF(protect_home_read_only_table) :
1042 ((protect_home == PROTECT_HOME_TMPFS) ?
1043 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1044
1045 return !!tmp_dir + !!var_tmp_dir +
1046 strv_length(read_write_paths) +
1047 strv_length(read_only_paths) +
1048 strv_length(inaccessible_paths) +
1049 strv_length(empty_directories) +
1050 n_bind_mounts +
1051 n_temporary_filesystems +
1052 ns_info->private_dev +
1053 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1054 (ns_info->protect_control_groups ? 1 : 0) +
1055 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1056 protect_home_cnt + protect_system_cnt +
1057 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1058 }
1059
1060 int setup_namespace(
1061 const char* root_directory,
1062 const char* root_image,
1063 const NamespaceInfo *ns_info,
1064 char** read_write_paths,
1065 char** read_only_paths,
1066 char** inaccessible_paths,
1067 char** empty_directories,
1068 const BindMount *bind_mounts,
1069 unsigned n_bind_mounts,
1070 const TemporaryFileSystem *temporary_filesystems,
1071 unsigned n_temporary_filesystems,
1072 const char* tmp_dir,
1073 const char* var_tmp_dir,
1074 ProtectHome protect_home,
1075 ProtectSystem protect_system,
1076 unsigned long mount_flags,
1077 DissectImageFlags dissect_image_flags) {
1078
1079 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1080 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1081 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1082 _cleanup_free_ void *root_hash = NULL;
1083 MountEntry *m, *mounts = NULL;
1084 size_t root_hash_size = 0;
1085 bool make_slave = false;
1086 const char *root;
1087 unsigned n_mounts;
1088 bool require_prefix = false;
1089 int r = 0;
1090
1091 assert(ns_info);
1092
1093 if (mount_flags == 0)
1094 mount_flags = MS_SHARED;
1095
1096 if (root_image) {
1097 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1098
1099 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1100 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1101
1102 r = loop_device_make_by_path(root_image,
1103 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1104 &loop_device);
1105 if (r < 0)
1106 return r;
1107
1108 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1109 if (r < 0)
1110 return r;
1111
1112 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1113 if (r < 0)
1114 return r;
1115
1116 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1117 if (r < 0)
1118 return r;
1119 }
1120
1121 if (root_directory)
1122 root = root_directory;
1123 else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
1124
1125 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1126 * the same mount point for all images, which is safe, since they all live in their own namespaces
1127 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1128 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1129 * while we are applying them. */
1130
1131 root = "/run/systemd/unit-root";
1132 (void) mkdir_label(root, 0700);
1133 require_prefix = true;
1134 } else
1135 root = NULL;
1136
1137 n_mounts = namespace_calculate_mounts(
1138 root,
1139 ns_info,
1140 read_write_paths,
1141 read_only_paths,
1142 inaccessible_paths,
1143 empty_directories,
1144 n_bind_mounts,
1145 n_temporary_filesystems,
1146 tmp_dir, var_tmp_dir,
1147 protect_home, protect_system);
1148
1149 /* Set mount slave mode */
1150 if (root || n_mounts > 0)
1151 make_slave = true;
1152
1153 if (n_mounts > 0) {
1154 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1155 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1156 if (r < 0)
1157 goto finish;
1158
1159 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1160 if (r < 0)
1161 goto finish;
1162
1163 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1164 if (r < 0)
1165 goto finish;
1166
1167 r = append_empty_dir_mounts(&m, empty_directories);
1168 if (r < 0)
1169 goto finish;
1170
1171 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1172 if (r < 0)
1173 goto finish;
1174
1175 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1176 if (r < 0)
1177 goto finish;
1178
1179 if (tmp_dir) {
1180 *(m++) = (MountEntry) {
1181 .path_const = "/tmp",
1182 .mode = PRIVATE_TMP,
1183 .source_const = tmp_dir,
1184 };
1185 }
1186
1187 if (var_tmp_dir) {
1188 *(m++) = (MountEntry) {
1189 .path_const = "/var/tmp",
1190 .mode = PRIVATE_TMP,
1191 .source_const = var_tmp_dir,
1192 };
1193 }
1194
1195 if (ns_info->private_dev) {
1196 *(m++) = (MountEntry) {
1197 .path_const = "/dev",
1198 .mode = PRIVATE_DEV,
1199 };
1200 }
1201
1202 if (ns_info->protect_kernel_tunables) {
1203 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1204 if (r < 0)
1205 goto finish;
1206 }
1207
1208 if (ns_info->protect_kernel_modules) {
1209 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1210 if (r < 0)
1211 goto finish;
1212 }
1213
1214 if (ns_info->protect_control_groups) {
1215 *(m++) = (MountEntry) {
1216 .path_const = "/sys/fs/cgroup",
1217 .mode = READONLY,
1218 };
1219 }
1220
1221 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1222 if (r < 0)
1223 goto finish;
1224
1225 r = append_protect_system(&m, protect_system, false);
1226 if (r < 0)
1227 goto finish;
1228
1229 if (namespace_info_mount_apivfs(root, ns_info)) {
1230 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1231 if (r < 0)
1232 goto finish;
1233 }
1234
1235 assert(mounts + n_mounts == m);
1236
1237 /* Prepend the root directory where that's necessary */
1238 r = prefix_where_needed(mounts, n_mounts, root);
1239 if (r < 0)
1240 goto finish;
1241
1242 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
1243
1244 drop_duplicates(mounts, &n_mounts);
1245 drop_outside_root(root, mounts, &n_mounts);
1246 drop_inaccessible(mounts, &n_mounts);
1247 drop_nop(mounts, &n_mounts);
1248 }
1249
1250 if (unshare(CLONE_NEWNS) < 0) {
1251 r = -errno;
1252 goto finish;
1253 }
1254
1255 if (make_slave) {
1256 /* Remount / as SLAVE so that nothing now mounted in the namespace
1257 shows up in the parent */
1258 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1259 r = -errno;
1260 goto finish;
1261 }
1262 }
1263
1264 if (root_image) {
1265 /* A root image is specified, mount it to the right place */
1266 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1267 if (r < 0)
1268 goto finish;
1269
1270 if (decrypted_image) {
1271 r = decrypted_image_relinquish(decrypted_image);
1272 if (r < 0)
1273 goto finish;
1274 }
1275
1276 loop_device_relinquish(loop_device);
1277
1278 } else if (root_directory) {
1279
1280 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1281 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1282 if (r < 0)
1283 goto finish;
1284 if (r == 0) {
1285 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1286 r = -errno;
1287 goto finish;
1288 }
1289 }
1290
1291 } else if (root) {
1292
1293 /* Let's mount the main root directory to the root directory to use */
1294 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1295 r = -errno;
1296 goto finish;
1297 }
1298 }
1299
1300 /* Try to set up the new root directory before mounting anything else there. */
1301 if (root_image || root_directory)
1302 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1303
1304 if (n_mounts > 0) {
1305 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1306 char **blacklist;
1307 unsigned j;
1308
1309 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1310 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1311 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1312 if (!proc_self_mountinfo) {
1313 r = -errno;
1314 goto finish;
1315 }
1316
1317 /* First round, add in all special mounts we need */
1318 for (m = mounts; m < mounts + n_mounts; ++m) {
1319 r = apply_mount(root, m);
1320 if (r < 0)
1321 goto finish;
1322 }
1323
1324 /* Create a blacklist we can pass to bind_mount_recursive() */
1325 blacklist = newa(char*, n_mounts+1);
1326 for (j = 0; j < n_mounts; j++)
1327 blacklist[j] = (char*) mount_entry_path(mounts+j);
1328 blacklist[j] = NULL;
1329
1330 /* Second round, flip the ro bits if necessary. */
1331 for (m = mounts; m < mounts + n_mounts; ++m) {
1332 r = make_read_only(m, blacklist, proc_self_mountinfo);
1333 if (r < 0)
1334 goto finish;
1335 }
1336 }
1337
1338 if (root) {
1339 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1340 r = mount_move_root(root);
1341 if (r < 0)
1342 goto finish;
1343 }
1344
1345 /* Remount / as the desired mode. Note that this will not
1346 * reestablish propagation from our side to the host, since
1347 * what's disconnected is disconnected. */
1348 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1349 r = -errno;
1350 goto finish;
1351 }
1352
1353 r = 0;
1354
1355 finish:
1356 for (m = mounts; m < mounts + n_mounts; m++)
1357 mount_entry_done(m);
1358
1359 return r;
1360 }
1361
1362 void bind_mount_free_many(BindMount *b, unsigned n) {
1363 unsigned i;
1364
1365 assert(b || n == 0);
1366
1367 for (i = 0; i < n; i++) {
1368 free(b[i].source);
1369 free(b[i].destination);
1370 }
1371
1372 free(b);
1373 }
1374
1375 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1376 _cleanup_free_ char *s = NULL, *d = NULL;
1377 BindMount *c;
1378
1379 assert(b);
1380 assert(n);
1381 assert(item);
1382
1383 s = strdup(item->source);
1384 if (!s)
1385 return -ENOMEM;
1386
1387 d = strdup(item->destination);
1388 if (!d)
1389 return -ENOMEM;
1390
1391 c = reallocarray(*b, *n + 1, sizeof(BindMount));
1392 if (!c)
1393 return -ENOMEM;
1394
1395 *b = c;
1396
1397 c[(*n) ++] = (BindMount) {
1398 .source = TAKE_PTR(s),
1399 .destination = TAKE_PTR(d),
1400 .read_only = item->read_only,
1401 .recursive = item->recursive,
1402 .ignore_enoent = item->ignore_enoent,
1403 };
1404
1405 return 0;
1406 }
1407
1408 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1409 unsigned i;
1410
1411 assert(t || n == 0);
1412
1413 for (i = 0; i < n; i++) {
1414 free(t[i].path);
1415 free(t[i].options);
1416 }
1417
1418 free(t);
1419 }
1420
1421 int temporary_filesystem_add(
1422 TemporaryFileSystem **t,
1423 unsigned *n,
1424 const char *path,
1425 const char *options) {
1426
1427 _cleanup_free_ char *p = NULL, *o = NULL;
1428 TemporaryFileSystem *c;
1429
1430 assert(t);
1431 assert(n);
1432 assert(path);
1433
1434 p = strdup(path);
1435 if (!p)
1436 return -ENOMEM;
1437
1438 if (!isempty(options)) {
1439 o = strdup(options);
1440 if (!o)
1441 return -ENOMEM;
1442 }
1443
1444 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1445 if (!c)
1446 return -ENOMEM;
1447
1448 *t = c;
1449
1450 c[(*n) ++] = (TemporaryFileSystem) {
1451 .path = TAKE_PTR(p),
1452 .options = TAKE_PTR(o),
1453 };
1454
1455 return 0;
1456 }
1457
1458 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1459 _cleanup_free_ char *x = NULL;
1460 char bid[SD_ID128_STRING_MAX];
1461 sd_id128_t boot_id;
1462 int r;
1463
1464 assert(id);
1465 assert(prefix);
1466 assert(path);
1467
1468 /* We include the boot id in the directory so that after a
1469 * reboot we can easily identify obsolete directories. */
1470
1471 r = sd_id128_get_boot(&boot_id);
1472 if (r < 0)
1473 return r;
1474
1475 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1476 if (!x)
1477 return -ENOMEM;
1478
1479 RUN_WITH_UMASK(0077)
1480 if (!mkdtemp(x))
1481 return -errno;
1482
1483 RUN_WITH_UMASK(0000) {
1484 char *y;
1485
1486 y = strjoina(x, "/tmp");
1487
1488 if (mkdir(y, 0777 | S_ISVTX) < 0)
1489 return -errno;
1490 }
1491
1492 *path = TAKE_PTR(x);
1493
1494 return 0;
1495 }
1496
/* Creates one private temporary directory below /tmp and one below /var/tmp for the given id.
 *
 * On success 0 is returned and the two newly allocated directory paths are stored in *tmp_dir and
 * *var_tmp_dir. On failure a negative errno-style error code is returned and the outputs are left
 * untouched; if only the second directory could not be created, the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Second directory failed: undo the first one, inner "tmp" directory first */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1526
1527 int setup_netns(int netns_storage_socket[2]) {
1528 _cleanup_close_ int netns = -1;
1529 int r, q;
1530
1531 assert(netns_storage_socket);
1532 assert(netns_storage_socket[0] >= 0);
1533 assert(netns_storage_socket[1] >= 0);
1534
1535 /* We use the passed socketpair as a storage buffer for our
1536 * namespace reference fd. Whatever process runs this first
1537 * shall create a new namespace, all others should just join
1538 * it. To serialize that we use a file lock on the socket
1539 * pair.
1540 *
1541 * It's a bit crazy, but hey, works great! */
1542
1543 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1544 return -errno;
1545
1546 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1547 if (netns == -EAGAIN) {
1548 /* Nothing stored yet, so let's create a new namespace */
1549
1550 if (unshare(CLONE_NEWNET) < 0) {
1551 r = -errno;
1552 goto fail;
1553 }
1554
1555 loopback_setup();
1556
1557 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1558 if (netns < 0) {
1559 r = -errno;
1560 goto fail;
1561 }
1562
1563 r = 1;
1564
1565 } else if (netns < 0) {
1566 r = netns;
1567 goto fail;
1568
1569 } else {
1570 /* Yay, found something, so let's join the namespace */
1571 if (setns(netns, CLONE_NEWNET) < 0) {
1572 r = -errno;
1573 goto fail;
1574 }
1575
1576 r = 0;
1577 }
1578
1579 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1580 if (q < 0) {
1581 r = q;
1582 goto fail;
1583 }
1584
1585 fail:
1586 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1587 return r;
1588 }
1589
1590 bool ns_type_supported(NamespaceType type) {
1591 const char *t, *ns_proc;
1592
1593 t = namespace_type_to_string(type);
1594 if (!t) /* Don't know how to translate this? Then it's not supported */
1595 return false;
1596
1597 ns_proc = strjoina("/proc/self/ns/", t);
1598 return access(ns_proc, F_OK) == 0;
1599 }
1600
1601 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1602 [PROTECT_HOME_NO] = "no",
1603 [PROTECT_HOME_YES] = "yes",
1604 [PROTECT_HOME_READ_ONLY] = "read-only",
1605 [PROTECT_HOME_TMPFS] = "tmpfs",
1606 };
1607
1608 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1609
1610 ProtectHome parse_protect_home_or_bool(const char *s) {
1611 int r;
1612
1613 r = parse_boolean(s);
1614 if (r > 0)
1615 return PROTECT_HOME_YES;
1616 if (r == 0)
1617 return PROTECT_HOME_NO;
1618
1619 return protect_home_from_string(s);
1620 }
1621
1622 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1623 [PROTECT_SYSTEM_NO] = "no",
1624 [PROTECT_SYSTEM_YES] = "yes",
1625 [PROTECT_SYSTEM_FULL] = "full",
1626 [PROTECT_SYSTEM_STRICT] = "strict",
1627 };
1628
1629 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1630
1631 ProtectSystem parse_protect_system_or_bool(const char *s) {
1632 int r;
1633
1634 r = parse_boolean(s);
1635 if (r > 0)
1636 return PROTECT_SYSTEM_YES;
1637 if (r == 0)
1638 return PROTECT_SYSTEM_NO;
1639
1640 return protect_system_from_string(s);
1641 }
1642
1643 static const char* const namespace_type_table[] = {
1644 [NAMESPACE_MOUNT] = "mnt",
1645 [NAMESPACE_CGROUP] = "cgroup",
1646 [NAMESPACE_UTS] = "uts",
1647 [NAMESPACE_IPC] = "ipc",
1648 [NAMESPACE_USER] = "user",
1649 [NAMESPACE_PID] = "pid",
1650 [NAMESPACE_NET] = "net",
1651 };
1652
1653 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);