]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
Merge pull request #8205 from poettering/bpf-multi
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <sched.h>
23 #include <stdio.h>
24 #include <string.h>
25 #include <sys/mount.h>
26 #include <sys/stat.h>
27 #include <unistd.h>
28 #include <linux/fs.h>
29
30 #include "alloc-util.h"
31 #include "base-filesystem.h"
32 #include "dev-setup.h"
33 #include "fd-util.h"
34 #include "fs-util.h"
35 #include "label.h"
36 #include "loop-util.h"
37 #include "loopback-setup.h"
38 #include "missing.h"
39 #include "mkdir.h"
40 #include "mount-util.h"
41 #include "namespace.h"
42 #include "path-util.h"
43 #include "selinux-util.h"
44 #include "socket-util.h"
45 #include "stat-util.h"
46 #include "string-table.h"
47 #include "string-util.h"
48 #include "strv.h"
49 #include "umask-util.h"
50 #include "user-util.h"
51 #include "util.h"
52
/* Mount flags used for the private /dev tmpfs instance: no setuid, no executables, strict atime updates. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
54
typedef enum MountMode {
        /* The numeric order is a priority order: for two entries on the same path, mount_path_compare() sorts
         * the one with the lower value first. Do not reorder. */
        INACCESSIBLE,           /* overmount with an inaccessible node */
        BIND_MOUNT,             /* non-recursive bind mount from a source path */
        BIND_MOUNT_RECURSIVE,   /* recursive bind mount from a source path */
        PRIVATE_TMP,            /* bind mount of the entry's source path */
        PRIVATE_DEV,            /* freshly populated private /dev */
        BIND_DEV,               /* bind mount of the host's /dev */
        EMPTY_DIR,              /* tmpfs providing an empty directory */
        SYSFS,                  /* bind mount of the host's /sys */
        PROCFS,                 /* new procfs instance */
        READONLY,               /* make the path a read-only mount */
        READWRITE,              /* make the path a mount point, left writable */
        TMPFS,                  /* tmpfs with caller-supplied options */
} MountMode;
70
71 typedef struct MountEntry {
72 const char *path_const; /* Memory allocated on stack or static */
73 MountMode mode:5;
74 bool ignore:1; /* Ignore if path does not exist? */
75 bool has_prefix:1; /* Already is prefixed by the root dir? */
76 bool read_only:1; /* Shall this mount point be read-only? */
77 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
78 const char *source_const; /* The source path, for bind mounts */
79 char *source_malloc;
80 const char *options_const;/* Mount options for tmpfs */
81 char *options_malloc;
82 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
83 } MountEntry;
84
85 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
86 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
87 static const MountEntry apivfs_table[] = {
88 { "/proc", PROCFS, false },
89 { "/dev", BIND_DEV, false },
90 { "/sys", SYSFS, false },
91 };
92
93 /* ProtectKernelTunables= option and the related filesystem APIs */
94 static const MountEntry protect_kernel_tunables_table[] = {
95 { "/proc/sys", READONLY, false },
96 { "/proc/sysrq-trigger", READONLY, true },
97 { "/proc/latency_stats", READONLY, true },
98 { "/proc/mtrr", READONLY, true },
99 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
100 { "/proc/acpi", READONLY, true },
101 { "/proc/timer_stats", READONLY, true },
102 { "/proc/asound", READONLY, true },
103 { "/proc/bus", READONLY, true },
104 { "/proc/fs", READONLY, true },
105 { "/proc/irq", READONLY, true },
106 { "/sys", READONLY, false },
107 { "/sys/kernel/debug", READONLY, true },
108 { "/sys/kernel/tracing", READONLY, true },
109 { "/sys/fs/bpf", READONLY, true },
110 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
111 { "/sys/fs/selinux", READWRITE, true },
112 };
113
114 /* ProtectKernelModules= option */
115 static const MountEntry protect_kernel_modules_table[] = {
116 #if HAVE_SPLIT_USR
117 { "/lib/modules", INACCESSIBLE, true },
118 #endif
119 { "/usr/lib/modules", INACCESSIBLE, true },
120 };
121
122 /*
123 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
124 * system should be protected by ProtectSystem=
125 */
126 static const MountEntry protect_home_read_only_table[] = {
127 { "/home", READONLY, true },
128 { "/run/user", READONLY, true },
129 { "/root", READONLY, true },
130 };
131
132 /* ProtectHome=tmpfs table */
133 static const MountEntry protect_home_tmpfs_table[] = {
134 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
135 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
136 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
137 };
138
139 /* ProtectHome=yes table */
140 static const MountEntry protect_home_yes_table[] = {
141 { "/home", INACCESSIBLE, true },
142 { "/run/user", INACCESSIBLE, true },
143 { "/root", INACCESSIBLE, true },
144 };
145
146 /* ProtectSystem=yes table */
147 static const MountEntry protect_system_yes_table[] = {
148 { "/usr", READONLY, false },
149 { "/boot", READONLY, true },
150 { "/efi", READONLY, true },
151 };
152
153 /* ProtectSystem=full includes ProtectSystem=yes */
154 static const MountEntry protect_system_full_table[] = {
155 { "/usr", READONLY, false },
156 { "/boot", READONLY, true },
157 { "/efi", READONLY, true },
158 { "/etc", READONLY, false },
159 };
160
161 /*
162 * ProtectSystem=strict table. In this strict mode, we mount everything
163 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
164 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
165 * protect those, and these options should be fully orthogonal.
166 * (And of course /home and friends are also left writable, as ProtectHome=
167 * shall manage those, orthogonally).
168 */
169 static const MountEntry protect_system_strict_table[] = {
170 { "/", READONLY, false },
171 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
172 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
173 { "/dev", READWRITE, false }, /* PrivateDevices= */
174 { "/home", READWRITE, true }, /* ProtectHome= */
175 { "/run/user", READWRITE, true }, /* ProtectHome= */
176 { "/root", READWRITE, true }, /* ProtectHome= */
177 };
178
179 static const char *mount_entry_path(const MountEntry *p) {
180 assert(p);
181
182 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
183 * otherwise the stack/static ->path field is returned. */
184
185 return p->path_malloc ?: p->path_const;
186 }
187
188 static bool mount_entry_read_only(const MountEntry *p) {
189 assert(p);
190
191 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
192 }
193
194 static const char *mount_entry_source(const MountEntry *p) {
195 assert(p);
196
197 return p->source_malloc ?: p->source_const;
198 }
199
200 static const char *mount_entry_options(const MountEntry *p) {
201 assert(p);
202
203 return p->options_malloc ?: p->options_const;
204 }
205
206 static void mount_entry_done(MountEntry *p) {
207 assert(p);
208
209 p->path_malloc = mfree(p->path_malloc);
210 p->source_malloc = mfree(p->source_malloc);
211 p->options_malloc = mfree(p->options_malloc);
212 }
213
214 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
215 char **i;
216
217 assert(p);
218
219 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
220
221 STRV_FOREACH(i, strv) {
222 bool ignore = false, needs_prefix = false;
223 const char *e = *i;
224
225 /* Look for any prefixes */
226 if (startswith(e, "-")) {
227 e++;
228 ignore = true;
229 }
230 if (startswith(e, "+")) {
231 e++;
232 needs_prefix = true;
233 }
234
235 if (!path_is_absolute(e))
236 return -EINVAL;
237
238 *((*p)++) = (MountEntry) {
239 .path_const = e,
240 .mode = mode,
241 .ignore = ignore,
242 .has_prefix = !needs_prefix && !forcibly_require_prefix,
243 };
244 }
245
246 return 0;
247 }
248
249 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
250 char **i;
251
252 assert(p);
253
254 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
255 * "/private/" boundary directories for DynamicUser=1. */
256
257 STRV_FOREACH(i, strv) {
258
259 *((*p)++) = (MountEntry) {
260 .path_const = *i,
261 .mode = EMPTY_DIR,
262 .ignore = false,
263 .has_prefix = false,
264 .read_only = true,
265 .options_const = "mode=755",
266 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
267 };
268 }
269
270 return 0;
271 }
272
273 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
274 unsigned i;
275
276 assert(p);
277
278 for (i = 0; i < n; i++) {
279 const BindMount *b = binds + i;
280
281 *((*p)++) = (MountEntry) {
282 .path_const = b->destination,
283 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
284 .read_only = b->read_only,
285 .source_const = b->source,
286 .ignore = b->ignore_enoent,
287 };
288 }
289
290 return 0;
291 }
292
293 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
294 unsigned i;
295 int r;
296
297 assert(p);
298
299 for (i = 0; i < n; i++) {
300 const TemporaryFileSystem *t = tmpfs + i;
301 _cleanup_free_ char *o = NULL, *str = NULL;
302 unsigned long flags = MS_NODEV|MS_STRICTATIME;
303 bool ro = false;
304
305 if (!path_is_absolute(t->path))
306 return -EINVAL;
307
308 if (!isempty(t->options)) {
309 str = strjoin("mode=0755,", t->options);
310 if (!str)
311 return -ENOMEM;
312
313 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
314 if (r < 0)
315 return r;
316
317 ro = !!(flags & MS_RDONLY);
318 if (ro)
319 flags ^= MS_RDONLY;
320 }
321
322 *((*p)++) = (MountEntry) {
323 .path_const = t->path,
324 .mode = TMPFS,
325 .read_only = ro,
326 .options_malloc = o,
327 .flags = flags,
328 };
329
330 o = NULL;
331 }
332
333 return 0;
334 }
335
336 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
337 unsigned i;
338
339 assert(p);
340 assert(mounts);
341
342 /* Adds a list of static pre-defined entries */
343
344 for (i = 0; i < n; i++)
345 *((*p)++) = (MountEntry) {
346 .path_const = mount_entry_path(mounts+i),
347 .mode = mounts[i].mode,
348 .ignore = mounts[i].ignore || ignore_protect,
349 };
350
351 return 0;
352 }
353
354 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
355 assert(p);
356
357 switch (protect_home) {
358
359 case PROTECT_HOME_NO:
360 return 0;
361
362 case PROTECT_HOME_READ_ONLY:
363 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
364
365 case PROTECT_HOME_TMPFS:
366 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
367
368 case PROTECT_HOME_YES:
369 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
370
371 default:
372 assert_not_reached("Unexpected ProtectHome= value");
373 }
374 }
375
376 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
377 assert(p);
378
379 switch (protect_system) {
380
381 case PROTECT_SYSTEM_NO:
382 return 0;
383
384 case PROTECT_SYSTEM_STRICT:
385 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
386
387 case PROTECT_SYSTEM_YES:
388 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
389
390 case PROTECT_SYSTEM_FULL:
391 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
392
393 default:
394 assert_not_reached("Unexpected ProtectSystem= value");
395 }
396 }
397
398 static int mount_path_compare(const void *a, const void *b) {
399 const MountEntry *p = a, *q = b;
400 int d;
401
402 /* If the paths are not equal, then order prefixes first */
403 d = path_compare(mount_entry_path(p), mount_entry_path(q));
404 if (d != 0)
405 return d;
406
407 /* If the paths are equal, check the mode */
408 if (p->mode < q->mode)
409 return -1;
410
411 if (p->mode > q->mode)
412 return 1;
413
414 return 0;
415 }
416
417 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
418 unsigned i;
419
420 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
421 * that. */
422
423 if (!root_directory)
424 return 0;
425
426 for (i = 0; i < n; i++) {
427 char *s;
428
429 if (m[i].has_prefix)
430 continue;
431
432 s = prefix_root(root_directory, mount_entry_path(m+i));
433 if (!s)
434 return -ENOMEM;
435
436 free_and_replace(m[i].path_malloc, s);
437 m[i].has_prefix = true;
438 }
439
440 return 0;
441 }
442
443 static void drop_duplicates(MountEntry *m, unsigned *n) {
444 MountEntry *f, *t, *previous;
445
446 assert(m);
447 assert(n);
448
449 /* Drops duplicate entries. Expects that the array is properly ordered already. */
450
451 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
452
453 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
454 * above. */
455 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
456 log_debug("%s is duplicate.", mount_entry_path(f));
457 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
458 mount_entry_done(f);
459 continue;
460 }
461
462 *t = *f;
463 previous = t;
464 t++;
465 }
466
467 *n = t - m;
468 }
469
470 static void drop_inaccessible(MountEntry *m, unsigned *n) {
471 MountEntry *f, *t;
472 const char *clear = NULL;
473
474 assert(m);
475 assert(n);
476
477 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
478 * ordered already. */
479
480 for (f = m, t = m; f < m + *n; f++) {
481
482 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
483 * it, as inaccessible paths really should drop the entire subtree. */
484 if (clear && path_startswith(mount_entry_path(f), clear)) {
485 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
486 mount_entry_done(f);
487 continue;
488 }
489
490 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
491
492 *t = *f;
493 t++;
494 }
495
496 *n = t - m;
497 }
498
499 static void drop_nop(MountEntry *m, unsigned *n) {
500 MountEntry *f, *t;
501
502 assert(m);
503 assert(n);
504
505 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
506 * list is ordered by prefixes. */
507
508 for (f = m, t = m; f < m + *n; f++) {
509
510 /* Only suppress such subtrees for READONLY and READWRITE entries */
511 if (IN_SET(f->mode, READONLY, READWRITE)) {
512 MountEntry *p;
513 bool found = false;
514
515 /* Now let's find the first parent of the entry we are looking at. */
516 for (p = t-1; p >= m; p--) {
517 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
518 found = true;
519 break;
520 }
521 }
522
523 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
524 if (found && p->mode == f->mode) {
525 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
526 mount_entry_done(f);
527 continue;
528 }
529 }
530
531 *t = *f;
532 t++;
533 }
534
535 *n = t - m;
536 }
537
538 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
539 MountEntry *f, *t;
540
541 assert(m);
542 assert(n);
543
544 /* Nothing to do */
545 if (!root_directory)
546 return;
547
548 /* Drops all mounts that are outside of the root directory. */
549
550 for (f = m, t = m; f < m + *n; f++) {
551
552 if (!path_startswith(mount_entry_path(f), root_directory)) {
553 log_debug("%s is outside of root directory.", mount_entry_path(f));
554 mount_entry_done(f);
555 continue;
556 }
557
558 *t = *f;
559 t++;
560 }
561
562 *n = t - m;
563 }
564
/* Creates a copy of the device node 'd' at the same path below 'temporary_mount', with the same mode and
 * device number.
 *
 * Returns 1 if the node was created, 0 if 'd' does not exist or has a zero device number (nothing is created
 * then), -EINVAL if 'd' is neither a block nor a character device, and a negative errno-style error if
 * stat() or mknod() fail. */
static int clone_device_node(const char *d, const char *temporary_mount) {
        const char *target;
        struct stat st;
        int r;

        if (stat(d, &st) < 0)
                return errno == ENOENT ? 0 : -errno;

        if (!(S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)))
                return -EINVAL;

        if (st.st_rdev == 0)
                return 0;

        target = strjoina(temporary_mount, d);

        /* Set up the SELinux file creation context only for the duration of the mknod() call */
        mac_selinux_create_file_prepare(d, st.st_mode);
        r = mknod(target, st.st_mode, st.st_rdev);
        mac_selinux_create_file_clear();
        if (r < 0)
                return log_debug_errno(errno, "mknod failed for %s: %m", d);

        return 1;
}
593
594 static int mount_private_dev(MountEntry *m) {
595 static const char devnodes[] =
596 "/dev/null\0"
597 "/dev/zero\0"
598 "/dev/full\0"
599 "/dev/random\0"
600 "/dev/urandom\0"
601 "/dev/tty\0";
602
603 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
604 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
605 _cleanup_umask_ mode_t u;
606 int r;
607
608 assert(m);
609
610 u = umask(0000);
611
612 if (!mkdtemp(temporary_mount))
613 return -errno;
614
615 dev = strjoina(temporary_mount, "/dev");
616 (void) mkdir(dev, 0755);
617 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
618 r = -errno;
619 goto fail;
620 }
621
622 devpts = strjoina(temporary_mount, "/dev/pts");
623 (void) mkdir(devpts, 0755);
624 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
625 r = -errno;
626 goto fail;
627 }
628
629 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
630 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
631 * thus, in that case make a clone
632 *
633 * in nspawn and other containers it will be a symlink, in that case make it a symlink
634 */
635 r = is_symlink("/dev/ptmx");
636 if (r < 0)
637 goto fail;
638 if (r > 0) {
639 devptmx = strjoina(temporary_mount, "/dev/ptmx");
640 if (symlink("pts/ptmx", devptmx) < 0) {
641 r = -errno;
642 goto fail;
643 }
644 } else {
645 r = clone_device_node("/dev/ptmx", temporary_mount);
646 if (r < 0)
647 goto fail;
648 if (r == 0) {
649 r = -ENXIO;
650 goto fail;
651 }
652 }
653
654 devshm = strjoina(temporary_mount, "/dev/shm");
655 (void) mkdir(devshm, 0755);
656 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
657 if (r < 0) {
658 r = -errno;
659 goto fail;
660 }
661
662 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
663 (void) mkdir(devmqueue, 0755);
664 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
665
666 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
667 (void) mkdir(devhugepages, 0755);
668 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
669
670 devlog = strjoina(temporary_mount, "/dev/log");
671 (void) symlink("/run/systemd/journal/dev-log", devlog);
672
673 NULSTR_FOREACH(d, devnodes) {
674 r = clone_device_node(d, temporary_mount);
675 if (r < 0)
676 goto fail;
677 }
678
679 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
680
681 /* Create the /dev directory if missing. It is more likely to be
682 * missing when the service is started with RootDirectory. This is
683 * consistent with mount units creating the mount points when missing.
684 */
685 (void) mkdir_p_label(mount_entry_path(m), 0755);
686
687 /* Unmount everything in old /dev */
688 umount_recursive(mount_entry_path(m), 0);
689 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
690 r = -errno;
691 goto fail;
692 }
693
694 rmdir(dev);
695 rmdir(temporary_mount);
696
697 return 0;
698
699 fail:
700 if (devpts)
701 umount(devpts);
702
703 if (devshm)
704 umount(devshm);
705
706 if (devhugepages)
707 umount(devhugepages);
708
709 if (devmqueue)
710 umount(devmqueue);
711
712 umount(dev);
713 rmdir(dev);
714 rmdir(temporary_mount);
715
716 return r;
717 }
718
719 static int mount_bind_dev(const MountEntry *m) {
720 int r;
721
722 assert(m);
723
724 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
725 * /dev. This is only used when RootDirectory= is set. */
726
727 (void) mkdir_p_label(mount_entry_path(m), 0755);
728
729 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
730 if (r < 0)
731 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
732 if (r > 0) /* make this a NOP if /dev is already a mount point */
733 return 0;
734
735 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
736 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
737
738 return 1;
739 }
740
741 static int mount_sysfs(const MountEntry *m) {
742 int r;
743
744 assert(m);
745
746 (void) mkdir_p_label(mount_entry_path(m), 0755);
747
748 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
749 if (r < 0)
750 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
751 if (r > 0) /* make this a NOP if /sys is already a mount point */
752 return 0;
753
754 /* Bind mount the host's version so that we get all child mounts of it, too. */
755 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
756 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
757
758 return 1;
759 }
760
761 static int mount_procfs(const MountEntry *m) {
762 int r;
763
764 assert(m);
765
766 (void) mkdir_p_label(mount_entry_path(m), 0755);
767
768 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
769 if (r < 0)
770 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
771 if (r > 0) /* make this a NOP if /proc is already a mount point */
772 return 0;
773
774 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
775 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
776 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
777
778 return 1;
779 }
780
781 static int mount_tmpfs(const MountEntry *m) {
782 assert(m);
783
784 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
785
786 (void) mkdir_p_label(mount_entry_path(m), 0755);
787 (void) umount_recursive(mount_entry_path(m), 0);
788
789 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
790 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
791
792 return 1;
793 }
794
795 static int mount_entry_chase(
796 const char *root_directory,
797 const MountEntry *m,
798 const char *path,
799 bool chase_nonexistent,
800 char **location) {
801
802 char *chased;
803 int r;
804
805 assert(m);
806
807 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
808 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
809 * that applies). The result is stored in "location". */
810
811 r = chase_symlinks(path, root_directory, chase_nonexistent ? CHASE_NONEXISTENT : 0, &chased);
812 if (r == -ENOENT && m->ignore) {
813 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
814 return 0;
815 }
816 if (r < 0)
817 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
818
819 log_debug("Followed symlinks %s → %s.", path, chased);
820
821 free(*location);
822 *location = chased;
823
824 return 1;
825 }
826
827 static int apply_mount(
828 const char *root_directory,
829 MountEntry *m) {
830
831 bool rbind = true, make = false;
832 const char *what;
833 int r;
834
835 assert(m);
836
837 r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
838 if (r <= 0)
839 return r;
840
841 log_debug("Applying namespace mount on %s", mount_entry_path(m));
842
843 switch (m->mode) {
844
845 case INACCESSIBLE: {
846 struct stat target;
847
848 /* First, get rid of everything that is below if there
849 * is anything... Then, overmount it with an
850 * inaccessible path. */
851 (void) umount_recursive(mount_entry_path(m), 0);
852
853 if (lstat(mount_entry_path(m), &target) < 0)
854 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
855
856 what = mode_to_inaccessible_node(target.st_mode);
857 if (!what) {
858 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
859 return -ELOOP;
860 }
861 break;
862 }
863
864 case READONLY:
865 case READWRITE:
866 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
867 if (r < 0)
868 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
869 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
870 return 0;
871 /* This isn't a mount point yet, let's make it one. */
872 what = mount_entry_path(m);
873 break;
874
875 case BIND_MOUNT:
876 rbind = false;
877
878 _fallthrough_;
879 case BIND_MOUNT_RECURSIVE:
880 /* Also chase the source mount */
881
882 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
883 if (r <= 0)
884 return r;
885
886 what = mount_entry_source(m);
887 make = true;
888 break;
889
890 case EMPTY_DIR:
891 case TMPFS:
892 return mount_tmpfs(m);
893
894 case PRIVATE_TMP:
895 what = mount_entry_source(m);
896 make = true;
897 break;
898
899 case PRIVATE_DEV:
900 return mount_private_dev(m);
901
902 case BIND_DEV:
903 return mount_bind_dev(m);
904
905 case SYSFS:
906 return mount_sysfs(m);
907
908 case PROCFS:
909 return mount_procfs(m);
910
911 default:
912 assert_not_reached("Unknown mode");
913 }
914
915 assert(what);
916
917 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
918 bool try_again = false;
919 r = -errno;
920
921 if (r == -ENOENT && make) {
922 struct stat st;
923
924 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
925
926 if (stat(what, &st) >= 0) {
927
928 (void) mkdir_parents(mount_entry_path(m), 0755);
929
930 if (S_ISDIR(st.st_mode))
931 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
932 else
933 try_again = touch(mount_entry_path(m)) >= 0;
934 }
935 }
936
937 if (try_again) {
938 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
939 r = -errno;
940 else
941 r = 0;
942 }
943
944 if (r < 0)
945 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
946 }
947
948 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
949 return 0;
950 }
951
952 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
953 int r = 0;
954
955 assert(m);
956 assert(proc_self_mountinfo);
957
958 if (mount_entry_read_only(m)) {
959 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
960 /* Make superblock readonly */
961 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
962 r = -errno;
963 } else
964 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
965 } else if (m->mode == PRIVATE_DEV) {
966 /* Superblock can be readonly but the submounts can't */
967 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
968 r = -errno;
969 } else
970 return 0;
971
972 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
973 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
974 * read-only mounts already applied. */
975
976 if (r == -ENOENT && m->ignore)
977 r = 0;
978
979 return r;
980 }
981
982 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
983 assert(ns_info);
984
985 /*
986 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
987 * since to protect the API VFS mounts, they need to be around in the
988 * first place... and RootDirectory= or RootImage= need to be set.
989 */
990
991 /* root_directory should point to a mount point */
992 return root_directory &&
993 (ns_info->mount_apivfs ||
994 ns_info->protect_control_groups ||
995 ns_info->protect_kernel_tunables);
996 }
997
998 static unsigned namespace_calculate_mounts(
999 const char* root_directory,
1000 const NamespaceInfo *ns_info,
1001 char** read_write_paths,
1002 char** read_only_paths,
1003 char** inaccessible_paths,
1004 char** empty_directories,
1005 unsigned n_bind_mounts,
1006 unsigned n_temporary_filesystems,
1007 const char* tmp_dir,
1008 const char* var_tmp_dir,
1009 ProtectHome protect_home,
1010 ProtectSystem protect_system) {
1011
1012 unsigned protect_home_cnt;
1013 unsigned protect_system_cnt =
1014 (protect_system == PROTECT_SYSTEM_STRICT ?
1015 ELEMENTSOF(protect_system_strict_table) :
1016 ((protect_system == PROTECT_SYSTEM_FULL) ?
1017 ELEMENTSOF(protect_system_full_table) :
1018 ((protect_system == PROTECT_SYSTEM_YES) ?
1019 ELEMENTSOF(protect_system_yes_table) : 0)));
1020
1021 protect_home_cnt =
1022 (protect_home == PROTECT_HOME_YES ?
1023 ELEMENTSOF(protect_home_yes_table) :
1024 ((protect_home == PROTECT_HOME_READ_ONLY) ?
1025 ELEMENTSOF(protect_home_read_only_table) :
1026 ((protect_home == PROTECT_HOME_TMPFS) ?
1027 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1028
1029 return !!tmp_dir + !!var_tmp_dir +
1030 strv_length(read_write_paths) +
1031 strv_length(read_only_paths) +
1032 strv_length(inaccessible_paths) +
1033 strv_length(empty_directories) +
1034 n_bind_mounts +
1035 n_temporary_filesystems +
1036 ns_info->private_dev +
1037 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1038 (ns_info->protect_control_groups ? 1 : 0) +
1039 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1040 protect_home_cnt + protect_system_cnt +
1041 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1042 }
1043
1044 int setup_namespace(
1045 const char* root_directory,
1046 const char* root_image,
1047 const NamespaceInfo *ns_info,
1048 char** read_write_paths,
1049 char** read_only_paths,
1050 char** inaccessible_paths,
1051 char** empty_directories,
1052 const BindMount *bind_mounts,
1053 unsigned n_bind_mounts,
1054 const TemporaryFileSystem *temporary_filesystems,
1055 unsigned n_temporary_filesystems,
1056 const char* tmp_dir,
1057 const char* var_tmp_dir,
1058 ProtectHome protect_home,
1059 ProtectSystem protect_system,
1060 unsigned long mount_flags,
1061 DissectImageFlags dissect_image_flags) {
1062
1063 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1064 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1065 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1066 _cleanup_free_ void *root_hash = NULL;
1067 MountEntry *m, *mounts = NULL;
1068 size_t root_hash_size = 0;
1069 bool make_slave = false;
1070 const char *root;
1071 unsigned n_mounts;
1072 bool require_prefix = false;
1073 int r = 0;
1074
1075 assert(ns_info);
1076
1077 if (mount_flags == 0)
1078 mount_flags = MS_SHARED;
1079
1080 if (root_image) {
1081 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1082
1083 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1084 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1085
1086 r = loop_device_make_by_path(root_image,
1087 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1088 &loop_device);
1089 if (r < 0)
1090 return r;
1091
1092 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1093 if (r < 0)
1094 return r;
1095
1096 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1097 if (r < 0)
1098 return r;
1099
1100 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1101 if (r < 0)
1102 return r;
1103 }
1104
1105 if (root_directory)
1106 root = root_directory;
1107 else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
1108
1109 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1110 * the same mount point for all images, which is safe, since they all live in their own namespaces
1111 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1112 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1113 * while we are applying them. */
1114
1115 root = "/run/systemd/unit-root";
1116 (void) mkdir_label(root, 0700);
1117 require_prefix = true;
1118 } else
1119 root = NULL;
1120
1121 n_mounts = namespace_calculate_mounts(
1122 root,
1123 ns_info,
1124 read_write_paths,
1125 read_only_paths,
1126 inaccessible_paths,
1127 empty_directories,
1128 n_bind_mounts,
1129 n_temporary_filesystems,
1130 tmp_dir, var_tmp_dir,
1131 protect_home, protect_system);
1132
1133 /* Set mount slave mode */
1134 if (root || n_mounts > 0)
1135 make_slave = true;
1136
1137 if (n_mounts > 0) {
1138 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1139 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1140 if (r < 0)
1141 goto finish;
1142
1143 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1144 if (r < 0)
1145 goto finish;
1146
1147 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1148 if (r < 0)
1149 goto finish;
1150
1151 r = append_empty_dir_mounts(&m, empty_directories);
1152 if (r < 0)
1153 goto finish;
1154
1155 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1156 if (r < 0)
1157 goto finish;
1158
1159 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1160 if (r < 0)
1161 goto finish;
1162
1163 if (tmp_dir) {
1164 *(m++) = (MountEntry) {
1165 .path_const = "/tmp",
1166 .mode = PRIVATE_TMP,
1167 .source_const = tmp_dir,
1168 };
1169 }
1170
1171 if (var_tmp_dir) {
1172 *(m++) = (MountEntry) {
1173 .path_const = "/var/tmp",
1174 .mode = PRIVATE_TMP,
1175 .source_const = var_tmp_dir,
1176 };
1177 }
1178
1179 if (ns_info->private_dev) {
1180 *(m++) = (MountEntry) {
1181 .path_const = "/dev",
1182 .mode = PRIVATE_DEV,
1183 };
1184 }
1185
1186 if (ns_info->protect_kernel_tunables) {
1187 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1188 if (r < 0)
1189 goto finish;
1190 }
1191
1192 if (ns_info->protect_kernel_modules) {
1193 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1194 if (r < 0)
1195 goto finish;
1196 }
1197
1198 if (ns_info->protect_control_groups) {
1199 *(m++) = (MountEntry) {
1200 .path_const = "/sys/fs/cgroup",
1201 .mode = READONLY,
1202 };
1203 }
1204
1205 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1206 if (r < 0)
1207 goto finish;
1208
1209 r = append_protect_system(&m, protect_system, false);
1210 if (r < 0)
1211 goto finish;
1212
1213 if (namespace_info_mount_apivfs(root, ns_info)) {
1214 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1215 if (r < 0)
1216 goto finish;
1217 }
1218
1219 assert(mounts + n_mounts == m);
1220
1221 /* Prepend the root directory where that's necessary */
1222 r = prefix_where_needed(mounts, n_mounts, root);
1223 if (r < 0)
1224 goto finish;
1225
1226 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
1227
1228 drop_duplicates(mounts, &n_mounts);
1229 drop_outside_root(root, mounts, &n_mounts);
1230 drop_inaccessible(mounts, &n_mounts);
1231 drop_nop(mounts, &n_mounts);
1232 }
1233
1234 if (unshare(CLONE_NEWNS) < 0) {
1235 r = -errno;
1236 goto finish;
1237 }
1238
1239 if (make_slave) {
1240 /* Remount / as SLAVE so that nothing now mounted in the namespace
1241 shows up in the parent */
1242 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1243 r = -errno;
1244 goto finish;
1245 }
1246 }
1247
1248 if (root_image) {
1249 /* A root image is specified, mount it to the right place */
1250 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1251 if (r < 0)
1252 goto finish;
1253
1254 if (decrypted_image) {
1255 r = decrypted_image_relinquish(decrypted_image);
1256 if (r < 0)
1257 goto finish;
1258 }
1259
1260 loop_device_relinquish(loop_device);
1261
1262 } else if (root_directory) {
1263
1264 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1265 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1266 if (r < 0)
1267 goto finish;
1268 if (r == 0) {
1269 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1270 r = -errno;
1271 goto finish;
1272 }
1273 }
1274
1275 } else if (root) {
1276
1277 /* Let's mount the main root directory to the root directory to use */
1278 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1279 r = -errno;
1280 goto finish;
1281 }
1282 }
1283
1284 /* Try to set up the new root directory before mounting anything else there. */
1285 if (root_image || root_directory)
1286 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1287
1288 if (n_mounts > 0) {
1289 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1290 char **blacklist;
1291 unsigned j;
1292
1293 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1294 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1295 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1296 if (!proc_self_mountinfo) {
1297 r = -errno;
1298 goto finish;
1299 }
1300
1301 /* First round, add in all special mounts we need */
1302 for (m = mounts; m < mounts + n_mounts; ++m) {
1303 r = apply_mount(root, m);
1304 if (r < 0)
1305 goto finish;
1306 }
1307
1308 /* Create a blacklist we can pass to bind_mount_recursive() */
1309 blacklist = newa(char*, n_mounts+1);
1310 for (j = 0; j < n_mounts; j++)
1311 blacklist[j] = (char*) mount_entry_path(mounts+j);
1312 blacklist[j] = NULL;
1313
1314 /* Second round, flip the ro bits if necessary. */
1315 for (m = mounts; m < mounts + n_mounts; ++m) {
1316 r = make_read_only(m, blacklist, proc_self_mountinfo);
1317 if (r < 0)
1318 goto finish;
1319 }
1320 }
1321
1322 if (root) {
1323 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1324 r = mount_move_root(root);
1325 if (r < 0)
1326 goto finish;
1327 }
1328
1329 /* Remount / as the desired mode. Note that this will not
1330 * reestablish propagation from our side to the host, since
1331 * what's disconnected is disconnected. */
1332 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1333 r = -errno;
1334 goto finish;
1335 }
1336
1337 r = 0;
1338
1339 finish:
1340 for (m = mounts; m < mounts + n_mounts; m++)
1341 mount_entry_done(m);
1342
1343 return r;
1344 }
1345
1346 void bind_mount_free_many(BindMount *b, unsigned n) {
1347 unsigned i;
1348
1349 assert(b || n == 0);
1350
1351 for (i = 0; i < n; i++) {
1352 free(b[i].source);
1353 free(b[i].destination);
1354 }
1355
1356 free(b);
1357 }
1358
1359 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1360 _cleanup_free_ char *s = NULL, *d = NULL;
1361 BindMount *c;
1362
1363 assert(b);
1364 assert(n);
1365 assert(item);
1366
1367 s = strdup(item->source);
1368 if (!s)
1369 return -ENOMEM;
1370
1371 d = strdup(item->destination);
1372 if (!d)
1373 return -ENOMEM;
1374
1375 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1376 if (!c)
1377 return -ENOMEM;
1378
1379 *b = c;
1380
1381 c[(*n) ++] = (BindMount) {
1382 .source = s,
1383 .destination = d,
1384 .read_only = item->read_only,
1385 .recursive = item->recursive,
1386 .ignore_enoent = item->ignore_enoent,
1387 };
1388
1389 s = d = NULL;
1390 return 0;
1391 }
1392
1393 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1394 unsigned i;
1395
1396 assert(t || n == 0);
1397
1398 for (i = 0; i < n; i++) {
1399 free(t[i].path);
1400 free(t[i].options);
1401 }
1402
1403 free(t);
1404 }
1405
1406 int temporary_filesystem_add(
1407 TemporaryFileSystem **t,
1408 unsigned *n,
1409 const char *path,
1410 const char *options) {
1411
1412 _cleanup_free_ char *p = NULL, *o = NULL;
1413 TemporaryFileSystem *c;
1414
1415 assert(t);
1416 assert(n);
1417 assert(path);
1418
1419 p = strdup(path);
1420 if (!p)
1421 return -ENOMEM;
1422
1423 if (!isempty(options)) {
1424 o = strdup(options);
1425 if (!o)
1426 return -ENOMEM;
1427 }
1428
1429 c = realloc_multiply(*t, sizeof(TemporaryFileSystem), *n + 1);
1430 if (!c)
1431 return -ENOMEM;
1432
1433 *t = c;
1434
1435 c[(*n) ++] = (TemporaryFileSystem) {
1436 .path = p,
1437 .options = o,
1438 };
1439
1440 p = o = NULL;
1441 return 0;
1442 }
1443
1444 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1445 _cleanup_free_ char *x = NULL;
1446 char bid[SD_ID128_STRING_MAX];
1447 sd_id128_t boot_id;
1448 int r;
1449
1450 assert(id);
1451 assert(prefix);
1452 assert(path);
1453
1454 /* We include the boot id in the directory so that after a
1455 * reboot we can easily identify obsolete directories. */
1456
1457 r = sd_id128_get_boot(&boot_id);
1458 if (r < 0)
1459 return r;
1460
1461 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1462 if (!x)
1463 return -ENOMEM;
1464
1465 RUN_WITH_UMASK(0077)
1466 if (!mkdtemp(x))
1467 return -errno;
1468
1469 RUN_WITH_UMASK(0000) {
1470 char *y;
1471
1472 y = strjoina(x, "/tmp");
1473
1474 if (mkdir(y, 0777 | S_ISVTX) < 0)
1475 return -errno;
1476 }
1477
1478 *path = x;
1479 x = NULL;
1480
1481 return 0;
1482 }
1483
/* Creates the private temporary directories for 'id', one below /tmp and one below /var/tmp. On success 0
 * is returned and *tmp_dir and *var_tmp_dir receive the newly allocated paths. If the second directory
 * cannot be created, the first one is removed again and the error is returned; the output parameters are
 * not touched in any failure case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *tmp_path, *var_tmp_path, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &tmp_path);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &var_tmp_path);
        if (r >= 0) {
                *tmp_dir = tmp_path;
                *var_tmp_dir = var_tmp_path;
                return 0;
        }

        /* Roll back the first directory: remove its inner "tmp" subdirectory first, then the directory
         * itself. */
        inner = strjoina(tmp_path, "/tmp");
        rmdir(inner);
        rmdir(tmp_path);

        free(tmp_path);
        return r;
}
1513
1514 int setup_netns(int netns_storage_socket[2]) {
1515 _cleanup_close_ int netns = -1;
1516 int r, q;
1517
1518 assert(netns_storage_socket);
1519 assert(netns_storage_socket[0] >= 0);
1520 assert(netns_storage_socket[1] >= 0);
1521
1522 /* We use the passed socketpair as a storage buffer for our
1523 * namespace reference fd. Whatever process runs this first
1524 * shall create a new namespace, all others should just join
1525 * it. To serialize that we use a file lock on the socket
1526 * pair.
1527 *
1528 * It's a bit crazy, but hey, works great! */
1529
1530 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1531 return -errno;
1532
1533 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1534 if (netns == -EAGAIN) {
1535 /* Nothing stored yet, so let's create a new namespace */
1536
1537 if (unshare(CLONE_NEWNET) < 0) {
1538 r = -errno;
1539 goto fail;
1540 }
1541
1542 loopback_setup();
1543
1544 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1545 if (netns < 0) {
1546 r = -errno;
1547 goto fail;
1548 }
1549
1550 r = 1;
1551
1552 } else if (netns < 0) {
1553 r = netns;
1554 goto fail;
1555
1556 } else {
1557 /* Yay, found something, so let's join the namespace */
1558 if (setns(netns, CLONE_NEWNET) < 0) {
1559 r = -errno;
1560 goto fail;
1561 }
1562
1563 r = 0;
1564 }
1565
1566 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1567 if (q < 0) {
1568 r = q;
1569 goto fail;
1570 }
1571
1572 fail:
1573 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1574 return r;
1575 }
1576
1577 bool ns_type_supported(NamespaceType type) {
1578 const char *t, *ns_proc;
1579
1580 t = namespace_type_to_string(type);
1581 if (!t) /* Don't know how to translate this? Then it's not supported */
1582 return false;
1583
1584 ns_proc = strjoina("/proc/self/ns/", t);
1585 return access(ns_proc, F_OK) == 0;
1586 }
1587
1588 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1589 [PROTECT_HOME_NO] = "no",
1590 [PROTECT_HOME_YES] = "yes",
1591 [PROTECT_HOME_READ_ONLY] = "read-only",
1592 [PROTECT_HOME_TMPFS] = "tmpfs",
1593 };
1594
1595 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1596
1597 ProtectHome parse_protect_home_or_bool(const char *s) {
1598 int r;
1599
1600 r = parse_boolean(s);
1601 if (r > 0)
1602 return PROTECT_HOME_YES;
1603 if (r == 0)
1604 return PROTECT_HOME_NO;
1605
1606 return protect_home_from_string(s);
1607 }
1608
1609 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1610 [PROTECT_SYSTEM_NO] = "no",
1611 [PROTECT_SYSTEM_YES] = "yes",
1612 [PROTECT_SYSTEM_FULL] = "full",
1613 [PROTECT_SYSTEM_STRICT] = "strict",
1614 };
1615
1616 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1617
1618 ProtectSystem parse_protect_system_or_bool(const char *s) {
1619 int r;
1620
1621 r = parse_boolean(s);
1622 if (r > 0)
1623 return PROTECT_SYSTEM_YES;
1624 if (r == 0)
1625 return PROTECT_SYSTEM_NO;
1626
1627 return protect_system_from_string(s);
1628 }
1629
1630 static const char* const namespace_type_table[] = {
1631 [NAMESPACE_MOUNT] = "mnt",
1632 [NAMESPACE_CGROUP] = "cgroup",
1633 [NAMESPACE_UTS] = "uts",
1634 [NAMESPACE_IPC] = "ipc",
1635 [NAMESPACE_USER] = "user",
1636 [NAMESPACE_PID] = "pid",
1637 [NAMESPACE_NET] = "net",
1638 };
1639
1640 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);