]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core: un-break PrivateDevices= by allowing it to mknod /dev/ptmx
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
15ae422b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
15ae422b 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <errno.h>
07630cea 22#include <sched.h>
15ae422b 23#include <stdio.h>
07630cea
LP
24#include <string.h>
25#include <sys/mount.h>
15ae422b 26#include <sys/stat.h>
07630cea 27#include <unistd.h>
25e870b5 28#include <linux/fs.h>
15ae422b 29
b5efdb8a 30#include "alloc-util.h"
10404d52 31#include "base-filesystem.h"
7f112f50 32#include "dev-setup.h"
3ffd4af2 33#include "fd-util.h"
d944dc95 34#include "fs-util.h"
e908468b 35#include "label.h"
915e6d16 36#include "loop-util.h"
07630cea
LP
37#include "loopback-setup.h"
38#include "missing.h"
39#include "mkdir.h"
4349cd7c 40#include "mount-util.h"
3ffd4af2 41#include "namespace.h"
07630cea 42#include "path-util.h"
d7b8eec7 43#include "selinux-util.h"
2583fbea 44#include "socket-util.h"
8b43440b 45#include "string-table.h"
07630cea
LP
46#include "string-util.h"
47#include "strv.h"
affb60b1 48#include "umask-util.h"
ee104e11 49#include "user-util.h"
07630cea 50#include "util.h"
15ae422b 51
/* Mount flags used for the tmpfs backing a private /dev, and again when that mount is remounted read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOEXEC|MS_STRICTATIME|MS_NOSUID)
53
/* The kind of mount a MountEntry describes.
 *
 * This is ordered by priority! When two entries refer to the same path, mount_path_compare() sorts the one with
 * the numerically smaller mode first and drop_duplicates() keeps only that first one. Do not reorder. */
typedef enum MountMode {
        INACCESSIBLE,         /* Overmount the path with an inaccessible node of matching file type */
        BIND_MOUNT,           /* Non-recursive bind mount from a source path */
        BIND_MOUNT_RECURSIVE, /* Recursive bind mount from a source path */
        PRIVATE_TMP,          /* Bind mount the private tmp directory over the path */
        PRIVATE_VAR_TMP,      /* Bind mount the private var-tmp directory over the path */
        PRIVATE_DEV,          /* Replace the path with a freshly populated private /dev */
        BIND_DEV,             /* Bind mount the host's /dev, unless the path already is a mount point */
        EMPTY_DIR,            /* Overmount the path with an empty tmpfs */
        SYSFS,                /* Bind mount the host's /sys, unless the path already is a mount point */
        PROCFS,               /* Mount a new procfs instance, unless the path already is a mount point */
        READONLY,             /* Turn the path into a mount point and mark it read-only */
        READWRITE,            /* Turn the path into a mount point and leave it writable */
} MountMode;
15ae422b 69
34de407a 70typedef struct MountEntry {
5327c910 71 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 72 MountMode mode:5;
5327c910
LP
73 bool ignore:1; /* Ignore if path does not exist? */
74 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 75 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 76 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
77 const char *source_const; /* The source path, for bind mounts */
78 char *source_malloc;
34de407a 79} MountEntry;
15ae422b 80
5d997827
LP
81/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
82 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
83static const MountEntry apivfs_table[] = {
84 { "/proc", PROCFS, false },
85 { "/dev", BIND_DEV, false },
86 { "/sys", SYSFS, false },
87};
f471b2af 88
11a30cec 89/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 90static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
91 { "/proc/sys", READONLY, false },
92 { "/proc/sysrq-trigger", READONLY, true },
93 { "/proc/latency_stats", READONLY, true },
94 { "/proc/mtrr", READONLY, true },
aa70f38b 95 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
96 { "/proc/acpi", READONLY, true },
97 { "/proc/timer_stats", READONLY, true },
98 { "/proc/asound", READONLY, true },
99 { "/proc/bus", READONLY, true },
100 { "/proc/fs", READONLY, true },
101 { "/proc/irq", READONLY, true },
102 { "/sys", READONLY, false },
103 { "/sys/kernel/debug", READONLY, true },
104 { "/sys/kernel/tracing", READONLY, true },
105 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 106 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
107};
108
c575770b 109/* ProtectKernelModules= option */
34de407a 110static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 111#if HAVE_SPLIT_USR
c6232fb0 112 { "/lib/modules", INACCESSIBLE, true },
c575770b 113#endif
c6232fb0 114 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
115};
116
b6c432ca
DH
117/*
118 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
119 * system should be protected by ProtectSystem=
120 */
34de407a 121static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
122 { "/home", READONLY, true },
123 { "/run/user", READONLY, true },
124 { "/root", READONLY, true },
b6c432ca
DH
125};
126
127/* ProtectHome=yes table */
34de407a 128static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
129 { "/home", INACCESSIBLE, true },
130 { "/run/user", INACCESSIBLE, true },
131 { "/root", INACCESSIBLE, true },
b6c432ca
DH
132};
133
f471b2af 134/* ProtectSystem=yes table */
34de407a 135static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
136 { "/usr", READONLY, false },
137 { "/boot", READONLY, true },
138 { "/efi", READONLY, true },
f471b2af
DH
139};
140
141/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 142static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
143 { "/usr", READONLY, false },
144 { "/boot", READONLY, true },
145 { "/efi", READONLY, true },
146 { "/etc", READONLY, false },
f471b2af
DH
147};
148
149/*
150 * ProtectSystem=strict table. In this strict mode, we mount everything
151 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
152 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
153 * protect those, and these options should be fully orthogonal.
154 * (And of course /home and friends are also left writable, as ProtectHome=
155 * shall manage those, orthogonally).
156 */
34de407a 157static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
158 { "/", READONLY, false },
159 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
160 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
161 { "/dev", READWRITE, false }, /* PrivateDevices= */
162 { "/home", READWRITE, true }, /* ProtectHome= */
163 { "/run/user", READWRITE, true }, /* ProtectHome= */
164 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
165};
166
34de407a 167static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
168 assert(p);
169
5327c910
LP
170 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
171 * otherwise the stack/static ->path field is returned. */
f0a4feb0 172
5327c910 173 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
174}
175
34de407a 176static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
177 assert(p);
178
179 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
180}
181
d2d6c096
LP
182static const char *mount_entry_source(const MountEntry *p) {
183 assert(p);
184
185 return p->source_malloc ?: p->source_const;
186}
187
1eb7e08e
LP
188static void mount_entry_done(MountEntry *p) {
189 assert(p);
190
191 p->path_malloc = mfree(p->path_malloc);
192 p->source_malloc = mfree(p->source_malloc);
193}
194
d18aff04 195static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
196 char **i;
197
613b411c
LP
198 assert(p);
199
5327c910
LP
200 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
201
15ae422b 202 STRV_FOREACH(i, strv) {
5327c910
LP
203 bool ignore = false, needs_prefix = false;
204 const char *e = *i;
15ae422b 205
5327c910
LP
206 /* Look for any prefixes */
207 if (startswith(e, "-")) {
208 e++;
9c94d52e 209 ignore = true;
ea92ae33 210 }
5327c910
LP
211 if (startswith(e, "+")) {
212 e++;
213 needs_prefix = true;
214 }
ea92ae33 215
5327c910 216 if (!path_is_absolute(e))
15ae422b
LP
217 return -EINVAL;
218
34de407a 219 *((*p)++) = (MountEntry) {
5327c910
LP
220 .path_const = e,
221 .mode = mode,
222 .ignore = ignore,
d18aff04 223 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 224 };
15ae422b
LP
225 }
226
227 return 0;
228}
229
6c47cd7d
LP
230static int append_empty_dir_mounts(MountEntry **p, char **strv) {
231 char **i;
232
233 assert(p);
234
235 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
236 * "/private/" boundary directories for DynamicUser=1. */
237
238 STRV_FOREACH(i, strv) {
239
240 *((*p)++) = (MountEntry) {
241 .path_const = *i,
242 .mode = EMPTY_DIR,
243 .ignore = false,
244 .has_prefix = false,
245 .read_only = true,
246 };
247 }
248
249 return 0;
250}
251
d2d6c096
LP
252static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
253 unsigned i;
254
255 assert(p);
256
257 for (i = 0; i < n; i++) {
258 const BindMount *b = binds + i;
259
260 *((*p)++) = (MountEntry) {
261 .path_const = b->destination,
262 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
263 .read_only = b->read_only,
264 .source_const = b->source,
265 };
266 }
267
268 return 0;
269}
270
34de407a 271static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 272 unsigned i;
11a30cec
DH
273
274 assert(p);
f471b2af 275 assert(mounts);
11a30cec 276
5327c910 277 /* Adds a list of static pre-defined entries */
f471b2af 278
5327c910 279 for (i = 0; i < n; i++)
34de407a
LP
280 *((*p)++) = (MountEntry) {
281 .path_const = mount_entry_path(mounts+i),
5327c910
LP
282 .mode = mounts[i].mode,
283 .ignore = mounts[i].ignore || ignore_protect,
284 };
f471b2af
DH
285
286 return 0;
287}
288
34de407a 289static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
290 assert(p);
291
5327c910 292 switch (protect_home) {
b6c432ca 293
5327c910 294 case PROTECT_HOME_NO:
b6c432ca
DH
295 return 0;
296
b6c432ca 297 case PROTECT_HOME_READ_ONLY:
5327c910
LP
298 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
299
b6c432ca 300 case PROTECT_HOME_YES:
5327c910
LP
301 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
302
b6c432ca 303 default:
5327c910 304 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 305 }
b6c432ca
DH
306}
307
34de407a 308static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
309 assert(p);
310
5327c910
LP
311 switch (protect_system) {
312
313 case PROTECT_SYSTEM_NO:
f471b2af
DH
314 return 0;
315
f471b2af 316 case PROTECT_SYSTEM_STRICT:
5327c910
LP
317 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
318
f471b2af 319 case PROTECT_SYSTEM_YES:
5327c910
LP
320 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
321
f471b2af 322 case PROTECT_SYSTEM_FULL:
5327c910
LP
323 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
324
f471b2af 325 default:
5327c910 326 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 327 }
11a30cec
DH
328}
329
c17ec25e 330static int mount_path_compare(const void *a, const void *b) {
34de407a 331 const MountEntry *p = a, *q = b;
a0827e2b 332 int d;
15ae422b 333
6ee1a919 334 /* If the paths are not equal, then order prefixes first */
34de407a 335 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
336 if (d != 0)
337 return d;
15ae422b 338
6ee1a919
LP
339 /* If the paths are equal, check the mode */
340 if (p->mode < q->mode)
341 return -1;
15ae422b 342
6ee1a919
LP
343 if (p->mode > q->mode)
344 return 1;
15ae422b 345
6ee1a919 346 return 0;
15ae422b
LP
347}
348
34de407a 349static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
350 unsigned i;
351
352 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
353 * that. */
354
355 if (!root_directory)
356 return 0;
357
358 for (i = 0; i < n; i++) {
359 char *s;
360
361 if (m[i].has_prefix)
362 continue;
363
34de407a 364 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
365 if (!s)
366 return -ENOMEM;
367
368 free(m[i].path_malloc);
369 m[i].path_malloc = s;
370
371 m[i].has_prefix = true;
372 }
373
374 return 0;
375}
376
34de407a
LP
377static void drop_duplicates(MountEntry *m, unsigned *n) {
378 MountEntry *f, *t, *previous;
15ae422b 379
c17ec25e 380 assert(m);
15ae422b 381 assert(n);
15ae422b 382
fe3c2583
LP
383 /* Drops duplicate entries. Expects that the array is properly ordered already. */
384
1d54cd5d 385 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 386
fe3c2583
LP
387 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
388 * above. */
34de407a
LP
389 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
390 log_debug("%s is duplicate.", mount_entry_path(f));
391 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 392 mount_entry_done(f);
15ae422b 393 continue;
fe3c2583 394 }
15ae422b 395
e2d7c1a0 396 *t = *f;
15ae422b 397 previous = t;
fe3c2583
LP
398 t++;
399 }
400
401 *n = t - m;
402}
403
34de407a
LP
404static void drop_inaccessible(MountEntry *m, unsigned *n) {
405 MountEntry *f, *t;
fe3c2583
LP
406 const char *clear = NULL;
407
408 assert(m);
409 assert(n);
410
411 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
412 * ordered already. */
413
1d54cd5d 414 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
415
416 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
417 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
418 if (clear && path_startswith(mount_entry_path(f), clear)) {
419 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 420 mount_entry_done(f);
fe3c2583
LP
421 continue;
422 }
15ae422b 423
34de407a 424 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
425
426 *t = *f;
15ae422b
LP
427 t++;
428 }
429
c17ec25e 430 *n = t - m;
15ae422b
LP
431}
432
34de407a
LP
433static void drop_nop(MountEntry *m, unsigned *n) {
434 MountEntry *f, *t;
7648a565
LP
435
436 assert(m);
437 assert(n);
438
439 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
440 * list is ordered by prefixes. */
441
1d54cd5d 442 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
443
444 /* Only suppress such subtrees for READONLY and READWRITE entries */
445 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 446 MountEntry *p;
7648a565
LP
447 bool found = false;
448
449 /* Now let's find the first parent of the entry we are looking at. */
450 for (p = t-1; p >= m; p--) {
34de407a 451 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
452 found = true;
453 break;
454 }
455 }
456
457 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
458 if (found && p->mode == f->mode) {
34de407a 459 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 460 mount_entry_done(f);
7648a565
LP
461 continue;
462 }
463 }
464
465 *t = *f;
466 t++;
467 }
468
469 *n = t - m;
470}
471
34de407a
LP
472static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
473 MountEntry *f, *t;
cd2902c9
LP
474
475 assert(m);
476 assert(n);
477
1d54cd5d 478 /* Nothing to do */
cd2902c9
LP
479 if (!root_directory)
480 return;
481
482 /* Drops all mounts that are outside of the root directory. */
483
1d54cd5d 484 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 485
34de407a
LP
486 if (!path_startswith(mount_entry_path(f), root_directory)) {
487 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 488 mount_entry_done(f);
cd2902c9
LP
489 continue;
490 }
491
492 *t = *f;
493 t++;
494 }
495
496 *n = t - m;
497}
498
414b304b 499static int clone_device_node(const char *d, const char *temporary_mount) {
b5e99f23
ДГ
500 _cleanup_free_ char *dn = NULL;
501 struct stat st;
502 int r;
503
414b304b 504 if (stat(d, &st) < 0) {
b5e99f23
ДГ
505 if (errno == ENOENT)
506 return 0;
507 return -errno;
508 }
509
510 if (!S_ISBLK(st.st_mode) &&
511 !S_ISCHR(st.st_mode))
512 return -EINVAL;
513
514 if (st.st_rdev == 0)
515 return 0;
516
517 dn = strappend(temporary_mount, d);
518 if (!dn)
519 return -ENOMEM;
520
521 mac_selinux_create_file_prepare(d, st.st_mode);
522 r = mknod(dn, st.st_mode, st.st_rdev);
523 mac_selinux_create_file_clear();
524
525 if (r < 0)
526 return -errno;
527
528 return 0;
529}
530
5d997827 531static int mount_private_dev(MountEntry *m) {
7f112f50
LP
532 static const char devnodes[] =
533 "/dev/null\0"
534 "/dev/zero\0"
535 "/dev/full\0"
536 "/dev/random\0"
537 "/dev/urandom\0"
538 "/dev/tty\0";
539
2b85f4e1 540 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 541 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50 542 _cleanup_umask_ mode_t u;
414b304b 543 struct stat st;
7f112f50
LP
544 int r;
545
546 assert(m);
547
548 u = umask(0000);
549
2b85f4e1
LP
550 if (!mkdtemp(temporary_mount))
551 return -errno;
552
63c372cb 553 dev = strjoina(temporary_mount, "/dev");
dc751688 554 (void) mkdir(dev, 0755);
737ba3c8 555 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
556 r = -errno;
557 goto fail;
558 }
559
63c372cb 560 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 561 (void) mkdir(devpts, 0755);
2b85f4e1
LP
562 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
563 r = -errno;
564 goto fail;
565 }
566
414b304b
ДГ
567 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
568 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
569 * thus, in that case make a clone
570 *
571 * in nspawn and other containers it will be a symlink, in that case make it a symlink
572 */
573 if (lstat("/dev/ptmx", &st) < 0) {
3164e3cb
ZJS
574 r = -errno;
575 goto fail;
576 }
414b304b
ДГ
577 if (S_ISLNK(st.st_mode)) {
578 devptmx = strjoina(temporary_mount, "/dev/ptmx");
579 if (symlink("pts/ptmx", devptmx) < 0) {
580 r = -errno;
581 goto fail;
582 }
583 } else {
584 r = clone_device_node("/dev/ptmx", temporary_mount);
585 if (r < 0)
586 goto fail;
587 }
e06b6479 588
63c372cb 589 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 590 (void) mkdir(devshm, 01777);
2b85f4e1
LP
591 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
592 if (r < 0) {
593 r = -errno;
594 goto fail;
595 }
596
63c372cb 597 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 598 (void) mkdir(devmqueue, 0755);
3164e3cb 599 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 600
63c372cb 601 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 602 (void) mkdir(devhugepages, 0755);
3164e3cb 603 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 604
63c372cb 605 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 606 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 607
7f112f50 608 NULSTR_FOREACH(d, devnodes) {
b5e99f23
ДГ
609 r = clone_device_node(d, temporary_mount);
610 if (r < 0)
2b85f4e1 611 goto fail;
7f112f50
LP
612 }
613
03cfe0d5 614 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 615
ee818b89
AC
616 /* Create the /dev directory if missing. It is more likely to be
617 * missing when the service is started with RootDirectory. This is
618 * consistent with mount units creating the mount points when missing.
619 */
34de407a 620 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 621
9e5f8252 622 /* Unmount everything in old /dev */
34de407a
LP
623 umount_recursive(mount_entry_path(m), 0);
624 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
625 r = -errno;
626 goto fail;
627 }
7f112f50 628
2b85f4e1
LP
629 rmdir(dev);
630 rmdir(temporary_mount);
7f112f50 631
2b85f4e1 632 return 0;
7f112f50 633
2b85f4e1
LP
634fail:
635 if (devpts)
636 umount(devpts);
7f112f50 637
2b85f4e1
LP
638 if (devshm)
639 umount(devshm);
7f112f50 640
2b85f4e1
LP
641 if (devhugepages)
642 umount(devhugepages);
7f112f50 643
2b85f4e1
LP
644 if (devmqueue)
645 umount(devmqueue);
7f112f50 646
d267c5aa
ZJS
647 umount(dev);
648 rmdir(dev);
2b85f4e1 649 rmdir(temporary_mount);
7f112f50 650
2b85f4e1 651 return r;
7f112f50
LP
652}
653
5d997827
LP
654static int mount_bind_dev(MountEntry *m) {
655 int r;
656
657 assert(m);
658
659 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
660 * /dev. This is only used when RootDirectory= is set. */
661
645767d6
LP
662 (void) mkdir_p_label(mount_entry_path(m), 0755);
663
5d997827
LP
664 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
665 if (r < 0)
666 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
667 if (r > 0) /* make this a NOP if /dev is already a mount point */
668 return 0;
669
670 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
671 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
672
673 return 1;
674}
675
676static int mount_sysfs(MountEntry *m) {
677 int r;
678
679 assert(m);
680
645767d6
LP
681 (void) mkdir_p_label(mount_entry_path(m), 0755);
682
5d997827
LP
683 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
684 if (r < 0)
685 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
686 if (r > 0) /* make this a NOP if /sys is already a mount point */
687 return 0;
688
689 /* Bind mount the host's version so that we get all child mounts of it, too. */
690 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
691 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
692
693 return 1;
694}
695
696static int mount_procfs(MountEntry *m) {
697 int r;
698
699 assert(m);
700
645767d6
LP
701 (void) mkdir_p_label(mount_entry_path(m), 0755);
702
5d997827
LP
703 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
704 if (r < 0)
705 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
706 if (r > 0) /* make this a NOP if /proc is already a mount point */
707 return 0;
708
709 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
710 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
711 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
712
713 return 1;
714}
715
6c47cd7d
LP
716static int mount_empty_dir(MountEntry *m) {
717 assert(m);
718
719 /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
720
721 (void) mkdir_p_label(mount_entry_path(m), 0755);
722 (void) umount_recursive(mount_entry_path(m), 0);
723
724 if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
725 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
726
727 return 1;
728}
729
d2d6c096
LP
730static int mount_entry_chase(
731 const char *root_directory,
732 MountEntry *m,
733 const char *path,
734 char **location) {
735
8fceda93
LP
736 char *chased;
737 int r;
738
739 assert(m);
740
741 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
742 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
743 * that applies). The result is stored in "location". */
8fceda93 744
a227a4be
LP
745 r = chase_symlinks(path, root_directory,
746 IN_SET(m->mode, BIND_MOUNT, BIND_MOUNT_RECURSIVE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, BIND_DEV, EMPTY_DIR, SYSFS, PROCFS) ? CHASE_NONEXISTENT : 0,
747 &chased);
8fceda93 748 if (r == -ENOENT && m->ignore) {
d2d6c096 749 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
750 return 0;
751 }
752 if (r < 0)
d2d6c096 753 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 754
d2d6c096 755 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 756
d2d6c096
LP
757 free(*location);
758 *location = chased;
8fceda93
LP
759
760 return 1;
761}
762
ac0930c8 763static int apply_mount(
8fceda93 764 const char *root_directory,
34de407a 765 MountEntry *m,
ac0930c8 766 const char *tmp_dir,
c17ec25e 767 const char *var_tmp_dir) {
ac0930c8 768
a227a4be 769 bool rbind = true, make = false;
15ae422b 770 const char *what;
15ae422b 771 int r;
15ae422b 772
c17ec25e 773 assert(m);
15ae422b 774
d2d6c096 775 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
776 if (r <= 0)
777 return r;
778
34de407a 779 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 780
c17ec25e 781 switch (m->mode) {
15ae422b 782
160cfdbe
LP
783 case INACCESSIBLE: {
784 struct stat target;
6d313367
LP
785
786 /* First, get rid of everything that is below if there
787 * is anything... Then, overmount it with an
c4b41707 788 * inaccessible path. */
34de407a 789 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 790
34de407a
LP
791 if (lstat(mount_entry_path(m), &target) < 0)
792 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 793
c4b41707 794 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
795 if (!what) {
796 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
797 return -ELOOP;
798 }
799 break;
160cfdbe 800 }
fe3c2583 801
15ae422b 802 case READONLY:
15ae422b 803 case READWRITE:
8fceda93 804 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 805 if (r < 0)
34de407a 806 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
807 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
808 return 0;
6b7c9f8b 809 /* This isn't a mount point yet, let's make it one. */
34de407a 810 what = mount_entry_path(m);
6b7c9f8b 811 break;
15ae422b 812
d2d6c096
LP
813 case BIND_MOUNT:
814 rbind = false;
d2d6c096 815
4831981d 816 _fallthrough_;
d2d6c096
LP
817 case BIND_MOUNT_RECURSIVE:
818 /* Also chase the source mount */
5d997827 819
d2d6c096
LP
820 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
821 if (r <= 0)
822 return r;
823
824 what = mount_entry_source(m);
a227a4be 825 make = true;
d2d6c096
LP
826 break;
827
6c47cd7d
LP
828 case EMPTY_DIR:
829 return mount_empty_dir(m);
830
ac0930c8
LP
831 case PRIVATE_TMP:
832 what = tmp_dir;
a227a4be 833 make = true;
ac0930c8
LP
834 break;
835
836 case PRIVATE_VAR_TMP:
837 what = var_tmp_dir;
a227a4be 838 make = true;
15ae422b 839 break;
e364ad06 840
d6797c92 841 case PRIVATE_DEV:
5d997827
LP
842 return mount_private_dev(m);
843
844 case BIND_DEV:
845 return mount_bind_dev(m);
846
847 case SYSFS:
848 return mount_sysfs(m);
849
850 case PROCFS:
851 return mount_procfs(m);
d6797c92 852
e364ad06
LP
853 default:
854 assert_not_reached("Unknown mode");
15ae422b
LP
855 }
856
ac0930c8 857 assert(what);
15ae422b 858
a227a4be
LP
859 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
860 bool try_again = false;
861 r = -errno;
862
863 if (r == -ENOENT && make) {
864 struct stat st;
865
866 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
867
868 if (stat(what, &st) >= 0) {
869
870 (void) mkdir_parents(mount_entry_path(m), 0755);
871
872 if (S_ISDIR(st.st_mode))
873 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
874 else
875 try_again = touch(mount_entry_path(m)) >= 0;
876 }
877 }
878
879 if (try_again) {
880 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
881 r = -errno;
882 else
883 r = 0;
884 }
885
886 if (r < 0)
887 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
888 }
6b7c9f8b 889
34de407a 890 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 891 return 0;
ac0930c8 892}
15ae422b 893
ac9de0b3 894static int make_read_only(MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 895 int r = 0;
15ae422b 896
c17ec25e 897 assert(m);
ac9de0b3 898 assert(proc_self_mountinfo);
ac0930c8 899
34de407a 900 if (mount_entry_read_only(m))
ac9de0b3 901 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
13e785f7 902 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 903 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 904 r = -errno;
737ba3c8 905 } else
6b7c9f8b
LP
906 return 0;
907
908 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
909 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
910 * read-only mounts already applied. */
ac0930c8 911
8fceda93
LP
912 if (r == -ENOENT && m->ignore)
913 r = 0;
5327c910 914
1d54cd5d 915 return r;
d944dc95
LP
916}
917
bb0ff3fb 918static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
5d997827
LP
919 assert(ns_info);
920
9c988f93
DH
921 /*
922 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
923 * since to protect the API VFS mounts, they need to be around in the
924 * first place... and RootDirectory= or RootImage= need to be set.
925 */
5d997827 926
9c988f93
DH
927 /* root_directory should point to a mount point */
928 return root_directory &&
929 (ns_info->mount_apivfs ||
930 ns_info->protect_control_groups ||
931 ns_info->protect_kernel_tunables);
5d997827
LP
932}
933
2652c6c1 934static unsigned namespace_calculate_mounts(
9c988f93 935 const char* root_directory,
bb0ff3fb 936 const NamespaceInfo *ns_info,
2652c6c1
DH
937 char** read_write_paths,
938 char** read_only_paths,
939 char** inaccessible_paths,
6c47cd7d 940 char** empty_directories,
d2d6c096
LP
941 const BindMount *bind_mounts,
942 unsigned n_bind_mounts,
2652c6c1
DH
943 const char* tmp_dir,
944 const char* var_tmp_dir,
2652c6c1
DH
945 ProtectHome protect_home,
946 ProtectSystem protect_system) {
947
b6c432ca 948 unsigned protect_home_cnt;
f471b2af
DH
949 unsigned protect_system_cnt =
950 (protect_system == PROTECT_SYSTEM_STRICT ?
951 ELEMENTSOF(protect_system_strict_table) :
952 ((protect_system == PROTECT_SYSTEM_FULL) ?
953 ELEMENTSOF(protect_system_full_table) :
954 ((protect_system == PROTECT_SYSTEM_YES) ?
955 ELEMENTSOF(protect_system_yes_table) : 0)));
956
b6c432ca
DH
957 protect_home_cnt =
958 (protect_home == PROTECT_HOME_YES ?
959 ELEMENTSOF(protect_home_yes_table) :
960 ((protect_home == PROTECT_HOME_READ_ONLY) ?
961 ELEMENTSOF(protect_home_read_only_table) : 0));
962
2652c6c1
DH
963 return !!tmp_dir + !!var_tmp_dir +
964 strv_length(read_write_paths) +
965 strv_length(read_only_paths) +
966 strv_length(inaccessible_paths) +
6c47cd7d 967 strv_length(empty_directories) +
d2d6c096 968 n_bind_mounts +
c575770b
DH
969 ns_info->private_dev +
970 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
971 (ns_info->protect_control_groups ? 1 : 0) +
972 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 973 protect_home_cnt + protect_system_cnt +
9c988f93 974 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
975}
976
613b411c 977int setup_namespace(
ee818b89 978 const char* root_directory,
915e6d16 979 const char* root_image,
bb0ff3fb 980 const NamespaceInfo *ns_info,
2a624c36
AP
981 char** read_write_paths,
982 char** read_only_paths,
983 char** inaccessible_paths,
6c47cd7d 984 char** empty_directories,
d2d6c096
LP
985 const BindMount *bind_mounts,
986 unsigned n_bind_mounts,
a004cb4c
LP
987 const char* tmp_dir,
988 const char* var_tmp_dir,
1b8689f9
LP
989 ProtectHome protect_home,
990 ProtectSystem protect_system,
915e6d16
LP
991 unsigned long mount_flags,
992 DissectImageFlags dissect_image_flags) {
15ae422b 993
915e6d16 994 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 995 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 996 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 997 _cleanup_free_ void *root_hash = NULL;
34de407a 998 MountEntry *m, *mounts = NULL;
78ebe980 999 size_t root_hash_size = 0;
d944dc95 1000 bool make_slave = false;
e908468b 1001 const char *root;
f0a4feb0 1002 unsigned n_mounts;
d18aff04 1003 bool require_prefix = false;
c17ec25e 1004 int r = 0;
15ae422b 1005
915e6d16
LP
1006 assert(ns_info);
1007
613b411c 1008 if (mount_flags == 0)
c17ec25e 1009 mount_flags = MS_SHARED;
ac0930c8 1010
915e6d16
LP
1011 if (root_image) {
1012 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1013
1014 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1015 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1016
1017 r = loop_device_make_by_path(root_image,
1018 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1019 &loop_device);
1020 if (r < 0)
1021 return r;
1022
78ebe980
LP
1023 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1024 if (r < 0)
1025 return r;
1026
1027 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1028 if (r < 0)
1029 return r;
1030
1031 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
1032 if (r < 0)
1033 return r;
915e6d16
LP
1034 }
1035
e908468b
LP
1036 if (root_directory)
1037 root = root_directory;
1038 else if (root_image || n_bind_mounts > 0) {
1039
1040 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1041 * the same mount point for all images, which is safe, since they all live in their own namespaces
1042 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1043 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1044 * while we are applying them. */
1045
1046 root = "/run/systemd/unit-root";
1047 (void) mkdir_label(root, 0700);
d18aff04 1048 require_prefix = true;
e908468b
LP
1049 } else
1050 root = NULL;
1051
cfbeb4ef 1052 n_mounts = namespace_calculate_mounts(
e908468b 1053 root,
cfbeb4ef
LP
1054 ns_info,
1055 read_write_paths,
1056 read_only_paths,
1057 inaccessible_paths,
6c47cd7d 1058 empty_directories,
d2d6c096 1059 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
1060 tmp_dir, var_tmp_dir,
1061 protect_home, protect_system);
613b411c 1062
2652c6c1 1063 /* Set mount slave mode */
e908468b 1064 if (root || n_mounts > 0)
d944dc95
LP
1065 make_slave = true;
1066
f0a4feb0 1067 if (n_mounts > 0) {
34de407a 1068 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
d18aff04 1069 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1070 if (r < 0)
f0a4feb0 1071 goto finish;
613b411c 1072
d18aff04 1073 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1074 if (r < 0)
f0a4feb0 1075 goto finish;
613b411c 1076
d18aff04 1077 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1078 if (r < 0)
f0a4feb0 1079 goto finish;
7ff7394d 1080
6c47cd7d
LP
1081 r = append_empty_dir_mounts(&m, empty_directories);
1082 if (r < 0)
1083 goto finish;
1084
d2d6c096
LP
1085 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1086 if (r < 0)
1087 goto finish;
1088
613b411c 1089 if (tmp_dir) {
34de407a 1090 *(m++) = (MountEntry) {
5327c910
LP
1091 .path_const = "/tmp",
1092 .mode = PRIVATE_TMP,
1093 };
613b411c 1094 }
7ff7394d 1095
613b411c 1096 if (var_tmp_dir) {
34de407a 1097 *(m++) = (MountEntry) {
5327c910
LP
1098 .path_const = "/var/tmp",
1099 .mode = PRIVATE_VAR_TMP,
1100 };
7ff7394d 1101 }
ac0930c8 1102
c575770b 1103 if (ns_info->private_dev) {
34de407a 1104 *(m++) = (MountEntry) {
5327c910
LP
1105 .path_const = "/dev",
1106 .mode = PRIVATE_DEV,
1107 };
7f112f50
LP
1108 }
1109
c575770b 1110 if (ns_info->protect_kernel_tunables) {
5327c910 1111 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1112 if (r < 0)
f0a4feb0 1113 goto finish;
c575770b
DH
1114 }
1115
1116 if (ns_info->protect_kernel_modules) {
5327c910 1117 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1118 if (r < 0)
f0a4feb0 1119 goto finish;
c575770b 1120 }
59eeb84b 1121
c575770b 1122 if (ns_info->protect_control_groups) {
34de407a 1123 *(m++) = (MountEntry) {
5327c910
LP
1124 .path_const = "/sys/fs/cgroup",
1125 .mode = READONLY,
1126 };
59eeb84b
LP
1127 }
1128
5327c910 1129 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1130 if (r < 0)
f0a4feb0 1131 goto finish;
417116f2 1132
5327c910 1133 r = append_protect_system(&m, protect_system, false);
f471b2af 1134 if (r < 0)
f0a4feb0 1135 goto finish;
417116f2 1136
e908468b 1137 if (namespace_info_mount_apivfs(root, ns_info)) {
5d997827
LP
1138 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1139 if (r < 0)
1140 goto finish;
1141 }
1142
f0a4feb0 1143 assert(mounts + n_mounts == m);
ac0930c8 1144
5327c910 1145 /* Prepend the root directory where that's necessary */
e908468b 1146 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1147 if (r < 0)
1148 goto finish;
1149
34de407a 1150 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1151
f0a4feb0 1152 drop_duplicates(mounts, &n_mounts);
e908468b 1153 drop_outside_root(root, mounts, &n_mounts);
f0a4feb0
DH
1154 drop_inaccessible(mounts, &n_mounts);
1155 drop_nop(mounts, &n_mounts);
15ae422b
LP
1156 }
1157
d944dc95
LP
1158 if (unshare(CLONE_NEWNS) < 0) {
1159 r = -errno;
1160 goto finish;
1161 }
1e4e94c8 1162
d944dc95 1163 if (make_slave) {
c2c13f2d
LP
1164 /* Remount / as SLAVE so that nothing now mounted in the namespace
1165 shows up in the parent */
d944dc95
LP
1166 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1167 r = -errno;
1168 goto finish;
1169 }
ee818b89
AC
1170 }
1171
915e6d16 1172 if (root_image) {
e908468b 1173 /* A root image is specified, mount it to the right place */
2d3a5a73 1174 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
915e6d16
LP
1175 if (r < 0)
1176 goto finish;
1177
07ce7407
TM
1178 if (decrypted_image) {
1179 r = decrypted_image_relinquish(decrypted_image);
1180 if (r < 0)
1181 goto finish;
1182 }
78ebe980 1183
915e6d16
LP
1184 loop_device_relinquish(loop_device);
1185
1186 } else if (root_directory) {
1187
e908468b
LP
1188 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1189 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1190 if (r < 0)
d944dc95 1191 goto finish;
8f1ad200 1192 if (r == 0) {
e908468b 1193 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
8f1ad200
LP
1194 r = -errno;
1195 goto finish;
1196 }
d944dc95 1197 }
e908468b
LP
1198
1199 } else if (root) {
1200
1201 /* Let's mount the main root directory to the root directory to use */
1202 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1203 r = -errno;
1204 goto finish;
1205 }
ee818b89 1206 }
c2c13f2d 1207
4e0c20de
LP
1208 /* Try to set up the new root directory before mounting anything else there. */
1209 if (root_image || root_directory)
1210 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1211
f0a4feb0 1212 if (n_mounts > 0) {
ac9de0b3 1213 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1214 char **blacklist;
1215 unsigned j;
1216
ac9de0b3
TR
1217 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1218 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1219 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1220 if (!proc_self_mountinfo) {
1221 r = -errno;
1222 goto finish;
1223 }
1224
6b7c9f8b 1225 /* First round, add in all special mounts we need */
f0a4feb0 1226 for (m = mounts; m < mounts + n_mounts; ++m) {
e908468b 1227 r = apply_mount(root, m, tmp_dir, var_tmp_dir);
c2c13f2d 1228 if (r < 0)
d944dc95 1229 goto finish;
c2c13f2d 1230 }
15ae422b 1231
6b7c9f8b 1232 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1233 blacklist = newa(char*, n_mounts+1);
1234 for (j = 0; j < n_mounts; j++)
34de407a 1235 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1236 blacklist[j] = NULL;
1237
1238 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1239 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1240 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1241 if (r < 0)
d944dc95 1242 goto finish;
c2c13f2d 1243 }
15ae422b
LP
1244 }
1245
e908468b 1246 if (root) {
ee818b89 1247 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
e908468b 1248 r = mount_move_root(root);
d944dc95
LP
1249 if (r < 0)
1250 goto finish;
ee818b89
AC
1251 }
1252
c2c13f2d
LP
1253 /* Remount / as the desired mode. Not that this will not
1254 * reestablish propagation from our side to the host, since
1255 * what's disconnected is disconnected. */
d944dc95
LP
1256 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1257 r = -errno;
1258 goto finish;
1259 }
15ae422b 1260
d944dc95 1261 r = 0;
15ae422b 1262
d944dc95 1263finish:
f0a4feb0 1264 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1265 mount_entry_done(m);
613b411c
LP
1266
1267 return r;
1268}
1269
d2d6c096
LP
1270void bind_mount_free_many(BindMount *b, unsigned n) {
1271 unsigned i;
1272
1273 assert(b || n == 0);
1274
1275 for (i = 0; i < n; i++) {
1276 free(b[i].source);
1277 free(b[i].destination);
1278 }
1279
1280 free(b);
1281}
1282
1283int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1284 _cleanup_free_ char *s = NULL, *d = NULL;
1285 BindMount *c;
1286
1287 assert(b);
1288 assert(n);
1289 assert(item);
1290
1291 s = strdup(item->source);
1292 if (!s)
1293 return -ENOMEM;
1294
1295 d = strdup(item->destination);
1296 if (!d)
1297 return -ENOMEM;
1298
1299 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1300 if (!c)
1301 return -ENOMEM;
1302
1303 *b = c;
1304
1305 c[(*n) ++] = (BindMount) {
1306 .source = s,
1307 .destination = d,
1308 .read_only = item->read_only,
1309 .recursive = item->recursive,
1310 .ignore_enoent = item->ignore_enoent,
1311 };
1312
1313 s = d = NULL;
1314 return 0;
1315}
1316
613b411c
LP
1317static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1318 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1319 char bid[SD_ID128_STRING_MAX];
1320 sd_id128_t boot_id;
1321 int r;
613b411c
LP
1322
1323 assert(id);
1324 assert(prefix);
1325 assert(path);
1326
6b46ea73
LP
1327 /* We include the boot id in the directory so that after a
1328 * reboot we can easily identify obsolete directories. */
1329
1330 r = sd_id128_get_boot(&boot_id);
1331 if (r < 0)
1332 return r;
1333
605405c6 1334 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1335 if (!x)
1336 return -ENOMEM;
1337
1338 RUN_WITH_UMASK(0077)
1339 if (!mkdtemp(x))
1340 return -errno;
1341
1342 RUN_WITH_UMASK(0000) {
1343 char *y;
1344
63c372cb 1345 y = strjoina(x, "/tmp");
613b411c
LP
1346
1347 if (mkdir(y, 0777 | S_ISVTX) < 0)
1348 return -errno;
c17ec25e 1349 }
15ae422b 1350
613b411c
LP
1351 *path = x;
1352 x = NULL;
1353
1354 return 0;
1355}
1356
/* Creates the pair of private temporary directories for the given id, one below /tmp and one below
 * /var/tmp. On success 0 is returned and both output parameters are set to newly allocated paths. If the
 * second directory cannot be set up, the first one is removed again and the error is returned; the output
 * parameters are not touched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: remove the inner "tmp" directory first, then the outer one */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1386
1387int setup_netns(int netns_storage_socket[2]) {
1388 _cleanup_close_ int netns = -1;
3ee897d6 1389 int r, q;
613b411c
LP
1390
1391 assert(netns_storage_socket);
1392 assert(netns_storage_socket[0] >= 0);
1393 assert(netns_storage_socket[1] >= 0);
1394
1395 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1396 * namespace reference fd. Whatever process runs this first
1397 * shall create a new namespace, all others should just join
1398 * it. To serialize that we use a file lock on the socket
1399 * pair.
613b411c
LP
1400 *
1401 * It's a bit crazy, but hey, works great! */
1402
1403 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1404 return -errno;
1405
3ee897d6
LP
1406 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1407 if (netns == -EAGAIN) {
613b411c
LP
1408 /* Nothing stored yet, so let's create a new namespace */
1409
1410 if (unshare(CLONE_NEWNET) < 0) {
1411 r = -errno;
1412 goto fail;
1413 }
1414
1415 loopback_setup();
1416
1417 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1418 if (netns < 0) {
1419 r = -errno;
1420 goto fail;
1421 }
1422
1423 r = 1;
613b411c 1424
3ee897d6
LP
1425 } else if (netns < 0) {
1426 r = netns;
1427 goto fail;
613b411c 1428
3ee897d6
LP
1429 } else {
1430 /* Yay, found something, so let's join the namespace */
613b411c
LP
1431 if (setns(netns, CLONE_NEWNET) < 0) {
1432 r = -errno;
1433 goto fail;
1434 }
1435
1436 r = 0;
1437 }
1438
3ee897d6
LP
1439 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1440 if (q < 0) {
1441 r = q;
613b411c
LP
1442 goto fail;
1443 }
1444
1445fail:
fe048ce5 1446 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1447 return r;
1448}
417116f2 1449
6e2d7c4f
MS
1450bool ns_type_supported(NamespaceType type) {
1451 const char *t, *ns_proc;
1452
0fa5b831
LP
1453 t = namespace_type_to_string(type);
1454 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
1455 return false;
1456
6e2d7c4f 1457 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
1458 return access(ns_proc, F_OK) == 0;
1459}
1460
1b8689f9
LP
1461static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1462 [PROTECT_HOME_NO] = "no",
1463 [PROTECT_HOME_YES] = "yes",
1464 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1465};
1466
1b8689f9
LP
1467DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1468
5e1c6154
YW
1469ProtectHome parse_protect_home_or_bool(const char *s) {
1470 int r;
1471
1472 r = parse_boolean(s);
1473 if (r > 0)
1474 return PROTECT_HOME_YES;
1475 if (r == 0)
1476 return PROTECT_HOME_NO;
1477
1478 return protect_home_from_string(s);
1479}
1480
1b8689f9
LP
1481static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1482 [PROTECT_SYSTEM_NO] = "no",
1483 [PROTECT_SYSTEM_YES] = "yes",
1484 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1485 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1486};
1487
1488DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
6e2d7c4f 1489
03c791aa
YW
1490ProtectSystem parse_protect_system_or_bool(const char *s) {
1491 int r;
1492
1493 r = parse_boolean(s);
1494 if (r > 0)
1495 return PROTECT_SYSTEM_YES;
1496 if (r == 0)
1497 return PROTECT_SYSTEM_NO;
1498
1499 return protect_system_from_string(s);
1500}
1501
6e2d7c4f
MS
1502static const char* const namespace_type_table[] = {
1503 [NAMESPACE_MOUNT] = "mnt",
1504 [NAMESPACE_CGROUP] = "cgroup",
1505 [NAMESPACE_UTS] = "uts",
1506 [NAMESPACE_IPC] = "ipc",
1507 [NAMESPACE_USER] = "user",
1508 [NAMESPACE_PID] = "pid",
1509 [NAMESPACE_NET] = "net",
1510};
1511
1512DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);