]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
test: add tests for %j and %J specifier in test-execute (#8838)
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
15ae422b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
15ae422b
LP
6***/
7
8#include <errno.h>
07630cea 9#include <sched.h>
15ae422b 10#include <stdio.h>
07630cea
LP
11#include <string.h>
12#include <sys/mount.h>
15ae422b 13#include <sys/stat.h>
07630cea 14#include <unistd.h>
25e870b5 15#include <linux/fs.h>
15ae422b 16
b5efdb8a 17#include "alloc-util.h"
10404d52 18#include "base-filesystem.h"
7f112f50 19#include "dev-setup.h"
3ffd4af2 20#include "fd-util.h"
d944dc95 21#include "fs-util.h"
e908468b 22#include "label.h"
915e6d16 23#include "loop-util.h"
07630cea
LP
24#include "loopback-setup.h"
25#include "missing.h"
26#include "mkdir.h"
4349cd7c 27#include "mount-util.h"
3ffd4af2 28#include "namespace.h"
07630cea 29#include "path-util.h"
d7b8eec7 30#include "selinux-util.h"
2583fbea 31#include "socket-util.h"
36ce7110 32#include "stat-util.h"
8b43440b 33#include "string-table.h"
07630cea
LP
34#include "string-util.h"
35#include "strv.h"
affb60b1 36#include "umask-util.h"
ee104e11 37#include "user-util.h"
07630cea 38#include "util.h"
15ae422b 39
737ba3c8 40#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
41
/* The kind of operation a MountEntry describes. mount_path_compare() uses the numeric value as tie-breaker
 * for entries on the same path, so a smaller value sorts first. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
} MountMode;
15ae422b 57
34de407a 58typedef struct MountEntry {
5327c910 59 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 60 MountMode mode:5;
5327c910
LP
61 bool ignore:1; /* Ignore if path does not exist? */
62 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 63 bool read_only:1; /* Shall this mount point be read-only? */
088696fe 64 bool applied:1; /* Already applied */
55fe7432 65 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
d2d6c096
LP
66 const char *source_const; /* The source path, for bind mounts */
67 char *source_malloc;
2abd4e38
YW
68 const char *options_const;/* Mount options for tmpfs */
69 char *options_malloc;
70 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
088696fe 71 unsigned n_followed;
34de407a 72} MountEntry;
15ae422b 73
5d997827
LP
74/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
75 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
76static const MountEntry apivfs_table[] = {
77 { "/proc", PROCFS, false },
78 { "/dev", BIND_DEV, false },
79 { "/sys", SYSFS, false },
80};
f471b2af 81
11a30cec 82/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 83static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
84 { "/proc/sys", READONLY, false },
85 { "/proc/sysrq-trigger", READONLY, true },
86 { "/proc/latency_stats", READONLY, true },
87 { "/proc/mtrr", READONLY, true },
aa70f38b 88 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
89 { "/proc/acpi", READONLY, true },
90 { "/proc/timer_stats", READONLY, true },
91 { "/proc/asound", READONLY, true },
92 { "/proc/bus", READONLY, true },
93 { "/proc/fs", READONLY, true },
94 { "/proc/irq", READONLY, true },
95 { "/sys", READONLY, false },
96 { "/sys/kernel/debug", READONLY, true },
97 { "/sys/kernel/tracing", READONLY, true },
13a141f0 98 { "/sys/fs/bpf", READONLY, true },
c6232fb0 99 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 100 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
101};
102
c575770b 103/* ProtectKernelModules= option */
34de407a 104static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 105#if HAVE_SPLIT_USR
c6232fb0 106 { "/lib/modules", INACCESSIBLE, true },
c575770b 107#endif
c6232fb0 108 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
109};
110
b6c432ca
DH
111/*
112 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
113 * system should be protected by ProtectSystem=
114 */
34de407a 115static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
116 { "/home", READONLY, true },
117 { "/run/user", READONLY, true },
118 { "/root", READONLY, true },
b6c432ca
DH
119};
120
e4da7d8c
YW
121/* ProtectHome=tmpfs table */
122static const MountEntry protect_home_tmpfs_table[] = {
123 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
124 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
125 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
126};
127
b6c432ca 128/* ProtectHome=yes table */
34de407a 129static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
130 { "/home", INACCESSIBLE, true },
131 { "/run/user", INACCESSIBLE, true },
132 { "/root", INACCESSIBLE, true },
b6c432ca
DH
133};
134
f471b2af 135/* ProtectSystem=yes table */
34de407a 136static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
137 { "/usr", READONLY, false },
138 { "/boot", READONLY, true },
139 { "/efi", READONLY, true },
7486f305
AB
140#if HAVE_SPLIT_USR
141 { "/lib", READONLY, true },
142 { "/lib64", READONLY, true },
143 { "/bin", READONLY, true },
671f0f8d 144# if HAVE_SPLIT_BIN
7486f305 145 { "/sbin", READONLY, true },
671f0f8d 146# endif
7486f305 147#endif
f471b2af
DH
148};
149
150/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 151static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
152 { "/usr", READONLY, false },
153 { "/boot", READONLY, true },
154 { "/efi", READONLY, true },
155 { "/etc", READONLY, false },
7486f305
AB
156#if HAVE_SPLIT_USR
157 { "/lib", READONLY, true },
158 { "/lib64", READONLY, true },
159 { "/bin", READONLY, true },
671f0f8d 160# if HAVE_SPLIT_BIN
7486f305 161 { "/sbin", READONLY, true },
671f0f8d 162# endif
7486f305 163#endif
f471b2af
DH
164};
165
166/*
167 * ProtectSystem=strict table. In this strict mode, we mount everything
168 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
169 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
170 * protect those, and these options should be fully orthogonal.
171 * (And of course /home and friends are also left writable, as ProtectHome=
172 * shall manage those, orthogonally).
173 */
34de407a 174static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
175 { "/", READONLY, false },
176 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
177 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
178 { "/dev", READWRITE, false }, /* PrivateDevices= */
179 { "/home", READWRITE, true }, /* ProtectHome= */
180 { "/run/user", READWRITE, true }, /* ProtectHome= */
181 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
182};
183
34de407a 184static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
185 assert(p);
186
5327c910
LP
187 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
188 * otherwise the stack/static ->path field is returned. */
f0a4feb0 189
5327c910 190 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
191}
192
34de407a 193static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
194 assert(p);
195
196 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
197}
198
d2d6c096
LP
199static const char *mount_entry_source(const MountEntry *p) {
200 assert(p);
201
202 return p->source_malloc ?: p->source_const;
203}
204
2abd4e38
YW
205static const char *mount_entry_options(const MountEntry *p) {
206 assert(p);
207
208 return p->options_malloc ?: p->options_const;
209}
210
1eb7e08e
LP
211static void mount_entry_done(MountEntry *p) {
212 assert(p);
213
214 p->path_malloc = mfree(p->path_malloc);
215 p->source_malloc = mfree(p->source_malloc);
2abd4e38 216 p->options_malloc = mfree(p->options_malloc);
1eb7e08e
LP
217}
218
d18aff04 219static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
220 char **i;
221
613b411c
LP
222 assert(p);
223
5327c910
LP
224 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
225
15ae422b 226 STRV_FOREACH(i, strv) {
5327c910
LP
227 bool ignore = false, needs_prefix = false;
228 const char *e = *i;
15ae422b 229
5327c910
LP
230 /* Look for any prefixes */
231 if (startswith(e, "-")) {
232 e++;
9c94d52e 233 ignore = true;
ea92ae33 234 }
5327c910
LP
235 if (startswith(e, "+")) {
236 e++;
237 needs_prefix = true;
238 }
ea92ae33 239
5327c910 240 if (!path_is_absolute(e))
15ae422b
LP
241 return -EINVAL;
242
34de407a 243 *((*p)++) = (MountEntry) {
5327c910
LP
244 .path_const = e,
245 .mode = mode,
246 .ignore = ignore,
d18aff04 247 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 248 };
15ae422b
LP
249 }
250
251 return 0;
252}
253
6c47cd7d
LP
254static int append_empty_dir_mounts(MountEntry **p, char **strv) {
255 char **i;
256
257 assert(p);
258
259 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
260 * "/private/" boundary directories for DynamicUser=1. */
261
262 STRV_FOREACH(i, strv) {
263
264 *((*p)++) = (MountEntry) {
265 .path_const = *i,
266 .mode = EMPTY_DIR,
267 .ignore = false,
268 .has_prefix = false,
269 .read_only = true,
2abd4e38
YW
270 .options_const = "mode=755",
271 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
6c47cd7d
LP
272 };
273 }
274
275 return 0;
276}
277
d2d6c096
LP
278static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
279 unsigned i;
280
281 assert(p);
282
283 for (i = 0; i < n; i++) {
284 const BindMount *b = binds + i;
285
286 *((*p)++) = (MountEntry) {
287 .path_const = b->destination,
288 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
289 .read_only = b->read_only,
290 .source_const = b->source,
4ca763a9 291 .ignore = b->ignore_enoent,
d2d6c096
LP
292 };
293 }
294
295 return 0;
296}
297
2abd4e38
YW
298static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
299 unsigned i;
300 int r;
301
302 assert(p);
303
304 for (i = 0; i < n; i++) {
305 const TemporaryFileSystem *t = tmpfs + i;
306 _cleanup_free_ char *o = NULL, *str = NULL;
307 unsigned long flags = MS_NODEV|MS_STRICTATIME;
308 bool ro = false;
309
310 if (!path_is_absolute(t->path))
311 return -EINVAL;
312
313 if (!isempty(t->options)) {
314 str = strjoin("mode=0755,", t->options);
315 if (!str)
316 return -ENOMEM;
317
318 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
319 if (r < 0)
320 return r;
321
322 ro = !!(flags & MS_RDONLY);
323 if (ro)
324 flags ^= MS_RDONLY;
325 }
326
327 *((*p)++) = (MountEntry) {
328 .path_const = t->path,
329 .mode = TMPFS,
330 .read_only = ro,
331 .options_malloc = o,
332 .flags = flags,
333 };
334
335 o = NULL;
336 }
337
338 return 0;
339}
340
34de407a 341static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 342 unsigned i;
11a30cec
DH
343
344 assert(p);
f471b2af 345 assert(mounts);
11a30cec 346
5327c910 347 /* Adds a list of static pre-defined entries */
f471b2af 348
5327c910 349 for (i = 0; i < n; i++)
34de407a
LP
350 *((*p)++) = (MountEntry) {
351 .path_const = mount_entry_path(mounts+i),
5327c910
LP
352 .mode = mounts[i].mode,
353 .ignore = mounts[i].ignore || ignore_protect,
354 };
f471b2af
DH
355
356 return 0;
357}
358
34de407a 359static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
360 assert(p);
361
5327c910 362 switch (protect_home) {
b6c432ca 363
5327c910 364 case PROTECT_HOME_NO:
b6c432ca
DH
365 return 0;
366
b6c432ca 367 case PROTECT_HOME_READ_ONLY:
5327c910
LP
368 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
369
e4da7d8c
YW
370 case PROTECT_HOME_TMPFS:
371 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
372
b6c432ca 373 case PROTECT_HOME_YES:
5327c910
LP
374 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
375
b6c432ca 376 default:
5327c910 377 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 378 }
b6c432ca
DH
379}
380
34de407a 381static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
382 assert(p);
383
5327c910
LP
384 switch (protect_system) {
385
386 case PROTECT_SYSTEM_NO:
f471b2af
DH
387 return 0;
388
f471b2af 389 case PROTECT_SYSTEM_STRICT:
5327c910
LP
390 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
391
f471b2af 392 case PROTECT_SYSTEM_YES:
5327c910
LP
393 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
394
f471b2af 395 case PROTECT_SYSTEM_FULL:
5327c910
LP
396 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
397
f471b2af 398 default:
5327c910 399 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 400 }
11a30cec
DH
401}
402
c17ec25e 403static int mount_path_compare(const void *a, const void *b) {
34de407a 404 const MountEntry *p = a, *q = b;
a0827e2b 405 int d;
15ae422b 406
6ee1a919 407 /* If the paths are not equal, then order prefixes first */
34de407a 408 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
409 if (d != 0)
410 return d;
15ae422b 411
6ee1a919
LP
412 /* If the paths are equal, check the mode */
413 if (p->mode < q->mode)
414 return -1;
6ee1a919
LP
415 if (p->mode > q->mode)
416 return 1;
15ae422b 417
6ee1a919 418 return 0;
15ae422b
LP
419}
420
34de407a 421static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
422 unsigned i;
423
424 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
425 * that. */
426
427 if (!root_directory)
428 return 0;
429
430 for (i = 0; i < n; i++) {
431 char *s;
432
433 if (m[i].has_prefix)
434 continue;
435
34de407a 436 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
437 if (!s)
438 return -ENOMEM;
439
e282f51f 440 free_and_replace(m[i].path_malloc, s);
5327c910
LP
441 m[i].has_prefix = true;
442 }
443
444 return 0;
445}
446
34de407a
LP
447static void drop_duplicates(MountEntry *m, unsigned *n) {
448 MountEntry *f, *t, *previous;
15ae422b 449
c17ec25e 450 assert(m);
15ae422b 451 assert(n);
15ae422b 452
fe3c2583
LP
453 /* Drops duplicate entries. Expects that the array is properly ordered already. */
454
1d54cd5d 455 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 456
fe3c2583 457 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
088696fe
LP
458 * above. Note that we only drop duplicates that haven't been mounted yet. */
459 if (previous &&
460 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
461 !f->applied && !previous->applied) {
34de407a
LP
462 log_debug("%s is duplicate.", mount_entry_path(f));
463 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 464 mount_entry_done(f);
15ae422b 465 continue;
fe3c2583 466 }
15ae422b 467
e2d7c1a0 468 *t = *f;
15ae422b 469 previous = t;
fe3c2583
LP
470 t++;
471 }
472
473 *n = t - m;
474}
475
34de407a
LP
476static void drop_inaccessible(MountEntry *m, unsigned *n) {
477 MountEntry *f, *t;
fe3c2583
LP
478 const char *clear = NULL;
479
480 assert(m);
481 assert(n);
482
483 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
484 * ordered already. */
485
1d54cd5d 486 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
487
488 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
489 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
490 if (clear && path_startswith(mount_entry_path(f), clear)) {
491 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 492 mount_entry_done(f);
fe3c2583
LP
493 continue;
494 }
15ae422b 495
34de407a 496 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
497
498 *t = *f;
15ae422b
LP
499 t++;
500 }
501
c17ec25e 502 *n = t - m;
15ae422b
LP
503}
504
34de407a
LP
505static void drop_nop(MountEntry *m, unsigned *n) {
506 MountEntry *f, *t;
7648a565
LP
507
508 assert(m);
509 assert(n);
510
511 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
512 * list is ordered by prefixes. */
513
1d54cd5d 514 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
515
516 /* Only suppress such subtrees for READONLY and READWRITE entries */
517 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 518 MountEntry *p;
7648a565
LP
519 bool found = false;
520
521 /* Now let's find the first parent of the entry we are looking at. */
522 for (p = t-1; p >= m; p--) {
34de407a 523 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
524 found = true;
525 break;
526 }
527 }
528
529 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
530 if (found && p->mode == f->mode) {
34de407a 531 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 532 mount_entry_done(f);
7648a565
LP
533 continue;
534 }
535 }
536
537 *t = *f;
538 t++;
539 }
540
541 *n = t - m;
542}
543
34de407a
LP
544static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
545 MountEntry *f, *t;
cd2902c9
LP
546
547 assert(m);
548 assert(n);
549
1d54cd5d 550 /* Nothing to do */
cd2902c9
LP
551 if (!root_directory)
552 return;
553
554 /* Drops all mounts that are outside of the root directory. */
555
1d54cd5d 556 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 557
34de407a
LP
558 if (!path_startswith(mount_entry_path(f), root_directory)) {
559 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 560 mount_entry_done(f);
cd2902c9
LP
561 continue;
562 }
563
564 *t = *f;
565 t++;
566 }
567
568 *n = t - m;
569}
570
16498617 571static int clone_device_node(const char *d, const char *temporary_mount, bool *make_devnode) {
6f7f3a33 572 const char *dn;
b5e99f23
ДГ
573 struct stat st;
574 int r;
575
414b304b 576 if (stat(d, &st) < 0) {
b5e99f23 577 if (errno == ENOENT)
af984e13 578 return -ENXIO;
b5e99f23
ДГ
579 return -errno;
580 }
581
582 if (!S_ISBLK(st.st_mode) &&
583 !S_ISCHR(st.st_mode))
584 return -EINVAL;
585
586 if (st.st_rdev == 0)
af984e13 587 return -ENXIO;
b5e99f23 588
6f7f3a33 589 dn = strjoina(temporary_mount, d);
b5e99f23 590
16498617
CB
591 if (*make_devnode) {
592 mac_selinux_create_file_prepare(d, st.st_mode);
593 r = mknod(dn, st.st_mode, st.st_rdev);
594 mac_selinux_create_file_clear();
595
596 if (r == 0)
af984e13 597 return 0;
16498617
CB
598 if (errno != EPERM)
599 return log_debug_errno(errno, "mknod failed for %s: %m", d);
600
601 *make_devnode = false;
602 }
603
604 /* We're about to fallback to bind-mounting the device
605 * node. So create a dummy bind-mount target. */
606 mac_selinux_create_file_prepare(d, 0);
607 r = mknod(dn, S_IFREG, 0);
b5e99f23 608 mac_selinux_create_file_clear();
b5e99f23 609
16498617
CB
610 if (r < 0 && errno != EEXIST)
611 return log_debug_errno(errno, "mknod fallback failed for %s: %m", d);
612
613 /* Fallback to bind-mounting:
614 * The assumption here is that all used device nodes carry standard
615 * properties. Specifically, the devices nodes we bind-mount should
616 * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
617 * and should not carry ACLs. */
618 if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
619 return log_debug_errno(errno, "mount failed for %s: %m", d);
b5e99f23 620
af984e13 621 return 0;
b5e99f23
ДГ
622}
623
5d997827 624static int mount_private_dev(MountEntry *m) {
7f112f50
LP
625 static const char devnodes[] =
626 "/dev/null\0"
627 "/dev/zero\0"
628 "/dev/full\0"
629 "/dev/random\0"
630 "/dev/urandom\0"
631 "/dev/tty\0";
632
2b85f4e1 633 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 634 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
16498617 635 bool can_mknod = true;
7f112f50
LP
636 _cleanup_umask_ mode_t u;
637 int r;
638
639 assert(m);
640
641 u = umask(0000);
642
2b85f4e1
LP
643 if (!mkdtemp(temporary_mount))
644 return -errno;
645
63c372cb 646 dev = strjoina(temporary_mount, "/dev");
dc751688 647 (void) mkdir(dev, 0755);
737ba3c8 648 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
649 r = -errno;
650 goto fail;
651 }
652
63c372cb 653 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 654 (void) mkdir(devpts, 0755);
2b85f4e1
LP
655 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
656 r = -errno;
657 goto fail;
658 }
659
414b304b
ДГ
660 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
661 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
662 * thus, in that case make a clone
663 *
664 * in nspawn and other containers it will be a symlink, in that case make it a symlink
665 */
36ce7110
LP
666 r = is_symlink("/dev/ptmx");
667 if (r < 0)
3164e3cb 668 goto fail;
36ce7110 669 if (r > 0) {
414b304b
ДГ
670 devptmx = strjoina(temporary_mount, "/dev/ptmx");
671 if (symlink("pts/ptmx", devptmx) < 0) {
672 r = -errno;
673 goto fail;
674 }
675 } else {
16498617 676 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
152c475f
LP
677 if (r < 0)
678 goto fail;
414b304b 679 }
e06b6479 680
63c372cb 681 devshm = strjoina(temporary_mount, "/dev/shm");
8d953682 682 (void) mkdir(devshm, 0755);
2b85f4e1
LP
683 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
684 if (r < 0) {
685 r = -errno;
686 goto fail;
687 }
688
63c372cb 689 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 690 (void) mkdir(devmqueue, 0755);
3164e3cb 691 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 692
63c372cb 693 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 694 (void) mkdir(devhugepages, 0755);
3164e3cb 695 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 696
63c372cb 697 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 698 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 699
7f112f50 700 NULSTR_FOREACH(d, devnodes) {
16498617 701 r = clone_device_node(d, temporary_mount, &can_mknod);
af984e13
ZJS
702 /* ENXIO means the the *source* is not a device file, skip creation in that case */
703 if (r < 0 && r != -ENXIO)
2b85f4e1 704 goto fail;
7f112f50
LP
705 }
706
03cfe0d5 707 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 708
ee818b89
AC
709 /* Create the /dev directory if missing. It is more likely to be
710 * missing when the service is started with RootDirectory. This is
711 * consistent with mount units creating the mount points when missing.
712 */
34de407a 713 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 714
9e5f8252 715 /* Unmount everything in old /dev */
34de407a
LP
716 umount_recursive(mount_entry_path(m), 0);
717 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
718 r = -errno;
719 goto fail;
720 }
7f112f50 721
2b85f4e1
LP
722 rmdir(dev);
723 rmdir(temporary_mount);
7f112f50 724
2b85f4e1 725 return 0;
7f112f50 726
2b85f4e1
LP
727fail:
728 if (devpts)
729 umount(devpts);
7f112f50 730
2b85f4e1
LP
731 if (devshm)
732 umount(devshm);
7f112f50 733
2b85f4e1
LP
734 if (devhugepages)
735 umount(devhugepages);
7f112f50 736
2b85f4e1
LP
737 if (devmqueue)
738 umount(devmqueue);
7f112f50 739
d267c5aa
ZJS
740 umount(dev);
741 rmdir(dev);
2b85f4e1 742 rmdir(temporary_mount);
7f112f50 743
2b85f4e1 744 return r;
7f112f50
LP
745}
746
2a2969fd 747static int mount_bind_dev(const MountEntry *m) {
5d997827
LP
748 int r;
749
750 assert(m);
751
752 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
753 * /dev. This is only used when RootDirectory= is set. */
754
645767d6
LP
755 (void) mkdir_p_label(mount_entry_path(m), 0755);
756
5d997827
LP
757 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
758 if (r < 0)
759 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
760 if (r > 0) /* make this a NOP if /dev is already a mount point */
761 return 0;
762
763 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
764 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
765
766 return 1;
767}
768
2a2969fd 769static int mount_sysfs(const MountEntry *m) {
5d997827
LP
770 int r;
771
772 assert(m);
773
645767d6
LP
774 (void) mkdir_p_label(mount_entry_path(m), 0755);
775
5d997827
LP
776 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
777 if (r < 0)
778 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
779 if (r > 0) /* make this a NOP if /sys is already a mount point */
780 return 0;
781
782 /* Bind mount the host's version so that we get all child mounts of it, too. */
783 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
784 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
785
786 return 1;
787}
788
2a2969fd 789static int mount_procfs(const MountEntry *m) {
5d997827
LP
790 int r;
791
792 assert(m);
793
645767d6
LP
794 (void) mkdir_p_label(mount_entry_path(m), 0755);
795
5d997827
LP
796 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
797 if (r < 0)
798 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
799 if (r > 0) /* make this a NOP if /proc is already a mount point */
800 return 0;
801
802 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
803 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
804 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
805
806 return 1;
807}
808
2abd4e38 809static int mount_tmpfs(const MountEntry *m) {
6c47cd7d
LP
810 assert(m);
811
2abd4e38 812 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
6c47cd7d
LP
813
814 (void) mkdir_p_label(mount_entry_path(m), 0755);
815 (void) umount_recursive(mount_entry_path(m), 0);
816
2abd4e38 817 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
6c47cd7d
LP
818 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
819
820 return 1;
821}
822
088696fe 823static int follow_symlink(
d2d6c096 824 const char *root_directory,
088696fe 825 MountEntry *m) {
d2d6c096 826
088696fe 827 _cleanup_free_ char *target = NULL;
8fceda93
LP
828 int r;
829
088696fe
LP
830 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
831 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
832 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
833 * end and already have a fully normalized name. */
8fceda93 834
088696fe
LP
835 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target);
836 if (r < 0)
837 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
838 if (r > 0) /* Reached the end, nothing more to resolve */
839 return 1;
8fceda93 840
088696fe
LP
841 if (m->n_followed >= CHASE_SYMLINKS_MAX) { /* put a boundary on things */
842 log_debug("Symlink loop on '%s'.", mount_entry_path(m));
843 return -ELOOP;
8fceda93 844 }
8fceda93 845
088696fe 846 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
8fceda93 847
088696fe
LP
848 free_and_replace(m->path_malloc, target);
849 m->has_prefix = true;
8fceda93 850
088696fe
LP
851 m->n_followed ++;
852
853 return 0;
8fceda93
LP
854}
855
ac0930c8 856static int apply_mount(
8fceda93 857 const char *root_directory,
89bd586c 858 MountEntry *m) {
ac0930c8 859
a227a4be 860 bool rbind = true, make = false;
15ae422b 861 const char *what;
15ae422b 862 int r;
15ae422b 863
c17ec25e 864 assert(m);
15ae422b 865
34de407a 866 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 867
c17ec25e 868 switch (m->mode) {
15ae422b 869
160cfdbe
LP
870 case INACCESSIBLE: {
871 struct stat target;
6d313367
LP
872
873 /* First, get rid of everything that is below if there
874 * is anything... Then, overmount it with an
c4b41707 875 * inaccessible path. */
34de407a 876 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 877
088696fe
LP
878 if (lstat(mount_entry_path(m), &target) < 0) {
879 if (errno == ENOENT && m->ignore)
880 return 0;
881
34de407a 882 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
088696fe 883 }
15ae422b 884
c4b41707 885 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
886 if (!what) {
887 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
888 return -ELOOP;
889 }
890 break;
160cfdbe 891 }
fe3c2583 892
15ae422b 893 case READONLY:
15ae422b 894 case READWRITE:
8fceda93 895 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
088696fe
LP
896 if (r == -ENOENT && m->ignore)
897 return 0;
d944dc95 898 if (r < 0)
34de407a 899 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
900 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
901 return 0;
6b7c9f8b 902 /* This isn't a mount point yet, let's make it one. */
34de407a 903 what = mount_entry_path(m);
6b7c9f8b 904 break;
15ae422b 905
d2d6c096
LP
906 case BIND_MOUNT:
907 rbind = false;
d2d6c096 908
4831981d 909 _fallthrough_;
088696fe
LP
910 case BIND_MOUNT_RECURSIVE: {
911 _cleanup_free_ char *chased = NULL;
5d997827 912
088696fe
LP
913 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
914 * mount source paths are always relative to the host root, hence we pass NULL as root directory to
915 * chase_symlinks() here. */
916
917 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
918 if (r == -ENOENT && m->ignore) {
919 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
920 return 0;
921 }
922 if (r < 0)
923 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
924
925 log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
926
927 free_and_replace(m->source_malloc, chased);
d2d6c096
LP
928
929 what = mount_entry_source(m);
a227a4be 930 make = true;
d2d6c096 931 break;
088696fe 932 }
d2d6c096 933
6c47cd7d 934 case EMPTY_DIR:
2abd4e38
YW
935 case TMPFS:
936 return mount_tmpfs(m);
6c47cd7d 937
ac0930c8 938 case PRIVATE_TMP:
89bd586c 939 what = mount_entry_source(m);
a227a4be 940 make = true;
15ae422b 941 break;
e364ad06 942
d6797c92 943 case PRIVATE_DEV:
5d997827
LP
944 return mount_private_dev(m);
945
946 case BIND_DEV:
947 return mount_bind_dev(m);
948
949 case SYSFS:
950 return mount_sysfs(m);
951
952 case PROCFS:
953 return mount_procfs(m);
d6797c92 954
e364ad06
LP
955 default:
956 assert_not_reached("Unknown mode");
15ae422b
LP
957 }
958
ac0930c8 959 assert(what);
15ae422b 960
a227a4be
LP
961 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
962 bool try_again = false;
963 r = -errno;
964
965 if (r == -ENOENT && make) {
966 struct stat st;
967
968 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
969
e8717862
LP
970 if (stat(what, &st) < 0)
971 log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
972 else {
973 int q;
a227a4be
LP
974
975 (void) mkdir_parents(mount_entry_path(m), 0755);
976
977 if (S_ISDIR(st.st_mode))
e8717862 978 q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
a227a4be 979 else
e8717862
LP
980 q = touch(mount_entry_path(m));
981
982 if (q < 0)
983 log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
984 else
985 try_again = true;
a227a4be
LP
986 }
987 }
988
989 if (try_again) {
990 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
991 r = -errno;
992 else
993 r = 0;
994 }
995
996 if (r < 0)
997 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
998 }
6b7c9f8b 999
34de407a 1000 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 1001 return 0;
ac0930c8 1002}
15ae422b 1003
2a2969fd 1004static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 1005 int r = 0;
15ae422b 1006
c17ec25e 1007 assert(m);
ac9de0b3 1008 assert(proc_self_mountinfo);
ac0930c8 1009
2abd4e38
YW
1010 if (mount_entry_read_only(m)) {
1011 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
1012 /* Make superblock readonly */
1013 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
1014 r = -errno;
1015 } else
1016 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
1017 } else if (m->mode == PRIVATE_DEV) {
1018 /* Superblock can be readonly but the submounts can't */
34de407a 1019 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 1020 r = -errno;
737ba3c8 1021 } else
6b7c9f8b
LP
1022 return 0;
1023
1024 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
1025 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
1026 * read-only mounts already applied. */
ac0930c8 1027
8fceda93
LP
1028 if (r == -ENOENT && m->ignore)
1029 r = 0;
5327c910 1030
1d54cd5d 1031 return r;
d944dc95
LP
1032}
1033
bb0ff3fb 1034static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
5d997827
LP
1035 assert(ns_info);
1036
9c988f93
DH
1037 /*
1038 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1039 * since to protect the API VFS mounts, they need to be around in the
1040 * first place... and RootDirectory= or RootImage= need to be set.
1041 */
5d997827 1042
9c988f93
DH
1043 /* root_directory should point to a mount point */
1044 return root_directory &&
1045 (ns_info->mount_apivfs ||
1046 ns_info->protect_control_groups ||
1047 ns_info->protect_kernel_tunables);
5d997827
LP
1048}
1049
2652c6c1 1050static unsigned namespace_calculate_mounts(
9c988f93 1051 const char* root_directory,
bb0ff3fb 1052 const NamespaceInfo *ns_info,
2652c6c1
DH
1053 char** read_write_paths,
1054 char** read_only_paths,
1055 char** inaccessible_paths,
6c47cd7d 1056 char** empty_directories,
d2d6c096 1057 unsigned n_bind_mounts,
2abd4e38 1058 unsigned n_temporary_filesystems,
2652c6c1
DH
1059 const char* tmp_dir,
1060 const char* var_tmp_dir,
2652c6c1
DH
1061 ProtectHome protect_home,
1062 ProtectSystem protect_system) {
1063
b6c432ca 1064 unsigned protect_home_cnt;
f471b2af
DH
1065 unsigned protect_system_cnt =
1066 (protect_system == PROTECT_SYSTEM_STRICT ?
1067 ELEMENTSOF(protect_system_strict_table) :
1068 ((protect_system == PROTECT_SYSTEM_FULL) ?
1069 ELEMENTSOF(protect_system_full_table) :
1070 ((protect_system == PROTECT_SYSTEM_YES) ?
1071 ELEMENTSOF(protect_system_yes_table) : 0)));
1072
b6c432ca
DH
1073 protect_home_cnt =
1074 (protect_home == PROTECT_HOME_YES ?
1075 ELEMENTSOF(protect_home_yes_table) :
1076 ((protect_home == PROTECT_HOME_READ_ONLY) ?
e4da7d8c
YW
1077 ELEMENTSOF(protect_home_read_only_table) :
1078 ((protect_home == PROTECT_HOME_TMPFS) ?
1079 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
b6c432ca 1080
2652c6c1
DH
1081 return !!tmp_dir + !!var_tmp_dir +
1082 strv_length(read_write_paths) +
1083 strv_length(read_only_paths) +
1084 strv_length(inaccessible_paths) +
6c47cd7d 1085 strv_length(empty_directories) +
d2d6c096 1086 n_bind_mounts +
2abd4e38 1087 n_temporary_filesystems +
c575770b
DH
1088 ns_info->private_dev +
1089 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1090 (ns_info->protect_control_groups ? 1 : 0) +
1091 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 1092 protect_home_cnt + protect_system_cnt +
9c988f93 1093 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
1094}
1095
f8b64b57
LP
1096static void normalize_mounts(const char *root_directory, MountEntry *mounts, unsigned *n_mounts) {
1097 assert(n_mounts);
1098 assert(mounts || *n_mounts == 0);
1099
1100 qsort_safe(mounts, *n_mounts, sizeof(MountEntry), mount_path_compare);
1101
1102 drop_duplicates(mounts, n_mounts);
1103 drop_outside_root(root_directory, mounts, n_mounts);
1104 drop_inaccessible(mounts, n_mounts);
1105 drop_nop(mounts, n_mounts);
1106}
1107
/* Sets up a new mount namespace for the calling process according to the passed configuration.
 *
 * Rough sequence, as implemented below:
 *   1. If root_image is set, attach it to a loop device, load its root hash, dissect and decrypt it.
 *   2. Pick the root to operate on: root_directory if given, otherwise "/run/systemd/unit-root" when a
 *      root image, bind mounts or temporary file systems are configured, otherwise none.
 *   3. Build the array of mount entries (on the stack), prefix them with the root and normalize them.
 *   4. unshare(CLONE_NEWNS), and remount "/" as MS_SLAVE|MS_REC if a root is used or any mounts exist.
 *   5. Mount the root image, or turn the root directory into a bind mount, or bind mount "/" to the root.
 *   6. First round: apply all mount entries, restarting whenever follow_symlink() rewrote an entry.
 *      Second round: call make_read_only() on every entry.
 *   7. Move the root with mount_move_root() and remount "/" with mount_flags|MS_REC.
 *
 * A mount_flags value of 0 is replaced by MS_SHARED. Returns 0 on success and a negative errno-style
 * value on failure. */
613b411c 1108int setup_namespace(
ee818b89 1109 const char* root_directory,
915e6d16 1110 const char* root_image,
bb0ff3fb 1111 const NamespaceInfo *ns_info,
2a624c36
AP
1112 char** read_write_paths,
1113 char** read_only_paths,
1114 char** inaccessible_paths,
6c47cd7d 1115 char** empty_directories,
d2d6c096
LP
1116 const BindMount *bind_mounts,
1117 unsigned n_bind_mounts,
2abd4e38
YW
1118 const TemporaryFileSystem *temporary_filesystems,
1119 unsigned n_temporary_filesystems,
a004cb4c
LP
1120 const char* tmp_dir,
1121 const char* var_tmp_dir,
1b8689f9
LP
1122 ProtectHome protect_home,
1123 ProtectSystem protect_system,
915e6d16
LP
1124 unsigned long mount_flags,
1125 DissectImageFlags dissect_image_flags) {
15ae422b 1126
915e6d16 1127 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 1128 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 1129 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 1130 _cleanup_free_ void *root_hash = NULL;
34de407a 1131 MountEntry *m, *mounts = NULL;
78ebe980 1132 size_t root_hash_size = 0;
d944dc95 1133 bool make_slave = false;
e908468b 1134 const char *root;
f0a4feb0 1135 unsigned n_mounts;
d18aff04 1136 bool require_prefix = false;
c17ec25e 1137 int r = 0;
15ae422b 1138
915e6d16
LP
1139 assert(ns_info);
1140
613b411c 1141 if (mount_flags == 0)
c17ec25e 1142 mount_flags = MS_SHARED;
ac0930c8 1143
915e6d16
LP
/* With a root image: prepare loop device, root hash, dissected and decrypted image up front. The early
 * returns in this section are fine, since no mount entries have been allocated yet. */
1144 if (root_image) {
1145 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1146
c9ef8573
LP
1147 if (protect_system == PROTECT_SYSTEM_STRICT &&
1148 protect_home != PROTECT_HOME_NO &&
1149 strv_isempty(read_write_paths))
915e6d16
LP
1150 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1151
1152 r = loop_device_make_by_path(root_image,
1153 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1154 &loop_device);
1155 if (r < 0)
1156 return r;
1157
78ebe980
LP
1158 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1159 if (r < 0)
1160 return r;
1161
1162 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1163 if (r < 0)
1164 return r;
1165
1166 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
1167 if (r < 0)
1168 return r;
915e6d16
LP
1169 }
1170
e908468b
LP
1171 if (root_directory)
1172 root = root_directory;
2abd4e38 1173 else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
e908468b
LP
1174
1175 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1176 * the same mount point for all images, which is safe, since they all live in their own namespaces
1177 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1178 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1179 * while we are applying them. */
1180
1181 root = "/run/systemd/unit-root";
1182 (void) mkdir_label(root, 0700);
d18aff04 1183 require_prefix = true;
e908468b
LP
1184 } else
1185 root = NULL;
1186
cfbeb4ef 1187 n_mounts = namespace_calculate_mounts(
e908468b 1188 root,
cfbeb4ef
LP
1189 ns_info,
1190 read_write_paths,
1191 read_only_paths,
1192 inaccessible_paths,
6c47cd7d 1193 empty_directories,
f5c52a77 1194 n_bind_mounts,
2abd4e38 1195 n_temporary_filesystems,
cfbeb4ef
LP
1196 tmp_dir, var_tmp_dir,
1197 protect_home, protect_system);
613b411c 1198
2652c6c1 1199 /* Set mount slave mode */
e908468b 1200 if (root || n_mounts > 0)
d944dc95
LP
1201 make_slave = true;
1202
/* Fill in the mount entry array. It is allocated zeroed on the stack with exactly n_mounts slots; m is
 * the write cursor, and the assert() after the last append checks that every slot was used. */
f0a4feb0 1203 if (n_mounts > 0) {
34de407a 1204 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
d18aff04 1205 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1206 if (r < 0)
f0a4feb0 1207 goto finish;
613b411c 1208
d18aff04 1209 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1210 if (r < 0)
f0a4feb0 1211 goto finish;
613b411c 1212
d18aff04 1213 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1214 if (r < 0)
f0a4feb0 1215 goto finish;
7ff7394d 1216
6c47cd7d
LP
1217 r = append_empty_dir_mounts(&m, empty_directories);
1218 if (r < 0)
1219 goto finish;
1220
d2d6c096
LP
1221 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1222 if (r < 0)
1223 goto finish;
1224
2abd4e38
YW
1225 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1226 if (r < 0)
1227 goto finish;
1228
613b411c 1229 if (tmp_dir) {
34de407a 1230 *(m++) = (MountEntry) {
5327c910
LP
1231 .path_const = "/tmp",
1232 .mode = PRIVATE_TMP,
89bd586c 1233 .source_const = tmp_dir,
5327c910 1234 };
613b411c 1235 }
7ff7394d 1236
613b411c 1237 if (var_tmp_dir) {
34de407a 1238 *(m++) = (MountEntry) {
5327c910 1239 .path_const = "/var/tmp",
89bd586c
YW
1240 .mode = PRIVATE_TMP,
1241 .source_const = var_tmp_dir,
5327c910 1242 };
7ff7394d 1243 }
ac0930c8 1244
c575770b 1245 if (ns_info->private_dev) {
34de407a 1246 *(m++) = (MountEntry) {
5327c910
LP
1247 .path_const = "/dev",
1248 .mode = PRIVATE_DEV,
1249 };
7f112f50
LP
1250 }
1251
c575770b 1252 if (ns_info->protect_kernel_tunables) {
5327c910 1253 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1254 if (r < 0)
f0a4feb0 1255 goto finish;
c575770b
DH
1256 }
1257
1258 if (ns_info->protect_kernel_modules) {
5327c910 1259 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1260 if (r < 0)
f0a4feb0 1261 goto finish;
c575770b 1262 }
59eeb84b 1263
c575770b 1264 if (ns_info->protect_control_groups) {
34de407a 1265 *(m++) = (MountEntry) {
5327c910
LP
1266 .path_const = "/sys/fs/cgroup",
1267 .mode = READONLY,
1268 };
59eeb84b
LP
1269 }
1270
5327c910 1271 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1272 if (r < 0)
f0a4feb0 1273 goto finish;
417116f2 1274
5327c910 1275 r = append_protect_system(&m, protect_system, false);
f471b2af 1276 if (r < 0)
f0a4feb0 1277 goto finish;
417116f2 1278
e908468b 1279 if (namespace_info_mount_apivfs(root, ns_info)) {
5d997827
LP
1280 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1281 if (r < 0)
1282 goto finish;
1283 }
1284
f0a4feb0 1285 assert(mounts + n_mounts == m);
ac0930c8 1286
5327c910 1287 /* Prepend the root directory where that's necessary */
e908468b 1288 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1289 if (r < 0)
1290 goto finish;
1291
f8b64b57 1292 normalize_mounts(root_directory, mounts, &n_mounts);
15ae422b
LP
1293 }
1294
d944dc95
LP
1295 if (unshare(CLONE_NEWNS) < 0) {
1296 r = -errno;
1297 goto finish;
1298 }
1e4e94c8 1299
d944dc95 1300 if (make_slave) {
c2c13f2d
LP
1301 /* Remount / as SLAVE so that nothing now mounted in the namespace
1302 shows up in the parent */
d944dc95
LP
1303 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1304 r = -errno;
1305 goto finish;
1306 }
ee818b89
AC
1307 }
1308
915e6d16 1309 if (root_image) {
e908468b 1310 /* A root image is specified, mount it to the right place */
2d3a5a73 1311 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
915e6d16
LP
1312 if (r < 0)
1313 goto finish;
1314
07ce7407
TM
1315 if (decrypted_image) {
1316 r = decrypted_image_relinquish(decrypted_image);
1317 if (r < 0)
1318 goto finish;
1319 }
78ebe980
915e6d16
LP
1321 loop_device_relinquish(loop_device);
1322
1323 } else if (root_directory) {
1324
e908468b
LP
1325 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1326 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1327 if (r < 0)
d944dc95 1328 goto finish;
8f1ad200 1329 if (r == 0) {
e908468b 1330 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
8f1ad200
LP
1331 r = -errno;
1332 goto finish;
1333 }
d944dc95 1334 }
e908468b
LP
1335
1336 } else if (root) {
1337
1338 /* Let's mount the main root directory to the root directory to use */
1339 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1340 r = -errno;
1341 goto finish;
1342 }
ee818b89 1343 }
c2c13f2d 1344
4e0c20de
LP
1345 /* Try to set up the new root directory before mounting anything else there. */
1346 if (root_image || root_directory)
1347 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1348
f0a4feb0 1349 if (n_mounts > 0) {
ac9de0b3 1350 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1351 char **blacklist;
1352 unsigned j;
1353
ac9de0b3
TR
1354 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1355 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1356 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1357 if (!proc_self_mountinfo) {
1358 r = -errno;
1359 goto finish;
1360 }
1361
088696fe
LP
1362 /* First round, establish all mounts we need */
1363 for (;;) {
1364 bool again = false;
1365
1366 for (m = mounts; m < mounts + n_mounts; ++m) {
1367
/* Entries applied in an earlier iteration of the outer loop are skipped */
1368 if (m->applied)
1369 continue;
1370
1371 r = follow_symlink(root, m);
1372 if (r < 0)
1373 goto finish;
1374 if (r == 0) {
1375 /* We hit a symlinked mount point. The entry got rewritten and might point to a
1376 * very different place now. Let's normalize the changed list, and start from
1377 * the beginning. After all to mount the entry at the new location we might
1378 * need some other mounts first */
1379 again = true;
1380 break;
1381 }
1382
1383 r = apply_mount(root, m);
1384 if (r < 0)
1385 goto finish;
1386
1387 m->applied = true;
1388 }
1389
1390 if (!again)
1391 break;
1392
1393 normalize_mounts(root_directory, mounts, &n_mounts);
c2c13f2d 1394 }
15ae422b 1395
6b7c9f8b 1396 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1397 blacklist = newa(char*, n_mounts+1);
1398 for (j = 0; j < n_mounts; j++)
34de407a 1399 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1400 blacklist[j] = NULL;
1401
1402 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1403 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1404 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1405 if (r < 0)
d944dc95 1406 goto finish;
c2c13f2d 1407 }
15ae422b
LP
1408 }
1409
e908468b 1410 if (root) {
ee818b89 1411 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
e908468b 1412 r = mount_move_root(root);
d944dc95
LP
1413 if (r < 0)
1414 goto finish;
ee818b89
AC
1415 }
1416
55fe7432 1417 /* Remount / as the desired mode. Note that this will not
c2c13f2d
LP
1418 * reestablish propagation from our side to the host, since
1419 * what's disconnected is disconnected. */
d944dc95
LP
1420 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1421 r = -errno;
1422 goto finish;
1423 }
15ae422b 1424
d944dc95 1425 r = 0;
15ae422b 1426
/* Common exit path for success and failure: mount_entry_done() is run on every entry still in the
 * array, and r carries the result. */
d944dc95 1427finish:
f0a4feb0 1428 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1429 mount_entry_done(m);
613b411c
LP
1430
1431 return r;
1432}
1433
d2d6c096
LP
1434void bind_mount_free_many(BindMount *b, unsigned n) {
1435 unsigned i;
1436
1437 assert(b || n == 0);
1438
1439 for (i = 0; i < n; i++) {
1440 free(b[i].source);
1441 free(b[i].destination);
1442 }
1443
1444 free(b);
1445}
1446
1447int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1448 _cleanup_free_ char *s = NULL, *d = NULL;
1449 BindMount *c;
1450
1451 assert(b);
1452 assert(n);
1453 assert(item);
1454
1455 s = strdup(item->source);
1456 if (!s)
1457 return -ENOMEM;
1458
1459 d = strdup(item->destination);
1460 if (!d)
1461 return -ENOMEM;
1462
aa484f35 1463 c = reallocarray(*b, *n + 1, sizeof(BindMount));
d2d6c096
LP
1464 if (!c)
1465 return -ENOMEM;
1466
1467 *b = c;
1468
1469 c[(*n) ++] = (BindMount) {
1cc6c93a
YW
1470 .source = TAKE_PTR(s),
1471 .destination = TAKE_PTR(d),
d2d6c096
LP
1472 .read_only = item->read_only,
1473 .recursive = item->recursive,
1474 .ignore_enoent = item->ignore_enoent,
1475 };
1476
d2d6c096
LP
1477 return 0;
1478}
1479
2abd4e38
YW
1480void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1481 unsigned i;
1482
1483 assert(t || n == 0);
1484
1485 for (i = 0; i < n; i++) {
1486 free(t[i].path);
1487 free(t[i].options);
1488 }
1489
1490 free(t);
1491}
1492
1493int temporary_filesystem_add(
1494 TemporaryFileSystem **t,
1495 unsigned *n,
1496 const char *path,
1497 const char *options) {
1498
1499 _cleanup_free_ char *p = NULL, *o = NULL;
1500 TemporaryFileSystem *c;
1501
1502 assert(t);
1503 assert(n);
1504 assert(path);
1505
1506 p = strdup(path);
1507 if (!p)
1508 return -ENOMEM;
1509
1510 if (!isempty(options)) {
1511 o = strdup(options);
1512 if (!o)
1513 return -ENOMEM;
1514 }
1515
aa484f35 1516 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
2abd4e38
YW
1517 if (!c)
1518 return -ENOMEM;
1519
1520 *t = c;
1521
1522 c[(*n) ++] = (TemporaryFileSystem) {
1cc6c93a
YW
1523 .path = TAKE_PTR(p),
1524 .options = TAKE_PTR(o),
2abd4e38
YW
1525 };
1526
2abd4e38
YW
1527 return 0;
1528}
1529
613b411c
LP
1530static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1531 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1532 char bid[SD_ID128_STRING_MAX];
1533 sd_id128_t boot_id;
1534 int r;
613b411c
LP
1535
1536 assert(id);
1537 assert(prefix);
1538 assert(path);
1539
6b46ea73
LP
1540 /* We include the boot id in the directory so that after a
1541 * reboot we can easily identify obsolete directories. */
1542
1543 r = sd_id128_get_boot(&boot_id);
1544 if (r < 0)
1545 return r;
1546
605405c6 1547 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1548 if (!x)
1549 return -ENOMEM;
1550
1551 RUN_WITH_UMASK(0077)
1552 if (!mkdtemp(x))
1553 return -errno;
1554
1555 RUN_WITH_UMASK(0000) {
1556 char *y;
1557
63c372cb 1558 y = strjoina(x, "/tmp");
613b411c
LP
1559
1560 if (mkdir(y, 0777 | S_ISVTX) < 0)
1561 return -errno;
c17ec25e 1562 }
15ae422b 1563
1cc6c93a 1564 *path = TAKE_PTR(x);
613b411c
LP
1565
1566 return 0;
1567}
1568
/* Creates the pair of private temporary directories for the unit identified by 'id', one below /tmp and
 * one below /var/tmp, using setup_one_tmp_dir() for each.
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir receive the newly allocated directory paths. If
 * the second directory cannot be set up, the first one is removed again and the error is returned; the
 * output parameters are left untouched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the outer one */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1598
1599int setup_netns(int netns_storage_socket[2]) {
1600 _cleanup_close_ int netns = -1;
3ee897d6 1601 int r, q;
613b411c
LP
1602
1603 assert(netns_storage_socket);
1604 assert(netns_storage_socket[0] >= 0);
1605 assert(netns_storage_socket[1] >= 0);
1606
1607 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1608 * namespace reference fd. Whatever process runs this first
1609 * shall create a new namespace, all others should just join
1610 * it. To serialize that we use a file lock on the socket
1611 * pair.
613b411c
LP
1612 *
1613 * It's a bit crazy, but hey, works great! */
1614
1615 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1616 return -errno;
1617
3ee897d6
LP
1618 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1619 if (netns == -EAGAIN) {
613b411c
LP
1620 /* Nothing stored yet, so let's create a new namespace */
1621
1622 if (unshare(CLONE_NEWNET) < 0) {
1623 r = -errno;
1624 goto fail;
1625 }
1626
1627 loopback_setup();
1628
1629 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1630 if (netns < 0) {
1631 r = -errno;
1632 goto fail;
1633 }
1634
1635 r = 1;
613b411c 1636
3ee897d6
LP
1637 } else if (netns < 0) {
1638 r = netns;
1639 goto fail;
613b411c 1640
3ee897d6
LP
1641 } else {
1642 /* Yay, found something, so let's join the namespace */
613b411c
LP
1643 if (setns(netns, CLONE_NEWNET) < 0) {
1644 r = -errno;
1645 goto fail;
1646 }
1647
1648 r = 0;
1649 }
1650
3ee897d6
LP
1651 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1652 if (q < 0) {
1653 r = q;
613b411c
LP
1654 goto fail;
1655 }
1656
1657fail:
fe048ce5 1658 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1659 return r;
1660}
417116f2 1661
6e2d7c4f
MS
1662bool ns_type_supported(NamespaceType type) {
1663 const char *t, *ns_proc;
1664
0fa5b831
LP
1665 t = namespace_type_to_string(type);
1666 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
1667 return false;
1668
6e2d7c4f 1669 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
1670 return access(ns_proc, F_OK) == 0;
1671}
1672
1b8689f9
LP
1673static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1674 [PROTECT_HOME_NO] = "no",
1675 [PROTECT_HOME_YES] = "yes",
1676 [PROTECT_HOME_READ_ONLY] = "read-only",
e4da7d8c 1677 [PROTECT_HOME_TMPFS] = "tmpfs",
417116f2
LP
1678};
1679
1b8689f9
LP
1680DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1681
5e1c6154
YW
1682ProtectHome parse_protect_home_or_bool(const char *s) {
1683 int r;
1684
1685 r = parse_boolean(s);
1686 if (r > 0)
1687 return PROTECT_HOME_YES;
1688 if (r == 0)
1689 return PROTECT_HOME_NO;
1690
1691 return protect_home_from_string(s);
1692}
1693
1b8689f9
LP
1694static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1695 [PROTECT_SYSTEM_NO] = "no",
1696 [PROTECT_SYSTEM_YES] = "yes",
1697 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1698 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1699};
1700
1701DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
6e2d7c4f 1702
03c791aa
YW
1703ProtectSystem parse_protect_system_or_bool(const char *s) {
1704 int r;
1705
1706 r = parse_boolean(s);
1707 if (r > 0)
1708 return PROTECT_SYSTEM_YES;
1709 if (r == 0)
1710 return PROTECT_SYSTEM_NO;
1711
1712 return protect_system_from_string(s);
1713}
1714
6e2d7c4f
MS
1715static const char* const namespace_type_table[] = {
1716 [NAMESPACE_MOUNT] = "mnt",
1717 [NAMESPACE_CGROUP] = "cgroup",
1718 [NAMESPACE_UTS] = "uts",
1719 [NAMESPACE_IPC] = "ipc",
1720 [NAMESPACE_USER] = "user",
1721 [NAMESPACE_PID] = "pid",
1722 [NAMESPACE_NET] = "net",
1723};
1724
1725DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);