]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core: split read_mount_options helper out for reuse
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
15ae422b
LP
2
3#include <errno.h>
e08f94ac 4#include <linux/loop.h>
07630cea 5#include <sched.h>
15ae422b 6#include <stdio.h>
07630cea 7#include <sys/mount.h>
07630cea 8#include <unistd.h>
25e870b5 9#include <linux/fs.h>
15ae422b 10
b5efdb8a 11#include "alloc-util.h"
10404d52 12#include "base-filesystem.h"
7f112f50 13#include "dev-setup.h"
3ffd4af2 14#include "fd-util.h"
e5f10caf 15#include "format-util.h"
d944dc95 16#include "fs-util.h"
e908468b 17#include "label.h"
b3d13314 18#include "list.h"
915e6d16 19#include "loop-util.h"
07630cea 20#include "loopback-setup.h"
07630cea 21#include "mkdir.h"
4349cd7c 22#include "mount-util.h"
049af8ad 23#include "mountpoint-util.h"
0cb8e3d1 24#include "namespace-util.h"
3ffd4af2 25#include "namespace.h"
d8b4d14d 26#include "nulstr-util.h"
07630cea 27#include "path-util.h"
d7b8eec7 28#include "selinux-util.h"
2583fbea 29#include "socket-util.h"
760877e9 30#include "sort-util.h"
36ce7110 31#include "stat-util.h"
8b43440b 32#include "string-table.h"
07630cea
LP
33#include "string-util.h"
34#include "strv.h"
a652f050 35#include "tmpfile-util.h"
affb60b1 36#include "umask-util.h"
ee104e11 37#include "user-util.h"
15ae422b 38
737ba3c8 39#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
40
/* The kind of operation a MountEntry requests. The numeric order is significant: mount_path_compare()
 * sorts entries for the same path by this value, and drop_duplicates() keeps the first one. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        MOUNT_IMAGES,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_TMP_READONLY,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        RUN,
        READONLY,
        READWRITE,
        TMPFS,
        READWRITE_IMPLICIT,     /* Should have the lowest priority. */
        _MOUNT_MODE_MAX,
} MountMode;
15ae422b 61
34de407a 62typedef struct MountEntry {
5327c910 63 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 64 MountMode mode:5;
5327c910
LP
65 bool ignore:1; /* Ignore if path does not exist? */
66 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 67 bool read_only:1; /* Shall this mount point be read-only? */
9ce4e4b0 68 bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
088696fe 69 bool applied:1; /* Already applied */
55fe7432 70 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
b3d13314 71 const char *source_const; /* The source path, for bind mounts or images */
d2d6c096 72 char *source_malloc;
2abd4e38
YW
73 const char *options_const;/* Mount options for tmpfs */
74 char *options_malloc;
75 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
088696fe 76 unsigned n_followed;
427353f6 77 LIST_HEAD(MountOptions, image_options);
34de407a 78} MountEntry;
15ae422b 79
94293d65 80/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted
3fe91079 81 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
5d997827
LP
82static const MountEntry apivfs_table[] = {
83 { "/proc", PROCFS, false },
84 { "/dev", BIND_DEV, false },
85 { "/sys", SYSFS, false },
94293d65 86 { "/run", RUN, false, .options_const = "mode=755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
5d997827 87};
f471b2af 88
11a30cec 89/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 90static const MountEntry protect_kernel_tunables_table[] = {
1e05071d
YW
91 { "/proc/acpi", READONLY, true },
92 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
93 { "/proc/asound", READONLY, true },
94 { "/proc/bus", READONLY, true },
95 { "/proc/fs", READONLY, true },
96 { "/proc/irq", READONLY, true },
97 { "/proc/kallsyms", INACCESSIBLE, true },
98 { "/proc/kcore", INACCESSIBLE, true },
99 { "/proc/latency_stats", READONLY, true },
100 { "/proc/mtrr", READONLY, true },
101 { "/proc/scsi", READONLY, true },
4e399953 102 { "/proc/sys", READONLY, true },
1e05071d
YW
103 { "/proc/sysrq-trigger", READONLY, true },
104 { "/proc/timer_stats", READONLY, true },
105 { "/sys", READONLY, false },
106 { "/sys/fs/bpf", READONLY, true },
107 { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
108 { "/sys/fs/selinux", READWRITE_IMPLICIT, true },
109 { "/sys/kernel/debug", READONLY, true },
110 { "/sys/kernel/tracing", READONLY, true },
11a30cec
DH
111};
112
c575770b 113/* ProtectKernelModules= option */
34de407a 114static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 115#if HAVE_SPLIT_USR
c6232fb0 116 { "/lib/modules", INACCESSIBLE, true },
c575770b 117#endif
c6232fb0 118 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
119};
120
94a7b275
KK
121/* ProtectKernelLogs= option */
122static const MountEntry protect_kernel_logs_table[] = {
123 { "/proc/kmsg", INACCESSIBLE, true },
124 { "/dev/kmsg", INACCESSIBLE, true },
125};
126
b6c432ca
DH
127/*
128 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
129 * system should be protected by ProtectSystem=
130 */
34de407a 131static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
132 { "/home", READONLY, true },
133 { "/run/user", READONLY, true },
134 { "/root", READONLY, true },
b6c432ca
DH
135};
136
e4da7d8c
YW
137/* ProtectHome=tmpfs table */
138static const MountEntry protect_home_tmpfs_table[] = {
7d85383e
TM
139 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
140 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
141 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
e4da7d8c
YW
142};
143
b6c432ca 144/* ProtectHome=yes table */
34de407a 145static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
146 { "/home", INACCESSIBLE, true },
147 { "/run/user", INACCESSIBLE, true },
148 { "/root", INACCESSIBLE, true },
b6c432ca
DH
149};
150
f471b2af 151/* ProtectSystem=yes table */
34de407a 152static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
153 { "/usr", READONLY, false },
154 { "/boot", READONLY, true },
155 { "/efi", READONLY, true },
7486f305
AB
156#if HAVE_SPLIT_USR
157 { "/lib", READONLY, true },
158 { "/lib64", READONLY, true },
159 { "/bin", READONLY, true },
671f0f8d 160# if HAVE_SPLIT_BIN
7486f305 161 { "/sbin", READONLY, true },
671f0f8d 162# endif
7486f305 163#endif
f471b2af
DH
164};
165
166/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 167static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
168 { "/usr", READONLY, false },
169 { "/boot", READONLY, true },
170 { "/efi", READONLY, true },
171 { "/etc", READONLY, false },
7486f305
AB
172#if HAVE_SPLIT_USR
173 { "/lib", READONLY, true },
174 { "/lib64", READONLY, true },
175 { "/bin", READONLY, true },
671f0f8d 176# if HAVE_SPLIT_BIN
7486f305 177 { "/sbin", READONLY, true },
671f0f8d 178# endif
7486f305 179#endif
f471b2af
DH
180};
181
182/*
183 * ProtectSystem=strict table. In this strict mode, we mount everything
184 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
185 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
186 * protect those, and these options should be fully orthogonal.
187 * (And of course /home and friends are also left writable, as ProtectHome=
188 * shall manage those, orthogonally).
189 */
34de407a 190static const MountEntry protect_system_strict_table[] = {
1e05071d
YW
191 { "/", READONLY, false },
192 { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
193 { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
194 { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */
195 { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */
196 { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */
197 { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */
f471b2af
DH
198};
199
5beb8688
YW
200static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
201 [INACCESSIBLE] = "inaccessible",
202 [BIND_MOUNT] = "bind",
203 [BIND_MOUNT_RECURSIVE] = "rbind",
204 [PRIVATE_TMP] = "private-tmp",
205 [PRIVATE_DEV] = "private-dev",
206 [BIND_DEV] = "bind-dev",
207 [EMPTY_DIR] = "empty",
208 [SYSFS] = "sysfs",
209 [PROCFS] = "procfs",
210 [READONLY] = "read-only",
211 [READWRITE] = "read-write",
212 [TMPFS] = "tmpfs",
b3d13314 213 [MOUNT_IMAGES] = "mount-images",
5beb8688
YW
214 [READWRITE_IMPLICIT] = "rw-implicit",
215};
216
217DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
218
34de407a 219static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
220 assert(p);
221
5327c910
LP
222 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
223 * otherwise the stack/static ->path field is returned. */
f0a4feb0 224
5327c910 225 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
226}
227
34de407a 228static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
229 assert(p);
230
56a13a49 231 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE, PRIVATE_TMP_READONLY);
cfbeb4ef
LP
232}
233
d2d6c096
LP
234static const char *mount_entry_source(const MountEntry *p) {
235 assert(p);
236
237 return p->source_malloc ?: p->source_const;
238}
239
2abd4e38
YW
240static const char *mount_entry_options(const MountEntry *p) {
241 assert(p);
242
243 return p->options_malloc ?: p->options_const;
244}
245
1eb7e08e
LP
246static void mount_entry_done(MountEntry *p) {
247 assert(p);
248
249 p->path_malloc = mfree(p->path_malloc);
250 p->source_malloc = mfree(p->source_malloc);
2abd4e38 251 p->options_malloc = mfree(p->options_malloc);
427353f6 252 p->image_options = mount_options_free_all(p->image_options);
1eb7e08e
LP
253}
254
d18aff04 255static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
256 char **i;
257
613b411c
LP
258 assert(p);
259
1e05071d 260 /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
5327c910 261
15ae422b 262 STRV_FOREACH(i, strv) {
5327c910
LP
263 bool ignore = false, needs_prefix = false;
264 const char *e = *i;
15ae422b 265
5327c910
LP
266 /* Look for any prefixes */
267 if (startswith(e, "-")) {
268 e++;
9c94d52e 269 ignore = true;
ea92ae33 270 }
5327c910
LP
271 if (startswith(e, "+")) {
272 e++;
273 needs_prefix = true;
274 }
ea92ae33 275
baaa35ad
ZJS
276 if (!path_is_absolute(e))
277 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
278 "Path is not absolute: %s", e);
15ae422b 279
34de407a 280 *((*p)++) = (MountEntry) {
5327c910
LP
281 .path_const = e,
282 .mode = mode,
283 .ignore = ignore,
d18aff04 284 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 285 };
15ae422b
LP
286 }
287
288 return 0;
289}
290
6c47cd7d
LP
291static int append_empty_dir_mounts(MountEntry **p, char **strv) {
292 char **i;
293
294 assert(p);
295
296 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
297 * "/private/" boundary directories for DynamicUser=1. */
298
299 STRV_FOREACH(i, strv) {
300
301 *((*p)++) = (MountEntry) {
302 .path_const = *i,
303 .mode = EMPTY_DIR,
304 .ignore = false,
6c47cd7d 305 .read_only = true,
7d85383e 306 .options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
2abd4e38 307 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
6c47cd7d
LP
308 };
309 }
310
311 return 0;
312}
313
da6053d0
LP
314static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
315 size_t i;
d2d6c096
LP
316
317 assert(p);
318
319 for (i = 0; i < n; i++) {
320 const BindMount *b = binds + i;
321
322 *((*p)++) = (MountEntry) {
323 .path_const = b->destination,
324 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
325 .read_only = b->read_only,
9ce4e4b0 326 .nosuid = b->nosuid,
d2d6c096 327 .source_const = b->source,
4ca763a9 328 .ignore = b->ignore_enoent,
d2d6c096
LP
329 };
330 }
331
332 return 0;
333}
334
b3d13314
LB
335static int append_mount_images(MountEntry **p, const MountImage *mount_images, size_t n) {
336 assert(p);
337
338 for (size_t i = 0; i < n; i++) {
339 const MountImage *m = mount_images + i;
340
341 *((*p)++) = (MountEntry) {
342 .path_const = m->destination,
343 .mode = MOUNT_IMAGES,
344 .source_const = m->source,
427353f6 345 .image_options = m->mount_options,
b3d13314
LB
346 .ignore = m->ignore_enoent,
347 };
348 }
349
350 return 0;
351}
352
da6053d0 353static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
2abd4e38
YW
354 assert(p);
355
b67ec8e5 356 for (size_t i = 0; i < n; i++) {
2abd4e38
YW
357 const TemporaryFileSystem *t = tmpfs + i;
358 _cleanup_free_ char *o = NULL, *str = NULL;
ad8e66dc 359 unsigned long flags;
2abd4e38 360 bool ro = false;
b67ec8e5 361 int r;
2abd4e38 362
baaa35ad
ZJS
363 if (!path_is_absolute(t->path))
364 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
365 "Path is not absolute: %s",
366 t->path);
2abd4e38 367
b67ec8e5 368 str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
ad8e66dc
AJ
369 if (!str)
370 return -ENOMEM;
2abd4e38 371
ad8e66dc
AJ
372 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
373 if (r < 0)
374 return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
2abd4e38 375
ad8e66dc
AJ
376 ro = flags & MS_RDONLY;
377 if (ro)
378 flags ^= MS_RDONLY;
2abd4e38
YW
379
380 *((*p)++) = (MountEntry) {
381 .path_const = t->path,
382 .mode = TMPFS,
383 .read_only = ro,
ad8e66dc 384 .options_malloc = TAKE_PTR(o),
2abd4e38
YW
385 .flags = flags,
386 };
2abd4e38
YW
387 }
388
389 return 0;
390}
391
da6053d0
LP
392static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
393 size_t i;
11a30cec
DH
394
395 assert(p);
f471b2af 396 assert(mounts);
11a30cec 397
5327c910 398 /* Adds a list of static pre-defined entries */
f471b2af 399
5327c910 400 for (i = 0; i < n; i++)
34de407a
LP
401 *((*p)++) = (MountEntry) {
402 .path_const = mount_entry_path(mounts+i),
5327c910
LP
403 .mode = mounts[i].mode,
404 .ignore = mounts[i].ignore || ignore_protect,
405 };
f471b2af
DH
406
407 return 0;
408}
409
34de407a 410static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
411 assert(p);
412
5327c910 413 switch (protect_home) {
b6c432ca 414
5327c910 415 case PROTECT_HOME_NO:
b6c432ca
DH
416 return 0;
417
b6c432ca 418 case PROTECT_HOME_READ_ONLY:
5327c910
LP
419 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
420
e4da7d8c
YW
421 case PROTECT_HOME_TMPFS:
422 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
423
b6c432ca 424 case PROTECT_HOME_YES:
5327c910
LP
425 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
426
b6c432ca 427 default:
5327c910 428 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 429 }
b6c432ca
DH
430}
431
34de407a 432static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
433 assert(p);
434
5327c910
LP
435 switch (protect_system) {
436
437 case PROTECT_SYSTEM_NO:
f471b2af
DH
438 return 0;
439
f471b2af 440 case PROTECT_SYSTEM_STRICT:
5327c910
LP
441 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
442
f471b2af 443 case PROTECT_SYSTEM_YES:
5327c910
LP
444 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
445
f471b2af 446 case PROTECT_SYSTEM_FULL:
5327c910
LP
447 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
448
f471b2af 449 default:
5327c910 450 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 451 }
11a30cec
DH
452}
453
93bab288 454static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
a0827e2b 455 int d;
15ae422b 456
6ee1a919 457 /* If the paths are not equal, then order prefixes first */
93bab288 458 d = path_compare(mount_entry_path(a), mount_entry_path(b));
6ee1a919
LP
459 if (d != 0)
460 return d;
15ae422b 461
6ee1a919 462 /* If the paths are equal, check the mode */
93bab288 463 return CMP((int) a->mode, (int) b->mode);
15ae422b
LP
464}
465
da6053d0
LP
466static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
467 size_t i;
5327c910 468
4a756839 469 /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
5327c910
LP
470
471 for (i = 0; i < n; i++) {
472 char *s;
473
474 if (m[i].has_prefix)
475 continue;
476
c6134d3e 477 s = path_join(root_directory, mount_entry_path(m+i));
5327c910
LP
478 if (!s)
479 return -ENOMEM;
480
e282f51f 481 free_and_replace(m[i].path_malloc, s);
5327c910
LP
482 m[i].has_prefix = true;
483 }
484
485 return 0;
486}
487
da6053d0 488static void drop_duplicates(MountEntry *m, size_t *n) {
34de407a 489 MountEntry *f, *t, *previous;
15ae422b 490
c17ec25e 491 assert(m);
15ae422b 492 assert(n);
15ae422b 493
fe3c2583
LP
494 /* Drops duplicate entries. Expects that the array is properly ordered already. */
495
1d54cd5d 496 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 497
fe3c2583 498 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
088696fe
LP
499 * above. Note that we only drop duplicates that haven't been mounted yet. */
500 if (previous &&
501 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
502 !f->applied && !previous->applied) {
5beb8688 503 log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
34de407a 504 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 505 mount_entry_done(f);
15ae422b 506 continue;
fe3c2583 507 }
15ae422b 508
e2d7c1a0 509 *t = *f;
15ae422b 510 previous = t;
fe3c2583
LP
511 t++;
512 }
513
514 *n = t - m;
515}
516
da6053d0 517static void drop_inaccessible(MountEntry *m, size_t *n) {
34de407a 518 MountEntry *f, *t;
fe3c2583
LP
519 const char *clear = NULL;
520
521 assert(m);
522 assert(n);
523
524 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
525 * ordered already. */
526
1d54cd5d 527 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
528
529 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
530 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
531 if (clear && path_startswith(mount_entry_path(f), clear)) {
532 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 533 mount_entry_done(f);
fe3c2583
LP
534 continue;
535 }
15ae422b 536
34de407a 537 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
538
539 *t = *f;
15ae422b
LP
540 t++;
541 }
542
c17ec25e 543 *n = t - m;
15ae422b
LP
544}
545
da6053d0 546static void drop_nop(MountEntry *m, size_t *n) {
34de407a 547 MountEntry *f, *t;
7648a565
LP
548
549 assert(m);
550 assert(n);
551
552 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
553 * list is ordered by prefixes. */
554
1d54cd5d 555 for (f = m, t = m; f < m + *n; f++) {
7648a565 556
1e05071d
YW
557 /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
558 if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
34de407a 559 MountEntry *p;
7648a565
LP
560 bool found = false;
561
562 /* Now let's find the first parent of the entry we are looking at. */
563 for (p = t-1; p >= m; p--) {
34de407a 564 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
565 found = true;
566 break;
567 }
568 }
569
570 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
571 if (found && p->mode == f->mode) {
5beb8688
YW
572 log_debug("%s (%s) is made redundant by %s (%s)",
573 mount_entry_path(f), mount_mode_to_string(f->mode),
574 mount_entry_path(p), mount_mode_to_string(p->mode));
1eb7e08e 575 mount_entry_done(f);
7648a565
LP
576 continue;
577 }
578 }
579
580 *t = *f;
581 t++;
582 }
583
584 *n = t - m;
585}
586
da6053d0 587static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
34de407a 588 MountEntry *f, *t;
cd2902c9
LP
589
590 assert(m);
591 assert(n);
592
1d54cd5d 593 /* Nothing to do */
cd2902c9
LP
594 if (!root_directory)
595 return;
596
597 /* Drops all mounts that are outside of the root directory. */
598
1d54cd5d 599 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 600
34de407a
LP
601 if (!path_startswith(mount_entry_path(f), root_directory)) {
602 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 603 mount_entry_done(f);
cd2902c9
LP
604 continue;
605 }
606
607 *t = *f;
608 t++;
609 }
610
611 *n = t - m;
612}
613
b2a60844
LP
614static int clone_device_node(
615 const char *d,
616 const char *temporary_mount,
617 bool *make_devnode) {
618
619 _cleanup_free_ char *sl = NULL;
620 const char *dn, *bn, *t;
b5e99f23
ДГ
621 struct stat st;
622 int r;
623
414b304b 624 if (stat(d, &st) < 0) {
b2a60844
LP
625 if (errno == ENOENT) {
626 log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
af984e13 627 return -ENXIO;
b2a60844
LP
628 }
629
630 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
b5e99f23
ДГ
631 }
632
633 if (!S_ISBLK(st.st_mode) &&
baaa35ad
ZJS
634 !S_ISCHR(st.st_mode))
635 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
636 "Device node '%s' to clone is not a device node, ignoring.",
637 d);
b5e99f23 638
6f7f3a33 639 dn = strjoina(temporary_mount, d);
b5e99f23 640
b2a60844 641 /* First, try to create device node properly */
16498617
CB
642 if (*make_devnode) {
643 mac_selinux_create_file_prepare(d, st.st_mode);
644 r = mknod(dn, st.st_mode, st.st_rdev);
645 mac_selinux_create_file_clear();
b2a60844
LP
646 if (r >= 0)
647 goto add_symlink;
16498617
CB
648 if (errno != EPERM)
649 return log_debug_errno(errno, "mknod failed for %s: %m", d);
650
b2a60844 651 /* This didn't work, let's not try this again for the next iterations. */
16498617
CB
652 *make_devnode = false;
653 }
654
2aed63f4 655 /* We're about to fall back to bind-mounting the device
1acf344d
CG
656 * node. So create a dummy bind-mount target.
657 * Do not prepare device-node SELinux label (see issue 13762) */
16498617 658 r = mknod(dn, S_IFREG, 0);
16498617 659 if (r < 0 && errno != EEXIST)
b2a60844 660 return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
16498617 661
21935150
LP
662 /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
663 * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
664 * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
665 r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
666 if (r < 0)
667 return r;
b2a60844
LP
668
669add_symlink:
670 bn = path_startswith(d, "/dev/");
671 if (!bn)
672 return 0;
673
674 /* Create symlinks like /dev/char/1:9 → ../urandom */
cbc056c8
ZJS
675 if (asprintf(&sl, "%s/dev/%s/%u:%u",
676 temporary_mount,
677 S_ISCHR(st.st_mode) ? "char" : "block",
678 major(st.st_rdev), minor(st.st_rdev)) < 0)
b2a60844
LP
679 return log_oom();
680
681 (void) mkdir_parents(sl, 0755);
682
683 t = strjoina("../", bn);
b2a60844 684 if (symlink(t, sl) < 0)
2e4a4fae 685 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
b5e99f23 686
af984e13 687 return 0;
b5e99f23
ДГ
688}
689
5d997827 690static int mount_private_dev(MountEntry *m) {
7f112f50
LP
691 static const char devnodes[] =
692 "/dev/null\0"
693 "/dev/zero\0"
694 "/dev/full\0"
695 "/dev/random\0"
696 "/dev/urandom\0"
697 "/dev/tty\0";
698
2b85f4e1 699 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 700 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
16498617 701 bool can_mknod = true;
7f112f50
LP
702 _cleanup_umask_ mode_t u;
703 int r;
704
705 assert(m);
706
707 u = umask(0000);
708
2b85f4e1 709 if (!mkdtemp(temporary_mount))
2e4a4fae 710 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
2b85f4e1 711
63c372cb 712 dev = strjoina(temporary_mount, "/dev");
dc751688 713 (void) mkdir(dev, 0755);
21935150
LP
714 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV);
715 if (r < 0)
2b85f4e1 716 goto fail;
21935150 717
c3151977
TM
718 r = label_fix_container(dev, "/dev", 0);
719 if (r < 0) {
720 log_debug_errno(errno, "Failed to fix label of '%s' as /dev: %m", dev);
721 goto fail;
722 }
2b85f4e1 723
63c372cb 724 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 725 (void) mkdir(devpts, 0755);
21935150
LP
726 r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
727 if (r < 0)
2b85f4e1 728 goto fail;
2b85f4e1 729
2e4a4fae
YW
730 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
731 * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
732 * Thus, in that case make a clone.
733 * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
36ce7110 734 r = is_symlink("/dev/ptmx");
2e4a4fae
YW
735 if (r < 0) {
736 log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
3164e3cb 737 goto fail;
2e4a4fae 738 } else if (r > 0) {
414b304b
ДГ
739 devptmx = strjoina(temporary_mount, "/dev/ptmx");
740 if (symlink("pts/ptmx", devptmx) < 0) {
2e4a4fae 741 r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
414b304b
ДГ
742 goto fail;
743 }
744 } else {
16498617 745 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
152c475f
LP
746 if (r < 0)
747 goto fail;
414b304b 748 }
e06b6479 749
63c372cb 750 devshm = strjoina(temporary_mount, "/dev/shm");
8d953682 751 (void) mkdir(devshm, 0755);
21935150
LP
752 r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
753 if (r < 0)
2b85f4e1 754 goto fail;
2b85f4e1 755
63c372cb 756 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 757 (void) mkdir(devmqueue, 0755);
21935150 758 (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 759
63c372cb 760 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 761 (void) mkdir(devhugepages, 0755);
21935150 762 (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 763
63c372cb 764 devlog = strjoina(temporary_mount, "/dev/log");
2e4a4fae
YW
765 if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
766 log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
82d25240 767
7f112f50 768 NULSTR_FOREACH(d, devnodes) {
16498617 769 r = clone_device_node(d, temporary_mount, &can_mknod);
37b22b3b 770 /* ENXIO means the *source* is not a device file, skip creation in that case */
af984e13 771 if (r < 0 && r != -ENXIO)
2b85f4e1 772 goto fail;
7f112f50
LP
773 }
774
2e4a4fae
YW
775 r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
776 if (r < 0)
105a1a36 777 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
7f112f50 778
ee818b89
AC
779 /* Create the /dev directory if missing. It is more likely to be
780 * missing when the service is started with RootDirectory. This is
781 * consistent with mount units creating the mount points when missing.
782 */
34de407a 783 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 784
9e5f8252 785 /* Unmount everything in old /dev */
2e4a4fae
YW
786 r = umount_recursive(mount_entry_path(m), 0);
787 if (r < 0)
788 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
789
21935150
LP
790 r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
791 if (r < 0)
2b85f4e1 792 goto fail;
7f112f50 793
1019a48f
LP
794 (void) rmdir(dev);
795 (void) rmdir(temporary_mount);
7f112f50 796
2b85f4e1 797 return 0;
7f112f50 798
2b85f4e1
LP
799fail:
800 if (devpts)
21935150 801 (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
7f112f50 802
2b85f4e1 803 if (devshm)
21935150 804 (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
7f112f50 805
2b85f4e1 806 if (devhugepages)
21935150 807 (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
7f112f50 808
2b85f4e1 809 if (devmqueue)
21935150 810 (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
7f112f50 811
21935150 812 (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
1019a48f
LP
813 (void) rmdir(dev);
814 (void) rmdir(temporary_mount);
7f112f50 815
2b85f4e1 816 return r;
7f112f50
LP
817}
818
2a2969fd 819static int mount_bind_dev(const MountEntry *m) {
5d997827
LP
820 int r;
821
822 assert(m);
823
824 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
825 * /dev. This is only used when RootDirectory= is set. */
826
645767d6
LP
827 (void) mkdir_p_label(mount_entry_path(m), 0755);
828
5d997827
LP
829 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
830 if (r < 0)
831 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
832 if (r > 0) /* make this a NOP if /dev is already a mount point */
833 return 0;
834
21935150
LP
835 r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
836 if (r < 0)
837 return r;
5d997827
LP
838
839 return 1;
840}
841
2a2969fd 842static int mount_sysfs(const MountEntry *m) {
5d997827
LP
843 int r;
844
845 assert(m);
846
645767d6
LP
847 (void) mkdir_p_label(mount_entry_path(m), 0755);
848
5d997827
LP
849 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
850 if (r < 0)
851 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
852 if (r > 0) /* make this a NOP if /sys is already a mount point */
853 return 0;
854
855 /* Bind mount the host's version so that we get all child mounts of it, too. */
21935150
LP
856 r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
857 if (r < 0)
858 return r;
5d997827
LP
859
860 return 1;
861}
862
4e399953 863static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
61f8a7bd 864 _cleanup_free_ char *opts = NULL;
4e399953 865 const char *entry_path;
61f8a7bd 866 int r, n;
5d997827
LP
867
868 assert(m);
4e399953 869 assert(ns_info);
5d997827 870
4e399953
LP
871 if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
872 ns_info->proc_subset != PROC_SUBSET_ALL) {
4e399953
LP
873
874 /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
875 * pretended to be per-instance but actually was per-namespace), hence let's make use of it
876 * if requested. To make sure this logic succeeds only on kernels where hidepid= is
877 * per-instance, we'll exclusively use the textual value for hidepid=, since support was
878 * added in the same commit: if it's supported it is thus also per-instance. */
879
880 opts = strjoin("hidepid=",
881 ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" :
882 protect_proc_to_string(ns_info->protect_proc),
883 ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "");
884 if (!opts)
885 return -ENOMEM;
4e399953
LP
886 }
887
61f8a7bd
YW
888 entry_path = mount_entry_path(m);
889 (void) mkdir_p_label(entry_path, 0755);
890
891 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
892 * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
893 * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
894 * mounted on /proc/ first. */
895
896 n = umount_recursive(entry_path, 0);
897
898 r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
899 if (r == -EINVAL && opts)
900 /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
901 * not supported by the kernel, and thus the per-instance hidepid= neither, which
902 * means we really don't want to use it, since it would affect our host's /proc
903 * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
904 r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
905 if (r == -EPERM) {
24ee0f9d 906 /* When we do not have enough privileges to mount /proc, fallback to use existing /proc. */
61f8a7bd
YW
907
908 if (n > 0)
909 /* /proc or some of sub-mounts are umounted in the above. Refuse incomplete tree.
910 * Propagate the original error code returned by mount() in the above. */
911 return -EPERM;
912
913 r = path_is_mount_point(entry_path, NULL, 0);
914 if (r < 0)
915 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
916 if (r == 0)
917 /* /proc is not mounted. Propagate the original error code. */
918 return -EPERM;
ad74f28a
YW
919 } else if (r < 0)
920 return r;
5d997827
LP
921
922 return 1;
923}
924
2abd4e38 925static int mount_tmpfs(const MountEntry *m) {
df6b900a 926 const char *entry_path, *inner_path;
abad72be 927 int r;
abad72be 928
6c47cd7d
LP
929 assert(m);
930
df6b900a
LP
931 entry_path = mount_entry_path(m);
932 inner_path = m->path_const;
933
2abd4e38 934 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
6c47cd7d 935
abad72be
CG
936 (void) mkdir_p_label(entry_path, 0755);
937 (void) umount_recursive(entry_path, 0);
6c47cd7d 938
21935150
LP
939 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
940 if (r < 0)
941 return r;
abad72be 942
df6b900a 943 r = label_fix_container(entry_path, inner_path, 0);
abad72be 944 if (r < 0)
df6b900a 945 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
6c47cd7d
LP
946
947 return 1;
948}
949
94293d65
LB
950static int mount_run(const MountEntry *m) {
951 int r;
952
953 assert(m);
954
955 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
956 if (r < 0 && r != -ENOENT)
957 return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
958 if (r > 0) /* make this a NOP if /run is already a mount point */
959 return 0;
960
961 return mount_tmpfs(m);
962}
963
b3d13314
LB
964static int mount_images(const MountEntry *m) {
965 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
966 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
967 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
aee36b4e 968 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
89e62e0b 969 DissectImageFlags dissect_image_flags;
b3d13314
LB
970 int r;
971
89e62e0b
LP
972 assert(m);
973
974 r = verity_settings_load(&verity, mount_entry_source(m), NULL, NULL);
b3d13314
LB
975 if (r < 0)
976 return log_debug_errno(r, "Failed to load root hash: %m");
b3d13314 977
89e62e0b
LP
978 dissect_image_flags =
979 (m->read_only ? DISSECT_IMAGE_READ_ONLY : 0) |
980 (verity.data_path ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0);
981
982 r = loop_device_make_by_path(
983 mount_entry_source(m),
984 m->read_only ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
985 verity.data_path ? 0 : LO_FLAGS_PARTSCAN,
986 &loop_device);
b3d13314
LB
987 if (r < 0)
988 return log_debug_errno(r, "Failed to create loop device for image: %m");
989
89e62e0b
LP
990 r = dissect_image(
991 loop_device->fd,
992 &verity,
993 m->image_options,
994 dissect_image_flags,
995 &dissected_image);
b3d13314 996 /* No partition table? Might be a single-filesystem image, try again */
89e62e0b
LP
997 if (!verity.data_path && r == -ENOPKG)
998 r = dissect_image(
999 loop_device->fd,
1000 &verity,
1001 m->image_options,
1002 dissect_image_flags|DISSECT_IMAGE_NO_PARTITION_TABLE,
1003 &dissected_image);
b3d13314
LB
1004 if (r < 0)
1005 return log_debug_errno(r, "Failed to dissect image: %m");
1006
89e62e0b
LP
1007 r = dissected_image_decrypt(
1008 dissected_image,
1009 NULL,
1010 &verity,
1011 dissect_image_flags,
1012 &decrypted_image);
b3d13314
LB
1013 if (r < 0)
1014 return log_debug_errno(r, "Failed to decrypt dissected image: %m");
1015
1016 r = mkdir_p_label(mount_entry_path(m), 0755);
1017 if (r < 0)
1018 return log_debug_errno(r, "Failed to create destination directory %s: %m", mount_entry_path(m));
1019 r = umount_recursive(mount_entry_path(m), 0);
1020 if (r < 0)
1021 return log_debug_errno(r, "Failed to umount under destination directory %s: %m", mount_entry_path(m));
1022
1023 r = dissected_image_mount(dissected_image, mount_entry_path(m), UID_INVALID, dissect_image_flags);
1024 if (r < 0)
1025 return log_debug_errno(r, "Failed to mount image: %m");
1026
1027 if (decrypted_image) {
1028 r = decrypted_image_relinquish(decrypted_image);
1029 if (r < 0)
1030 return log_debug_errno(r, "Failed to relinquish decrypted image: %m");
1031 }
1032
1033 loop_device_relinquish(loop_device);
1034
1035 return 1;
1036}
1037
088696fe 1038static int follow_symlink(
d2d6c096 1039 const char *root_directory,
088696fe 1040 MountEntry *m) {
d2d6c096 1041
088696fe 1042 _cleanup_free_ char *target = NULL;
8fceda93
LP
1043 int r;
1044
088696fe
LP
1045 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
1046 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
1047 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
1048 * end and already have a fully normalized name. */
8fceda93 1049
a5648b80 1050 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
088696fe
LP
1051 if (r < 0)
1052 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
1053 if (r > 0) /* Reached the end, nothing more to resolve */
1054 return 1;
8fceda93 1055
baaa35ad
ZJS
1056 if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
1057 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1058 "Symlink loop on '%s'.",
1059 mount_entry_path(m));
8fceda93 1060
088696fe 1061 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
8fceda93 1062
088696fe
LP
1063 free_and_replace(m->path_malloc, target);
1064 m->has_prefix = true;
8fceda93 1065
088696fe
LP
1066 m->n_followed ++;
1067
1068 return 0;
8fceda93
LP
1069}
1070
ac0930c8 1071static int apply_mount(
8fceda93 1072 const char *root_directory,
4e399953
LP
1073 MountEntry *m,
1074 const NamespaceInfo *ns_info) {
ac0930c8 1075
e5f10caf 1076 _cleanup_free_ char *inaccessible = NULL;
a227a4be 1077 bool rbind = true, make = false;
15ae422b 1078 const char *what;
15ae422b 1079 int r;
15ae422b 1080
c17ec25e 1081 assert(m);
4e399953 1082 assert(ns_info);
15ae422b 1083
34de407a 1084 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 1085
c17ec25e 1086 switch (m->mode) {
15ae422b 1087
160cfdbe 1088 case INACCESSIBLE: {
e5f10caf
AZ
1089 _cleanup_free_ char *tmp = NULL;
1090 const char *runtime_dir;
160cfdbe 1091 struct stat target;
6d313367
LP
1092
1093 /* First, get rid of everything that is below if there
1094 * is anything... Then, overmount it with an
c4b41707 1095 * inaccessible path. */
34de407a 1096 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 1097
088696fe
LP
1098 if (lstat(mount_entry_path(m), &target) < 0) {
1099 if (errno == ENOENT && m->ignore)
1100 return 0;
1101
cbc056c8
ZJS
1102 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
1103 mount_entry_path(m));
088696fe 1104 }
15ae422b 1105
e5f10caf 1106 if (geteuid() == 0)
48b747fa 1107 runtime_dir = "/run";
e5f10caf 1108 else {
48b747fa
LP
1109 if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
1110 return -ENOMEM;
e5f10caf
AZ
1111
1112 runtime_dir = tmp;
1113 }
1114
1115 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
1116 if (r < 0)
baaa35ad
ZJS
1117 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1118 "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
e5f10caf 1119 what = inaccessible;
c4b41707 1120 break;
160cfdbe 1121 }
fe3c2583 1122
15ae422b 1123 case READONLY:
15ae422b 1124 case READWRITE:
1e05071d 1125 case READWRITE_IMPLICIT:
8fceda93 1126 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
088696fe
LP
1127 if (r == -ENOENT && m->ignore)
1128 return 0;
d944dc95 1129 if (r < 0)
cbc056c8
ZJS
1130 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
1131 mount_entry_path(m));
1132 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
1133 * bit for the mount point if needed. */
6b7c9f8b 1134 return 0;
6b7c9f8b 1135 /* This isn't a mount point yet, let's make it one. */
34de407a 1136 what = mount_entry_path(m);
6b7c9f8b 1137 break;
15ae422b 1138
d2d6c096
LP
1139 case BIND_MOUNT:
1140 rbind = false;
d2d6c096 1141
4831981d 1142 _fallthrough_;
088696fe
LP
1143 case BIND_MOUNT_RECURSIVE: {
1144 _cleanup_free_ char *chased = NULL;
5d997827 1145
cbc056c8
ZJS
1146 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
1147 * that bind mount source paths are always relative to the host root, hence we pass NULL as
1148 * root directory to chase_symlinks() here. */
088696fe 1149
a5648b80 1150 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
088696fe
LP
1151 if (r == -ENOENT && m->ignore) {
1152 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
1153 return 0;
1154 }
1155 if (r < 0)
1156 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
1157
1158 log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
1159
1160 free_and_replace(m->source_malloc, chased);
d2d6c096
LP
1161
1162 what = mount_entry_source(m);
a227a4be 1163 make = true;
d2d6c096 1164 break;
088696fe 1165 }
d2d6c096 1166
6c47cd7d 1167 case EMPTY_DIR:
2abd4e38
YW
1168 case TMPFS:
1169 return mount_tmpfs(m);
6c47cd7d 1170
ac0930c8 1171 case PRIVATE_TMP:
56a13a49 1172 case PRIVATE_TMP_READONLY:
89bd586c 1173 what = mount_entry_source(m);
a227a4be 1174 make = true;
15ae422b 1175 break;
e364ad06 1176
d6797c92 1177 case PRIVATE_DEV:
5d997827
LP
1178 return mount_private_dev(m);
1179
1180 case BIND_DEV:
1181 return mount_bind_dev(m);
1182
1183 case SYSFS:
1184 return mount_sysfs(m);
1185
1186 case PROCFS:
4e399953 1187 return mount_procfs(m, ns_info);
d6797c92 1188
94293d65
LB
1189 case RUN:
1190 return mount_run(m);
1191
b3d13314
LB
1192 case MOUNT_IMAGES:
1193 return mount_images(m);
1194
e364ad06
LP
1195 default:
1196 assert_not_reached("Unknown mode");
15ae422b
LP
1197 }
1198
ac0930c8 1199 assert(what);
15ae422b 1200
21935150
LP
1201 r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1202 if (r < 0) {
a227a4be 1203 bool try_again = false;
a227a4be
LP
1204
1205 if (r == -ENOENT && make) {
8bab8029 1206 int q;
a227a4be 1207
cbc056c8
ZJS
1208 /* Hmm, either the source or the destination are missing. Let's see if we can create
1209 the destination, then try again. */
a227a4be 1210
8bab8029 1211 (void) mkdir_parents(mount_entry_path(m), 0755);
a227a4be 1212
8bab8029
LB
1213 q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
1214 if (q < 0)
1215 log_error_errno(q, "Failed to create destination mount point node '%s': %m",
1216 mount_entry_path(m));
1217 else
1218 try_again = true;
a227a4be
LP
1219 }
1220
21935150
LP
1221 if (try_again)
1222 r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
a227a4be 1223 if (r < 0)
5dc60faa 1224 return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
a227a4be 1225 }
6b7c9f8b 1226
34de407a 1227 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 1228 return 0;
ac0930c8 1229}
15ae422b 1230
6b000af4 1231static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
9ce4e4b0 1232 unsigned long new_flags = 0, flags_mask = 0;
763a260a 1233 bool submounts = false;
6b7c9f8b 1234 int r = 0;
15ae422b 1235
c17ec25e 1236 assert(m);
ac9de0b3 1237 assert(proc_self_mountinfo);
ac0930c8 1238
9ce4e4b0
LP
1239 if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
1240 new_flags |= MS_RDONLY;
1241 flags_mask |= MS_RDONLY;
1242 }
1243
1244 if (m->nosuid) {
1245 new_flags |= MS_NOSUID;
1246 flags_mask |= MS_NOSUID;
1247 }
1248
1249 if (flags_mask == 0) /* No Change? */
6b7c9f8b
LP
1250 return 0;
1251
9ce4e4b0
LP
1252 /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1253 * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1254 * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
1255 * and running Linux <= 4.17. */
1256 submounts =
1257 mount_entry_read_only(m) &&
1258 !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1259 if (submounts)
6b000af4 1260 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
9ce4e4b0 1261 else
7cce68e1 1262 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
9ce4e4b0 1263
867189b5
LP
1264 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1265 * read-only already stays this way. This improves compatibility with container managers, where we
1266 * won't attempt to undo read-only mounts already applied. */
ac0930c8 1267
8fceda93 1268 if (r == -ENOENT && m->ignore)
867189b5 1269 return 0;
763a260a 1270 if (r < 0)
9ce4e4b0 1271 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
763a260a 1272 submounts ? " and its submounts" : "");
763a260a 1273 return 0;
d944dc95
LP
1274}
1275
9b68367b 1276static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
5d997827
LP
1277 assert(ns_info);
1278
9c988f93
DH
1279 /*
1280 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1281 * since to protect the API VFS mounts, they need to be around in the
9b68367b 1282 * first place...
9c988f93 1283 */
5d997827 1284
9b68367b
YW
1285 return ns_info->mount_apivfs ||
1286 ns_info->protect_control_groups ||
4e399953
LP
1287 ns_info->protect_kernel_tunables ||
1288 ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
1289 ns_info->proc_subset != PROC_SUBSET_ALL;
5d997827
LP
1290}
1291
da6053d0 1292static size_t namespace_calculate_mounts(
bb0ff3fb 1293 const NamespaceInfo *ns_info,
2652c6c1
DH
1294 char** read_write_paths,
1295 char** read_only_paths,
1296 char** inaccessible_paths,
6c47cd7d 1297 char** empty_directories,
da6053d0
LP
1298 size_t n_bind_mounts,
1299 size_t n_temporary_filesystems,
b3d13314 1300 size_t n_mount_images,
2652c6c1
DH
1301 const char* tmp_dir,
1302 const char* var_tmp_dir,
bbb4e7f3 1303 const char *creds_path,
5e8deb94 1304 const char* log_namespace,
3bdc25a4
LP
1305 bool setup_propagate,
1306 const char* notify_socket) {
2652c6c1 1307
da6053d0
LP
1308 size_t protect_home_cnt;
1309 size_t protect_system_cnt =
52b3d652 1310 (ns_info->protect_system == PROTECT_SYSTEM_STRICT ?
f471b2af 1311 ELEMENTSOF(protect_system_strict_table) :
52b3d652 1312 ((ns_info->protect_system == PROTECT_SYSTEM_FULL) ?
f471b2af 1313 ELEMENTSOF(protect_system_full_table) :
52b3d652 1314 ((ns_info->protect_system == PROTECT_SYSTEM_YES) ?
f471b2af
DH
1315 ELEMENTSOF(protect_system_yes_table) : 0)));
1316
b6c432ca 1317 protect_home_cnt =
52b3d652 1318 (ns_info->protect_home == PROTECT_HOME_YES ?
b6c432ca 1319 ELEMENTSOF(protect_home_yes_table) :
52b3d652 1320 ((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ?
e4da7d8c 1321 ELEMENTSOF(protect_home_read_only_table) :
52b3d652 1322 ((ns_info->protect_home == PROTECT_HOME_TMPFS) ?
e4da7d8c 1323 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
b6c432ca 1324
2652c6c1
DH
1325 return !!tmp_dir + !!var_tmp_dir +
1326 strv_length(read_write_paths) +
1327 strv_length(read_only_paths) +
1328 strv_length(inaccessible_paths) +
6c47cd7d 1329 strv_length(empty_directories) +
d2d6c096 1330 n_bind_mounts +
b3d13314 1331 n_mount_images +
2abd4e38 1332 n_temporary_filesystems +
c575770b
DH
1333 ns_info->private_dev +
1334 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
c575770b 1335 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
94a7b275
KK
1336 (ns_info->protect_kernel_logs ? ELEMENTSOF(protect_kernel_logs_table) : 0) +
1337 (ns_info->protect_control_groups ? 1 : 0) +
5d997827 1338 protect_home_cnt + protect_system_cnt +
aecd5ac6 1339 (ns_info->protect_hostname ? 2 : 0) +
91dd5f7c 1340 (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0) +
bbb4e7f3 1341 (creds_path ? 2 : 1) +
3bdc25a4
LP
1342 !!log_namespace +
1343 setup_propagate + /* /run/systemd/incoming */
1344 !!notify_socket;
2652c6c1
DH
1345}
1346
da6053d0 1347static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
9b68367b 1348 assert(root_directory);
f8b64b57
LP
1349 assert(n_mounts);
1350 assert(mounts || *n_mounts == 0);
1351
93bab288 1352 typesafe_qsort(mounts, *n_mounts, mount_path_compare);
f8b64b57
LP
1353
1354 drop_duplicates(mounts, n_mounts);
1355 drop_outside_root(root_directory, mounts, n_mounts);
1356 drop_inaccessible(mounts, n_mounts);
1357 drop_nop(mounts, n_mounts);
1358}
1359
c8c535d5
LP
1360static bool root_read_only(
1361 char **read_only_paths,
1362 ProtectSystem protect_system) {
1363
1364 /* Determine whether the root directory is going to be read-only given the configured settings. */
1365
1366 if (protect_system == PROTECT_SYSTEM_STRICT)
1367 return true;
1368
de46b2be 1369 if (prefixed_path_strv_contains(read_only_paths, "/"))
c8c535d5
LP
1370 return true;
1371
1372 return false;
1373}
1374
1375static bool home_read_only(
1376 char** read_only_paths,
1377 char** inaccessible_paths,
1378 char** empty_directories,
1379 const BindMount *bind_mounts,
1380 size_t n_bind_mounts,
1381 const TemporaryFileSystem *temporary_filesystems,
1382 size_t n_temporary_filesystems,
1383 ProtectHome protect_home) {
1384
1385 size_t i;
1386
1387 /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
1388 * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
1389 * settings. */
1390
1391 if (protect_home != PROTECT_HOME_NO)
1392 return true;
1393
de46b2be
TM
1394 if (prefixed_path_strv_contains(read_only_paths, "/home") ||
1395 prefixed_path_strv_contains(inaccessible_paths, "/home") ||
1396 prefixed_path_strv_contains(empty_directories, "/home"))
c8c535d5
LP
1397 return true;
1398
1399 for (i = 0; i < n_temporary_filesystems; i++)
1400 if (path_equal(temporary_filesystems[i].path, "/home"))
1401 return true;
1402
1403 /* If /home is overmounted with some dir from the host it's not writable. */
1404 for (i = 0; i < n_bind_mounts; i++)
1405 if (path_equal(bind_mounts[i].destination, "/home"))
1406 return true;
1407
1408 return false;
1409}
1410
89e62e0b
LP
1411static int verity_settings_prepare(
1412 VeritySettings *verity,
1413 const char *root_image,
1414 const void *root_hash,
1415 size_t root_hash_size,
1416 const char *root_hash_path,
1417 const void *root_hash_sig,
1418 size_t root_hash_sig_size,
1419 const char *root_hash_sig_path,
1420 const char *verity_data_path) {
1421
1422 int r;
1423
1424 assert(verity);
1425
1426 if (root_hash) {
1427 void *d;
1428
1429 d = memdup(root_hash, root_hash_size);
1430 if (!d)
1431 return -ENOMEM;
1432
1433 free_and_replace(verity->root_hash, d);
1434 verity->root_hash_size = root_hash_size;
aee36b4e 1435 verity->designator = PARTITION_ROOT;
89e62e0b
LP
1436 }
1437
1438 if (root_hash_sig) {
1439 void *d;
1440
1441 d = memdup(root_hash_sig, root_hash_sig_size);
1442 if (!d)
1443 return -ENOMEM;
1444
1445 free_and_replace(verity->root_hash_sig, d);
1446 verity->root_hash_sig_size = root_hash_sig_size;
aee36b4e 1447 verity->designator = PARTITION_ROOT;
89e62e0b
LP
1448 }
1449
1450 if (verity_data_path) {
1451 r = free_and_strdup(&verity->data_path, verity_data_path);
1452 if (r < 0)
1453 return r;
1454 }
1455
1456 r = verity_settings_load(
1457 verity,
1458 root_image,
1459 root_hash_path,
1460 root_hash_sig_path);
1461 if (r < 0)
1462 return log_debug_errno(r, "Failed to load root hash: %m");
1463
1464 return 0;
1465}
1466
613b411c 1467int setup_namespace(
ee818b89 1468 const char* root_directory,
915e6d16 1469 const char* root_image,
18d73705 1470 const MountOptions *root_image_options,
bb0ff3fb 1471 const NamespaceInfo *ns_info,
2a624c36
AP
1472 char** read_write_paths,
1473 char** read_only_paths,
1474 char** inaccessible_paths,
6c47cd7d 1475 char** empty_directories,
d2d6c096 1476 const BindMount *bind_mounts,
da6053d0 1477 size_t n_bind_mounts,
2abd4e38 1478 const TemporaryFileSystem *temporary_filesystems,
da6053d0 1479 size_t n_temporary_filesystems,
b3d13314
LB
1480 const MountImage *mount_images,
1481 size_t n_mount_images,
a004cb4c
LP
1482 const char* tmp_dir,
1483 const char* var_tmp_dir,
bbb4e7f3 1484 const char *creds_path,
91dd5f7c 1485 const char *log_namespace,
915e6d16 1486 unsigned long mount_flags,
0389f4fa
LB
1487 const void *root_hash,
1488 size_t root_hash_size,
1489 const char *root_hash_path,
d4d55b0d
LB
1490 const void *root_hash_sig,
1491 size_t root_hash_sig_size,
1492 const char *root_hash_sig_path,
89e62e0b 1493 const char *verity_data_path,
5e8deb94
LB
1494 const char *propagate_dir,
1495 const char *incoming_dir,
3bdc25a4 1496 const char *notify_socket,
7cc5ef5f
ZJS
1497 DissectImageFlags dissect_image_flags,
1498 char **error_path) {
15ae422b 1499
915e6d16 1500 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 1501 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 1502 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
aee36b4e 1503 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
5f7a690a 1504 MountEntry *m = NULL, *mounts = NULL;
5e8deb94 1505 bool require_prefix = false, setup_propagate = false;
9b68367b 1506 const char *root;
89e62e0b
LP
1507 size_t n_mounts;
1508 int r;
15ae422b 1509
915e6d16
LP
1510 assert(ns_info);
1511
5e8deb94
LB
1512 if (!isempty(propagate_dir) && !isempty(incoming_dir))
1513 setup_propagate = true;
1514
613b411c 1515 if (mount_flags == 0)
c17ec25e 1516 mount_flags = MS_SHARED;
ac0930c8 1517
915e6d16
LP
1518 if (root_image) {
1519 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1520
c8c535d5
LP
1521 /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
1522 if (root_read_only(read_only_paths,
52b3d652 1523 ns_info->protect_system) &&
c8c535d5
LP
1524 home_read_only(read_only_paths, inaccessible_paths, empty_directories,
1525 bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems,
52b3d652 1526 ns_info->protect_home) &&
c9ef8573 1527 strv_isempty(read_write_paths))
915e6d16
LP
1528 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1529
89e62e0b
LP
1530 r = verity_settings_prepare(
1531 &verity,
1532 root_image,
1533 root_hash, root_hash_size, root_hash_path,
1534 root_hash_sig, root_hash_sig_size, root_hash_sig_path,
1535 verity_data_path);
915e6d16 1536 if (r < 0)
89e62e0b
LP
1537 return r;
1538
1539 SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path);
915e6d16 1540
89e62e0b
LP
1541 r = loop_device_make_by_path(
1542 root_image,
1543 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
1544 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
1545 &loop_device);
78ebe980 1546 if (r < 0)
89e62e0b
LP
1547 return log_debug_errno(r, "Failed to create loop device for root image: %m");
1548
1549 r = dissect_image(
1550 loop_device->fd,
1551 &verity,
1552 root_image_options,
1553 dissect_image_flags,
1554 &dissected_image);
78ebe980 1555 if (r < 0)
763a260a 1556 return log_debug_errno(r, "Failed to dissect image: %m");
78ebe980 1557
89e62e0b
LP
1558 r = dissected_image_decrypt(
1559 dissected_image,
1560 NULL,
1561 &verity,
1562 dissect_image_flags,
1563 &decrypted_image);
915e6d16 1564 if (r < 0)
763a260a 1565 return log_debug_errno(r, "Failed to decrypt dissected image: %m");
915e6d16
LP
1566 }
1567
e908468b
LP
1568 if (root_directory)
1569 root = root_directory;
0722b359 1570 else {
77f16dbd
DDM
1571 /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
1572 * when running tests (test-execute), it might not have been created yet so let's make sure
1573 * we create it if it doesn't already exist. */
1574 (void) mkdir_p_label("/run/systemd", 0755);
1575
0722b359
JS
1576 /* Always create the mount namespace in a temporary directory, instead of operating
1577 * directly in the root. The temporary directory prevents any mounts from being
1578 * potentially obscured by other mounts we already applied.
1579 * We use the same mount point for all images, which is safe, since they all live
1580 * in their own namespaces after all, and hence won't see each other. */
e908468b
LP
1581
1582 root = "/run/systemd/unit-root";
1583 (void) mkdir_label(root, 0700);
d18aff04 1584 require_prefix = true;
0722b359 1585 }
e908468b 1586
cfbeb4ef
LP
1587 n_mounts = namespace_calculate_mounts(
1588 ns_info,
1589 read_write_paths,
1590 read_only_paths,
1591 inaccessible_paths,
6c47cd7d 1592 empty_directories,
f5c52a77 1593 n_bind_mounts,
2abd4e38 1594 n_temporary_filesystems,
b3d13314 1595 n_mount_images,
cfbeb4ef 1596 tmp_dir, var_tmp_dir,
bbb4e7f3 1597 creds_path,
5e8deb94 1598 log_namespace,
3bdc25a4
LP
1599 setup_propagate,
1600 notify_socket);
613b411c 1601
f0a4feb0 1602 if (n_mounts > 0) {
5f7a690a
LP
1603 m = mounts = new0(MountEntry, n_mounts);
1604 if (!mounts)
1605 return -ENOMEM;
1606
d18aff04 1607 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1608 if (r < 0)
f0a4feb0 1609 goto finish;
613b411c 1610
d18aff04 1611 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1612 if (r < 0)
f0a4feb0 1613 goto finish;
613b411c 1614
d18aff04 1615 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1616 if (r < 0)
f0a4feb0 1617 goto finish;
7ff7394d 1618
6c47cd7d
LP
1619 r = append_empty_dir_mounts(&m, empty_directories);
1620 if (r < 0)
1621 goto finish;
1622
d2d6c096
LP
1623 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1624 if (r < 0)
1625 goto finish;
1626
2abd4e38
YW
1627 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1628 if (r < 0)
1629 goto finish;
1630
613b411c 1631 if (tmp_dir) {
56a13a49
ZJS
1632 bool ro = streq(tmp_dir, RUN_SYSTEMD_EMPTY);
1633
34de407a 1634 *(m++) = (MountEntry) {
5327c910 1635 .path_const = "/tmp",
56a13a49 1636 .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP,
89bd586c 1637 .source_const = tmp_dir,
5327c910 1638 };
613b411c 1639 }
7ff7394d 1640
613b411c 1641 if (var_tmp_dir) {
56a13a49
ZJS
1642 bool ro = streq(var_tmp_dir, RUN_SYSTEMD_EMPTY);
1643
34de407a 1644 *(m++) = (MountEntry) {
5327c910 1645 .path_const = "/var/tmp",
56a13a49 1646 .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP,
89bd586c 1647 .source_const = var_tmp_dir,
5327c910 1648 };
7ff7394d 1649 }
ac0930c8 1650
b3d13314
LB
1651 r = append_mount_images(&m, mount_images, n_mount_images);
1652 if (r < 0)
1653 goto finish;
1654
d46b79bb 1655 if (ns_info->private_dev)
34de407a 1656 *(m++) = (MountEntry) {
5327c910
LP
1657 .path_const = "/dev",
1658 .mode = PRIVATE_DEV,
9ce4e4b0 1659 .flags = DEV_MOUNT_OPTIONS,
5327c910 1660 };
7f112f50 1661
c575770b 1662 if (ns_info->protect_kernel_tunables) {
cbc056c8
ZJS
1663 r = append_static_mounts(&m,
1664 protect_kernel_tunables_table,
1665 ELEMENTSOF(protect_kernel_tunables_table),
1666 ns_info->ignore_protect_paths);
c575770b 1667 if (r < 0)
f0a4feb0 1668 goto finish;
c575770b
DH
1669 }
1670
1671 if (ns_info->protect_kernel_modules) {
cbc056c8
ZJS
1672 r = append_static_mounts(&m,
1673 protect_kernel_modules_table,
1674 ELEMENTSOF(protect_kernel_modules_table),
1675 ns_info->ignore_protect_paths);
c575770b 1676 if (r < 0)
f0a4feb0 1677 goto finish;
c575770b 1678 }
59eeb84b 1679
94a7b275 1680 if (ns_info->protect_kernel_logs) {
cbc056c8
ZJS
1681 r = append_static_mounts(&m,
1682 protect_kernel_logs_table,
1683 ELEMENTSOF(protect_kernel_logs_table),
1684 ns_info->ignore_protect_paths);
94a7b275
KK
1685 if (r < 0)
1686 goto finish;
1687 }
1688
d46b79bb 1689 if (ns_info->protect_control_groups)
34de407a 1690 *(m++) = (MountEntry) {
5327c910
LP
1691 .path_const = "/sys/fs/cgroup",
1692 .mode = READONLY,
1693 };
59eeb84b 1694
52b3d652 1695 r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths);
b6c432ca 1696 if (r < 0)
f0a4feb0 1697 goto finish;
417116f2 1698
52b3d652 1699 r = append_protect_system(&m, ns_info->protect_system, false);
f471b2af 1700 if (r < 0)
f0a4feb0 1701 goto finish;
417116f2 1702
9b68367b 1703 if (namespace_info_mount_apivfs(ns_info)) {
cbc056c8
ZJS
1704 r = append_static_mounts(&m,
1705 apivfs_table,
1706 ELEMENTSOF(apivfs_table),
1707 ns_info->ignore_protect_paths);
5d997827
LP
1708 if (r < 0)
1709 goto finish;
1710 }
1711
aecd5ac6
TM
1712 if (ns_info->protect_hostname) {
1713 *(m++) = (MountEntry) {
1714 .path_const = "/proc/sys/kernel/hostname",
1715 .mode = READONLY,
1716 };
1717 *(m++) = (MountEntry) {
1718 .path_const = "/proc/sys/kernel/domainname",
1719 .mode = READONLY,
1720 };
1721 }
1722
bbb4e7f3
LP
1723 if (creds_path) {
1724 /* If our service has a credentials store configured, then bind that one in, but hide
1725 * everything else. */
1726
1727 *(m++) = (MountEntry) {
1728 .path_const = "/run/credentials",
1729 .mode = TMPFS,
1730 .read_only = true,
1731 .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
1732 .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
1733 };
1734
1735 *(m++) = (MountEntry) {
1736 .path_const = creds_path,
1737 .mode = BIND_MOUNT,
1738 .read_only = true,
1739 .source_const = creds_path,
1740 };
1741 } else {
1742 /* If our service has no credentials store configured, then make the whole
1743 * credentials tree inaccessible wholesale. */
1744
1745 *(m++) = (MountEntry) {
1746 .path_const = "/run/credentials",
1747 .mode = INACCESSIBLE,
1748 .ignore = true,
1749 };
1750 }
1751
91dd5f7c
LP
1752 if (log_namespace) {
1753 _cleanup_free_ char *q;
1754
1755 q = strjoin("/run/systemd/journal.", log_namespace);
1756 if (!q) {
1757 r = -ENOMEM;
1758 goto finish;
1759 }
1760
1761 *(m++) = (MountEntry) {
1762 .path_const = "/run/systemd/journal",
1763 .mode = BIND_MOUNT_RECURSIVE,
1764 .read_only = true,
1765 .source_malloc = TAKE_PTR(q),
1766 };
1767 }
1768
5e8deb94
LB
1769 /* Will be used to add bind mounts at runtime */
1770 if (setup_propagate)
1771 *(m++) = (MountEntry) {
1772 .source_const = propagate_dir,
1773 .path_const = incoming_dir,
1774 .mode = BIND_MOUNT,
1775 .read_only = true,
1776 };
1777
3bdc25a4
LP
1778 if (notify_socket)
1779 *(m++) = (MountEntry) {
1780 .path_const = notify_socket,
1781 .source_const = notify_socket,
1782 .mode = BIND_MOUNT,
1783 .read_only = true,
1784 };
1785
f0a4feb0 1786 assert(mounts + n_mounts == m);
ac0930c8 1787
5327c910 1788 /* Prepend the root directory where that's necessary */
e908468b 1789 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1790 if (r < 0)
1791 goto finish;
1792
839f1877 1793 normalize_mounts(root, mounts, &n_mounts);
15ae422b
LP
1794 }
1795
1beab8b0
LP
1796 /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
1797
d944dc95 1798 if (unshare(CLONE_NEWNS) < 0) {
763a260a 1799 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
1beab8b0 1800 if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
cbc056c8
ZJS
1801 /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
1802 * in place that doesn't allow us to create namespaces (or a missing cap), then
1803 * propagate a recognizable error back, which the caller can use to detect this case
1804 * (and only this) and optionally continue without namespacing applied. */
1beab8b0
LP
1805 r = -ENOANO;
1806
d944dc95
LP
1807 goto finish;
1808 }
1e4e94c8 1809
5e8deb94
LB
1810 /* Create the source directory to allow runtime propagation of mounts */
1811 if (setup_propagate)
1812 (void) mkdir_p(propagate_dir, 0600);
1813
9b68367b
YW
1814 /* Remount / as SLAVE so that nothing now mounted in the namespace
1815 * shows up in the parent */
1816 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
763a260a 1817 r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
9b68367b 1818 goto finish;
ee818b89
AC
1819 }
1820
915e6d16 1821 if (root_image) {
e908468b 1822 /* A root image is specified, mount it to the right place */
2d3a5a73 1823 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
763a260a
YW
1824 if (r < 0) {
1825 log_debug_errno(r, "Failed to mount root image: %m");
915e6d16 1826 goto finish;
763a260a 1827 }
915e6d16 1828
07ce7407
TM
1829 if (decrypted_image) {
1830 r = decrypted_image_relinquish(decrypted_image);
763a260a
YW
1831 if (r < 0) {
1832 log_debug_errno(r, "Failed to relinquish decrypted image: %m");
07ce7407 1833 goto finish;
763a260a 1834 }
07ce7407 1835 }
78ebe980 1836
915e6d16
LP
1837 loop_device_relinquish(loop_device);
1838
1839 } else if (root_directory) {
1840
e908468b
LP
1841 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1842 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
763a260a
YW
1843 if (r < 0) {
1844 log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
d944dc95 1845 goto finish;
763a260a 1846 }
8f1ad200 1847 if (r == 0) {
21935150
LP
1848 r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
1849 if (r < 0)
8f1ad200 1850 goto finish;
d944dc95 1851 }
e908468b 1852
9b68367b 1853 } else {
e908468b 1854 /* Let's mount the main root directory to the root directory to use */
21935150
LP
1855 r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
1856 if (r < 0)
e908468b 1857 goto finish;
ee818b89 1858 }
c2c13f2d 1859
4e0c20de
LP
1860 /* Try to set up the new root directory before mounting anything else there. */
1861 if (root_image || root_directory)
1862 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1863
f0a4feb0 1864 if (n_mounts > 0) {
ac9de0b3 1865 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b000af4 1866 _cleanup_free_ char **deny_list = NULL;
da6053d0 1867 size_t j;
6b7c9f8b 1868
cbc056c8
ZJS
1869 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
1870 * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
ac9de0b3
TR
1871 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1872 if (!proc_self_mountinfo) {
763a260a 1873 r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
7cc5ef5f
ZJS
1874 if (error_path)
1875 *error_path = strdup("/proc/self/mountinfo");
ac9de0b3
TR
1876 goto finish;
1877 }
1878
088696fe
LP
1879 /* First round, establish all mounts we need */
1880 for (;;) {
1881 bool again = false;
1882
1883 for (m = mounts; m < mounts + n_mounts; ++m) {
1884
1885 if (m->applied)
1886 continue;
1887
1888 r = follow_symlink(root, m);
7cc5ef5f
ZJS
1889 if (r < 0) {
1890 if (error_path && mount_entry_path(m))
1891 *error_path = strdup(mount_entry_path(m));
088696fe 1892 goto finish;
7cc5ef5f 1893 }
088696fe 1894 if (r == 0) {
cbc056c8
ZJS
1895 /* We hit a symlinked mount point. The entry got rewritten and might
1896 * point to a very different place now. Let's normalize the changed
1897 * list, and start from the beginning. After all to mount the entry
1898 * at the new location we might need some other mounts first */
088696fe
LP
1899 again = true;
1900 break;
1901 }
1902
4e399953 1903 r = apply_mount(root, m, ns_info);
7cc5ef5f
ZJS
1904 if (r < 0) {
1905 if (error_path && mount_entry_path(m))
1906 *error_path = strdup(mount_entry_path(m));
088696fe 1907 goto finish;
7cc5ef5f 1908 }
088696fe
LP
1909
1910 m->applied = true;
1911 }
1912
1913 if (!again)
1914 break;
1915
839f1877 1916 normalize_mounts(root, mounts, &n_mounts);
c2c13f2d 1917 }
15ae422b 1918
6b000af4
LP
1919 /* Create a deny list we can pass to bind_mount_recursive() */
1920 deny_list = new(char*, n_mounts+1);
1921 if (!deny_list) {
5f7a690a
LP
1922 r = -ENOMEM;
1923 goto finish;
1924 }
f0a4feb0 1925 for (j = 0; j < n_mounts; j++)
6b000af4
LP
1926 deny_list[j] = (char*) mount_entry_path(mounts+j);
1927 deny_list[j] = NULL;
6b7c9f8b
LP
1928
1929 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1930 for (m = mounts; m < mounts + n_mounts; ++m) {
6b000af4 1931 r = make_read_only(m, deny_list, proc_self_mountinfo);
7cc5ef5f
ZJS
1932 if (r < 0) {
1933 if (error_path && mount_entry_path(m))
1934 *error_path = strdup(mount_entry_path(m));
d944dc95 1935 goto finish;
7cc5ef5f 1936 }
c2c13f2d 1937 }
15ae422b
LP
1938 }
1939
9b68367b
YW
1940 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1941 r = mount_move_root(root);
763a260a
YW
1942 if (r < 0) {
1943 log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
9b68367b 1944 goto finish;
763a260a 1945 }
ee818b89 1946
55fe7432 1947 /* Remount / as the desired mode. Note that this will not
c2c13f2d
LP
1948 * reestablish propagation from our side to the host, since
1949 * what's disconnected is disconnected. */
d944dc95 1950 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
763a260a 1951 r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
d944dc95
LP
1952 goto finish;
1953 }
15ae422b 1954
5e8deb94
LB
1955 /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only
1956 * supported for non-shared mounts. This needs to happen after remounting / or it will fail. */
1957 if (setup_propagate) {
1958 r = mount(NULL, incoming_dir, NULL, MS_SLAVE, NULL);
1959 if (r < 0) {
1960 log_error_errno(r, "Failed to remount %s with MS_SLAVE: %m", incoming_dir);
1961 goto finish;
1962 }
1963 }
1964
d944dc95 1965 r = 0;
15ae422b 1966
d944dc95 1967finish:
0cd41757
LP
1968 if (n_mounts > 0)
1969 for (m = mounts; m < mounts + n_mounts; m++)
1970 mount_entry_done(m);
613b411c 1971
5f7a690a
LP
1972 free(mounts);
1973
613b411c
LP
1974 return r;
1975}
1976
da6053d0
LP
1977void bind_mount_free_many(BindMount *b, size_t n) {
1978 size_t i;
d2d6c096
LP
1979
1980 assert(b || n == 0);
1981
1982 for (i = 0; i < n; i++) {
1983 free(b[i].source);
1984 free(b[i].destination);
1985 }
1986
1987 free(b);
1988}
1989
da6053d0 1990int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
d2d6c096
LP
1991 _cleanup_free_ char *s = NULL, *d = NULL;
1992 BindMount *c;
1993
1994 assert(b);
1995 assert(n);
1996 assert(item);
1997
1998 s = strdup(item->source);
1999 if (!s)
2000 return -ENOMEM;
2001
2002 d = strdup(item->destination);
2003 if (!d)
2004 return -ENOMEM;
2005
aa484f35 2006 c = reallocarray(*b, *n + 1, sizeof(BindMount));
d2d6c096
LP
2007 if (!c)
2008 return -ENOMEM;
2009
2010 *b = c;
2011
2012 c[(*n) ++] = (BindMount) {
1cc6c93a
YW
2013 .source = TAKE_PTR(s),
2014 .destination = TAKE_PTR(d),
d2d6c096 2015 .read_only = item->read_only,
9ce4e4b0 2016 .nosuid = item->nosuid,
d2d6c096
LP
2017 .recursive = item->recursive,
2018 .ignore_enoent = item->ignore_enoent,
2019 };
2020
d2d6c096
LP
2021 return 0;
2022}
2023
b3d13314
LB
2024MountImage* mount_image_free_many(MountImage *m, size_t *n) {
2025 size_t i;
2026
2027 assert(n);
2028 assert(m || *n == 0);
2029
2030 for (i = 0; i < *n; i++) {
2031 free(m[i].source);
2032 free(m[i].destination);
427353f6 2033 mount_options_free_all(m[i].mount_options);
b3d13314
LB
2034 }
2035
2036 free(m);
2037 *n = 0;
2038 return NULL;
2039}
2040
2041int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
2042 _cleanup_free_ char *s = NULL, *d = NULL;
427353f6
LB
2043 _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
2044 MountOptions *i;
b3d13314
LB
2045 MountImage *c;
2046
2047 assert(m);
2048 assert(n);
2049 assert(item);
2050
2051 s = strdup(item->source);
2052 if (!s)
2053 return -ENOMEM;
2054
2055 d = strdup(item->destination);
2056 if (!d)
2057 return -ENOMEM;
2058
427353f6
LB
2059 LIST_FOREACH(mount_options, i, item->mount_options) {
2060 _cleanup_(mount_options_free_allp) MountOptions *o;
2061
2062 o = new(MountOptions, 1);
2063 if (!o)
2064 return -ENOMEM;
2065
2066 *o = (MountOptions) {
2067 .partition_designator = i->partition_designator,
2068 .options = strdup(i->options),
2069 };
2070 if (!o->options)
2071 return -ENOMEM;
2072
2073 LIST_APPEND(mount_options, options, TAKE_PTR(o));
2074 }
2075
b3d13314
LB
2076 c = reallocarray(*m, *n + 1, sizeof(MountImage));
2077 if (!c)
2078 return -ENOMEM;
2079
2080 *m = c;
2081
2082 c[(*n) ++] = (MountImage) {
2083 .source = TAKE_PTR(s),
2084 .destination = TAKE_PTR(d),
427353f6 2085 .mount_options = TAKE_PTR(options),
b3d13314
LB
2086 .ignore_enoent = item->ignore_enoent,
2087 };
2088
2089 return 0;
2090}
2091
da6053d0
LP
2092void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
2093 size_t i;
2abd4e38
YW
2094
2095 assert(t || n == 0);
2096
2097 for (i = 0; i < n; i++) {
2098 free(t[i].path);
2099 free(t[i].options);
2100 }
2101
2102 free(t);
2103}
2104
2105int temporary_filesystem_add(
2106 TemporaryFileSystem **t,
da6053d0 2107 size_t *n,
2abd4e38
YW
2108 const char *path,
2109 const char *options) {
2110
2111 _cleanup_free_ char *p = NULL, *o = NULL;
2112 TemporaryFileSystem *c;
2113
2114 assert(t);
2115 assert(n);
2116 assert(path);
2117
2118 p = strdup(path);
2119 if (!p)
2120 return -ENOMEM;
2121
2122 if (!isempty(options)) {
2123 o = strdup(options);
2124 if (!o)
2125 return -ENOMEM;
2126 }
2127
aa484f35 2128 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
2abd4e38
YW
2129 if (!c)
2130 return -ENOMEM;
2131
2132 *t = c;
2133
2134 c[(*n) ++] = (TemporaryFileSystem) {
1cc6c93a
YW
2135 .path = TAKE_PTR(p),
2136 .options = TAKE_PTR(o),
2abd4e38
YW
2137 };
2138
2abd4e38
YW
2139 return 0;
2140}
2141
a652f050
JR
2142static int make_tmp_prefix(const char *prefix) {
2143 _cleanup_free_ char *t = NULL;
2144 int r;
2145
2146 /* Don't do anything unless we know the dir is actually missing */
2147 r = access(prefix, F_OK);
2148 if (r >= 0)
2149 return 0;
2150 if (errno != ENOENT)
2151 return -errno;
2152
2153 r = mkdir_parents(prefix, 0755);
2154 if (r < 0)
2155 return r;
2156
2157 r = tempfn_random(prefix, NULL, &t);
2158 if (r < 0)
2159 return r;
2160
2161 if (mkdir(t, 0777) < 0)
2162 return -errno;
2163
2164 if (chmod(t, 01777) < 0) {
2165 r = -errno;
2166 (void) rmdir(t);
2167 return r;
2168 }
2169
2170 if (rename(t, prefix) < 0) {
2171 r = -errno;
2172 (void) rmdir(t);
2173 return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
2174 }
2175
2176 return 0;
2177
2178}
2179
56a13a49 2180static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
613b411c 2181 _cleanup_free_ char *x = NULL;
19cd4e19 2182 _cleanup_free_ char *y = NULL;
6b46ea73
LP
2183 char bid[SD_ID128_STRING_MAX];
2184 sd_id128_t boot_id;
56a13a49 2185 bool rw = true;
6b46ea73 2186 int r;
613b411c
LP
2187
2188 assert(id);
2189 assert(prefix);
2190 assert(path);
2191
6b46ea73
LP
2192 /* We include the boot id in the directory so that after a
2193 * reboot we can easily identify obsolete directories. */
2194
2195 r = sd_id128_get_boot(&boot_id);
2196 if (r < 0)
2197 return r;
2198
605405c6 2199 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
2200 if (!x)
2201 return -ENOMEM;
2202
a652f050
JR
2203 r = make_tmp_prefix(prefix);
2204 if (r < 0)
2205 return r;
2206
613b411c 2207 RUN_WITH_UMASK(0077)
56a13a49
ZJS
2208 if (!mkdtemp(x)) {
2209 if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
2210 rw = false;
2211 else
2212 return -errno;
2213 }
613b411c 2214
56a13a49 2215 if (rw) {
19cd4e19 2216 y = strjoin(x, "/tmp");
2217 if (!y)
2218 return -ENOMEM;
2219
2220 RUN_WITH_UMASK(0000) {
2221 if (mkdir(y, 0777 | S_ISVTX) < 0)
2222 return -errno;
2223 }
2224
2225 r = label_fix_container(y, prefix, 0);
56a13a49
ZJS
2226 if (r < 0)
2227 return r;
19cd4e19 2228
2229 if (tmp_path)
2230 *tmp_path = TAKE_PTR(y);
56a13a49
ZJS
2231 } else {
2232 /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
2233 * read-only. This way the service will get the EROFS result as if it was writing to the real
2234 * file system. */
2235 r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
2236 if (r < 0)
2237 return r;
613b411c 2238
3f181262
LP
2239 r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
2240 if (r < 0)
2241 return r;
c17ec25e 2242 }
15ae422b 2243
1cc6c93a 2244 *path = TAKE_PTR(x);
613b411c
LP
2245 return 0;
2246}
2247
2248int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
56a13a49
ZJS
2249 _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
2250 _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
2251 char *b;
613b411c
LP
2252 int r;
2253
2254 assert(id);
2255 assert(tmp_dir);
2256 assert(var_tmp_dir);
2257
56a13a49 2258 r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
613b411c
LP
2259 if (r < 0)
2260 return r;
2261
56a13a49
ZJS
2262 r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
2263 if (r < 0)
613b411c 2264 return r;
613b411c 2265
56a13a49
ZJS
2266 a_tmp = mfree(a_tmp); /* avoid rmdir */
2267 *tmp_dir = TAKE_PTR(a);
2268 *var_tmp_dir = TAKE_PTR(b);
613b411c
LP
2269
2270 return 0;
2271}
2272
2caa38e9 2273int setup_netns(const int netns_storage_socket[static 2]) {
613b411c 2274 _cleanup_close_ int netns = -1;
3ee897d6 2275 int r, q;
613b411c
LP
2276
2277 assert(netns_storage_socket);
2278 assert(netns_storage_socket[0] >= 0);
2279 assert(netns_storage_socket[1] >= 0);
2280
2281 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
2282 * namespace reference fd. Whatever process runs this first
2283 * shall create a new namespace, all others should just join
2284 * it. To serialize that we use a file lock on the socket
2285 * pair.
613b411c
LP
2286 *
2287 * It's a bit crazy, but hey, works great! */
2288
2289 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
2290 return -errno;
2291
3ee897d6
LP
2292 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
2293 if (netns == -EAGAIN) {
44ffcbae 2294 /* Nothing stored yet, so let's create a new namespace. */
613b411c
LP
2295
2296 if (unshare(CLONE_NEWNET) < 0) {
2297 r = -errno;
2298 goto fail;
2299 }
2300
44ffcbae 2301 (void) loopback_setup();
613b411c
LP
2302
2303 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
2304 if (netns < 0) {
2305 r = -errno;
2306 goto fail;
2307 }
2308
2309 r = 1;
613b411c 2310
3ee897d6
LP
2311 } else if (netns < 0) {
2312 r = netns;
2313 goto fail;
613b411c 2314
3ee897d6
LP
2315 } else {
2316 /* Yay, found something, so let's join the namespace */
613b411c
LP
2317 if (setns(netns, CLONE_NEWNET) < 0) {
2318 r = -errno;
2319 goto fail;
2320 }
2321
2322 r = 0;
2323 }
2324
3ee897d6
LP
2325 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
2326 if (q < 0) {
2327 r = q;
613b411c
LP
2328 goto fail;
2329 }
2330
2331fail:
fe048ce5 2332 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
2333 return r;
2334}
417116f2 2335
2caa38e9 2336int open_netns_path(const int netns_storage_socket[static 2], const char *path) {
51af7fb2
LP
2337 _cleanup_close_ int netns = -1;
2338 int q, r;
2339
2340 assert(netns_storage_socket);
2341 assert(netns_storage_socket[0] >= 0);
2342 assert(netns_storage_socket[1] >= 0);
2343 assert(path);
2344
2345 /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in
2346 * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a
2347 * new anonymous netns if needed. */
2348
2349 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
2350 return -errno;
2351
2352 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
2353 if (netns == -EAGAIN) {
2354 /* Nothing stored yet. Open the file from the file system. */
2355
2356 netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
2357 if (netns < 0) {
2358 r = -errno;
2359 goto fail;
2360 }
2361
2362 r = fd_is_network_ns(netns);
2363 if (r == 0) { /* Not a netns? Refuse early. */
2364 r = -EINVAL;
2365 goto fail;
2366 }
2367 if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
2368 goto fail;
2369
2370 r = 1;
2371
2372 } else if (netns < 0) {
2373 r = netns;
2374 goto fail;
2375 } else
2376 r = 0; /* Already allocated */
2377
2378 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
2379 if (q < 0) {
2380 r = q;
2381 goto fail;
2382 }
2383
2384fail:
2385 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
2386 return r;
2387}
2388
6e2d7c4f
MS
2389bool ns_type_supported(NamespaceType type) {
2390 const char *t, *ns_proc;
2391
0fa5b831
LP
2392 t = namespace_type_to_string(type);
2393 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
2394 return false;
2395
6e2d7c4f 2396 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
2397 return access(ns_proc, F_OK) == 0;
2398}
2399
1b8689f9 2400static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
cbc056c8
ZJS
2401 [PROTECT_HOME_NO] = "no",
2402 [PROTECT_HOME_YES] = "yes",
1b8689f9 2403 [PROTECT_HOME_READ_ONLY] = "read-only",
cbc056c8 2404 [PROTECT_HOME_TMPFS] = "tmpfs",
417116f2
LP
2405};
2406
1e8c7bd5 2407DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
5e1c6154 2408
1b8689f9 2409static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
cbc056c8
ZJS
2410 [PROTECT_SYSTEM_NO] = "no",
2411 [PROTECT_SYSTEM_YES] = "yes",
2412 [PROTECT_SYSTEM_FULL] = "full",
3f815163 2413 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
2414};
2415
1e8c7bd5 2416DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
03c791aa 2417
6e2d7c4f 2418static const char* const namespace_type_table[] = {
cbc056c8 2419 [NAMESPACE_MOUNT] = "mnt",
6e2d7c4f 2420 [NAMESPACE_CGROUP] = "cgroup",
cbc056c8
ZJS
2421 [NAMESPACE_UTS] = "uts",
2422 [NAMESPACE_IPC] = "ipc",
2423 [NAMESPACE_USER] = "user",
2424 [NAMESPACE_PID] = "pid",
2425 [NAMESPACE_NET] = "net",
6e2d7c4f
MS
2426};
2427
2428DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
4e399953
LP
2429
2430static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
2431 [PROTECT_PROC_DEFAULT] = "default",
2432 [PROTECT_PROC_NOACCESS] = "noaccess",
2433 [PROTECT_PROC_INVISIBLE] = "invisible",
2434 [PROTECT_PROC_PTRACEABLE] = "ptraceable",
2435};
2436
2437DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
2438
2439static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
2440 [PROC_SUBSET_ALL] = "all",
2441 [PROC_SUBSET_PID] = "pid",
2442};
2443
2444DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);