]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/core/namespace.c
man/systemd-sysext: list ephemeral/ephemeral-import in the list of options
[thirdparty/systemd.git] / src / core / namespace.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <linux/loop.h>
4#include <sched.h>
5#include <stdio.h>
6#include <stdlib.h>
7#include <sys/file.h>
8#include <sys/mount.h>
9#include <unistd.h>
10
11#include "alloc-util.h"
12#include "base-filesystem.h"
13#include "chase.h"
14#include "dev-setup.h"
15#include "devnum-util.h"
16#include "dissect-image.h"
17#include "errno-util.h"
18#include "escape.h"
19#include "extension-util.h"
20#include "fd-util.h"
21#include "format-util.h"
22#include "fs-util.h"
23#include "glyph-util.h"
24#include "label-util.h"
25#include "list.h"
26#include "lock-util.h"
27#include "log.h"
28#include "loop-util.h"
29#include "loopback-setup.h"
30#include "missing_magic.h"
31#include "mkdir-label.h"
32#include "mount-util.h"
33#include "mountpoint-util.h"
34#include "namespace.h"
35#include "namespace-util.h"
36#include "nsflags.h"
37#include "nulstr-util.h"
38#include "os-util.h"
39#include "path-util.h"
40#include "pidref.h"
41#include "process-util.h"
42#include "selinux-util.h"
43#include "socket-util.h"
44#include "sort-util.h"
45#include "stat-util.h"
46#include "string-table.h"
47#include "string-util.h"
48#include "strv.h"
49#include "tmpfile-util.h"
50#include "umask-util.h"
51#include "user-util.h"
52#include "vpick.h"
53
/* Flag set combining MS_NOSUID, MS_NOEXEC and MS_STRICTATIME. Judging by the name this is meant for /dev
 * mounts; the user of the macro is outside this part of the file. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
55
/* Kind of mount a MountEntry describes. The numeric order is significant: mount_path_compare() uses it as
 * tie-breaker for entries with the same path, so lower values sort first. */
typedef enum MountMode {
        /* This is ordered by priority! */
        MOUNT_INACCESSIBLE,
        MOUNT_OVERLAY,
        MOUNT_IMAGE,
        MOUNT_BIND,
        MOUNT_BIND_RECURSIVE,
        MOUNT_PRIVATE_TMP,
        MOUNT_PRIVATE_DEV,
        MOUNT_BIND_DEV,
        MOUNT_EMPTY_DIR,
        MOUNT_PRIVATE_SYSFS,
        MOUNT_BIND_SYSFS,
        MOUNT_PROCFS,
        MOUNT_PRIVATE_CGROUP2FS,
        MOUNT_READ_ONLY,
        MOUNT_READ_WRITE,
        MOUNT_NOEXEC,
        MOUNT_EXEC,
        MOUNT_TMPFS,
        MOUNT_RUN,
        /* The next three are set up outside the root directory and serve as sources for later mounts. */
        MOUNT_PRIVATE_TMPFS,       /* mounted */
        MOUNT_EXTENSION_DIRECTORY, /* bind-mounted */
        MOUNT_EXTENSION_IMAGE,     /* mounted */
        MOUNT_MQUEUEFS,
        MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
        _MOUNT_MODE_MAX,
        _MOUNT_MODE_INVALID = -EINVAL,
} MountMode;
85
/* Processing state of a single MountEntry. Zero-initialized entries start out as MOUNT_PENDING. */
typedef enum MountEntryState {
        MOUNT_PENDING,  /* not handled yet */
        MOUNT_APPLIED,
        MOUNT_SKIPPED,
        _MOUNT_ENTRY_STATE_MAX,
        _MOUNT_ENTRY_STATE_INVALID = -EINVAL,
} MountEntryState;
93
94typedef struct MountEntry {
95 const char *path_const; /* Memory allocated on stack or static */
96 MountMode mode;
97 bool ignore:1; /* Ignore if path does not exist? */
98 bool has_prefix:1; /* Already prefixed by the root dir? */
99 bool read_only:1; /* Shall this mount point be read-only? */
100 bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
101 bool noexec:1; /* Shall set MS_NOEXEC on the mount itself */
102 bool exec:1; /* Shall clear MS_NOEXEC on the mount itself */
103 bool create_source_dir:1; /* Create the source directory if it doesn't exist - for implicit bind mounts */
104 mode_t source_dir_mode; /* Mode for the source directory, if it is to be created */
105 MountEntryState state; /* Whether it was already processed or skipped */
106 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
107 const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
108 char *unprefixed_path_malloc;
109 const char *source_const; /* The source path, for bind mounts or images */
110 char *source_malloc;
111 const char *options_const;/* Mount options for tmpfs */
112 char *options_malloc;
113 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
114 unsigned n_followed;
115 LIST_HEAD(MountOptions, image_options_const);
116 char **overlay_layers;
117 VeritySettings verity;
118 ImageClass filter_class; /* Used for live updates to skip inapplicable images */
119 bool idmapped;
120 uid_t idmap_uid;
121 gid_t idmap_gid;
122} MountEntry;
123
124typedef struct MountList {
125 MountEntry *mounts;
126 size_t n_mounts;
127} MountList;
128
129static const BindMount bind_log_sockets_table[] = {
130 { (char*) "/run/systemd/journal/socket", (char*) "/run/systemd/journal/socket", .read_only = true, .nosuid = true, .noexec = true, .nodev = true, .ignore_enoent = true },
131 { (char*) "/run/systemd/journal/stdout", (char*) "/run/systemd/journal/stdout", .read_only = true, .nosuid = true, .noexec = true, .nodev = true, .ignore_enoent = true },
132 { (char*) "/run/systemd/journal/dev-log", (char*) "/run/systemd/journal/dev-log", .read_only = true, .nosuid = true, .noexec = true, .nodev = true, .ignore_enoent = true },
133};
134
135/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted
136 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
137static const MountEntry apivfs_table[] = {
138 { "/proc", MOUNT_PROCFS, false },
139 { "/dev", MOUNT_BIND_DEV, false },
140 { "/sys", MOUNT_BIND_SYSFS, false },
141 { "/run", MOUNT_RUN, false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
142};
143
144/* ProtectKernelTunables= option and the related filesystem APIs */
145static const MountEntry protect_kernel_tunables_proc_table[] = {
146 { "/proc/acpi", MOUNT_READ_ONLY, true },
147 { "/proc/apm", MOUNT_READ_ONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
148 { "/proc/asound", MOUNT_READ_ONLY, true },
149 { "/proc/bus", MOUNT_READ_ONLY, true },
150 { "/proc/fs", MOUNT_READ_ONLY, true },
151 { "/proc/irq", MOUNT_READ_ONLY, true },
152 { "/proc/kallsyms", MOUNT_INACCESSIBLE, true },
153 { "/proc/kcore", MOUNT_INACCESSIBLE, true },
154 { "/proc/latency_stats", MOUNT_READ_ONLY, true },
155 { "/proc/mtrr", MOUNT_READ_ONLY, true },
156 { "/proc/scsi", MOUNT_READ_ONLY, true },
157 { "/proc/sys", MOUNT_READ_ONLY, true },
158 { "/proc/sysrq-trigger", MOUNT_READ_ONLY, true },
159 { "/proc/timer_stats", MOUNT_READ_ONLY, true },
160};
161
162static const MountEntry protect_kernel_tunables_sys_table[] = {
163 { "/sys", MOUNT_READ_ONLY, false },
164 { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
165 { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
166 { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
167 { "/sys/kernel/debug", MOUNT_READ_ONLY, true },
168 { "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
169};
170
171/* ProtectKernelModules= option */
172static const MountEntry protect_kernel_modules_table[] = {
173 { "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
174};
175
176/* ProtectKernelLogs= option */
177static const MountEntry protect_kernel_logs_proc_table[] = {
178 { "/proc/kmsg", MOUNT_INACCESSIBLE, true },
179};
180
181static const MountEntry protect_kernel_logs_dev_table[] = {
182 { "/dev/kmsg", MOUNT_INACCESSIBLE, true },
183};
184
185/*
186 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
187 * system should be protected by ProtectSystem=
188 */
189static const MountEntry protect_home_read_only_table[] = {
190 { "/home", MOUNT_READ_ONLY, true },
191 { "/run/user", MOUNT_READ_ONLY, true },
192 { "/root", MOUNT_READ_ONLY, true },
193};
194
195/* ProtectHome=tmpfs table */
196static const MountEntry protect_home_tmpfs_table[] = {
197 { "/home", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
198 { "/run/user", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
199 { "/root", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
200};
201
202/* ProtectHome=yes table */
203static const MountEntry protect_home_yes_table[] = {
204 { "/home", MOUNT_INACCESSIBLE, true },
205 { "/run/user", MOUNT_INACCESSIBLE, true },
206 { "/root", MOUNT_INACCESSIBLE, true },
207};
208
209/* ProtectControlGroups=yes table */
210static const MountEntry protect_control_groups_yes_table[] = {
211 { "/sys/fs/cgroup", MOUNT_READ_ONLY, false },
212};
213
214/* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so
215 * flags is not set here. */
216static const MountEntry protect_control_groups_private_table[] = {
217 { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false },
218};
219
220/* ProtectControlGroups=strict table */
221static const MountEntry protect_control_groups_strict_table[] = {
222 { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true },
223};
224
225/* ProtectSystem=yes table */
226static const MountEntry protect_system_yes_table[] = {
227 { "/usr", MOUNT_READ_ONLY, false },
228 { "/boot", MOUNT_READ_ONLY, true },
229 { "/efi", MOUNT_READ_ONLY, true },
230};
231
232/* ProtectSystem=full includes ProtectSystem=yes */
233static const MountEntry protect_system_full_table[] = {
234 { "/usr", MOUNT_READ_ONLY, false },
235 { "/boot", MOUNT_READ_ONLY, true },
236 { "/efi", MOUNT_READ_ONLY, true },
237 { "/etc", MOUNT_READ_ONLY, false },
238};
239
240/* ProtectSystem=strict table. In this strict mode, we mount everything read-only, except for /proc, /dev,
241 * /sys which are the kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
242 * protect those, and these options should be fully orthogonal. (And of course /home and friends are also
243 * left writable, as ProtectHome= shall manage those, orthogonally).
244 */
245static const MountEntry protect_system_strict_table[] = {
246 { "/", MOUNT_READ_ONLY, false },
247 { "/proc", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
248 { "/sys", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
249 { "/dev", MOUNT_READ_WRITE_IMPLICIT, false }, /* PrivateDevices= */
250 { "/home", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
251 { "/run/user", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
252 { "/root", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
253};
254
255/* ProtectHostname=yes able */
256static const MountEntry protect_hostname_yes_table[] = {
257 { "/proc/sys/kernel/hostname", MOUNT_READ_ONLY, false },
258 { "/proc/sys/kernel/domainname", MOUNT_READ_ONLY, false },
259};
260
261static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
262 [MOUNT_INACCESSIBLE] = "inaccessible",
263 [MOUNT_OVERLAY] = "overlay",
264 [MOUNT_IMAGE] = "image",
265 [MOUNT_BIND] = "bind",
266 [MOUNT_BIND_RECURSIVE] = "bind-recursive",
267 [MOUNT_PRIVATE_TMP] = "private-tmp",
268 [MOUNT_PRIVATE_DEV] = "private-dev",
269 [MOUNT_BIND_DEV] = "bind-dev",
270 [MOUNT_EMPTY_DIR] = "empty-dir",
271 [MOUNT_PRIVATE_SYSFS] = "private-sysfs",
272 [MOUNT_BIND_SYSFS] = "bind-sysfs",
273 [MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs",
274 [MOUNT_PROCFS] = "procfs",
275 [MOUNT_READ_ONLY] = "read-only",
276 [MOUNT_READ_WRITE] = "read-write",
277 [MOUNT_NOEXEC] = "noexec",
278 [MOUNT_EXEC] = "exec",
279 [MOUNT_TMPFS] = "tmpfs",
280 [MOUNT_RUN] = "run",
281 [MOUNT_PRIVATE_TMPFS] = "private-tmpfs",
282 [MOUNT_EXTENSION_DIRECTORY] = "extension-directory",
283 [MOUNT_EXTENSION_IMAGE] = "extension-image",
284 [MOUNT_MQUEUEFS] = "mqueuefs",
285 [MOUNT_READ_WRITE_IMPLICIT] = "read-write-implicit",
286};
287
288/* Helper struct for naming simplicity and reusability */
289static const struct {
290 const char *level_env;
291 const char *level_env_print;
292} image_class_info[_IMAGE_CLASS_MAX] = {
293 [IMAGE_SYSEXT] = {
294 .level_env = "SYSEXT_LEVEL",
295 .level_env_print = " SYSEXT_LEVEL=",
296 },
297 [IMAGE_CONFEXT] = {
298 .level_env = "CONFEXT_LEVEL",
299 .level_env_print = " CONFEXT_LEVEL=",
300 }
301};
302
/* Generates the file-local mount_mode_to_string() lookup on top of mount_mode_table[]. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
304
305static const char* mount_entry_path(const MountEntry *p) {
306 assert(p);
307
308 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
309 * otherwise the stack/static ->path field is returned. */
310
311 return p->path_malloc ?: p->path_const;
312}
313
314static const char* mount_entry_unprefixed_path(const MountEntry *p) {
315 assert(p);
316
317 /* Returns the unprefixed path (ie: before prefix_where_needed() ran), if any */
318
319 return p->unprefixed_path_malloc ?: p->unprefixed_path_const ?: mount_entry_path(p);
320}
321
322static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
323 assert(p);
324 assert(p->path_malloc || p->path_const);
325 assert(new_path);
326
327 /* Saves current path in unprefixed_ variable, and takes over new_path */
328
329 free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
330 /* If we didn't have a path on the heap, then it's a static one */
331 if (!p->unprefixed_path_malloc)
332 p->unprefixed_path_const = p->path_const;
333 p->path_malloc = new_path;
334 p->has_prefix = true;
335}
336
337static bool mount_entry_read_only(const MountEntry *p) {
338 assert(p);
339
340 return p->read_only || IN_SET(p->mode, MOUNT_READ_ONLY, MOUNT_INACCESSIBLE);
341}
342
343static bool mount_entry_noexec(const MountEntry *p) {
344 assert(p);
345
346 return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS);
347}
348
349static bool mount_entry_exec(const MountEntry *p) {
350 assert(p);
351
352 return p->exec || p->mode == MOUNT_EXEC;
353}
354
355static const char* mount_entry_source(const MountEntry *p) {
356 assert(p);
357
358 return p->source_malloc ?: p->source_const;
359}
360
361static const char* mount_entry_options(const MountEntry *p) {
362 assert(p);
363
364 return p->options_malloc ?: p->options_const;
365}
366
367static void mount_entry_done(MountEntry *p) {
368 assert(p);
369
370 p->path_malloc = mfree(p->path_malloc);
371 p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
372 p->source_malloc = mfree(p->source_malloc);
373 p->options_malloc = mfree(p->options_malloc);
374 p->overlay_layers = strv_free(p->overlay_layers);
375 verity_settings_done(&p->verity);
376}
377
378static void mount_list_done(MountList *ml) {
379 assert(ml);
380
381 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts)
382 mount_entry_done(m);
383
384 ml->mounts = mfree(ml->mounts);
385 ml->n_mounts = 0;
386}
387
388static MountEntry* mount_list_extend(MountList *ml) {
389 assert(ml);
390
391 if (!GREEDY_REALLOC0(ml->mounts, ml->n_mounts+1))
392 return NULL;
393
394 return ml->mounts + ml->n_mounts++;
395}
396
397static int append_access_mounts(MountList *ml, char **strv, MountMode mode, bool forcibly_require_prefix) {
398 assert(ml);
399
400 /* Adds a list of user-supplied READ_WRITE/READ_WRITE_IMPLICIT/READ_ONLY/INACCESSIBLE entries */
401
402 STRV_FOREACH(i, strv) {
403 bool ignore = false, needs_prefix = false;
404 const char *e = *i;
405
406 /* Look for any prefixes */
407 if (startswith(e, "-")) {
408 e++;
409 ignore = true;
410 }
411 if (startswith(e, "+")) {
412 e++;
413 needs_prefix = true;
414 }
415
416 if (!path_is_absolute(e))
417 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", e);
418
419 MountEntry *me = mount_list_extend(ml);
420 if (!me)
421 return log_oom_debug();
422
423 *me = (MountEntry) {
424 .path_const = e,
425 .mode = mode,
426 .ignore = ignore,
427 .has_prefix = !needs_prefix && !forcibly_require_prefix,
428 };
429 }
430
431 return 0;
432}
433
434static int append_empty_dir_mounts(MountList *ml, char **strv) {
435 assert(ml);
436
437 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
438 * "/private/" boundary directories for DynamicUser=1. */
439
440 STRV_FOREACH(i, strv) {
441 MountEntry *me = mount_list_extend(ml);
442 if (!me)
443 return log_oom_debug();
444
445 *me = (MountEntry) {
446 .path_const = *i,
447 .mode = MOUNT_EMPTY_DIR,
448 .ignore = false,
449 .read_only = true,
450 .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
451 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
452 };
453 }
454
455 return 0;
456}
457
458static int append_bind_mounts(MountList *ml, const BindMount *binds, size_t n) {
459 assert(ml);
460 assert(binds || n == 0);
461
462 FOREACH_ARRAY(b, binds, n) {
463 MountEntry *me = mount_list_extend(ml);
464 if (!me)
465 return log_oom_debug();
466
467 *me = (MountEntry) {
468 .path_const = b->destination,
469 .mode = b->recursive ? MOUNT_BIND_RECURSIVE : MOUNT_BIND,
470 .read_only = b->read_only,
471 .nosuid = b->nosuid,
472 .noexec = b->noexec,
473 .flags = b->nodev ? MS_NODEV : 0,
474 .source_const = b->source,
475 .ignore = b->ignore_enoent,
476 .idmapped = b->idmapped,
477 .idmap_uid = b->uid,
478 .idmap_gid = b->gid,
479 };
480 }
481
482 return 0;
483}
484
485static int append_mount_images(MountList *ml, const MountImage *mount_images, size_t n) {
486 int r;
487
488 assert(ml);
489 assert(mount_images || n == 0);
490
491 FOREACH_ARRAY(m, mount_images, n) {
492 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
493 MountEntry *me = mount_list_extend(ml);
494 if (!me)
495 return log_oom_debug();
496
497 r = verity_settings_load(&verity, m->source, /* root_hash_path= */ NULL, /* root_hash_sig_path= */ NULL);
498 if (r < 0)
499 return log_debug_errno(r, "Failed to check verity root hash of %s: %m", m->source);
500
501 *me = (MountEntry) {
502 .path_const = m->destination,
503 .mode = MOUNT_IMAGE,
504 .source_const = m->source,
505 .image_options_const = m->mount_options,
506 .ignore = m->ignore_enoent,
507 .verity = TAKE_GENERIC(verity, VeritySettings, VERITY_SETTINGS_DEFAULT),
508 .filter_class = _IMAGE_CLASS_INVALID,
509 };
510 }
511
512 return 0;
513}
514
515static int append_extensions(
516 MountList *ml,
517 const char *root,
518 const char *private_namespace_dir,
519 char **hierarchies,
520 const MountImage *mount_images,
521 size_t n_mount_images,
522 char **extension_directories) {
523
524 char ***overlays = NULL;
525 size_t n_overlays = 0;
526 int r;
527
528 assert(ml);
529
530 if (n_mount_images == 0 && strv_isempty(extension_directories))
531 return 0;
532
533 assert(private_namespace_dir);
534
535 n_overlays = strv_length(hierarchies);
536 if (n_overlays == 0)
537 return 0;
538
539 /* Prepare a list of overlays, that will have as each element a strv containing all the layers that
540 * will later be concatenated as a lowerdir= parameter for the mount operation.
541 * The overlays vector will have the same number of elements and will correspond to the
542 * hierarchies vector, so they can be iterated upon together. */
543 overlays = new0(char**, n_overlays);
544 if (!overlays)
545 return -ENOMEM;
546
547 CLEANUP_ARRAY(overlays, n_overlays, strv_free_many);
548
549 /* First, prepare a mount for each image, but these won't be visible to the unit, instead
550 * they will be mounted in our propagate directory, and used as a source for the overlay. */
551 for (size_t i = 0; i < n_mount_images; i++) {
552 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
553 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
554 _cleanup_free_ char *mount_point = NULL;
555 const MountImage *m = mount_images + i;
556
557 r = path_pick(/* toplevel_path= */ NULL,
558 /* toplevel_fd= */ AT_FDCWD,
559 m->source,
560 &pick_filter_image_raw,
561 PICK_ARCHITECTURE|PICK_TRIES,
562 &result);
563 if (r == -ENOENT && m->ignore_enoent)
564 continue;
565 if (r < 0)
566 return r;
567 if (!result.path) {
568 if (m->ignore_enoent)
569 continue;
570
571 return log_debug_errno(
572 SYNTHETIC_ERRNO(ENOENT),
573 "No matching entry in .v/ directory %s found.",
574 m->source);
575 }
576
577 r = verity_settings_load(&verity, result.path, /* root_hash_path= */ NULL, /* root_hash_sig_path= */ NULL);
578 if (r < 0)
579 return log_debug_errno(r, "Failed to check verity root hash of %s: %m", result.path);
580
581 if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, i) < 0)
582 return -ENOMEM;
583
584 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
585 char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
586 if (!prefixed_hierarchy)
587 return -ENOMEM;
588
589 r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
590 if (r < 0)
591 return r;
592 }
593
594 MountEntry *me = mount_list_extend(ml);
595 if (!me)
596 return -ENOMEM;
597
598 *me = (MountEntry) {
599 .path_malloc = TAKE_PTR(mount_point),
600 .image_options_const = m->mount_options,
601 .ignore = m->ignore_enoent,
602 .source_malloc = TAKE_PTR(result.path),
603 .mode = MOUNT_EXTENSION_IMAGE,
604 .has_prefix = true,
605 .verity = TAKE_GENERIC(verity, VeritySettings, VERITY_SETTINGS_DEFAULT),
606 .filter_class = _IMAGE_CLASS_INVALID,
607 };
608 }
609
610 /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
611 * Bind mount them in the same location as the ExtensionImages, so that we
612 * can check that they are valid trees (extension-release.d). */
613 STRV_FOREACH(extension_directory, extension_directories) {
614 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
615 _cleanup_free_ char *mount_point = NULL;
616 const char *e = *extension_directory;
617 bool ignore_enoent = false;
618
619 /* Look for any prefixes */
620 if (startswith(e, "-")) {
621 e++;
622 ignore_enoent = true;
623 }
624 /* Ignore this for now */
625 if (startswith(e, "+"))
626 e++;
627
628 r = path_pick(/* toplevel_path= */ NULL,
629 /* toplevel_fd= */ AT_FDCWD,
630 e,
631 &pick_filter_image_dir,
632 PICK_ARCHITECTURE|PICK_TRIES,
633 &result);
634 if (r == -ENOENT && ignore_enoent)
635 continue;
636 if (r < 0)
637 return r;
638 if (!result.path) {
639 if (ignore_enoent)
640 continue;
641
642 return log_debug_errno(
643 SYNTHETIC_ERRNO(ENOENT),
644 "No matching entry in .v/ directory %s found.",
645 e);
646 }
647
648 /* Pick up the counter where the ExtensionImages left it. */
649 if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, n_mount_images++) < 0)
650 return -ENOMEM;
651
652 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
653 char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
654 if (!prefixed_hierarchy)
655 return -ENOMEM;
656
657 r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
658 if (r < 0)
659 return r;
660 }
661
662 MountEntry *me = mount_list_extend(ml);
663 if (!me)
664 return -ENOMEM;
665
666 *me = (MountEntry) {
667 .path_malloc = TAKE_PTR(mount_point),
668 .source_malloc = TAKE_PTR(result.path),
669 .mode = MOUNT_EXTENSION_DIRECTORY,
670 .ignore = ignore_enoent,
671 .has_prefix = true,
672 .read_only = true,
673 .filter_class = _IMAGE_CLASS_INVALID,
674 };
675 }
676
677 /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
678 * set up earlier. */
679 for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
680 _cleanup_free_ char *prefixed_hierarchy = NULL;
681
682 prefixed_hierarchy = path_join(root, hierarchies[i]);
683 if (!prefixed_hierarchy)
684 return -ENOMEM;
685
686 MountEntry *me = mount_list_extend(ml);
687 if (!me)
688 return -ENOMEM;
689
690 *me = (MountEntry) {
691 .path_malloc = TAKE_PTR(prefixed_hierarchy),
692 .overlay_layers = TAKE_PTR(overlays[i]),
693 .mode = MOUNT_OVERLAY,
694 .has_prefix = true,
695 .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
696 };
697 }
698
699 return 0;
700}
701
702static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, size_t n) {
703 assert(ml);
704 assert(tmpfs || n == 0);
705
706 FOREACH_ARRAY(t, tmpfs, n) {
707 _cleanup_free_ char *o = NULL, *str = NULL;
708 unsigned long flags;
709 bool ro = false;
710 int r;
711
712 if (!path_is_absolute(t->path))
713 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", t->path);
714
715 str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
716 if (!str)
717 return -ENOMEM;
718
719 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
720 if (r < 0)
721 return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
722
723 ro = flags & MS_RDONLY;
724 flags &= ~MS_RDONLY;
725
726 MountEntry *me = mount_list_extend(ml);
727 if (!me)
728 return log_oom_debug();
729
730 *me = (MountEntry) {
731 .path_const = t->path,
732 .mode = MOUNT_TMPFS,
733 .read_only = ro,
734 .options_malloc = TAKE_PTR(o),
735 .flags = flags,
736 };
737 }
738
739 return 0;
740}
741
742static int append_private_tmp(MountList *ml, const NamespaceParameters *p) {
743 MountEntry *me;
744
745 assert(ml);
746 assert(p);
747 assert(p->private_tmp == p->private_var_tmp ||
748 (p->private_tmp == PRIVATE_TMP_DISCONNECTED && p->private_var_tmp == PRIVATE_TMP_NO));
749
750 if (p->tmp_dir) {
751 assert(p->private_tmp == PRIVATE_TMP_CONNECTED);
752
753 me = mount_list_extend(ml);
754 if (!me)
755 return log_oom_debug();
756 *me = (MountEntry) {
757 .path_const = "/tmp/",
758 .mode = MOUNT_PRIVATE_TMP,
759 .read_only = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY),
760 .source_const = p->tmp_dir,
761 };
762 }
763
764 if (p->var_tmp_dir) {
765 assert(p->private_var_tmp == PRIVATE_TMP_CONNECTED);
766
767 me = mount_list_extend(ml);
768 if (!me)
769 return log_oom_debug();
770 *me = (MountEntry) {
771 .path_const = "/var/tmp/",
772 .mode = MOUNT_PRIVATE_TMP,
773 .read_only = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY),
774 .source_const = p->var_tmp_dir,
775 };
776 }
777
778 if (p->private_tmp != PRIVATE_TMP_DISCONNECTED)
779 return 0;
780
781 if (p->private_var_tmp == PRIVATE_TMP_NO) {
782 me = mount_list_extend(ml);
783 if (!me)
784 return log_oom_debug();
785 *me = (MountEntry) {
786 .path_const = "/tmp/",
787 .mode = MOUNT_PRIVATE_TMPFS,
788 .options_const = "mode=0700" NESTED_TMPFS_LIMITS,
789 .flags = MS_NODEV|MS_STRICTATIME,
790 };
791
792 return 0;
793 }
794
795 _cleanup_free_ char *tmpfs_dir = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
796 tmpfs_dir = path_join(p->private_namespace_dir, "unit-private-tmp");
797 tmp_dir = path_join(tmpfs_dir, "tmp");
798 var_tmp_dir = path_join(tmpfs_dir, "var-tmp");
799 if (!tmpfs_dir || !tmp_dir || !var_tmp_dir)
800 return log_oom_debug();
801
802 me = mount_list_extend(ml);
803 if (!me)
804 return log_oom_debug();
805 *me = (MountEntry) {
806 .path_malloc = TAKE_PTR(tmpfs_dir),
807 .mode = MOUNT_PRIVATE_TMPFS,
808 .options_const = "mode=0700" NESTED_TMPFS_LIMITS,
809 .flags = MS_NODEV|MS_STRICTATIME,
810 .has_prefix = true,
811 };
812
813 me = mount_list_extend(ml);
814 if (!me)
815 return log_oom_debug();
816 *me = (MountEntry) {
817 .source_malloc = TAKE_PTR(tmp_dir),
818 .path_const = "/tmp/",
819 .mode = MOUNT_BIND,
820 .source_dir_mode = 01777,
821 .create_source_dir = true,
822 };
823
824 me = mount_list_extend(ml);
825 if (!me)
826 return log_oom_debug();
827 *me = (MountEntry) {
828 .source_malloc = TAKE_PTR(var_tmp_dir),
829 .path_const = "/var/tmp/",
830 .mode = MOUNT_BIND,
831 .source_dir_mode = 01777,
832 .create_source_dir = true,
833 };
834
835 return 0;
836}
837
838static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t n, bool ignore_protect) {
839 assert(ml);
840 assert(mounts || n == 0);
841
842 /* Adds a list of static pre-defined entries */
843
844 FOREACH_ARRAY(m, mounts, n) {
845 MountEntry *me = mount_list_extend(ml);
846 if (!me)
847 return log_oom_debug();
848
849 /* No dynamic values allowed. */
850 assert(m->path_const);
851 assert(!m->path_malloc);
852 assert(!m->unprefixed_path_malloc);
853 assert(!m->source_malloc);
854 assert(!m->options_malloc);
855 assert(!m->overlay_layers);
856
857 *me = *m;
858 me->ignore = me->ignore || ignore_protect;
859 }
860
861 return 0;
862}
863
864static int append_protect_control_groups(MountList *ml, ProtectControlGroups protect_control_groups, bool ignore_protect) {
865 assert(ml);
866
867 switch (protect_control_groups) {
868
869 case PROTECT_CONTROL_GROUPS_NO:
870 return 0;
871
872 case PROTECT_CONTROL_GROUPS_YES:
873 return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
874
875 case PROTECT_CONTROL_GROUPS_PRIVATE:
876 return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
877
878 case PROTECT_CONTROL_GROUPS_STRICT:
879 return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
880
881 default:
882 assert_not_reached();
883 }
884}
885
886static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) {
887 assert(ml);
888
889 switch (protect_home) {
890
891 case PROTECT_HOME_NO:
892 return 0;
893
894 case PROTECT_HOME_READ_ONLY:
895 return append_static_mounts(ml, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
896
897 case PROTECT_HOME_TMPFS:
898 return append_static_mounts(ml, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
899
900 case PROTECT_HOME_YES:
901 return append_static_mounts(ml, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
902
903 default:
904 assert_not_reached();
905 }
906}
907
908static int append_protect_system(MountList *ml, ProtectSystem protect_system, bool ignore_protect) {
909 assert(ml);
910
911 switch (protect_system) {
912
913 case PROTECT_SYSTEM_NO:
914 return 0;
915
916 case PROTECT_SYSTEM_STRICT:
917 return append_static_mounts(ml, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
918
919 case PROTECT_SYSTEM_YES:
920 return append_static_mounts(ml, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
921
922 case PROTECT_SYSTEM_FULL:
923 return append_static_mounts(ml, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
924
925 default:
926 assert_not_reached();
927 }
928}
929
930static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
931 int d;
932
933 /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first
934 * regardless of the prefix - they are set up in the propagate directory anyway */
935 d = -CMP(a->mode == MOUNT_EXTENSION_IMAGE, b->mode == MOUNT_EXTENSION_IMAGE);
936 if (d != 0)
937 return d;
938 d = -CMP(a->mode == MOUNT_EXTENSION_DIRECTORY, b->mode == MOUNT_EXTENSION_DIRECTORY);
939 if (d != 0)
940 return d;
941
942 /* MOUNT_PRIVATE_TMPFS needs to be set up earlier, especially than MOUNT_BIND. */
943 d = -CMP(a->mode == MOUNT_PRIVATE_TMPFS, b->mode == MOUNT_PRIVATE_TMPFS);
944 if (d != 0)
945 return d;
946
947 /* If the paths are not equal, then order prefixes first */
948 d = path_compare(mount_entry_path(a), mount_entry_path(b));
949 if (d != 0)
950 return d;
951
952 /* If the paths are equal, check the mode */
953 return CMP((int) a->mode, (int) b->mode);
954}
955
956static int prefix_where_needed(MountList *ml, const char *root_directory) {
957 /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
958
959 assert(ml);
960
961 FOREACH_ARRAY(me, ml->mounts, ml->n_mounts) {
962 char *s;
963
964 if (me->has_prefix)
965 continue;
966
967 s = path_join(root_directory, mount_entry_path(me));
968 if (!s)
969 return -ENOMEM;
970
971 mount_entry_consume_prefix(me, s);
972 }
973
974 return 0;
975}
976
977static bool verity_has_later_duplicates(MountList *ml, const MountEntry *needle) {
978
979 assert(ml);
980 assert(needle);
981 assert(needle >= ml->mounts && needle < ml->mounts + ml->n_mounts);
982 assert(needle->mode == MOUNT_EXTENSION_IMAGE);
983
984 if (needle->verity.root_hash_size == 0)
985 return false;
986
987 /* Overlayfs rejects supplying the same directory inode twice as determined by filesystem UUID and
988 * file handle in lowerdir=, even if they are mounted on different paths, as it resolves each mount
989 * to its source filesystem, so drop duplicates, and keep the last one. This only covers non-DDI
990 * verity images. Note that the list is ordered, so we only check for the reminder of the list for
991 * each item, rather than the full list from the beginning, as any earlier duplicates will have
992 * already been pruned. */
993
994 for (const MountEntry *m = needle + 1; m < ml->mounts + ml->n_mounts; m++) {
995 if (m->mode != MOUNT_EXTENSION_IMAGE)
996 continue;
997 if (memcmp_nn(m->verity.root_hash,
998 m->verity.root_hash_size,
999 needle->verity.root_hash,
1000 needle->verity.root_hash_size) == 0)
1001 return true;
1002 }
1003
1004 return false;
1005}
1006
1007static void drop_duplicates(MountList *ml) {
1008 MountEntry *f, *t, *previous;
1009
1010 assert(ml);
1011
1012 /* Drops duplicate entries. Expects that the array is properly ordered already. */
1013
1014 for (f = ml->mounts, t = ml->mounts, previous = NULL; f < ml->mounts + ml->n_mounts; f++) {
1015
1016 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
1017 * above. Note that we only drop duplicates that haven't been mounted yet. */
1018 if (previous &&
1019 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
1020 f->state == MOUNT_PENDING && previous->state == MOUNT_PENDING) {
1021 log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
1022 /* Propagate the flags to the remaining entry */
1023 previous->read_only = previous->read_only || mount_entry_read_only(f);
1024 previous->noexec = previous->noexec || mount_entry_noexec(f);
1025 previous->exec = previous->exec || mount_entry_exec(f);
1026 mount_entry_done(f);
1027 continue;
1028 }
1029
1030 if (f->mode == MOUNT_EXTENSION_IMAGE && verity_has_later_duplicates(ml, f)) {
1031 log_debug("Skipping duplicate extension image %s", mount_entry_source(f));
1032 mount_entry_done(f);
1033 continue;
1034 }
1035
1036 *t = *f;
1037 previous = t;
1038 t++;
1039 }
1040
1041 ml->n_mounts = t - ml->mounts;
1042}
1043
1044static void drop_inaccessible(MountList *ml) {
1045 MountEntry *f, *t;
1046 const char *clear = NULL;
1047
1048 assert(ml);
1049
1050 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
1051 * ordered already. */
1052
1053 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
1054
1055 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
1056 * it, as inaccessible paths really should drop the entire subtree. */
1057 if (clear && path_startswith(mount_entry_path(f), clear)) {
1058 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1059 mount_entry_done(f);
1060 continue;
1061 }
1062
1063 clear = f->mode == MOUNT_INACCESSIBLE ? mount_entry_path(f) : NULL;
1064
1065 *t = *f;
1066 t++;
1067 }
1068
1069 ml->n_mounts = t - ml->mounts;
1070}
1071
1072static void drop_nop(MountList *ml) {
1073 MountEntry *f, *t;
1074
1075 assert(ml);
1076
1077 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
1078 * list is ordered by prefixes. */
1079
1080 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
1081
1082 /* Only suppress such subtrees for READ_ONLY, READ_WRITE and READ_WRITE_IMPLICIT entries */
1083 if (IN_SET(f->mode, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_READ_WRITE_IMPLICIT)) {
1084 MountEntry *found = NULL;
1085
1086 /* Now let's find the first parent of the entry we are looking at. */
1087 for (MountEntry *p = PTR_SUB1(t, ml->mounts); p; p = PTR_SUB1(p, ml->mounts))
1088 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
1089 found = p;
1090 break;
1091 }
1092
1093 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
1094 if (found && found->mode == f->mode) {
1095 log_debug("%s (%s) is made redundant by %s (%s)",
1096 mount_entry_path(f), mount_mode_to_string(f->mode),
1097 mount_entry_path(found), mount_mode_to_string(found->mode));
1098 mount_entry_done(f);
1099 continue;
1100 }
1101 }
1102
1103 *t = *f;
1104 t++;
1105 }
1106
1107 ml->n_mounts = t - ml->mounts;
1108}
1109
1110static void drop_outside_root(MountList *ml, const char *root_directory) {
1111 MountEntry *f, *t;
1112
1113 assert(ml);
1114
1115 /* Nothing to do */
1116 if (!root_directory)
1117 return;
1118
1119 /* Drops all mounts that are outside of the root directory. */
1120
1121 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
1122
1123 /* ExtensionImages/Directories bases are opened in /run/[user/xyz/]systemd/unit-extensions
1124 * on the host, and a private (invisible to the guest) tmpfs instance is mounted on
1125 * /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private /tmp and
1126 * /var/tmp. */
1127 if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) &&
1128 !path_startswith(mount_entry_path(f), root_directory)) {
1129 log_debug("%s is outside of root directory.", mount_entry_path(f));
1130 mount_entry_done(f);
1131 continue;
1132 }
1133
1134 *t = *f;
1135 t++;
1136 }
1137
1138 ml->n_mounts = t - ml->mounts;
1139}
1140
1141static int clone_device_node(const char *node, const char *temporary_mount, bool *make_devnode) {
1142 _cleanup_free_ char *sl = NULL;
1143 const char *dn, *bn;
1144 struct stat st;
1145 int r;
1146
1147 assert(node);
1148 assert(path_is_absolute(node));
1149 assert(temporary_mount);
1150 assert(make_devnode);
1151
1152 if (stat(node, &st) < 0) {
1153 if (errno == ENOENT) {
1154 log_debug_errno(errno, "Device node '%s' to clone does not exist.", node);
1155 return -ENXIO;
1156 }
1157
1158 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone: %m", node);
1159 }
1160
1161 r = stat_verify_device_node(&st);
1162 if (r < 0)
1163 return log_debug_errno(r, "Cannot clone device node '%s': %m", node);
1164
1165 dn = strjoina(temporary_mount, node);
1166
1167 /* First, try to create device node properly */
1168 if (*make_devnode) {
1169 mac_selinux_create_file_prepare(node, st.st_mode);
1170 r = mknod(dn, st.st_mode, st.st_rdev);
1171 mac_selinux_create_file_clear();
1172 if (r >= 0)
1173 goto add_symlink;
1174 if (errno != EPERM)
1175 return log_debug_errno(errno, "Failed to mknod '%s': %m", node);
1176
1177 /* This didn't work, let's not try this again for the next iterations. */
1178 *make_devnode = false;
1179 }
1180
1181 /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
1182 * Do not prepare device-node SELinux label (see issue 13762) */
1183 r = mknod(dn, S_IFREG, 0);
1184 if (r < 0 && errno != EEXIST)
1185 return log_debug_errno(errno, "Failed to mknod dummy device node for '%s': %m", node);
1186
1187 /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
1188 * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
1189 * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
1190 r = mount_nofollow_verbose(LOG_DEBUG, node, dn, NULL, MS_BIND, NULL);
1191 if (r < 0)
1192 return r;
1193
1194add_symlink:
1195 bn = path_startswith(node, "/dev/");
1196 if (!bn)
1197 return 0;
1198
1199 /* Create symlinks like /dev/char/1:9 → ../urandom */
1200 if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
1201 temporary_mount,
1202 S_ISCHR(st.st_mode) ? "char" : "block",
1203 DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
1204 return log_oom_debug();
1205
1206 (void) mkdir_parents(sl, 0755);
1207
1208 const char *t = strjoina("../", bn);
1209 if (symlink(t, sl) < 0)
1210 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
1211
1212 return 0;
1213}
1214
1215static int bind_mount_device_dir(const char *temporary_mount, const char *dir) {
1216 const char *t;
1217
1218 assert(temporary_mount);
1219 assert(dir);
1220 assert(path_is_absolute(dir));
1221
1222 t = strjoina(temporary_mount, dir);
1223
1224 (void) mkdir(t, 0755);
1225 return mount_nofollow_verbose(LOG_DEBUG, dir, t, NULL, MS_BIND, NULL);
1226}
1227
1228static char* settle_runtime_dir(RuntimeScope scope) {
1229 char *runtime_dir;
1230
1231 if (scope != RUNTIME_SCOPE_USER)
1232 return strdup("/run/");
1233
1234 if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0)
1235 return NULL;
1236
1237 return runtime_dir;
1238}
1239
1240static int create_temporary_mount_point(RuntimeScope scope, char **ret) {
1241 _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
1242
1243 assert(ret);
1244
1245 runtime_dir = settle_runtime_dir(scope);
1246 if (!runtime_dir)
1247 return log_oom_debug();
1248
1249 temporary_mount = path_join(runtime_dir, "systemd/namespace-XXXXXX");
1250 if (!temporary_mount)
1251 return log_oom_debug();
1252
1253 if (!mkdtemp(temporary_mount))
1254 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
1255
1256 *ret = TAKE_PTR(temporary_mount);
1257 return 0;
1258}
1259
1260static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) {
1261 static const char devnodes[] =
1262 "/dev/null\0"
1263 "/dev/zero\0"
1264 "/dev/full\0"
1265 "/dev/random\0"
1266 "/dev/urandom\0"
1267 "/dev/tty\0";
1268
1269 _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
1270 _cleanup_(umount_and_rmdir_and_freep) char *dev = NULL;
1271 bool can_mknod = true;
1272 int r;
1273
1274 assert(m);
1275 assert(p);
1276
1277 r = create_temporary_mount_point(p->runtime_scope, &temporary_mount);
1278 if (r < 0)
1279 return r;
1280
1281 dev = path_join(temporary_mount, "dev");
1282 if (!dev)
1283 return -ENOMEM;
1284
1285 (void) mkdir(dev, 0755);
1286 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
1287 if (r < 0)
1288 return r;
1289
1290 r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
1291 if (r < 0)
1292 return log_debug_errno(r, "Failed to fix label of '%s' as /dev/: %m", dev);
1293
1294 r = bind_mount_device_dir(temporary_mount, "/dev/pts");
1295 if (r < 0)
1296 return r;
1297
1298 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
1299 * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
1300 * Thus, in that case make a clone.
1301 * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
1302 r = is_symlink("/dev/ptmx");
1303 if (r < 0)
1304 return log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
1305 if (r > 0) {
1306 const char *devptmx = strjoina(temporary_mount, "/dev/ptmx");
1307 if (symlink("pts/ptmx", devptmx) < 0)
1308 return log_debug_errno(errno, "Failed to create symlink '%s' to pts/ptmx: %m", devptmx);
1309 } else {
1310 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
1311 if (r < 0)
1312 return r;
1313 }
1314
1315 r = bind_mount_device_dir(temporary_mount, "/dev/shm");
1316 if (r < 0)
1317 return r;
1318
1319 FOREACH_STRING(d, "/dev/mqueue", "/dev/hugepages")
1320 (void) bind_mount_device_dir(temporary_mount, d);
1321
1322 /* We assume /run/systemd/journal/ is available if not changing root, which isn't entirely accurate
1323 * but shouldn't matter, as either way the user would get ENOENT when accessing /dev/log */
1324 if ((!p->root_image && !p->root_directory) || p->bind_log_sockets) {
1325 const char *devlog = strjoina(temporary_mount, "/dev/log");
1326 if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
1327 log_debug_errno(errno,
1328 "Failed to create symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m",
1329 devlog);
1330 }
1331
1332 NULSTR_FOREACH(d, devnodes) {
1333 r = clone_device_node(d, temporary_mount, &can_mknod);
1334 /* ENXIO means the *source* is not a device file, skip creation in that case */
1335 if (r < 0 && r != -ENXIO)
1336 return r;
1337 }
1338
1339 r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
1340 if (r < 0)
1341 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
1342
1343 /* Create the /dev directory if missing. It is more likely to be missing when the service is started
1344 * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
1345 (void) mkdir_p_label(mount_entry_path(m), 0755);
1346
1347 /* Unmount everything in old /dev */
1348 r = umount_recursive(mount_entry_path(m), 0);
1349 if (r < 0)
1350 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
1351
1352 r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
1353 if (r < 0)
1354 return r;
1355 dev = rmdir_and_free(dev); /* Mount is successfully moved, do not umount() */
1356
1357 return 1;
1358}
1359
1360static int mount_bind_dev(const MountEntry *m) {
1361 int r;
1362
1363 assert(m);
1364
1365 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the
1366 * service's /dev. This is only used when RootDirectory= is set. */
1367
1368 (void) mkdir_p_label(mount_entry_path(m), 0755);
1369
1370 r = path_is_mount_point(mount_entry_path(m));
1371 if (r < 0)
1372 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
1373 if (r > 0) /* make this a NOP if /dev is already a mount point */
1374 return 0;
1375
1376 r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1377 if (r < 0)
1378 return r;
1379
1380 return 1;
1381}
1382
1383static int mount_bind_sysfs(const MountEntry *m) {
1384 int r;
1385
1386 assert(m);
1387
1388 (void) mkdir_p_label(mount_entry_path(m), 0755);
1389
1390 r = path_is_mount_point(mount_entry_path(m));
1391 if (r < 0)
1392 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
1393 if (r > 0) /* make this a NOP if /sys is already a mount point */
1394 return 0;
1395
1396 /* Bind mount the host's version so that we get all child mounts of it, too. */
1397 r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1398 if (r < 0)
1399 return r;
1400
1401 return 1;
1402}
1403
1404static int mount_private_apivfs(
1405 const char *fstype,
1406 const char *entry_path,
1407 const char *bind_source,
1408 const char *opts,
1409 RuntimeScope scope) {
1410
1411 _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
1412 int r;
1413
1414 assert(fstype);
1415 assert(entry_path);
1416 assert(bind_source);
1417
1418 (void) mkdir_p_label(entry_path, 0755);
1419
1420 /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance
1421 * cannot be mounted on an already existing mount. Let's use a temporary place. */
1422 r = create_temporary_mount_point(scope, &temporary_mount);
1423 if (r < 0)
1424 return r;
1425
1426 r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
1427 if (ERRNO_IS_NEG_PRIVILEGE(r)) {
1428 /* When we do not have enough privileges to mount a new instance, fall back to use an
1429 * existing mount. */
1430
1431 r = path_is_mount_point(entry_path);
1432 if (r < 0)
1433 return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path);
1434 if (r > 0)
1435 return 0; /* Use the current mount as is. */
1436
1437 /* We lack permissions to mount a new instance, and it is not already mounted. But we can
1438 * access the host's, so as a final fallback bind-mount it to the destination, as most likely
1439 * we are inside a user manager in an unprivileged user namespace. */
1440 r = mount_nofollow_verbose(LOG_DEBUG, bind_source, entry_path, /* fstype = */ NULL, MS_BIND|MS_REC, /* options = */ NULL);
1441 if (r < 0)
1442 return r;
1443
1444 return 1;
1445 }
1446 if (r < 0)
1447 return r;
1448
1449 /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
1450 r = umount_recursive(entry_path, /* flags = */ 0);
1451 if (r < 0)
1452 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path);
1453
1454 /* Then, move the new mount instance. */
1455 r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype = */ NULL, MS_MOVE, /* options = */ NULL);
1456 if (r < 0)
1457 return r;
1458
1459 /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
1460 * where a bunch of files are overmounted, in particular the boot id. */
1461 (void) bind_mount_submounts(bind_source, entry_path);
1462 return 1;
1463}
1464
1465static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) {
1466 assert(m);
1467 assert(p);
1468 return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
1469}
1470
1471static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
1472 assert(m);
1473 assert(p);
1474 return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
1475}
1476
1477static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
1478 _cleanup_free_ char *opts = NULL;
1479
1480 assert(m);
1481 assert(p);
1482
1483 if (p->protect_proc != PROTECT_PROC_DEFAULT ||
1484 p->proc_subset != PROC_SUBSET_ALL) {
1485
1486 /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
1487 * pretended to be per-instance but actually was per-namespace), hence let's make use of it
1488 * if requested. To make sure this logic succeeds only on kernels where hidepid= is
1489 * per-instance, we'll exclusively use the textual value for hidepid=, since support was
1490 * added in the same commit: if it's supported it is thus also per-instance. */
1491
1492 const char *hpv = p->protect_proc == PROTECT_PROC_DEFAULT ?
1493 "off" :
1494 protect_proc_to_string(p->protect_proc);
1495
1496 /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
1497 * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
1498 * trying to use hidepid= on systems where it isn't supported. The same applies for subset=.
1499 * fsopen()/fsconfig() was also backported on some distros which allows us to detect
1500 * hidepid=/subset= support in even more scenarios. */
1501
1502 if (mount_option_supported("proc", "hidepid", hpv) > 0) {
1503 opts = strjoin("hidepid=", hpv);
1504 if (!opts)
1505 return -ENOMEM;
1506 }
1507
1508 if (p->proc_subset == PROC_SUBSET_PID &&
1509 mount_option_supported("proc", "subset", "pid") > 0)
1510 if (!strextend_with_separator(&opts, ",", "subset=pid"))
1511 return -ENOMEM;
1512 }
1513
1514 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
1515 * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
1516 * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
1517 * mounted on /proc/ first. */
1518 return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts, p->runtime_scope);
1519}
1520
1521static int mount_tmpfs(const MountEntry *m) {
1522 const char *entry_path, *inner_path;
1523 int r;
1524
1525 assert(m);
1526
1527 entry_path = mount_entry_path(m);
1528 inner_path = mount_entry_unprefixed_path(m);
1529
1530 /* First, get rid of everything that is below if there is anything. Then, overmount with our new
1531 * tmpfs */
1532
1533 (void) mkdir_p_label(entry_path, 0755);
1534 (void) umount_recursive(entry_path, 0);
1535
1536 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
1537 if (r < 0)
1538 return r;
1539
1540 r = label_fix_full(AT_FDCWD, entry_path, inner_path, 0);
1541 if (r < 0)
1542 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
1543
1544 return 1;
1545}
1546
1547static int mount_run(const MountEntry *m) {
1548 int r;
1549
1550 assert(m);
1551
1552 r = path_is_mount_point(mount_entry_path(m));
1553 if (r < 0 && r != -ENOENT)
1554 return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
1555 if (r > 0) /* make this a NOP if /run is already a mount point */
1556 return 0;
1557
1558 return mount_tmpfs(m);
1559}
1560
1561static int mount_mqueuefs(const MountEntry *m) {
1562 int r;
1563 const char *entry_path;
1564
1565 assert(m);
1566
1567 entry_path = mount_entry_path(m);
1568
1569 (void) mkdir_p_label(entry_path, 0755);
1570 (void) umount_recursive(entry_path, 0);
1571
1572 r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", entry_path, "mqueue", m->flags, mount_entry_options(m));
1573 if (r < 0)
1574 return r;
1575
1576 return 1;
1577}
1578
1579static int mount_image(
1580 MountEntry *m,
1581 const char *root_directory,
1582 const ImagePolicy *image_policy) {
1583
1584 _cleanup_(extension_release_data_done) ExtensionReleaseData rdata = {};
1585 _cleanup_free_ char *extension_name = NULL;
1586 ImageClass required_class = _IMAGE_CLASS_INVALID;
1587 int r;
1588
1589 assert(m);
1590
1591 r = path_extract_filename(mount_entry_source(m), &extension_name);
1592 if (r < 0)
1593 return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
1594
1595 if (m->mode == MOUNT_EXTENSION_IMAGE) {
1596 r = parse_os_release(
1597 empty_to_root(root_directory),
1598 "ID", &rdata.os_release_id,
1599 "ID_LIKE", &rdata.os_release_id_like,
1600 "VERSION_ID", &rdata.os_release_version_id,
1601 image_class_info[IMAGE_SYSEXT].level_env, &rdata.os_release_sysext_level,
1602 image_class_info[IMAGE_CONFEXT].level_env, &rdata.os_release_confext_level,
1603 NULL);
1604 if (r < 0)
1605 return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1606 if (isempty(rdata.os_release_id))
1607 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory));
1608
1609 required_class = m->filter_class;
1610 }
1611
1612 r = verity_dissect_and_mount(
1613 /* src_fd= */ -EBADF,
1614 mount_entry_source(m),
1615 mount_entry_path(m),
1616 m->image_options_const,
1617 image_policy,
1618 /* image_filter= */ NULL,
1619 &rdata,
1620 required_class,
1621 &m->verity,
1622 /* ret_image= */ NULL);
1623 if (r == -ENOENT && m->ignore)
1624 return 0;
1625 if (r == -ESTALE && rdata.os_release_id)
1626 return log_error_errno(r, // FIXME: this should not be logged ad LOG_ERR, as it will result in duplicate logging.
1627 "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s ID_LIKE='%s'%s%s%s%s%s%s",
1628 mount_entry_source(m),
1629 rdata.os_release_id,
1630 strempty(rdata.os_release_id_like),
1631 rdata.os_release_version_id ? " VERSION_ID=" : "",
1632 strempty(rdata.os_release_version_id),
1633 rdata.os_release_sysext_level ? image_class_info[IMAGE_SYSEXT].level_env_print : "",
1634 strempty(rdata.os_release_sysext_level),
1635 rdata.os_release_confext_level ? image_class_info[IMAGE_CONFEXT].level_env_print : "",
1636 strempty(rdata.os_release_confext_level));
1637 if (r == -ENOCSI) {
1638 log_debug("Image %s does not match the expected class, ignoring", mount_entry_source(m));
1639 return 0; /* Nothing to do, wrong class */
1640 }
1641 if (r < 0)
1642 return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
1643
1644 return 1;
1645}
1646
1647static int mount_overlay(const MountEntry *m) {
1648 _cleanup_free_ char *options = NULL, *layers = NULL;
1649 int r;
1650
1651 assert(m);
1652
1653 /* Extension hierarchies are optional (e.g.: confext might not have /opt) so check if they actually
1654 * exist in an image before attempting to create an overlay with them, otherwise the mount will
1655 * fail. We can't check before this, as the images will not be mounted until now. */
1656
1657 /* Note that lowerdir= parameters are in 'reverse' order, so the top-most directory in the overlay
1658 * comes first in the list. */
1659 STRV_FOREACH_BACKWARDS(o, m->overlay_layers) {
1660 _cleanup_free_ char *escaped = NULL;
1661
1662 r = is_dir(*o, /* follow= */ false);
1663 if (r <= 0) {
1664 if (r != -ENOENT)
1665 log_debug_errno(r,
1666 "Failed to check whether overlay layer source path '%s' exists, ignoring: %m",
1667 *o);
1668 continue;
1669 }
1670
1671 escaped = shell_escape(*o, ",:");
1672 if (!escaped)
1673 return log_oom_debug();
1674
1675 if (!strextend_with_separator(&layers, ":", escaped))
1676 return log_oom_debug();
1677 }
1678
1679 if (!layers) {
1680 log_debug("None of the overlays specified in '%s' exist at the source, skipping.",
1681 mount_entry_options(m));
1682 return 0; /* Only the root is set? Then there's nothing to overlay */
1683 }
1684
1685 options = strjoin("lowerdir=", layers, ":", mount_entry_path(m)); /* The root goes in last */
1686 if (!options)
1687 return log_oom_debug();
1688
1689 (void) mkdir_p_label(mount_entry_path(m), 0755);
1690
1691 r = mount_nofollow_verbose(LOG_DEBUG, "systemd-extensions", mount_entry_path(m), "overlay", MS_RDONLY, options);
1692 if (r == -ENOENT && m->ignore)
1693 return 0;
1694 if (r < 0)
1695 return r;
1696
1697 return 1;
1698}
1699
1700static int follow_symlink(
1701 const char *root_directory,
1702 MountEntry *m) {
1703
1704 _cleanup_free_ char *target = NULL;
1705 int r;
1706
1707 assert(m);
1708
1709 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
1710 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
1711 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
1712 * end and already have a fully normalized name. */
1713
1714 r = chase(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
1715 if (r < 0)
1716 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
1717 if (r > 0) /* Reached the end, nothing more to resolve */
1718 return 1;
1719
1720 if (m->n_followed >= CHASE_MAX) /* put a boundary on things */
1721 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1722 "Symlink loop on '%s'.",
1723 mount_entry_path(m));
1724
1725 log_debug("Followed mount entry path symlink %s %s %s.",
1726 mount_entry_path(m), glyph(GLYPH_ARROW_RIGHT), target);
1727
1728 mount_entry_consume_prefix(m, TAKE_PTR(target));
1729
1730 m->n_followed++;
1731
1732 return 0;
1733}
1734
1735static int apply_one_mount(
1736 const char *root_directory,
1737 MountEntry *m,
1738 const NamespaceParameters *p) {
1739
1740 _cleanup_free_ char *inaccessible = NULL;
1741 bool rbind = true, make = false;
1742 const char *what;
1743 int r;
1744
1745 /* Return 1 when the mount should be post-processed (remounted r/o, etc.), 0 otherwise. In most
1746 * cases post-processing is the right thing, the typical exception is when the mount is gracefully
1747 * skipped. */
1748
1749 assert(m);
1750 assert(p);
1751
1752 log_debug("Applying namespace mount on %s", mount_entry_path(m));
1753
1754 switch (m->mode) {
1755
1756 case MOUNT_INACCESSIBLE: {
1757 _cleanup_free_ char *runtime_dir = NULL;
1758 struct stat target;
1759
1760 /* First, get rid of everything that is below if there
1761 * is anything... Then, overmount it with an
1762 * inaccessible path. */
1763 (void) umount_recursive(mount_entry_path(m), 0);
1764
1765 if (lstat(mount_entry_path(m), &target) < 0) {
1766 if (errno == ENOENT && m->ignore)
1767 return 0;
1768
1769 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
1770 mount_entry_path(m));
1771 }
1772
1773 /* We don't pass the literal runtime scope through here but one based purely on our UID. This
1774 * means that the root user's --user services will use the host's inaccessible inodes rather
1775 * then root's private ones. This is preferable since it means device nodes that are
1776 * overmounted to make them inaccessible will be overmounted with a device node, rather than
1777 * an AF_UNIX socket inode. */
1778 runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
1779 if (!runtime_dir)
1780 return log_oom_debug();
1781
1782 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
1783 if (r < 0)
1784 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1785 "File type not supported for inaccessible mounts. Note that symlinks are not allowed.");
1786 what = inaccessible;
1787 break;
1788 }
1789
1790 case MOUNT_READ_ONLY:
1791 case MOUNT_READ_WRITE:
1792 case MOUNT_READ_WRITE_IMPLICIT:
1793 case MOUNT_EXEC:
1794 case MOUNT_NOEXEC:
1795 r = path_is_mount_point_full(mount_entry_path(m), root_directory, /* flags = */ 0);
1796 if (r == -ENOENT && m->ignore)
1797 return 0;
1798 if (r < 0)
1799 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
1800 mount_entry_path(m));
1801 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
1802 * and MS_NOEXEC bits for the mount point if needed. */
1803 return 1;
1804 /* This isn't a mount point yet, let's make it one. */
1805 what = mount_entry_path(m);
1806 break;
1807
1808 case MOUNT_EXTENSION_DIRECTORY: {
1809 _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_id_like = NULL,
1810 *host_os_release_version_id = NULL, *host_os_release_level = NULL,
1811 *extension_name = NULL;
1812 _cleanup_strv_free_ char **extension_release = NULL;
1813 ImageClass class = IMAGE_SYSEXT;
1814
1815 r = path_extract_filename(mount_entry_source(m), &extension_name);
1816 if (r < 0)
1817 return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
1818
1819 r = load_extension_release_pairs(
1820 mount_entry_source(m),
1821 m->filter_class >= 0 ? m->filter_class : IMAGE_SYSEXT,
1822 extension_name,
1823 /* relax_extension_release_check= */ false,
1824 &extension_release);
1825 if (r == -ENOENT) {
1826 if (m->filter_class >= 0)
1827 return 0; /* Nothing to do, wrong class */
1828
1829 r = load_extension_release_pairs(
1830 mount_entry_source(m),
1831 IMAGE_CONFEXT,
1832 extension_name,
1833 /* relax_extension_release_check= */ false,
1834 &extension_release);
1835 if (r >= 0)
1836 class = IMAGE_CONFEXT;
1837 }
1838 if (r == -ENOENT && m->ignore)
1839 return 0;
1840 if (r < 0)
1841 return log_debug_errno(r, "Failed to acquire 'extension-release' data of extension tree %s: %m", mount_entry_source(m));
1842
1843 r = parse_os_release(
1844 empty_to_root(root_directory),
1845 "ID", &host_os_release_id,
1846 "ID_LIKE", &host_os_release_id_like,
1847 "VERSION_ID", &host_os_release_version_id,
1848 image_class_info[class].level_env, &host_os_release_level,
1849 NULL);
1850 if (r < 0)
1851 return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1852 if (isempty(host_os_release_id))
1853 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory));
1854
1855 r = extension_release_validate(
1856 extension_name,
1857 host_os_release_id,
1858 host_os_release_id_like,
1859 host_os_release_version_id,
1860 host_os_release_level,
1861 /* host_extension_scope = */ NULL, /* Leave empty, we need to accept both system and portable */
1862 extension_release,
1863 class);
1864 if (r < 0)
1865 return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
1866 if (r == 0)
1867 return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's.", extension_name);
1868
1869 _fallthrough_;
1870 }
1871
1872 case MOUNT_BIND:
1873 rbind = false;
1874
1875 _fallthrough_;
1876 case MOUNT_BIND_RECURSIVE: {
1877 _cleanup_free_ char *chased = NULL;
1878
1879 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
1880 * that bind mount source paths are always relative to the host root, hence we pass NULL as
1881 * root directory to chase() here. */
1882
1883 /* When we create implicit mounts, we might need to create the path ourselves as it is on a
1884 * just-created tmpfs, for example. */
1885 if (m->create_source_dir) {
1886 r = mkdir_p(mount_entry_source(m), m->source_dir_mode);
1887 if (r < 0)
1888 return log_debug_errno(r, "Failed to create source directory %s: %m", mount_entry_source(m));
1889
1890 r = label_fix_full(AT_FDCWD, mount_entry_source(m), mount_entry_unprefixed_path(m), /* flags= */ 0);
1891 if (r < 0)
1892 return log_error_errno(r, "Failed to set label of the source directory %s: %m", mount_entry_source(m));
1893 }
1894
1895 r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
1896 if (r == -ENOENT && m->ignore) {
1897 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
1898 return 0;
1899 }
1900 if (r < 0)
1901 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
1902
1903 log_debug("Followed source symlinks %s %s %s.",
1904 mount_entry_source(m), glyph(GLYPH_ARROW_RIGHT), chased);
1905
1906 free_and_replace(m->source_malloc, chased);
1907
1908 what = mount_entry_source(m);
1909 make = true;
1910 break;
1911 }
1912
1913 case MOUNT_EMPTY_DIR:
1914 case MOUNT_PRIVATE_TMPFS:
1915 case MOUNT_TMPFS:
1916 return mount_tmpfs(m);
1917
1918 case MOUNT_PRIVATE_TMP:
1919 what = mount_entry_source(m);
1920 make = true;
1921 break;
1922
1923 case MOUNT_PRIVATE_DEV:
1924 return mount_private_dev(m, p);
1925
1926 case MOUNT_BIND_DEV:
1927 return mount_bind_dev(m);
1928
1929 case MOUNT_PRIVATE_SYSFS:
1930 return mount_private_sysfs(m, p);
1931
1932 case MOUNT_BIND_SYSFS:
1933 return mount_bind_sysfs(m);
1934
1935 case MOUNT_PROCFS:
1936 return mount_procfs(m, p);
1937
1938 case MOUNT_PRIVATE_CGROUP2FS:
1939 return mount_private_cgroup2fs(m, p);
1940
1941 case MOUNT_RUN:
1942 return mount_run(m);
1943
1944 case MOUNT_MQUEUEFS:
1945 return mount_mqueuefs(m);
1946
1947 case MOUNT_IMAGE:
1948 return mount_image(m, NULL, p->mount_image_policy);
1949
1950 case MOUNT_EXTENSION_IMAGE:
1951 return mount_image(m, root_directory, p->extension_image_policy);
1952
1953 case MOUNT_OVERLAY:
1954 return mount_overlay(m);
1955
1956 default:
1957 assert_not_reached();
1958 }
1959
1960 assert(what);
1961
1962 r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1963 if (r < 0) {
1964 bool try_again = false;
1965
1966 if (r == -ENOENT && make) {
1967 int q;
1968
1969 /* Hmm, either the source or the destination are missing. Let's see if we can create
1970 the destination, then try again. */
1971
1972 (void) mkdir_parents(mount_entry_path(m), 0755);
1973
1974 q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
1975 if (q < 0 && q != -EEXIST)
1976 // FIXME: this shouldn't be logged at LOG_WARNING, but be bubbled up, and logged there to avoid duplicate logging
1977 log_warning_errno(q, "Failed to create destination mount point node '%s', ignoring: %m",
1978 mount_entry_path(m));
1979 else
1980 try_again = true;
1981 }
1982
1983 if (try_again)
1984 r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1985 if (r < 0)
1986 return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); // FIXME: this should not be logged here, but be bubbled up, to avoid duplicate logging
1987 }
1988
1989 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1990
1991 /* Take care of id-mapped mounts */
1992 if (m->idmapped && uid_is_valid(m->idmap_uid) && gid_is_valid(m->idmap_gid)) {
1993 _cleanup_close_ int userns_fd = -EBADF;
1994 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1995
1996 log_debug("Setting an id-mapped mount on %s", mount_entry_path(m));
1997
1998 /* Do mapping from nobody (in setup_exec_directory()) -> this uid */
1999 if (strextendf(&uid_map, UID_FMT " " UID_FMT " 1\n", UID_NOBODY, m->idmap_uid) < 0)
2000 return log_oom();
2001
2002 /* Consider StateDirectory=xxx aaa xxx:aaa/222
2003 * To allow for later symlink creation (by root) in create_symlinks_from_tuples(), map root as well. */
2004 if (m->idmap_uid != 0)
2005 if (!strextend(&uid_map, "0 0 1\n"))
2006 return log_oom();
2007
2008 if (strextendf(&gid_map, GID_FMT " " GID_FMT " 1\n", GID_NOBODY, m->idmap_gid) < 0)
2009 return log_oom();
2010
2011 if (m->idmap_gid != 0)
2012 if (!strextend(&gid_map, "0 0 1\n"))
2013 return log_oom();
2014
2015 userns_fd = userns_acquire(uid_map, gid_map, /* setgroups_deny= */ true);
2016 if (userns_fd < 0)
2017 return log_error_errno(userns_fd, "Failed to allocate user namespace: %m");
2018
2019 /* Drop SUID, add NOEXEC for the mount to avoid root exploits */
2020 r = remount_idmap_fd(STRV_MAKE(mount_entry_path(m)), userns_fd, MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NODEV);
2021 if (r < 0)
2022 return log_error_errno(r, "Failed to create an id-mapped mount: %m");
2023
2024 log_debug("ID-mapped mount created successfully for %s from " UID_FMT " to " UID_FMT "", mount_entry_path(m), UID_NOBODY, m->idmap_uid);
2025 }
2026
2027 return 1;
2028}
2029
2030static bool should_propagate_to_submounts(const MountEntry *m) {
2031 assert(m);
2032 return !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS);
2033}
2034
2035static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
2036 unsigned long new_flags = 0, flags_mask = 0;
2037 bool submounts;
2038 int r;
2039
2040 assert(m);
2041 assert(proc_self_mountinfo);
2042
2043 if (m->state != MOUNT_APPLIED)
2044 return 0;
2045
2046 if (mount_entry_read_only(m) || m->mode == MOUNT_PRIVATE_DEV) {
2047 new_flags |= MS_RDONLY;
2048 flags_mask |= MS_RDONLY;
2049 }
2050
2051 if (m->nosuid) {
2052 new_flags |= MS_NOSUID;
2053 flags_mask |= MS_NOSUID;
2054 }
2055
2056 if (flags_mask == 0) /* No Change? */
2057 return 0;
2058
2059 /* We generally apply these changes recursively, except for /dev, and the cases we know there's
2060 * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
2061 * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
2062 * and running Linux <= 4.17. */
2063 submounts = mount_entry_read_only(m) && should_propagate_to_submounts(m);
2064 if (submounts)
2065 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
2066 else
2067 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
2068
2069 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
2070 * read-only already stays this way. This improves compatibility with container managers, where we
2071 * won't attempt to undo read-only mounts already applied. */
2072
2073 if (r == -ENOENT && m->ignore)
2074 return 0;
2075 if (r < 0)
2076 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
2077 submounts ? " and its submounts" : "");
2078 return 0;
2079}
2080
2081static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
2082 unsigned long new_flags = 0, flags_mask = 0;
2083 bool submounts;
2084 int r;
2085
2086 assert(m);
2087 assert(proc_self_mountinfo);
2088
2089 if (m->state != MOUNT_APPLIED)
2090 return 0;
2091
2092 if (mount_entry_noexec(m)) {
2093 new_flags |= MS_NOEXEC;
2094 flags_mask |= MS_NOEXEC;
2095 } else if (mount_entry_exec(m)) {
2096 new_flags &= ~MS_NOEXEC;
2097 flags_mask |= MS_NOEXEC;
2098 }
2099
2100 if (flags_mask == 0) /* No Change? */
2101 return 0;
2102
2103 submounts = should_propagate_to_submounts(m);
2104 if (submounts)
2105 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
2106 else
2107 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
2108
2109 if (r == -ENOENT && m->ignore)
2110 return 0;
2111 if (r < 0)
2112 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
2113 submounts ? " and its submounts" : "");
2114 return 0;
2115}
2116
2117static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
2118 bool submounts;
2119 int r;
2120
2121 assert(m);
2122 assert(proc_self_mountinfo);
2123
2124 if (m->state != MOUNT_APPLIED)
2125 return 0;
2126
2127 submounts = should_propagate_to_submounts(m);
2128 if (submounts)
2129 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo);
2130 else
2131 r = bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
2132 if (r == -ENOENT && m->ignore)
2133 return 0;
2134 if (r < 0)
2135 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
2136 submounts ? " and its submounts" : "");
2137 return 0;
2138}
2139
2140static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
2141 assert(p);
2142
2143 /*
2144 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
2145 * since to protect the API VFS mounts, they need to be around in the
2146 * first place...
2147 */
2148
2149 return p->mount_apivfs ||
2150 p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
2151 p->protect_kernel_tunables ||
2152 p->protect_proc != PROTECT_PROC_DEFAULT ||
2153 p->proc_subset != PROC_SUBSET_ALL ||
2154 p->private_pids != PRIVATE_PIDS_NO;
2155}
2156
2157/* Walk all mount entries and dropping any unused mounts. This affects all
2158 * mounts:
2159 * - that are implicitly protected by a path that has been rendered inaccessible
2160 * - whose immediate parent requests the same protection mode as the mount itself
2161 * - that are outside of the relevant root directory
2162 * - which are duplicates
2163 */
2164static void sort_and_drop_unused_mounts(MountList *ml, const char *root_directory) {
2165 assert(ml);
2166 assert(root_directory);
2167
2168 assert(ml->mounts || ml->n_mounts == 0);
2169
2170 typesafe_qsort(ml->mounts, ml->n_mounts, mount_path_compare);
2171
2172 drop_duplicates(ml);
2173 drop_outside_root(ml, root_directory);
2174 drop_inaccessible(ml);
2175 drop_nop(ml);
2176}
2177
2178static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
2179 int r;
2180
2181 STRV_FOREACH_PAIR(src, dst, strv_symlinks) {
2182 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2183
2184 src_abs = path_join(root, *src);
2185 dst_abs = path_join(root, *dst);
2186 if (!src_abs || !dst_abs)
2187 return -ENOMEM;
2188
2189 r = mkdir_parents_label(dst_abs, 0755);
2190 if (r < 0)
2191 return log_debug_errno(
2192 r,
2193 "Failed to create parent directory for symlink '%s': %m",
2194 dst_abs);
2195
2196 r = symlink_idempotent(src_abs, dst_abs, true);
2197 if (r < 0)
2198 return log_debug_errno(
2199 r,
2200 "Failed to create symlink from '%s' to '%s': %m",
2201 src_abs,
2202 dst_abs);
2203 }
2204
2205 return 0;
2206}
2207
2208static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **ret_path) {
2209 assert(m);
2210
2211 /* Create a string suitable for debugging logs, stripping for example the local working directory.
2212 * For example, with a BindPaths=/var/bar that does not exist on the host:
2213 *
2214 * Before:
2215 * foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
2216 * After:
2217 * foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
2218 *
2219 * Note that this is an error path, so no OOM check is done on purpose. */
2220
2221 if (!ret_path)
2222 return;
2223
2224 if (!mount_entry_path(m)) {
2225 *ret_path = NULL;
2226 return;
2227 }
2228
2229 if (root) {
2230 const char *e = startswith(mount_entry_path(m), root);
2231 if (e) {
2232 *ret_path = strdup(e);
2233 return;
2234 }
2235 }
2236
2237 *ret_path = strdup(mount_entry_path(m));
2238 return;
2239}
2240
2241static int apply_mounts(
2242 MountList *ml,
2243 const char *root,
2244 const NamespaceParameters *p,
2245 char **reterr_path) {
2246
2247 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
2248 _cleanup_free_ char **deny_list = NULL;
2249 int r;
2250
2251 assert(ml);
2252 assert(root);
2253 assert(p);
2254
2255 if (ml->n_mounts == 0) /* Shortcut: nothing to do */
2256 return 0;
2257
2258 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
2259 * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
2260 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
2261 if (!proc_self_mountinfo) {
2262 r = -errno;
2263
2264 if (reterr_path)
2265 *reterr_path = strdup("/proc/self/mountinfo");
2266
2267 return log_debug_errno(r, "Failed to open %s: %m", "/proc/self/mountinfo");
2268 }
2269
2270 /* First round, establish all mounts we need */
2271 for (;;) {
2272 bool again = false;
2273
2274 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
2275
2276 if (m->state != MOUNT_PENDING)
2277 continue;
2278
2279 /* ExtensionImages/Directories are first opened in the propagate directory, not in
2280 * the root_directory. A private (invisible to the guest) tmpfs instance is mounted
2281 * on /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private
2282 * /tmp and /var/tmp. */
2283 r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) ? root : NULL, m);
2284 if (r < 0) {
2285 mount_entry_path_debug_string(root, m, reterr_path);
2286 return r;
2287 }
2288 if (r == 0) {
2289 /* We hit a symlinked mount point. The entry got rewritten and might
2290 * point to a very different place now. Let's normalize the changed
2291 * list, and start from the beginning. After all to mount the entry
2292 * at the new location we might need some other mounts first */
2293 again = true;
2294 break;
2295 }
2296
2297 /* Returns 1 if the mount should be post-processed, 0 otherwise */
2298 r = apply_one_mount(root, m, p);
2299 if (r < 0) {
2300 mount_entry_path_debug_string(root, m, reterr_path);
2301 return r;
2302 }
2303 m->state = r == 0 ? MOUNT_SKIPPED : MOUNT_APPLIED;
2304 }
2305
2306 if (!again)
2307 break;
2308
2309 sort_and_drop_unused_mounts(ml, root);
2310 }
2311
2312 /* Now that all filesystems have been set up, but before the
2313 * read-only switches are flipped, create the exec dirs and other symlinks.
2314 * Note that when /var/lib is not empty/tmpfs, these symlinks will already
2315 * exist, which means this will be a no-op. */
2316 r = create_symlinks_from_tuples(root, p->symlinks);
2317 if (r < 0)
2318 return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
2319
2320 /* Create a deny list we can pass to bind_mount_recursive() */
2321 deny_list = new(char*, ml->n_mounts+1);
2322 if (!deny_list)
2323 return -ENOMEM;
2324 for (size_t j = 0; j < ml->n_mounts; j++)
2325 deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
2326 deny_list[ml->n_mounts] = NULL;
2327
2328 /* Second round, flip the ro bits if necessary. */
2329 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
2330 r = make_read_only(m, deny_list, proc_self_mountinfo);
2331 if (r < 0) {
2332 mount_entry_path_debug_string(root, m, reterr_path);
2333 return r;
2334 }
2335 }
2336
2337 /* Third round, flip the noexec bits with a simplified deny list. */
2338 for (size_t j = 0; j < ml->n_mounts; j++)
2339 if (IN_SET((ml->mounts+j)->mode, MOUNT_EXEC, MOUNT_NOEXEC))
2340 deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
2341 deny_list[ml->n_mounts] = NULL;
2342
2343 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
2344 r = make_noexec(m, deny_list, proc_self_mountinfo);
2345 if (r < 0) {
2346 mount_entry_path_debug_string(root, m, reterr_path);
2347 return r;
2348 }
2349 }
2350
2351 /* Fourth round, flip the nosuid bits without a deny list. */
2352 if (p->mount_nosuid)
2353 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
2354 r = make_nosuid(m, proc_self_mountinfo);
2355 if (r < 0) {
2356 mount_entry_path_debug_string(root, m, reterr_path);
2357 return r;
2358 }
2359 }
2360
2361 return 1;
2362}
2363
2364static bool root_read_only(
2365 char **read_only_paths,
2366 ProtectSystem protect_system) {
2367
2368 /* Determine whether the root directory is going to be read-only given the configured settings. */
2369
2370 if (protect_system == PROTECT_SYSTEM_STRICT)
2371 return true;
2372
2373 if (prefixed_path_strv_contains(read_only_paths, "/"))
2374 return true;
2375
2376 return false;
2377}
2378
2379static bool home_read_only(
2380 char * const *read_only_paths,
2381 char * const *inaccessible_paths,
2382 char * const *empty_directories,
2383 const BindMount *bind_mounts,
2384 size_t n_bind_mounts,
2385 const TemporaryFileSystem *temporary_filesystems,
2386 size_t n_temporary_filesystems,
2387 ProtectHome protect_home) {
2388
2389 /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
2390 * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
2391 * settings. */
2392
2393 if (protect_home != PROTECT_HOME_NO)
2394 return true;
2395
2396 if (prefixed_path_strv_contains(read_only_paths, "/home") ||
2397 prefixed_path_strv_contains(inaccessible_paths, "/home") ||
2398 prefixed_path_strv_contains(empty_directories, "/home"))
2399 return true;
2400
2401 FOREACH_ARRAY(i, temporary_filesystems, n_temporary_filesystems)
2402 if (path_equal(i->path, "/home"))
2403 return true;
2404
2405 /* If /home is overmounted with some dir from the host it's not writable. */
2406 FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
2407 if (path_equal(i->destination, "/home"))
2408 return true;
2409
2410 return false;
2411}
2412
2413int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
2414
2415 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
2416 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
2417 _cleanup_strv_free_ char **hierarchies = NULL;
2418 _cleanup_(mount_list_done) MountList ml = {};
2419 _cleanup_close_ int userns_fd = -EBADF;
2420 bool require_prefix = false;
2421 const char *root;
2422 DissectImageFlags dissect_image_flags =
2423 DISSECT_IMAGE_GENERIC_ROOT |
2424 DISSECT_IMAGE_REQUIRE_ROOT |
2425 DISSECT_IMAGE_DISCARD_ON_LOOP |
2426 DISSECT_IMAGE_RELAX_VAR_CHECK |
2427 DISSECT_IMAGE_FSCK |
2428 DISSECT_IMAGE_USR_NO_ROOT |
2429 DISSECT_IMAGE_GROWFS |
2430 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
2431 DISSECT_IMAGE_PIN_PARTITION_DEVICES |
2432 DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
2433 int r;
2434
2435 assert(p);
2436
2437 /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
2438 * we configure take effect */
2439 BLOCK_WITH_UMASK(0000);
2440
2441 bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
2442 unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;
2443
2444 if (p->root_image) {
2445 /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
2446 if (root_read_only(p->read_only_paths,
2447 p->protect_system) &&
2448 home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories,
2449 p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems,
2450 p->protect_home) &&
2451 strv_isempty(p->read_write_paths))
2452 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
2453
2454 SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
2455
2456 if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2457 /* In system mode we mount directly */
2458
2459 r = loop_device_make_by_path(
2460 p->root_image,
2461 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
2462 /* sector_size= */ UINT32_MAX,
2463 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
2464 LOCK_SH,
2465 &loop_device);
2466 if (r < 0)
2467 return log_debug_errno(r, "Failed to create loop device for root image: %m");
2468
2469 r = dissect_loop_device(
2470 loop_device,
2471 p->verity,
2472 p->root_image_options,
2473 p->root_image_policy,
2474 /* image_filter= */ NULL,
2475 dissect_image_flags,
2476 &dissected_image);
2477 if (r < 0)
2478 return log_debug_errno(r, "Failed to dissect image: %m");
2479
2480 r = dissected_image_load_verity_sig_partition(
2481 dissected_image,
2482 loop_device->fd,
2483 p->verity);
2484 if (r < 0)
2485 return r;
2486
2487 r = dissected_image_guess_verity_roothash(
2488 dissected_image,
2489 p->verity);
2490 if (r < 0)
2491 return r;
2492
2493 r = dissected_image_decrypt(
2494 dissected_image,
2495 NULL,
2496 p->verity,
2497 dissect_image_flags);
2498 if (r < 0)
2499 return log_debug_errno(r, "Failed to decrypt dissected image: %m");
2500 } else {
2501 userns_fd = namespace_open_by_type(NAMESPACE_USER);
2502 if (userns_fd < 0)
2503 return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
2504
2505 r = mountfsd_mount_image(
2506 p->root_image,
2507 userns_fd,
2508 p->root_image_policy,
2509 dissect_image_flags,
2510 &dissected_image);
2511 if (r < 0)
2512 return r;
2513 }
2514 }
2515
2516 if (p->root_directory)
2517 root = p->root_directory;
2518 else {
2519 /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
2520 * when running tests (test-execute), it might not have been created yet so let's make sure
2521 * we create it if it doesn't already exist. */
2522 (void) mkdir_p_label("/run/systemd", 0755);
2523
2524 /* Always create the mount namespace in a temporary directory, instead of operating directly
2525 * in the root. The temporary directory prevents any mounts from being potentially obscured
2526 * by other mounts we already applied. We use the same mount point for all images, which is
2527 * safe, since they all live in their own namespaces after all, and hence won't see each
2528 * other. (Note: this directory is also created by PID 1 early on, we create it here for
2529 * similar reasons as /run/systemd/ first.) */
2530 root = "/run/systemd/mount-rootfs";
2531 (void) mkdir_label(root, 0555);
2532
2533 require_prefix = true;
2534 }
2535
2536 if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
2537 /* Hierarchy population needs to be done for sysext and confext extension images */
2538 r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES");
2539 if (r < 0)
2540 return r;
2541 }
2542
2543 r = append_access_mounts(&ml, p->read_write_paths, MOUNT_READ_WRITE, require_prefix);
2544 if (r < 0)
2545 return r;
2546
2547 r = append_access_mounts(&ml, p->read_only_paths, MOUNT_READ_ONLY, require_prefix);
2548 if (r < 0)
2549 return r;
2550
2551 r = append_access_mounts(&ml, p->inaccessible_paths, MOUNT_INACCESSIBLE, require_prefix);
2552 if (r < 0)
2553 return r;
2554
2555 r = append_access_mounts(&ml, p->exec_paths, MOUNT_EXEC, require_prefix);
2556 if (r < 0)
2557 return r;
2558
2559 r = append_access_mounts(&ml, p->no_exec_paths, MOUNT_NOEXEC, require_prefix);
2560 if (r < 0)
2561 return r;
2562
2563 r = append_empty_dir_mounts(&ml, p->empty_directories);
2564 if (r < 0)
2565 return r;
2566
2567 r = append_bind_mounts(&ml, p->bind_mounts, p->n_bind_mounts);
2568 if (r < 0)
2569 return r;
2570
2571 r = append_tmpfs_mounts(&ml, p->temporary_filesystems, p->n_temporary_filesystems);
2572 if (r < 0)
2573 return r;
2574
2575 r = append_private_tmp(&ml, p);
2576 if (r < 0)
2577 return r;
2578
2579 r = append_mount_images(&ml, p->mount_images, p->n_mount_images);
2580 if (r < 0)
2581 return r;
2582
2583 r = append_extensions(&ml, root, p->private_namespace_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
2584 if (r < 0)
2585 return r;
2586
2587 if (p->private_dev) {
2588 MountEntry *me = mount_list_extend(&ml);
2589 if (!me)
2590 return log_oom_debug();
2591
2592 *me = (MountEntry) {
2593 .path_const = "/dev",
2594 .mode = MOUNT_PRIVATE_DEV,
2595 .flags = DEV_MOUNT_OPTIONS,
2596 };
2597 }
2598
2599 /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective
2600 mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so
2601 let's try the mounts but it's not fatal if they don't succeed. */
2602 bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID;
2603 if (p->protect_kernel_tunables) {
2604 r = append_static_mounts(&ml,
2605 protect_kernel_tunables_proc_table,
2606 ELEMENTSOF(protect_kernel_tunables_proc_table),
2607 ignore_protect_proc);
2608 if (r < 0)
2609 return r;
2610
2611 r = append_static_mounts(&ml,
2612 protect_kernel_tunables_sys_table,
2613 ELEMENTSOF(protect_kernel_tunables_sys_table),
2614 p->ignore_protect_paths);
2615 if (r < 0)
2616 return r;
2617 }
2618
2619 if (p->protect_kernel_modules) {
2620 r = append_static_mounts(&ml,
2621 protect_kernel_modules_table,
2622 ELEMENTSOF(protect_kernel_modules_table),
2623 p->ignore_protect_paths);
2624 if (r < 0)
2625 return r;
2626 }
2627
2628 if (p->protect_kernel_logs) {
2629 r = append_static_mounts(&ml,
2630 protect_kernel_logs_proc_table,
2631 ELEMENTSOF(protect_kernel_logs_proc_table),
2632 ignore_protect_proc);
2633 if (r < 0)
2634 return r;
2635
2636 r = append_static_mounts(&ml,
2637 protect_kernel_logs_dev_table,
2638 ELEMENTSOF(protect_kernel_logs_dev_table),
2639 p->ignore_protect_paths);
2640 if (r < 0)
2641 return r;
2642 }
2643
2644 r = append_protect_control_groups(&ml, p->protect_control_groups, false);
2645 if (r < 0)
2646 return r;
2647
2648 r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths);
2649 if (r < 0)
2650 return r;
2651
2652 r = append_protect_system(&ml, p->protect_system, false);
2653 if (r < 0)
2654 return r;
2655
2656 if (namespace_parameters_mount_apivfs(p)) {
2657 r = append_static_mounts(&ml,
2658 apivfs_table,
2659 ELEMENTSOF(apivfs_table),
2660 p->ignore_protect_paths);
2661 if (r < 0)
2662 return r;
2663 }
2664
2665 /* Only mount /proc/sys/kernel/hostname and domainname read-only if ProtectHostname=yes. Otherwise,
2666 * ProtectHostname=no allows changing hostname for the host, and ProtectHostname=private allows
2667 * changing the hostname in the unit's UTS namespace. Note, if proc is mounted with subset=pid then
2668 * neither of the two paths will exist, i.e. they are implicitly protected by the mount option. */
2669 if (p->protect_hostname == PROTECT_HOSTNAME_YES) {
2670 r = append_static_mounts(
2671 &ml,
2672 protect_hostname_yes_table,
2673 ELEMENTSOF(protect_hostname_yes_table),
2674 ignore_protect_proc);
2675 if (r < 0)
2676 return r;
2677 }
2678
2679 if (p->private_network) {
2680 MountEntry *me = mount_list_extend(&ml);
2681 if (!me)
2682 return log_oom_debug();
2683
2684 *me = (MountEntry) {
2685 .path_const = "/sys",
2686 .mode = MOUNT_PRIVATE_SYSFS,
2687 };
2688 }
2689
2690 if (p->private_ipc) {
2691 MountEntry *me = mount_list_extend(&ml);
2692 if (!me)
2693 return log_oom_debug();
2694
2695 *me = (MountEntry) {
2696 .path_const = "/dev/mqueue",
2697 .mode = MOUNT_MQUEUEFS,
2698 .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2699 };
2700 }
2701
2702 if (p->creds_path) {
2703 /* If our service has a credentials store configured, then bind that one in, but hide
2704 * everything else. */
2705
2706 MountEntry *me = mount_list_extend(&ml);
2707 if (!me)
2708 return log_oom_debug();
2709
2710 *me = (MountEntry) {
2711 .path_const = "/run/credentials",
2712 .mode = MOUNT_TMPFS,
2713 .read_only = true,
2714 .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
2715 .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
2716 };
2717
2718 me = mount_list_extend(&ml);
2719 if (!me)
2720 return log_oom_debug();
2721
2722 *me = (MountEntry) {
2723 .path_const = p->creds_path,
2724 .mode = MOUNT_BIND,
2725 .read_only = true,
2726 .source_const = p->creds_path,
2727 .ignore = true,
2728 };
2729 } else {
2730 /* If our service has no credentials store configured, then make the whole credentials tree
2731 * inaccessible wholesale. */
2732
2733 MountEntry *me = mount_list_extend(&ml);
2734 if (!me)
2735 return log_oom_debug();
2736
2737 *me = (MountEntry) {
2738 .path_const = "/run/credentials",
2739 .mode = MOUNT_INACCESSIBLE,
2740 .ignore = true,
2741 };
2742 }
2743
2744 if (p->log_namespace) {
2745 _cleanup_free_ char *q = NULL;
2746
2747 q = strjoin("/run/systemd/journal.", p->log_namespace);
2748 if (!q)
2749 return log_oom_debug();
2750
2751 MountEntry *me = mount_list_extend(&ml);
2752 if (!me)
2753 return log_oom_debug();
2754
2755 *me = (MountEntry) {
2756 .path_const = "/run/systemd/journal",
2757 .mode = MOUNT_BIND_RECURSIVE,
2758 .read_only = true,
2759 .source_malloc = TAKE_PTR(q),
2760 };
2761
2762 } else if (p->bind_log_sockets) {
2763 r = append_bind_mounts(&ml, bind_log_sockets_table, ELEMENTSOF(bind_log_sockets_table));
2764 if (r < 0)
2765 return r;
2766 }
2767
2768 /* Will be used to add bind mounts at runtime */
2769 if (setup_propagate) {
2770 MountEntry *me = mount_list_extend(&ml);
2771 if (!me)
2772 return log_oom_debug();
2773
2774 *me = (MountEntry) {
2775 .source_const = p->propagate_dir,
2776 .path_const = p->incoming_dir,
2777 .mode = MOUNT_BIND,
2778 .read_only = true,
2779 };
2780 }
2781
2782 if (p->notify_socket_path) {
2783 MountEntry *me = mount_list_extend(&ml);
2784 if (!me)
2785 return log_oom_debug();
2786
2787 *me = (MountEntry) {
2788 .path_const = p->notify_socket_path,
2789 .source_const = p->host_notify_socket,
2790 .mode = MOUNT_BIND,
2791 .read_only = true,
2792 };
2793 }
2794
2795 if (p->host_os_release_stage) {
2796 MountEntry *me = mount_list_extend(&ml);
2797 if (!me)
2798 return log_oom_debug();
2799
2800 *me = (MountEntry) {
2801 .path_const = "/run/host/.os-release-stage/",
2802 .source_const = p->host_os_release_stage,
2803 .mode = MOUNT_BIND,
2804 .read_only = true,
2805 .ignore = true, /* Live copy, don't hard-fail if it goes missing */
2806 };
2807 }
2808
2809 /* Prepend the root directory where that's necessary */
2810 r = prefix_where_needed(&ml, root);
2811 if (r < 0)
2812 return r;
2813
2814 sort_and_drop_unused_mounts(&ml, root);
2815
2816 /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
2817
2818 if (unshare(CLONE_NEWNS) < 0) {
2819 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
2820
2821 if (ERRNO_IS_PRIVILEGE(r) ||
2822 ERRNO_IS_NOT_SUPPORTED(r))
2823 /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
2824 * in place that doesn't allow us to create namespaces (or a missing cap), then
2825 * propagate a recognizable error back, which the caller can use to detect this case
2826 * (and only this) and optionally continue without namespacing applied. */
2827 return -ENOANO;
2828
2829 return r;
2830 }
2831
2832 /* Create the source directory to allow runtime propagation of mounts */
2833 if (setup_propagate)
2834 (void) mkdir_p(p->propagate_dir, 0600);
2835
2836 if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
2837 /* ExtensionImages/Directories mountpoint directories will be created while parsing the
2838 * mounts to create, so have the parent ready */
2839 char *extension_dir = strjoina(p->private_namespace_dir, "/unit-extensions");
2840 (void) mkdir_p(extension_dir, 0600);
2841 }
2842
2843 /* Remount / as SLAVE so that nothing now mounted in the namespace
2844 * shows up in the parent */
2845 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2846 return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
2847
2848 if (p->root_image) {
2849 /* A root image is specified, mount it to the right place */
2850 r = dissected_image_mount(
2851 dissected_image,
2852 root,
2853 /* uid_shift= */ UID_INVALID,
2854 /* uid_range= */ UID_INVALID,
2855 userns_fd,
2856 dissect_image_flags);
2857 if (r < 0)
2858 return log_debug_errno(r, "Failed to mount root image: %m");
2859
2860 /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
2861 * if it likes. */
2862 if (loop_device) {
2863 r = loop_device_flock(loop_device, LOCK_UN);
2864 if (r < 0)
2865 return log_debug_errno(r, "Failed to release lock on loopback block device: %m");
2866 }
2867
2868 r = dissected_image_relinquish(dissected_image);
2869 if (r < 0)
2870 return log_debug_errno(r, "Failed to relinquish dissected image: %m");
2871
2872 } else if (p->root_directory) {
2873
2874 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
2875 r = path_is_mount_point_full(root, /* root = */ NULL, AT_SYMLINK_FOLLOW);
2876 if (r < 0)
2877 return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
2878 if (r == 0) {
2879 r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
2880 if (r < 0)
2881 return r;
2882 }
2883
2884 } else {
2885 /* Let's mount the main root directory to the root directory to use */
2886 r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
2887 if (r < 0)
2888 return r;
2889 }
2890
2891 /* Try to set up the new root directory before mounting anything else there. */
2892 if (p->root_image || p->root_directory)
2893 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
2894
2895 /* Now make the magic happen */
2896 r = apply_mounts(&ml, root, p, reterr_path);
2897 if (r < 0)
2898 return r;
2899
2900 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
2901 r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
2902 if (r == -EINVAL && p->root_directory) {
2903 /* If we are using root_directory and we don't have privileges (ie: user manager in a user
2904 * namespace) and the root_directory is already a mount point in the parent namespace,
2905 * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
2906 * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
2907 * mount point) and try again. */
2908 r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
2909 if (r < 0)
2910 return r;
2911 r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
2912 }
2913 if (r < 0)
2914 return log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
2915
2916 /* Remount / as the desired mode. Note that this will not reestablish propagation from our side to
2917 * the host, since what's disconnected is disconnected. */
2918 if (mount(NULL, "/", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
2919 return log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
2920
2921 /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only supported for
2922 * non-shared mounts. This needs to happen after remounting / or it will fail. */
2923 if (setup_propagate && mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL) < 0)
2924 return log_debug_errno(errno, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir);
2925
2926 return 0;
2927}
2928
2929void bind_mount_free_many(BindMount *b, size_t n) {
2930 assert(b || n == 0);
2931
2932 FOREACH_ARRAY(i, b, n) {
2933 free(i->source);
2934 free(i->destination);
2935 }
2936
2937 free(b);
2938}
2939
2940int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
2941 _cleanup_free_ char *s = NULL, *d = NULL;
2942
2943 assert(b);
2944 assert(n);
2945 assert(item);
2946
2947 s = strdup(item->source);
2948 if (!s)
2949 return -ENOMEM;
2950
2951 d = strdup(item->destination);
2952 if (!d)
2953 return -ENOMEM;
2954
2955 if (!GREEDY_REALLOC(*b, *n + 1))
2956 return -ENOMEM;
2957
2958 (*b)[(*n)++] = (BindMount) {
2959 .source = TAKE_PTR(s),
2960 .destination = TAKE_PTR(d),
2961 .read_only = item->read_only,
2962 .nodev = item->nodev,
2963 .nosuid = item->nosuid,
2964 .noexec = item->noexec,
2965 .recursive = item->recursive,
2966 .ignore_enoent = item->ignore_enoent,
2967 };
2968
2969 return 0;
2970}
2971
2972MountImage* mount_image_free_many(MountImage *m, size_t *n) {
2973 assert(n);
2974 assert(m || *n == 0);
2975
2976 for (size_t i = 0; i < *n; i++) {
2977 free(m[i].source);
2978 free(m[i].destination);
2979 mount_options_free_all(m[i].mount_options);
2980 }
2981
2982 free(m);
2983 *n = 0;
2984 return NULL;
2985}
2986
2987int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
2988 _cleanup_free_ char *s = NULL, *d = NULL;
2989 _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
2990
2991 assert(m);
2992 assert(n);
2993 assert(item);
2994
2995 s = strdup(item->source);
2996 if (!s)
2997 return -ENOMEM;
2998
2999 if (item->destination) {
3000 d = strdup(item->destination);
3001 if (!d)
3002 return -ENOMEM;
3003 }
3004
3005 LIST_FOREACH(mount_options, i, item->mount_options) {
3006 _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
3007
3008 o = new(MountOptions, 1);
3009 if (!o)
3010 return -ENOMEM;
3011
3012 *o = (MountOptions) {
3013 .partition_designator = i->partition_designator,
3014 .options = strdup(i->options),
3015 };
3016 if (!o->options)
3017 return -ENOMEM;
3018
3019 LIST_APPEND(mount_options, options, TAKE_PTR(o));
3020 }
3021
3022 if (!GREEDY_REALLOC(*m, *n + 1))
3023 return -ENOMEM;
3024
3025 (*m)[(*n)++] = (MountImage) {
3026 .source = TAKE_PTR(s),
3027 .destination = TAKE_PTR(d),
3028 .mount_options = TAKE_PTR(options),
3029 .ignore_enoent = item->ignore_enoent,
3030 .type = item->type,
3031 };
3032
3033 return 0;
3034}
3035
3036void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
3037 assert(t || n == 0);
3038
3039 for (size_t i = 0; i < n; i++) {
3040 free(t[i].path);
3041 free(t[i].options);
3042 }
3043
3044 free(t);
3045}
3046
3047int temporary_filesystem_add(
3048 TemporaryFileSystem **t,
3049 size_t *n,
3050 const char *path,
3051 const char *options) {
3052
3053 _cleanup_free_ char *p = NULL, *o = NULL;
3054
3055 assert(t);
3056 assert(n);
3057 assert(path);
3058
3059 p = strdup(path);
3060 if (!p)
3061 return -ENOMEM;
3062
3063 if (!isempty(options)) {
3064 o = strdup(options);
3065 if (!o)
3066 return -ENOMEM;
3067 }
3068
3069 if (!GREEDY_REALLOC(*t, *n + 1))
3070 return -ENOMEM;
3071
3072 (*t)[(*n)++] = (TemporaryFileSystem) {
3073 .path = TAKE_PTR(p),
3074 .options = TAKE_PTR(o),
3075 };
3076
3077 return 0;
3078}
3079
3080static int make_tmp_prefix(const char *prefix) {
3081 _cleanup_free_ char *t = NULL;
3082 _cleanup_close_ int fd = -EBADF;
3083 int r;
3084
3085 /* Don't do anything unless we know the dir is actually missing */
3086 r = access(prefix, F_OK);
3087 if (r >= 0)
3088 return 0;
3089 if (errno != ENOENT)
3090 return -errno;
3091
3092 WITH_UMASK(000)
3093 r = mkdir_parents(prefix, 0755);
3094 if (r < 0)
3095 return r;
3096
3097 r = tempfn_random(prefix, NULL, &t);
3098 if (r < 0)
3099 return r;
3100
3101 /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
3102 * the suid bit, below. */
3103 fd = open_mkdir(t, O_EXCL|O_CLOEXEC, 0777);
3104 if (fd < 0)
3105 return fd;
3106
3107 r = RET_NERRNO(fchmod(fd, 01777));
3108 if (r < 0) {
3109 (void) rmdir(t);
3110 return r;
3111 }
3112
3113 r = RET_NERRNO(rename(t, prefix));
3114 if (r < 0) {
3115 (void) rmdir(t);
3116 return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
3117 }
3118
3119 return 0;
3120}
3121
3122static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
3123 _cleanup_free_ char *x = NULL;
3124 _cleanup_free_ char *y = NULL;
3125 sd_id128_t boot_id;
3126 bool rw = true;
3127 int r;
3128
3129 assert(id);
3130 assert(prefix);
3131 assert(path);
3132
3133 /* We include the boot id in the directory so that after a
3134 * reboot we can easily identify obsolete directories. */
3135
3136 r = sd_id128_get_boot(&boot_id);
3137 if (r < 0)
3138 return r;
3139
3140 x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
3141 if (!x)
3142 return -ENOMEM;
3143
3144 r = make_tmp_prefix(prefix);
3145 if (r < 0)
3146 return r;
3147
3148 WITH_UMASK(0077)
3149 if (!mkdtemp(x)) {
3150 if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
3151 rw = false;
3152 else
3153 return -errno;
3154 }
3155
3156 if (rw) {
3157 y = strjoin(x, "/tmp");
3158 if (!y)
3159 return -ENOMEM;
3160
3161 WITH_UMASK(0000)
3162 if (mkdir(y, 0777 | S_ISVTX) < 0)
3163 return -errno;
3164
3165 r = label_fix_full(AT_FDCWD, y, prefix, 0);
3166 if (r < 0)
3167 return r;
3168
3169 if (tmp_path)
3170 *tmp_path = TAKE_PTR(y);
3171 } else {
3172 /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
3173 * read-only. This way the service will get the EROFS result as if it was writing to the real
3174 * file system. */
3175 WITH_UMASK(0000)
3176 r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
3177 if (r < 0)
3178 return r;
3179
3180 r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
3181 if (r < 0)
3182 return r;
3183 }
3184
3185 *path = TAKE_PTR(x);
3186 return 0;
3187}
3188
3189char* namespace_cleanup_tmpdir(char *p) {
3190 PROTECT_ERRNO;
3191 if (!streq_ptr(p, RUN_SYSTEMD_EMPTY))
3192 (void) rmdir(p);
3193 return mfree(p);
3194}
3195
3196int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
3197 _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
3198 _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
3199 char *b;
3200 int r;
3201
3202 assert(id);
3203 assert(tmp_dir);
3204 assert(var_tmp_dir);
3205
3206 r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
3207 if (r < 0)
3208 return r;
3209
3210 r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
3211 if (r < 0)
3212 return r;
3213
3214 a_tmp = mfree(a_tmp); /* avoid rmdir */
3215 *tmp_dir = TAKE_PTR(a);
3216 *var_tmp_dir = TAKE_PTR(b);
3217
3218 return 0;
3219}
3220
3221int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
3222 _cleanup_close_ int ns = -EBADF;
3223 const char *ns_name, *ns_path;
3224 int r;
3225
3226 assert(ns_storage_socket);
3227 assert(ns_storage_socket[0] >= 0);
3228 assert(ns_storage_socket[1] >= 0);
3229
3230 ns_name = ASSERT_PTR(namespace_single_flag_to_string(nsflag));
3231
3232 /* We use the passed socketpair as a storage buffer for our namespace reference fd. Whatever process
3233 * runs this first shall create a new namespace, all others should just join it. To serialize that we
3234 * use a file lock on the socket pair.
3235 *
3236 * It's a bit crazy, but hey, works great! */
3237
3238 r = posix_lock(ns_storage_socket[0], LOCK_EX);
3239 if (r < 0)
3240 return r;
3241
3242 CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
3243
3244 ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3245 if (ns >= 0) {
3246 /* Yay, found something, so let's join the namespace */
3247 r = RET_NERRNO(setns(ns, nsflag));
3248 if (r < 0)
3249 return r;
3250
3251 return 0;
3252 }
3253
3254 if (ns != -EAGAIN)
3255 return ns;
3256
3257 /* Nothing stored yet, so let's create a new namespace. */
3258
3259 if (unshare(nsflag) < 0)
3260 return -errno;
3261
3262 if (nsflag == CLONE_NEWNET)
3263 (void) loopback_setup();
3264
3265 ns_path = strjoina("/proc/self/ns/", ns_name);
3266 ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
3267 if (ns < 0)
3268 return -errno;
3269
3270 r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
3271 if (r < 0)
3272 return r;
3273
3274 return 1;
3275}
3276
3277int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
3278 _cleanup_close_ int ns = -EBADF;
3279 NamespaceType type;
3280 int r;
3281
3282 assert(ns_storage_socket);
3283 assert(ns_storage_socket[0] >= 0);
3284 assert(ns_storage_socket[1] >= 0);
3285 assert(path);
3286
3287 /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in
3288 * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
3289 * allocate a new anonymous ns if needed. */
3290
3291 type = clone_flag_to_namespace_type(nsflag);
3292 assert(type >= 0);
3293
3294 r = posix_lock(ns_storage_socket[0], LOCK_EX);
3295 if (r < 0)
3296 return r;
3297
3298 CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
3299
3300 ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3301 if (ns >= 0)
3302 return 0;
3303 if (ns != -EAGAIN)
3304 return ns;
3305
3306 /* Nothing stored yet. Open the file from the file system. */
3307
3308 ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3309 if (ns < 0)
3310 return -errno;
3311
3312 r = fd_is_namespace(ns, type);
3313 if (r < 0)
3314 return r;
3315 if (r == 0)
3316 return -EINVAL;
3317
3318 r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
3319 if (r < 0)
3320 return r;
3321
3322 return 1;
3323}
3324
3325static int is_extension_overlay(const char *path, int fd) {
3326 _cleanup_free_ char *source = NULL;
3327 _cleanup_close_ int dfd = -EBADF;
3328 int r;
3329
3330 assert(path);
3331
3332 if (fd < 0) {
3333 r = chase(path, /* root= */ NULL, CHASE_TRAIL_SLASH|CHASE_MUST_BE_DIRECTORY, /* ret_path= */ NULL, &dfd);
3334 if (r < 0)
3335 return r;
3336 fd = dfd;
3337 }
3338
3339 r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
3340 if (r < 0)
3341 return log_debug_errno(r, "Unable to determine whether '%s' is a mount point: %m", path);
3342 if (r == 0)
3343 return 0;
3344
3345 r = fd_is_fs_type(fd, OVERLAYFS_SUPER_MAGIC);
3346 if (r < 0)
3347 return log_debug_errno(r, "Failed to check if %s is an overlayfs: %m", path);
3348 if (r == 0)
3349 return 0;
3350
3351 /* Check the 'source' field of the mount on mount_path */
3352 r = path_get_mount_info_at(fd, /* path= */ NULL, /* ret_fstype= */ NULL, /* ret_options= */ NULL, &source);
3353 if (r < 0)
3354 return log_debug_errno(r, "Failed to get mount info for %s: %m", path);
3355 if (!streq_ptr(source, "systemd-extensions"))
3356 return 0;
3357
3358 return 1;
3359}
3360
3361static int unpeel_get_fd(const char *mount_path, int *ret_fd) {
3362 _cleanup_close_pair_ int pipe_fds[2] = EBADF_PAIR;
3363 _cleanup_close_ int fs_fd = -EBADF;
3364 pid_t pid;
3365 int r;
3366
3367 assert(mount_path);
3368 assert(ret_fd);
3369
3370 r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pipe_fds);
3371 if (r < 0)
3372 return log_debug_errno(errno, "Failed to create socket pair: %m");
3373
3374 /* Clone mount namespace here to unpeel without affecting live process */
3375 r = safe_fork("(sd-ns-unpeel)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, &pid);
3376 if (r < 0)
3377 return r;
3378 if (r == 0) {
3379 _cleanup_close_ int dir_fd = -EBADF;
3380
3381 pipe_fds[0] = safe_close(pipe_fds[0]);
3382
3383 /* Opportunistically unmount any overlay at this path */
3384 r = is_extension_overlay(mount_path, /* fd= */ -EBADF);
3385 if (r < 0) {
3386 log_debug_errno(r, "Unable to determine whether '%s' is an extension overlay: %m", mount_path);
3387 _exit(EXIT_FAILURE);
3388 }
3389 if (r > 0) {
3390 r = umount_recursive(mount_path, MNT_DETACH);
3391 if (r < 0)
3392 _exit(EXIT_FAILURE);
3393 if (r == 0) /* no umounts done, possible if a previous reload deleted all extensions */
3394 log_debug("No overlay layer unmountable from %s", mount_path);
3395 }
3396
3397 /* Now that /mount_path is exposed, get an FD for it and pass back */
3398 dir_fd = open_tree(-EBADF, mount_path, AT_SYMLINK_NOFOLLOW|OPEN_TREE_CLONE);
3399 if (dir_fd < 0) {
3400 log_debug_errno(errno, "Failed to clone mount %s: %m", mount_path);
3401 _exit(EXIT_FAILURE);
3402 }
3403
3404 r = fd_is_fs_type(dir_fd, OVERLAYFS_SUPER_MAGIC);
3405 if (r < 0) {
3406 log_debug_errno(r, "Unable to determine whether '%s' is an overlay after opening mount tree: %m", mount_path);
3407 _exit(EXIT_FAILURE);
3408 }
3409 if (r > 0) {
3410 log_debug_errno(r, "'%s' is still an overlay after opening mount tree: %m", mount_path);
3411 _exit(EXIT_FAILURE);
3412 }
3413
3414 r = send_one_fd(pipe_fds[1], dir_fd, 0);
3415 if (r < 0) {
3416 log_debug_errno(r, "Failed to send mount fd: %m");
3417 _exit(EXIT_FAILURE);
3418 }
3419
3420 _exit(EXIT_SUCCESS);
3421 }
3422
3423 pipe_fds[1] = safe_close(pipe_fds[1]);
3424
3425 r = receive_one_fd(pipe_fds[0], 0);
3426 if (r < 0)
3427 return log_debug_errno(r, "Failed to receive mount fd: %m");
3428 fs_fd = r;
3429
3430 r = fd_is_fs_type(fs_fd, OVERLAYFS_SUPER_MAGIC);
3431 if (r < 0)
3432 return log_debug_errno(r, "Unable to determine if unpeeled directory refers to overlayfs: %m");
3433 if (r > 0)
3434 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Unpeeled mount is still an overlayfs, something is weird, refusing.");
3435
3436 *ret_fd = TAKE_FD(fs_fd);
3437 return 0;
3438}
3439
3440/* In target namespace, unmounts an existing overlayfs at mount_path (if one exists), grabs FD from the
3441 * underlying directory, and sets up a new overlayfs mount. Coordinates with parent process over pair_fd:
3442 * 1. Creates and sends new overlay fs fd to parent
3443 * 2. Fake-unmounts overlay at mount_path to obtain underlying directory fd to build new overlay
3444 * 3. Waits for parent to configure layers
3445 * 4. Performs final mount at mount_path
3446 *
3447 * This is used by refresh_extensions_in_namespace() to peel back any existing overlays and reapply them.
3448 */
3449static int unpeel_mount_and_setup_overlay(int pair_fd, const char *mount_path) {
3450 _cleanup_close_ int dir_unpeeled_fd = -EBADF, overlay_fs_fd = -EBADF, mount_fd = -EBADF;
3451 int r;
3452
3453 assert(pair_fd >= 0);
3454 assert(mount_path);
3455
3456 /* Create new OverlayFS and send to parent */
3457 overlay_fs_fd = fsopen("overlay", FSOPEN_CLOEXEC);
3458 if (overlay_fs_fd < 0)
3459 return log_debug_errno(errno, "Failed to create overlay fs for %s: %m", mount_path);
3460
3461 r = send_one_fd(pair_fd, overlay_fs_fd, /* flags= */ 0);
3462 if (r < 0)
3463 return log_debug_errno(r, "Failed to send overlay fs fd to parent: %m");
3464
3465 /* Unpeel in cloned mount namespace to get underlying directory fd */
3466 r = unpeel_get_fd(mount_path, &dir_unpeeled_fd);
3467 if (r < 0)
3468 return log_debug_errno(r, "Failed to unpeel mount %s: %m", mount_path);
3469
3470 /* Send the fd to the parent */
3471 r = send_one_fd(pair_fd, dir_unpeeled_fd, /* flags= */ 0);
3472 if (r < 0)
3473 return log_debug_errno(r, "Failed to send %s fd to parent: %m", mount_path);
3474
3475 /* Wait for parent to signal overlay configuration completion */
3476 log_debug("Waiting for configured overlay fs for %s", mount_path);
3477 r = receive_one_fd(pair_fd, 0);
3478 if (r < 0)
3479 return log_debug_errno(r, "Failed to receive configured overlay: %m");
3480
3481 /* Create the mount */
3482 mount_fd = fsmount(overlay_fs_fd, FSMOUNT_CLOEXEC, /* flags= */ 0);
3483 if (mount_fd < 0)
3484 return log_debug_errno(errno, "Failed to create overlay mount: %m");
3485
3486 /* Move mount to final location */
3487 r = mount_exchange_graceful(mount_fd, mount_path, /* mount_beneath= */ true);
3488 if (r < 0)
3489 return log_debug_errno(r, "Failed to move overlay to %s: %m", mount_path);
3490
3491 return 0;
3492}
3493
3494static int refresh_grandchild_proc(
3495 const PidRef *target,
3496 MountList *ml,
3497 const char *overlay_prefix,
3498 int pidns_fd,
3499 int mntns_fd,
3500 int root_fd,
3501 int pipe_fd) {
3502
3503 int r;
3504
3505 assert(pidref_is_set(target));
3506 assert(ml);
3507 assert(overlay_prefix);
3508 assert(pidns_fd >= 0);
3509 assert(mntns_fd >= 0);
3510 assert(root_fd >= 0);
3511 assert(pipe_fd >= 0);
3512
3513 r = namespace_enter(pidns_fd, mntns_fd, /* netns_fd= */ -EBADF, /* userns_fd= */ -EBADF, root_fd);
3514 if (r < 0)
3515 return log_debug_errno(r, "Failed to enter namespace: %m");
3516
3517 /* Handle each overlay mount path */
3518 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
3519 if (m->mode != MOUNT_OVERLAY)
3520 continue;
3521
3522 /* Need an absolute path under the child namespace, rather than the root's */
3523 _cleanup_free_ char *mount_path = NULL;
3524 mount_path = path_join("/",
3525 path_startswith(mount_entry_unprefixed_path(m), overlay_prefix) ?:
3526 mount_entry_unprefixed_path(m));
3527 if (!mount_path)
3528 return log_oom_debug();
3529
3530 /* If there are no extensions mounted for this overlay layer, instead of setting everything
3531 * up, the correct behavior is to unmount the existing overlay in the target namespace to
3532 * expose the original files. */
3533 if (strv_isempty(m->overlay_layers)) {
3534 r = is_extension_overlay(mount_path, /* fd= */ -EBADF);
3535 if (r < 0)
3536 return log_debug_errno(r, "Unable to determine whether '%s' is an extension overlay: %m", mount_path);
3537 if (r == 0)
3538 continue;
3539
3540 log_debug("No extensions for %s, undoing existing mount", mount_path);
3541 (void) umount_recursive(mount_path, MNT_DETACH);
3542
3543 continue;
3544 }
3545
3546 r = unpeel_mount_and_setup_overlay(pipe_fd, mount_path);
3547 if (r < 0)
3548 return log_debug_errno(r, "Failed to setup overlay mount for %s: %m", mount_path);
3549 }
3550
3551 return 0;
3552}
3553
3554static int handle_mount_from_grandchild(
3555 MountEntry *m,
3556 const char *overlay_prefix,
3557 int **fd_layers,
3558 size_t *n_fd_layers,
3559 int pipe_fd) {
3560
3561 _cleanup_free_ char *layers = NULL, *options = NULL, *hierarchy_path_moved_mount = NULL;
3562 _cleanup_close_ int hierarchy_path_fd = -EBADF, overlay_fs_fd = -EBADF;
3563 _cleanup_strv_free_ char **new_layers = NULL;
3564 int r;
3565
3566 assert(m);
3567 assert(overlay_prefix);
3568 assert(fd_layers);
3569 assert(n_fd_layers);
3570 assert(pipe_fd >= 0);
3571
3572 if (m->mode != MOUNT_OVERLAY)
3573 return 0;
3574
3575 const char *mount_path = path_startswith(mount_entry_unprefixed_path(m), overlay_prefix);
3576 if (!mount_path)
3577 mount_path = mount_entry_unprefixed_path(m);
3578
3579 /* If there are no extensions mounted for this overlay layer, we only need to
3580 * unmount the existing overlay (this is handled in the grandchild process) and
3581 * would skip the usual cooperative processing here.
3582 */
3583 if (strv_isempty(m->overlay_layers)) {
3584 log_debug("No layers for %s, skip setting up overlay", mount_path);
3585 return 0;
3586 }
3587
3588 /* Receive the fds from grandchild */
3589 overlay_fs_fd = receive_one_fd(pipe_fd, 0);
3590 if (overlay_fs_fd < 0)
3591 return log_debug_errno(overlay_fs_fd, "Failed to receive overlay fs fd from grandchild: %m");
3592
3593 hierarchy_path_fd = receive_one_fd(pipe_fd, 0);
3594 if (hierarchy_path_fd < 0)
3595 return log_debug_errno(hierarchy_path_fd, "Failed to receive fd from grandchild for %s: %m", mount_path);
3596
3597 /* move_mount so that it is visible on our end. */
3598 hierarchy_path_moved_mount = path_join(overlay_prefix, mount_path);
3599 if (!hierarchy_path_moved_mount)
3600 return log_oom_debug();
3601
3602 (void) mkdir_p_label(hierarchy_path_moved_mount, 0555);
3603 r = move_mount(hierarchy_path_fd, "", AT_FDCWD, hierarchy_path_moved_mount, MOVE_MOUNT_F_EMPTY_PATH);
3604 if (r < 0)
3605 return log_debug_errno(r, "Failed to move mount for %s: %m", mount_path);
3606
3607 /* Turn all overlay layer directories into FD-based references */
3608 if (!GREEDY_REALLOC(*fd_layers, *n_fd_layers + strv_length(m->overlay_layers)))
3609 return log_oom_debug();
3610
3611 STRV_FOREACH(ol, m->overlay_layers) {
3612 _cleanup_close_ int tree_fd = -EBADF;
3613
3614 tree_fd = open_tree(-EBADF, *ol, /* flags= */ 0);
3615 if (tree_fd < 0)
3616 return log_debug_errno(errno, "Failed to open_tree overlay layer '%s': %m", *ol);
3617
3618 r = strv_extend(&new_layers, FORMAT_PROC_FD_PATH(tree_fd));
3619 if (r < 0)
3620 return log_oom_debug();
3621
3622 *fd_layers[(*n_fd_layers)++] = TAKE_FD(tree_fd);
3623 }
3624 m->overlay_layers = strv_free(m->overlay_layers);
3625 m->overlay_layers = TAKE_PTR(new_layers);
3626
3627 layers = strv_join(m->overlay_layers, ":");
3628 if (!layers)
3629 return log_oom_debug();
3630
3631 /* Append the underlying hierarchy path as the last lowerdir */
3632 options = strjoin(layers, ":", FORMAT_PROC_FD_PATH(hierarchy_path_fd));
3633 if (!options)
3634 return log_oom_debug();
3635
3636 if (fsconfig(overlay_fs_fd, FSCONFIG_SET_STRING, "lowerdir", options, 0) < 0)
3637 return log_debug_errno(errno, "Failed to set lowerdir=%s: %m", options);
3638
3639 if (fsconfig(overlay_fs_fd, FSCONFIG_SET_STRING, "source", "systemd-extensions", 0) < 0)
3640 return log_debug_errno(errno, "Failed to set source=systemd-extensions: %m");
3641
3642 /* Create the superblock */
3643 if (fsconfig(overlay_fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
3644 return log_debug_errno(errno, "Failed to create overlay superblock: %m");
3645
3646 /* Signal completion to grandchild */
3647 r = send_one_fd(pipe_fd, overlay_fs_fd, 0);
3648 if (r < 0)
3649 return log_debug_errno(r, "Failed to signal overlay configuration complete for %s: %m", mount_path);
3650
3651 return 0;
3652}
3653
/* Applies all extension directory/image entries of 'ml' with the filter class forced to IMAGE_CONFEXT,
 * and removes from the list those entries for which apply_one_mount() reports that nothing happened.
 * For every removed entry, the overlay layers whose path starts with the entry's path are dropped from
 * all MOUNT_OVERLAY entries too.
 *
 * The array is compacted in place ('f' is the read cursor, 't' the write cursor) and ml->n_mounts is
 * updated to the number of entries kept.
 *
 * Returns 0 on success, negative errno on failure. */
3654static int refresh_apply_and_prune(const NamespaceParameters *p, MountList *ml) {
3655 int r;
3656
3657 assert(p);
3658 assert(ml);
3659
3660 /* Open all extensions on the host, drop all sysexts since they won't have /etc/. The list of
3661 * overlays also need to be updated, so that if it's empty after a confext has been removed, the
3662 * child process can correctly undo the overlay in the target namespace, rather than attempting to
3663 * mount an empty overlay which the kernel does not allow, so this pruning has to be done here and
3664 * not later (nor earlier, as we don't know if an image is a confext until this point). */
3665 MountEntry *f, *t;
3666 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
3667 if (IN_SET(f->mode, MOUNT_EXTENSION_DIRECTORY, MOUNT_EXTENSION_IMAGE)) {
3668 f->filter_class = IMAGE_CONFEXT;
3669
3670 r = apply_one_mount("/", f, p);
3671 if (r < 0)
3672 return r;
3673 /* Nothing happened? Then it is not a confext, prune it from the lists */
3674 if (r == 0) {
/* NOTE(review): this walks all ml->n_mounts slots, i.e. also slots between 't' and 'f' that were
 * already copied down and hence are shallow duplicates. Looks harmless as long as no MOUNT_OVERLAY
 * entry has been moved down before a later prune happens; confirm against the sort order. */
3675 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
3676 if (m->mode != MOUNT_OVERLAY)
3677 continue;
3678
3679 _cleanup_strv_free_ char **pruned = NULL;
3680
/* Rebuild the layer list, keeping only layers that do not live below the pruned entry. */
3681 STRV_FOREACH(ol, m->overlay_layers)
3682 if (!path_startswith(*ol, mount_entry_path(f))) {
3683 r = strv_extend(&pruned, *ol);
3684 if (r < 0)
3685 return log_oom_debug();
3686 }
3687 strv_free(m->overlay_layers);
3688 m->overlay_layers = TAKE_PTR(pruned);
3689 }
/* Release the pruned entry and do not advance the write cursor, so that its slot gets reused. */
3690 mount_entry_done(f);
3691 continue;
3692 }
3693 }
3694
/* Entry is kept: copy it down to the write cursor. */
3695 *t = *f;
3696 t++;
3697 }
3698
3699 ml->n_mounts = t - ml->mounts;
3700
3701 return 0;
3702}
3703
/* Re-applies the extension overlays for the hierarchies listed in 'hierarchy_env' inside the mount
 * namespace of the running process 'target', using the extension images and directories from 'p'.
 *
 * The work is split between a forked child, which sets up the extensions and configures the new
 * overlays, and a grandchild, which enters the target's namespaces, exposes the directories underneath
 * the existing overlays and finally attaches the new overlays.
 *
 * Returns 0 on success, or if there is nothing to mount. Returns -EINVAL if the target's mount
 * namespace is our own, or another negative errno on failure. */
3704int refresh_extensions_in_namespace(
3705 const PidRef *target,
3706 const char *hierarchy_env,
3707 const NamespaceParameters *p) {
3708
3709 _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF;
3710 const char *overlay_prefix = "/run/systemd/mount-rootfs";
3711 _cleanup_(mount_list_done) MountList ml = {};
3712 _cleanup_free_ char *extension_dir = NULL;
3713 _cleanup_strv_free_ char **hierarchies = NULL;
3714 int r;
3715
3716 assert(pidref_is_set(target));
3717 assert(hierarchy_env);
3718 assert(p);
3719
3720 log_debug("Refreshing extensions in-namespace for hierarchy '%s'", hierarchy_env);
3721
3722 r = pidref_namespace_open(target, &pidns_fd, &mntns_fd, /* ret_netns_fd= */ NULL, /* ret_userns_fd= */ NULL, &root_fd);
3723 if (r < 0)
3724 return log_debug_errno(r, "Failed to open namespace: %m");
3725
/* Refuse to operate if the target shares our own mount namespace. */
3726 r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
3727 if (r < 0)
3728 return log_debug_errno(r, "Failed to check if target namespace is separate: %m");
3729 if (r > 0)
3730 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Target namespace is not separate, cannot reload extensions");
3731
/* NOTE(review): extension_dir is computed here but not referenced again in this function. */
3732 extension_dir = path_join(p->private_namespace_dir, "unit-extensions");
3733 if (!extension_dir)
3734 return log_oom_debug();
3735
3736 r = parse_env_extension_hierarchies(&hierarchies, hierarchy_env);
3737 if (r < 0)
3738 return r;
3739
3740 r = append_extensions(
3741 &ml,
3742 overlay_prefix,
3743 p->private_namespace_dir,
3744 hierarchies,
3745 p->extension_images,
3746 p->n_extension_images,
3747 p->extension_directories);
3748 if (r < 0)
3749 return r;
3750
3751 sort_and_drop_unused_mounts(&ml, overlay_prefix);
3752 if (ml.n_mounts == 0)
3753 return 0;
3754
3755 /**
3756 * There are three main steps:
3757 * 1. In child, set up the extension images and directories in a slave mountns, so that we have
3758 * access to their FDs
3759 * 2. Fork into a grandchild, which will enter the target namespace and attempt to "unpeel" the
3760 * overlays to obtain FDs of the underlying directories, over which we will reapply the overlays
3761 * 3. In the child again, receive the FDs and reapply the overlays
3762 */
/* NOTE(review): with FORK_WAIT the parent presumably only continues after the child exited, with a
 * failed child surfacing as r < 0 here; confirm against safe_fork(). */
3763 r = safe_fork("(sd-ns-refresh-exts)",
3764 FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE,
3765 NULL);
3766 if (r < 0)
3767 return r;
3768 if (r == 0) {
3769 /* Child (host namespace) */
3770 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
3771 _cleanup_(sigkill_waitp) pid_t grandchild_pid = 0;
3772
3773 (void) mkdir_p_label(overlay_prefix, 0555);
3774
3775 r = refresh_apply_and_prune(p, &ml);
3776 if (r < 0) {
3777 log_debug_errno(r, "Failed to apply extensions for refreshing: %m");
3778 _exit(EXIT_FAILURE);
3779 }
3780
3781 /* Create a grandchild process to handle the unmounting and reopening of hierarchy */
3782 r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair);
3783 if (r < 0) {
3784 log_debug_errno(errno, "Failed to create socket pair: %m");
3785 _exit(EXIT_FAILURE);
3786 }
3787
3788 r = safe_fork("(sd-ns-refresh-exts-grandchild)",
3789 FORK_LOG|FORK_DEATHSIG_SIGKILL,
3790 &grandchild_pid);
3791 if (r < 0)
3792 _exit(EXIT_FAILURE);
3793 if (r == 0) {
3794 /* Grandchild (target service namespace) */
3795 pair[0] = safe_close(pair[0]);
3796
3797 r = refresh_grandchild_proc(target, &ml, overlay_prefix, pidns_fd, mntns_fd, root_fd, pair[1]);
3798 if (r < 0) {
3799 pair[1] = safe_close(pair[1]);
3800 _exit(EXIT_FAILURE);
3801 }
3802
3803 _exit(EXIT_SUCCESS);
3804 }
3805
3806 pair[1] = safe_close(pair[1]);
3807
3808 /* Until kernel 6.15, the FDs to the individual layers used to set up the OverlayFS via
3809 * lowerdir=/proc/self/fd/X need to remain open until the OverlayFS mount is _attached_
3810 * (as opposed to merely created) to its mount point, hence we need to ensure these FDs
3811 * stay open until the grandchild has attached the mount and exited. */
3812 // TODO: once the kernel baseline is >= 6.15, move the FD array into the helper function
3813 // and close them immediately
3814 int *fd_layers = NULL;
3815 size_t n_fd_layers = 0;
3816 CLEANUP_ARRAY(fd_layers, n_fd_layers, close_many_and_free);
3817
/* Serve the grandchild: one exchange per overlay entry, in the same list order it iterates in. */
3818 FOREACH_ARRAY(m, ml.mounts, ml.n_mounts) {
3819 r = handle_mount_from_grandchild(m, overlay_prefix, &fd_layers, &n_fd_layers, pair[0]);
3820 if (r < 0)
3821 _exit(EXIT_FAILURE);
3822 }
3823
3824 r = wait_for_terminate_and_check("(sd-ns-refresh-exts-grandchild)", TAKE_PID(grandchild_pid), 0);
3825 if (r < 0) {
3826 log_debug_errno(r, "Failed to wait for target namespace process to finish: %m");
3827 _exit(EXIT_FAILURE);
3828 }
3829 if (r != EXIT_SUCCESS) {
3830 log_debug("Target namespace fork did not succeed");
3831 _exit(EXIT_FAILURE);
3832 }
3833
3834 _exit(EXIT_SUCCESS);
3835 }
3836
3837 return 0;
3838}
3839
3840static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
3841 [PROTECT_HOME_NO] = "no",
3842 [PROTECT_HOME_YES] = "yes",
3843 [PROTECT_HOME_READ_ONLY] = "read-only",
3844 [PROTECT_HOME_TMPFS] = "tmpfs",
3845};
3846
3847DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
3848
3849static const char *const protect_hostname_table[_PROTECT_HOSTNAME_MAX] = {
3850 [PROTECT_HOSTNAME_NO] = "no",
3851 [PROTECT_HOSTNAME_YES] = "yes",
3852 [PROTECT_HOSTNAME_PRIVATE] = "private",
3853};
3854
3855DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_hostname, ProtectHostname, PROTECT_HOSTNAME_YES);
3856
3857static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
3858 [PROTECT_SYSTEM_NO] = "no",
3859 [PROTECT_SYSTEM_YES] = "yes",
3860 [PROTECT_SYSTEM_FULL] = "full",
3861 [PROTECT_SYSTEM_STRICT] = "strict",
3862};
3863
3864DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
3865
3866static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
3867 [PROTECT_CONTROL_GROUPS_NO] = "no",
3868 [PROTECT_CONTROL_GROUPS_YES] = "yes",
3869 [PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
3870 [PROTECT_CONTROL_GROUPS_STRICT] = "strict",
3871};
3872
3873DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);
3874
3875static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
3876 [PROTECT_PROC_DEFAULT] = "default",
3877 [PROTECT_PROC_NOACCESS] = "noaccess",
3878 [PROTECT_PROC_INVISIBLE] = "invisible",
3879 [PROTECT_PROC_PTRACEABLE] = "ptraceable",
3880};
3881
3882DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
3883
3884static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
3885 [PROC_SUBSET_ALL] = "all",
3886 [PROC_SUBSET_PID] = "pid",
3887};
3888
3889DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
3890
3891static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
3892 [PRIVATE_TMP_NO] = "no",
3893 [PRIVATE_TMP_CONNECTED] = "connected",
3894 [PRIVATE_TMP_DISCONNECTED] = "disconnected",
3895};
3896
3897DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_tmp, PrivateTmp, PRIVATE_TMP_CONNECTED);
3898
3899static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
3900 [PRIVATE_USERS_NO] = "no",
3901 [PRIVATE_USERS_SELF] = "self",
3902 [PRIVATE_USERS_IDENTITY] = "identity",
3903 [PRIVATE_USERS_FULL] = "full",
3904};
3905
3906DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
3907
3908static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = {
3909 [PRIVATE_PIDS_NO] = "no",
3910 [PRIVATE_PIDS_YES] = "yes",
3911};
3912
3913DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);