]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
man/systemd-sysext: list ephemeral/ephemeral-import in the list of options
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
15ae422b 2
e08f94ac 3#include <linux/loop.h>
07630cea 4#include <sched.h>
15ae422b 5#include <stdio.h>
836e4e7e 6#include <stdlib.h>
19df770f 7#include <sys/file.h>
07630cea 8#include <sys/mount.h>
07630cea 9#include <unistd.h>
15ae422b 10
b5efdb8a 11#include "alloc-util.h"
10404d52 12#include "base-filesystem.h"
f461a28d 13#include "chase.h"
7f112f50 14#include "dev-setup.h"
ec61371f 15#include "devnum-util.h"
836e4e7e
DDM
16#include "dissect-image.h"
17#include "errno-util.h"
93f59701 18#include "escape.h"
d51f8eb3 19#include "extension-util.h"
3ffd4af2 20#include "fd-util.h"
e5f10caf 21#include "format-util.h"
836e4e7e 22#include "fs-util.h"
e2341b6b 23#include "glyph-util.h"
0690160e 24#include "label-util.h"
b3d13314 25#include "list.h"
13339577 26#include "lock-util.h"
8aa304d3 27#include "log.h"
915e6d16 28#include "loop-util.h"
07630cea 29#include "loopback-setup.h"
dfdeb0b1 30#include "missing_magic.h"
35cd0ba5 31#include "mkdir-label.h"
4349cd7c 32#include "mount-util.h"
049af8ad 33#include "mountpoint-util.h"
3ffd4af2 34#include "namespace.h"
1cf40697 35#include "namespace-util.h"
54c2459d 36#include "nsflags.h"
d8b4d14d 37#include "nulstr-util.h"
93f59701 38#include "os-util.h"
07630cea 39#include "path-util.h"
dfdeb0b1 40#include "pidref.h"
41#include "process-util.h"
d7b8eec7 42#include "selinux-util.h"
2583fbea 43#include "socket-util.h"
760877e9 44#include "sort-util.h"
36ce7110 45#include "stat-util.h"
8b43440b 46#include "string-table.h"
07630cea
LP
47#include "string-util.h"
48#include "strv.h"
a652f050 49#include "tmpfile-util.h"
affb60b1 50#include "umask-util.h"
ee104e11 51#include "user-util.h"
5e79dd96 52#include "vpick.h"
15ae422b 53
737ba3c8 54#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
55
/* The kind of mount operation a MountEntry requests (stored in MountEntry.mode). Every value below
 * _MOUNT_MODE_MAX has a string name in mount_mode_table. */
c17ec25e 56typedef enum MountMode {
15ae422b 57 /* This is ordered by priority! */
a868e437
LP
58 MOUNT_INACCESSIBLE,
59 MOUNT_OVERLAY,
60 MOUNT_IMAGE,
61 MOUNT_BIND,
62 MOUNT_BIND_RECURSIVE,
63 MOUNT_PRIVATE_TMP,
a868e437
LP
64 MOUNT_PRIVATE_DEV,
65 MOUNT_BIND_DEV,
66 MOUNT_EMPTY_DIR,
67 MOUNT_PRIVATE_SYSFS,
68 MOUNT_BIND_SYSFS,
69 MOUNT_PROCFS,
cd58b5a1 70 MOUNT_PRIVATE_CGROUP2FS,
a868e437
LP
71 MOUNT_READ_ONLY,
72 MOUNT_READ_WRITE,
73 MOUNT_NOEXEC,
74 MOUNT_EXEC,
75 MOUNT_TMPFS,
76 MOUNT_RUN,
0e551b04 77 MOUNT_PRIVATE_TMPFS, /* Mounted outside the root directory, and used by subsequent mounts */
a868e437
LP
78 MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */
79 MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */
80 MOUNT_MQUEUEFS,
81 MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
5beb8688 82 _MOUNT_MODE_MAX,
a868e437 83 _MOUNT_MODE_INVALID = -EINVAL,
c17ec25e 84} MountMode;
15ae422b 85
/* Processing state of a single MountEntry; stored in MountEntry.state, which records whether the entry
 * was already processed or skipped. */
63862de4 86typedef enum MountEntryState {
67248bbd
LP
87 MOUNT_PENDING,
88 MOUNT_APPLIED,
89 MOUNT_SKIPPED,
90 _MOUNT_ENTRY_STATE_MAX,
91 _MOUNT_ENTRY_STATE_INVALID = -EINVAL,
63862de4
LB
92} MountEntryState;
93
/* Describes one mount operation to carry out. Several string fields come as a *_const / *_malloc pair:
 * the mount_entry_path()/mount_entry_source()/mount_entry_options() accessors return the *_malloc variant
 * when it is set and fall back to the *_const one, and mount_entry_done() frees only the *_malloc
 * variants (plus overlay_layers and verity). */
34de407a 94typedef struct MountEntry {
5327c910 95 const char *path_const; /* Memory allocated on stack or static */
a868e437 96 MountMode mode;
5327c910 97 bool ignore:1; /* Ignore if path does not exist? */
368a3071 98 bool has_prefix:1; /* Already prefixed by the root dir? */
cfbeb4ef 99 bool read_only:1; /* Shall this mount point be read-only? */
9ce4e4b0 100 bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
ddc155b2
TM
101 bool noexec:1; /* Shall set MS_NOEXEC on the mount itself */
102 bool exec:1; /* Shall clear MS_NOEXEC on the mount itself */
0e551b04
LB
103 bool create_source_dir:1; /* Create the source directory if it doesn't exist - for implicit bind mounts */
104 mode_t source_dir_mode; /* Mode for the source directory, if it is to be created */
63862de4 105 MountEntryState state; /* Whether it was already processed or skipped */
55fe7432 106 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
809ceb82
LB
107 const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
108 char *unprefixed_path_malloc;
b3d13314 109 const char *source_const; /* The source path, for bind mounts or images */
d2d6c096 110 char *source_malloc;
2abd4e38
YW
111 const char *options_const;/* Mount options for tmpfs */
112 char *options_malloc;
113 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
088696fe 114 unsigned n_followed;
9dc6a6af 115 LIST_HEAD(MountOptions, image_options_const);
f0304df6 116 char **overlay_layers; /* Layer paths of a MOUNT_OVERLAY entry; released with strv_free() in mount_entry_done() */
a1a40297 117 VeritySettings verity; /* Released with verity_settings_done() in mount_entry_done() */
8d9e00ea 118 ImageClass filter_class; /* Used for live updates to skip inapplicable images */
eae51272
AB
119 bool idmapped;
120 uid_t idmap_uid;
121 gid_t idmap_gid;
34de407a 122} MountEntry;
15ae422b 123
063c977a
LP
/* Growable array of MountEntry items: 'mounts' holds 'n_mounts' entries. Extended one slot at a time by
 * mount_list_extend() and released by mount_list_done(). */
124typedef struct MountList {
125 MountEntry *mounts;
126 size_t n_mounts;
127} MountList;
128
7a9f0125 129static const BindMount bind_log_sockets_table[] = {
95f9e85a
MY
130 { (char*) "/run/systemd/journal/socket", (char*) "/run/systemd/journal/socket", .read_only = true, .nosuid = true, .noexec = true, .nodev = true, .ignore_enoent = true },
131 { (char*) "/run/systemd/journal/stdout", (char*) "/run/systemd/journal/stdout", .read_only = true, .nosuid = true, .noexec = true, .nodev = true, .ignore_enoent = true },
132 { (char*) "/run/systemd/journal/dev-log", (char*) "/run/systemd/journal/dev-log", .read_only = true, .nosuid = true, .noexec = true, .nodev = true, .ignore_enoent = true },
368a3071
MY
133};
134
94293d65 135/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into it, but only as a fallback if the user hasn't mounted
3fe91079 136 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
5d997827 137static const MountEntry apivfs_table[] = {
a868e437
LP
138 { "/proc", MOUNT_PROCFS, false },
139 { "/dev", MOUNT_BIND_DEV, false },
140 { "/sys", MOUNT_BIND_SYSFS, false },
141 { "/run", MOUNT_RUN, false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
5d997827 142};
f471b2af 143
11a30cec 144/* ProtectKernelTunables= option and the related filesystem APIs */
788e7201 145static const MountEntry protect_kernel_tunables_proc_table[] = {
a868e437
LP
146 { "/proc/acpi", MOUNT_READ_ONLY, true },
147 { "/proc/apm", MOUNT_READ_ONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
148 { "/proc/asound", MOUNT_READ_ONLY, true },
149 { "/proc/bus", MOUNT_READ_ONLY, true },
150 { "/proc/fs", MOUNT_READ_ONLY, true },
151 { "/proc/irq", MOUNT_READ_ONLY, true },
152 { "/proc/kallsyms", MOUNT_INACCESSIBLE, true },
153 { "/proc/kcore", MOUNT_INACCESSIBLE, true },
154 { "/proc/latency_stats", MOUNT_READ_ONLY, true },
155 { "/proc/mtrr", MOUNT_READ_ONLY, true },
156 { "/proc/scsi", MOUNT_READ_ONLY, true },
157 { "/proc/sys", MOUNT_READ_ONLY, true },
158 { "/proc/sysrq-trigger", MOUNT_READ_ONLY, true },
159 { "/proc/timer_stats", MOUNT_READ_ONLY, true },
788e7201
TM
160};
161
162static const MountEntry protect_kernel_tunables_sys_table[] = {
a868e437
LP
163 { "/sys", MOUNT_READ_ONLY, false },
164 { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
165 { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
166 { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
167 { "/sys/kernel/debug", MOUNT_READ_ONLY, true },
168 { "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
11a30cec
DH
169};
170
c575770b 171/* ProtectKernelModules= option */
34de407a 172static const MountEntry protect_kernel_modules_table[] = {
a868e437 173 { "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
c575770b
DH
174};
175
94a7b275 176/* ProtectKernelLogs= option */
788e7201 177static const MountEntry protect_kernel_logs_proc_table[] = {
a868e437 178 { "/proc/kmsg", MOUNT_INACCESSIBLE, true },
788e7201
TM
179};
180
181static const MountEntry protect_kernel_logs_dev_table[] = {
a868e437 182 { "/dev/kmsg", MOUNT_INACCESSIBLE, true },
94a7b275
KK
183};
184
b6c432ca
DH
185/*
186 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
187 * system should be protected by ProtectSystem=
188 */
34de407a 189static const MountEntry protect_home_read_only_table[] = {
a868e437
LP
190 { "/home", MOUNT_READ_ONLY, true },
191 { "/run/user", MOUNT_READ_ONLY, true },
192 { "/root", MOUNT_READ_ONLY, true },
b6c432ca
DH
193};
194
e4da7d8c
YW
195/* ProtectHome=tmpfs table */
196static const MountEntry protect_home_tmpfs_table[] = {
a868e437
LP
197 { "/home", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
198 { "/run/user", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
199 { "/root", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
e4da7d8c
YW
200};
201
b6c432ca 202/* ProtectHome=yes table */
34de407a 203static const MountEntry protect_home_yes_table[] = {
a868e437
LP
204 { "/home", MOUNT_INACCESSIBLE, true },
205 { "/run/user", MOUNT_INACCESSIBLE, true },
206 { "/root", MOUNT_INACCESSIBLE, true },
b6c432ca
DH
207};
208
5fe29238
RW
209/* ProtectControlGroups=yes table */
210static const MountEntry protect_control_groups_yes_table[] = {
211 { "/sys/fs/cgroup", MOUNT_READ_ONLY, false },
212};
213
cd58b5a1 214/* ProtectControlGroups=private table. Note mount_private_apivfs() always uses MS_NOSUID|MS_NOEXEC|MS_NODEV so
1614d0c4 215 * flags is not set here. */
cd58b5a1 216static const MountEntry protect_control_groups_private_table[] = {
1614d0c4 217 { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false },
cd58b5a1
RW
218};
219
220/* ProtectControlGroups=strict table */
221static const MountEntry protect_control_groups_strict_table[] = {
1614d0c4 222 { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true },
cd58b5a1
RW
223};
224
f471b2af 225/* ProtectSystem=yes table */
34de407a 226static const MountEntry protect_system_yes_table[] = {
a868e437
LP
227 { "/usr", MOUNT_READ_ONLY, false },
228 { "/boot", MOUNT_READ_ONLY, true },
229 { "/efi", MOUNT_READ_ONLY, true },
f471b2af
DH
230};
231
232/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 233static const MountEntry protect_system_full_table[] = {
a868e437
LP
234 { "/usr", MOUNT_READ_ONLY, false },
235 { "/boot", MOUNT_READ_ONLY, true },
236 { "/efi", MOUNT_READ_ONLY, true },
237 { "/etc", MOUNT_READ_ONLY, false },
f471b2af
DH
238};
239
3d1b999b
LP
240/* ProtectSystem=strict table. In this strict mode, we mount everything read-only, except for /proc, /dev,
241 * /sys which are the kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
242 * protect those, and these options should be fully orthogonal. (And of course /home and friends are also
243 * left writable, as ProtectHome= shall manage those, orthogonally).
f471b2af 244 */
34de407a 245static const MountEntry protect_system_strict_table[] = {
d69ee5ac 246 { "/", MOUNT_READ_ONLY, false },
a868e437
LP
247 { "/proc", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
248 { "/sys", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
249 { "/dev", MOUNT_READ_WRITE_IMPLICIT, false }, /* PrivateDevices= */
250 { "/home", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
251 { "/run/user", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
252 { "/root", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
f471b2af
DH
253};
254
3d1b999b 255/* ProtectHostname=yes table */
6746f288 256static const MountEntry protect_hostname_yes_table[] = {
a868e437
LP
257 { "/proc/sys/kernel/hostname", MOUNT_READ_ONLY, false },
258 { "/proc/sys/kernel/domainname", MOUNT_READ_ONLY, false },
3d1b999b
LP
259};
260
/* Human-readable name for each MountMode value, indexed by the enum constant. The
 * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode) line further down presumably
 * generates the to-string lookup from this table (the macro is defined elsewhere). */
5beb8688 261static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
a868e437
LP
262 [MOUNT_INACCESSIBLE] = "inaccessible",
263 [MOUNT_OVERLAY] = "overlay",
264 [MOUNT_IMAGE] = "image",
265 [MOUNT_BIND] = "bind",
266 [MOUNT_BIND_RECURSIVE] = "bind-recursive",
267 [MOUNT_PRIVATE_TMP] = "private-tmp",
a868e437
LP
268 [MOUNT_PRIVATE_DEV] = "private-dev",
269 [MOUNT_BIND_DEV] = "bind-dev",
270 [MOUNT_EMPTY_DIR] = "empty-dir",
271 [MOUNT_PRIVATE_SYSFS] = "private-sysfs",
272 [MOUNT_BIND_SYSFS] = "bind-sysfs",
cd58b5a1 273 [MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs",
a868e437
LP
274 [MOUNT_PROCFS] = "procfs",
275 [MOUNT_READ_ONLY] = "read-only",
276 [MOUNT_READ_WRITE] = "read-write",
277 [MOUNT_NOEXEC] = "noexec",
278 [MOUNT_EXEC] = "exec",
279 [MOUNT_TMPFS] = "tmpfs",
280 [MOUNT_RUN] = "run",
0e551b04 281 [MOUNT_PRIVATE_TMPFS] = "private-tmpfs",
a868e437
LP
282 [MOUNT_EXTENSION_DIRECTORY] = "extension-directory",
283 [MOUNT_EXTENSION_IMAGE] = "extension-image",
284 [MOUNT_MQUEUEFS] = "mqueuefs",
285 [MOUNT_READ_WRITE_IMPLICIT] = "read-write-implicit",
5beb8688
YW
286};
287
55ea4ef0
MG
288/* Helper struct for naming simplicity and reusability */
289static const struct {
290 const char *level_env;
291 const char *level_env_print;
292} image_class_info[_IMAGE_CLASS_MAX] = {
293 [IMAGE_SYSEXT] = {
294 .level_env = "SYSEXT_LEVEL",
295 .level_env_print = " SYSEXT_LEVEL=",
296 },
297 [IMAGE_CONFEXT] = {
298 .level_env = "CONFEXT_LEVEL",
299 .level_env_print = " CONFEXT_LEVEL=",
300 }
301};
302
5beb8688
YW
303DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
304
d69ee5ac 305static const char* mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
306 assert(p);
307
5327c910
LP
308 /* Returns the path of this mount entry. If the malloc()-allocated ->path_malloc field is set we return that,
309 * otherwise the stack/static ->path_const field is returned. */
f0a4feb0 310
5327c910 311 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
312}
313
/* Accessor for the path as it was before a prefix was applied. Falls back in order:
 * ->unprefixed_path_malloc, then ->unprefixed_path_const, then the current mount_entry_path(). */
d69ee5ac 314static const char* mount_entry_unprefixed_path(const MountEntry *p) {
809ceb82
LB
315 assert(p);
316
317 /* Returns the unprefixed path (ie: before prefix_where_needed() ran), if any */
318
319 return p->unprefixed_path_malloc ?: p->unprefixed_path_const ?: mount_entry_path(p);
320}
321
/* Installs new_path as the entry's path and remembers the previous path as the unprefixed one.
 * The pointer new_path is stored in ->path_malloc (which mount_entry_done() frees), so ownership moves
 * to the entry. Also sets ->has_prefix. The entry must already have a path (heap or const). */
322static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
323 assert(p);
324 assert(p->path_malloc || p->path_const);
325 assert(new_path);
326
327 /* Saves current path in unprefixed_ variable, and takes over new_path */
328
/* free_and_replace() is defined elsewhere; presumably it frees the old ->unprefixed_path_malloc and
 * moves ->path_malloc into it - TODO confirm against its definition. */
329 free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
330 /* If we didn't have a path on the heap, then it's a static one */
331 if (!p->unprefixed_path_malloc)
332 p->unprefixed_path_const = p->path_const;
333 p->path_malloc = new_path;
334 p->has_prefix = true;
335}
336
/* True if the entry is to be read-only: either its ->read_only flag is set, or its mode is
 * MOUNT_READ_ONLY or MOUNT_INACCESSIBLE. */
34de407a 337static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
338 assert(p);
339
5811a011 340 return p->read_only || IN_SET(p->mode, MOUNT_READ_ONLY, MOUNT_INACCESSIBLE);
cfbeb4ef
LP
341}
342
ddc155b2
TM
/* True if the entry is to be no-exec: either its ->noexec flag is set, or its mode is one of
 * MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS or
 * MOUNT_PRIVATE_CGROUP2FS. */
343static bool mount_entry_noexec(const MountEntry *p) {
344 assert(p);
345
1614d0c4 346 return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS, MOUNT_PRIVATE_CGROUP2FS);
ddc155b2
TM
347}
348
/* True if the entry explicitly asks for exec: either its ->exec flag is set or its mode is MOUNT_EXEC. */
349static bool mount_entry_exec(const MountEntry *p) {
350 assert(p);
351
a868e437 352 return p->exec || p->mode == MOUNT_EXEC;
ddc155b2
TM
353}
354
/* Returns the entry's source path: ->source_malloc if set, otherwise ->source_const (which may be NULL). */
d69ee5ac 355static const char* mount_entry_source(const MountEntry *p) {
d2d6c096
LP
356 assert(p);
357
358 return p->source_malloc ?: p->source_const;
359}
360
/* Returns the entry's mount option string: ->options_malloc if set, otherwise ->options_const (which
 * may be NULL). */
d69ee5ac 361static const char* mount_entry_options(const MountEntry *p) {
2abd4e38
YW
362 assert(p);
363
364 return p->options_malloc ?: p->options_const;
365}
366
1eb7e08e
LP
/* Releases everything the entry owns: the four *_malloc strings, the overlay_layers string vector and
 * the verity settings. The *_const pointers are not touched, and the entry itself is not freed. The
 * freed pointer fields are overwritten with the helpers' return values. */
367static void mount_entry_done(MountEntry *p) {
368 assert(p);
369
370 p->path_malloc = mfree(p->path_malloc);
809ceb82 371 p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
1eb7e08e 372 p->source_malloc = mfree(p->source_malloc);
2abd4e38 373 p->options_malloc = mfree(p->options_malloc);
f0304df6 374 p->overlay_layers = strv_free(p->overlay_layers);
a1a40297 375 verity_settings_done(&p->verity);
1eb7e08e
LP
376}
377
063c977a
LP
/* Releases every entry in the list via mount_entry_done(), frees the array and resets the count to 0.
 * The MountList structure itself is not freed. */
378static void mount_list_done(MountList *ml) {
379 assert(ml);
380
381 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts)
382 mount_entry_done(m);
383
384 ml->mounts = mfree(ml->mounts);
385 ml->n_mounts = 0;
386}
387
/* Grows the list by one slot and returns a pointer to the new last entry, or NULL if the reallocation
 * fails (in which case n_mounts is left unchanged). The returned pointer points into ml->mounts, so a
 * later call that reallocates may invalidate it. The new slot is presumably zero-filled by
 * GREEDY_REALLOC0 (defined elsewhere) - TODO confirm. */
d69ee5ac 388static MountEntry* mount_list_extend(MountList *ml) {
063c977a
LP
389 assert(ml);
390
391 if (!GREEDY_REALLOC0(ml->mounts, ml->n_mounts+1))
392 return NULL;
393
394 return ml->mounts + ml->n_mounts++;
395}
396
/* Appends one entry of the given mode for every string in strv.
 *
 * Each string may start with "-" (entry gets ->ignore set) optionally followed by "+" (entry is marked
 * as not yet prefixed); what remains must be an absolute path. ->has_prefix is also left unset when
 * forcibly_require_prefix is true. The path is not copied: ->path_const points into the caller's string,
 * past any prefix characters.
 *
 * Returns 0 on success, or the value of log_debug_errno()/log_oom_debug() on a non-absolute path or an
 * allocation failure. Entries appended before the failure stay in the list. */
397static int append_access_mounts(MountList *ml, char **strv, MountMode mode, bool forcibly_require_prefix) {
398 assert(ml);
613b411c 399
a868e437 400 /* Adds a list of user-supplied READ_WRITE/READ_WRITE_IMPLICIT/READ_ONLY/INACCESSIBLE entries */
5327c910 401
15ae422b 402 STRV_FOREACH(i, strv) {
5327c910
LP
403 bool ignore = false, needs_prefix = false;
404 const char *e = *i;
15ae422b 405
5327c910
LP
406 /* Look for any prefixes */
407 if (startswith(e, "-")) {
408 e++;
9c94d52e 409 ignore = true;
ea92ae33 410 }
5327c910
LP
411 if (startswith(e, "+")) {
412 e++;
413 needs_prefix = true;
414 }
ea92ae33 415
baaa35ad 416 if (!path_is_absolute(e))
063c977a 417 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", e);
15ae422b 418
063c977a
LP
419 MountEntry *me = mount_list_extend(ml);
420 if (!me)
421 return log_oom_debug();
422
423 *me = (MountEntry) {
5327c910
LP
424 .path_const = e,
425 .mode = mode,
426 .ignore = ignore,
d18aff04 427 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 428 };
15ae422b
LP
429 }
430
431 return 0;
432}
433
063c977a
LP
/* Appends one read-only MOUNT_EMPTY_DIR entry per string in strv, with fixed tmpfs options and mount
 * flags. The paths are not copied (->path_const points at the caller's strings) and are not validated
 * here. Returns 0 on success or the value of log_oom_debug() if the list cannot be extended. */
434static int append_empty_dir_mounts(MountList *ml, char **strv) {
435 assert(ml);
6c47cd7d
LP
436
437 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
438 * "/private/" boundary directories for DynamicUser=1. */
439
440 STRV_FOREACH(i, strv) {
063c977a
LP
441 MountEntry *me = mount_list_extend(ml);
442 if (!me)
443 return log_oom_debug();
6c47cd7d 444
063c977a 445 *me = (MountEntry) {
6c47cd7d 446 .path_const = *i,
a868e437 447 .mode = MOUNT_EMPTY_DIR,
6c47cd7d 448 .ignore = false,
6c47cd7d 449 .read_only = true,
9f563f27 450 .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
2abd4e38 451 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
6c47cd7d
LP
452 };
453 }
454
455 return 0;
456}
457
063c977a
LP
/* Appends one entry per element of binds[0..n): mode is MOUNT_BIND_RECURSIVE if the bind is marked
 * recursive, otherwise MOUNT_BIND. The read_only/nosuid/noexec/ignore_enoent/idmap settings are copied
 * over, and nodev is translated into MS_NODEV in ->flags. Source and destination strings are referenced,
 * not copied. Returns 0 on success or the value of log_oom_debug() if the list cannot be extended. */
458static int append_bind_mounts(MountList *ml, const BindMount *binds, size_t n) {
459 assert(ml);
460 assert(binds || n == 0);
d2d6c096 461
063c977a
LP
462 FOREACH_ARRAY(b, binds, n) {
463 MountEntry *me = mount_list_extend(ml);
464 if (!me)
465 return log_oom_debug();
d2d6c096 466
063c977a 467 *me = (MountEntry) {
d2d6c096 468 .path_const = b->destination,
a868e437 469 .mode = b->recursive ? MOUNT_BIND_RECURSIVE : MOUNT_BIND,
d2d6c096 470 .read_only = b->read_only,
9ce4e4b0 471 .nosuid = b->nosuid,
95f9e85a
MY
472 .noexec = b->noexec,
473 .flags = b->nodev ? MS_NODEV : 0,
d2d6c096 474 .source_const = b->source,
4ca763a9 475 .ignore = b->ignore_enoent,
eae51272
AB
476 .idmapped = b->idmapped,
477 .idmap_uid = b->uid,
478 .idmap_gid = b->gid,
d2d6c096
LP
479 };
480 }
481
482 return 0;
483}
484
/* Appends one MOUNT_IMAGE entry per element of mount_images[0..n). For each image the verity settings
 * are loaded from the image source path and handed over to the entry; source, destination and mount
 * options are referenced, not copied. Returns 0 on success, the value of log_oom_debug() if the list
 * cannot be extended, or the value of log_debug_errno() if verity_settings_load() fails. */
063c977a 485static int append_mount_images(MountList *ml, const MountImage *mount_images, size_t n) {
59a83e11
LB
486 int r;
487
063c977a
LP
488 assert(ml);
489 assert(mount_images || n == 0);
b3d13314 490
063c977a 491 FOREACH_ARRAY(m, mount_images, n) {
59a83e11 492 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
/* NOTE(review): the list slot is reserved before verity_settings_load() runs, so if loading fails the
 * list has already grown by one entry that this function never filled in. */
063c977a
LP
493 MountEntry *me = mount_list_extend(ml);
494 if (!me)
495 return log_oom_debug();
b3d13314 496
59a83e11
LB
497 r = verity_settings_load(&verity, m->source, /* root_hash_path= */ NULL, /* root_hash_sig_path= */ NULL);
498 if (r < 0)
499 return log_debug_errno(r, "Failed to check verity root hash of %s: %m", m->source);
500
063c977a 501 *me = (MountEntry) {
b3d13314 502 .path_const = m->destination,
a868e437 503 .mode = MOUNT_IMAGE,
b3d13314 504 .source_const = m->source,
9dc6a6af 505 .image_options_const = m->mount_options,
b3d13314 506 .ignore = m->ignore_enoent,
59a83e11 507 .verity = TAKE_GENERIC(verity, VeritySettings, VERITY_SETTINGS_DEFAULT),
dfdeb0b1 508 .filter_class = _IMAGE_CLASS_INVALID,
b3d13314
LB
509 };
510 }
511
512 return 0;
513}
514
/* Queues the mounts needed for extension images and extension directories.
 *
 * Three groups of entries are appended to ml, in this order:
 *   1. one MOUNT_EXTENSION_IMAGE entry per element of mount_images, placed at
 *      <private_namespace_dir>/unit-extensions/<index of the image>;
 *   2. one read-only MOUNT_EXTENSION_DIRECTORY entry per string in extension_directories, placed at
 *      <private_namespace_dir>/unit-extensions/<counter>, where the counter starts at n_mount_images;
 *   3. one MOUNT_OVERLAY entry per string in hierarchies, placed at <root>/<hierarchy>, whose
 *      overlay_layers lists <mount point>/<hierarchy> for every entry queued in steps 1 and 2.
 *
 * Sources are resolved with path_pick(); a missing source is skipped silently when the image's
 * ignore_enoent flag (or a leading "-" on a directory string) allows it.
 *
 * Returns 0 on success, also when there are no images/directories or no hierarchies, and a negative
 * error otherwise. Entries appended before a failure stay in the list. */
a07b9926 515static int append_extensions(
063c977a 516 MountList *ml,
93f59701 517 const char *root,
0e551b04 518 const char *private_namespace_dir,
93f59701
LB
519 char **hierarchies,
520 const MountImage *mount_images,
c43d4393 521 size_t n_mount_images,
a07b9926 522 char **extension_directories) {
93f59701 523
f0304df6
LB
524 char ***overlays = NULL;
525 size_t n_overlays = 0;
93f59701
LB
526 int r;
527
063c977a
LP
528 assert(ml);
529
c43d4393 530 if (n_mount_images == 0 && strv_isempty(extension_directories))
93f59701
LB
531 return 0;
532
0e551b04 533 assert(private_namespace_dir);
24759d8f 534
f0304df6
LB
535 n_overlays = strv_length(hierarchies);
536 if (n_overlays == 0)
537 return 0;
538
539 /* Prepare a list of overlays, that will have as each element a strv containing all the layers that
540 * will later be concatenated as a lowerdir= parameter for the mount operation.
93f59701
LB
541 * The overlays vector will have the same number of elements and will correspond to the
542 * hierarchies vector, so they can be iterated upon together. */
f0304df6
LB
543 overlays = new0(char**, n_overlays);
544 if (!overlays)
545 return -ENOMEM;
93f59701 546
/* Presumably registers cleanup of the overlays array (via strv_free_many) for every exit path below -
 * the macro is defined elsewhere, TODO confirm. */
f0304df6 547 CLEANUP_ARRAY(overlays, n_overlays, strv_free_many);
93f59701
LB
548
549 /* First, prepare a mount for each image, but these won't be visible to the unit, instead
550 * they will be mounted in our propagate directory, and used as a source for the overlay. */
c43d4393 551 for (size_t i = 0; i < n_mount_images; i++) {
a1a40297 552 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
5e79dd96 553 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
93f59701
LB
554 _cleanup_free_ char *mount_point = NULL;
555 const MountImage *m = mount_images + i;
556
5e79dd96
LB
557 r = path_pick(/* toplevel_path= */ NULL,
558 /* toplevel_fd= */ AT_FDCWD,
559 m->source,
560 &pick_filter_image_raw,
561 PICK_ARCHITECTURE|PICK_TRIES,
562 &result);
e1efa9d8
LB
563 if (r == -ENOENT && m->ignore_enoent)
564 continue;
5e79dd96
LB
565 if (r < 0)
566 return r;
00f546e2
LB
567 if (!result.path) {
568 if (m->ignore_enoent)
569 continue;
570
5e79dd96
LB
571 return log_debug_errno(
572 SYNTHETIC_ERRNO(ENOENT),
573 "No matching entry in .v/ directory %s found.",
574 m->source);
00f546e2 575 }
5e79dd96 576
a1a40297
LB
577 r = verity_settings_load(&verity, result.path, /* root_hash_path= */ NULL, /* root_hash_sig_path= */ NULL);
578 if (r < 0)
579 return log_debug_errno(r, "Failed to check verity root hash of %s: %m", result.path);
580
/* The mount point is numbered by the image's index in mount_images, so skipped images leave gaps. */
0e551b04 581 if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, i) < 0)
93f59701
LB
582 return -ENOMEM;
583
584 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
f0304df6 585 char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
93f59701
LB
586 if (!prefixed_hierarchy)
587 return -ENOMEM;
588
f0304df6
LB
589 r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
590 if (r < 0)
591 return r;
93f59701
LB
592 }
593
063c977a
LP
594 MountEntry *me = mount_list_extend(ml);
595 if (!me)
f0304df6 596 return -ENOMEM;
063c977a
LP
597
598 *me = (MountEntry) {
93f59701 599 .path_malloc = TAKE_PTR(mount_point),
9dc6a6af 600 .image_options_const = m->mount_options,
93f59701 601 .ignore = m->ignore_enoent,
5e79dd96 602 .source_malloc = TAKE_PTR(result.path),
a868e437 603 .mode = MOUNT_EXTENSION_IMAGE,
93f59701 604 .has_prefix = true,
a1a40297 605 .verity = TAKE_GENERIC(verity, VeritySettings, VERITY_SETTINGS_DEFAULT),
dfdeb0b1 606 .filter_class = _IMAGE_CLASS_INVALID,
93f59701
LB
607 };
608 }
609
a07b9926
LB
610 /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
611 * Bind mount them in the same location as the ExtensionImages, so that we
612 * can check that they are valid trees (extension-release.d). */
613 STRV_FOREACH(extension_directory, extension_directories) {
622efc54
LB
614 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
615 _cleanup_free_ char *mount_point = NULL;
a07b9926
LB
616 const char *e = *extension_directory;
617 bool ignore_enoent = false;
618
a07b9926
LB
619 /* Look for any prefixes */
620 if (startswith(e, "-")) {
621 e++;
622 ignore_enoent = true;
623 }
624 /* Ignore this for now */
625 if (startswith(e, "+"))
626 e++;
627
622efc54
LB
628 r = path_pick(/* toplevel_path= */ NULL,
629 /* toplevel_fd= */ AT_FDCWD,
630 e,
631 &pick_filter_image_dir,
632 PICK_ARCHITECTURE|PICK_TRIES,
633 &result);
e1efa9d8
LB
634 if (r == -ENOENT && ignore_enoent)
635 continue;
622efc54
LB
636 if (r < 0)
637 return r;
00f546e2
LB
638 if (!result.path) {
639 if (ignore_enoent)
640 continue;
641
622efc54
LB
642 return log_debug_errno(
643 SYNTHETIC_ERRNO(ENOENT),
644 "No matching entry in .v/ directory %s found.",
645 e);
00f546e2
LB
646 }
647
648 /* Pick up the counter where the ExtensionImages left it. */
/* n_mount_images is a by-value parameter; from here on it is reused as the running mount point counter. */
649 if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, n_mount_images++) < 0)
650 return -ENOMEM;
a07b9926
LB
651
652 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
f0304df6 653 char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
a07b9926
LB
654 if (!prefixed_hierarchy)
655 return -ENOMEM;
656
f0304df6
LB
657 r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
658 if (r < 0)
659 return r;
a07b9926
LB
660 }
661
063c977a
LP
662 MountEntry *me = mount_list_extend(ml);
663 if (!me)
f0304df6 664 return -ENOMEM;
063c977a
LP
665
666 *me = (MountEntry) {
a07b9926 667 .path_malloc = TAKE_PTR(mount_point),
622efc54 668 .source_malloc = TAKE_PTR(result.path),
a868e437 669 .mode = MOUNT_EXTENSION_DIRECTORY,
a07b9926
LB
670 .ignore = ignore_enoent,
671 .has_prefix = true,
672 .read_only = true,
dfdeb0b1 673 .filter_class = _IMAGE_CLASS_INVALID,
a07b9926
LB
674 };
675 }
676
93f59701
LB
677 /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
678 * set up earlier. */
679 for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
680 _cleanup_free_ char *prefixed_hierarchy = NULL;
681
682 prefixed_hierarchy = path_join(root, hierarchies[i]);
683 if (!prefixed_hierarchy)
684 return -ENOMEM;
685
063c977a
LP
686 MountEntry *me = mount_list_extend(ml);
687 if (!me)
f0304df6 688 return -ENOMEM;
063c977a
LP
689
/* The layer list for this hierarchy moves from the local overlays array into the entry. */
690 *me = (MountEntry) {
93f59701 691 .path_malloc = TAKE_PTR(prefixed_hierarchy),
f0304df6 692 .overlay_layers = TAKE_PTR(overlays[i]),
a868e437 693 .mode = MOUNT_OVERLAY,
93f59701
LB
694 .has_prefix = true,
695 .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
696 };
697 }
698
699 return 0;
700}
701
063c977a
LP
/* Appends one MOUNT_TMPFS entry per element of tmpfs[0..n).
 *
 * The element's option string is appended to a fixed "mode=0755" + NESTED_TMPFS_LIMITS prefix and run
 * through mount_option_mangle() with MS_NODEV|MS_STRICTATIME as the starting flags. MS_RDONLY is then
 * stripped from the resulting flags and carried in ->read_only instead. The entry takes ownership of
 * the option string returned in 'o'; the path is referenced, not copied.
 *
 * Returns 0 on success, or a negative error for a non-absolute path, an allocation failure or a
 * mount_option_mangle() failure. */
702static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, size_t n) {
703 assert(ml);
704 assert(tmpfs || n == 0);
2abd4e38 705
063c977a 706 FOREACH_ARRAY(t, tmpfs, n) {
2abd4e38 707 _cleanup_free_ char *o = NULL, *str = NULL;
ad8e66dc 708 unsigned long flags;
2abd4e38 709 bool ro = false;
b67ec8e5 710 int r;
2abd4e38 711
baaa35ad 712 if (!path_is_absolute(t->path))
063c977a 713 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", t->path);
2abd4e38 714
b67ec8e5 715 str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
ad8e66dc
AJ
716 if (!str)
717 return -ENOMEM;
2abd4e38 718
ad8e66dc
AJ
719 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
720 if (r < 0)
721 return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
2abd4e38 722
ad8e66dc 723 ro = flags & MS_RDONLY;
5ba46b99 724 flags &= ~MS_RDONLY;
2abd4e38 725
063c977a
LP
726 MountEntry *me = mount_list_extend(ml);
727 if (!me)
728 return log_oom_debug();
729
730 *me = (MountEntry) {
2abd4e38 731 .path_const = t->path,
a868e437 732 .mode = MOUNT_TMPFS,
2abd4e38 733 .read_only = ro,
ad8e66dc 734 .options_malloc = TAKE_PTR(o),
2abd4e38
YW
735 .flags = flags,
736 };
2abd4e38
YW
737 }
738
739 return 0;
740}
741
13208591
YW
/* Queues the mounts that give the unit its own /tmp/ and /var/tmp/.
 *
 *  - If p->tmp_dir and/or p->var_tmp_dir are set, a MOUNT_PRIVATE_TMP entry is queued for "/tmp/"
 *    and/or "/var/tmp/" with that directory as source; the entry is read-only when the directory
 *    equals RUN_SYSTEMD_EMPTY.
 *  - If p->private_tmp is not PRIVATE_TMP_DISCONNECTED, nothing else is queued.
 *  - Otherwise, if p->private_var_tmp is PRIVATE_TMP_NO, a single MOUNT_PRIVATE_TMPFS entry is queued
 *    directly on "/tmp/".
 *  - Otherwise one MOUNT_PRIVATE_TMPFS entry is queued at <p->private_namespace_dir>/unit-private-tmp,
 *    followed by two MOUNT_BIND entries that bind its "tmp" and "var-tmp" sub-directories (to be
 *    created with mode 01777) onto "/tmp/" and "/var/tmp/".
 *
 * Returns 0 on success or the value of log_oom_debug() on allocation failure. */
742static int append_private_tmp(MountList *ml, const NamespaceParameters *p) {
743 MountEntry *me;
744
745 assert(ml);
746 assert(p);
6156bec7
YW
747 assert(p->private_tmp == p->private_var_tmp ||
748 (p->private_tmp == PRIVATE_TMP_DISCONNECTED && p->private_var_tmp == PRIVATE_TMP_NO));
13208591
YW
749
750 if (p->tmp_dir) {
751 assert(p->private_tmp == PRIVATE_TMP_CONNECTED);
752
753 me = mount_list_extend(ml);
754 if (!me)
755 return log_oom_debug();
756 *me = (MountEntry) {
757 .path_const = "/tmp/",
758 .mode = MOUNT_PRIVATE_TMP,
759 .read_only = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY),
760 .source_const = p->tmp_dir,
761 };
762 }
763
764 if (p->var_tmp_dir) {
6156bec7 765 assert(p->private_var_tmp == PRIVATE_TMP_CONNECTED);
13208591
YW
766
767 me = mount_list_extend(ml);
768 if (!me)
769 return log_oom_debug();
770 *me = (MountEntry) {
771 .path_const = "/var/tmp/",
772 .mode = MOUNT_PRIVATE_TMP,
773 .read_only = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY),
774 .source_const = p->var_tmp_dir,
775 };
776 }
777
778 if (p->private_tmp != PRIVATE_TMP_DISCONNECTED)
779 return 0;
780
6156bec7
YW
781 if (p->private_var_tmp == PRIVATE_TMP_NO) {
782 me = mount_list_extend(ml);
783 if (!me)
784 return log_oom_debug();
785 *me = (MountEntry) {
786 .path_const = "/tmp/",
787 .mode = MOUNT_PRIVATE_TMPFS,
788 .options_const = "mode=0700" NESTED_TMPFS_LIMITS,
789 .flags = MS_NODEV|MS_STRICTATIME,
790 };
791
792 return 0;
793 }
794
/* All three paths are built first; path_join() results are checked together after the third call. */
13208591
YW
795 _cleanup_free_ char *tmpfs_dir = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
796 tmpfs_dir = path_join(p->private_namespace_dir, "unit-private-tmp");
797 tmp_dir = path_join(tmpfs_dir, "tmp");
798 var_tmp_dir = path_join(tmpfs_dir, "var-tmp");
799 if (!tmpfs_dir || !tmp_dir || !var_tmp_dir)
800 return log_oom_debug();
801
802 me = mount_list_extend(ml);
803 if (!me)
804 return log_oom_debug();
805 *me = (MountEntry) {
806 .path_malloc = TAKE_PTR(tmpfs_dir),
807 .mode = MOUNT_PRIVATE_TMPFS,
808 .options_const = "mode=0700" NESTED_TMPFS_LIMITS,
809 .flags = MS_NODEV|MS_STRICTATIME,
810 .has_prefix = true,
811 };
812
813 me = mount_list_extend(ml);
814 if (!me)
815 return log_oom_debug();
816 *me = (MountEntry) {
817 .source_malloc = TAKE_PTR(tmp_dir),
818 .path_const = "/tmp/",
819 .mode = MOUNT_BIND,
820 .source_dir_mode = 01777,
821 .create_source_dir = true,
822 };
823
824 me = mount_list_extend(ml);
825 if (!me)
826 return log_oom_debug();
827 *me = (MountEntry) {
828 .source_malloc = TAKE_PTR(var_tmp_dir),
829 .path_const = "/var/tmp/",
830 .mode = MOUNT_BIND,
831 .source_dir_mode = 01777,
832 .create_source_dir = true,
833 };
834
835 return 0;
836}
837
063c977a
LP
/* Appends a copy of each of the n entries in 'mounts' (one of the static tables) to ml. The source
 * entries must carry only const pointers, which the asserts check, so the struct copy never duplicates
 * ownership of heap memory. When ignore_protect is true, ->ignore is forced on for every copied entry.
 * Returns 0 on success or the value of log_oom_debug() if the list cannot be extended. */
838static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t n, bool ignore_protect) {
839 assert(ml);
840 assert(mounts || n == 0);
11a30cec 841
5327c910 842 /* Adds a list of static pre-defined entries */
f471b2af 843
063c977a
LP
844 FOREACH_ARRAY(m, mounts, n) {
845 MountEntry *me = mount_list_extend(ml);
846 if (!me)
847 return log_oom_debug();
848
0cc496b2
YW
849 /* No dynamic values allowed. */
850 assert(m->path_const);
851 assert(!m->path_malloc);
852 assert(!m->unprefixed_path_malloc);
853 assert(!m->source_malloc);
854 assert(!m->options_malloc);
855 assert(!m->overlay_layers);
856
857 *me = *m;
858 me->ignore = me->ignore || ignore_protect;
063c977a 859 }
863
5fe29238
RW
864static int append_protect_control_groups(MountList *ml, ProtectControlGroups protect_control_groups, bool ignore_protect) {
865 assert(ml);
866
867 switch (protect_control_groups) {
868
869 case PROTECT_CONTROL_GROUPS_NO:
870 return 0;
871
872 case PROTECT_CONTROL_GROUPS_YES:
873 return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
874
cd58b5a1
RW
875 case PROTECT_CONTROL_GROUPS_PRIVATE:
876 return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
877
878 case PROTECT_CONTROL_GROUPS_STRICT:
879 return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
880
5fe29238
RW
881 default:
882 assert_not_reached();
883 }
884}
885
063c977a
LP
886static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) {
887 assert(ml);
c575770b 888
5327c910 889 switch (protect_home) {
b6c432ca 890
5327c910 891 case PROTECT_HOME_NO:
b6c432ca
DH
892 return 0;
893
b6c432ca 894 case PROTECT_HOME_READ_ONLY:
063c977a 895 return append_static_mounts(ml, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
5327c910 896
e4da7d8c 897 case PROTECT_HOME_TMPFS:
063c977a 898 return append_static_mounts(ml, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
e4da7d8c 899
b6c432ca 900 case PROTECT_HOME_YES:
063c977a 901 return append_static_mounts(ml, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
5327c910 902
b6c432ca 903 default:
04499a70 904 assert_not_reached();
b6c432ca 905 }
b6c432ca
DH
906}
907
063c977a
LP
908static int append_protect_system(MountList *ml, ProtectSystem protect_system, bool ignore_protect) {
909 assert(ml);
f471b2af 910
5327c910
LP
911 switch (protect_system) {
912
913 case PROTECT_SYSTEM_NO:
f471b2af
DH
914 return 0;
915
f471b2af 916 case PROTECT_SYSTEM_STRICT:
063c977a 917 return append_static_mounts(ml, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
5327c910 918
f471b2af 919 case PROTECT_SYSTEM_YES:
063c977a 920 return append_static_mounts(ml, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
5327c910 921
f471b2af 922 case PROTECT_SYSTEM_FULL:
063c977a 923 return append_static_mounts(ml, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
5327c910 924
f471b2af 925 default:
04499a70 926 assert_not_reached();
f471b2af 927 }
11a30cec
DH
928}
929
93bab288 930static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
a0827e2b 931 int d;
15ae422b 932
a07b9926 933 /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first
93f59701 934 * regardless of the prefix - they are set up in the propagate directory anyway */
a868e437 935 d = -CMP(a->mode == MOUNT_EXTENSION_IMAGE, b->mode == MOUNT_EXTENSION_IMAGE);
a07b9926
LB
936 if (d != 0)
937 return d;
a868e437 938 d = -CMP(a->mode == MOUNT_EXTENSION_DIRECTORY, b->mode == MOUNT_EXTENSION_DIRECTORY);
93f59701
LB
939 if (d != 0)
940 return d;
941
7fef1996
YW
942 /* MOUNT_PRIVATE_TMPFS needs to be set up earlier, especially than MOUNT_BIND. */
943 d = -CMP(a->mode == MOUNT_PRIVATE_TMPFS, b->mode == MOUNT_PRIVATE_TMPFS);
944 if (d != 0)
945 return d;
946
6ee1a919 947 /* If the paths are not equal, then order prefixes first */
93bab288 948 d = path_compare(mount_entry_path(a), mount_entry_path(b));
6ee1a919
LP
949 if (d != 0)
950 return d;
15ae422b 951
6ee1a919 952 /* If the paths are equal, check the mode */
93bab288 953 return CMP((int) a->mode, (int) b->mode);
15ae422b
LP
954}
955
063c977a 956static int prefix_where_needed(MountList *ml, const char *root_directory) {
4a756839 957 /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
5327c910 958
063c977a 959 assert(ml);
fe96c0f8 960
063c977a 961 FOREACH_ARRAY(me, ml->mounts, ml->n_mounts) {
5327c910
LP
962 char *s;
963
063c977a 964 if (me->has_prefix)
5327c910
LP
965 continue;
966
063c977a 967 s = path_join(root_directory, mount_entry_path(me));
5327c910
LP
968 if (!s)
969 return -ENOMEM;
970
063c977a 971 mount_entry_consume_prefix(me, s);
5327c910
LP
972 }
973
974 return 0;
975}
976
a1a40297
LB
977static bool verity_has_later_duplicates(MountList *ml, const MountEntry *needle) {
978
979 assert(ml);
980 assert(needle);
981 assert(needle >= ml->mounts && needle < ml->mounts + ml->n_mounts);
982 assert(needle->mode == MOUNT_EXTENSION_IMAGE);
983
984 if (needle->verity.root_hash_size == 0)
985 return false;
986
987 /* Overlayfs rejects supplying the same directory inode twice as determined by filesystem UUID and
988 * file handle in lowerdir=, even if they are mounted on different paths, as it resolves each mount
989 * to its source filesystem, so drop duplicates, and keep the last one. This only covers non-DDI
990 * verity images. Note that the list is ordered, so we only check for the reminder of the list for
991 * each item, rather than the full list from the beginning, as any earlier duplicates will have
992 * already been pruned. */
993
994 for (const MountEntry *m = needle + 1; m < ml->mounts + ml->n_mounts; m++) {
995 if (m->mode != MOUNT_EXTENSION_IMAGE)
996 continue;
997 if (memcmp_nn(m->verity.root_hash,
998 m->verity.root_hash_size,
999 needle->verity.root_hash,
1000 needle->verity.root_hash_size) == 0)
1001 return true;
1002 }
1003
1004 return false;
1005}
1006
063c977a 1007static void drop_duplicates(MountList *ml) {
34de407a 1008 MountEntry *f, *t, *previous;
15ae422b 1009
063c977a 1010 assert(ml);
15ae422b 1011
fe3c2583
LP
1012 /* Drops duplicate entries. Expects that the array is properly ordered already. */
1013
063c977a 1014 for (f = ml->mounts, t = ml->mounts, previous = NULL; f < ml->mounts + ml->n_mounts; f++) {
15ae422b 1015
fe3c2583 1016 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
088696fe
LP
1017 * above. Note that we only drop duplicates that haven't been mounted yet. */
1018 if (previous &&
1019 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
63862de4 1020 f->state == MOUNT_PENDING && previous->state == MOUNT_PENDING) {
5beb8688 1021 log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
ddc155b2
TM
1022 /* Propagate the flags to the remaining entry */
1023 previous->read_only = previous->read_only || mount_entry_read_only(f);
1024 previous->noexec = previous->noexec || mount_entry_noexec(f);
1025 previous->exec = previous->exec || mount_entry_exec(f);
1eb7e08e 1026 mount_entry_done(f);
15ae422b 1027 continue;
fe3c2583 1028 }
15ae422b 1029
a1a40297
LB
1030 if (f->mode == MOUNT_EXTENSION_IMAGE && verity_has_later_duplicates(ml, f)) {
1031 log_debug("Skipping duplicate extension image %s", mount_entry_source(f));
1032 mount_entry_done(f);
1033 continue;
1034 }
1035
e2d7c1a0 1036 *t = *f;
15ae422b 1037 previous = t;
fe3c2583
LP
1038 t++;
1039 }
1040
063c977a 1041 ml->n_mounts = t - ml->mounts;
fe3c2583
LP
1042}
1043
063c977a 1044static void drop_inaccessible(MountList *ml) {
34de407a 1045 MountEntry *f, *t;
fe3c2583
LP
1046 const char *clear = NULL;
1047
063c977a 1048 assert(ml);
fe3c2583
LP
1049
1050 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
1051 * ordered already. */
1052
063c977a 1053 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
fe3c2583
LP
1054
1055 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
1056 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
1057 if (clear && path_startswith(mount_entry_path(f), clear)) {
1058 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 1059 mount_entry_done(f);
fe3c2583
LP
1060 continue;
1061 }
15ae422b 1062
a868e437 1063 clear = f->mode == MOUNT_INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
1064
1065 *t = *f;
15ae422b
LP
1066 t++;
1067 }
1068
063c977a 1069 ml->n_mounts = t - ml->mounts;
15ae422b
LP
1070}
1071
063c977a 1072static void drop_nop(MountList *ml) {
34de407a 1073 MountEntry *f, *t;
7648a565 1074
063c977a 1075 assert(ml);
7648a565
LP
1076
1077 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
1078 * list is ordered by prefixes. */
1079
063c977a 1080 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
7648a565 1081
a868e437
LP
1082 /* Only suppress such subtrees for READ_ONLY, READ_WRITE and READ_WRITE_IMPLICIT entries */
1083 if (IN_SET(f->mode, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_READ_WRITE_IMPLICIT)) {
e7bf2fca 1084 MountEntry *found = NULL;
7648a565
LP
1085
1086 /* Now let's find the first parent of the entry we are looking at. */
063c977a 1087 for (MountEntry *p = PTR_SUB1(t, ml->mounts); p; p = PTR_SUB1(p, ml->mounts))
34de407a 1088 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
e7bf2fca 1089 found = p;
7648a565
LP
1090 break;
1091 }
7648a565
LP
1092
1093 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
e7bf2fca 1094 if (found && found->mode == f->mode) {
5beb8688
YW
1095 log_debug("%s (%s) is made redundant by %s (%s)",
1096 mount_entry_path(f), mount_mode_to_string(f->mode),
e7bf2fca 1097 mount_entry_path(found), mount_mode_to_string(found->mode));
1eb7e08e 1098 mount_entry_done(f);
7648a565
LP
1099 continue;
1100 }
1101 }
1102
1103 *t = *f;
1104 t++;
1105 }
1106
063c977a 1107 ml->n_mounts = t - ml->mounts;
7648a565
LP
1108}
1109
063c977a 1110static void drop_outside_root(MountList *ml, const char *root_directory) {
34de407a 1111 MountEntry *f, *t;
cd2902c9 1112
063c977a 1113 assert(ml);
cd2902c9 1114
1d54cd5d 1115 /* Nothing to do */
cd2902c9
LP
1116 if (!root_directory)
1117 return;
1118
1119 /* Drops all mounts that are outside of the root directory. */
1120
063c977a 1121 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
cd2902c9 1122
0e551b04
LB
1123 /* ExtensionImages/Directories bases are opened in /run/[user/xyz/]systemd/unit-extensions
1124 * on the host, and a private (invisible to the guest) tmpfs instance is mounted on
1125 * /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private /tmp and
1126 * /var/tmp. */
1127 if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) &&
1128 !path_startswith(mount_entry_path(f), root_directory)) {
34de407a 1129 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 1130 mount_entry_done(f);
cd2902c9
LP
1131 continue;
1132 }
1133
1134 *t = *f;
1135 t++;
1136 }
1137
063c977a 1138 ml->n_mounts = t - ml->mounts;
cd2902c9
LP
1139}
1140
6117dda8 1141static int clone_device_node(const char *node, const char *temporary_mount, bool *make_devnode) {
b2a60844 1142 _cleanup_free_ char *sl = NULL;
6117dda8 1143 const char *dn, *bn;
b5e99f23
ДГ
1144 struct stat st;
1145 int r;
1146
6117dda8
MY
1147 assert(node);
1148 assert(path_is_absolute(node));
1149 assert(temporary_mount);
1150 assert(make_devnode);
1151
1152 if (stat(node, &st) < 0) {
b2a60844 1153 if (errno == ENOENT) {
6117dda8 1154 log_debug_errno(errno, "Device node '%s' to clone does not exist.", node);
af984e13 1155 return -ENXIO;
b2a60844
LP
1156 }
1157
6117dda8 1158 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone: %m", node);
b5e99f23
ДГ
1159 }
1160
6117dda8
MY
1161 r = stat_verify_device_node(&st);
1162 if (r < 0)
1163 return log_debug_errno(r, "Cannot clone device node '%s': %m", node);
b5e99f23 1164
6117dda8 1165 dn = strjoina(temporary_mount, node);
b5e99f23 1166
b2a60844 1167 /* First, try to create device node properly */
16498617 1168 if (*make_devnode) {
6117dda8 1169 mac_selinux_create_file_prepare(node, st.st_mode);
16498617
CB
1170 r = mknod(dn, st.st_mode, st.st_rdev);
1171 mac_selinux_create_file_clear();
b2a60844
LP
1172 if (r >= 0)
1173 goto add_symlink;
16498617 1174 if (errno != EPERM)
6117dda8 1175 return log_debug_errno(errno, "Failed to mknod '%s': %m", node);
16498617 1176
b2a60844 1177 /* This didn't work, let's not try this again for the next iterations. */
16498617
CB
1178 *make_devnode = false;
1179 }
1180
d73020f2 1181 /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
1acf344d 1182 * Do not prepare device-node SELinux label (see issue 13762) */
16498617 1183 r = mknod(dn, S_IFREG, 0);
16498617 1184 if (r < 0 && errno != EEXIST)
6117dda8 1185 return log_debug_errno(errno, "Failed to mknod dummy device node for '%s': %m", node);
16498617 1186
21935150
LP
1187 /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
1188 * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
1189 * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
6117dda8 1190 r = mount_nofollow_verbose(LOG_DEBUG, node, dn, NULL, MS_BIND, NULL);
21935150
LP
1191 if (r < 0)
1192 return r;
b2a60844
LP
1193
1194add_symlink:
6117dda8 1195 bn = path_startswith(node, "/dev/");
b2a60844
LP
1196 if (!bn)
1197 return 0;
1198
1199 /* Create symlinks like /dev/char/1:9 → ../urandom */
ec61371f 1200 if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
cbc056c8
ZJS
1201 temporary_mount,
1202 S_ISCHR(st.st_mode) ? "char" : "block",
ec61371f 1203 DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
d4f0878e 1204 return log_oom_debug();
b2a60844
LP
1205
1206 (void) mkdir_parents(sl, 0755);
1207
6117dda8 1208 const char *t = strjoina("../", bn);
b2a60844 1209 if (symlink(t, sl) < 0)
2e4a4fae 1210 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
b5e99f23 1211
af984e13 1212 return 0;
b5e99f23
ДГ
1213}
1214
66825795
MY
1215static int bind_mount_device_dir(const char *temporary_mount, const char *dir) {
1216 const char *t;
1217
1218 assert(temporary_mount);
1219 assert(dir);
1220 assert(path_is_absolute(dir));
1221
1222 t = strjoina(temporary_mount, dir);
1223
1224 (void) mkdir(t, 0755);
1225 return mount_nofollow_verbose(LOG_DEBUG, dir, t, NULL, MS_BIND, NULL);
1226}
1227
1228static char* settle_runtime_dir(RuntimeScope scope) {
cd7f3702
DDM
1229 char *runtime_dir;
1230
1231 if (scope != RUNTIME_SCOPE_USER)
1232 return strdup("/run/");
1233
1234 if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0)
1235 return NULL;
1236
1237 return runtime_dir;
1238}
1239
0c3d606c
YW
1240static int create_temporary_mount_point(RuntimeScope scope, char **ret) {
1241 _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
1242
1243 assert(ret);
1244
1245 runtime_dir = settle_runtime_dir(scope);
1246 if (!runtime_dir)
1247 return log_oom_debug();
1248
1249 temporary_mount = path_join(runtime_dir, "systemd/namespace-XXXXXX");
1250 if (!temporary_mount)
1251 return log_oom_debug();
1252
1253 if (!mkdtemp(temporary_mount))
1254 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
1255
1256 *ret = TAKE_PTR(temporary_mount);
1257 return 0;
1258}
1259
119820f8 1260static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) {
7f112f50
LP
1261 static const char devnodes[] =
1262 "/dev/null\0"
1263 "/dev/zero\0"
1264 "/dev/full\0"
1265 "/dev/random\0"
1266 "/dev/urandom\0"
1267 "/dev/tty\0";
1268
66825795
MY
1269 _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
1270 _cleanup_(umount_and_rmdir_and_freep) char *dev = NULL;
16498617 1271 bool can_mknod = true;
7f112f50
LP
1272 int r;
1273
1274 assert(m);
119820f8 1275 assert(p);
7f112f50 1276
119820f8 1277 r = create_temporary_mount_point(p->runtime_scope, &temporary_mount);
0c3d606c
YW
1278 if (r < 0)
1279 return r;
2b85f4e1 1280
66825795
MY
1281 dev = path_join(temporary_mount, "dev");
1282 if (!dev)
1283 return -ENOMEM;
1284
dc751688 1285 (void) mkdir(dev, 0755);
9f563f27 1286 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
21935150 1287 if (r < 0)
66825795 1288 return r;
21935150 1289
03bc11d1 1290 r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
66825795
MY
1291 if (r < 0)
1292 return log_debug_errno(r, "Failed to fix label of '%s' as /dev/: %m", dev);
2b85f4e1 1293
66825795 1294 r = bind_mount_device_dir(temporary_mount, "/dev/pts");
21935150 1295 if (r < 0)
66825795 1296 return r;
2b85f4e1 1297
2e4a4fae
YW
1298 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
1299 * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
1300 * Thus, in that case make a clone.
1301 * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
36ce7110 1302 r = is_symlink("/dev/ptmx");
66825795
MY
1303 if (r < 0)
1304 return log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
1305 if (r > 0) {
1306 const char *devptmx = strjoina(temporary_mount, "/dev/ptmx");
1307 if (symlink("pts/ptmx", devptmx) < 0)
1308 return log_debug_errno(errno, "Failed to create symlink '%s' to pts/ptmx: %m", devptmx);
414b304b 1309 } else {
16498617 1310 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
152c475f 1311 if (r < 0)
66825795 1312 return r;
414b304b 1313 }
e06b6479 1314
66825795 1315 r = bind_mount_device_dir(temporary_mount, "/dev/shm");
21935150 1316 if (r < 0)
66825795 1317 return r;
2b85f4e1 1318
66825795
MY
1319 FOREACH_STRING(d, "/dev/mqueue", "/dev/hugepages")
1320 (void) bind_mount_device_dir(temporary_mount, d);
2b85f4e1 1321
cc4f736a
MY
1322 /* We assume /run/systemd/journal/ is available if not changing root, which isn't entirely accurate
1323 * but shouldn't matter, as either way the user would get ENOENT when accessing /dev/log */
7a9f0125 1324 if ((!p->root_image && !p->root_directory) || p->bind_log_sockets) {
119820f8
MY
1325 const char *devlog = strjoina(temporary_mount, "/dev/log");
1326 if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
1327 log_debug_errno(errno,
1328 "Failed to create symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m",
1329 devlog);
1330 }
82d25240 1331
7f112f50 1332 NULSTR_FOREACH(d, devnodes) {
16498617 1333 r = clone_device_node(d, temporary_mount, &can_mknod);
37b22b3b 1334 /* ENXIO means the *source* is not a device file, skip creation in that case */
af984e13 1335 if (r < 0 && r != -ENXIO)
66825795 1336 return r;
7f112f50
LP
1337 }
1338
2e4a4fae
YW
1339 r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
1340 if (r < 0)
105a1a36 1341 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
7f112f50 1342
d73020f2
LP
1343 /* Create the /dev directory if missing. It is more likely to be missing when the service is started
1344 * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
34de407a 1345 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 1346
9e5f8252 1347 /* Unmount everything in old /dev */
2e4a4fae
YW
1348 r = umount_recursive(mount_entry_path(m), 0);
1349 if (r < 0)
1350 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
1351
21935150
LP
1352 r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
1353 if (r < 0)
66825795
MY
1354 return r;
1355 dev = rmdir_and_free(dev); /* Mount is successfully moved, do not umount() */
7f112f50 1356
63862de4 1357 return 1;
7f112f50
LP
1358}
1359
2a2969fd 1360static int mount_bind_dev(const MountEntry *m) {
5d997827
LP
1361 int r;
1362
1363 assert(m);
1364
d73020f2
LP
1365 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the
1366 * service's /dev. This is only used when RootDirectory= is set. */
5d997827 1367
645767d6
LP
1368 (void) mkdir_p_label(mount_entry_path(m), 0755);
1369
b409aacb 1370 r = path_is_mount_point(mount_entry_path(m));
5d997827
LP
1371 if (r < 0)
1372 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
1373 if (r > 0) /* make this a NOP if /dev is already a mount point */
1374 return 0;
1375
63862de4
LB
1376 r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1377 if (r < 0)
1378 return r;
1379
1380 return 1;
5d997827
LP
1381}
1382
10028263 1383static int mount_bind_sysfs(const MountEntry *m) {
5d997827
LP
1384 int r;
1385
1386 assert(m);
1387
645767d6
LP
1388 (void) mkdir_p_label(mount_entry_path(m), 0755);
1389
b409aacb 1390 r = path_is_mount_point(mount_entry_path(m));
5d997827
LP
1391 if (r < 0)
1392 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
1393 if (r > 0) /* make this a NOP if /sys is already a mount point */
1394 return 0;
1395
1396 /* Bind mount the host's version so that we get all child mounts of it, too. */
63862de4
LB
1397 r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1398 if (r < 0)
1399 return r;
1400
1401 return 1;
5d997827
LP
1402}
1403
b18f4eed
YW
1404static int mount_private_apivfs(
1405 const char *fstype,
1406 const char *entry_path,
1407 const char *bind_source,
4793605d
YW
1408 const char *opts,
1409 RuntimeScope scope) {
b18f4eed 1410
4793605d
YW
1411 _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
1412 int r;
553e3660 1413
b18f4eed
YW
1414 assert(fstype);
1415 assert(entry_path);
1416 assert(bind_source);
553e3660
YW
1417
1418 (void) mkdir_p_label(entry_path, 0755);
1419
4793605d
YW
1420 /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance
1421 * cannot be mounted on an already existing mount. Let's use a temporary place. */
1422 r = create_temporary_mount_point(scope, &temporary_mount);
1423 if (r < 0)
1424 return r;
553e3660 1425
4793605d 1426 r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
553e3660 1427 if (ERRNO_IS_NEG_PRIVILEGE(r)) {
b18f4eed
YW
1428 /* When we do not have enough privileges to mount a new instance, fall back to use an
1429 * existing mount. */
553e3660 1430
b409aacb 1431 r = path_is_mount_point(entry_path);
b18f4eed
YW
1432 if (r < 0)
1433 return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path);
1434 if (r > 0)
1435 return 0; /* Use the current mount as is. */
553e3660 1436
b18f4eed
YW
1437 /* We lack permissions to mount a new instance, and it is not already mounted. But we can
1438 * access the host's, so as a final fallback bind-mount it to the destination, as most likely
1439 * we are inside a user manager in an unprivileged user namespace. */
396b3a1e 1440 r = mount_nofollow_verbose(LOG_DEBUG, bind_source, entry_path, /* fstype = */ NULL, MS_BIND|MS_REC, /* options = */ NULL);
63862de4
LB
1441 if (r < 0)
1442 return r;
1443
1444 return 1;
0cba5bdc
MY
1445 }
1446 if (r < 0)
553e3660
YW
1447 return r;
1448
4793605d
YW
1449 /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
1450 r = umount_recursive(entry_path, /* flags = */ 0);
1451 if (r < 0)
1452 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path);
1453
1454 /* Then, move the new mount instance. */
396b3a1e 1455 r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype = */ NULL, MS_MOVE, /* options = */ NULL);
4793605d
YW
1456 if (r < 0)
1457 return r;
1458
b18f4eed
YW
1459 /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
1460 * where a bunch of files are overmounted, in particular the boot id. */
1461 (void) bind_mount_submounts(bind_source, entry_path);
63862de4 1462 return 1;
553e3660
YW
1463}
1464
4793605d 1465static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) {
b18f4eed 1466 assert(m);
4793605d
YW
1467 assert(p);
1468 return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
b18f4eed
YW
1469}
1470
cd58b5a1 1471static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
cd58b5a1
RW
1472 assert(m);
1473 assert(p);
1614d0c4 1474 return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
cd58b5a1
RW
1475}
1476
79d956db 1477static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
61f8a7bd 1478 _cleanup_free_ char *opts = NULL;
5d997827
LP
1479
1480 assert(m);
79d956db 1481 assert(p);
5d997827 1482
79d956db
LP
1483 if (p->protect_proc != PROTECT_PROC_DEFAULT ||
1484 p->proc_subset != PROC_SUBSET_ALL) {
4e399953
LP
1485
1486 /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
1487 * pretended to be per-instance but actually was per-namespace), hence let's make use of it
1488 * if requested. To make sure this logic succeeds only on kernels where hidepid= is
1489 * per-instance, we'll exclusively use the textual value for hidepid=, since support was
1490 * added in the same commit: if it's supported it is thus also per-instance. */
1491
79d956db 1492 const char *hpv = p->protect_proc == PROTECT_PROC_DEFAULT ?
7c76e181 1493 "off" :
79d956db 1494 protect_proc_to_string(p->protect_proc);
1c265fcd
DDM
1495
1496 /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
1497 * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
1498 * trying to use hidepid= on systems where it isn't supported. The same applies for subset=.
1499 * fsopen()/fsconfig() was also backported on some distros which allows us to detect
1500 * hidepid=/subset= support in even more scenarios. */
1501
0cba5bdc 1502 if (mount_option_supported("proc", "hidepid", hpv) > 0) {
1c265fcd
DDM
1503 opts = strjoin("hidepid=", hpv);
1504 if (!opts)
1505 return -ENOMEM;
1506 }
1507
79d956db 1508 if (p->proc_subset == PROC_SUBSET_PID &&
0cba5bdc 1509 mount_option_supported("proc", "subset", "pid") > 0)
1c265fcd
DDM
1510 if (!strextend_with_separator(&opts, ",", "subset=pid"))
1511 return -ENOMEM;
4e399953
LP
1512 }
1513
61f8a7bd
YW
1514 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
1515 * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
1516 * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
1517 * mounted on /proc/ first. */
4793605d 1518 return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts, p->runtime_scope);
5d997827
LP
1519}
1520
2abd4e38 1521static int mount_tmpfs(const MountEntry *m) {
df6b900a 1522 const char *entry_path, *inner_path;
abad72be 1523 int r;
abad72be 1524
6c47cd7d
LP
1525 assert(m);
1526
df6b900a 1527 entry_path = mount_entry_path(m);
809ceb82 1528 inner_path = mount_entry_unprefixed_path(m);
df6b900a 1529
d73020f2
LP
1530 /* First, get rid of everything that is below if there is anything. Then, overmount with our new
1531 * tmpfs */
6c47cd7d 1532
abad72be
CG
1533 (void) mkdir_p_label(entry_path, 0755);
1534 (void) umount_recursive(entry_path, 0);
6c47cd7d 1535
21935150
LP
1536 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
1537 if (r < 0)
1538 return r;
abad72be 1539
03bc11d1 1540 r = label_fix_full(AT_FDCWD, entry_path, inner_path, 0);
abad72be 1541 if (r < 0)
df6b900a 1542 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
6c47cd7d 1543
63862de4 1544 return 1;
6c47cd7d
LP
1545}
1546
94293d65
LB
1547static int mount_run(const MountEntry *m) {
1548 int r;
1549
1550 assert(m);
1551
b409aacb 1552 r = path_is_mount_point(mount_entry_path(m));
94293d65
LB
1553 if (r < 0 && r != -ENOENT)
1554 return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
1555 if (r > 0) /* make this a NOP if /run is already a mount point */
1556 return 0;
1557
1558 return mount_tmpfs(m);
1559}
1560
80271a44
XR
1561static int mount_mqueuefs(const MountEntry *m) {
1562 int r;
1563 const char *entry_path;
1564
1565 assert(m);
1566
1567 entry_path = mount_entry_path(m);
1568
1569 (void) mkdir_p_label(entry_path, 0755);
1570 (void) umount_recursive(entry_path, 0);
1571
1572 r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", entry_path, "mqueue", m->flags, mount_entry_options(m));
1573 if (r < 0)
1574 return r;
1575
63862de4 1576 return 1;
80271a44
XR
1577}
1578
84be0c71 1579static int mount_image(
a1a40297 1580 MountEntry *m,
84be0c71
LP
1581 const char *root_directory,
1582 const ImagePolicy *image_policy) {
93f59701 1583
bcd904d4
LP
1584 _cleanup_(extension_release_data_done) ExtensionReleaseData rdata = {};
1585 _cleanup_free_ char *extension_name = NULL;
dfdeb0b1 1586 ImageClass required_class = _IMAGE_CLASS_INVALID;
b3d13314
LB
1587 int r;
1588
89e62e0b
LP
1589 assert(m);
1590
55ea4ef0
MG
1591 r = path_extract_filename(mount_entry_source(m), &extension_name);
1592 if (r < 0)
1593 return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
1594
a868e437 1595 if (m->mode == MOUNT_EXTENSION_IMAGE) {
93f59701
LB
1596 r = parse_os_release(
1597 empty_to_root(root_directory),
bcd904d4 1598 "ID", &rdata.os_release_id,
0af99376 1599 "ID_LIKE", &rdata.os_release_id_like,
bcd904d4
LP
1600 "VERSION_ID", &rdata.os_release_version_id,
1601 image_class_info[IMAGE_SYSEXT].level_env, &rdata.os_release_sysext_level,
1602 image_class_info[IMAGE_CONFEXT].level_env, &rdata.os_release_confext_level,
93f59701
LB
1603 NULL);
1604 if (r < 0)
1605 return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
bcd904d4 1606 if (isempty(rdata.os_release_id))
4e494e6a 1607 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory));
dfdeb0b1 1608
1609 required_class = m->filter_class;
93f59701
LB
1610 }
1611
1612 r = verity_dissect_and_mount(
9d37b8f6 1613 /* src_fd= */ -EBADF,
84be0c71
LP
1614 mount_entry_source(m),
1615 mount_entry_path(m),
9dc6a6af 1616 m->image_options_const,
84be0c71 1617 image_policy,
f1395724 1618 /* image_filter= */ NULL,
bcd904d4 1619 &rdata,
dfdeb0b1 1620 required_class,
a1a40297 1621 &m->verity,
3e107272 1622 /* ret_image= */ NULL);
b850a9b2
LB
1623 if (r == -ENOENT && m->ignore)
1624 return 0;
bcd904d4 1625 if (r == -ESTALE && rdata.os_release_id)
7fcd1343 1626 return log_error_errno(r, // FIXME: this should not be logged ad LOG_ERR, as it will result in duplicate logging.
0af99376 1627 "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s ID_LIKE='%s'%s%s%s%s%s%s",
db4c8a25 1628 mount_entry_source(m),
bcd904d4 1629 rdata.os_release_id,
0af99376 1630 strempty(rdata.os_release_id_like),
bcd904d4
LP
1631 rdata.os_release_version_id ? " VERSION_ID=" : "",
1632 strempty(rdata.os_release_version_id),
1633 rdata.os_release_sysext_level ? image_class_info[IMAGE_SYSEXT].level_env_print : "",
1634 strempty(rdata.os_release_sysext_level),
1635 rdata.os_release_confext_level ? image_class_info[IMAGE_CONFEXT].level_env_print : "",
1636 strempty(rdata.os_release_confext_level));
dfdeb0b1 1637 if (r == -ENOCSI) {
1638 log_debug("Image %s does not match the expected class, ignoring", mount_entry_source(m));
1639 return 0; /* Nothing to do, wrong class */
1640 }
b3d13314 1641 if (r < 0)
4beda316 1642 return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
b3d13314 1643
63862de4 1644 return 1;
b3d13314
LB
1645}
1646
93f59701 1647static int mount_overlay(const MountEntry *m) {
f0304df6 1648 _cleanup_free_ char *options = NULL, *layers = NULL;
93f59701
LB
1649 int r;
1650
1651 assert(m);
1652
f0304df6
LB
1653 /* Extension hierarchies are optional (e.g.: confext might not have /opt) so check if they actually
1654 * exist in an image before attempting to create an overlay with them, otherwise the mount will
1655 * fail. We can't check before this, as the images will not be mounted until now. */
1656
1657 /* Note that lowerdir= parameters are in 'reverse' order, so the top-most directory in the overlay
1658 * comes first in the list. */
1659 STRV_FOREACH_BACKWARDS(o, m->overlay_layers) {
1660 _cleanup_free_ char *escaped = NULL;
1661
1662 r = is_dir(*o, /* follow= */ false);
1663 if (r <= 0) {
1664 if (r != -ENOENT)
1665 log_debug_errno(r,
1666 "Failed to check whether overlay layer source path '%s' exists, ignoring: %m",
1667 *o);
1668 continue;
1669 }
1670
1671 escaped = shell_escape(*o, ",:");
1672 if (!escaped)
1673 return log_oom_debug();
1674
1675 if (!strextend_with_separator(&layers, ":", escaped))
1676 return log_oom_debug();
1677 }
1678
1679 if (!layers) {
1680 log_debug("None of the overlays specified in '%s' exist at the source, skipping.",
1681 mount_entry_options(m));
1682 return 0; /* Only the root is set? Then there's nothing to overlay */
1683 }
1684
1685 options = strjoin("lowerdir=", layers, ":", mount_entry_path(m)); /* The root goes in last */
1686 if (!options)
1687 return log_oom_debug();
93f59701
LB
1688
1689 (void) mkdir_p_label(mount_entry_path(m), 0755);
1690
dfdeb0b1 1691 r = mount_nofollow_verbose(LOG_DEBUG, "systemd-extensions", mount_entry_path(m), "overlay", MS_RDONLY, options);
93f59701
LB
1692 if (r == -ENOENT && m->ignore)
1693 return 0;
63862de4
LB
1694 if (r < 0)
1695 return r;
93f59701 1696
63862de4 1697 return 1;
93f59701
LB
1698}
1699
088696fe 1700static int follow_symlink(
d2d6c096 1701 const char *root_directory,
088696fe 1702 MountEntry *m) {
d2d6c096 1703
088696fe 1704 _cleanup_free_ char *target = NULL;
8fceda93
LP
1705 int r;
1706
e2663cbd
MY
1707 assert(m);
1708
088696fe
LP
1709 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
1710 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
1711 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
1712 * end and already have a fully normalized name. */
8fceda93 1713
f461a28d 1714 r = chase(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
088696fe
LP
1715 if (r < 0)
1716 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
1717 if (r > 0) /* Reached the end, nothing more to resolve */
1718 return 1;
8fceda93 1719
f461a28d 1720 if (m->n_followed >= CHASE_MAX) /* put a boundary on things */
baaa35ad
ZJS
1721 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1722 "Symlink loop on '%s'.",
1723 mount_entry_path(m));
8fceda93 1724
e2341b6b 1725 log_debug("Followed mount entry path symlink %s %s %s.",
1ae9b0cf 1726 mount_entry_path(m), glyph(GLYPH_ARROW_RIGHT), target);
8fceda93 1727
809ceb82 1728 mount_entry_consume_prefix(m, TAKE_PTR(target));
8fceda93 1729
b3a9d980 1730 m->n_followed++;
088696fe
LP
1731
1732 return 0;
8fceda93
LP
1733}
1734
/* Establishes the mount described by a single mount entry.
 *
 * root_directory is the root the mount entry paths are relative to (used when checking whether a path is
 * already a mount point, and for reading the host's os-release when validating extension directories);
 * m is the entry to apply (its source may be rewritten to the symlink-resolved path for bind mounts);
 * p carries the namespace parameters handed on to the per-type mount helpers.
 *
 * Modes with a dedicated helper (tmpfs, /dev, sysfs, procfs, images, overlays, ...) return that helper's
 * result directly. The remaining modes only pick a source path ('what') inside the switch and then share
 * the bind mount and optional id-mapping code at the end of the function.
 *
 * Returns 1 if the mount was applied and should be post-processed, 0 if it was gracefully skipped, and a
 * negative errno-style value on failure. */
static int apply_one_mount(
                const char *root_directory,
                MountEntry *m,
                const NamespaceParameters *p) {

        _cleanup_free_ char *inaccessible = NULL;
        bool rbind = true, make = false; /* rbind: add MS_REC to the bind mount; make: create a missing destination and retry */
        const char *what;
        int r;

        /* Return 1 when the mount should be post-processed (remounted r/o, etc.), 0 otherwise. In most
         * cases post-processing is the right thing, the typical exception is when the mount is gracefully
         * skipped. */

        assert(m);
        assert(p);

        log_debug("Applying namespace mount on %s", mount_entry_path(m));

        switch (m->mode) {

        case MOUNT_INACCESSIBLE: {
                _cleanup_free_ char *runtime_dir = NULL;
                struct stat target;

                /* First, get rid of everything that is below if there
                 * is anything... Then, overmount it with an
                 * inaccessible path. */
                (void) umount_recursive(mount_entry_path(m), 0);

                if (lstat(mount_entry_path(m), &target) < 0) {
                        if (errno == ENOENT && m->ignore)
                                return 0;

                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
                                               mount_entry_path(m));
                }

                /* We don't pass the literal runtime scope through here but one based purely on our UID. This
                 * means that the root user's --user services will use the host's inaccessible inodes rather
                 * than root's private ones. This is preferable since it means device nodes that are
                 * overmounted to make them inaccessible will be overmounted with a device node, rather than
                 * an AF_UNIX socket inode. */
                runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
                if (!runtime_dir)
                        return log_oom_debug();

                /* Pick the inaccessible node matching the inode type of the target. Note that the helper's
                 * own error code is not propagated here, -ELOOP is reported for any failure. */
                r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
                if (r < 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
                                               "File type not supported for inaccessible mounts. Note that symlinks are not allowed.");
                what = inaccessible;
                break;
        }

        case MOUNT_READ_ONLY:
        case MOUNT_READ_WRITE:
        case MOUNT_READ_WRITE_IMPLICIT:
        case MOUNT_EXEC:
        case MOUNT_NOEXEC:
                r = path_is_mount_point_full(mount_entry_path(m), root_directory, /* flags = */ 0);
                if (r == -ENOENT && m->ignore)
                        return 0;
                if (r < 0)
                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
                                               mount_entry_path(m));
                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
                            * and MS_NOEXEC bits for the mount point if needed. */
                        return 1;
                /* This isn't a mount point yet, let's make it one. */
                what = mount_entry_path(m);
                break;

        case MOUNT_EXTENSION_DIRECTORY: {
                _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_id_like = NULL,
                        *host_os_release_version_id = NULL, *host_os_release_level = NULL,
                        *extension_name = NULL;
                _cleanup_strv_free_ char **extension_release = NULL;
                ImageClass class = IMAGE_SYSEXT;

                r = path_extract_filename(mount_entry_source(m), &extension_name);
                if (r < 0)
                        return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));

                /* Look for extension-release data of the requested class, or of the sysext class if no
                 * class filter is set. */
                r = load_extension_release_pairs(
                                mount_entry_source(m),
                                m->filter_class >= 0 ? m->filter_class : IMAGE_SYSEXT,
                                extension_name,
                                /* relax_extension_release_check= */ false,
                                &extension_release);
                if (r == -ENOENT) {
                        if (m->filter_class >= 0)
                                return 0; /* Nothing to do, wrong class */

                        /* No class filter and no sysext data: retry as confext */
                        r = load_extension_release_pairs(
                                        mount_entry_source(m),
                                        IMAGE_CONFEXT,
                                        extension_name,
                                        /* relax_extension_release_check= */ false,
                                        &extension_release);
                        if (r >= 0)
                                class = IMAGE_CONFEXT;
                }
                if (r == -ENOENT && m->ignore)
                        return 0;
                if (r < 0)
                        return log_debug_errno(r, "Failed to acquire 'extension-release' data of extension tree %s: %m", mount_entry_source(m));

                r = parse_os_release(
                                empty_to_root(root_directory),
                                "ID", &host_os_release_id,
                                "ID_LIKE", &host_os_release_id_like,
                                "VERSION_ID", &host_os_release_version_id,
                                image_class_info[class].level_env, &host_os_release_level,
                                NULL);
                if (r < 0)
                        return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
                if (isempty(host_os_release_id))
                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory));

                r = extension_release_validate(
                                extension_name,
                                host_os_release_id,
                                host_os_release_id_like,
                                host_os_release_version_id,
                                host_os_release_level,
                                /* host_extension_scope = */ NULL, /* Leave empty, we need to accept both system and portable */
                                extension_release,
                                class);
                if (r < 0)
                        return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
                if (r == 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's.", extension_name);

                /* Validation passed: the directory is mounted like a regular (non-recursive) bind mount */
                _fallthrough_;
        }

        case MOUNT_BIND:
                rbind = false;

                _fallthrough_;
        case MOUNT_BIND_RECURSIVE: {
                _cleanup_free_ char *chased = NULL;

                /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
                 * that bind mount source paths are always relative to the host root, hence we pass NULL as
                 * root directory to chase() here. */

                /* When we create implicit mounts, we might need to create the path ourselves as it is on a
                 * just-created tmpfs, for example. */
                if (m->create_source_dir) {
                        r = mkdir_p(mount_entry_source(m), m->source_dir_mode);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to create source directory %s: %m", mount_entry_source(m));

                        r = label_fix_full(AT_FDCWD, mount_entry_source(m), mount_entry_unprefixed_path(m), /* flags= */ 0);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set label of the source directory %s: %m", mount_entry_source(m));
                }

                r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
                if (r == -ENOENT && m->ignore) {
                        log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
                        return 0;
                }
                if (r < 0)
                        return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));

                log_debug("Followed source symlinks %s %s %s.",
                          mount_entry_source(m), glyph(GLYPH_ARROW_RIGHT), chased);

                /* From here on the entry's source is the resolved path */
                free_and_replace(m->source_malloc, chased);

                what = mount_entry_source(m);
                make = true;
                break;
        }

        case MOUNT_EMPTY_DIR:
        case MOUNT_PRIVATE_TMPFS:
        case MOUNT_TMPFS:
                return mount_tmpfs(m);

        case MOUNT_PRIVATE_TMP:
                what = mount_entry_source(m);
                make = true;
                break;

        case MOUNT_PRIVATE_DEV:
                return mount_private_dev(m, p);

        case MOUNT_BIND_DEV:
                return mount_bind_dev(m);

        case MOUNT_PRIVATE_SYSFS:
                return mount_private_sysfs(m, p);

        case MOUNT_BIND_SYSFS:
                return mount_bind_sysfs(m);

        case MOUNT_PROCFS:
                return mount_procfs(m, p);

        case MOUNT_PRIVATE_CGROUP2FS:
                return mount_private_cgroup2fs(m, p);

        case MOUNT_RUN:
                return mount_run(m);

        case MOUNT_MQUEUEFS:
                return mount_mqueuefs(m);

        case MOUNT_IMAGE:
                return mount_image(m, NULL, p->mount_image_policy);

        case MOUNT_EXTENSION_IMAGE:
                return mount_image(m, root_directory, p->extension_image_policy);

        case MOUNT_OVERLAY:
                return mount_overlay(m);

        default:
                assert_not_reached();
        }

        /* Only the bind-mount style modes get here, and each of them has set 'what' */
        assert(what);

        r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
        if (r < 0) {
                bool try_again = false;

                if (r == -ENOENT && make) {
                        int q;

                        /* Hmm, either the source or the destination are missing. Let's see if we can create
                           the destination, then try again. */

                        (void) mkdir_parents(mount_entry_path(m), 0755);

                        q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
                        if (q < 0 && q != -EEXIST)
                                // FIXME: this shouldn't be logged at LOG_WARNING, but be bubbled up, and logged there to avoid duplicate logging
                                log_warning_errno(q, "Failed to create destination mount point node '%s', ignoring: %m",
                                                  mount_entry_path(m));
                        else
                                try_again = true;
                }

                /* If no retry is attempted, r still holds the error of the first mount attempt */
                if (try_again)
                        r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); // FIXME: this should not be logged here, but be bubbled up, to avoid duplicate logging
        }

        log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));

        /* Take care of id-mapped mounts */
        if (m->idmapped && uid_is_valid(m->idmap_uid) && gid_is_valid(m->idmap_gid)) {
                _cleanup_close_ int userns_fd = -EBADF;
                _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;

                log_debug("Setting an id-mapped mount on %s", mount_entry_path(m));

                /* Do mapping from nobody (in setup_exec_directory()) -> this uid */
                if (strextendf(&uid_map, UID_FMT " " UID_FMT " 1\n", UID_NOBODY, m->idmap_uid) < 0)
                        return log_oom();

                /* Consider StateDirectory=xxx aaa xxx:aaa/222
                 * To allow for later symlink creation (by root) in create_symlinks_from_tuples(), map root as well. */
                if (m->idmap_uid != 0)
                        if (!strextend(&uid_map, "0 0 1\n"))
                                return log_oom();

                if (strextendf(&gid_map, GID_FMT " " GID_FMT " 1\n", GID_NOBODY, m->idmap_gid) < 0)
                        return log_oom();

                if (m->idmap_gid != 0)
                        if (!strextend(&gid_map, "0 0 1\n"))
                                return log_oom();

                userns_fd = userns_acquire(uid_map, gid_map, /* setgroups_deny= */ true);
                if (userns_fd < 0)
                        return log_error_errno(userns_fd, "Failed to allocate user namespace: %m");

                /* Drop SUID, add NOEXEC and NODEV for the mount to avoid root exploits */
                r = remount_idmap_fd(STRV_MAKE(mount_entry_path(m)), userns_fd, MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NODEV);
                if (r < 0)
                        return log_error_errno(r, "Failed to create an id-mapped mount: %m");

                log_debug("ID-mapped mount created successfully for %s from " UID_FMT " to " UID_FMT "", mount_entry_path(m), UID_NOBODY, m->idmap_uid);
        }

        return 1;
}
15ae422b 2029
97bf617a
YW
2030static bool should_propagate_to_submounts(const MountEntry *m) {
2031 assert(m);
2032 return !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS);
2033}
2034
6b000af4 2035static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
9ce4e4b0 2036 unsigned long new_flags = 0, flags_mask = 0;
57ccd9f6
YW
2037 bool submounts;
2038 int r;
15ae422b 2039
c17ec25e 2040 assert(m);
ac9de0b3 2041 assert(proc_self_mountinfo);
ac0930c8 2042
63862de4
LB
2043 if (m->state != MOUNT_APPLIED)
2044 return 0;
2045
a868e437 2046 if (mount_entry_read_only(m) || m->mode == MOUNT_PRIVATE_DEV) {
9ce4e4b0
LP
2047 new_flags |= MS_RDONLY;
2048 flags_mask |= MS_RDONLY;
2049 }
2050
2051 if (m->nosuid) {
2052 new_flags |= MS_NOSUID;
2053 flags_mask |= MS_NOSUID;
2054 }
2055
2056 if (flags_mask == 0) /* No Change? */
6b7c9f8b
LP
2057 return 0;
2058
9ce4e4b0
LP
2059 /* We generally apply these changes recursively, except for /dev, and the cases we know there's
2060 * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
2061 * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
2062 * and running Linux <= 4.17. */
97bf617a 2063 submounts = mount_entry_read_only(m) && should_propagate_to_submounts(m);
9ce4e4b0 2064 if (submounts)
6b000af4 2065 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
9ce4e4b0 2066 else
7cce68e1 2067 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
9ce4e4b0 2068
ddc155b2 2069 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
867189b5
LP
2070 * read-only already stays this way. This improves compatibility with container managers, where we
2071 * won't attempt to undo read-only mounts already applied. */
ac0930c8 2072
8fceda93 2073 if (r == -ENOENT && m->ignore)
867189b5 2074 return 0;
763a260a 2075 if (r < 0)
9ce4e4b0 2076 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
763a260a 2077 submounts ? " and its submounts" : "");
763a260a 2078 return 0;
d944dc95
LP
2079}
2080
ddc155b2
TM
2081static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
2082 unsigned long new_flags = 0, flags_mask = 0;
57ccd9f6
YW
2083 bool submounts;
2084 int r;
ddc155b2
TM
2085
2086 assert(m);
2087 assert(proc_self_mountinfo);
2088
63862de4
LB
2089 if (m->state != MOUNT_APPLIED)
2090 return 0;
2091
ddc155b2
TM
2092 if (mount_entry_noexec(m)) {
2093 new_flags |= MS_NOEXEC;
2094 flags_mask |= MS_NOEXEC;
2095 } else if (mount_entry_exec(m)) {
2096 new_flags &= ~MS_NOEXEC;
2097 flags_mask |= MS_NOEXEC;
2098 }
2099
2100 if (flags_mask == 0) /* No Change? */
2101 return 0;
2102
97bf617a 2103 submounts = should_propagate_to_submounts(m);
ddc155b2
TM
2104 if (submounts)
2105 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
2106 else
2107 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
2108
2109 if (r == -ENOENT && m->ignore)
2110 return 0;
2111 if (r < 0)
2112 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
2113 submounts ? " and its submounts" : "");
2114 return 0;
2115}
2116
6720e356 2117static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
57ccd9f6
YW
2118 bool submounts;
2119 int r;
6720e356
YW
2120
2121 assert(m);
2122 assert(proc_self_mountinfo);
2123
63862de4
LB
2124 if (m->state != MOUNT_APPLIED)
2125 return 0;
2126
97bf617a 2127 submounts = should_propagate_to_submounts(m);
6720e356
YW
2128 if (submounts)
2129 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo);
2130 else
2131 r = bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
2132 if (r == -ENOENT && m->ignore)
2133 return 0;
2134 if (r < 0)
2135 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
2136 submounts ? " and its submounts" : "");
2137 return 0;
2138}
2139
79d956db
LP
2140static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
2141 assert(p);
5d997827 2142
9c988f93
DH
2143 /*
2144 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
2145 * since to protect the API VFS mounts, they need to be around in the
9b68367b 2146 * first place...
9c988f93 2147 */
5d997827 2148
79d956db 2149 return p->mount_apivfs ||
5fe29238 2150 p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
79d956db
LP
2151 p->protect_kernel_tunables ||
2152 p->protect_proc != PROTECT_PROC_DEFAULT ||
406f1775
DDM
2153 p->proc_subset != PROC_SUBSET_ALL ||
2154 p->private_pids != PRIVATE_PIDS_NO;
5d997827
LP
2155}
2156
fbf90c0d
CB
/* Walk all mount entries and drop any unused mounts. This affects all
 * mounts:
 * - that are implicitly protected by a path that has been rendered inaccessible
 * - whose immediate parent requests the same protection mode as the mount itself
 * - that are outside of the relevant root directory
 * - which are duplicates
 */
1913ffb8 2164static void sort_and_drop_unused_mounts(MountList *ml, const char *root_directory) {
063c977a 2165 assert(ml);
9b68367b 2166 assert(root_directory);
f8b64b57 2167
063c977a
LP
2168 assert(ml->mounts || ml->n_mounts == 0);
2169
2170 typesafe_qsort(ml->mounts, ml->n_mounts, mount_path_compare);
f8b64b57 2171
063c977a
LP
2172 drop_duplicates(ml);
2173 drop_outside_root(ml, root_directory);
2174 drop_inaccessible(ml);
2175 drop_nop(ml);
f8b64b57
LP
2176}
2177
df61e79a 2178static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
df61e79a
LB
2179 int r;
2180
2181 STRV_FOREACH_PAIR(src, dst, strv_symlinks) {
2182 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2183
2184 src_abs = path_join(root, *src);
2185 dst_abs = path_join(root, *dst);
2186 if (!src_abs || !dst_abs)
2187 return -ENOMEM;
2188
2189 r = mkdir_parents_label(dst_abs, 0755);
2190 if (r < 0)
15461b7f
LB
2191 return log_debug_errno(
2192 r,
2193 "Failed to create parent directory for symlink '%s': %m",
2194 dst_abs);
df61e79a
LB
2195
2196 r = symlink_idempotent(src_abs, dst_abs, true);
2197 if (r < 0)
15461b7f
LB
2198 return log_debug_errno(
2199 r,
2200 "Failed to create symlink from '%s' to '%s': %m",
2201 src_abs,
2202 dst_abs);
df61e79a
LB
2203 }
2204
2205 return 0;
2206}
2207
e73c042b 2208static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **ret_path) {
13b3af4a
LB
2209 assert(m);
2210
2211 /* Create a string suitable for debugging logs, stripping for example the local working directory.
2212 * For example, with a BindPaths=/var/bar that does not exist on the host:
2213 *
2214 * Before:
2215 * foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
2216 * After:
2217 * foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
2218 *
2219 * Note that this is an error path, so no OOM check is done on purpose. */
2220
e73c042b 2221 if (!ret_path)
13b3af4a
LB
2222 return;
2223
2224 if (!mount_entry_path(m)) {
e73c042b 2225 *ret_path = NULL;
13b3af4a
LB
2226 return;
2227 }
2228
2229 if (root) {
2230 const char *e = startswith(mount_entry_path(m), root);
2231 if (e) {
e73c042b 2232 *ret_path = strdup(e);
13b3af4a
LB
2233 return;
2234 }
2235 }
2236
e73c042b 2237 *ret_path = strdup(mount_entry_path(m));
13b3af4a
LB
2238 return;
2239}
2240
/* Applies all mounts of the mount list, then adjusts their flags, in several rounds:
 *
 *   1. establish every pending mount, following mount point symlinks one step at a time and re-sorting
 *      the list whenever an entry got rewritten,
 *   2. create the symlinks configured in p->symlinks,
 *   3. flip the read-only bits, then the noexec bits, then (only if p->mount_nosuid is set) the nosuid
 *      bits.
 *
 * ml is the mount list to process, root the root directory the entries refer to, p the namespace
 * parameters. If reterr_path is non-NULL it receives a newly allocated path describing where things went
 * wrong, for those failures that can be attributed to a path.
 *
 * Returns 0 if the list is empty, 1 if all rounds succeeded, and a negative errno-style value on
 * failure. */
static int apply_mounts(
                MountList *ml,
                const char *root,
                const NamespaceParameters *p,
                char **reterr_path) {

        _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
        _cleanup_free_ char **deny_list = NULL; /* Entries are borrowed from the mount entries, see below */
        int r;

        assert(ml);
        assert(root);
        assert(p);

        if (ml->n_mounts == 0) /* Shortcut: nothing to do */
                return 0;

        /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
         * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
        proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
        if (!proc_self_mountinfo) {
                r = -errno; /* Save errno right away, strdup() below might clobber it */

                if (reterr_path)
                        *reterr_path = strdup("/proc/self/mountinfo");

                return log_debug_errno(r, "Failed to open %s: %m", "/proc/self/mountinfo");
        }

        /* First round, establish all mounts we need */
        for (;;) {
                bool again = false;

                FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {

                        /* Entries handled in an earlier pass of the outer loop are not touched again */
                        if (m->state != MOUNT_PENDING)
                                continue;

                        /* ExtensionImages/Directories are first opened in the propagate directory, not in
                         * the root_directory. A private (invisible to the guest) tmpfs instance is mounted
                         * on /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private
                         * /tmp and /var/tmp. */
                        r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) ? root : NULL, m);
                        if (r < 0) {
                                mount_entry_path_debug_string(root, m, reterr_path);
                                return r;
                        }
                        if (r == 0) {
                                /* We hit a symlinked mount point. The entry got rewritten and might
                                 * point to a very different place now. Let's normalize the changed
                                 * list, and start from the beginning. After all to mount the entry
                                 * at the new location we might need some other mounts first */
                                again = true;
                                break;
                        }

                        /* Returns 1 if the mount should be post-processed, 0 otherwise */
                        r = apply_one_mount(root, m, p);
                        if (r < 0) {
                                mount_entry_path_debug_string(root, m, reterr_path);
                                return r;
                        }
                        m->state = r == 0 ? MOUNT_SKIPPED : MOUNT_APPLIED;
                }

                if (!again)
                        break;

                sort_and_drop_unused_mounts(ml, root);
        }

        /* Now that all filesystems have been set up, but before the
         * read-only switches are flipped, create the exec dirs and other symlinks.
         * Note that when /var/lib is not empty/tmpfs, these symlinks will already
         * exist, which means this will be a no-op. */
        r = create_symlinks_from_tuples(root, p->symlinks);
        if (r < 0)
                return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");

        /* Create a deny list we can pass to bind_mount_recursive(). Only the array is allocated here, the
         * strings it points to remain owned by the mount entries (hence the cast). */
        deny_list = new(char*, ml->n_mounts+1);
        if (!deny_list)
                return -ENOMEM;
        for (size_t j = 0; j < ml->n_mounts; j++)
                deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
        deny_list[ml->n_mounts] = NULL;

        /* Second round, flip the ro bits if necessary. */
        FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
                r = make_read_only(m, deny_list, proc_self_mountinfo);
                if (r < 0) {
                        mount_entry_path_debug_string(root, m, reterr_path);
                        return r;
                }
        }

        /* Third round, flip the noexec bits with a simplified deny list. */
        /* NOTE(review): every slot of deny_list was already filled with the path of the corresponding
         * mount entry above, and the loop below writes that very same path back into the very same slot
         * for the matching entries. As far as visible here the list handed to make_noexec() is hence
         * identical to the one used in the second round rather than a reduced one. Confirm whether
         * compacting the list to the MOUNT_EXEC/MOUNT_NOEXEC entries was intended. */
        for (size_t j = 0; j < ml->n_mounts; j++)
                if (IN_SET((ml->mounts+j)->mode, MOUNT_EXEC, MOUNT_NOEXEC))
                        deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
        deny_list[ml->n_mounts] = NULL;

        FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
                r = make_noexec(m, deny_list, proc_self_mountinfo);
                if (r < 0) {
                        mount_entry_path_debug_string(root, m, reterr_path);
                        return r;
                }
        }

        /* Fourth round, flip the nosuid bits without a deny list. */
        if (p->mount_nosuid)
                FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
                        r = make_nosuid(m, proc_self_mountinfo);
                        if (r < 0) {
                                mount_entry_path_debug_string(root, m, reterr_path);
                                return r;
                        }
                }

        return 1;
}
2363
c8c535d5
LP
2364static bool root_read_only(
2365 char **read_only_paths,
2366 ProtectSystem protect_system) {
2367
2368 /* Determine whether the root directory is going to be read-only given the configured settings. */
2369
2370 if (protect_system == PROTECT_SYSTEM_STRICT)
2371 return true;
2372
de46b2be 2373 if (prefixed_path_strv_contains(read_only_paths, "/"))
c8c535d5
LP
2374 return true;
2375
2376 return false;
2377}
2378
2379static bool home_read_only(
d99511ad
MY
2380 char * const *read_only_paths,
2381 char * const *inaccessible_paths,
2382 char * const *empty_directories,
c8c535d5
LP
2383 const BindMount *bind_mounts,
2384 size_t n_bind_mounts,
2385 const TemporaryFileSystem *temporary_filesystems,
2386 size_t n_temporary_filesystems,
2387 ProtectHome protect_home) {
2388
c8c535d5
LP
2389 /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
2390 * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
2391 * settings. */
2392
2393 if (protect_home != PROTECT_HOME_NO)
2394 return true;
2395
de46b2be
TM
2396 if (prefixed_path_strv_contains(read_only_paths, "/home") ||
2397 prefixed_path_strv_contains(inaccessible_paths, "/home") ||
2398 prefixed_path_strv_contains(empty_directories, "/home"))
c8c535d5
LP
2399 return true;
2400
d99511ad
MY
2401 FOREACH_ARRAY(i, temporary_filesystems, n_temporary_filesystems)
2402 if (path_equal(i->path, "/home"))
c8c535d5
LP
2403 return true;
2404
2405 /* If /home is overmounted with some dir from the host it's not writable. */
d99511ad
MY
2406 FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
2407 if (path_equal(i->destination, "/home"))
c8c535d5
LP
2408 return true;
2409
2410 return false;
2411}
2412
/* Sets up the mount namespace described by the NamespaceParameters in p.
 *
 * The first half of the function only prepares things: it attaches/dissects the root image (if one is
 * configured), picks the directory to use as new root, and collects all mounts to establish in the
 * MountList 'ml'. The second half (starting at unshare()) actually modifies the mount table: it unshares
 * the mount namespace, populates the new root, applies the collected mounts via apply_mounts() and
 * finally switches root.
 *
 * Returns 0 on success, a negative errno-style value on failure. If unshare(CLONE_NEWNS) fails with a
 * privilege or "not supported" error, -ENOANO is returned so that callers can recognize that case.
 * reterr_path is handed through to apply_mounts() unmodified. */
int setup_namespace(const NamespaceParameters *p, char **reterr_path) {

        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
        _cleanup_strv_free_ char **hierarchies = NULL;
        _cleanup_(mount_list_done) MountList ml = {};
        _cleanup_close_ int userns_fd = -EBADF;
        bool require_prefix = false;
        const char *root;
        DissectImageFlags dissect_image_flags =
                DISSECT_IMAGE_GENERIC_ROOT |
                DISSECT_IMAGE_REQUIRE_ROOT |
                DISSECT_IMAGE_DISCARD_ON_LOOP |
                DISSECT_IMAGE_RELAX_VAR_CHECK |
                DISSECT_IMAGE_FSCK |
                DISSECT_IMAGE_USR_NO_ROOT |
                DISSECT_IMAGE_GROWFS |
                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
                DISSECT_IMAGE_PIN_PARTITION_DEVICES |
                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
        int r;

        assert(p);

        /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
         * we configure take effect */
        BLOCK_WITH_UMASK(0000);

        /* Runtime mount propagation is only set up if both directories are configured */
        bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
        /* Fall back to MS_SHARED if no propagation flag was configured */
        unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;

        if (p->root_image) {
                /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
                if (root_read_only(p->read_only_paths,
                                   p->protect_system) &&
                    home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories,
                                   p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems,
                                   p->protect_home) &&
                    strv_isempty(p->read_write_paths))
                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;

                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);

                if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                        /* In system mode we mount directly */

                        r = loop_device_make_by_path(
                                        p->root_image,
                                        FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
                                        /* sector_size= */ UINT32_MAX,
                                        FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
                                        LOCK_SH,
                                        &loop_device);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to create loop device for root image: %m");

                        r = dissect_loop_device(
                                        loop_device,
                                        p->verity,
                                        p->root_image_options,
                                        p->root_image_policy,
                                        /* image_filter= */ NULL,
                                        dissect_image_flags,
                                        &dissected_image);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to dissect image: %m");

                        r = dissected_image_load_verity_sig_partition(
                                        dissected_image,
                                        loop_device->fd,
                                        p->verity);
                        if (r < 0)
                                return r;

                        r = dissected_image_guess_verity_roothash(
                                        dissected_image,
                                        p->verity);
                        if (r < 0)
                                return r;

                        r = dissected_image_decrypt(
                                        dissected_image,
                                        NULL,
                                        p->verity,
                                        dissect_image_flags);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to decrypt dissected image: %m");
                } else {
                        /* Outside of system scope we do not set up a loop device ourselves, but hand the
                         * image and a reference to our user namespace to mountfsd_mount_image(). */
                        userns_fd = namespace_open_by_type(NAMESPACE_USER);
                        if (userns_fd < 0)
                                return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");

                        r = mountfsd_mount_image(
                                        p->root_image,
                                        userns_fd,
                                        p->root_image_policy,
                                        dissect_image_flags,
                                        &dissected_image);
                        if (r < 0)
                                return r;
                }
        }

        if (p->root_directory)
                root = p->root_directory;
        else {
                /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
                 * when running tests (test-execute), it might not have been created yet so let's make sure
                 * we create it if it doesn't already exist. */
                (void) mkdir_p_label("/run/systemd", 0755);

                /* Always create the mount namespace in a temporary directory, instead of operating directly
                 * in the root. The temporary directory prevents any mounts from being potentially obscured
                 * by other mounts we already applied. We use the same mount point for all images, which is
                 * safe, since they all live in their own namespaces after all, and hence won't see each
                 * other. (Note: this directory is also created by PID 1 early on, we create it here for
                 * similar reasons as /run/systemd/ first.) */
                root = "/run/systemd/mount-rootfs";
                (void) mkdir_label(root, 0555);

                require_prefix = true;
        }

        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
                /* Hierarchy population needs to be done for sysext and confext extension images */
                r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES");
                if (r < 0)
                        return r;
        }

        /* From here on we only append entries to 'ml'; nothing is mounted until apply_mounts() below. */

        r = append_access_mounts(&ml, p->read_write_paths, MOUNT_READ_WRITE, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->read_only_paths, MOUNT_READ_ONLY, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->inaccessible_paths, MOUNT_INACCESSIBLE, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->exec_paths, MOUNT_EXEC, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->no_exec_paths, MOUNT_NOEXEC, require_prefix);
        if (r < 0)
                return r;

        r = append_empty_dir_mounts(&ml, p->empty_directories);
        if (r < 0)
                return r;

        r = append_bind_mounts(&ml, p->bind_mounts, p->n_bind_mounts);
        if (r < 0)
                return r;

        r = append_tmpfs_mounts(&ml, p->temporary_filesystems, p->n_temporary_filesystems);
        if (r < 0)
                return r;

        r = append_private_tmp(&ml, p);
        if (r < 0)
                return r;

        r = append_mount_images(&ml, p->mount_images, p->n_mount_images);
        if (r < 0)
                return r;

        r = append_extensions(&ml, root, p->private_namespace_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
        if (r < 0)
                return r;

        if (p->private_dev) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/dev",
                        .mode = MOUNT_PRIVATE_DEV,
                        .flags = DEV_MOUNT_OPTIONS,
                };
        }

        /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective
           mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so
           let's try the mounts but it's not fatal if they don't succeed. */
        bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID;
        if (p->protect_kernel_tunables) {
                r = append_static_mounts(&ml,
                                         protect_kernel_tunables_proc_table,
                                         ELEMENTSOF(protect_kernel_tunables_proc_table),
                                         ignore_protect_proc);
                if (r < 0)
                        return r;

                r = append_static_mounts(&ml,
                                         protect_kernel_tunables_sys_table,
                                         ELEMENTSOF(protect_kernel_tunables_sys_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        if (p->protect_kernel_modules) {
                r = append_static_mounts(&ml,
                                         protect_kernel_modules_table,
                                         ELEMENTSOF(protect_kernel_modules_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        if (p->protect_kernel_logs) {
                r = append_static_mounts(&ml,
                                         protect_kernel_logs_proc_table,
                                         ELEMENTSOF(protect_kernel_logs_proc_table),
                                         ignore_protect_proc);
                if (r < 0)
                        return r;

                r = append_static_mounts(&ml,
                                         protect_kernel_logs_dev_table,
                                         ELEMENTSOF(protect_kernel_logs_dev_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        r = append_protect_control_groups(&ml, p->protect_control_groups, false);
        if (r < 0)
                return r;

        r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths);
        if (r < 0)
                return r;

        r = append_protect_system(&ml, p->protect_system, false);
        if (r < 0)
                return r;

        if (namespace_parameters_mount_apivfs(p)) {
                r = append_static_mounts(&ml,
                                         apivfs_table,
                                         ELEMENTSOF(apivfs_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        /* Only mount /proc/sys/kernel/hostname and domainname read-only if ProtectHostname=yes. Otherwise,
         * ProtectHostname=no allows changing hostname for the host, and ProtectHostname=private allows
         * changing the hostname in the unit's UTS namespace. Note, if proc is mounted with subset=pid then
         * neither of the two paths will exist, i.e. they are implicitly protected by the mount option. */
        if (p->protect_hostname == PROTECT_HOSTNAME_YES) {
                r = append_static_mounts(
                                &ml,
                                protect_hostname_yes_table,
                                ELEMENTSOF(protect_hostname_yes_table),
                                ignore_protect_proc);
                if (r < 0)
                        return r;
        }

        if (p->private_network) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/sys",
                        .mode = MOUNT_PRIVATE_SYSFS,
                };
        }

        if (p->private_ipc) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/dev/mqueue",
                        .mode = MOUNT_MQUEUEFS,
                        .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
                };
        }

        if (p->creds_path) {
                /* If our service has a credentials store configured, then bind that one in, but hide
                 * everything else. */

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/run/credentials",
                        .mode = MOUNT_TMPFS,
                        .read_only = true,
                        .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
                        .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
                };

                me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = p->creds_path,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                        .source_const = p->creds_path,
                        .ignore = true,
                };
        } else {
                /* If our service has no credentials store configured, then make the whole credentials tree
                 * inaccessible wholesale. */

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/run/credentials",
                        .mode = MOUNT_INACCESSIBLE,
                        .ignore = true,
                };
        }

        if (p->log_namespace) {
                _cleanup_free_ char *q = NULL;

                q = strjoin("/run/systemd/journal.", p->log_namespace);
                if (!q)
                        return log_oom_debug();

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                /* Ownership of 'q' moves into the mount entry here */
                *me = (MountEntry) {
                        .path_const = "/run/systemd/journal",
                        .mode = MOUNT_BIND_RECURSIVE,
                        .read_only = true,
                        .source_malloc = TAKE_PTR(q),
                };

        } else if (p->bind_log_sockets) {
                r = append_bind_mounts(&ml, bind_log_sockets_table, ELEMENTSOF(bind_log_sockets_table));
                if (r < 0)
                        return r;
        }

        /* Will be used to add bind mounts at runtime */
        if (setup_propagate) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .source_const = p->propagate_dir,
                        .path_const = p->incoming_dir,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                };
        }

        if (p->notify_socket_path) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = p->notify_socket_path,
                        .source_const = p->host_notify_socket,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                };
        }

        if (p->host_os_release_stage) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/run/host/.os-release-stage/",
                        .source_const = p->host_os_release_stage,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                        .ignore = true, /* Live copy, don't hard-fail if it goes missing */
                };
        }

        /* Prepend the root directory where that's necessary */
        r = prefix_where_needed(&ml, root);
        if (r < 0)
                return r;

        sort_and_drop_unused_mounts(&ml, root);

        /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */

        if (unshare(CLONE_NEWNS) < 0) {
                r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");

                if (ERRNO_IS_PRIVILEGE(r) ||
                    ERRNO_IS_NOT_SUPPORTED(r))
                        /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
                         * in place that doesn't allow us to create namespaces (or a missing cap), then
                         * propagate a recognizable error back, which the caller can use to detect this case
                         * (and only this) and optionally continue without namespacing applied. */
                        return -ENOANO;

                return r;
        }

        /* Create the source directory to allow runtime propagation of mounts */
        if (setup_propagate)
                (void) mkdir_p(p->propagate_dir, 0600);

        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
                /* ExtensionImages/Directories mountpoint directories will be created while parsing the
                 * mounts to create, so have the parent ready */
                char *extension_dir = strjoina(p->private_namespace_dir, "/unit-extensions");
                (void) mkdir_p(extension_dir, 0600);
        }

        /* Remount / as SLAVE so that nothing now mounted in the namespace
         * shows up in the parent */
        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
                return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");

        if (p->root_image) {
                /* A root image is specified, mount it to the right place */
                r = dissected_image_mount(
                                dissected_image,
                                root,
                                /* uid_shift= */ UID_INVALID,
                                /* uid_range= */ UID_INVALID,
                                userns_fd,
                                dissect_image_flags);
                if (r < 0)
                        return log_debug_errno(r, "Failed to mount root image: %m");

                /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
                 * if it likes. */
                if (loop_device) {
                        r = loop_device_flock(loop_device, LOCK_UN);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to release lock on loopback block device: %m");
                }

                r = dissected_image_relinquish(dissected_image);
                if (r < 0)
                        return log_debug_errno(r, "Failed to relinquish dissected image: %m");

        } else if (p->root_directory) {

                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
                r = path_is_mount_point_full(root, /* root = */ NULL, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
                if (r == 0) {
                        r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
                        if (r < 0)
                                return r;
                }

        } else {
                /* Let's mount the main root directory to the root directory to use */
                r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
                if (r < 0)
                        return r;
        }

        /* Try to set up the new root directory before mounting anything else there. */
        if (p->root_image || p->root_directory)
                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);

        /* Now make the magic happen */
        r = apply_mounts(&ml, root, p, reterr_path);
        if (r < 0)
                return r;

        /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
        r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
        if (r == -EINVAL && p->root_directory) {
                /* If we are using root_directory and we don't have privileges (ie: user manager in a user
                 * namespace) and the root_directory is already a mount point in the parent namespace,
                 * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
                 * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
                 * mount point) and try again. */
                r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
                if (r < 0)
                        return r;
                r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
        }
        if (r < 0)
                return log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");

        /* Remount / as the desired mode. Note that this will not reestablish propagation from our side to
         * the host, since what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
                return log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");

        /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only supported for
         * non-shared mounts. This needs to happen after remounting / or it will fail. */
        if (setup_propagate && mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL) < 0)
                return log_debug_errno(errno, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir);

        return 0;
}
2928
da6053d0 2929void bind_mount_free_many(BindMount *b, size_t n) {
d2d6c096
LP
2930 assert(b || n == 0);
2931
16871b60
MY
2932 FOREACH_ARRAY(i, b, n) {
2933 free(i->source);
2934 free(i->destination);
d2d6c096
LP
2935 }
2936
2937 free(b);
2938}
2939
da6053d0 2940int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
d2d6c096 2941 _cleanup_free_ char *s = NULL, *d = NULL;
d2d6c096
LP
2942
2943 assert(b);
2944 assert(n);
2945 assert(item);
2946
2947 s = strdup(item->source);
2948 if (!s)
2949 return -ENOMEM;
2950
2951 d = strdup(item->destination);
2952 if (!d)
2953 return -ENOMEM;
2954
432aab24 2955 if (!GREEDY_REALLOC(*b, *n + 1))
d2d6c096
LP
2956 return -ENOMEM;
2957
432aab24 2958 (*b)[(*n)++] = (BindMount) {
1cc6c93a
YW
2959 .source = TAKE_PTR(s),
2960 .destination = TAKE_PTR(d),
d2d6c096 2961 .read_only = item->read_only,
95f9e85a 2962 .nodev = item->nodev,
9ce4e4b0 2963 .nosuid = item->nosuid,
95f9e85a 2964 .noexec = item->noexec,
d2d6c096
LP
2965 .recursive = item->recursive,
2966 .ignore_enoent = item->ignore_enoent,
2967 };
2968
d2d6c096
LP
2969 return 0;
2970}
2971
b3d13314 2972MountImage* mount_image_free_many(MountImage *m, size_t *n) {
b3d13314
LB
2973 assert(n);
2974 assert(m || *n == 0);
2975
fe96c0f8 2976 for (size_t i = 0; i < *n; i++) {
b3d13314
LB
2977 free(m[i].source);
2978 free(m[i].destination);
427353f6 2979 mount_options_free_all(m[i].mount_options);
b3d13314
LB
2980 }
2981
2982 free(m);
2983 *n = 0;
2984 return NULL;
2985}
2986
2987int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
2988 _cleanup_free_ char *s = NULL, *d = NULL;
427353f6 2989 _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
b3d13314
LB
2990
2991 assert(m);
2992 assert(n);
2993 assert(item);
2994
2995 s = strdup(item->source);
2996 if (!s)
2997 return -ENOMEM;
2998
93f59701
LB
2999 if (item->destination) {
3000 d = strdup(item->destination);
3001 if (!d)
3002 return -ENOMEM;
3003 }
b3d13314 3004
427353f6 3005 LIST_FOREACH(mount_options, i, item->mount_options) {
c2b2df60 3006 _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
427353f6
LB
3007
3008 o = new(MountOptions, 1);
3009 if (!o)
3010 return -ENOMEM;
3011
3012 *o = (MountOptions) {
3013 .partition_designator = i->partition_designator,
3014 .options = strdup(i->options),
3015 };
3016 if (!o->options)
3017 return -ENOMEM;
3018
3019 LIST_APPEND(mount_options, options, TAKE_PTR(o));
3020 }
3021
223a67e5 3022 if (!GREEDY_REALLOC(*m, *n + 1))
b3d13314
LB
3023 return -ENOMEM;
3024
223a67e5 3025 (*m)[(*n)++] = (MountImage) {
b3d13314
LB
3026 .source = TAKE_PTR(s),
3027 .destination = TAKE_PTR(d),
427353f6 3028 .mount_options = TAKE_PTR(options),
b3d13314 3029 .ignore_enoent = item->ignore_enoent,
93f59701 3030 .type = item->type,
b3d13314
LB
3031 };
3032
3033 return 0;
3034}
3035
da6053d0 3036void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
2abd4e38
YW
3037 assert(t || n == 0);
3038
fe96c0f8 3039 for (size_t i = 0; i < n; i++) {
2abd4e38
YW
3040 free(t[i].path);
3041 free(t[i].options);
3042 }
3043
3044 free(t);
3045}
3046
3047int temporary_filesystem_add(
3048 TemporaryFileSystem **t,
da6053d0 3049 size_t *n,
2abd4e38
YW
3050 const char *path,
3051 const char *options) {
3052
3053 _cleanup_free_ char *p = NULL, *o = NULL;
2abd4e38
YW
3054
3055 assert(t);
3056 assert(n);
3057 assert(path);
3058
3059 p = strdup(path);
3060 if (!p)
3061 return -ENOMEM;
3062
3063 if (!isempty(options)) {
3064 o = strdup(options);
3065 if (!o)
3066 return -ENOMEM;
3067 }
3068
223a67e5 3069 if (!GREEDY_REALLOC(*t, *n + 1))
2abd4e38
YW
3070 return -ENOMEM;
3071
223a67e5 3072 (*t)[(*n)++] = (TemporaryFileSystem) {
1cc6c93a
YW
3073 .path = TAKE_PTR(p),
3074 .options = TAKE_PTR(o),
2abd4e38
YW
3075 };
3076
2abd4e38
YW
3077 return 0;
3078}
3079
a652f050
JR
3080static int make_tmp_prefix(const char *prefix) {
3081 _cleanup_free_ char *t = NULL;
254d1313 3082 _cleanup_close_ int fd = -EBADF;
a652f050
JR
3083 int r;
3084
3085 /* Don't do anything unless we know the dir is actually missing */
3086 r = access(prefix, F_OK);
3087 if (r >= 0)
3088 return 0;
3089 if (errno != ENOENT)
3090 return -errno;
3091
2053593f 3092 WITH_UMASK(000)
30443439 3093 r = mkdir_parents(prefix, 0755);
a652f050
JR
3094 if (r < 0)
3095 return r;
3096
3097 r = tempfn_random(prefix, NULL, &t);
3098 if (r < 0)
3099 return r;
3100
96603ea0
LP
3101 /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
3102 * the suid bit, below. */
c29778a1 3103 fd = open_mkdir(t, O_EXCL|O_CLOEXEC, 0777);
96603ea0
LP
3104 if (fd < 0)
3105 return fd;
a652f050 3106
96603ea0
LP
3107 r = RET_NERRNO(fchmod(fd, 01777));
3108 if (r < 0) {
a652f050
JR
3109 (void) rmdir(t);
3110 return r;
3111 }
3112
96603ea0
LP
3113 r = RET_NERRNO(rename(t, prefix));
3114 if (r < 0) {
a652f050
JR
3115 (void) rmdir(t);
3116 return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
3117 }
3118
3119 return 0;
a652f050
JR
3120}
3121
56a13a49 3122static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
613b411c 3123 _cleanup_free_ char *x = NULL;
19cd4e19 3124 _cleanup_free_ char *y = NULL;
6b46ea73 3125 sd_id128_t boot_id;
56a13a49 3126 bool rw = true;
6b46ea73 3127 int r;
613b411c
LP
3128
3129 assert(id);
3130 assert(prefix);
3131 assert(path);
3132
6b46ea73
LP
3133 /* We include the boot id in the directory so that after a
3134 * reboot we can easily identify obsolete directories. */
3135
3136 r = sd_id128_get_boot(&boot_id);
3137 if (r < 0)
3138 return r;
3139
85b55869 3140 x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
613b411c
LP
3141 if (!x)
3142 return -ENOMEM;
3143
a652f050
JR
3144 r = make_tmp_prefix(prefix);
3145 if (r < 0)
3146 return r;
3147
2053593f 3148 WITH_UMASK(0077)
56a13a49
ZJS
3149 if (!mkdtemp(x)) {
3150 if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
3151 rw = false;
3152 else
3153 return -errno;
3154 }
613b411c 3155
56a13a49 3156 if (rw) {
19cd4e19 3157 y = strjoin(x, "/tmp");
3158 if (!y)
3159 return -ENOMEM;
3160
2053593f 3161 WITH_UMASK(0000)
19cd4e19 3162 if (mkdir(y, 0777 | S_ISVTX) < 0)
7c76e181 3163 return -errno;
19cd4e19 3164
03bc11d1 3165 r = label_fix_full(AT_FDCWD, y, prefix, 0);
56a13a49
ZJS
3166 if (r < 0)
3167 return r;
19cd4e19 3168
3169 if (tmp_path)
3170 *tmp_path = TAKE_PTR(y);
56a13a49
ZJS
3171 } else {
3172 /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
3173 * read-only. This way the service will get the EROFS result as if it was writing to the real
3174 * file system. */
2053593f 3175 WITH_UMASK(0000)
30443439 3176 r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
56a13a49
ZJS
3177 if (r < 0)
3178 return r;
613b411c 3179
3f181262
LP
3180 r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
3181 if (r < 0)
3182 return r;
c17ec25e 3183 }
15ae422b 3184
1cc6c93a 3185 *path = TAKE_PTR(x);
613b411c
LP
3186 return 0;
3187}
3188
836e4e7e
DDM
3189char* namespace_cleanup_tmpdir(char *p) {
3190 PROTECT_ERRNO;
3191 if (!streq_ptr(p, RUN_SYSTEMD_EMPTY))
3192 (void) rmdir(p);
3193 return mfree(p);
3194}
3195
613b411c 3196int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
56a13a49
ZJS
3197 _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
3198 _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
3199 char *b;
613b411c
LP
3200 int r;
3201
3202 assert(id);
3203 assert(tmp_dir);
3204 assert(var_tmp_dir);
3205
56a13a49 3206 r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
613b411c
LP
3207 if (r < 0)
3208 return r;
3209
56a13a49
ZJS
3210 r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
3211 if (r < 0)
613b411c 3212 return r;
613b411c 3213
56a13a49
ZJS
3214 a_tmp = mfree(a_tmp); /* avoid rmdir */
3215 *tmp_dir = TAKE_PTR(a);
3216 *var_tmp_dir = TAKE_PTR(b);
613b411c
LP
3217
3218 return 0;
3219}
3220
13339577 3221int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
5bb1d7fb 3222 _cleanup_close_ int ns = -EBADF;
54c2459d 3223 const char *ns_name, *ns_path;
a5387637 3224 int r;
613b411c 3225
54c2459d
XR
3226 assert(ns_storage_socket);
3227 assert(ns_storage_socket[0] >= 0);
3228 assert(ns_storage_socket[1] >= 0);
3229
a5387637 3230 ns_name = ASSERT_PTR(namespace_single_flag_to_string(nsflag));
613b411c 3231
a5387637
LP
3232 /* We use the passed socketpair as a storage buffer for our namespace reference fd. Whatever process
3233 * runs this first shall create a new namespace, all others should just join it. To serialize that we
3234 * use a file lock on the socket pair.
613b411c
LP
3235 *
3236 * It's a bit crazy, but hey, works great! */
3237
13339577
DDM
3238 r = posix_lock(ns_storage_socket[0], LOCK_EX);
3239 if (r < 0)
3240 return r;
613b411c 3241
13339577 3242 CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
613b411c 3243
13339577
DDM
3244 ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3245 if (ns >= 0) {
3246 /* Yay, found something, so let's join the namespace */
3247 r = RET_NERRNO(setns(ns, nsflag));
3248 if (r < 0)
3249 return r;
613b411c 3250
13339577
DDM
3251 return 0;
3252 }
613b411c 3253
13339577
DDM
3254 if (ns != -EAGAIN)
3255 return ns;
613b411c 3256
13339577 3257 /* Nothing stored yet, so let's create a new namespace. */
613b411c 3258
13339577
DDM
3259 if (unshare(nsflag) < 0)
3260 return -errno;
613b411c 3261
a5387637
LP
3262 if (nsflag == CLONE_NEWNET)
3263 (void) loopback_setup();
613b411c 3264
13339577
DDM
3265 ns_path = strjoina("/proc/self/ns/", ns_name);
3266 ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
3267 if (ns < 0)
3268 return -errno;
613b411c 3269
13339577
DDM
3270 r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
3271 if (r < 0)
3272 return r;
613b411c 3273
13339577 3274 return 1;
15ae422b 3275}
417116f2 3276
13339577 3277int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
5bb1d7fb 3278 _cleanup_close_ int ns = -EBADF;
07610caf 3279 NamespaceType type;
13339577 3280 int r;
51af7fb2 3281
54c2459d
XR
3282 assert(ns_storage_socket);
3283 assert(ns_storage_socket[0] >= 0);
3284 assert(ns_storage_socket[1] >= 0);
51af7fb2
LP
3285 assert(path);
3286
54c2459d
XR
3287 /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in
3288 * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
3289 * allocate a new anonymous ns if needed. */
51af7fb2 3290
07610caf
MY
3291 type = clone_flag_to_namespace_type(nsflag);
3292 assert(type >= 0);
3293
13339577
DDM
3294 r = posix_lock(ns_storage_socket[0], LOCK_EX);
3295 if (r < 0)
3296 return r;
51af7fb2 3297
13339577 3298 CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
51af7fb2 3299
13339577
DDM
3300 ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3301 if (ns >= 0)
3302 return 0;
3303 if (ns != -EAGAIN)
3304 return ns;
51af7fb2 3305
13339577 3306 /* Nothing stored yet. Open the file from the file system. */
51af7fb2 3307
13339577
DDM
3308 ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3309 if (ns < 0)
3310 return -errno;
51af7fb2 3311
07610caf
MY
3312 r = fd_is_namespace(ns, type);
3313 if (r < 0)
3314 return r;
13339577
DDM
3315 if (r == 0)
3316 return -EINVAL;
51af7fb2 3317
13339577
DDM
3318 r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
3319 if (r < 0)
3320 return r;
51af7fb2 3321
13339577 3322 return 1;
51af7fb2
LP
3323}
3324
dfdeb0b1 3325static int is_extension_overlay(const char *path, int fd) {
3326 _cleanup_free_ char *source = NULL;
3327 _cleanup_close_ int dfd = -EBADF;
3328 int r;
3329
3330 assert(path);
3331
3332 if (fd < 0) {
3333 r = chase(path, /* root= */ NULL, CHASE_TRAIL_SLASH|CHASE_MUST_BE_DIRECTORY, /* ret_path= */ NULL, &dfd);
3334 if (r < 0)
3335 return r;
3336 fd = dfd;
3337 }
3338
3339 r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
3340 if (r < 0)
3341 return log_debug_errno(r, "Unable to determine whether '%s' is a mount point: %m", path);
3342 if (r == 0)
3343 return 0;
3344
3345 r = fd_is_fs_type(fd, OVERLAYFS_SUPER_MAGIC);
3346 if (r < 0)
3347 return log_debug_errno(r, "Failed to check if %s is an overlayfs: %m", path);
3348 if (r == 0)
3349 return 0;
3350
3351 /* Check the 'source' field of the mount on mount_path */
3352 r = path_get_mount_info_at(fd, /* path= */ NULL, /* ret_fstype= */ NULL, /* ret_options= */ NULL, &source);
3353 if (r < 0)
3354 return log_debug_errno(r, "Failed to get mount info for %s: %m", path);
3355 if (!streq_ptr(source, "systemd-extensions"))
3356 return 0;
3357
3358 return 1;
3359}
3360
3361static int unpeel_get_fd(const char *mount_path, int *ret_fd) {
3362 _cleanup_close_pair_ int pipe_fds[2] = EBADF_PAIR;
3363 _cleanup_close_ int fs_fd = -EBADF;
3364 pid_t pid;
3365 int r;
3366
3367 assert(mount_path);
3368 assert(ret_fd);
3369
3370 r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pipe_fds);
3371 if (r < 0)
3372 return log_debug_errno(errno, "Failed to create socket pair: %m");
3373
3374 /* Clone mount namespace here to unpeel without affecting live process */
3375 r = safe_fork("(sd-ns-unpeel)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, &pid);
3376 if (r < 0)
3377 return r;
3378 if (r == 0) {
3379 _cleanup_close_ int dir_fd = -EBADF;
3380
3381 pipe_fds[0] = safe_close(pipe_fds[0]);
3382
3383 /* Opportunistically unmount any overlay at this path */
3384 r = is_extension_overlay(mount_path, /* fd= */ -EBADF);
3385 if (r < 0) {
3386 log_debug_errno(r, "Unable to determine whether '%s' is an extension overlay: %m", mount_path);
3387 _exit(EXIT_FAILURE);
3388 }
3389 if (r > 0) {
3390 r = umount_recursive(mount_path, MNT_DETACH);
3391 if (r < 0)
3392 _exit(EXIT_FAILURE);
3393 if (r == 0) /* no umounts done, possible if a previous reload deleted all extensions */
3394 log_debug("No overlay layer unmountable from %s", mount_path);
3395 }
3396
3397 /* Now that /mount_path is exposed, get an FD for it and pass back */
3398 dir_fd = open_tree(-EBADF, mount_path, AT_SYMLINK_NOFOLLOW|OPEN_TREE_CLONE);
3399 if (dir_fd < 0) {
3400 log_debug_errno(errno, "Failed to clone mount %s: %m", mount_path);
3401 _exit(EXIT_FAILURE);
3402 }
3403
3404 r = fd_is_fs_type(dir_fd, OVERLAYFS_SUPER_MAGIC);
3405 if (r < 0) {
3406 log_debug_errno(r, "Unable to determine whether '%s' is an overlay after opening mount tree: %m", mount_path);
3407 _exit(EXIT_FAILURE);
3408 }
3409 if (r > 0) {
3410 log_debug_errno(r, "'%s' is still an overlay after opening mount tree: %m", mount_path);
3411 _exit(EXIT_FAILURE);
3412 }
3413
3414 r = send_one_fd(pipe_fds[1], dir_fd, 0);
3415 if (r < 0) {
3416 log_debug_errno(r, "Failed to send mount fd: %m");
3417 _exit(EXIT_FAILURE);
3418 }
3419
3420 _exit(EXIT_SUCCESS);
3421 }
3422
3423 pipe_fds[1] = safe_close(pipe_fds[1]);
3424
3425 r = receive_one_fd(pipe_fds[0], 0);
3426 if (r < 0)
3427 return log_debug_errno(r, "Failed to receive mount fd: %m");
3428 fs_fd = r;
3429
3430 r = fd_is_fs_type(fs_fd, OVERLAYFS_SUPER_MAGIC);
3431 if (r < 0)
3432 return log_debug_errno(r, "Unable to determine if unpeeled directory refers to overlayfs: %m");
3433 if (r > 0)
3434 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Unpeeled mount is still an overlayfs, something is weird, refusing.");
3435
3436 *ret_fd = TAKE_FD(fs_fd);
3437 return 0;
3438}
3439
3440/* In target namespace, unmounts an existing overlayfs at mount_path (if one exists), grabs FD from the
3441 * underlying directory, and sets up a new overlayfs mount. Coordinates with parent process over pair_fd:
3442 * 1. Creates and sends new overlay fs fd to parent
3443 * 2. Fake-unmounts overlay at mount_path to obtain underlying directory fd to build new overlay
3444 * 3. Waits for parent to configure layers
3445 * 4. Performs final mount at mount_path
3446 *
3447 * This is used by refresh_extensions_in_namespace() to peel back any existing overlays and reapply them.
3448 */
3449static int unpeel_mount_and_setup_overlay(int pair_fd, const char *mount_path) {
3450 _cleanup_close_ int dir_unpeeled_fd = -EBADF, overlay_fs_fd = -EBADF, mount_fd = -EBADF;
3451 int r;
3452
3453 assert(pair_fd >= 0);
3454 assert(mount_path);
3455
3456 /* Create new OverlayFS and send to parent */
3457 overlay_fs_fd = fsopen("overlay", FSOPEN_CLOEXEC);
3458 if (overlay_fs_fd < 0)
3459 return log_debug_errno(errno, "Failed to create overlay fs for %s: %m", mount_path);
3460
3461 r = send_one_fd(pair_fd, overlay_fs_fd, /* flags= */ 0);
3462 if (r < 0)
3463 return log_debug_errno(r, "Failed to send overlay fs fd to parent: %m");
3464
3465 /* Unpeel in cloned mount namespace to get underlying directory fd */
3466 r = unpeel_get_fd(mount_path, &dir_unpeeled_fd);
3467 if (r < 0)
3468 return log_debug_errno(r, "Failed to unpeel mount %s: %m", mount_path);
3469
3470 /* Send the fd to the parent */
3471 r = send_one_fd(pair_fd, dir_unpeeled_fd, /* flags= */ 0);
3472 if (r < 0)
3473 return log_debug_errno(r, "Failed to send %s fd to parent: %m", mount_path);
3474
3475 /* Wait for parent to signal overlay configuration completion */
3476 log_debug("Waiting for configured overlay fs for %s", mount_path);
3477 r = receive_one_fd(pair_fd, 0);
3478 if (r < 0)
3479 return log_debug_errno(r, "Failed to receive configured overlay: %m");
3480
3481 /* Create the mount */
3482 mount_fd = fsmount(overlay_fs_fd, FSMOUNT_CLOEXEC, /* flags= */ 0);
3483 if (mount_fd < 0)
3484 return log_debug_errno(errno, "Failed to create overlay mount: %m");
3485
3486 /* Move mount to final location */
3487 r = mount_exchange_graceful(mount_fd, mount_path, /* mount_beneath= */ true);
3488 if (r < 0)
3489 return log_debug_errno(r, "Failed to move overlay to %s: %m", mount_path);
3490
3491 return 0;
3492}
3493
3494static int refresh_grandchild_proc(
3495 const PidRef *target,
3496 MountList *ml,
3497 const char *overlay_prefix,
3498 int pidns_fd,
3499 int mntns_fd,
3500 int root_fd,
3501 int pipe_fd) {
3502
3503 int r;
3504
3505 assert(pidref_is_set(target));
3506 assert(ml);
3507 assert(overlay_prefix);
3508 assert(pidns_fd >= 0);
3509 assert(mntns_fd >= 0);
3510 assert(root_fd >= 0);
3511 assert(pipe_fd >= 0);
3512
3513 r = namespace_enter(pidns_fd, mntns_fd, /* netns_fd= */ -EBADF, /* userns_fd= */ -EBADF, root_fd);
3514 if (r < 0)
3515 return log_debug_errno(r, "Failed to enter namespace: %m");
3516
3517 /* Handle each overlay mount path */
3518 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
3519 if (m->mode != MOUNT_OVERLAY)
3520 continue;
3521
3522 /* Need an absolute path under the child namespace, rather than the root's */
3523 _cleanup_free_ char *mount_path = NULL;
3524 mount_path = path_join("/",
3525 path_startswith(mount_entry_unprefixed_path(m), overlay_prefix) ?:
3526 mount_entry_unprefixed_path(m));
3527 if (!mount_path)
3528 return log_oom_debug();
3529
3530 /* If there are no extensions mounted for this overlay layer, instead of setting everything
3531 * up, the correct behavior is to unmount the existing overlay in the target namespace to
3532 * expose the original files. */
3533 if (strv_isempty(m->overlay_layers)) {
3534 r = is_extension_overlay(mount_path, /* fd= */ -EBADF);
3535 if (r < 0)
3536 return log_debug_errno(r, "Unable to determine whether '%s' is an extension overlay: %m", mount_path);
3537 if (r == 0)
3538 continue;
3539
3540 log_debug("No extensions for %s, undoing existing mount", mount_path);
3541 (void) umount_recursive(mount_path, MNT_DETACH);
3542
3543 continue;
3544 }
3545
3546 r = unpeel_mount_and_setup_overlay(pipe_fd, mount_path);
3547 if (r < 0)
3548 return log_debug_errno(r, "Failed to setup overlay mount for %s: %m", mount_path);
3549 }
3550
3551 return 0;
3552}
3553
3554static int handle_mount_from_grandchild(
3555 MountEntry *m,
3556 const char *overlay_prefix,
3557 int **fd_layers,
3558 size_t *n_fd_layers,
3559 int pipe_fd) {
3560
3561 _cleanup_free_ char *layers = NULL, *options = NULL, *hierarchy_path_moved_mount = NULL;
3562 _cleanup_close_ int hierarchy_path_fd = -EBADF, overlay_fs_fd = -EBADF;
3563 _cleanup_strv_free_ char **new_layers = NULL;
3564 int r;
3565
3566 assert(m);
3567 assert(overlay_prefix);
3568 assert(fd_layers);
3569 assert(n_fd_layers);
3570 assert(pipe_fd >= 0);
3571
3572 if (m->mode != MOUNT_OVERLAY)
3573 return 0;
3574
3575 const char *mount_path = path_startswith(mount_entry_unprefixed_path(m), overlay_prefix);
3576 if (!mount_path)
3577 mount_path = mount_entry_unprefixed_path(m);
3578
3579 /* If there are no extensions mounted for this overlay layer, we only need to
3580 * unmount the existing overlay (this is handled in the grandchild process) and
3581 * would skip the usual cooperative processing here.
3582 */
3583 if (strv_isempty(m->overlay_layers)) {
3584 log_debug("No layers for %s, skip setting up overlay", mount_path);
3585 return 0;
3586 }
3587
3588 /* Receive the fds from grandchild */
3589 overlay_fs_fd = receive_one_fd(pipe_fd, 0);
3590 if (overlay_fs_fd < 0)
3591 return log_debug_errno(overlay_fs_fd, "Failed to receive overlay fs fd from grandchild: %m");
3592
3593 hierarchy_path_fd = receive_one_fd(pipe_fd, 0);
3594 if (hierarchy_path_fd < 0)
3595 return log_debug_errno(hierarchy_path_fd, "Failed to receive fd from grandchild for %s: %m", mount_path);
3596
3597 /* move_mount so that it is visible on our end. */
3598 hierarchy_path_moved_mount = path_join(overlay_prefix, mount_path);
3599 if (!hierarchy_path_moved_mount)
3600 return log_oom_debug();
3601
3602 (void) mkdir_p_label(hierarchy_path_moved_mount, 0555);
3603 r = move_mount(hierarchy_path_fd, "", AT_FDCWD, hierarchy_path_moved_mount, MOVE_MOUNT_F_EMPTY_PATH);
3604 if (r < 0)
3605 return log_debug_errno(r, "Failed to move mount for %s: %m", mount_path);
3606
3607 /* Turn all overlay layer directories into FD-based references */
3608 if (!GREEDY_REALLOC(*fd_layers, *n_fd_layers + strv_length(m->overlay_layers)))
3609 return log_oom_debug();
3610
3611 STRV_FOREACH(ol, m->overlay_layers) {
3612 _cleanup_close_ int tree_fd = -EBADF;
3613
3614 tree_fd = open_tree(-EBADF, *ol, /* flags= */ 0);
3615 if (tree_fd < 0)
3616 return log_debug_errno(errno, "Failed to open_tree overlay layer '%s': %m", *ol);
3617
3618 r = strv_extend(&new_layers, FORMAT_PROC_FD_PATH(tree_fd));
3619 if (r < 0)
3620 return log_oom_debug();
3621
3622 *fd_layers[(*n_fd_layers)++] = TAKE_FD(tree_fd);
3623 }
3624 m->overlay_layers = strv_free(m->overlay_layers);
3625 m->overlay_layers = TAKE_PTR(new_layers);
3626
3627 layers = strv_join(m->overlay_layers, ":");
3628 if (!layers)
3629 return log_oom_debug();
3630
3631 /* Append the underlying hierarchy path as the last lowerdir */
3632 options = strjoin(layers, ":", FORMAT_PROC_FD_PATH(hierarchy_path_fd));
3633 if (!options)
3634 return log_oom_debug();
3635
3636 if (fsconfig(overlay_fs_fd, FSCONFIG_SET_STRING, "lowerdir", options, 0) < 0)
3637 return log_debug_errno(errno, "Failed to set lowerdir=%s: %m", options);
3638
3639 if (fsconfig(overlay_fs_fd, FSCONFIG_SET_STRING, "source", "systemd-extensions", 0) < 0)
3640 return log_debug_errno(errno, "Failed to set source=systemd-extensions: %m");
3641
3642 /* Create the superblock */
3643 if (fsconfig(overlay_fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
3644 return log_debug_errno(errno, "Failed to create overlay superblock: %m");
3645
3646 /* Signal completion to grandchild */
3647 r = send_one_fd(pipe_fd, overlay_fs_fd, 0);
3648 if (r < 0)
3649 return log_debug_errno(r, "Failed to signal overlay configuration complete for %s: %m", mount_path);
3650
3651 return 0;
3652}
3653
3654static int refresh_apply_and_prune(const NamespaceParameters *p, MountList *ml) {
3655 int r;
3656
3657 assert(p);
3658 assert(ml);
3659
3660 /* Open all extensions on the host, drop all sysexts since they won't have /etc/. The list of
3661 * overlays also need to be updated, so that if it's empty after a confext has been removed, the
3662 * child process can correctly undo the overlay in the target namespace, rather than attempting to
3663 * mount an empty overlay which the kernel does not allow, so this pruning has to be done here and
3664 * not later (nor earlier, as we don't know if an image is a confext until this point). */
3665 MountEntry *f, *t;
3666 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
3667 if (IN_SET(f->mode, MOUNT_EXTENSION_DIRECTORY, MOUNT_EXTENSION_IMAGE)) {
3668 f->filter_class = IMAGE_CONFEXT;
3669
3670 r = apply_one_mount("/", f, p);
3671 if (r < 0)
3672 return r;
3673 /* Nothing happened? Then it is not a confext, prune it from the lists */
3674 if (r == 0) {
3675 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
3676 if (m->mode != MOUNT_OVERLAY)
3677 continue;
3678
3679 _cleanup_strv_free_ char **pruned = NULL;
3680
3681 STRV_FOREACH(ol, m->overlay_layers)
3682 if (!path_startswith(*ol, mount_entry_path(f))) {
3683 r = strv_extend(&pruned, *ol);
3684 if (r < 0)
3685 return log_oom_debug();
3686 }
3687 strv_free(m->overlay_layers);
3688 m->overlay_layers = TAKE_PTR(pruned);
3689 }
3690 mount_entry_done(f);
3691 continue;
3692 }
3693 }
3694
3695 *t = *f;
3696 t++;
3697 }
3698
3699 ml->n_mounts = t - ml->mounts;
3700
3701 return 0;
3702}
3703
3704int refresh_extensions_in_namespace(
3705 const PidRef *target,
3706 const char *hierarchy_env,
3707 const NamespaceParameters *p) {
3708
3709 _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF;
3710 const char *overlay_prefix = "/run/systemd/mount-rootfs";
3711 _cleanup_(mount_list_done) MountList ml = {};
3712 _cleanup_free_ char *extension_dir = NULL;
3713 _cleanup_strv_free_ char **hierarchies = NULL;
3714 int r;
3715
3716 assert(pidref_is_set(target));
3717 assert(hierarchy_env);
3718 assert(p);
3719
3720 log_debug("Refreshing extensions in-namespace for hierarchy '%s'", hierarchy_env);
3721
3722 r = pidref_namespace_open(target, &pidns_fd, &mntns_fd, /* ret_netns_fd= */ NULL, /* ret_userns_fd= */ NULL, &root_fd);
3723 if (r < 0)
3724 return log_debug_errno(r, "Failed to open namespace: %m");
3725
3726 r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
3727 if (r < 0)
3728 return log_debug_errno(r, "Failed to check if target namespace is separate: %m");
3729 if (r > 0)
3730 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Target namespace is not separate, cannot reload extensions");
3731
3732 extension_dir = path_join(p->private_namespace_dir, "unit-extensions");
3733 if (!extension_dir)
3734 return log_oom_debug();
3735
3736 r = parse_env_extension_hierarchies(&hierarchies, hierarchy_env);
3737 if (r < 0)
3738 return r;
3739
3740 r = append_extensions(
3741 &ml,
3742 overlay_prefix,
3743 p->private_namespace_dir,
3744 hierarchies,
3745 p->extension_images,
3746 p->n_extension_images,
3747 p->extension_directories);
3748 if (r < 0)
3749 return r;
3750
3751 sort_and_drop_unused_mounts(&ml, overlay_prefix);
3752 if (ml.n_mounts == 0)
3753 return 0;
3754
3755 /**
3756 * There are three main steps:
3757 * 1. In child, set up the extension images and directories in a slave mountns, so that we have
3758 * access to their FDs
3759 * 2. Fork into a grandchild, which will enter the target namespace and attempt to "unpeel" the
3760 * overlays to obtain FDs the underlying directories, over which we will reapply the overlays
3761 * 3. In the child again, receive the FDs and reapply the overlays
3762 */
3763 r = safe_fork("(sd-ns-refresh-exts)",
3764 FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE,
3765 NULL);
3766 if (r < 0)
3767 return r;
3768 if (r == 0) {
3769 /* Child (host namespace) */
3770 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
3771 _cleanup_(sigkill_waitp) pid_t grandchild_pid = 0;
3772
3773 (void) mkdir_p_label(overlay_prefix, 0555);
3774
3775 r = refresh_apply_and_prune(p, &ml);
3776 if (r < 0) {
3777 log_debug_errno(r, "Failed to apply extensions for refreshing: %m");
3778 _exit(EXIT_FAILURE);
3779 }
3780
3781 /* Create a grandchild process to handle the unmounting and reopening of hierarchy */
3782 r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair);
3783 if (r < 0) {
3784 log_debug_errno(errno, "Failed to create socket pair: %m");
3785 _exit(EXIT_FAILURE);
3786 }
3787
3788 r = safe_fork("(sd-ns-refresh-exts-grandchild)",
3789 FORK_LOG|FORK_DEATHSIG_SIGKILL,
3790 &grandchild_pid);
3791 if (r < 0)
3792 _exit(EXIT_FAILURE);
3793 if (r == 0) {
3794 /* Grandchild (target service namespace) */
3795 pair[0] = safe_close(pair[0]);
3796
3797 r = refresh_grandchild_proc(target, &ml, overlay_prefix, pidns_fd, mntns_fd, root_fd, pair[1]);
3798 if (r < 0) {
3799 pair[1] = safe_close(pair[1]);
3800 _exit(EXIT_FAILURE);
3801 }
3802
3803 _exit(EXIT_SUCCESS);
3804 }
3805
3806 pair[1] = safe_close(pair[1]);
3807
3808 /* Until kernel 6.15, the FDs to the individual layers used to set up the OverlayFS via
3809 * lowerdir=/proc/self/fd/X need to remain open until the OverlayFS mount is _attached_
3810 * (as opposed to merely created) to its mount point, hence we need to ensure these FDs
3811 * stay open until the grandchild has attached the mount and exited. */
3812 // TODO: once the kernel baseline is >= 6.15, move the FD array into the helper function
3813 // and close them immediately
3814 int *fd_layers = NULL;
3815 size_t n_fd_layers = 0;
3816 CLEANUP_ARRAY(fd_layers, n_fd_layers, close_many_and_free);
3817
3818 FOREACH_ARRAY(m, ml.mounts, ml.n_mounts) {
3819 r = handle_mount_from_grandchild(m, overlay_prefix, &fd_layers, &n_fd_layers, pair[0]);
3820 if (r < 0)
3821 _exit(EXIT_FAILURE);
3822 }
3823
3824 r = wait_for_terminate_and_check("(sd-ns-refresh-exts-grandchild)", TAKE_PID(grandchild_pid), 0);
3825 if (r < 0) {
3826 log_debug_errno(r, "Failed to wait for target namespace process to finish: %m");
3827 _exit(EXIT_FAILURE);
3828 }
3829 if (r != EXIT_SUCCESS) {
3830 log_debug("Target namespace fork did not succeed");
3831 _exit(EXIT_FAILURE);
3832 }
3833
3834 _exit(EXIT_SUCCESS);
3835 }
3836
3837 return 0;
3838}
3839
1b8689f9 3840static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
cbc056c8
ZJS
3841 [PROTECT_HOME_NO] = "no",
3842 [PROTECT_HOME_YES] = "yes",
1b8689f9 3843 [PROTECT_HOME_READ_ONLY] = "read-only",
cbc056c8 3844 [PROTECT_HOME_TMPFS] = "tmpfs",
417116f2
LP
3845};
3846
1e8c7bd5 3847DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
5e1c6154 3848
6746f288
RW
3849static const char *const protect_hostname_table[_PROTECT_HOSTNAME_MAX] = {
3850 [PROTECT_HOSTNAME_NO] = "no",
3851 [PROTECT_HOSTNAME_YES] = "yes",
cf48bde7 3852 [PROTECT_HOSTNAME_PRIVATE] = "private",
6746f288
RW
3853};
3854
3855DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_hostname, ProtectHostname, PROTECT_HOSTNAME_YES);
3856
1b8689f9 3857static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
cbc056c8
ZJS
3858 [PROTECT_SYSTEM_NO] = "no",
3859 [PROTECT_SYSTEM_YES] = "yes",
3860 [PROTECT_SYSTEM_FULL] = "full",
3f815163 3861 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
3862};
3863
1e8c7bd5 3864DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
03c791aa 3865
5fe29238
RW
3866static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
3867 [PROTECT_CONTROL_GROUPS_NO] = "no",
3868 [PROTECT_CONTROL_GROUPS_YES] = "yes",
cd58b5a1
RW
3869 [PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
3870 [PROTECT_CONTROL_GROUPS_STRICT] = "strict",
5fe29238
RW
3871};
3872
3873DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);
3874
4e399953
LP
3875static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
3876 [PROTECT_PROC_DEFAULT] = "default",
3877 [PROTECT_PROC_NOACCESS] = "noaccess",
3878 [PROTECT_PROC_INVISIBLE] = "invisible",
3879 [PROTECT_PROC_PTRACEABLE] = "ptraceable",
3880};
3881
3882DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
3883
3884static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
3885 [PROC_SUBSET_ALL] = "all",
3886 [PROC_SUBSET_PID] = "pid",
3887};
3888
3889DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
0e551b04
LB
3890
3891static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
2e8a581b 3892 [PRIVATE_TMP_NO] = "no",
0e551b04
LB
3893 [PRIVATE_TMP_CONNECTED] = "connected",
3894 [PRIVATE_TMP_DISCONNECTED] = "disconnected",
3895};
3896
9d50d053 3897DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_tmp, PrivateTmp, PRIVATE_TMP_CONNECTED);
fa693fdc
DDM
3898
3899static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
edd3f4d9 3900 [PRIVATE_USERS_NO] = "no",
fa693fdc
DDM
3901 [PRIVATE_USERS_SELF] = "self",
3902 [PRIVATE_USERS_IDENTITY] = "identity",
705cc829 3903 [PRIVATE_USERS_FULL] = "full",
fa693fdc
DDM
3904};
3905
3906DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
406f1775
DDM
3907
3908static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = {
3909 [PRIVATE_PIDS_NO] = "no",
3910 [PRIVATE_PIDS_YES] = "yes",
3911};
3912
3913DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);