]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
Merge pull request #29247 from naraghavan/naraghavan/dhcpv6-vendor-options
[thirdparty/systemd.git] / src / core / namespace.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <linux/loop.h>
5 #include <sched.h>
6 #include <stdio.h>
7 #include <sys/file.h>
8 #include <sys/mount.h>
9 #include <unistd.h>
10 #if WANT_LINUX_FS_H
11 #include <linux/fs.h>
12 #endif
13
14 #include "alloc-util.h"
15 #include "base-filesystem.h"
16 #include "chase.h"
17 #include "dev-setup.h"
18 #include "devnum-util.h"
19 #include "env-util.h"
20 #include "escape.h"
21 #include "extension-util.h"
22 #include "fd-util.h"
23 #include "format-util.h"
24 #include "glyph-util.h"
25 #include "label-util.h"
26 #include "list.h"
27 #include "lock-util.h"
28 #include "loop-util.h"
29 #include "loopback-setup.h"
30 #include "missing_syscall.h"
31 #include "mkdir-label.h"
32 #include "mount-util.h"
33 #include "mountpoint-util.h"
34 #include "namespace-util.h"
35 #include "namespace.h"
36 #include "nsflags.h"
37 #include "nulstr-util.h"
38 #include "os-util.h"
39 #include "path-util.h"
40 #include "selinux-util.h"
41 #include "socket-util.h"
42 #include "sort-util.h"
43 #include "stat-util.h"
44 #include "string-table.h"
45 #include "string-util.h"
46 #include "strv.h"
47 #include "tmpfile-util.h"
48 #include "umask-util.h"
49 #include "user-util.h"
50
/* Mount flags passed when mount_private_dev() mounts the tmpfs that backs the private /dev. */
51 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
52
/* The kind of mount operation a MountEntry describes. The numeric order is significant: for entries with
 * equal paths mount_path_compare() orders them by this value, and drop_duplicates() then keeps the first
 * one, i.e. the one with the smaller value. Do not reorder without considering that. */
53 typedef enum MountMode {
54 /* This is ordered by priority! Smaller value wins when two entries name the same path. */
55 MOUNT_INACCESSIBLE,
56 MOUNT_OVERLAY,
57 MOUNT_IMAGE,
58 MOUNT_BIND,
59 MOUNT_BIND_RECURSIVE,
60 MOUNT_PRIVATE_TMP,
61 MOUNT_PRIVATE_TMP_READ_ONLY,
62 MOUNT_PRIVATE_DEV,
63 MOUNT_BIND_DEV,
64 MOUNT_EMPTY_DIR,
65 MOUNT_PRIVATE_SYSFS,
66 MOUNT_BIND_SYSFS,
67 MOUNT_PROCFS,
68 MOUNT_READ_ONLY,
69 MOUNT_READ_WRITE,
70 MOUNT_NOEXEC,
71 MOUNT_EXEC,
72 MOUNT_TMPFS,
73 MOUNT_RUN,
74 MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */
75 MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */
76 MOUNT_MQUEUEFS,
77 MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
78 _MOUNT_MODE_MAX, /* Number of valid modes; used to size mount_mode_table[] */
79 _MOUNT_MODE_INVALID = -EINVAL,
80 } MountMode;
81
/* Processing state of a MountEntry. MOUNT_PENDING is the first enumerator (value 0), hence entries that
 * are zero-initialized or built from a compound literal without .state start out as pending.
 * drop_duplicates() only merges entries while both are still MOUNT_PENDING. */
82 typedef enum MountEntryState {
83 MOUNT_PENDING,
84 MOUNT_APPLIED,
85 MOUNT_SKIPPED,
86 _MOUNT_ENTRY_STATE_MAX,
87 _MOUNT_ENTRY_STATE_INVALID = -EINVAL,
88 } MountEntryState;
89
/* One mount operation to carry out while setting up the namespace.
 *
 * String fields come in pairs: a '_const' pointer that is merely borrowed (static tables, caller-owned
 * strings) and a '_malloc' pointer that is owned by the entry and released by mount_entry_done(). The
 * accessor helpers (mount_entry_path(), mount_entry_source(), mount_entry_options(), ...) prefer the
 * '_malloc' variant when it is set.
 *
 * NB: the static tables in this file initialize the first three members positionally
 * ({ path, mode, ignore }), so 'path_const', 'mode' and 'ignore' must stay first and in this order. */
90 typedef struct MountEntry {
91 const char *path_const; /* Memory allocated on stack or static */
92 MountMode mode; /* What kind of mount operation this is */
93 bool ignore:1; /* Ignore if path does not exist? */
94 bool has_prefix:1; /* Already is prefixed by the root dir? */
95 bool read_only:1; /* Shall this mount point be read-only? */
96 bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
97 bool noexec:1; /* Shall set MS_NOEXEC on the mount itself */
98 bool exec:1; /* Shall clear MS_NOEXEC on the mount itself */
99 MountEntryState state; /* Whether it was already processed or skipped */
100 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
101 const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
102 char *unprefixed_path_malloc; /* Same, but owned by the entry (set by mount_entry_consume_prefix()) */
103 const char *source_const; /* The source path, for bind mounts or images */
104 char *source_malloc; /* Same, but owned by the entry */
105 const char *options_const;/* Mount options for tmpfs */
106 char *options_malloc; /* Owned mount options; append_extensions() stores the overlay lowerdir list here */
107 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
108 unsigned n_followed; /* NOTE(review): not referenced in the visible part of this file; presumably a symlink-follow counter — confirm */
109 LIST_HEAD(MountOptions, image_options_const); /* Borrowed per-partition mount options for image mounts (taken from MountImage.mount_options) */
110 } MountEntry;
111
/* Growable array of MountEntry objects. Entries are added via mount_list_extend() and the whole list,
 * including the heap strings owned by each entry, is released with mount_list_done(). */
112 typedef struct MountList {
113 MountEntry *mounts; /* Array of entries, NULL while empty */
114 size_t n_mounts; /* Number of valid entries in 'mounts' */
115 } MountList;
116
117 /* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted
118 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
119 static const MountEntry apivfs_table[] = {
120 { "/proc", MOUNT_PROCFS, false },
121 { "/dev", MOUNT_BIND_DEV, false },
122 { "/sys", MOUNT_BIND_SYSFS, false },
123 { "/run", MOUNT_RUN, false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
124 };
125
126 /* ProtectKernelTunables= option and the related filesystem APIs */
127 static const MountEntry protect_kernel_tunables_proc_table[] = {
128 { "/proc/acpi", MOUNT_READ_ONLY, true },
129 { "/proc/apm", MOUNT_READ_ONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
130 { "/proc/asound", MOUNT_READ_ONLY, true },
131 { "/proc/bus", MOUNT_READ_ONLY, true },
132 { "/proc/fs", MOUNT_READ_ONLY, true },
133 { "/proc/irq", MOUNT_READ_ONLY, true },
134 { "/proc/kallsyms", MOUNT_INACCESSIBLE, true },
135 { "/proc/kcore", MOUNT_INACCESSIBLE, true },
136 { "/proc/latency_stats", MOUNT_READ_ONLY, true },
137 { "/proc/mtrr", MOUNT_READ_ONLY, true },
138 { "/proc/scsi", MOUNT_READ_ONLY, true },
139 { "/proc/sys", MOUNT_READ_ONLY, true },
140 { "/proc/sysrq-trigger", MOUNT_READ_ONLY, true },
141 { "/proc/timer_stats", MOUNT_READ_ONLY, true },
142 };
143
144 static const MountEntry protect_kernel_tunables_sys_table[] = {
145 { "/sys", MOUNT_READ_ONLY, false },
146 { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
147 { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
148 { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
149 { "/sys/kernel/debug", MOUNT_READ_ONLY, true },
150 { "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
151 };
152
153 /* ProtectKernelModules= option */
154 static const MountEntry protect_kernel_modules_table[] = {
155 { "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
156 };
157
158 /* ProtectKernelLogs= option */
159 static const MountEntry protect_kernel_logs_proc_table[] = {
160 { "/proc/kmsg", MOUNT_INACCESSIBLE, true },
161 };
162
163 static const MountEntry protect_kernel_logs_dev_table[] = {
164 { "/dev/kmsg", MOUNT_INACCESSIBLE, true },
165 };
166
167 /*
168 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
169 * system should be protected by ProtectSystem=
170 */
171 static const MountEntry protect_home_read_only_table[] = {
172 { "/home", MOUNT_READ_ONLY, true },
173 { "/run/user", MOUNT_READ_ONLY, true },
174 { "/root", MOUNT_READ_ONLY, true },
175 };
176
177 /* ProtectHome=tmpfs table */
178 static const MountEntry protect_home_tmpfs_table[] = {
179 { "/home", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
180 { "/run/user", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
181 { "/root", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
182 };
183
184 /* ProtectHome=yes table */
185 static const MountEntry protect_home_yes_table[] = {
186 { "/home", MOUNT_INACCESSIBLE, true },
187 { "/run/user", MOUNT_INACCESSIBLE, true },
188 { "/root", MOUNT_INACCESSIBLE, true },
189 };
190
191 /* ProtectSystem=yes table */
192 static const MountEntry protect_system_yes_table[] = {
193 { "/usr", MOUNT_READ_ONLY, false },
194 { "/boot", MOUNT_READ_ONLY, true },
195 { "/efi", MOUNT_READ_ONLY, true },
196 };
197
198 /* ProtectSystem=full includes ProtectSystem=yes */
199 static const MountEntry protect_system_full_table[] = {
200 { "/usr", MOUNT_READ_ONLY, false },
201 { "/boot", MOUNT_READ_ONLY, true },
202 { "/efi", MOUNT_READ_ONLY, true },
203 { "/etc", MOUNT_READ_ONLY, false },
204 };
205
206 /* ProtectSystem=strict table. In this strict mode, we mount everything read-only, except for /proc, /dev,
207 * /sys which are the kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
208 * protect those, and these options should be fully orthogonal. (And of course /home and friends are also
209 * left writable, as ProtectHome= shall manage those, orthogonally).
210 */
211 static const MountEntry protect_system_strict_table[] = {
212 { "/", MOUNT_READ_ONLY, false },
213 { "/proc", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
214 { "/sys", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
215 { "/dev", MOUNT_READ_WRITE_IMPLICIT, false }, /* PrivateDevices= */
216 { "/home", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
217 { "/run/user", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
218 { "/root", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
219 };
220
221 /* ProtectHostname=yes able */
222 static const MountEntry protect_hostname_table[] = {
223 { "/proc/sys/kernel/hostname", MOUNT_READ_ONLY, false },
224 { "/proc/sys/kernel/domainname", MOUNT_READ_ONLY, false },
225 };
226
/* Human-readable name for every MountMode value, indexed by the enum. Consumed through
 * mount_mode_to_string(), which this file uses in debug log messages. */
227 static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
228 [MOUNT_INACCESSIBLE] = "inaccessible",
229 [MOUNT_OVERLAY] = "overlay",
230 [MOUNT_IMAGE] = "image",
231 [MOUNT_BIND] = "bind",
232 [MOUNT_BIND_RECURSIVE] = "bind-recursive",
233 [MOUNT_PRIVATE_TMP] = "private-tmp",
234 [MOUNT_PRIVATE_TMP_READ_ONLY] = "private-tmp-read-only",
235 [MOUNT_PRIVATE_DEV] = "private-dev",
236 [MOUNT_BIND_DEV] = "bind-dev",
237 [MOUNT_EMPTY_DIR] = "empty-dir",
238 [MOUNT_PRIVATE_SYSFS] = "private-sysfs",
239 [MOUNT_BIND_SYSFS] = "bind-sysfs",
240 [MOUNT_PROCFS] = "procfs",
241 [MOUNT_READ_ONLY] = "read-only",
242 [MOUNT_READ_WRITE] = "read-write",
243 [MOUNT_NOEXEC] = "noexec",
244 [MOUNT_EXEC] = "exec",
245 [MOUNT_TMPFS] = "tmpfs",
246 [MOUNT_RUN] = "run",
247 [MOUNT_EXTENSION_DIRECTORY] = "extension-directory",
248 [MOUNT_EXTENSION_IMAGE] = "extension-image",
249 [MOUNT_MQUEUEFS] = "mqueuefs",
250 [MOUNT_READ_WRITE_IMPLICIT] = "read-write-implicit",
251 };
252
253 /* Helper struct for naming simplicity and reusability: per image class (indexed by IMAGE_SYSEXT / IMAGE_CONFEXT) the name of the "level" field, once bare and once in " NAME=" form. NOTE(review): the consumers are outside the visible part of this file; presumably the second form is for log output — confirm. */
254 static const struct {
255 const char *level_env;
256 const char *level_env_print;
257 } image_class_info[_IMAGE_CLASS_MAX] = {
258 [IMAGE_SYSEXT] = {
259 .level_env = "SYSEXT_LEVEL",
260 .level_env_print = " SYSEXT_LEVEL=",
261 },
262 [IMAGE_CONFEXT] = {
263 .level_env = "CONFEXT_LEVEL",
264 .level_env_print = " CONFEXT_LEVEL=",
265 }
266 };
267
/* Provides the file-local mount_mode_to_string() used by the debug log messages below.
 * NOTE(review): presumably backed by mount_mode_table[] via the string-table.h macro — confirm. */
268 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
269
270 static const char *mount_entry_path(const MountEntry *p) {
271 assert(p);
272
273 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
274 * otherwise the stack/static ->path field is returned. */
275
276 return p->path_malloc ?: p->path_const;
277 }
278
279 static const char *mount_entry_unprefixed_path(const MountEntry *p) {
280 assert(p);
281
282 /* Returns the unprefixed path (ie: before prefix_where_needed() ran), if any */
283
284 return p->unprefixed_path_malloc ?: p->unprefixed_path_const ?: mount_entry_path(p);
285 }
286
287 static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
288 assert(p);
289 assert(p->path_malloc || p->path_const);
290 assert(new_path);
291
292 /* Saves current path in unprefixed_ variable, and takes over new_path */
293
294 free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
295 /* If we didn't have a path on the heap, then it's a static one */
296 if (!p->unprefixed_path_malloc)
297 p->unprefixed_path_const = p->path_const;
298 p->path_malloc = new_path;
299 p->has_prefix = true;
300 }
301
302 static bool mount_entry_read_only(const MountEntry *p) {
303 assert(p);
304
305 return p->read_only || IN_SET(p->mode, MOUNT_READ_ONLY, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_TMP_READ_ONLY);
306 }
307
308 static bool mount_entry_noexec(const MountEntry *p) {
309 assert(p);
310
311 return p->noexec || IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS);
312 }
313
314 static bool mount_entry_exec(const MountEntry *p) {
315 assert(p);
316
317 return p->exec || p->mode == MOUNT_EXEC;
318 }
319
320 static const char *mount_entry_source(const MountEntry *p) {
321 assert(p);
322
323 return p->source_malloc ?: p->source_const;
324 }
325
326 static const char *mount_entry_options(const MountEntry *p) {
327 assert(p);
328
329 return p->options_malloc ?: p->options_const;
330 }
331
332 static void mount_entry_done(MountEntry *p) {
333 assert(p);
334
335 p->path_malloc = mfree(p->path_malloc);
336 p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
337 p->source_malloc = mfree(p->source_malloc);
338 p->options_malloc = mfree(p->options_malloc);
339 }
340
341 static void mount_list_done(MountList *ml) {
342 assert(ml);
343
344 FOREACH_ARRAY(m, ml->mounts, ml->n_mounts)
345 mount_entry_done(m);
346
347 ml->mounts = mfree(ml->mounts);
348 ml->n_mounts = 0;
349 }
350
351 static MountEntry *mount_list_extend(MountList *ml) {
352 assert(ml);
353
354 if (!GREEDY_REALLOC0(ml->mounts, ml->n_mounts+1))
355 return NULL;
356
357 return ml->mounts + ml->n_mounts++;
358 }
359
360 static int append_access_mounts(MountList *ml, char **strv, MountMode mode, bool forcibly_require_prefix) {
361 assert(ml);
362
363 /* Adds a list of user-supplied READ_WRITE/READ_WRITE_IMPLICIT/READ_ONLY/INACCESSIBLE entries */
364
365 STRV_FOREACH(i, strv) {
366 bool ignore = false, needs_prefix = false;
367 const char *e = *i;
368
369 /* Look for any prefixes */
370 if (startswith(e, "-")) {
371 e++;
372 ignore = true;
373 }
374 if (startswith(e, "+")) {
375 e++;
376 needs_prefix = true;
377 }
378
379 if (!path_is_absolute(e))
380 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", e);
381
382 MountEntry *me = mount_list_extend(ml);
383 if (!me)
384 return log_oom_debug();
385
386 *me = (MountEntry) {
387 .path_const = e,
388 .mode = mode,
389 .ignore = ignore,
390 .has_prefix = !needs_prefix && !forcibly_require_prefix,
391 };
392 }
393
394 return 0;
395 }
396
397 static int append_empty_dir_mounts(MountList *ml, char **strv) {
398 assert(ml);
399
400 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
401 * "/private/" boundary directories for DynamicUser=1. */
402
403 STRV_FOREACH(i, strv) {
404 MountEntry *me = mount_list_extend(ml);
405 if (!me)
406 return log_oom_debug();
407
408 *me = (MountEntry) {
409 .path_const = *i,
410 .mode = MOUNT_EMPTY_DIR,
411 .ignore = false,
412 .read_only = true,
413 .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
414 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
415 };
416 }
417
418 return 0;
419 }
420
421 static int append_bind_mounts(MountList *ml, const BindMount *binds, size_t n) {
422 assert(ml);
423 assert(binds || n == 0);
424
425 FOREACH_ARRAY(b, binds, n) {
426 MountEntry *me = mount_list_extend(ml);
427 if (!me)
428 return log_oom_debug();
429
430 *me = (MountEntry) {
431 .path_const = b->destination,
432 .mode = b->recursive ? MOUNT_BIND_RECURSIVE : MOUNT_BIND,
433 .read_only = b->read_only,
434 .nosuid = b->nosuid,
435 .source_const = b->source,
436 .ignore = b->ignore_enoent,
437 };
438 }
439
440 return 0;
441 }
442
443 static int append_mount_images(MountList *ml, const MountImage *mount_images, size_t n) {
444 assert(ml);
445 assert(mount_images || n == 0);
446
447 FOREACH_ARRAY(m, mount_images, n) {
448 MountEntry *me = mount_list_extend(ml);
449 if (!me)
450 return log_oom_debug();
451
452 *me = (MountEntry) {
453 .path_const = m->destination,
454 .mode = MOUNT_IMAGE,
455 .source_const = m->source,
456 .image_options_const = m->mount_options,
457 .ignore = m->ignore_enoent,
458 };
459 }
460
461 return 0;
462 }
463
464 static int append_extensions(
465 MountList *ml,
466 const char *root,
467 const char *extension_dir,
468 char **hierarchies,
469 const MountImage *mount_images,
470 size_t n,
471 char **extension_directories) {
472
473 _cleanup_strv_free_ char **overlays = NULL;
474 int r;
475
476 assert(ml);
477
478 if (n == 0 && strv_isempty(extension_directories))
479 return 0;
480
481 assert(extension_dir);
482
483 /* Prepare a list of overlays, that will have as each element a string suitable for being
484 * passed as a lowerdir= parameter, so start with the hierarchy on the root.
485 * The overlays vector will have the same number of elements and will correspond to the
486 * hierarchies vector, so they can be iterated upon together. */
487 STRV_FOREACH(hierarchy, hierarchies) {
488 _cleanup_free_ char *prefixed_hierarchy = NULL;
489
490 prefixed_hierarchy = path_join(root, *hierarchy);
491 if (!prefixed_hierarchy)
492 return -ENOMEM;
493
494 r = strv_consume(&overlays, TAKE_PTR(prefixed_hierarchy));
495 if (r < 0)
496 return r;
497 }
498
499 /* First, prepare a mount for each image, but these won't be visible to the unit, instead
500 * they will be mounted in our propagate directory, and used as a source for the overlay. */
501 for (size_t i = 0; i < n; i++) {
502 _cleanup_free_ char *mount_point = NULL;
503 const MountImage *m = mount_images + i;
504
505 if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0)
506 return -ENOMEM;
507
508 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
509 _cleanup_free_ char *prefixed_hierarchy = NULL, *escaped = NULL, *lowerdir = NULL;
510
511 prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
512 if (!prefixed_hierarchy)
513 return -ENOMEM;
514
515 escaped = shell_escape(prefixed_hierarchy, ",:");
516 if (!escaped)
517 return -ENOMEM;
518
519 /* Note that lowerdir= parameters are in 'reverse' order, so the
520 * top-most directory in the overlay comes first in the list. */
521 lowerdir = strjoin(escaped, ":", overlays[j]);
522 if (!lowerdir)
523 return -ENOMEM;
524
525 free_and_replace(overlays[j], lowerdir);
526 }
527
528 MountEntry *me = mount_list_extend(ml);
529 if (!me)
530 return log_oom_debug();
531
532 *me = (MountEntry) {
533 .path_malloc = TAKE_PTR(mount_point),
534 .image_options_const = m->mount_options,
535 .ignore = m->ignore_enoent,
536 .source_const = m->source,
537 .mode = MOUNT_EXTENSION_IMAGE,
538 .has_prefix = true,
539 };
540 }
541
542 /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
543 * Bind mount them in the same location as the ExtensionImages, so that we
544 * can check that they are valid trees (extension-release.d). */
545 STRV_FOREACH(extension_directory, extension_directories) {
546 _cleanup_free_ char *mount_point = NULL, *source = NULL;
547 const char *e = *extension_directory;
548 bool ignore_enoent = false;
549
550 /* Pick up the counter where the ExtensionImages left it. */
551 if (asprintf(&mount_point, "%s/%zu", extension_dir, n++) < 0)
552 return -ENOMEM;
553
554 /* Look for any prefixes */
555 if (startswith(e, "-")) {
556 e++;
557 ignore_enoent = true;
558 }
559 /* Ignore this for now */
560 if (startswith(e, "+"))
561 e++;
562
563 source = strdup(e);
564 if (!source)
565 return -ENOMEM;
566
567 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
568 _cleanup_free_ char *prefixed_hierarchy = NULL, *escaped = NULL, *lowerdir = NULL;
569
570 prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
571 if (!prefixed_hierarchy)
572 return -ENOMEM;
573
574 escaped = shell_escape(prefixed_hierarchy, ",:");
575 if (!escaped)
576 return -ENOMEM;
577
578 /* Note that lowerdir= parameters are in 'reverse' order, so the
579 * top-most directory in the overlay comes first in the list. */
580 lowerdir = strjoin(escaped, ":", overlays[j]);
581 if (!lowerdir)
582 return -ENOMEM;
583
584 free_and_replace(overlays[j], lowerdir);
585 }
586
587 MountEntry *me = mount_list_extend(ml);
588 if (!me)
589 return log_oom_debug();
590
591 *me = (MountEntry) {
592 .path_malloc = TAKE_PTR(mount_point),
593 .source_malloc = TAKE_PTR(source),
594 .mode = MOUNT_EXTENSION_DIRECTORY,
595 .ignore = ignore_enoent,
596 .has_prefix = true,
597 .read_only = true,
598 };
599 }
600
601 /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
602 * set up earlier. */
603 for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
604 _cleanup_free_ char *prefixed_hierarchy = NULL;
605
606 prefixed_hierarchy = path_join(root, hierarchies[i]);
607 if (!prefixed_hierarchy)
608 return -ENOMEM;
609
610 MountEntry *me = mount_list_extend(ml);
611 if (!me)
612 return log_oom_debug();
613
614 *me = (MountEntry) {
615 .path_malloc = TAKE_PTR(prefixed_hierarchy),
616 .options_malloc = TAKE_PTR(overlays[i]),
617 .mode = MOUNT_OVERLAY,
618 .has_prefix = true,
619 .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
620 };
621 }
622
623 return 0;
624 }
625
626 static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, size_t n) {
627 assert(ml);
628 assert(tmpfs || n == 0);
629
630 FOREACH_ARRAY(t, tmpfs, n) {
631 _cleanup_free_ char *o = NULL, *str = NULL;
632 unsigned long flags;
633 bool ro = false;
634 int r;
635
636 if (!path_is_absolute(t->path))
637 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", t->path);
638
639 str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
640 if (!str)
641 return -ENOMEM;
642
643 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
644 if (r < 0)
645 return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
646
647 ro = flags & MS_RDONLY;
648 if (ro)
649 flags ^= MS_RDONLY;
650
651 MountEntry *me = mount_list_extend(ml);
652 if (!me)
653 return log_oom_debug();
654
655 *me = (MountEntry) {
656 .path_const = t->path,
657 .mode = MOUNT_TMPFS,
658 .read_only = ro,
659 .options_malloc = TAKE_PTR(o),
660 .flags = flags,
661 };
662 }
663
664 return 0;
665 }
666
667 static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t n, bool ignore_protect) {
668 assert(ml);
669 assert(mounts || n == 0);
670
671 /* Adds a list of static pre-defined entries */
672
673 FOREACH_ARRAY(m, mounts, n) {
674 MountEntry *me = mount_list_extend(ml);
675 if (!me)
676 return log_oom_debug();
677
678 *me = (MountEntry) {
679 .path_const = mount_entry_path(m),
680 .mode = m->mode,
681 .ignore = m->ignore || ignore_protect,
682 };
683 }
684
685 return 0;
686 }
687
688 static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) {
689 assert(ml);
690
691 switch (protect_home) {
692
693 case PROTECT_HOME_NO:
694 return 0;
695
696 case PROTECT_HOME_READ_ONLY:
697 return append_static_mounts(ml, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
698
699 case PROTECT_HOME_TMPFS:
700 return append_static_mounts(ml, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
701
702 case PROTECT_HOME_YES:
703 return append_static_mounts(ml, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
704
705 default:
706 assert_not_reached();
707 }
708 }
709
710 static int append_protect_system(MountList *ml, ProtectSystem protect_system, bool ignore_protect) {
711 assert(ml);
712
713 switch (protect_system) {
714
715 case PROTECT_SYSTEM_NO:
716 return 0;
717
718 case PROTECT_SYSTEM_STRICT:
719 return append_static_mounts(ml, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
720
721 case PROTECT_SYSTEM_YES:
722 return append_static_mounts(ml, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
723
724 case PROTECT_SYSTEM_FULL:
725 return append_static_mounts(ml, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
726
727 default:
728 assert_not_reached();
729 }
730 }
731
732 static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
733 int d;
734
735 /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first
736 * regardless of the prefix - they are set up in the propagate directory anyway */
737 d = -CMP(a->mode == MOUNT_EXTENSION_IMAGE, b->mode == MOUNT_EXTENSION_IMAGE);
738 if (d != 0)
739 return d;
740 d = -CMP(a->mode == MOUNT_EXTENSION_DIRECTORY, b->mode == MOUNT_EXTENSION_DIRECTORY);
741 if (d != 0)
742 return d;
743
744 /* If the paths are not equal, then order prefixes first */
745 d = path_compare(mount_entry_path(a), mount_entry_path(b));
746 if (d != 0)
747 return d;
748
749 /* If the paths are equal, check the mode */
750 return CMP((int) a->mode, (int) b->mode);
751 }
752
753 static int prefix_where_needed(MountList *ml, const char *root_directory) {
754 /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
755
756 assert(ml);
757
758 FOREACH_ARRAY(me, ml->mounts, ml->n_mounts) {
759 char *s;
760
761 if (me->has_prefix)
762 continue;
763
764 s = path_join(root_directory, mount_entry_path(me));
765 if (!s)
766 return -ENOMEM;
767
768 mount_entry_consume_prefix(me, s);
769 }
770
771 return 0;
772 }
773
774 static void drop_duplicates(MountList *ml) {
775 MountEntry *f, *t, *previous;
776
777 assert(ml);
778
779 /* Drops duplicate entries. Expects that the array is properly ordered already. */
780
781 for (f = ml->mounts, t = ml->mounts, previous = NULL; f < ml->mounts + ml->n_mounts; f++) {
782
783 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
784 * above. Note that we only drop duplicates that haven't been mounted yet. */
785 if (previous &&
786 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
787 f->state == MOUNT_PENDING && previous->state == MOUNT_PENDING) {
788 log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
789 /* Propagate the flags to the remaining entry */
790 previous->read_only = previous->read_only || mount_entry_read_only(f);
791 previous->noexec = previous->noexec || mount_entry_noexec(f);
792 previous->exec = previous->exec || mount_entry_exec(f);
793 mount_entry_done(f);
794 continue;
795 }
796
797 *t = *f;
798 previous = t;
799 t++;
800 }
801
802 ml->n_mounts = t - ml->mounts;
803 }
804
805 static void drop_inaccessible(MountList *ml) {
806 MountEntry *f, *t;
807 const char *clear = NULL;
808
809 assert(ml);
810
811 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
812 * ordered already. */
813
814 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
815
816 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
817 * it, as inaccessible paths really should drop the entire subtree. */
818 if (clear && path_startswith(mount_entry_path(f), clear)) {
819 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
820 mount_entry_done(f);
821 continue;
822 }
823
824 clear = f->mode == MOUNT_INACCESSIBLE ? mount_entry_path(f) : NULL;
825
826 *t = *f;
827 t++;
828 }
829
830 ml->n_mounts = t - ml->mounts;
831 }
832
833 static void drop_nop(MountList *ml) {
834 MountEntry *f, *t;
835
836 assert(ml);
837
838 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
839 * list is ordered by prefixes. */
840
841 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
842
843 /* Only suppress such subtrees for READ_ONLY, READ_WRITE and READ_WRITE_IMPLICIT entries */
844 if (IN_SET(f->mode, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_READ_WRITE_IMPLICIT)) {
845 MountEntry *found = NULL;
846
847 /* Now let's find the first parent of the entry we are looking at. */
848 for (MountEntry *p = PTR_SUB1(t, ml->mounts); p; p = PTR_SUB1(p, ml->mounts))
849 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
850 found = p;
851 break;
852 }
853
854 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
855 if (found && found->mode == f->mode) {
856 log_debug("%s (%s) is made redundant by %s (%s)",
857 mount_entry_path(f), mount_mode_to_string(f->mode),
858 mount_entry_path(found), mount_mode_to_string(found->mode));
859 mount_entry_done(f);
860 continue;
861 }
862 }
863
864 *t = *f;
865 t++;
866 }
867
868 ml->n_mounts = t - ml->mounts;
869 }
870
871 static void drop_outside_root(MountList *ml, const char *root_directory) {
872 MountEntry *f, *t;
873
874 assert(ml);
875
876 /* Nothing to do */
877 if (!root_directory)
878 return;
879
880 /* Drops all mounts that are outside of the root directory. */
881
882 for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
883
884 /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host */
885 if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) && !path_startswith(mount_entry_path(f), root_directory)) {
886 log_debug("%s is outside of root directory.", mount_entry_path(f));
887 mount_entry_done(f);
888 continue;
889 }
890
891 *t = *f;
892 t++;
893 }
894
895 ml->n_mounts = t - ml->mounts;
896 }
897
898 static int clone_device_node(
899 const char *d,
900 const char *temporary_mount,
901 bool *make_devnode) {
902
903 _cleanup_free_ char *sl = NULL;
904 const char *dn, *bn, *t;
905 struct stat st;
906 int r;
907
908 if (stat(d, &st) < 0) {
909 if (errno == ENOENT) {
910 log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
911 return -ENXIO;
912 }
913
914 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
915 }
916
917 if (!S_ISBLK(st.st_mode) &&
918 !S_ISCHR(st.st_mode))
919 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
920 "Device node '%s' to clone is not a device node, ignoring.",
921 d);
922
923 dn = strjoina(temporary_mount, d);
924
925 /* First, try to create device node properly */
926 if (*make_devnode) {
927 mac_selinux_create_file_prepare(d, st.st_mode);
928 r = mknod(dn, st.st_mode, st.st_rdev);
929 mac_selinux_create_file_clear();
930 if (r >= 0)
931 goto add_symlink;
932 if (errno != EPERM)
933 return log_debug_errno(errno, "mknod failed for %s: %m", d);
934
935 /* This didn't work, let's not try this again for the next iterations. */
936 *make_devnode = false;
937 }
938
939 /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
940 * Do not prepare device-node SELinux label (see issue 13762) */
941 r = mknod(dn, S_IFREG, 0);
942 if (r < 0 && errno != EEXIST)
943 return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
944
945 /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
946 * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
947 * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
948 r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
949 if (r < 0)
950 return r;
951
952 add_symlink:
953 bn = path_startswith(d, "/dev/");
954 if (!bn)
955 return 0;
956
957 /* Create symlinks like /dev/char/1:9 → ../urandom */
958 if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
959 temporary_mount,
960 S_ISCHR(st.st_mode) ? "char" : "block",
961 DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
962 return log_oom_debug();
963
964 (void) mkdir_parents(sl, 0755);
965
966 t = strjoina("../", bn);
967 if (symlink(t, sl) < 0)
968 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
969
970 return 0;
971 }
972
973 static char *settle_runtime_dir(RuntimeScope scope) {
974 char *runtime_dir;
975
976 if (scope != RUNTIME_SCOPE_USER)
977 return strdup("/run/");
978
979 if (asprintf(&runtime_dir, "/run/user/" UID_FMT, geteuid()) < 0)
980 return NULL;
981
982 return runtime_dir;
983 }
984
985 static int create_temporary_mount_point(RuntimeScope scope, char **ret) {
986 _cleanup_free_ char *runtime_dir = NULL, *temporary_mount = NULL;
987
988 assert(ret);
989
990 runtime_dir = settle_runtime_dir(scope);
991 if (!runtime_dir)
992 return log_oom_debug();
993
994 temporary_mount = path_join(runtime_dir, "systemd/namespace-XXXXXX");
995 if (!temporary_mount)
996 return log_oom_debug();
997
998 if (!mkdtemp(temporary_mount))
999 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
1000
1001 *ret = TAKE_PTR(temporary_mount);
1002 return 0;
1003 }
1004
1005 static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
1006 static const char devnodes[] =
1007 "/dev/null\0"
1008 "/dev/zero\0"
1009 "/dev/full\0"
1010 "/dev/random\0"
1011 "/dev/urandom\0"
1012 "/dev/tty\0";
1013
1014 _cleanup_free_ char *temporary_mount = NULL;
1015 const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
1016 bool can_mknod = true;
1017 int r;
1018
1019 assert(m);
1020
1021 r = create_temporary_mount_point(scope, &temporary_mount);
1022 if (r < 0)
1023 return r;
1024
1025 dev = strjoina(temporary_mount, "/dev");
1026 (void) mkdir(dev, 0755);
1027 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
1028 if (r < 0)
1029 goto fail;
1030
1031 r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
1032 if (r < 0) {
1033 log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev);
1034 goto fail;
1035 }
1036
1037 devpts = strjoina(temporary_mount, "/dev/pts");
1038 (void) mkdir(devpts, 0755);
1039 r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
1040 if (r < 0)
1041 goto fail;
1042
1043 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
1044 * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
1045 * Thus, in that case make a clone.
1046 * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
1047 r = is_symlink("/dev/ptmx");
1048 if (r < 0) {
1049 log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
1050 goto fail;
1051 } else if (r > 0) {
1052 devptmx = strjoina(temporary_mount, "/dev/ptmx");
1053 if (symlink("pts/ptmx", devptmx) < 0) {
1054 r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
1055 goto fail;
1056 }
1057 } else {
1058 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
1059 if (r < 0)
1060 goto fail;
1061 }
1062
1063 devshm = strjoina(temporary_mount, "/dev/shm");
1064 (void) mkdir(devshm, 0755);
1065 r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
1066 if (r < 0)
1067 goto fail;
1068
1069 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
1070 (void) mkdir(devmqueue, 0755);
1071 (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
1072
1073 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
1074 (void) mkdir(devhugepages, 0755);
1075 (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
1076
1077 devlog = strjoina(temporary_mount, "/dev/log");
1078 if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
1079 log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
1080
1081 NULSTR_FOREACH(d, devnodes) {
1082 r = clone_device_node(d, temporary_mount, &can_mknod);
1083 /* ENXIO means the *source* is not a device file, skip creation in that case */
1084 if (r < 0 && r != -ENXIO)
1085 goto fail;
1086 }
1087
1088 r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
1089 if (r < 0)
1090 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
1091
1092 /* Make the bind mount read-only. */
1093 r = mount_nofollow_verbose(LOG_DEBUG, NULL, dev, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1094 if (r < 0)
1095 return r;
1096
1097 /* Create the /dev directory if missing. It is more likely to be missing when the service is started
1098 * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
1099 (void) mkdir_p_label(mount_entry_path(m), 0755);
1100
1101 /* Unmount everything in old /dev */
1102 r = umount_recursive(mount_entry_path(m), 0);
1103 if (r < 0)
1104 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
1105
1106 r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
1107 if (r < 0)
1108 goto fail;
1109
1110 (void) rmdir(dev);
1111 (void) rmdir(temporary_mount);
1112
1113 return 1;
1114
1115 fail:
1116 if (devpts)
1117 (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
1118
1119 if (devshm)
1120 (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
1121
1122 if (devhugepages)
1123 (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
1124
1125 if (devmqueue)
1126 (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
1127
1128 (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
1129 (void) rmdir(dev);
1130 (void) rmdir(temporary_mount);
1131
1132 return r;
1133 }
1134
1135 static int mount_bind_dev(const MountEntry *m) {
1136 int r;
1137
1138 assert(m);
1139
1140 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the
1141 * service's /dev. This is only used when RootDirectory= is set. */
1142
1143 (void) mkdir_p_label(mount_entry_path(m), 0755);
1144
1145 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
1146 if (r < 0)
1147 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
1148 if (r > 0) /* make this a NOP if /dev is already a mount point */
1149 return 0;
1150
1151 r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1152 if (r < 0)
1153 return r;
1154
1155 return 1;
1156 }
1157
1158 static int mount_bind_sysfs(const MountEntry *m) {
1159 int r;
1160
1161 assert(m);
1162
1163 (void) mkdir_p_label(mount_entry_path(m), 0755);
1164
1165 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
1166 if (r < 0)
1167 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
1168 if (r > 0) /* make this a NOP if /sys is already a mount point */
1169 return 0;
1170
1171 /* Bind mount the host's version so that we get all child mounts of it, too. */
1172 r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1173 if (r < 0)
1174 return r;
1175
1176 return 1;
1177 }
1178
1179 static int mount_private_apivfs(
1180 const char *fstype,
1181 const char *entry_path,
1182 const char *bind_source,
1183 const char *opts,
1184 RuntimeScope scope) {
1185
1186 _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
1187 int r;
1188
1189 assert(fstype);
1190 assert(entry_path);
1191 assert(bind_source);
1192
1193 (void) mkdir_p_label(entry_path, 0755);
1194
1195 /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance
1196 * cannot be mounted on an already existing mount. Let's use a temporary place. */
1197 r = create_temporary_mount_point(scope, &temporary_mount);
1198 if (r < 0)
1199 return r;
1200
1201 r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
1202 if (r == -EINVAL && opts)
1203 /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
1204 * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
1205 * really don't want to use it, since it would affect our host's /proc mount. Hence let's
1206 * gracefully fallback to a classic, unrestricted version. */
1207 r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
1208 if (ERRNO_IS_NEG_PRIVILEGE(r)) {
1209 /* When we do not have enough privileges to mount a new instance, fall back to use an
1210 * existing mount. */
1211
1212 r = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0);
1213 if (r < 0)
1214 return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path);
1215 if (r > 0)
1216 return 0; /* Use the current mount as is. */
1217
1218 /* We lack permissions to mount a new instance, and it is not already mounted. But we can
1219 * access the host's, so as a final fallback bind-mount it to the destination, as most likely
1220 * we are inside a user manager in an unprivileged user namespace. */
1221 r = mount_nofollow_verbose(LOG_DEBUG, bind_source, entry_path, /* fstype = */ NULL, MS_BIND|MS_REC, /* opts = */ NULL);
1222 if (r < 0)
1223 return r;
1224
1225 return 1;
1226
1227 } else if (r < 0)
1228 return r;
1229
1230 /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
1231 r = umount_recursive(entry_path, /* flags = */ 0);
1232 if (r < 0)
1233 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path);
1234
1235 /* Then, move the new mount instance. */
1236 r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype = */ NULL, MS_MOVE, /* opts = */ NULL);
1237 if (r < 0)
1238 return r;
1239
1240 /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
1241 * where a bunch of files are overmounted, in particular the boot id. */
1242 (void) bind_mount_submounts(bind_source, entry_path);
1243 return 1;
1244 }
1245
1246 static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) {
1247 assert(m);
1248 assert(p);
1249 return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
1250 }
1251
1252 static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
1253 _cleanup_free_ char *opts = NULL;
1254
1255 assert(m);
1256 assert(p);
1257
1258 if (p->protect_proc != PROTECT_PROC_DEFAULT ||
1259 p->proc_subset != PROC_SUBSET_ALL) {
1260
1261 /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
1262 * pretended to be per-instance but actually was per-namespace), hence let's make use of it
1263 * if requested. To make sure this logic succeeds only on kernels where hidepid= is
1264 * per-instance, we'll exclusively use the textual value for hidepid=, since support was
1265 * added in the same commit: if it's supported it is thus also per-instance. */
1266
1267 const char *hpv = p->protect_proc == PROTECT_PROC_DEFAULT ?
1268 "off" :
1269 protect_proc_to_string(p->protect_proc);
1270
1271 /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
1272 * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
1273 * trying to use hidepid= on systems where it isn't supported. The same applies for subset=.
1274 * fsopen()/fsconfig() was also backported on some distros which allows us to detect
1275 * hidepid=/subset= support in even more scenarios. */
1276
1277 if (mount_option_supported("proc", "hidepid", hpv) != 0) {
1278 opts = strjoin("hidepid=", hpv);
1279 if (!opts)
1280 return -ENOMEM;
1281 }
1282
1283 if (p->proc_subset == PROC_SUBSET_PID &&
1284 mount_option_supported("proc", "subset", "pid") != 0)
1285 if (!strextend_with_separator(&opts, ",", "subset=pid"))
1286 return -ENOMEM;
1287 }
1288
1289 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
1290 * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
1291 * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
1292 * mounted on /proc/ first. */
1293 return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts, p->runtime_scope);
1294 }
1295
1296 static int mount_tmpfs(const MountEntry *m) {
1297 const char *entry_path, *inner_path;
1298 int r;
1299
1300 assert(m);
1301
1302 entry_path = mount_entry_path(m);
1303 inner_path = mount_entry_unprefixed_path(m);
1304
1305 /* First, get rid of everything that is below if there is anything. Then, overmount with our new
1306 * tmpfs */
1307
1308 (void) mkdir_p_label(entry_path, 0755);
1309 (void) umount_recursive(entry_path, 0);
1310
1311 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
1312 if (r < 0)
1313 return r;
1314
1315 r = label_fix_full(AT_FDCWD, entry_path, inner_path, 0);
1316 if (r < 0)
1317 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
1318
1319 return 1;
1320 }
1321
1322 static int mount_run(const MountEntry *m) {
1323 int r;
1324
1325 assert(m);
1326
1327 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
1328 if (r < 0 && r != -ENOENT)
1329 return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
1330 if (r > 0) /* make this a NOP if /run is already a mount point */
1331 return 0;
1332
1333 return mount_tmpfs(m);
1334 }
1335
1336 static int mount_mqueuefs(const MountEntry *m) {
1337 int r;
1338 const char *entry_path;
1339
1340 assert(m);
1341
1342 entry_path = mount_entry_path(m);
1343
1344 (void) mkdir_p_label(entry_path, 0755);
1345 (void) umount_recursive(entry_path, 0);
1346
1347 r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", entry_path, "mqueue", m->flags, mount_entry_options(m));
1348 if (r < 0)
1349 return r;
1350
1351 return 1;
1352 }
1353
1354 static int mount_image(
1355 const MountEntry *m,
1356 const char *root_directory,
1357 const ImagePolicy *image_policy) {
1358
1359 _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
1360 *host_os_release_sysext_level = NULL, *host_os_release_confext_level = NULL,
1361 *extension_name = NULL;
1362 int r;
1363
1364 assert(m);
1365
1366 r = path_extract_filename(mount_entry_source(m), &extension_name);
1367 if (r < 0)
1368 return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
1369
1370 if (m->mode == MOUNT_EXTENSION_IMAGE) {
1371 r = parse_os_release(
1372 empty_to_root(root_directory),
1373 "ID", &host_os_release_id,
1374 "VERSION_ID", &host_os_release_version_id,
1375 image_class_info[IMAGE_SYSEXT].level_env, &host_os_release_sysext_level,
1376 image_class_info[IMAGE_CONFEXT].level_env, &host_os_release_confext_level,
1377 NULL);
1378 if (r < 0)
1379 return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1380 if (isempty(host_os_release_id))
1381 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1382 }
1383
1384 r = verity_dissect_and_mount(
1385 /* src_fd= */ -1,
1386 mount_entry_source(m),
1387 mount_entry_path(m),
1388 m->image_options_const,
1389 image_policy,
1390 host_os_release_id,
1391 host_os_release_version_id,
1392 host_os_release_sysext_level,
1393 host_os_release_confext_level,
1394 /* required_sysext_scope= */ NULL,
1395 /* ret_image= */ NULL);
1396 if (r == -ENOENT && m->ignore)
1397 return 0;
1398 if (r == -ESTALE && host_os_release_id)
1399 return log_error_errno(r, // FIXME: this should not be logged ad LOG_ERR, as it will result in duplicate logging.
1400 "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s%s%s%s%s%s%s",
1401 mount_entry_source(m),
1402 host_os_release_id,
1403 host_os_release_version_id ? " VERSION_ID=" : "",
1404 strempty(host_os_release_version_id),
1405 host_os_release_sysext_level ? image_class_info[IMAGE_SYSEXT].level_env_print : "",
1406 strempty(host_os_release_sysext_level),
1407 host_os_release_confext_level ? image_class_info[IMAGE_CONFEXT].level_env_print : "",
1408 strempty(host_os_release_confext_level));
1409 if (r < 0)
1410 return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
1411
1412 return 1;
1413 }
1414
1415 static int mount_overlay(const MountEntry *m) {
1416 const char *options;
1417 int r;
1418
1419 assert(m);
1420
1421 options = strjoina("lowerdir=", mount_entry_options(m));
1422
1423 (void) mkdir_p_label(mount_entry_path(m), 0755);
1424
1425 r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
1426 if (r == -ENOENT && m->ignore)
1427 return 0;
1428 if (r < 0)
1429 return r;
1430
1431 return 1;
1432 }
1433
1434 static int follow_symlink(
1435 const char *root_directory,
1436 MountEntry *m) {
1437
1438 _cleanup_free_ char *target = NULL;
1439 int r;
1440
1441 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
1442 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
1443 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
1444 * end and already have a fully normalized name. */
1445
1446 r = chase(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
1447 if (r < 0)
1448 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
1449 if (r > 0) /* Reached the end, nothing more to resolve */
1450 return 1;
1451
1452 if (m->n_followed >= CHASE_MAX) /* put a boundary on things */
1453 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1454 "Symlink loop on '%s'.",
1455 mount_entry_path(m));
1456
1457 log_debug("Followed mount entry path symlink %s %s %s.",
1458 mount_entry_path(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), target);
1459
1460 mount_entry_consume_prefix(m, TAKE_PTR(target));
1461
1462 m->n_followed ++;
1463
1464 return 0;
1465 }
1466
1467 static int apply_one_mount(
1468 const char *root_directory,
1469 MountEntry *m,
1470 const NamespaceParameters *p) {
1471
1472 _cleanup_free_ char *inaccessible = NULL;
1473 bool rbind = true, make = false;
1474 const char *what;
1475 int r;
1476
1477 /* Return 1 when the mount should be post-processed (remounted r/o, etc.), 0 otherwise. In most
1478 * cases post-processing is the right thing, the typical exception is when the mount is gracefully
1479 * skipped. */
1480
1481 assert(m);
1482 assert(p);
1483
1484 log_debug("Applying namespace mount on %s", mount_entry_path(m));
1485
1486 switch (m->mode) {
1487
1488 case MOUNT_INACCESSIBLE: {
1489 _cleanup_free_ char *runtime_dir = NULL;
1490 struct stat target;
1491
1492 /* First, get rid of everything that is below if there
1493 * is anything... Then, overmount it with an
1494 * inaccessible path. */
1495 (void) umount_recursive(mount_entry_path(m), 0);
1496
1497 if (lstat(mount_entry_path(m), &target) < 0) {
1498 if (errno == ENOENT && m->ignore)
1499 return 0;
1500
1501 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
1502 mount_entry_path(m));
1503 }
1504
1505 /* We don't pass the literal runtime scope through here but one based purely on our UID. This
1506 * means that the root user's --user services will use the host's inaccessible inodes rather
1507 * then root's private ones. This is preferable since it means device nodes that are
1508 * overmounted to make them inaccessible will be overmounted with a device node, rather than
1509 * an AF_UNIX socket inode. */
1510 runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
1511 if (!runtime_dir)
1512 return log_oom_debug();
1513
1514 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
1515 if (r < 0)
1516 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1517 "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
1518 what = inaccessible;
1519 break;
1520 }
1521
1522 case MOUNT_READ_ONLY:
1523 case MOUNT_READ_WRITE:
1524 case MOUNT_READ_WRITE_IMPLICIT:
1525 case MOUNT_EXEC:
1526 case MOUNT_NOEXEC:
1527 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
1528 if (r == -ENOENT && m->ignore)
1529 return 0;
1530 if (r < 0)
1531 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
1532 mount_entry_path(m));
1533 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
1534 * and MS_NOEXEC bits for the mount point if needed. */
1535 return 1;
1536 /* This isn't a mount point yet, let's make it one. */
1537 what = mount_entry_path(m);
1538 break;
1539
1540 case MOUNT_EXTENSION_DIRECTORY: {
1541 _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
1542 *host_os_release_level = NULL, *extension_name = NULL;
1543 _cleanup_strv_free_ char **extension_release = NULL;
1544 ImageClass class = IMAGE_SYSEXT;
1545
1546 r = path_extract_filename(mount_entry_source(m), &extension_name);
1547 if (r < 0)
1548 return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
1549
1550 r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
1551 if (r == -ENOENT) {
1552 r = load_extension_release_pairs(mount_entry_source(m), IMAGE_CONFEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
1553 if (r >= 0)
1554 class = IMAGE_CONFEXT;
1555 }
1556 if (r < 0)
1557 return log_debug_errno(r, "Failed to acquire 'extension-release' data of extension tree %s: %m", mount_entry_source(m));
1558
1559 r = parse_os_release(
1560 empty_to_root(root_directory),
1561 "ID", &host_os_release_id,
1562 "VERSION_ID", &host_os_release_version_id,
1563 image_class_info[class].level_env, &host_os_release_level,
1564 NULL);
1565 if (r < 0)
1566 return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1567 if (isempty(host_os_release_id))
1568 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1569
1570 r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release);
1571 if (r == -ENOENT && m->ignore)
1572 return 0;
1573 if (r < 0)
1574 return log_debug_errno(r, "Failed to parse directory %s extension-release metadata: %m", extension_name);
1575
1576 r = extension_release_validate(
1577 extension_name,
1578 host_os_release_id,
1579 host_os_release_version_id,
1580 host_os_release_level,
1581 /* host_extension_scope */ NULL, /* Leave empty, we need to accept both system and portable */
1582 extension_release,
1583 class);
1584 if (r == 0)
1585 return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
1586 if (r < 0)
1587 return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
1588
1589 _fallthrough_;
1590 }
1591
1592 case MOUNT_BIND:
1593 rbind = false;
1594
1595 _fallthrough_;
1596 case MOUNT_BIND_RECURSIVE: {
1597 _cleanup_free_ char *chased = NULL;
1598
1599 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
1600 * that bind mount source paths are always relative to the host root, hence we pass NULL as
1601 * root directory to chase() here. */
1602
1603 r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
1604 if (r == -ENOENT && m->ignore) {
1605 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
1606 return 0;
1607 }
1608 if (r < 0)
1609 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
1610
1611 log_debug("Followed source symlinks %s %s %s.",
1612 mount_entry_source(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), chased);
1613
1614 free_and_replace(m->source_malloc, chased);
1615
1616 what = mount_entry_source(m);
1617 make = true;
1618 break;
1619 }
1620
1621 case MOUNT_EMPTY_DIR:
1622 case MOUNT_TMPFS:
1623 return mount_tmpfs(m);
1624
1625 case MOUNT_PRIVATE_TMP:
1626 case MOUNT_PRIVATE_TMP_READ_ONLY:
1627 what = mount_entry_source(m);
1628 make = true;
1629 break;
1630
1631 case MOUNT_PRIVATE_DEV:
1632 return mount_private_dev(m, p->runtime_scope);
1633
1634 case MOUNT_BIND_DEV:
1635 return mount_bind_dev(m);
1636
1637 case MOUNT_PRIVATE_SYSFS:
1638 return mount_private_sysfs(m, p);
1639
1640 case MOUNT_BIND_SYSFS:
1641 return mount_bind_sysfs(m);
1642
1643 case MOUNT_PROCFS:
1644 return mount_procfs(m, p);
1645
1646 case MOUNT_RUN:
1647 return mount_run(m);
1648
1649 case MOUNT_MQUEUEFS:
1650 return mount_mqueuefs(m);
1651
1652 case MOUNT_IMAGE:
1653 return mount_image(m, NULL, p->mount_image_policy);
1654
1655 case MOUNT_EXTENSION_IMAGE:
1656 return mount_image(m, root_directory, p->extension_image_policy);
1657
1658 case MOUNT_OVERLAY:
1659 return mount_overlay(m);
1660
1661 default:
1662 assert_not_reached();
1663 }
1664
1665 assert(what);
1666
1667 r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1668 if (r < 0) {
1669 bool try_again = false;
1670
1671 if (r == -ENOENT && make) {
1672 int q;
1673
1674 /* Hmm, either the source or the destination are missing. Let's see if we can create
1675 the destination, then try again. */
1676
1677 (void) mkdir_parents(mount_entry_path(m), 0755);
1678
1679 q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
1680 if (q < 0) {
1681 if (q != -EEXIST) // FIXME: this shouldn't be logged at LOG_WARNING, but be bubbled up, and logged there to avoid duplicate logging
1682 log_warning_errno(q, "Failed to create destination mount point node '%s', ignoring: %m",
1683 mount_entry_path(m));
1684 } else
1685 try_again = true;
1686 }
1687
1688 if (try_again)
1689 r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1690 if (r < 0)
1691 return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); // FIXME: this should not be logged here, but be bubbled up, to avoid duplicate logging
1692 }
1693
1694 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1695 return 1;
1696 }
1697
1698 static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
1699 unsigned long new_flags = 0, flags_mask = 0;
1700 bool submounts;
1701 int r;
1702
1703 assert(m);
1704 assert(proc_self_mountinfo);
1705
1706 if (m->state != MOUNT_APPLIED)
1707 return 0;
1708
1709 if (mount_entry_read_only(m) || m->mode == MOUNT_PRIVATE_DEV) {
1710 new_flags |= MS_RDONLY;
1711 flags_mask |= MS_RDONLY;
1712 }
1713
1714 if (m->nosuid) {
1715 new_flags |= MS_NOSUID;
1716 flags_mask |= MS_NOSUID;
1717 }
1718
1719 if (flags_mask == 0) /* No Change? */
1720 return 0;
1721
1722 /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1723 * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1724 * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
1725 * and running Linux <= 4.17. */
1726 submounts =
1727 mount_entry_read_only(m) &&
1728 !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
1729 if (submounts)
1730 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
1731 else
1732 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
1733
1734 /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1735 * read-only already stays this way. This improves compatibility with container managers, where we
1736 * won't attempt to undo read-only mounts already applied. */
1737
1738 if (r == -ENOENT && m->ignore)
1739 return 0;
1740 if (r < 0)
1741 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1742 submounts ? " and its submounts" : "");
1743 return 0;
1744 }
1745
1746 static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
1747 unsigned long new_flags = 0, flags_mask = 0;
1748 bool submounts;
1749 int r;
1750
1751 assert(m);
1752 assert(proc_self_mountinfo);
1753
1754 if (m->state != MOUNT_APPLIED)
1755 return 0;
1756
1757 if (mount_entry_noexec(m)) {
1758 new_flags |= MS_NOEXEC;
1759 flags_mask |= MS_NOEXEC;
1760 } else if (mount_entry_exec(m)) {
1761 new_flags &= ~MS_NOEXEC;
1762 flags_mask |= MS_NOEXEC;
1763 }
1764
1765 if (flags_mask == 0) /* No Change? */
1766 return 0;
1767
1768 submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
1769
1770 if (submounts)
1771 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
1772 else
1773 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
1774
1775 if (r == -ENOENT && m->ignore)
1776 return 0;
1777 if (r < 0)
1778 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1779 submounts ? " and its submounts" : "");
1780 return 0;
1781 }
1782
1783 static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
1784 bool submounts;
1785 int r;
1786
1787 assert(m);
1788 assert(proc_self_mountinfo);
1789
1790 if (m->state != MOUNT_APPLIED)
1791 return 0;
1792
1793 submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
1794 if (submounts)
1795 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo);
1796 else
1797 r = bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
1798 if (r == -ENOENT && m->ignore)
1799 return 0;
1800 if (r < 0)
1801 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1802 submounts ? " and its submounts" : "");
1803 return 0;
1804 }
1805
1806 static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
1807 assert(p);
1808
1809 /*
1810 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1811 * since to protect the API VFS mounts, they need to be around in the
1812 * first place...
1813 */
1814
1815 return p->mount_apivfs ||
1816 p->protect_control_groups ||
1817 p->protect_kernel_tunables ||
1818 p->protect_proc != PROTECT_PROC_DEFAULT ||
1819 p->proc_subset != PROC_SUBSET_ALL;
1820 }
1821
1822 /* Walk all mount entries and dropping any unused mounts. This affects all
1823 * mounts:
1824 * - that are implicitly protected by a path that has been rendered inaccessible
1825 * - whose immediate parent requests the same protection mode as the mount itself
1826 * - that are outside of the relevant root directory
1827 * - which are duplicates
1828 */
1829 static void drop_unused_mounts(MountList *ml, const char *root_directory) {
1830 assert(ml);
1831 assert(root_directory);
1832
1833 assert(ml->mounts || ml->n_mounts == 0);
1834
1835 typesafe_qsort(ml->mounts, ml->n_mounts, mount_path_compare);
1836
1837 drop_duplicates(ml);
1838 drop_outside_root(ml, root_directory);
1839 drop_inaccessible(ml);
1840 drop_nop(ml);
1841 }
1842
1843 static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
1844 int r;
1845
1846 STRV_FOREACH_PAIR(src, dst, strv_symlinks) {
1847 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
1848
1849 src_abs = path_join(root, *src);
1850 dst_abs = path_join(root, *dst);
1851 if (!src_abs || !dst_abs)
1852 return -ENOMEM;
1853
1854 r = mkdir_parents_label(dst_abs, 0755);
1855 if (r < 0)
1856 return log_debug_errno(
1857 r,
1858 "Failed to create parent directory for symlink '%s': %m",
1859 dst_abs);
1860
1861 r = symlink_idempotent(src_abs, dst_abs, true);
1862 if (r < 0)
1863 return log_debug_errno(
1864 r,
1865 "Failed to create symlink from '%s' to '%s': %m",
1866 src_abs,
1867 dst_abs);
1868 }
1869
1870 return 0;
1871 }
1872
1873 static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
1874 assert(m);
1875
1876 /* Create a string suitable for debugging logs, stripping for example the local working directory.
1877 * For example, with a BindPaths=/var/bar that does not exist on the host:
1878 *
1879 * Before:
1880 * foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
1881 * After:
1882 * foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
1883 *
1884 * Note that this is an error path, so no OOM check is done on purpose. */
1885
1886 if (!error_path)
1887 return;
1888
1889 if (!mount_entry_path(m)) {
1890 *error_path = NULL;
1891 return;
1892 }
1893
1894 if (root) {
1895 const char *e = startswith(mount_entry_path(m), root);
1896 if (e) {
1897 *error_path = strdup(e);
1898 return;
1899 }
1900 }
1901
1902 *error_path = strdup(mount_entry_path(m));
1903 return;
1904 }
1905
/* Applies all entries of the mount list 'ml' below 'root', in several passes:
 *
 *   1. establish every pending mount (restarting whenever a symlinked mount point was rewritten),
 *   2. create the symlinks configured in p->symlinks,
 *   3. flip the read-only bits,
 *   4. flip the noexec bits,
 *   5. flip the nosuid bits (only if p->mount_nosuid is set).
 *
 * Returns 0 if the list is empty and nothing was done, 1 if the mounts were applied, and a negative
 * errno-style value on failure. On failure, and if 'error_path' is non-NULL, *error_path is set to a
 * newly allocated string naming the offending path (or to NULL if that could not be determined or
 * allocated). */
static int apply_mounts(
                MountList *ml,
                const char *root,
                const NamespaceParameters *p,
                char **error_path) {

        _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
        _cleanup_free_ char **deny_list = NULL;
        int r;

        assert(ml);
        assert(root);
        assert(p);

        if (ml->n_mounts == 0) /* Shortcut: nothing to do */
                return 0;

        /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
         * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
        proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
        if (!proc_self_mountinfo) {
                /* Save errno right away, strdup() below may overwrite it */
                r = -errno;

                if (error_path)
                        *error_path = strdup("/proc/self/mountinfo");

                return log_debug_errno(r, "Failed to open /proc/self/mountinfo: %m");
        }

        /* First round, establish all mounts we need */
        for (;;) {
                bool again = false;

                FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {

                        /* Entries already handled in an earlier iteration of the outer loop are skipped */
                        if (m->state != MOUNT_PENDING)
                                continue;

                        /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
                        r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ? root : NULL, m);
                        if (r < 0) {
                                mount_entry_path_debug_string(root, m, error_path);
                                return r;
                        }
                        if (r == 0) {
                                /* We hit a symlinked mount point. The entry got rewritten and might
                                 * point to a very different place now. Let's normalize the changed
                                 * list, and start from the beginning. After all to mount the entry
                                 * at the new location we might need some other mounts first */
                                again = true;
                                break;
                        }

                        /* Returns 1 if the mount should be post-processed, 0 otherwise */
                        r = apply_one_mount(root, m, p);
                        if (r < 0) {
                                mount_entry_path_debug_string(root, m, error_path);
                                return r;
                        }
                        m->state = r == 0 ? MOUNT_SKIPPED : MOUNT_APPLIED;
                }

                if (!again)
                        break;

                /* Re-sort and prune the list, since an entry's path changed */
                drop_unused_mounts(ml, root);
        }

        /* Now that all filesystems have been set up, but before the
         * read-only switches are flipped, create the exec dirs and other symlinks.
         * Note that when /var/lib is not empty/tmpfs, these symlinks will already
         * exist, which means this will be a no-op. */
        r = create_symlinks_from_tuples(root, p->symlinks);
        if (r < 0)
                return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");

        /* Create a deny list we can pass to bind_mount_recursive(). Only the array itself is allocated
         * here; its entries point at the path strings of the mount entries and are not copied. */
        deny_list = new(char*, ml->n_mounts+1);
        if (!deny_list)
                return -ENOMEM;
        for (size_t j = 0; j < ml->n_mounts; j++)
                deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
        deny_list[ml->n_mounts] = NULL;

        /* Second round, flip the ro bits if necessary. */
        FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
                r = make_read_only(m, deny_list, proc_self_mountinfo);
                if (r < 0) {
                        mount_entry_path_debug_string(root, m, error_path);
                        return r;
                }
        }

        /* Third round, flip the noexec bits with a simplified deny list.
         *
         * NOTE(review): every deny_list[j] was already set to this very value when the list was built
         * above, so this loop does not actually remove any entry from the list. Confirm whether the
         * list was meant to be narrowed down to the MOUNT_EXEC/MOUNT_NOEXEC entries here. */
        for (size_t j = 0; j < ml->n_mounts; j++)
                if (IN_SET((ml->mounts+j)->mode, MOUNT_EXEC, MOUNT_NOEXEC))
                        deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
        deny_list[ml->n_mounts] = NULL;

        FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
                r = make_noexec(m, deny_list, proc_self_mountinfo);
                if (r < 0) {
                        mount_entry_path_debug_string(root, m, error_path);
                        return r;
                }
        }

        /* Fourth round, flip the nosuid bits without a deny list. */
        if (p->mount_nosuid)
                FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
                        r = make_nosuid(m, proc_self_mountinfo);
                        if (r < 0) {
                                mount_entry_path_debug_string(root, m, error_path);
                                return r;
                        }
                }

        return 1;
}
2025
2026 static bool root_read_only(
2027 char **read_only_paths,
2028 ProtectSystem protect_system) {
2029
2030 /* Determine whether the root directory is going to be read-only given the configured settings. */
2031
2032 if (protect_system == PROTECT_SYSTEM_STRICT)
2033 return true;
2034
2035 if (prefixed_path_strv_contains(read_only_paths, "/"))
2036 return true;
2037
2038 return false;
2039 }
2040
2041 static bool home_read_only(
2042 char** read_only_paths,
2043 char** inaccessible_paths,
2044 char** empty_directories,
2045 const BindMount *bind_mounts,
2046 size_t n_bind_mounts,
2047 const TemporaryFileSystem *temporary_filesystems,
2048 size_t n_temporary_filesystems,
2049 ProtectHome protect_home) {
2050
2051 /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
2052 * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
2053 * settings. */
2054
2055 if (protect_home != PROTECT_HOME_NO)
2056 return true;
2057
2058 if (prefixed_path_strv_contains(read_only_paths, "/home") ||
2059 prefixed_path_strv_contains(inaccessible_paths, "/home") ||
2060 prefixed_path_strv_contains(empty_directories, "/home"))
2061 return true;
2062
2063 for (size_t i = 0; i < n_temporary_filesystems; i++)
2064 if (path_equal(temporary_filesystems[i].path, "/home"))
2065 return true;
2066
2067 /* If /home is overmounted with some dir from the host it's not writable. */
2068 for (size_t i = 0; i < n_bind_mounts; i++)
2069 if (path_equal(bind_mounts[i].destination, "/home"))
2070 return true;
2071
2072 return false;
2073 }
2074
/* Sets up the mount namespace described by the parameters in 'p'.
 *
 * The function first only collects what needs to be done: it optionally prepares the root image, picks
 * the directory to assemble the new root in, and builds up a list of mount entries from the various
 * settings in 'p'. Only then it unshares the mount namespace, mounts the root, applies the collected
 * mounts, switches the root and finally adjusts mount propagation.
 *
 * Returns 0 on success and a negative errno-style value on failure. -ENOANO is returned specifically
 * when unshare() fails with a privilege or "not supported" kind of error, so that the caller can detect
 * this case. 'error_path' is passed on to apply_mounts(), which may store the offending path in it. */
int setup_namespace(const NamespaceParameters *p, char **error_path) {

        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
        _cleanup_strv_free_ char **hierarchies = NULL;
        _cleanup_(mount_list_done) MountList ml = {};
        bool require_prefix = false;
        const char *root;
        DissectImageFlags dissect_image_flags =
                DISSECT_IMAGE_GENERIC_ROOT |
                DISSECT_IMAGE_REQUIRE_ROOT |
                DISSECT_IMAGE_DISCARD_ON_LOOP |
                DISSECT_IMAGE_RELAX_VAR_CHECK |
                DISSECT_IMAGE_FSCK |
                DISSECT_IMAGE_USR_NO_ROOT |
                DISSECT_IMAGE_GROWFS |
                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
                DISSECT_IMAGE_PIN_PARTITION_DEVICES;
        int r;

        assert(p);

        /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
         * we configure take effect */
        BLOCK_WITH_UMASK(0000);

        /* Runtime mount propagation is only set up if both directories are configured */
        bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
        /* Fall back to MS_SHARED if no propagation flag was configured */
        unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;

        if (p->root_image) {
                /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
                if (root_read_only(p->read_only_paths,
                                   p->protect_system) &&
                    home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories,
                                   p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems,
                                   p->protect_home) &&
                    strv_isempty(p->read_write_paths))
                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;

                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);

                r = loop_device_make_by_path(
                                p->root_image,
                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
                                /* sector_size= */ UINT32_MAX,
                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
                                LOCK_SH,
                                &loop_device);
                if (r < 0)
                        return log_debug_errno(r, "Failed to create loop device for root image: %m");

                r = dissect_loop_device(
                                loop_device,
                                p->verity,
                                p->root_image_options,
                                p->root_image_policy,
                                dissect_image_flags,
                                &dissected_image);
                if (r < 0)
                        return log_debug_errno(r, "Failed to dissect image: %m");

                r = dissected_image_load_verity_sig_partition(
                                dissected_image,
                                loop_device->fd,
                                p->verity);
                if (r < 0)
                        return r;

                r = dissected_image_decrypt(
                                dissected_image,
                                NULL,
                                p->verity,
                                dissect_image_flags);
                if (r < 0)
                        return log_debug_errno(r, "Failed to decrypt dissected image: %m");
        }

        if (p->root_directory)
                root = p->root_directory;
        else {
                /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
                 * when running tests (test-execute), it might not have been created yet so let's make sure
                 * we create it if it doesn't already exist. */
                (void) mkdir_p_label("/run/systemd", 0755);

                /* Always create the mount namespace in a temporary directory, instead of operating directly
                 * in the root. The temporary directory prevents any mounts from being potentially obscured
                 * by other mounts we already applied. We use the same mount point for all images, which is
                 * safe, since they all live in their own namespaces after all, and hence won't see each
                 * other. (Note: this directory is also created by PID 1 early on, we create it here for
                 * similar reasons as /run/systemd/ first.) */
                root = "/run/systemd/mount-rootfs";
                (void) mkdir_label(root, 0555);

                require_prefix = true;
        }

        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
                /* Hierarchy population needs to be done for sysext and confext extension images */
                r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES");
                if (r < 0)
                        return r;
        }

        r = append_access_mounts(&ml, p->read_write_paths, MOUNT_READ_WRITE, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->read_only_paths, MOUNT_READ_ONLY, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->inaccessible_paths, MOUNT_INACCESSIBLE, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->exec_paths, MOUNT_EXEC, require_prefix);
        if (r < 0)
                return r;

        r = append_access_mounts(&ml, p->no_exec_paths, MOUNT_NOEXEC, require_prefix);
        if (r < 0)
                return r;

        r = append_empty_dir_mounts(&ml, p->empty_directories);
        if (r < 0)
                return r;

        r = append_bind_mounts(&ml, p->bind_mounts, p->n_bind_mounts);
        if (r < 0)
                return r;

        r = append_tmpfs_mounts(&ml, p->temporary_filesystems, p->n_temporary_filesystems);
        if (r < 0)
                return r;

        if (p->tmp_dir) {
                /* RUN_SYSTEMD_EMPTY as source selects the read-only flavour of the private /tmp mount */
                bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY);

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/tmp",
                        .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
                        .source_const = p->tmp_dir,
                };
        }

        if (p->var_tmp_dir) {
                /* Same as for /tmp above */
                bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY);

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/var/tmp",
                        .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
                        .source_const = p->var_tmp_dir,
                };
        }

        r = append_mount_images(&ml, p->mount_images, p->n_mount_images);
        if (r < 0)
                return r;

        r = append_extensions(&ml, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
        if (r < 0)
                return r;

        if (p->private_dev) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/dev",
                        .mode = MOUNT_PRIVATE_DEV,
                        .flags = DEV_MOUNT_OPTIONS,
                };
        }

        /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective
           mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so
           let's try the mounts but it's not fatal if they don't succeed. */
        bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID;
        if (p->protect_kernel_tunables) {
                r = append_static_mounts(&ml,
                                         protect_kernel_tunables_proc_table,
                                         ELEMENTSOF(protect_kernel_tunables_proc_table),
                                         ignore_protect_proc);
                if (r < 0)
                        return r;

                r = append_static_mounts(&ml,
                                         protect_kernel_tunables_sys_table,
                                         ELEMENTSOF(protect_kernel_tunables_sys_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        if (p->protect_kernel_modules) {
                r = append_static_mounts(&ml,
                                         protect_kernel_modules_table,
                                         ELEMENTSOF(protect_kernel_modules_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        if (p->protect_kernel_logs) {
                r = append_static_mounts(&ml,
                                         protect_kernel_logs_proc_table,
                                         ELEMENTSOF(protect_kernel_logs_proc_table),
                                         ignore_protect_proc);
                if (r < 0)
                        return r;

                r = append_static_mounts(&ml,
                                         protect_kernel_logs_dev_table,
                                         ELEMENTSOF(protect_kernel_logs_dev_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        if (p->protect_control_groups) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/sys/fs/cgroup",
                        .mode = MOUNT_READ_ONLY,
                };
        }

        r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths);
        if (r < 0)
                return r;

        r = append_protect_system(&ml, p->protect_system, false);
        if (r < 0)
                return r;

        if (namespace_parameters_mount_apivfs(p)) {
                r = append_static_mounts(&ml,
                                         apivfs_table,
                                         ELEMENTSOF(apivfs_table),
                                         p->ignore_protect_paths);
                if (r < 0)
                        return r;
        }

        /* Note, if proc is mounted with subset=pid then neither of the two paths will exist, i.e. they are
         * implicitly protected by the mount option. */
        if (p->protect_hostname) {
                r = append_static_mounts(
                                &ml,
                                protect_hostname_table,
                                ELEMENTSOF(protect_hostname_table),
                                ignore_protect_proc);
                if (r < 0)
                        return r;
        }

        if (p->private_network) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/sys",
                        .mode = MOUNT_PRIVATE_SYSFS,
                };
        }

        if (p->private_ipc) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/dev/mqueue",
                        .mode = MOUNT_MQUEUEFS,
                        .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
                };
        }

        if (p->creds_path) {
                /* If our service has a credentials store configured, then bind that one in, but hide
                 * everything else. */

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/run/credentials",
                        .mode = MOUNT_TMPFS,
                        .read_only = true,
                        .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
                        .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
                };

                me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = p->creds_path,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                        .source_const = p->creds_path,
                        .ignore = true,
                };
        } else {
                /* If our service has no credentials store configured, then make the whole credentials tree
                 * inaccessible wholesale. */

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/run/credentials",
                        .mode = MOUNT_INACCESSIBLE,
                        .ignore = true,
                };
        }

        if (p->log_namespace) {
                _cleanup_free_ char *q = NULL;

                q = strjoin("/run/systemd/journal.", p->log_namespace);
                if (!q)
                        return log_oom_debug();

                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                /* The allocated source string is handed over to the mount entry via TAKE_PTR() */
                *me = (MountEntry) {
                        .path_const = "/run/systemd/journal",
                        .mode = MOUNT_BIND_RECURSIVE,
                        .read_only = true,
                        .source_malloc = TAKE_PTR(q),
                };
        }

        /* Will be used to add bind mounts at runtime */
        if (setup_propagate) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .source_const = p->propagate_dir,
                        .path_const = p->incoming_dir,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                };
        }

        if (p->notify_socket) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = p->notify_socket,
                        .source_const = p->notify_socket,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                };
        }

        if (p->host_os_release_stage) {
                MountEntry *me = mount_list_extend(&ml);
                if (!me)
                        return log_oom_debug();

                *me = (MountEntry) {
                        .path_const = "/run/host/.os-release-stage/",
                        .source_const = p->host_os_release_stage,
                        .mode = MOUNT_BIND,
                        .read_only = true,
                        .ignore = true, /* Live copy, don't hard-fail if it goes missing */
                };
        }

        /* Prepend the root directory where that's necessary */
        r = prefix_where_needed(&ml, root);
        if (r < 0)
                return r;

        drop_unused_mounts(&ml, root);

        /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */

        if (unshare(CLONE_NEWNS) < 0) {
                r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");

                if (ERRNO_IS_PRIVILEGE(r) ||
                    ERRNO_IS_NOT_SUPPORTED(r))
                        /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
                         * in place that doesn't allow us to create namespaces (or a missing cap), then
                         * propagate a recognizable error back, which the caller can use to detect this case
                         * (and only this) and optionally continue without namespacing applied. */
                        return -ENOANO;

                return r;
        }

        /* Create the source directory to allow runtime propagation of mounts */
        if (setup_propagate)
                (void) mkdir_p(p->propagate_dir, 0600);

        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories))
                /* ExtensionImages/Directories mountpoint directories will be created while parsing the
                 * mounts to create, so have the parent ready */
                (void) mkdir_p(p->extension_dir, 0600);

        /* Remount / as SLAVE so that nothing now mounted in the namespace
         * shows up in the parent */
        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
                return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");

        if (p->root_image) {
                /* A root image is specified, mount it to the right place */
                r = dissected_image_mount(
                                dissected_image,
                                root,
                                /* uid_shift= */ UID_INVALID,
                                /* uid_range= */ UID_INVALID,
                                /* userns_fd= */ -EBADF,
                                dissect_image_flags);
                if (r < 0)
                        return log_debug_errno(r, "Failed to mount root image: %m");

                /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
                 * if it likes. */
                r = loop_device_flock(loop_device, LOCK_UN);
                if (r < 0)
                        return log_debug_errno(r, "Failed to release lock on loopback block device: %m");

                r = dissected_image_relinquish(dissected_image);
                if (r < 0)
                        return log_debug_errno(r, "Failed to relinquish dissected image: %m");

        } else if (p->root_directory) {

                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
                r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
                if (r == 0) {
                        r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
                        if (r < 0)
                                return r;
                }

        } else {
                /* Let's mount the main root directory to the root directory to use */
                r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
                if (r < 0)
                        return r;
        }

        /* Try to set up the new root directory before mounting anything else there. */
        if (p->root_image || p->root_directory)
                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);

        /* Now make the magic happen */
        r = apply_mounts(&ml, root, p, error_path);
        if (r < 0)
                return r;

        /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
        r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
        if (r == -EINVAL && p->root_directory) {
                /* If we are using root_directory and we don't have privileges (ie: user manager in a user
                 * namespace) and the root_directory is already a mount point in the parent namespace,
                 * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
                 * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
                 * mount point) and try again. */
                r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
                if (r < 0)
                        return r;
                r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
        }
        if (r < 0)
                return log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");

        /* Remount / as the desired mode. Note that this will not reestablish propagation from our side to
         * the host, since what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
                return log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");

        /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only supported for
         * non-shared mounts. This needs to happen after remounting / or it will fail. */
        if (setup_propagate && mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL) < 0)
                return log_debug_errno(errno, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir);

        return 0;
}
2584
2585 void bind_mount_free_many(BindMount *b, size_t n) {
2586 assert(b || n == 0);
2587
2588 for (size_t i = 0; i < n; i++) {
2589 free(b[i].source);
2590 free(b[i].destination);
2591 }
2592
2593 free(b);
2594 }
2595
2596 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
2597 _cleanup_free_ char *s = NULL, *d = NULL;
2598 BindMount *c;
2599
2600 assert(b);
2601 assert(n);
2602 assert(item);
2603
2604 s = strdup(item->source);
2605 if (!s)
2606 return -ENOMEM;
2607
2608 d = strdup(item->destination);
2609 if (!d)
2610 return -ENOMEM;
2611
2612 c = reallocarray(*b, *n + 1, sizeof(BindMount));
2613 if (!c)
2614 return -ENOMEM;
2615
2616 *b = c;
2617
2618 c[(*n) ++] = (BindMount) {
2619 .source = TAKE_PTR(s),
2620 .destination = TAKE_PTR(d),
2621 .read_only = item->read_only,
2622 .nosuid = item->nosuid,
2623 .recursive = item->recursive,
2624 .ignore_enoent = item->ignore_enoent,
2625 };
2626
2627 return 0;
2628 }
2629
2630 MountImage* mount_image_free_many(MountImage *m, size_t *n) {
2631 assert(n);
2632 assert(m || *n == 0);
2633
2634 for (size_t i = 0; i < *n; i++) {
2635 free(m[i].source);
2636 free(m[i].destination);
2637 mount_options_free_all(m[i].mount_options);
2638 }
2639
2640 free(m);
2641 *n = 0;
2642 return NULL;
2643 }
2644
2645 int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
2646 _cleanup_free_ char *s = NULL, *d = NULL;
2647 _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
2648 MountImage *c;
2649
2650 assert(m);
2651 assert(n);
2652 assert(item);
2653
2654 s = strdup(item->source);
2655 if (!s)
2656 return -ENOMEM;
2657
2658 if (item->destination) {
2659 d = strdup(item->destination);
2660 if (!d)
2661 return -ENOMEM;
2662 }
2663
2664 LIST_FOREACH(mount_options, i, item->mount_options) {
2665 _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
2666
2667 o = new(MountOptions, 1);
2668 if (!o)
2669 return -ENOMEM;
2670
2671 *o = (MountOptions) {
2672 .partition_designator = i->partition_designator,
2673 .options = strdup(i->options),
2674 };
2675 if (!o->options)
2676 return -ENOMEM;
2677
2678 LIST_APPEND(mount_options, options, TAKE_PTR(o));
2679 }
2680
2681 c = reallocarray(*m, *n + 1, sizeof(MountImage));
2682 if (!c)
2683 return -ENOMEM;
2684
2685 *m = c;
2686
2687 c[(*n) ++] = (MountImage) {
2688 .source = TAKE_PTR(s),
2689 .destination = TAKE_PTR(d),
2690 .mount_options = TAKE_PTR(options),
2691 .ignore_enoent = item->ignore_enoent,
2692 .type = item->type,
2693 };
2694
2695 return 0;
2696 }
2697
2698 void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
2699 assert(t || n == 0);
2700
2701 for (size_t i = 0; i < n; i++) {
2702 free(t[i].path);
2703 free(t[i].options);
2704 }
2705
2706 free(t);
2707 }
2708
2709 int temporary_filesystem_add(
2710 TemporaryFileSystem **t,
2711 size_t *n,
2712 const char *path,
2713 const char *options) {
2714
2715 _cleanup_free_ char *p = NULL, *o = NULL;
2716 TemporaryFileSystem *c;
2717
2718 assert(t);
2719 assert(n);
2720 assert(path);
2721
2722 p = strdup(path);
2723 if (!p)
2724 return -ENOMEM;
2725
2726 if (!isempty(options)) {
2727 o = strdup(options);
2728 if (!o)
2729 return -ENOMEM;
2730 }
2731
2732 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
2733 if (!c)
2734 return -ENOMEM;
2735
2736 *t = c;
2737
2738 c[(*n) ++] = (TemporaryFileSystem) {
2739 .path = TAKE_PTR(p),
2740 .options = TAKE_PTR(o),
2741 };
2742
2743 return 0;
2744 }
2745
2746 static int make_tmp_prefix(const char *prefix) {
2747 _cleanup_free_ char *t = NULL;
2748 _cleanup_close_ int fd = -EBADF;
2749 int r;
2750
2751 /* Don't do anything unless we know the dir is actually missing */
2752 r = access(prefix, F_OK);
2753 if (r >= 0)
2754 return 0;
2755 if (errno != ENOENT)
2756 return -errno;
2757
2758 WITH_UMASK(000)
2759 r = mkdir_parents(prefix, 0755);
2760 if (r < 0)
2761 return r;
2762
2763 r = tempfn_random(prefix, NULL, &t);
2764 if (r < 0)
2765 return r;
2766
2767 /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
2768 * the suid bit, below. */
2769 fd = open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0777);
2770 if (fd < 0)
2771 return fd;
2772
2773 r = RET_NERRNO(fchmod(fd, 01777));
2774 if (r < 0) {
2775 (void) rmdir(t);
2776 return r;
2777 }
2778
2779 r = RET_NERRNO(rename(t, prefix));
2780 if (r < 0) {
2781 (void) rmdir(t);
2782 return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
2783 }
2784
2785 return 0;
2786
2787 }
2788
2789 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
2790 _cleanup_free_ char *x = NULL;
2791 _cleanup_free_ char *y = NULL;
2792 sd_id128_t boot_id;
2793 bool rw = true;
2794 int r;
2795
2796 assert(id);
2797 assert(prefix);
2798 assert(path);
2799
2800 /* We include the boot id in the directory so that after a
2801 * reboot we can easily identify obsolete directories. */
2802
2803 r = sd_id128_get_boot(&boot_id);
2804 if (r < 0)
2805 return r;
2806
2807 x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
2808 if (!x)
2809 return -ENOMEM;
2810
2811 r = make_tmp_prefix(prefix);
2812 if (r < 0)
2813 return r;
2814
2815 WITH_UMASK(0077)
2816 if (!mkdtemp(x)) {
2817 if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
2818 rw = false;
2819 else
2820 return -errno;
2821 }
2822
2823 if (rw) {
2824 y = strjoin(x, "/tmp");
2825 if (!y)
2826 return -ENOMEM;
2827
2828 WITH_UMASK(0000)
2829 if (mkdir(y, 0777 | S_ISVTX) < 0)
2830 return -errno;
2831
2832 r = label_fix_full(AT_FDCWD, y, prefix, 0);
2833 if (r < 0)
2834 return r;
2835
2836 if (tmp_path)
2837 *tmp_path = TAKE_PTR(y);
2838 } else {
2839 /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
2840 * read-only. This way the service will get the EROFS result as if it was writing to the real
2841 * file system. */
2842 WITH_UMASK(0000)
2843 r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
2844 if (r < 0)
2845 return r;
2846
2847 r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
2848 if (r < 0)
2849 return r;
2850 }
2851
2852 *path = TAKE_PTR(x);
2853 return 0;
2854 }
2855
2856 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
2857 _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
2858 _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
2859 char *b;
2860 int r;
2861
2862 assert(id);
2863 assert(tmp_dir);
2864 assert(var_tmp_dir);
2865
2866 r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
2867 if (r < 0)
2868 return r;
2869
2870 r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
2871 if (r < 0)
2872 return r;
2873
2874 a_tmp = mfree(a_tmp); /* avoid rmdir */
2875 *tmp_dir = TAKE_PTR(a);
2876 *var_tmp_dir = TAKE_PTR(b);
2877
2878 return 0;
2879 }
2880
2881 int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
2882 _cleanup_close_ int ns = -EBADF;
2883 int r;
2884 const char *ns_name, *ns_path;
2885
2886 assert(ns_storage_socket);
2887 assert(ns_storage_socket[0] >= 0);
2888 assert(ns_storage_socket[1] >= 0);
2889
2890 ns_name = namespace_single_flag_to_string(nsflag);
2891 assert(ns_name);
2892
2893 /* We use the passed socketpair as a storage buffer for our
2894 * namespace reference fd. Whatever process runs this first
2895 * shall create a new namespace, all others should just join
2896 * it. To serialize that we use a file lock on the socket
2897 * pair.
2898 *
2899 * It's a bit crazy, but hey, works great! */
2900
2901 r = posix_lock(ns_storage_socket[0], LOCK_EX);
2902 if (r < 0)
2903 return r;
2904
2905 CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
2906
2907 ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2908 if (ns >= 0) {
2909 /* Yay, found something, so let's join the namespace */
2910 r = RET_NERRNO(setns(ns, nsflag));
2911 if (r < 0)
2912 return r;
2913
2914 return 0;
2915 }
2916
2917 if (ns != -EAGAIN)
2918 return ns;
2919
2920 /* Nothing stored yet, so let's create a new namespace. */
2921
2922 if (unshare(nsflag) < 0)
2923 return -errno;
2924
2925 (void) loopback_setup();
2926
2927 ns_path = strjoina("/proc/self/ns/", ns_name);
2928 ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
2929 if (ns < 0)
2930 return -errno;
2931
2932 r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
2933 if (r < 0)
2934 return r;
2935
2936 return 1;
2937 }
2938
2939 int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
2940 _cleanup_close_ int ns = -EBADF;
2941 int r;
2942
2943 assert(ns_storage_socket);
2944 assert(ns_storage_socket[0] >= 0);
2945 assert(ns_storage_socket[1] >= 0);
2946 assert(path);
2947
2948 /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in
2949 * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
2950 * allocate a new anonymous ns if needed. */
2951
2952 r = posix_lock(ns_storage_socket[0], LOCK_EX);
2953 if (r < 0)
2954 return r;
2955
2956 CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
2957
2958 ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2959 if (ns >= 0)
2960 return 0;
2961 if (ns != -EAGAIN)
2962 return ns;
2963
2964 /* Nothing stored yet. Open the file from the file system. */
2965
2966 ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
2967 if (ns < 0)
2968 return -errno;
2969
2970 r = fd_is_ns(ns, nsflag);
2971 if (r == 0)
2972 return -EINVAL;
2973 if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
2974 return r;
2975
2976 r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
2977 if (r < 0)
2978 return r;
2979
2980 return 1;
2981 }
2982
2983 bool ns_type_supported(NamespaceType type) {
2984 const char *t, *ns_proc;
2985
2986 t = namespace_type_to_string(type);
2987 if (!t) /* Don't know how to translate this? Then it's not supported */
2988 return false;
2989
2990 ns_proc = strjoina("/proc/self/ns/", t);
2991 return access(ns_proc, F_OK) == 0;
2992 }
2993
2994 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
2995 [PROTECT_HOME_NO] = "no",
2996 [PROTECT_HOME_YES] = "yes",
2997 [PROTECT_HOME_READ_ONLY] = "read-only",
2998 [PROTECT_HOME_TMPFS] = "tmpfs",
2999 };
3000
3001 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
3002
3003 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
3004 [PROTECT_SYSTEM_NO] = "no",
3005 [PROTECT_SYSTEM_YES] = "yes",
3006 [PROTECT_SYSTEM_FULL] = "full",
3007 [PROTECT_SYSTEM_STRICT] = "strict",
3008 };
3009
3010 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
3011
3012 static const char* const namespace_type_table[] = {
3013 [NAMESPACE_MOUNT] = "mnt",
3014 [NAMESPACE_CGROUP] = "cgroup",
3015 [NAMESPACE_UTS] = "uts",
3016 [NAMESPACE_IPC] = "ipc",
3017 [NAMESPACE_USER] = "user",
3018 [NAMESPACE_PID] = "pid",
3019 [NAMESPACE_NET] = "net",
3020 [NAMESPACE_TIME] = "time",
3021 };
3022
3023 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
3024
3025 static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
3026 [PROTECT_PROC_DEFAULT] = "default",
3027 [PROTECT_PROC_NOACCESS] = "noaccess",
3028 [PROTECT_PROC_INVISIBLE] = "invisible",
3029 [PROTECT_PROC_PTRACEABLE] = "ptraceable",
3030 };
3031
3032 DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
3033
3034 static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
3035 [PROC_SUBSET_ALL] = "all",
3036 [PROC_SUBSET_PID] = "pid",
3037 };
3038
3039 DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);