]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core: wrap some long lines and other formatting changes
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
15ae422b
LP
2
3#include <errno.h>
e08f94ac 4#include <linux/loop.h>
07630cea 5#include <sched.h>
15ae422b 6#include <stdio.h>
07630cea 7#include <sys/mount.h>
07630cea 8#include <unistd.h>
25e870b5 9#include <linux/fs.h>
15ae422b 10
b5efdb8a 11#include "alloc-util.h"
10404d52 12#include "base-filesystem.h"
7f112f50 13#include "dev-setup.h"
3ffd4af2 14#include "fd-util.h"
e5f10caf 15#include "format-util.h"
d944dc95 16#include "fs-util.h"
e908468b 17#include "label.h"
915e6d16 18#include "loop-util.h"
07630cea 19#include "loopback-setup.h"
07630cea 20#include "mkdir.h"
4349cd7c 21#include "mount-util.h"
049af8ad 22#include "mountpoint-util.h"
0cb8e3d1 23#include "namespace-util.h"
3ffd4af2 24#include "namespace.h"
d8b4d14d 25#include "nulstr-util.h"
07630cea 26#include "path-util.h"
d7b8eec7 27#include "selinux-util.h"
2583fbea 28#include "socket-util.h"
760877e9 29#include "sort-util.h"
36ce7110 30#include "stat-util.h"
8b43440b 31#include "string-table.h"
07630cea
LP
32#include "string-util.h"
33#include "strv.h"
a652f050 34#include "tmpfile-util.h"
affb60b1 35#include "umask-util.h"
ee104e11 36#include "user-util.h"
15ae422b 37
737ba3c8 38#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
39
/* The kind of mount operation a MountEntry requests.
 *
 * The declaration order is significant: it doubles as the priority order. When several entries
 * refer to the same path they are sorted by this value (see mount_path_compare()), and the entry
 * with the smaller value is the one that is kept (see drop_duplicates()). */
typedef enum MountMode {
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
        READWRITE_IMPLICIT, /* Should have the lowest priority. */
        _MOUNT_MODE_MAX,
} MountMode;
15ae422b 57
34de407a 58typedef struct MountEntry {
5327c910 59 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 60 MountMode mode:5;
5327c910
LP
61 bool ignore:1; /* Ignore if path does not exist? */
62 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 63 bool read_only:1; /* Shall this mount point be read-only? */
9ce4e4b0 64 bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
088696fe 65 bool applied:1; /* Already applied */
55fe7432 66 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
d2d6c096
LP
67 const char *source_const; /* The source path, for bind mounts */
68 char *source_malloc;
2abd4e38
YW
69 const char *options_const;/* Mount options for tmpfs */
70 char *options_malloc;
71 unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
088696fe 72 unsigned n_followed;
34de407a 73} MountEntry;
15ae422b 74
5d997827 75/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
3fe91079 76 * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
5d997827
LP
77static const MountEntry apivfs_table[] = {
78 { "/proc", PROCFS, false },
79 { "/dev", BIND_DEV, false },
80 { "/sys", SYSFS, false },
81};
f471b2af 82
11a30cec 83/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 84static const MountEntry protect_kernel_tunables_table[] = {
1e05071d
YW
85 { "/proc/acpi", READONLY, true },
86 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
87 { "/proc/asound", READONLY, true },
88 { "/proc/bus", READONLY, true },
89 { "/proc/fs", READONLY, true },
90 { "/proc/irq", READONLY, true },
91 { "/proc/kallsyms", INACCESSIBLE, true },
92 { "/proc/kcore", INACCESSIBLE, true },
93 { "/proc/latency_stats", READONLY, true },
94 { "/proc/mtrr", READONLY, true },
95 { "/proc/scsi", READONLY, true },
96 { "/proc/sys", READONLY, false },
97 { "/proc/sysrq-trigger", READONLY, true },
98 { "/proc/timer_stats", READONLY, true },
99 { "/sys", READONLY, false },
100 { "/sys/fs/bpf", READONLY, true },
101 { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
102 { "/sys/fs/selinux", READWRITE_IMPLICIT, true },
103 { "/sys/kernel/debug", READONLY, true },
104 { "/sys/kernel/tracing", READONLY, true },
11a30cec
DH
105};
106
c575770b 107/* ProtectKernelModules= option */
34de407a 108static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 109#if HAVE_SPLIT_USR
c6232fb0 110 { "/lib/modules", INACCESSIBLE, true },
c575770b 111#endif
c6232fb0 112 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
113};
114
94a7b275
KK
115/* ProtectKernelLogs= option */
116static const MountEntry protect_kernel_logs_table[] = {
117 { "/proc/kmsg", INACCESSIBLE, true },
118 { "/dev/kmsg", INACCESSIBLE, true },
119};
120
b6c432ca
DH
121/*
122 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
123 * system should be protected by ProtectSystem=
124 */
34de407a 125static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
126 { "/home", READONLY, true },
127 { "/run/user", READONLY, true },
128 { "/root", READONLY, true },
b6c432ca
DH
129};
130
e4da7d8c
YW
131/* ProtectHome=tmpfs table */
132static const MountEntry protect_home_tmpfs_table[] = {
7d85383e
TM
133 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
134 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
135 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
e4da7d8c
YW
136};
137
b6c432ca 138/* ProtectHome=yes table */
34de407a 139static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
140 { "/home", INACCESSIBLE, true },
141 { "/run/user", INACCESSIBLE, true },
142 { "/root", INACCESSIBLE, true },
b6c432ca
DH
143};
144
f471b2af 145/* ProtectSystem=yes table */
34de407a 146static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
147 { "/usr", READONLY, false },
148 { "/boot", READONLY, true },
149 { "/efi", READONLY, true },
7486f305
AB
150#if HAVE_SPLIT_USR
151 { "/lib", READONLY, true },
152 { "/lib64", READONLY, true },
153 { "/bin", READONLY, true },
671f0f8d 154# if HAVE_SPLIT_BIN
7486f305 155 { "/sbin", READONLY, true },
671f0f8d 156# endif
7486f305 157#endif
f471b2af
DH
158};
159
160/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 161static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
162 { "/usr", READONLY, false },
163 { "/boot", READONLY, true },
164 { "/efi", READONLY, true },
165 { "/etc", READONLY, false },
7486f305
AB
166#if HAVE_SPLIT_USR
167 { "/lib", READONLY, true },
168 { "/lib64", READONLY, true },
169 { "/bin", READONLY, true },
671f0f8d 170# if HAVE_SPLIT_BIN
7486f305 171 { "/sbin", READONLY, true },
671f0f8d 172# endif
7486f305 173#endif
f471b2af
DH
174};
175
176/*
177 * ProtectSystem=strict table. In this strict mode, we mount everything
178 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
179 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
180 * protect those, and these options should be fully orthogonal.
181 * (And of course /home and friends are also left writable, as ProtectHome=
182 * shall manage those, orthogonally).
183 */
34de407a 184static const MountEntry protect_system_strict_table[] = {
1e05071d
YW
185 { "/", READONLY, false },
186 { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
187 { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
188 { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */
189 { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */
190 { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */
191 { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */
f471b2af
DH
192};
193
5beb8688
YW
194static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
195 [INACCESSIBLE] = "inaccessible",
196 [BIND_MOUNT] = "bind",
197 [BIND_MOUNT_RECURSIVE] = "rbind",
198 [PRIVATE_TMP] = "private-tmp",
199 [PRIVATE_DEV] = "private-dev",
200 [BIND_DEV] = "bind-dev",
201 [EMPTY_DIR] = "empty",
202 [SYSFS] = "sysfs",
203 [PROCFS] = "procfs",
204 [READONLY] = "read-only",
205 [READWRITE] = "read-write",
206 [TMPFS] = "tmpfs",
207 [READWRITE_IMPLICIT] = "rw-implicit",
208};
209
210DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
211
34de407a 212static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
213 assert(p);
214
5327c910
LP
215 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
216 * otherwise the stack/static ->path field is returned. */
f0a4feb0 217
5327c910 218 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
219}
220
34de407a 221static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
222 assert(p);
223
224 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
225}
226
d2d6c096
LP
227static const char *mount_entry_source(const MountEntry *p) {
228 assert(p);
229
230 return p->source_malloc ?: p->source_const;
231}
232
2abd4e38
YW
233static const char *mount_entry_options(const MountEntry *p) {
234 assert(p);
235
236 return p->options_malloc ?: p->options_const;
237}
238
1eb7e08e
LP
239static void mount_entry_done(MountEntry *p) {
240 assert(p);
241
242 p->path_malloc = mfree(p->path_malloc);
243 p->source_malloc = mfree(p->source_malloc);
2abd4e38 244 p->options_malloc = mfree(p->options_malloc);
1eb7e08e
LP
245}
246
d18aff04 247static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
248 char **i;
249
613b411c
LP
250 assert(p);
251
1e05071d 252 /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
5327c910 253
15ae422b 254 STRV_FOREACH(i, strv) {
5327c910
LP
255 bool ignore = false, needs_prefix = false;
256 const char *e = *i;
15ae422b 257
5327c910
LP
258 /* Look for any prefixes */
259 if (startswith(e, "-")) {
260 e++;
9c94d52e 261 ignore = true;
ea92ae33 262 }
5327c910
LP
263 if (startswith(e, "+")) {
264 e++;
265 needs_prefix = true;
266 }
ea92ae33 267
baaa35ad
ZJS
268 if (!path_is_absolute(e))
269 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
270 "Path is not absolute: %s", e);
15ae422b 271
34de407a 272 *((*p)++) = (MountEntry) {
5327c910
LP
273 .path_const = e,
274 .mode = mode,
275 .ignore = ignore,
d18aff04 276 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 277 };
15ae422b
LP
278 }
279
280 return 0;
281}
282
6c47cd7d
LP
283static int append_empty_dir_mounts(MountEntry **p, char **strv) {
284 char **i;
285
286 assert(p);
287
288 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
289 * "/private/" boundary directories for DynamicUser=1. */
290
291 STRV_FOREACH(i, strv) {
292
293 *((*p)++) = (MountEntry) {
294 .path_const = *i,
295 .mode = EMPTY_DIR,
296 .ignore = false,
6c47cd7d 297 .read_only = true,
7d85383e 298 .options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
2abd4e38 299 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
6c47cd7d
LP
300 };
301 }
302
303 return 0;
304}
305
da6053d0
LP
306static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
307 size_t i;
d2d6c096
LP
308
309 assert(p);
310
311 for (i = 0; i < n; i++) {
312 const BindMount *b = binds + i;
313
314 *((*p)++) = (MountEntry) {
315 .path_const = b->destination,
316 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
317 .read_only = b->read_only,
9ce4e4b0 318 .nosuid = b->nosuid,
d2d6c096 319 .source_const = b->source,
4ca763a9 320 .ignore = b->ignore_enoent,
d2d6c096
LP
321 };
322 }
323
324 return 0;
325}
326
da6053d0
LP
327static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
328 size_t i;
2abd4e38
YW
329 int r;
330
331 assert(p);
332
333 for (i = 0; i < n; i++) {
334 const TemporaryFileSystem *t = tmpfs + i;
335 _cleanup_free_ char *o = NULL, *str = NULL;
ad8e66dc 336 unsigned long flags;
2abd4e38
YW
337 bool ro = false;
338
baaa35ad
ZJS
339 if (!path_is_absolute(t->path))
340 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
341 "Path is not absolute: %s",
342 t->path);
2abd4e38 343
7d85383e 344 str = strjoin("mode=0755" TMPFS_LIMITS_TEMPORARY_FS ",", t->options);
ad8e66dc
AJ
345 if (!str)
346 return -ENOMEM;
2abd4e38 347
ad8e66dc
AJ
348 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
349 if (r < 0)
350 return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
2abd4e38 351
ad8e66dc
AJ
352 ro = flags & MS_RDONLY;
353 if (ro)
354 flags ^= MS_RDONLY;
2abd4e38
YW
355
356 *((*p)++) = (MountEntry) {
357 .path_const = t->path,
358 .mode = TMPFS,
359 .read_only = ro,
ad8e66dc 360 .options_malloc = TAKE_PTR(o),
2abd4e38
YW
361 .flags = flags,
362 };
2abd4e38
YW
363 }
364
365 return 0;
366}
367
da6053d0
LP
368static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
369 size_t i;
11a30cec
DH
370
371 assert(p);
f471b2af 372 assert(mounts);
11a30cec 373
5327c910 374 /* Adds a list of static pre-defined entries */
f471b2af 375
5327c910 376 for (i = 0; i < n; i++)
34de407a
LP
377 *((*p)++) = (MountEntry) {
378 .path_const = mount_entry_path(mounts+i),
5327c910
LP
379 .mode = mounts[i].mode,
380 .ignore = mounts[i].ignore || ignore_protect,
381 };
f471b2af
DH
382
383 return 0;
384}
385
34de407a 386static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
387 assert(p);
388
5327c910 389 switch (protect_home) {
b6c432ca 390
5327c910 391 case PROTECT_HOME_NO:
b6c432ca
DH
392 return 0;
393
b6c432ca 394 case PROTECT_HOME_READ_ONLY:
5327c910
LP
395 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
396
e4da7d8c
YW
397 case PROTECT_HOME_TMPFS:
398 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
399
b6c432ca 400 case PROTECT_HOME_YES:
5327c910
LP
401 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
402
b6c432ca 403 default:
5327c910 404 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 405 }
b6c432ca
DH
406}
407
34de407a 408static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
409 assert(p);
410
5327c910
LP
411 switch (protect_system) {
412
413 case PROTECT_SYSTEM_NO:
f471b2af
DH
414 return 0;
415
f471b2af 416 case PROTECT_SYSTEM_STRICT:
5327c910
LP
417 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
418
f471b2af 419 case PROTECT_SYSTEM_YES:
5327c910
LP
420 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
421
f471b2af 422 case PROTECT_SYSTEM_FULL:
5327c910
LP
423 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
424
f471b2af 425 default:
5327c910 426 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 427 }
11a30cec
DH
428}
429
93bab288 430static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
a0827e2b 431 int d;
15ae422b 432
6ee1a919 433 /* If the paths are not equal, then order prefixes first */
93bab288 434 d = path_compare(mount_entry_path(a), mount_entry_path(b));
6ee1a919
LP
435 if (d != 0)
436 return d;
15ae422b 437
6ee1a919 438 /* If the paths are equal, check the mode */
93bab288 439 return CMP((int) a->mode, (int) b->mode);
15ae422b
LP
440}
441
da6053d0
LP
442static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
443 size_t i;
5327c910 444
4a756839 445 /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
5327c910
LP
446
447 for (i = 0; i < n; i++) {
448 char *s;
449
450 if (m[i].has_prefix)
451 continue;
452
c6134d3e 453 s = path_join(root_directory, mount_entry_path(m+i));
5327c910
LP
454 if (!s)
455 return -ENOMEM;
456
e282f51f 457 free_and_replace(m[i].path_malloc, s);
5327c910
LP
458 m[i].has_prefix = true;
459 }
460
461 return 0;
462}
463
da6053d0 464static void drop_duplicates(MountEntry *m, size_t *n) {
34de407a 465 MountEntry *f, *t, *previous;
15ae422b 466
c17ec25e 467 assert(m);
15ae422b 468 assert(n);
15ae422b 469
fe3c2583
LP
470 /* Drops duplicate entries. Expects that the array is properly ordered already. */
471
1d54cd5d 472 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 473
fe3c2583 474 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
088696fe
LP
475 * above. Note that we only drop duplicates that haven't been mounted yet. */
476 if (previous &&
477 path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
478 !f->applied && !previous->applied) {
5beb8688 479 log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
34de407a 480 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 481 mount_entry_done(f);
15ae422b 482 continue;
fe3c2583 483 }
15ae422b 484
e2d7c1a0 485 *t = *f;
15ae422b 486 previous = t;
fe3c2583
LP
487 t++;
488 }
489
490 *n = t - m;
491}
492
da6053d0 493static void drop_inaccessible(MountEntry *m, size_t *n) {
34de407a 494 MountEntry *f, *t;
fe3c2583
LP
495 const char *clear = NULL;
496
497 assert(m);
498 assert(n);
499
500 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
501 * ordered already. */
502
1d54cd5d 503 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
504
505 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
506 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
507 if (clear && path_startswith(mount_entry_path(f), clear)) {
508 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 509 mount_entry_done(f);
fe3c2583
LP
510 continue;
511 }
15ae422b 512
34de407a 513 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
514
515 *t = *f;
15ae422b
LP
516 t++;
517 }
518
c17ec25e 519 *n = t - m;
15ae422b
LP
520}
521
da6053d0 522static void drop_nop(MountEntry *m, size_t *n) {
34de407a 523 MountEntry *f, *t;
7648a565
LP
524
525 assert(m);
526 assert(n);
527
528 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
529 * list is ordered by prefixes. */
530
1d54cd5d 531 for (f = m, t = m; f < m + *n; f++) {
7648a565 532
1e05071d
YW
533 /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
534 if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
34de407a 535 MountEntry *p;
7648a565
LP
536 bool found = false;
537
538 /* Now let's find the first parent of the entry we are looking at. */
539 for (p = t-1; p >= m; p--) {
34de407a 540 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
541 found = true;
542 break;
543 }
544 }
545
546 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
547 if (found && p->mode == f->mode) {
5beb8688
YW
548 log_debug("%s (%s) is made redundant by %s (%s)",
549 mount_entry_path(f), mount_mode_to_string(f->mode),
550 mount_entry_path(p), mount_mode_to_string(p->mode));
1eb7e08e 551 mount_entry_done(f);
7648a565
LP
552 continue;
553 }
554 }
555
556 *t = *f;
557 t++;
558 }
559
560 *n = t - m;
561}
562
da6053d0 563static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
34de407a 564 MountEntry *f, *t;
cd2902c9
LP
565
566 assert(m);
567 assert(n);
568
1d54cd5d 569 /* Nothing to do */
cd2902c9
LP
570 if (!root_directory)
571 return;
572
573 /* Drops all mounts that are outside of the root directory. */
574
1d54cd5d 575 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 576
34de407a
LP
577 if (!path_startswith(mount_entry_path(f), root_directory)) {
578 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 579 mount_entry_done(f);
cd2902c9
LP
580 continue;
581 }
582
583 *t = *f;
584 t++;
585 }
586
587 *n = t - m;
588}
589
b2a60844
LP
590static int clone_device_node(
591 const char *d,
592 const char *temporary_mount,
593 bool *make_devnode) {
594
595 _cleanup_free_ char *sl = NULL;
596 const char *dn, *bn, *t;
b5e99f23
ДГ
597 struct stat st;
598 int r;
599
414b304b 600 if (stat(d, &st) < 0) {
b2a60844
LP
601 if (errno == ENOENT) {
602 log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
af984e13 603 return -ENXIO;
b2a60844
LP
604 }
605
606 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
b5e99f23
ДГ
607 }
608
609 if (!S_ISBLK(st.st_mode) &&
baaa35ad
ZJS
610 !S_ISCHR(st.st_mode))
611 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
612 "Device node '%s' to clone is not a device node, ignoring.",
613 d);
b5e99f23 614
6f7f3a33 615 dn = strjoina(temporary_mount, d);
b5e99f23 616
b2a60844 617 /* First, try to create device node properly */
16498617
CB
618 if (*make_devnode) {
619 mac_selinux_create_file_prepare(d, st.st_mode);
620 r = mknod(dn, st.st_mode, st.st_rdev);
621 mac_selinux_create_file_clear();
b2a60844
LP
622 if (r >= 0)
623 goto add_symlink;
16498617
CB
624 if (errno != EPERM)
625 return log_debug_errno(errno, "mknod failed for %s: %m", d);
626
b2a60844 627 /* This didn't work, let's not try this again for the next iterations. */
16498617
CB
628 *make_devnode = false;
629 }
630
631 /* We're about to fallback to bind-mounting the device
1acf344d
CG
632 * node. So create a dummy bind-mount target.
633 * Do not prepare device-node SELinux label (see issue 13762) */
16498617 634 r = mknod(dn, S_IFREG, 0);
16498617 635 if (r < 0 && errno != EEXIST)
b2a60844 636 return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
16498617
CB
637
638 /* Fallback to bind-mounting:
639 * The assumption here is that all used device nodes carry standard
640 * properties. Specifically, the devices nodes we bind-mount should
641 * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
642 * and should not carry ACLs. */
643 if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
b2a60844
LP
644 return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d);
645
646add_symlink:
647 bn = path_startswith(d, "/dev/");
648 if (!bn)
649 return 0;
650
651 /* Create symlinks like /dev/char/1:9 → ../urandom */
cbc056c8
ZJS
652 if (asprintf(&sl, "%s/dev/%s/%u:%u",
653 temporary_mount,
654 S_ISCHR(st.st_mode) ? "char" : "block",
655 major(st.st_rdev), minor(st.st_rdev)) < 0)
b2a60844
LP
656 return log_oom();
657
658 (void) mkdir_parents(sl, 0755);
659
660 t = strjoina("../", bn);
b2a60844 661 if (symlink(t, sl) < 0)
2e4a4fae 662 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
b5e99f23 663
af984e13 664 return 0;
b5e99f23
ДГ
665}
666
5d997827 667static int mount_private_dev(MountEntry *m) {
7f112f50
LP
668 static const char devnodes[] =
669 "/dev/null\0"
670 "/dev/zero\0"
671 "/dev/full\0"
672 "/dev/random\0"
673 "/dev/urandom\0"
674 "/dev/tty\0";
675
2b85f4e1 676 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 677 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
16498617 678 bool can_mknod = true;
7f112f50
LP
679 _cleanup_umask_ mode_t u;
680 int r;
681
682 assert(m);
683
684 u = umask(0000);
685
2b85f4e1 686 if (!mkdtemp(temporary_mount))
2e4a4fae 687 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
2b85f4e1 688
63c372cb 689 dev = strjoina(temporary_mount, "/dev");
dc751688 690 (void) mkdir(dev, 0755);
7d85383e 691 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV) < 0) {
2e4a4fae 692 r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
2b85f4e1
LP
693 goto fail;
694 }
c3151977
TM
695 r = label_fix_container(dev, "/dev", 0);
696 if (r < 0) {
697 log_debug_errno(errno, "Failed to fix label of '%s' as /dev: %m", dev);
698 goto fail;
699 }
2b85f4e1 700
63c372cb 701 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 702 (void) mkdir(devpts, 0755);
2b85f4e1 703 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
2e4a4fae 704 r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts);
2b85f4e1
LP
705 goto fail;
706 }
707
2e4a4fae
YW
708 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
709 * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
710 * Thus, in that case make a clone.
711 * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
36ce7110 712 r = is_symlink("/dev/ptmx");
2e4a4fae
YW
713 if (r < 0) {
714 log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
3164e3cb 715 goto fail;
2e4a4fae 716 } else if (r > 0) {
414b304b
ДГ
717 devptmx = strjoina(temporary_mount, "/dev/ptmx");
718 if (symlink("pts/ptmx", devptmx) < 0) {
2e4a4fae 719 r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
414b304b
ДГ
720 goto fail;
721 }
722 } else {
16498617 723 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
152c475f
LP
724 if (r < 0)
725 goto fail;
414b304b 726 }
e06b6479 727
63c372cb 728 devshm = strjoina(temporary_mount, "/dev/shm");
8d953682 729 (void) mkdir(devshm, 0755);
2b85f4e1
LP
730 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
731 if (r < 0) {
2e4a4fae 732 r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm);
2b85f4e1
LP
733 goto fail;
734 }
735
63c372cb 736 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 737 (void) mkdir(devmqueue, 0755);
2e4a4fae
YW
738 if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0)
739 log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue);
2b85f4e1 740
63c372cb 741 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 742 (void) mkdir(devhugepages, 0755);
2e4a4fae
YW
743 if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0)
744 log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages);
2b85f4e1 745
63c372cb 746 devlog = strjoina(temporary_mount, "/dev/log");
2e4a4fae
YW
747 if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
748 log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
82d25240 749
7f112f50 750 NULSTR_FOREACH(d, devnodes) {
16498617 751 r = clone_device_node(d, temporary_mount, &can_mknod);
37b22b3b 752 /* ENXIO means the *source* is not a device file, skip creation in that case */
af984e13 753 if (r < 0 && r != -ENXIO)
2b85f4e1 754 goto fail;
7f112f50
LP
755 }
756
2e4a4fae
YW
757 r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
758 if (r < 0)
105a1a36 759 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
7f112f50 760
ee818b89
AC
761 /* Create the /dev directory if missing. It is more likely to be
762 * missing when the service is started with RootDirectory. This is
763 * consistent with mount units creating the mount points when missing.
764 */
34de407a 765 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 766
9e5f8252 767 /* Unmount everything in old /dev */
2e4a4fae
YW
768 r = umount_recursive(mount_entry_path(m), 0);
769 if (r < 0)
770 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
771
34de407a 772 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2e4a4fae 773 r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m));
2b85f4e1
LP
774 goto fail;
775 }
7f112f50 776
1019a48f
LP
777 (void) rmdir(dev);
778 (void) rmdir(temporary_mount);
7f112f50 779
2b85f4e1 780 return 0;
7f112f50 781
2b85f4e1
LP
782fail:
783 if (devpts)
1019a48f 784 (void) umount(devpts);
7f112f50 785
2b85f4e1 786 if (devshm)
1019a48f 787 (void) umount(devshm);
7f112f50 788
2b85f4e1 789 if (devhugepages)
1019a48f 790 (void) umount(devhugepages);
7f112f50 791
2b85f4e1 792 if (devmqueue)
1019a48f 793 (void) umount(devmqueue);
7f112f50 794
1019a48f
LP
795 (void) umount(dev);
796 (void) rmdir(dev);
797 (void) rmdir(temporary_mount);
7f112f50 798
2b85f4e1 799 return r;
7f112f50
LP
800}
801
2a2969fd 802static int mount_bind_dev(const MountEntry *m) {
5d997827
LP
803 int r;
804
805 assert(m);
806
807 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
808 * /dev. This is only used when RootDirectory= is set. */
809
645767d6
LP
810 (void) mkdir_p_label(mount_entry_path(m), 0755);
811
5d997827
LP
812 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
813 if (r < 0)
814 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
815 if (r > 0) /* make this a NOP if /dev is already a mount point */
816 return 0;
817
818 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
819 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
820
821 return 1;
822}
823
2a2969fd 824static int mount_sysfs(const MountEntry *m) {
5d997827
LP
825 int r;
826
827 assert(m);
828
645767d6
LP
829 (void) mkdir_p_label(mount_entry_path(m), 0755);
830
5d997827
LP
831 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
832 if (r < 0)
833 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
834 if (r > 0) /* make this a NOP if /sys is already a mount point */
835 return 0;
836
837 /* Bind mount the host's version so that we get all child mounts of it, too. */
838 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
839 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
840
841 return 1;
842}
843
2a2969fd 844static int mount_procfs(const MountEntry *m) {
5d997827
LP
845 int r;
846
847 assert(m);
848
645767d6
LP
849 (void) mkdir_p_label(mount_entry_path(m), 0755);
850
5d997827
LP
851 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
852 if (r < 0)
853 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
854 if (r > 0) /* make this a NOP if /proc is already a mount point */
855 return 0;
856
857 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
858 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
859 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
860
861 return 1;
862}
863
2abd4e38 864static int mount_tmpfs(const MountEntry *m) {
6c47cd7d
LP
865 assert(m);
866
2abd4e38 867 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
6c47cd7d
LP
868
869 (void) mkdir_p_label(mount_entry_path(m), 0755);
870 (void) umount_recursive(mount_entry_path(m), 0);
871
2abd4e38 872 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
6c47cd7d
LP
873 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
874
875 return 1;
876}
877
088696fe 878static int follow_symlink(
d2d6c096 879 const char *root_directory,
088696fe 880 MountEntry *m) {
d2d6c096 881
088696fe 882 _cleanup_free_ char *target = NULL;
8fceda93
LP
883 int r;
884
088696fe
LP
885 /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
886 * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
887 * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
888 * end and already have a fully normalized name. */
8fceda93 889
a5648b80 890 r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
088696fe
LP
891 if (r < 0)
892 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
893 if (r > 0) /* Reached the end, nothing more to resolve */
894 return 1;
8fceda93 895
baaa35ad
ZJS
896 if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
897 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
898 "Symlink loop on '%s'.",
899 mount_entry_path(m));
8fceda93 900
088696fe 901 log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
8fceda93 902
088696fe
LP
903 free_and_replace(m->path_malloc, target);
904 m->has_prefix = true;
8fceda93 905
088696fe
LP
906 m->n_followed ++;
907
908 return 0;
8fceda93
LP
909}
910
ac0930c8 911static int apply_mount(
8fceda93 912 const char *root_directory,
89bd586c 913 MountEntry *m) {
ac0930c8 914
e5f10caf 915 _cleanup_free_ char *inaccessible = NULL;
a227a4be 916 bool rbind = true, make = false;
15ae422b 917 const char *what;
15ae422b 918 int r;
15ae422b 919
c17ec25e 920 assert(m);
15ae422b 921
34de407a 922 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 923
c17ec25e 924 switch (m->mode) {
15ae422b 925
160cfdbe 926 case INACCESSIBLE: {
e5f10caf
AZ
927 _cleanup_free_ char *tmp = NULL;
928 const char *runtime_dir;
160cfdbe 929 struct stat target;
6d313367
LP
930
931 /* First, get rid of everything that is below if there
932 * is anything... Then, overmount it with an
c4b41707 933 * inaccessible path. */
34de407a 934 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 935
088696fe
LP
936 if (lstat(mount_entry_path(m), &target) < 0) {
937 if (errno == ENOENT && m->ignore)
938 return 0;
939
cbc056c8
ZJS
940 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
941 mount_entry_path(m));
088696fe 942 }
15ae422b 943
e5f10caf 944 if (geteuid() == 0)
48b747fa 945 runtime_dir = "/run";
e5f10caf 946 else {
48b747fa
LP
947 if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
948 return -ENOMEM;
e5f10caf
AZ
949
950 runtime_dir = tmp;
951 }
952
953 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
954 if (r < 0)
baaa35ad
ZJS
955 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
956 "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
e5f10caf 957 what = inaccessible;
c4b41707 958 break;
160cfdbe 959 }
fe3c2583 960
15ae422b 961 case READONLY:
15ae422b 962 case READWRITE:
1e05071d 963 case READWRITE_IMPLICIT:
8fceda93 964 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
088696fe
LP
965 if (r == -ENOENT && m->ignore)
966 return 0;
d944dc95 967 if (r < 0)
cbc056c8
ZJS
968 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
969 mount_entry_path(m));
970 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
971 * bit for the mount point if needed. */
6b7c9f8b 972 return 0;
6b7c9f8b 973 /* This isn't a mount point yet, let's make it one. */
34de407a 974 what = mount_entry_path(m);
6b7c9f8b 975 break;
15ae422b 976
d2d6c096
LP
977 case BIND_MOUNT:
978 rbind = false;
d2d6c096 979
4831981d 980 _fallthrough_;
088696fe
LP
981 case BIND_MOUNT_RECURSIVE: {
982 _cleanup_free_ char *chased = NULL;
5d997827 983
cbc056c8
ZJS
984 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
985 * that bind mount source paths are always relative to the host root, hence we pass NULL as
986 * root directory to chase_symlinks() here. */
088696fe 987
a5648b80 988 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
088696fe
LP
989 if (r == -ENOENT && m->ignore) {
990 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
991 return 0;
992 }
993 if (r < 0)
994 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
995
996 log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
997
998 free_and_replace(m->source_malloc, chased);
d2d6c096
LP
999
1000 what = mount_entry_source(m);
a227a4be 1001 make = true;
d2d6c096 1002 break;
088696fe 1003 }
d2d6c096 1004
6c47cd7d 1005 case EMPTY_DIR:
2abd4e38
YW
1006 case TMPFS:
1007 return mount_tmpfs(m);
6c47cd7d 1008
ac0930c8 1009 case PRIVATE_TMP:
89bd586c 1010 what = mount_entry_source(m);
a227a4be 1011 make = true;
15ae422b 1012 break;
e364ad06 1013
d6797c92 1014 case PRIVATE_DEV:
5d997827
LP
1015 return mount_private_dev(m);
1016
1017 case BIND_DEV:
1018 return mount_bind_dev(m);
1019
1020 case SYSFS:
1021 return mount_sysfs(m);
1022
1023 case PROCFS:
1024 return mount_procfs(m);
d6797c92 1025
e364ad06
LP
1026 default:
1027 assert_not_reached("Unknown mode");
15ae422b
LP
1028 }
1029
ac0930c8 1030 assert(what);
15ae422b 1031
a227a4be
LP
1032 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
1033 bool try_again = false;
1034 r = -errno;
1035
1036 if (r == -ENOENT && make) {
1037 struct stat st;
1038
cbc056c8
ZJS
1039 /* Hmm, either the source or the destination are missing. Let's see if we can create
1040 the destination, then try again. */
a227a4be 1041
e8717862 1042 if (stat(what, &st) < 0)
5dc60faa 1043 log_error_errno(errno, "Mount point source '%s' is not accessible: %m", what);
e8717862
LP
1044 else {
1045 int q;
a227a4be
LP
1046
1047 (void) mkdir_parents(mount_entry_path(m), 0755);
1048
1049 if (S_ISDIR(st.st_mode))
e8717862 1050 q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
a227a4be 1051 else
e8717862
LP
1052 q = touch(mount_entry_path(m));
1053
1054 if (q < 0)
cbc056c8
ZJS
1055 log_error_errno(q, "Failed to create destination mount point node '%s': %m",
1056 mount_entry_path(m));
e8717862
LP
1057 else
1058 try_again = true;
a227a4be
LP
1059 }
1060 }
1061
1062 if (try_again) {
1063 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
1064 r = -errno;
1065 else
1066 r = 0;
1067 }
1068
1069 if (r < 0)
5dc60faa 1070 return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
a227a4be 1071 }
6b7c9f8b 1072
34de407a 1073 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 1074 return 0;
ac0930c8 1075}
15ae422b 1076
6b000af4 1077static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
9ce4e4b0 1078 unsigned long new_flags = 0, flags_mask = 0;
763a260a 1079 bool submounts = false;
6b7c9f8b 1080 int r = 0;
15ae422b 1081
c17ec25e 1082 assert(m);
ac9de0b3 1083 assert(proc_self_mountinfo);
ac0930c8 1084
9ce4e4b0
LP
1085 if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
1086 new_flags |= MS_RDONLY;
1087 flags_mask |= MS_RDONLY;
1088 }
1089
1090 if (m->nosuid) {
1091 new_flags |= MS_NOSUID;
1092 flags_mask |= MS_NOSUID;
1093 }
1094
1095 if (flags_mask == 0) /* No Change? */
6b7c9f8b
LP
1096 return 0;
1097
9ce4e4b0
LP
1098 /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1099 * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1100 * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
1101 * and running Linux <= 4.17. */
1102 submounts =
1103 mount_entry_read_only(m) &&
1104 !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1105 if (submounts)
6b000af4 1106 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
9ce4e4b0 1107 else
7cce68e1 1108 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
9ce4e4b0 1109
867189b5
LP
1110 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1111 * read-only already stays this way. This improves compatibility with container managers, where we
1112 * won't attempt to undo read-only mounts already applied. */
ac0930c8 1113
8fceda93 1114 if (r == -ENOENT && m->ignore)
867189b5 1115 return 0;
763a260a 1116 if (r < 0)
9ce4e4b0 1117 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
763a260a 1118 submounts ? " and its submounts" : "");
763a260a 1119 return 0;
d944dc95
LP
1120}
1121
9b68367b 1122static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
5d997827
LP
1123 assert(ns_info);
1124
9c988f93
DH
1125 /*
1126 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1127 * since to protect the API VFS mounts, they need to be around in the
9b68367b 1128 * first place...
9c988f93 1129 */
5d997827 1130
9b68367b
YW
1131 return ns_info->mount_apivfs ||
1132 ns_info->protect_control_groups ||
1133 ns_info->protect_kernel_tunables;
5d997827
LP
1134}
1135
da6053d0 1136static size_t namespace_calculate_mounts(
bb0ff3fb 1137 const NamespaceInfo *ns_info,
2652c6c1
DH
1138 char** read_write_paths,
1139 char** read_only_paths,
1140 char** inaccessible_paths,
6c47cd7d 1141 char** empty_directories,
da6053d0
LP
1142 size_t n_bind_mounts,
1143 size_t n_temporary_filesystems,
2652c6c1
DH
1144 const char* tmp_dir,
1145 const char* var_tmp_dir,
91dd5f7c 1146 const char* log_namespace,
2652c6c1
DH
1147 ProtectHome protect_home,
1148 ProtectSystem protect_system) {
1149
da6053d0
LP
1150 size_t protect_home_cnt;
1151 size_t protect_system_cnt =
f471b2af
DH
1152 (protect_system == PROTECT_SYSTEM_STRICT ?
1153 ELEMENTSOF(protect_system_strict_table) :
1154 ((protect_system == PROTECT_SYSTEM_FULL) ?
1155 ELEMENTSOF(protect_system_full_table) :
1156 ((protect_system == PROTECT_SYSTEM_YES) ?
1157 ELEMENTSOF(protect_system_yes_table) : 0)));
1158
b6c432ca
DH
1159 protect_home_cnt =
1160 (protect_home == PROTECT_HOME_YES ?
1161 ELEMENTSOF(protect_home_yes_table) :
1162 ((protect_home == PROTECT_HOME_READ_ONLY) ?
e4da7d8c
YW
1163 ELEMENTSOF(protect_home_read_only_table) :
1164 ((protect_home == PROTECT_HOME_TMPFS) ?
1165 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
b6c432ca 1166
2652c6c1
DH
1167 return !!tmp_dir + !!var_tmp_dir +
1168 strv_length(read_write_paths) +
1169 strv_length(read_only_paths) +
1170 strv_length(inaccessible_paths) +
6c47cd7d 1171 strv_length(empty_directories) +
d2d6c096 1172 n_bind_mounts +
2abd4e38 1173 n_temporary_filesystems +
c575770b
DH
1174 ns_info->private_dev +
1175 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
c575770b 1176 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
94a7b275
KK
1177 (ns_info->protect_kernel_logs ? ELEMENTSOF(protect_kernel_logs_table) : 0) +
1178 (ns_info->protect_control_groups ? 1 : 0) +
5d997827 1179 protect_home_cnt + protect_system_cnt +
aecd5ac6 1180 (ns_info->protect_hostname ? 2 : 0) +
91dd5f7c
LP
1181 (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0) +
1182 !!log_namespace;
2652c6c1
DH
1183}
1184
da6053d0 1185static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
9b68367b 1186 assert(root_directory);
f8b64b57
LP
1187 assert(n_mounts);
1188 assert(mounts || *n_mounts == 0);
1189
93bab288 1190 typesafe_qsort(mounts, *n_mounts, mount_path_compare);
f8b64b57
LP
1191
1192 drop_duplicates(mounts, n_mounts);
1193 drop_outside_root(root_directory, mounts, n_mounts);
1194 drop_inaccessible(mounts, n_mounts);
1195 drop_nop(mounts, n_mounts);
1196}
1197
c8c535d5
LP
1198static bool root_read_only(
1199 char **read_only_paths,
1200 ProtectSystem protect_system) {
1201
1202 /* Determine whether the root directory is going to be read-only given the configured settings. */
1203
1204 if (protect_system == PROTECT_SYSTEM_STRICT)
1205 return true;
1206
de46b2be 1207 if (prefixed_path_strv_contains(read_only_paths, "/"))
c8c535d5
LP
1208 return true;
1209
1210 return false;
1211}
1212
1213static bool home_read_only(
1214 char** read_only_paths,
1215 char** inaccessible_paths,
1216 char** empty_directories,
1217 const BindMount *bind_mounts,
1218 size_t n_bind_mounts,
1219 const TemporaryFileSystem *temporary_filesystems,
1220 size_t n_temporary_filesystems,
1221 ProtectHome protect_home) {
1222
1223 size_t i;
1224
1225 /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
1226 * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
1227 * settings. */
1228
1229 if (protect_home != PROTECT_HOME_NO)
1230 return true;
1231
de46b2be
TM
1232 if (prefixed_path_strv_contains(read_only_paths, "/home") ||
1233 prefixed_path_strv_contains(inaccessible_paths, "/home") ||
1234 prefixed_path_strv_contains(empty_directories, "/home"))
c8c535d5
LP
1235 return true;
1236
1237 for (i = 0; i < n_temporary_filesystems; i++)
1238 if (path_equal(temporary_filesystems[i].path, "/home"))
1239 return true;
1240
1241 /* If /home is overmounted with some dir from the host it's not writable. */
1242 for (i = 0; i < n_bind_mounts; i++)
1243 if (path_equal(bind_mounts[i].destination, "/home"))
1244 return true;
1245
1246 return false;
1247}
1248
613b411c 1249int setup_namespace(
ee818b89 1250 const char* root_directory,
915e6d16 1251 const char* root_image,
bb0ff3fb 1252 const NamespaceInfo *ns_info,
2a624c36
AP
1253 char** read_write_paths,
1254 char** read_only_paths,
1255 char** inaccessible_paths,
6c47cd7d 1256 char** empty_directories,
d2d6c096 1257 const BindMount *bind_mounts,
da6053d0 1258 size_t n_bind_mounts,
2abd4e38 1259 const TemporaryFileSystem *temporary_filesystems,
da6053d0 1260 size_t n_temporary_filesystems,
a004cb4c
LP
1261 const char* tmp_dir,
1262 const char* var_tmp_dir,
91dd5f7c 1263 const char *log_namespace,
1b8689f9
LP
1264 ProtectHome protect_home,
1265 ProtectSystem protect_system,
915e6d16 1266 unsigned long mount_flags,
0389f4fa
LB
1267 const void *root_hash,
1268 size_t root_hash_size,
1269 const char *root_hash_path,
d4d55b0d
LB
1270 const void *root_hash_sig,
1271 size_t root_hash_sig_size,
1272 const char *root_hash_sig_path,
0389f4fa 1273 const char *root_verity,
7cc5ef5f
ZJS
1274 DissectImageFlags dissect_image_flags,
1275 char **error_path) {
15ae422b 1276
915e6d16 1277 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 1278 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 1279 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
0389f4fa 1280 _cleanup_free_ void *root_hash_decoded = NULL;
c2923fdc 1281 _cleanup_free_ char *verity_data = NULL, *hash_sig_path = NULL;
5f7a690a 1282 MountEntry *m = NULL, *mounts = NULL;
0389f4fa 1283 size_t n_mounts;
d18aff04 1284 bool require_prefix = false;
9b68367b 1285 const char *root;
c17ec25e 1286 int r = 0;
15ae422b 1287
915e6d16
LP
1288 assert(ns_info);
1289
613b411c 1290 if (mount_flags == 0)
c17ec25e 1291 mount_flags = MS_SHARED;
ac0930c8 1292
915e6d16
LP
1293 if (root_image) {
1294 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1295
c8c535d5
LP
1296 /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
1297 if (root_read_only(read_only_paths,
1298 protect_system) &&
1299 home_read_only(read_only_paths, inaccessible_paths, empty_directories,
1300 bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems,
1301 protect_home) &&
c9ef8573 1302 strv_isempty(read_write_paths))
915e6d16
LP
1303 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1304
1305 r = loop_device_make_by_path(root_image,
b0a94268 1306 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
e08f94ac 1307 LO_FLAGS_PARTSCAN,
915e6d16
LP
1308 &loop_device);
1309 if (r < 0)
763a260a 1310 return log_debug_errno(r, "Failed to create loop device for root image: %m");
915e6d16 1311
cbc056c8
ZJS
1312 r = verity_metadata_load(root_image,
1313 root_hash_path,
1314 root_hash ? NULL : &root_hash_decoded,
1315 root_hash ? NULL : &root_hash_size,
1316 root_verity ? NULL : &verity_data,
1317 root_hash_sig || root_hash_sig_path ? NULL : &hash_sig_path);
78ebe980 1318 if (r < 0)
763a260a 1319 return log_debug_errno(r, "Failed to load root hash: %m");
0389f4fa 1320 dissect_image_flags |= root_verity || verity_data ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0;
78ebe980 1321
cbc056c8
ZJS
1322 r = dissect_image(loop_device->fd,
1323 root_hash ?: root_hash_decoded,
1324 root_hash_size,
1325 root_verity ?: verity_data,
1326 dissect_image_flags,
1327 &dissected_image);
78ebe980 1328 if (r < 0)
763a260a 1329 return log_debug_errno(r, "Failed to dissect image: %m");
78ebe980 1330
cbc056c8
ZJS
1331 r = dissected_image_decrypt(dissected_image,
1332 NULL,
1333 root_hash ?: root_hash_decoded,
1334 root_hash_size,
1335 root_verity ?: verity_data,
1336 root_hash_sig_path ?: hash_sig_path,
1337 root_hash_sig,
1338 root_hash_sig_size,
1339 dissect_image_flags,
1340 &decrypted_image);
915e6d16 1341 if (r < 0)
763a260a 1342 return log_debug_errno(r, "Failed to decrypt dissected image: %m");
915e6d16
LP
1343 }
1344
e908468b
LP
1345 if (root_directory)
1346 root = root_directory;
0722b359
JS
1347 else {
1348 /* Always create the mount namespace in a temporary directory, instead of operating
1349 * directly in the root. The temporary directory prevents any mounts from being
1350 * potentially obscured my other mounts we already applied.
1351 * We use the same mount point for all images, which is safe, since they all live
1352 * in their own namespaces after all, and hence won't see each other. */
e908468b
LP
1353
1354 root = "/run/systemd/unit-root";
1355 (void) mkdir_label(root, 0700);
d18aff04 1356 require_prefix = true;
0722b359 1357 }
e908468b 1358
cfbeb4ef
LP
1359 n_mounts = namespace_calculate_mounts(
1360 ns_info,
1361 read_write_paths,
1362 read_only_paths,
1363 inaccessible_paths,
6c47cd7d 1364 empty_directories,
f5c52a77 1365 n_bind_mounts,
2abd4e38 1366 n_temporary_filesystems,
cfbeb4ef 1367 tmp_dir, var_tmp_dir,
91dd5f7c 1368 log_namespace,
cfbeb4ef 1369 protect_home, protect_system);
613b411c 1370
f0a4feb0 1371 if (n_mounts > 0) {
5f7a690a
LP
1372 m = mounts = new0(MountEntry, n_mounts);
1373 if (!mounts)
1374 return -ENOMEM;
1375
d18aff04 1376 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1377 if (r < 0)
f0a4feb0 1378 goto finish;
613b411c 1379
d18aff04 1380 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1381 if (r < 0)
f0a4feb0 1382 goto finish;
613b411c 1383
d18aff04 1384 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1385 if (r < 0)
f0a4feb0 1386 goto finish;
7ff7394d 1387
6c47cd7d
LP
1388 r = append_empty_dir_mounts(&m, empty_directories);
1389 if (r < 0)
1390 goto finish;
1391
d2d6c096
LP
1392 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1393 if (r < 0)
1394 goto finish;
1395
2abd4e38
YW
1396 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1397 if (r < 0)
1398 goto finish;
1399
613b411c 1400 if (tmp_dir) {
34de407a 1401 *(m++) = (MountEntry) {
5327c910
LP
1402 .path_const = "/tmp",
1403 .mode = PRIVATE_TMP,
89bd586c 1404 .source_const = tmp_dir,
5327c910 1405 };
613b411c 1406 }
7ff7394d 1407
613b411c 1408 if (var_tmp_dir) {
34de407a 1409 *(m++) = (MountEntry) {
5327c910 1410 .path_const = "/var/tmp",
89bd586c
YW
1411 .mode = PRIVATE_TMP,
1412 .source_const = var_tmp_dir,
5327c910 1413 };
7ff7394d 1414 }
ac0930c8 1415
c575770b 1416 if (ns_info->private_dev) {
34de407a 1417 *(m++) = (MountEntry) {
5327c910
LP
1418 .path_const = "/dev",
1419 .mode = PRIVATE_DEV,
9ce4e4b0 1420 .flags = DEV_MOUNT_OPTIONS,
5327c910 1421 };
7f112f50
LP
1422 }
1423
c575770b 1424 if (ns_info->protect_kernel_tunables) {
cbc056c8
ZJS
1425 r = append_static_mounts(&m,
1426 protect_kernel_tunables_table,
1427 ELEMENTSOF(protect_kernel_tunables_table),
1428 ns_info->ignore_protect_paths);
c575770b 1429 if (r < 0)
f0a4feb0 1430 goto finish;
c575770b
DH
1431 }
1432
1433 if (ns_info->protect_kernel_modules) {
cbc056c8
ZJS
1434 r = append_static_mounts(&m,
1435 protect_kernel_modules_table,
1436 ELEMENTSOF(protect_kernel_modules_table),
1437 ns_info->ignore_protect_paths);
c575770b 1438 if (r < 0)
f0a4feb0 1439 goto finish;
c575770b 1440 }
59eeb84b 1441
94a7b275 1442 if (ns_info->protect_kernel_logs) {
cbc056c8
ZJS
1443 r = append_static_mounts(&m,
1444 protect_kernel_logs_table,
1445 ELEMENTSOF(protect_kernel_logs_table),
1446 ns_info->ignore_protect_paths);
94a7b275
KK
1447 if (r < 0)
1448 goto finish;
1449 }
1450
c575770b 1451 if (ns_info->protect_control_groups) {
34de407a 1452 *(m++) = (MountEntry) {
5327c910
LP
1453 .path_const = "/sys/fs/cgroup",
1454 .mode = READONLY,
1455 };
59eeb84b
LP
1456 }
1457
5327c910 1458 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1459 if (r < 0)
f0a4feb0 1460 goto finish;
417116f2 1461
5327c910 1462 r = append_protect_system(&m, protect_system, false);
f471b2af 1463 if (r < 0)
f0a4feb0 1464 goto finish;
417116f2 1465
9b68367b 1466 if (namespace_info_mount_apivfs(ns_info)) {
cbc056c8
ZJS
1467 r = append_static_mounts(&m,
1468 apivfs_table,
1469 ELEMENTSOF(apivfs_table),
1470 ns_info->ignore_protect_paths);
5d997827
LP
1471 if (r < 0)
1472 goto finish;
1473 }
1474
aecd5ac6
TM
1475 if (ns_info->protect_hostname) {
1476 *(m++) = (MountEntry) {
1477 .path_const = "/proc/sys/kernel/hostname",
1478 .mode = READONLY,
1479 };
1480 *(m++) = (MountEntry) {
1481 .path_const = "/proc/sys/kernel/domainname",
1482 .mode = READONLY,
1483 };
1484 }
1485
91dd5f7c
LP
1486 if (log_namespace) {
1487 _cleanup_free_ char *q;
1488
1489 q = strjoin("/run/systemd/journal.", log_namespace);
1490 if (!q) {
1491 r = -ENOMEM;
1492 goto finish;
1493 }
1494
1495 *(m++) = (MountEntry) {
1496 .path_const = "/run/systemd/journal",
1497 .mode = BIND_MOUNT_RECURSIVE,
1498 .read_only = true,
1499 .source_malloc = TAKE_PTR(q),
1500 };
1501 }
1502
f0a4feb0 1503 assert(mounts + n_mounts == m);
ac0930c8 1504
5327c910 1505 /* Prepend the root directory where that's necessary */
e908468b 1506 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1507 if (r < 0)
1508 goto finish;
1509
839f1877 1510 normalize_mounts(root, mounts, &n_mounts);
15ae422b
LP
1511 }
1512
1beab8b0
LP
1513 /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
1514
d944dc95 1515 if (unshare(CLONE_NEWNS) < 0) {
763a260a 1516 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
1beab8b0 1517 if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
cbc056c8
ZJS
1518 /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
1519 * in place that doesn't allow us to create namespaces (or a missing cap), then
1520 * propagate a recognizable error back, which the caller can use to detect this case
1521 * (and only this) and optionally continue without namespacing applied. */
1beab8b0
LP
1522 r = -ENOANO;
1523
d944dc95
LP
1524 goto finish;
1525 }
1e4e94c8 1526
9b68367b
YW
1527 /* Remount / as SLAVE so that nothing now mounted in the namespace
1528 * shows up in the parent */
1529 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
763a260a 1530 r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
9b68367b 1531 goto finish;
ee818b89
AC
1532 }
1533
915e6d16 1534 if (root_image) {
e908468b 1535 /* A root image is specified, mount it to the right place */
2d3a5a73 1536 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
763a260a
YW
1537 if (r < 0) {
1538 log_debug_errno(r, "Failed to mount root image: %m");
915e6d16 1539 goto finish;
763a260a 1540 }
915e6d16 1541
07ce7407
TM
1542 if (decrypted_image) {
1543 r = decrypted_image_relinquish(decrypted_image);
763a260a
YW
1544 if (r < 0) {
1545 log_debug_errno(r, "Failed to relinquish decrypted image: %m");
07ce7407 1546 goto finish;
763a260a 1547 }
07ce7407 1548 }
78ebe980 1549
915e6d16
LP
1550 loop_device_relinquish(loop_device);
1551
1552 } else if (root_directory) {
1553
e908468b
LP
1554 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1555 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
763a260a
YW
1556 if (r < 0) {
1557 log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
d944dc95 1558 goto finish;
763a260a 1559 }
8f1ad200 1560 if (r == 0) {
e908468b 1561 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
763a260a 1562 r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root);
8f1ad200
LP
1563 goto finish;
1564 }
d944dc95 1565 }
e908468b 1566
9b68367b 1567 } else {
e908468b
LP
1568
1569 /* Let's mount the main root directory to the root directory to use */
1570 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
763a260a 1571 r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root);
e908468b
LP
1572 goto finish;
1573 }
ee818b89 1574 }
c2c13f2d 1575
4e0c20de
LP
1576 /* Try to set up the new root directory before mounting anything else there. */
1577 if (root_image || root_directory)
1578 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1579
f0a4feb0 1580 if (n_mounts > 0) {
ac9de0b3 1581 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b000af4 1582 _cleanup_free_ char **deny_list = NULL;
da6053d0 1583 size_t j;
6b7c9f8b 1584
cbc056c8
ZJS
1585 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
1586 * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
ac9de0b3
TR
1587 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1588 if (!proc_self_mountinfo) {
763a260a 1589 r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
7cc5ef5f
ZJS
1590 if (error_path)
1591 *error_path = strdup("/proc/self/mountinfo");
ac9de0b3
TR
1592 goto finish;
1593 }
1594
088696fe
LP
1595 /* First round, establish all mounts we need */
1596 for (;;) {
1597 bool again = false;
1598
1599 for (m = mounts; m < mounts + n_mounts; ++m) {
1600
1601 if (m->applied)
1602 continue;
1603
1604 r = follow_symlink(root, m);
7cc5ef5f
ZJS
1605 if (r < 0) {
1606 if (error_path && mount_entry_path(m))
1607 *error_path = strdup(mount_entry_path(m));
088696fe 1608 goto finish;
7cc5ef5f 1609 }
088696fe 1610 if (r == 0) {
cbc056c8
ZJS
1611 /* We hit a symlinked mount point. The entry got rewritten and might
1612 * point to a very different place now. Let's normalize the changed
1613 * list, and start from the beginning. After all to mount the entry
1614 * at the new location we might need some other mounts first */
088696fe
LP
1615 again = true;
1616 break;
1617 }
1618
1619 r = apply_mount(root, m);
7cc5ef5f
ZJS
1620 if (r < 0) {
1621 if (error_path && mount_entry_path(m))
1622 *error_path = strdup(mount_entry_path(m));
088696fe 1623 goto finish;
7cc5ef5f 1624 }
088696fe
LP
1625
1626 m->applied = true;
1627 }
1628
1629 if (!again)
1630 break;
1631
839f1877 1632 normalize_mounts(root, mounts, &n_mounts);
c2c13f2d 1633 }
15ae422b 1634
6b000af4
LP
1635 /* Create a deny list we can pass to bind_mount_recursive() */
1636 deny_list = new(char*, n_mounts+1);
1637 if (!deny_list) {
5f7a690a
LP
1638 r = -ENOMEM;
1639 goto finish;
1640 }
f0a4feb0 1641 for (j = 0; j < n_mounts; j++)
6b000af4
LP
1642 deny_list[j] = (char*) mount_entry_path(mounts+j);
1643 deny_list[j] = NULL;
6b7c9f8b
LP
1644
1645 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1646 for (m = mounts; m < mounts + n_mounts; ++m) {
6b000af4 1647 r = make_read_only(m, deny_list, proc_self_mountinfo);
7cc5ef5f
ZJS
1648 if (r < 0) {
1649 if (error_path && mount_entry_path(m))
1650 *error_path = strdup(mount_entry_path(m));
d944dc95 1651 goto finish;
7cc5ef5f 1652 }
c2c13f2d 1653 }
15ae422b
LP
1654 }
1655
9b68367b
YW
1656 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1657 r = mount_move_root(root);
763a260a
YW
1658 if (r < 0) {
1659 log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
9b68367b 1660 goto finish;
763a260a 1661 }
ee818b89 1662
55fe7432 1663 /* Remount / as the desired mode. Note that this will not
c2c13f2d
LP
1664 * reestablish propagation from our side to the host, since
1665 * what's disconnected is disconnected. */
d944dc95 1666 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
763a260a 1667 r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
d944dc95
LP
1668 goto finish;
1669 }
15ae422b 1670
d944dc95 1671 r = 0;
15ae422b 1672
d944dc95 1673finish:
0cd41757
LP
1674 if (n_mounts > 0)
1675 for (m = mounts; m < mounts + n_mounts; m++)
1676 mount_entry_done(m);
613b411c 1677
5f7a690a
LP
1678 free(mounts);
1679
613b411c
LP
1680 return r;
1681}
1682
da6053d0
LP
1683void bind_mount_free_many(BindMount *b, size_t n) {
1684 size_t i;
d2d6c096
LP
1685
1686 assert(b || n == 0);
1687
1688 for (i = 0; i < n; i++) {
1689 free(b[i].source);
1690 free(b[i].destination);
1691 }
1692
1693 free(b);
1694}
1695
da6053d0 1696int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
d2d6c096
LP
1697 _cleanup_free_ char *s = NULL, *d = NULL;
1698 BindMount *c;
1699
1700 assert(b);
1701 assert(n);
1702 assert(item);
1703
1704 s = strdup(item->source);
1705 if (!s)
1706 return -ENOMEM;
1707
1708 d = strdup(item->destination);
1709 if (!d)
1710 return -ENOMEM;
1711
aa484f35 1712 c = reallocarray(*b, *n + 1, sizeof(BindMount));
d2d6c096
LP
1713 if (!c)
1714 return -ENOMEM;
1715
1716 *b = c;
1717
1718 c[(*n) ++] = (BindMount) {
1cc6c93a
YW
1719 .source = TAKE_PTR(s),
1720 .destination = TAKE_PTR(d),
d2d6c096 1721 .read_only = item->read_only,
9ce4e4b0 1722 .nosuid = item->nosuid,
d2d6c096
LP
1723 .recursive = item->recursive,
1724 .ignore_enoent = item->ignore_enoent,
1725 };
1726
d2d6c096
LP
1727 return 0;
1728}
1729
da6053d0
LP
1730void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
1731 size_t i;
2abd4e38
YW
1732
1733 assert(t || n == 0);
1734
1735 for (i = 0; i < n; i++) {
1736 free(t[i].path);
1737 free(t[i].options);
1738 }
1739
1740 free(t);
1741}
1742
1743int temporary_filesystem_add(
1744 TemporaryFileSystem **t,
da6053d0 1745 size_t *n,
2abd4e38
YW
1746 const char *path,
1747 const char *options) {
1748
1749 _cleanup_free_ char *p = NULL, *o = NULL;
1750 TemporaryFileSystem *c;
1751
1752 assert(t);
1753 assert(n);
1754 assert(path);
1755
1756 p = strdup(path);
1757 if (!p)
1758 return -ENOMEM;
1759
1760 if (!isempty(options)) {
1761 o = strdup(options);
1762 if (!o)
1763 return -ENOMEM;
1764 }
1765
aa484f35 1766 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
2abd4e38
YW
1767 if (!c)
1768 return -ENOMEM;
1769
1770 *t = c;
1771
1772 c[(*n) ++] = (TemporaryFileSystem) {
1cc6c93a
YW
1773 .path = TAKE_PTR(p),
1774 .options = TAKE_PTR(o),
2abd4e38
YW
1775 };
1776
2abd4e38
YW
1777 return 0;
1778}
1779
a652f050
JR
1780static int make_tmp_prefix(const char *prefix) {
1781 _cleanup_free_ char *t = NULL;
1782 int r;
1783
1784 /* Don't do anything unless we know the dir is actually missing */
1785 r = access(prefix, F_OK);
1786 if (r >= 0)
1787 return 0;
1788 if (errno != ENOENT)
1789 return -errno;
1790
1791 r = mkdir_parents(prefix, 0755);
1792 if (r < 0)
1793 return r;
1794
1795 r = tempfn_random(prefix, NULL, &t);
1796 if (r < 0)
1797 return r;
1798
1799 if (mkdir(t, 0777) < 0)
1800 return -errno;
1801
1802 if (chmod(t, 01777) < 0) {
1803 r = -errno;
1804 (void) rmdir(t);
1805 return r;
1806 }
1807
1808 if (rename(t, prefix) < 0) {
1809 r = -errno;
1810 (void) rmdir(t);
1811 return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
1812 }
1813
1814 return 0;
1815
1816}
1817
613b411c
LP
1818static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1819 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1820 char bid[SD_ID128_STRING_MAX];
1821 sd_id128_t boot_id;
1822 int r;
613b411c
LP
1823
1824 assert(id);
1825 assert(prefix);
1826 assert(path);
1827
6b46ea73
LP
1828 /* We include the boot id in the directory so that after a
1829 * reboot we can easily identify obsolete directories. */
1830
1831 r = sd_id128_get_boot(&boot_id);
1832 if (r < 0)
1833 return r;
1834
605405c6 1835 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1836 if (!x)
1837 return -ENOMEM;
1838
a652f050
JR
1839 r = make_tmp_prefix(prefix);
1840 if (r < 0)
1841 return r;
1842
613b411c
LP
1843 RUN_WITH_UMASK(0077)
1844 if (!mkdtemp(x))
1845 return -errno;
1846
1847 RUN_WITH_UMASK(0000) {
1848 char *y;
1849
63c372cb 1850 y = strjoina(x, "/tmp");
613b411c
LP
1851
1852 if (mkdir(y, 0777 | S_ISVTX) < 0)
1853 return -errno;
c17ec25e 1854 }
15ae422b 1855
1cc6c93a 1856 *path = TAKE_PTR(x);
613b411c
LP
1857
1858 return 0;
1859}
1860
/* Sets up the pair of private temporary directories for 'id', one below /tmp and one below
 * /var/tmp. The output parameters are only written on success (return value 0). If the second
 * directory cannot be created, the first one is removed again and the error is returned. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back: remove the inner "tmp" directory first, then the directory containing it */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1890
2caa38e9 1891int setup_netns(const int netns_storage_socket[static 2]) {
613b411c 1892 _cleanup_close_ int netns = -1;
3ee897d6 1893 int r, q;
613b411c
LP
1894
1895 assert(netns_storage_socket);
1896 assert(netns_storage_socket[0] >= 0);
1897 assert(netns_storage_socket[1] >= 0);
1898
1899 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1900 * namespace reference fd. Whatever process runs this first
1901 * shall create a new namespace, all others should just join
1902 * it. To serialize that we use a file lock on the socket
1903 * pair.
613b411c
LP
1904 *
1905 * It's a bit crazy, but hey, works great! */
1906
1907 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1908 return -errno;
1909
3ee897d6
LP
1910 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1911 if (netns == -EAGAIN) {
44ffcbae 1912 /* Nothing stored yet, so let's create a new namespace. */
613b411c
LP
1913
1914 if (unshare(CLONE_NEWNET) < 0) {
1915 r = -errno;
1916 goto fail;
1917 }
1918
44ffcbae 1919 (void) loopback_setup();
613b411c
LP
1920
1921 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1922 if (netns < 0) {
1923 r = -errno;
1924 goto fail;
1925 }
1926
1927 r = 1;
613b411c 1928
3ee897d6
LP
1929 } else if (netns < 0) {
1930 r = netns;
1931 goto fail;
613b411c 1932
3ee897d6
LP
1933 } else {
1934 /* Yay, found something, so let's join the namespace */
613b411c
LP
1935 if (setns(netns, CLONE_NEWNET) < 0) {
1936 r = -errno;
1937 goto fail;
1938 }
1939
1940 r = 0;
1941 }
1942
3ee897d6
LP
1943 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1944 if (q < 0) {
1945 r = q;
613b411c
LP
1946 goto fail;
1947 }
1948
1949fail:
fe048ce5 1950 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1951 return r;
1952}
417116f2 1953
2caa38e9 1954int open_netns_path(const int netns_storage_socket[static 2], const char *path) {
51af7fb2
LP
1955 _cleanup_close_ int netns = -1;
1956 int q, r;
1957
1958 assert(netns_storage_socket);
1959 assert(netns_storage_socket[0] >= 0);
1960 assert(netns_storage_socket[1] >= 0);
1961 assert(path);
1962
1963 /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in
1964 * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a
1965 * new anonymous netns if needed. */
1966
1967 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1968 return -errno;
1969
1970 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1971 if (netns == -EAGAIN) {
1972 /* Nothing stored yet. Open the file from the file system. */
1973
1974 netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
1975 if (netns < 0) {
1976 r = -errno;
1977 goto fail;
1978 }
1979
1980 r = fd_is_network_ns(netns);
1981 if (r == 0) { /* Not a netns? Refuse early. */
1982 r = -EINVAL;
1983 goto fail;
1984 }
1985 if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
1986 goto fail;
1987
1988 r = 1;
1989
1990 } else if (netns < 0) {
1991 r = netns;
1992 goto fail;
1993 } else
1994 r = 0; /* Already allocated */
1995
1996 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1997 if (q < 0) {
1998 r = q;
1999 goto fail;
2000 }
2001
2002fail:
2003 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
2004 return r;
2005}
2006
6e2d7c4f
MS
2007bool ns_type_supported(NamespaceType type) {
2008 const char *t, *ns_proc;
2009
0fa5b831
LP
2010 t = namespace_type_to_string(type);
2011 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
2012 return false;
2013
6e2d7c4f 2014 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
2015 return access(ns_proc, F_OK) == 0;
2016}
2017
1b8689f9 2018static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
cbc056c8
ZJS
2019 [PROTECT_HOME_NO] = "no",
2020 [PROTECT_HOME_YES] = "yes",
1b8689f9 2021 [PROTECT_HOME_READ_ONLY] = "read-only",
cbc056c8 2022 [PROTECT_HOME_TMPFS] = "tmpfs",
417116f2
LP
2023};
2024
1e8c7bd5 2025DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
5e1c6154 2026
1b8689f9 2027static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
cbc056c8
ZJS
2028 [PROTECT_SYSTEM_NO] = "no",
2029 [PROTECT_SYSTEM_YES] = "yes",
2030 [PROTECT_SYSTEM_FULL] = "full",
3f815163 2031 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
2032};
2033
1e8c7bd5 2034DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
03c791aa 2035
6e2d7c4f 2036static const char* const namespace_type_table[] = {
cbc056c8 2037 [NAMESPACE_MOUNT] = "mnt",
6e2d7c4f 2038 [NAMESPACE_CGROUP] = "cgroup",
cbc056c8
ZJS
2039 [NAMESPACE_UTS] = "uts",
2040 [NAMESPACE_IPC] = "ipc",
2041 [NAMESPACE_USER] = "user",
2042 [NAMESPACE_PID] = "pid",
2043 [NAMESPACE_NET] = "net",
6e2d7c4f
MS
2044};
2045
2046DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);