]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
Remove /sbin from paths if split-bin is false (#8324)
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
15ae422b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
15ae422b 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <errno.h>
07630cea 22#include <sched.h>
15ae422b 23#include <stdio.h>
07630cea
LP
24#include <string.h>
25#include <sys/mount.h>
15ae422b 26#include <sys/stat.h>
07630cea 27#include <unistd.h>
25e870b5 28#include <linux/fs.h>
15ae422b 29
b5efdb8a 30#include "alloc-util.h"
10404d52 31#include "base-filesystem.h"
7f112f50 32#include "dev-setup.h"
3ffd4af2 33#include "fd-util.h"
d944dc95 34#include "fs-util.h"
e908468b 35#include "label.h"
915e6d16 36#include "loop-util.h"
07630cea
LP
37#include "loopback-setup.h"
38#include "missing.h"
39#include "mkdir.h"
4349cd7c 40#include "mount-util.h"
3ffd4af2 41#include "namespace.h"
07630cea 42#include "path-util.h"
d7b8eec7 43#include "selinux-util.h"
2583fbea 44#include "socket-util.h"
36ce7110 45#include "stat-util.h"
8b43440b 46#include "string-table.h"
07630cea
LP
47#include "string-util.h"
48#include "strv.h"
affb60b1 49#include "umask-util.h"
ee104e11 50#include "user-util.h"
07630cea 51#include "util.h"
15ae422b 52
/* Mount flags of the private /dev tmpfs: used when mount_private_dev() first mounts it, and again (together with
 * MS_REMOUNT|MS_RDONLY) when make_read_only() remounts a PRIVATE_DEV entry. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
54
/* The kind of mount operation a MountEntry requests; apply_mount() dispatches on this value. */
typedef enum MountMode {
        /* This is ordered by priority! mount_path_compare() sorts entries with equal paths by ascending value,
         * and drop_duplicates() keeps only the first of them. */
        INACCESSIBLE,           /* Overmount the path with an inaccessible node of matching file type */
        BIND_MOUNT,             /* Non-recursive bind mount of the entry's source */
        BIND_MOUNT_RECURSIVE,   /* Recursive bind mount of the entry's source */
        PRIVATE_TMP,            /* Recursive bind mount of the entry's source, creating the target if missing */
        PRIVATE_DEV,            /* Handled by mount_private_dev() */
        BIND_DEV,               /* Handled by mount_bind_dev() */
        EMPTY_DIR,              /* Handled by mount_tmpfs() */
        SYSFS,                  /* Handled by mount_sysfs() */
        PROCFS,                 /* Handled by mount_procfs() */
        READONLY,               /* Turn the path into a mount point; made read-only later by make_read_only() */
        READWRITE,              /* Turn the path into a mount point; left writable */
        TMPFS,                  /* Handled by mount_tmpfs() */
} MountMode;
15ae422b 70
/* One entry of the mount table that is assembled, sorted, de-duplicated and finally applied. For each string there
 * is a "_const" and a "_malloc" variant; the accessors mount_entry_path(), mount_entry_source() and
 * mount_entry_options() prefer the "_malloc" one when it is set. mount_entry_done() frees the "_malloc" variants.
 *
 * Note that the static tables in this file initialize the first three members positionally (path_const, mode,
 * ignore), hence the order of these members must not be changed. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Use this instead of 'source_const' if we had to allocate memory */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Use this instead of 'options_const' if we had to allocate memory */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
} MountEntry;
15ae422b 84
5d997827
LP
85/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
86 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
87static const MountEntry apivfs_table[] = {
88 { "/proc", PROCFS, false },
89 { "/dev", BIND_DEV, false },
90 { "/sys", SYSFS, false },
91};
f471b2af 92
11a30cec 93/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 94static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
95 { "/proc/sys", READONLY, false },
96 { "/proc/sysrq-trigger", READONLY, true },
97 { "/proc/latency_stats", READONLY, true },
98 { "/proc/mtrr", READONLY, true },
aa70f38b 99 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
100 { "/proc/acpi", READONLY, true },
101 { "/proc/timer_stats", READONLY, true },
102 { "/proc/asound", READONLY, true },
103 { "/proc/bus", READONLY, true },
104 { "/proc/fs", READONLY, true },
105 { "/proc/irq", READONLY, true },
106 { "/sys", READONLY, false },
107 { "/sys/kernel/debug", READONLY, true },
108 { "/sys/kernel/tracing", READONLY, true },
13a141f0 109 { "/sys/fs/bpf", READONLY, true },
c6232fb0 110 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 111 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
112};
113
c575770b 114/* ProtectKernelModules= option */
34de407a 115static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 116#if HAVE_SPLIT_USR
c6232fb0 117 { "/lib/modules", INACCESSIBLE, true },
c575770b 118#endif
c6232fb0 119 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
120};
121
b6c432ca
DH
122/*
123 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
124 * system should be protected by ProtectSystem=
125 */
34de407a 126static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
127 { "/home", READONLY, true },
128 { "/run/user", READONLY, true },
129 { "/root", READONLY, true },
b6c432ca
DH
130};
131
e4da7d8c
YW
132/* ProtectHome=tmpfs table */
133static const MountEntry protect_home_tmpfs_table[] = {
134 { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
135 { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
136 { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
137};
138
b6c432ca 139/* ProtectHome=yes table */
34de407a 140static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
141 { "/home", INACCESSIBLE, true },
142 { "/run/user", INACCESSIBLE, true },
143 { "/root", INACCESSIBLE, true },
b6c432ca
DH
144};
145
f471b2af 146/* ProtectSystem=yes table */
34de407a 147static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
148 { "/usr", READONLY, false },
149 { "/boot", READONLY, true },
150 { "/efi", READONLY, true },
7486f305
AB
151#if HAVE_SPLIT_USR
152 { "/lib", READONLY, true },
153 { "/lib64", READONLY, true },
154 { "/bin", READONLY, true },
671f0f8d 155# if HAVE_SPLIT_BIN
7486f305 156 { "/sbin", READONLY, true },
671f0f8d 157# endif
7486f305 158#endif
f471b2af
DH
159};
160
161/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 162static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
163 { "/usr", READONLY, false },
164 { "/boot", READONLY, true },
165 { "/efi", READONLY, true },
166 { "/etc", READONLY, false },
7486f305
AB
167#if HAVE_SPLIT_USR
168 { "/lib", READONLY, true },
169 { "/lib64", READONLY, true },
170 { "/bin", READONLY, true },
671f0f8d 171# if HAVE_SPLIT_BIN
7486f305 172 { "/sbin", READONLY, true },
671f0f8d 173# endif
7486f305 174#endif
f471b2af
DH
175};
176
177/*
178 * ProtectSystem=strict table. In this strict mode, we mount everything
179 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
180 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
181 * protect those, and these options should be fully orthogonal.
182 * (And of course /home and friends are also left writable, as ProtectHome=
183 * shall manage those, orthogonally).
184 */
34de407a 185static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
186 { "/", READONLY, false },
187 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
188 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
189 { "/dev", READWRITE, false }, /* PrivateDevices= */
190 { "/home", READWRITE, true }, /* ProtectHome= */
191 { "/run/user", READWRITE, true }, /* ProtectHome= */
192 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
193};
194
34de407a 195static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
196 assert(p);
197
5327c910
LP
198 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
199 * otherwise the stack/static ->path field is returned. */
f0a4feb0 200
5327c910 201 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
202}
203
34de407a 204static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
205 assert(p);
206
207 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
208}
209
d2d6c096
LP
210static const char *mount_entry_source(const MountEntry *p) {
211 assert(p);
212
213 return p->source_malloc ?: p->source_const;
214}
215
2abd4e38
YW
216static const char *mount_entry_options(const MountEntry *p) {
217 assert(p);
218
219 return p->options_malloc ?: p->options_const;
220}
221
1eb7e08e
LP
222static void mount_entry_done(MountEntry *p) {
223 assert(p);
224
225 p->path_malloc = mfree(p->path_malloc);
226 p->source_malloc = mfree(p->source_malloc);
2abd4e38 227 p->options_malloc = mfree(p->options_malloc);
1eb7e08e
LP
228}
229
d18aff04 230static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
231 char **i;
232
613b411c
LP
233 assert(p);
234
5327c910
LP
235 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
236
15ae422b 237 STRV_FOREACH(i, strv) {
5327c910
LP
238 bool ignore = false, needs_prefix = false;
239 const char *e = *i;
15ae422b 240
5327c910
LP
241 /* Look for any prefixes */
242 if (startswith(e, "-")) {
243 e++;
9c94d52e 244 ignore = true;
ea92ae33 245 }
5327c910
LP
246 if (startswith(e, "+")) {
247 e++;
248 needs_prefix = true;
249 }
ea92ae33 250
5327c910 251 if (!path_is_absolute(e))
15ae422b
LP
252 return -EINVAL;
253
34de407a 254 *((*p)++) = (MountEntry) {
5327c910
LP
255 .path_const = e,
256 .mode = mode,
257 .ignore = ignore,
d18aff04 258 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 259 };
15ae422b
LP
260 }
261
262 return 0;
263}
264
6c47cd7d
LP
265static int append_empty_dir_mounts(MountEntry **p, char **strv) {
266 char **i;
267
268 assert(p);
269
270 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
271 * "/private/" boundary directories for DynamicUser=1. */
272
273 STRV_FOREACH(i, strv) {
274
275 *((*p)++) = (MountEntry) {
276 .path_const = *i,
277 .mode = EMPTY_DIR,
278 .ignore = false,
279 .has_prefix = false,
280 .read_only = true,
2abd4e38
YW
281 .options_const = "mode=755",
282 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
6c47cd7d
LP
283 };
284 }
285
286 return 0;
287}
288
d2d6c096
LP
289static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
290 unsigned i;
291
292 assert(p);
293
294 for (i = 0; i < n; i++) {
295 const BindMount *b = binds + i;
296
297 *((*p)++) = (MountEntry) {
298 .path_const = b->destination,
299 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
300 .read_only = b->read_only,
301 .source_const = b->source,
4ca763a9 302 .ignore = b->ignore_enoent,
d2d6c096
LP
303 };
304 }
305
306 return 0;
307}
308
2abd4e38
YW
309static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
310 unsigned i;
311 int r;
312
313 assert(p);
314
315 for (i = 0; i < n; i++) {
316 const TemporaryFileSystem *t = tmpfs + i;
317 _cleanup_free_ char *o = NULL, *str = NULL;
318 unsigned long flags = MS_NODEV|MS_STRICTATIME;
319 bool ro = false;
320
321 if (!path_is_absolute(t->path))
322 return -EINVAL;
323
324 if (!isempty(t->options)) {
325 str = strjoin("mode=0755,", t->options);
326 if (!str)
327 return -ENOMEM;
328
329 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
330 if (r < 0)
331 return r;
332
333 ro = !!(flags & MS_RDONLY);
334 if (ro)
335 flags ^= MS_RDONLY;
336 }
337
338 *((*p)++) = (MountEntry) {
339 .path_const = t->path,
340 .mode = TMPFS,
341 .read_only = ro,
342 .options_malloc = o,
343 .flags = flags,
344 };
345
346 o = NULL;
347 }
348
349 return 0;
350}
351
34de407a 352static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 353 unsigned i;
11a30cec
DH
354
355 assert(p);
f471b2af 356 assert(mounts);
11a30cec 357
5327c910 358 /* Adds a list of static pre-defined entries */
f471b2af 359
5327c910 360 for (i = 0; i < n; i++)
34de407a
LP
361 *((*p)++) = (MountEntry) {
362 .path_const = mount_entry_path(mounts+i),
5327c910
LP
363 .mode = mounts[i].mode,
364 .ignore = mounts[i].ignore || ignore_protect,
365 };
f471b2af
DH
366
367 return 0;
368}
369
34de407a 370static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
371 assert(p);
372
5327c910 373 switch (protect_home) {
b6c432ca 374
5327c910 375 case PROTECT_HOME_NO:
b6c432ca
DH
376 return 0;
377
b6c432ca 378 case PROTECT_HOME_READ_ONLY:
5327c910
LP
379 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
380
e4da7d8c
YW
381 case PROTECT_HOME_TMPFS:
382 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
383
b6c432ca 384 case PROTECT_HOME_YES:
5327c910
LP
385 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
386
b6c432ca 387 default:
5327c910 388 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 389 }
b6c432ca
DH
390}
391
34de407a 392static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
393 assert(p);
394
5327c910
LP
395 switch (protect_system) {
396
397 case PROTECT_SYSTEM_NO:
f471b2af
DH
398 return 0;
399
f471b2af 400 case PROTECT_SYSTEM_STRICT:
5327c910
LP
401 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
402
f471b2af 403 case PROTECT_SYSTEM_YES:
5327c910
LP
404 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
405
f471b2af 406 case PROTECT_SYSTEM_FULL:
5327c910
LP
407 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
408
f471b2af 409 default:
5327c910 410 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 411 }
11a30cec
DH
412}
413
c17ec25e 414static int mount_path_compare(const void *a, const void *b) {
34de407a 415 const MountEntry *p = a, *q = b;
a0827e2b 416 int d;
15ae422b 417
6ee1a919 418 /* If the paths are not equal, then order prefixes first */
34de407a 419 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
420 if (d != 0)
421 return d;
15ae422b 422
6ee1a919
LP
423 /* If the paths are equal, check the mode */
424 if (p->mode < q->mode)
425 return -1;
15ae422b 426
6ee1a919
LP
427 if (p->mode > q->mode)
428 return 1;
15ae422b 429
6ee1a919 430 return 0;
15ae422b
LP
431}
432
34de407a 433static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
434 unsigned i;
435
436 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
437 * that. */
438
439 if (!root_directory)
440 return 0;
441
442 for (i = 0; i < n; i++) {
443 char *s;
444
445 if (m[i].has_prefix)
446 continue;
447
34de407a 448 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
449 if (!s)
450 return -ENOMEM;
451
e282f51f 452 free_and_replace(m[i].path_malloc, s);
5327c910
LP
453 m[i].has_prefix = true;
454 }
455
456 return 0;
457}
458
34de407a
LP
459static void drop_duplicates(MountEntry *m, unsigned *n) {
460 MountEntry *f, *t, *previous;
15ae422b 461
c17ec25e 462 assert(m);
15ae422b 463 assert(n);
15ae422b 464
fe3c2583
LP
465 /* Drops duplicate entries. Expects that the array is properly ordered already. */
466
1d54cd5d 467 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 468
fe3c2583
LP
469 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
470 * above. */
34de407a
LP
471 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
472 log_debug("%s is duplicate.", mount_entry_path(f));
473 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 474 mount_entry_done(f);
15ae422b 475 continue;
fe3c2583 476 }
15ae422b 477
e2d7c1a0 478 *t = *f;
15ae422b 479 previous = t;
fe3c2583
LP
480 t++;
481 }
482
483 *n = t - m;
484}
485
34de407a
LP
486static void drop_inaccessible(MountEntry *m, unsigned *n) {
487 MountEntry *f, *t;
fe3c2583
LP
488 const char *clear = NULL;
489
490 assert(m);
491 assert(n);
492
493 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
494 * ordered already. */
495
1d54cd5d 496 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
497
498 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
499 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
500 if (clear && path_startswith(mount_entry_path(f), clear)) {
501 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 502 mount_entry_done(f);
fe3c2583
LP
503 continue;
504 }
15ae422b 505
34de407a 506 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
507
508 *t = *f;
15ae422b
LP
509 t++;
510 }
511
c17ec25e 512 *n = t - m;
15ae422b
LP
513}
514
34de407a
LP
515static void drop_nop(MountEntry *m, unsigned *n) {
516 MountEntry *f, *t;
7648a565
LP
517
518 assert(m);
519 assert(n);
520
521 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
522 * list is ordered by prefixes. */
523
1d54cd5d 524 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
525
526 /* Only suppress such subtrees for READONLY and READWRITE entries */
527 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 528 MountEntry *p;
7648a565
LP
529 bool found = false;
530
531 /* Now let's find the first parent of the entry we are looking at. */
532 for (p = t-1; p >= m; p--) {
34de407a 533 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
534 found = true;
535 break;
536 }
537 }
538
539 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
540 if (found && p->mode == f->mode) {
34de407a 541 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 542 mount_entry_done(f);
7648a565
LP
543 continue;
544 }
545 }
546
547 *t = *f;
548 t++;
549 }
550
551 *n = t - m;
552}
553
34de407a
LP
554static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
555 MountEntry *f, *t;
cd2902c9
LP
556
557 assert(m);
558 assert(n);
559
1d54cd5d 560 /* Nothing to do */
cd2902c9
LP
561 if (!root_directory)
562 return;
563
564 /* Drops all mounts that are outside of the root directory. */
565
1d54cd5d 566 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 567
34de407a
LP
568 if (!path_startswith(mount_entry_path(f), root_directory)) {
569 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 570 mount_entry_done(f);
cd2902c9
LP
571 continue;
572 }
573
574 *t = *f;
575 t++;
576 }
577
578 *n = t - m;
579}
580
static int clone_device_node(const char *d, const char *temporary_mount) {
        const char *dn;
        struct stat st;
        int r;

        /* Creates a copy of the device node d below temporary_mount, i.e. at the path that results from
         * concatenating temporary_mount and d, using the same file mode and device number.
         *
         * Returns 1 if the node was created, 0 if d does not exist or has a zero device number (in which case
         * nothing is created), -EINVAL if d is neither a block nor a character device, and a negative errno on
         * other failures. */

        if (stat(d, &st) < 0) {
                if (errno == ENOENT)
                        return 0;
                return -errno;
        }

        if (!S_ISBLK(st.st_mode) &&
            !S_ISCHR(st.st_mode))
                return -EINVAL;

        if (st.st_rdev == 0)
                return 0;

        dn = strjoina(temporary_mount, d);

        mac_selinux_create_file_prepare(d, st.st_mode);
        /* Pick up errno right away, before the SELinux cleanup call below gets a chance to overwrite it */
        r = mknod(dn, st.st_mode, st.st_rdev) < 0 ? -errno : 0;
        mac_selinux_create_file_clear();
        if (r < 0)
                return log_debug_errno(r, "mknod failed for %s: %m", d);

        return 1;
}
609
/* Sets up a private /dev for the entry m: a fresh tmpfs is populated in a temporary directory below /tmp and then
 * moved onto the entry's path. It contains bind mounts of the host's /dev/pts and /dev/shm (required) and of
 * /dev/mqueue and /dev/hugepages (best effort), a /dev/ptmx symlink or device node, a /dev/log symlink and clones
 * of the device nodes listed in devnodes[].
 *
 * Returns 0 on success and a negative errno-style error on failure, in which case the temporary mounts and
 * directories are taken down again as far as possible. */
static int mount_private_dev(MountEntry *m) {
        /* NUL-separated list of the device nodes to clone, iterated with NULSTR_FOREACH() below */
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        _cleanup_umask_ mode_t u; /* presumably restores the previous umask when going out of scope — confirm against umask-util.h */
        int r;

        assert(m);

        /* Clear the umask, so that the modes passed to mkdir() and mknod() below are applied as specified */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return -errno;

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                r = -errno;
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
         * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions, making it inaccessible;
         * thus, in that case make a clone.
         *
         * In nspawn and other containers it will be a symlink, in that case make it a symlink.
         */
        r = is_symlink("/dev/ptmx");
        if (r < 0)
                goto fail;
        if (r > 0) {
                devptmx = strjoina(temporary_mount, "/dev/ptmx");
                if (symlink("pts/ptmx", devptmx) < 0) {
                        r = -errno;
                        goto fail;
                }
        } else {
                r = clone_device_node("/dev/ptmx", temporary_mount);
                if (r < 0)
                        goto fail;
                if (r == 0) {
                        /* Nothing was cloned, but we can't do without /dev/ptmx */
                        r = -ENXIO;
                        goto fail;
                }
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 0755);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        /* The following two bind mounts and the /dev/log symlink are best effort: failures are ignored */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);

        devlog = strjoina(temporary_mount, "/dev/log");
        (void) symlink("/run/systemd/journal/dev-log", devlog);

        /* Missing device nodes are fine here (clone_device_node() returns 0 for them), only real errors are
         * fatal */
        NULSTR_FOREACH(d, devnodes) {
                r = clone_device_node(d, temporary_mount);
                if (r < 0)
                        goto fail;
        }

        dev_setup(temporary_mount, UID_INVALID, GID_INVALID);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(mount_entry_path(m), 0755);

        /* Unmount everything in old /dev */
        umount_recursive(mount_entry_path(m), 0);
        if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* The tmpfs has been moved to its final place, remove the now empty temporary directories */
        rmdir(dev);
        rmdir(temporary_mount);

        return 0;

fail:
        /* Take down whatever we might have set up so far. A variable being set only means we got as far as
         * computing the path, not that the mount succeeded, hence errors are ignored here. */
        if (devpts)
                umount(devpts);

        if (devshm)
                umount(devshm);

        if (devhugepages)
                umount(devhugepages);

        if (devmqueue)
                umount(devmqueue);

        umount(dev);
        rmdir(dev);
        rmdir(temporary_mount);

        return r;
}
734
2a2969fd 735static int mount_bind_dev(const MountEntry *m) {
5d997827
LP
736 int r;
737
738 assert(m);
739
740 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
741 * /dev. This is only used when RootDirectory= is set. */
742
645767d6
LP
743 (void) mkdir_p_label(mount_entry_path(m), 0755);
744
5d997827
LP
745 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
746 if (r < 0)
747 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
748 if (r > 0) /* make this a NOP if /dev is already a mount point */
749 return 0;
750
751 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
752 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
753
754 return 1;
755}
756
2a2969fd 757static int mount_sysfs(const MountEntry *m) {
5d997827
LP
758 int r;
759
760 assert(m);
761
645767d6
LP
762 (void) mkdir_p_label(mount_entry_path(m), 0755);
763
5d997827
LP
764 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
765 if (r < 0)
766 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
767 if (r > 0) /* make this a NOP if /sys is already a mount point */
768 return 0;
769
770 /* Bind mount the host's version so that we get all child mounts of it, too. */
771 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
772 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
773
774 return 1;
775}
776
2a2969fd 777static int mount_procfs(const MountEntry *m) {
5d997827
LP
778 int r;
779
780 assert(m);
781
645767d6
LP
782 (void) mkdir_p_label(mount_entry_path(m), 0755);
783
5d997827
LP
784 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
785 if (r < 0)
786 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
787 if (r > 0) /* make this a NOP if /proc is already a mount point */
788 return 0;
789
790 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
791 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
792 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
793
794 return 1;
795}
796
2abd4e38 797static int mount_tmpfs(const MountEntry *m) {
6c47cd7d
LP
798 assert(m);
799
2abd4e38 800 /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
6c47cd7d
LP
801
802 (void) mkdir_p_label(mount_entry_path(m), 0755);
803 (void) umount_recursive(mount_entry_path(m), 0);
804
2abd4e38 805 if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
6c47cd7d
LP
806 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
807
808 return 1;
809}
810
d2d6c096
LP
811static int mount_entry_chase(
812 const char *root_directory,
2a2969fd 813 const MountEntry *m,
d2d6c096 814 const char *path,
4ca763a9 815 bool chase_nonexistent,
d2d6c096
LP
816 char **location) {
817
8fceda93
LP
818 char *chased;
819 int r;
820
821 assert(m);
822
823 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
824 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
825 * that applies). The result is stored in "location". */
8fceda93 826
4ca763a9 827 r = chase_symlinks(path, root_directory, chase_nonexistent ? CHASE_NONEXISTENT : 0, &chased);
8fceda93 828 if (r == -ENOENT && m->ignore) {
d2d6c096 829 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
830 return 0;
831 }
832 if (r < 0)
d2d6c096 833 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 834
d2d6c096 835 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 836
d2d6c096
LP
837 free(*location);
838 *location = chased;
8fceda93
LP
839
840 return 1;
841}
842
ac0930c8 843static int apply_mount(
8fceda93 844 const char *root_directory,
89bd586c 845 MountEntry *m) {
ac0930c8 846
a227a4be 847 bool rbind = true, make = false;
15ae422b 848 const char *what;
15ae422b 849 int r;
15ae422b 850
c17ec25e 851 assert(m);
15ae422b 852
4ca763a9 853 r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
8fceda93
LP
854 if (r <= 0)
855 return r;
856
34de407a 857 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 858
c17ec25e 859 switch (m->mode) {
15ae422b 860
160cfdbe
LP
861 case INACCESSIBLE: {
862 struct stat target;
6d313367
LP
863
864 /* First, get rid of everything that is below if there
865 * is anything... Then, overmount it with an
c4b41707 866 * inaccessible path. */
34de407a 867 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 868
34de407a
LP
869 if (lstat(mount_entry_path(m), &target) < 0)
870 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 871
c4b41707 872 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
873 if (!what) {
874 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
875 return -ELOOP;
876 }
877 break;
160cfdbe 878 }
fe3c2583 879
15ae422b 880 case READONLY:
15ae422b 881 case READWRITE:
8fceda93 882 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 883 if (r < 0)
34de407a 884 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
885 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
886 return 0;
6b7c9f8b 887 /* This isn't a mount point yet, let's make it one. */
34de407a 888 what = mount_entry_path(m);
6b7c9f8b 889 break;
15ae422b 890
d2d6c096
LP
891 case BIND_MOUNT:
892 rbind = false;
d2d6c096 893
4831981d 894 _fallthrough_;
d2d6c096
LP
895 case BIND_MOUNT_RECURSIVE:
896 /* Also chase the source mount */
5d997827 897
4ca763a9 898 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
d2d6c096
LP
899 if (r <= 0)
900 return r;
901
902 what = mount_entry_source(m);
a227a4be 903 make = true;
d2d6c096
LP
904 break;
905
6c47cd7d 906 case EMPTY_DIR:
2abd4e38
YW
907 case TMPFS:
908 return mount_tmpfs(m);
6c47cd7d 909
ac0930c8 910 case PRIVATE_TMP:
89bd586c 911 what = mount_entry_source(m);
a227a4be 912 make = true;
15ae422b 913 break;
e364ad06 914
d6797c92 915 case PRIVATE_DEV:
5d997827
LP
916 return mount_private_dev(m);
917
918 case BIND_DEV:
919 return mount_bind_dev(m);
920
921 case SYSFS:
922 return mount_sysfs(m);
923
924 case PROCFS:
925 return mount_procfs(m);
d6797c92 926
e364ad06
LP
927 default:
928 assert_not_reached("Unknown mode");
15ae422b
LP
929 }
930
ac0930c8 931 assert(what);
15ae422b 932
a227a4be
LP
933 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
934 bool try_again = false;
935 r = -errno;
936
937 if (r == -ENOENT && make) {
938 struct stat st;
939
940 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
941
942 if (stat(what, &st) >= 0) {
943
944 (void) mkdir_parents(mount_entry_path(m), 0755);
945
946 if (S_ISDIR(st.st_mode))
947 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
948 else
949 try_again = touch(mount_entry_path(m)) >= 0;
950 }
951 }
952
953 if (try_again) {
954 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
955 r = -errno;
956 else
957 r = 0;
958 }
959
960 if (r < 0)
961 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
962 }
6b7c9f8b 963
34de407a 964 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 965 return 0;
ac0930c8 966}
15ae422b 967
2a2969fd 968static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 969 int r = 0;
15ae422b 970
c17ec25e 971 assert(m);
ac9de0b3 972 assert(proc_self_mountinfo);
ac0930c8 973
2abd4e38
YW
974 if (mount_entry_read_only(m)) {
975 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
976 /* Make superblock readonly */
977 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
978 r = -errno;
979 } else
980 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
981 } else if (m->mode == PRIVATE_DEV) {
982 /* Superblock can be readonly but the submounts can't */
34de407a 983 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 984 r = -errno;
737ba3c8 985 } else
6b7c9f8b
LP
986 return 0;
987
988 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
989 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
990 * read-only mounts already applied. */
ac0930c8 991
8fceda93
LP
992 if (r == -ENOENT && m->ignore)
993 r = 0;
5327c910 994
1d54cd5d 995 return r;
d944dc95
LP
996}
997
bb0ff3fb 998static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
5d997827
LP
999 assert(ns_info);
1000
9c988f93
DH
1001 /*
1002 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1003 * since to protect the API VFS mounts, they need to be around in the
1004 * first place... and RootDirectory= or RootImage= need to be set.
1005 */
5d997827 1006
9c988f93
DH
1007 /* root_directory should point to a mount point */
1008 return root_directory &&
1009 (ns_info->mount_apivfs ||
1010 ns_info->protect_control_groups ||
1011 ns_info->protect_kernel_tunables);
5d997827
LP
1012}
1013
2652c6c1 1014static unsigned namespace_calculate_mounts(
9c988f93 1015 const char* root_directory,
bb0ff3fb 1016 const NamespaceInfo *ns_info,
2652c6c1
DH
1017 char** read_write_paths,
1018 char** read_only_paths,
1019 char** inaccessible_paths,
6c47cd7d 1020 char** empty_directories,
d2d6c096 1021 unsigned n_bind_mounts,
2abd4e38 1022 unsigned n_temporary_filesystems,
2652c6c1
DH
1023 const char* tmp_dir,
1024 const char* var_tmp_dir,
2652c6c1
DH
1025 ProtectHome protect_home,
1026 ProtectSystem protect_system) {
1027
b6c432ca 1028 unsigned protect_home_cnt;
f471b2af
DH
1029 unsigned protect_system_cnt =
1030 (protect_system == PROTECT_SYSTEM_STRICT ?
1031 ELEMENTSOF(protect_system_strict_table) :
1032 ((protect_system == PROTECT_SYSTEM_FULL) ?
1033 ELEMENTSOF(protect_system_full_table) :
1034 ((protect_system == PROTECT_SYSTEM_YES) ?
1035 ELEMENTSOF(protect_system_yes_table) : 0)));
1036
b6c432ca
DH
1037 protect_home_cnt =
1038 (protect_home == PROTECT_HOME_YES ?
1039 ELEMENTSOF(protect_home_yes_table) :
1040 ((protect_home == PROTECT_HOME_READ_ONLY) ?
e4da7d8c
YW
1041 ELEMENTSOF(protect_home_read_only_table) :
1042 ((protect_home == PROTECT_HOME_TMPFS) ?
1043 ELEMENTSOF(protect_home_tmpfs_table) : 0)));
b6c432ca 1044
2652c6c1
DH
1045 return !!tmp_dir + !!var_tmp_dir +
1046 strv_length(read_write_paths) +
1047 strv_length(read_only_paths) +
1048 strv_length(inaccessible_paths) +
6c47cd7d 1049 strv_length(empty_directories) +
d2d6c096 1050 n_bind_mounts +
2abd4e38 1051 n_temporary_filesystems +
c575770b
DH
1052 ns_info->private_dev +
1053 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1054 (ns_info->protect_control_groups ? 1 : 0) +
1055 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 1056 protect_home_cnt + protect_system_cnt +
9c988f93 1057 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
1058}
1059
613b411c 1060int setup_namespace(
ee818b89 1061 const char* root_directory,
915e6d16 1062 const char* root_image,
bb0ff3fb 1063 const NamespaceInfo *ns_info,
2a624c36
AP
1064 char** read_write_paths,
1065 char** read_only_paths,
1066 char** inaccessible_paths,
6c47cd7d 1067 char** empty_directories,
d2d6c096
LP
1068 const BindMount *bind_mounts,
1069 unsigned n_bind_mounts,
2abd4e38
YW
1070 const TemporaryFileSystem *temporary_filesystems,
1071 unsigned n_temporary_filesystems,
a004cb4c
LP
1072 const char* tmp_dir,
1073 const char* var_tmp_dir,
1b8689f9
LP
1074 ProtectHome protect_home,
1075 ProtectSystem protect_system,
915e6d16
LP
1076 unsigned long mount_flags,
1077 DissectImageFlags dissect_image_flags) {
15ae422b 1078
915e6d16 1079 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 1080 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 1081 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 1082 _cleanup_free_ void *root_hash = NULL;
34de407a 1083 MountEntry *m, *mounts = NULL;
78ebe980 1084 size_t root_hash_size = 0;
d944dc95 1085 bool make_slave = false;
e908468b 1086 const char *root;
f0a4feb0 1087 unsigned n_mounts;
d18aff04 1088 bool require_prefix = false;
c17ec25e 1089 int r = 0;
15ae422b 1090
915e6d16
LP
1091 assert(ns_info);
1092
613b411c 1093 if (mount_flags == 0)
c17ec25e 1094 mount_flags = MS_SHARED;
ac0930c8 1095
915e6d16
LP
1096 if (root_image) {
1097 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1098
1099 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1100 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1101
1102 r = loop_device_make_by_path(root_image,
1103 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1104 &loop_device);
1105 if (r < 0)
1106 return r;
1107
78ebe980
LP
1108 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1109 if (r < 0)
1110 return r;
1111
1112 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1113 if (r < 0)
1114 return r;
1115
1116 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
1117 if (r < 0)
1118 return r;
915e6d16
LP
1119 }
1120
e908468b
LP
1121 if (root_directory)
1122 root = root_directory;
2abd4e38 1123 else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
e908468b
LP
1124
1125 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1126 * the same mount point for all images, which is safe, since they all live in their own namespaces
1127 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1128 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1129 * while we are applying them. */
1130
1131 root = "/run/systemd/unit-root";
1132 (void) mkdir_label(root, 0700);
d18aff04 1133 require_prefix = true;
e908468b
LP
1134 } else
1135 root = NULL;
1136
cfbeb4ef 1137 n_mounts = namespace_calculate_mounts(
e908468b 1138 root,
cfbeb4ef
LP
1139 ns_info,
1140 read_write_paths,
1141 read_only_paths,
1142 inaccessible_paths,
6c47cd7d 1143 empty_directories,
f5c52a77 1144 n_bind_mounts,
2abd4e38 1145 n_temporary_filesystems,
cfbeb4ef
LP
1146 tmp_dir, var_tmp_dir,
1147 protect_home, protect_system);
613b411c 1148
2652c6c1 1149 /* Set mount slave mode */
e908468b 1150 if (root || n_mounts > 0)
d944dc95
LP
1151 make_slave = true;
1152
f0a4feb0 1153 if (n_mounts > 0) {
34de407a 1154 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
d18aff04 1155 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1156 if (r < 0)
f0a4feb0 1157 goto finish;
613b411c 1158
d18aff04 1159 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1160 if (r < 0)
f0a4feb0 1161 goto finish;
613b411c 1162
d18aff04 1163 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1164 if (r < 0)
f0a4feb0 1165 goto finish;
7ff7394d 1166
6c47cd7d
LP
1167 r = append_empty_dir_mounts(&m, empty_directories);
1168 if (r < 0)
1169 goto finish;
1170
d2d6c096
LP
1171 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1172 if (r < 0)
1173 goto finish;
1174
2abd4e38
YW
1175 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1176 if (r < 0)
1177 goto finish;
1178
613b411c 1179 if (tmp_dir) {
34de407a 1180 *(m++) = (MountEntry) {
5327c910
LP
1181 .path_const = "/tmp",
1182 .mode = PRIVATE_TMP,
89bd586c 1183 .source_const = tmp_dir,
5327c910 1184 };
613b411c 1185 }
7ff7394d 1186
613b411c 1187 if (var_tmp_dir) {
34de407a 1188 *(m++) = (MountEntry) {
5327c910 1189 .path_const = "/var/tmp",
89bd586c
YW
1190 .mode = PRIVATE_TMP,
1191 .source_const = var_tmp_dir,
5327c910 1192 };
7ff7394d 1193 }
ac0930c8 1194
c575770b 1195 if (ns_info->private_dev) {
34de407a 1196 *(m++) = (MountEntry) {
5327c910
LP
1197 .path_const = "/dev",
1198 .mode = PRIVATE_DEV,
1199 };
7f112f50
LP
1200 }
1201
c575770b 1202 if (ns_info->protect_kernel_tunables) {
5327c910 1203 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1204 if (r < 0)
f0a4feb0 1205 goto finish;
c575770b
DH
1206 }
1207
1208 if (ns_info->protect_kernel_modules) {
5327c910 1209 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1210 if (r < 0)
f0a4feb0 1211 goto finish;
c575770b 1212 }
59eeb84b 1213
c575770b 1214 if (ns_info->protect_control_groups) {
34de407a 1215 *(m++) = (MountEntry) {
5327c910
LP
1216 .path_const = "/sys/fs/cgroup",
1217 .mode = READONLY,
1218 };
59eeb84b
LP
1219 }
1220
5327c910 1221 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1222 if (r < 0)
f0a4feb0 1223 goto finish;
417116f2 1224
5327c910 1225 r = append_protect_system(&m, protect_system, false);
f471b2af 1226 if (r < 0)
f0a4feb0 1227 goto finish;
417116f2 1228
e908468b 1229 if (namespace_info_mount_apivfs(root, ns_info)) {
5d997827
LP
1230 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1231 if (r < 0)
1232 goto finish;
1233 }
1234
f0a4feb0 1235 assert(mounts + n_mounts == m);
ac0930c8 1236
5327c910 1237 /* Prepend the root directory where that's necessary */
e908468b 1238 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1239 if (r < 0)
1240 goto finish;
1241
34de407a 1242 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1243
f0a4feb0 1244 drop_duplicates(mounts, &n_mounts);
e908468b 1245 drop_outside_root(root, mounts, &n_mounts);
f0a4feb0
DH
1246 drop_inaccessible(mounts, &n_mounts);
1247 drop_nop(mounts, &n_mounts);
15ae422b
LP
1248 }
1249
d944dc95
LP
1250 if (unshare(CLONE_NEWNS) < 0) {
1251 r = -errno;
1252 goto finish;
1253 }
1e4e94c8 1254
d944dc95 1255 if (make_slave) {
c2c13f2d
LP
1256 /* Remount / as SLAVE so that nothing now mounted in the namespace
1257 shows up in the parent */
d944dc95
LP
1258 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1259 r = -errno;
1260 goto finish;
1261 }
ee818b89
AC
1262 }
1263
915e6d16 1264 if (root_image) {
e908468b 1265 /* A root image is specified, mount it to the right place */
2d3a5a73 1266 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
915e6d16
LP
1267 if (r < 0)
1268 goto finish;
1269
07ce7407
TM
1270 if (decrypted_image) {
1271 r = decrypted_image_relinquish(decrypted_image);
1272 if (r < 0)
1273 goto finish;
1274 }
78ebe980 1275
915e6d16
LP
1276 loop_device_relinquish(loop_device);
1277
1278 } else if (root_directory) {
1279
e908468b
LP
1280 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1281 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1282 if (r < 0)
d944dc95 1283 goto finish;
8f1ad200 1284 if (r == 0) {
e908468b 1285 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
8f1ad200
LP
1286 r = -errno;
1287 goto finish;
1288 }
d944dc95 1289 }
e908468b
LP
1290
1291 } else if (root) {
1292
1293 /* Let's mount the main root directory to the root directory to use */
1294 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1295 r = -errno;
1296 goto finish;
1297 }
ee818b89 1298 }
c2c13f2d 1299
4e0c20de
LP
1300 /* Try to set up the new root directory before mounting anything else there. */
1301 if (root_image || root_directory)
1302 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1303
f0a4feb0 1304 if (n_mounts > 0) {
ac9de0b3 1305 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1306 char **blacklist;
1307 unsigned j;
1308
ac9de0b3
TR
1309 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1310 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1311 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1312 if (!proc_self_mountinfo) {
1313 r = -errno;
1314 goto finish;
1315 }
1316
6b7c9f8b 1317 /* First round, add in all special mounts we need */
f0a4feb0 1318 for (m = mounts; m < mounts + n_mounts; ++m) {
89bd586c 1319 r = apply_mount(root, m);
c2c13f2d 1320 if (r < 0)
d944dc95 1321 goto finish;
c2c13f2d 1322 }
15ae422b 1323
6b7c9f8b 1324 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1325 blacklist = newa(char*, n_mounts+1);
1326 for (j = 0; j < n_mounts; j++)
34de407a 1327 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1328 blacklist[j] = NULL;
1329
1330 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1331 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1332 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1333 if (r < 0)
d944dc95 1334 goto finish;
c2c13f2d 1335 }
15ae422b
LP
1336 }
1337
e908468b 1338 if (root) {
ee818b89 1339 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
e908468b 1340 r = mount_move_root(root);
d944dc95
LP
1341 if (r < 0)
1342 goto finish;
ee818b89
AC
1343 }
1344
55fe7432 1345 /* Remount / as the desired mode. Note that this will not
c2c13f2d
LP
1346 * reestablish propagation from our side to the host, since
1347 * what's disconnected is disconnected. */
d944dc95
LP
1348 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1349 r = -errno;
1350 goto finish;
1351 }
15ae422b 1352
d944dc95 1353 r = 0;
15ae422b 1354
d944dc95 1355finish:
f0a4feb0 1356 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1357 mount_entry_done(m);
613b411c
LP
1358
1359 return r;
1360}
1361
d2d6c096
LP
1362void bind_mount_free_many(BindMount *b, unsigned n) {
1363 unsigned i;
1364
1365 assert(b || n == 0);
1366
1367 for (i = 0; i < n; i++) {
1368 free(b[i].source);
1369 free(b[i].destination);
1370 }
1371
1372 free(b);
1373}
1374
1375int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1376 _cleanup_free_ char *s = NULL, *d = NULL;
1377 BindMount *c;
1378
1379 assert(b);
1380 assert(n);
1381 assert(item);
1382
1383 s = strdup(item->source);
1384 if (!s)
1385 return -ENOMEM;
1386
1387 d = strdup(item->destination);
1388 if (!d)
1389 return -ENOMEM;
1390
aa484f35 1391 c = reallocarray(*b, *n + 1, sizeof(BindMount));
d2d6c096
LP
1392 if (!c)
1393 return -ENOMEM;
1394
1395 *b = c;
1396
1397 c[(*n) ++] = (BindMount) {
1398 .source = s,
1399 .destination = d,
1400 .read_only = item->read_only,
1401 .recursive = item->recursive,
1402 .ignore_enoent = item->ignore_enoent,
1403 };
1404
1405 s = d = NULL;
1406 return 0;
1407}
1408
2abd4e38
YW
1409void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1410 unsigned i;
1411
1412 assert(t || n == 0);
1413
1414 for (i = 0; i < n; i++) {
1415 free(t[i].path);
1416 free(t[i].options);
1417 }
1418
1419 free(t);
1420}
1421
1422int temporary_filesystem_add(
1423 TemporaryFileSystem **t,
1424 unsigned *n,
1425 const char *path,
1426 const char *options) {
1427
1428 _cleanup_free_ char *p = NULL, *o = NULL;
1429 TemporaryFileSystem *c;
1430
1431 assert(t);
1432 assert(n);
1433 assert(path);
1434
1435 p = strdup(path);
1436 if (!p)
1437 return -ENOMEM;
1438
1439 if (!isempty(options)) {
1440 o = strdup(options);
1441 if (!o)
1442 return -ENOMEM;
1443 }
1444
aa484f35 1445 c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
2abd4e38
YW
1446 if (!c)
1447 return -ENOMEM;
1448
1449 *t = c;
1450
1451 c[(*n) ++] = (TemporaryFileSystem) {
1452 .path = p,
1453 .options = o,
1454 };
1455
1456 p = o = NULL;
1457 return 0;
1458}
1459
613b411c
LP
1460static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1461 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1462 char bid[SD_ID128_STRING_MAX];
1463 sd_id128_t boot_id;
1464 int r;
613b411c
LP
1465
1466 assert(id);
1467 assert(prefix);
1468 assert(path);
1469
6b46ea73
LP
1470 /* We include the boot id in the directory so that after a
1471 * reboot we can easily identify obsolete directories. */
1472
1473 r = sd_id128_get_boot(&boot_id);
1474 if (r < 0)
1475 return r;
1476
605405c6 1477 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1478 if (!x)
1479 return -ENOMEM;
1480
1481 RUN_WITH_UMASK(0077)
1482 if (!mkdtemp(x))
1483 return -errno;
1484
1485 RUN_WITH_UMASK(0000) {
1486 char *y;
1487
63c372cb 1488 y = strjoina(x, "/tmp");
613b411c
LP
1489
1490 if (mkdir(y, 0777 | S_ISVTX) < 0)
1491 return -errno;
c17ec25e 1492 }
15ae422b 1493
613b411c
LP
1494 *path = x;
1495 x = NULL;
1496
1497 return 0;
1498}
1499
/* Creates the pair of private temporary directories for the given id, one below /tmp and one below /var/tmp.
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir are set to newly allocated paths. If the second
 * directory cannot be created the first one is removed again. On failure a negative errno-style value is
 * returned and neither output parameter is touched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        const char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* The second directory failed: roll back the first one, inner directory first */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1529
1530int setup_netns(int netns_storage_socket[2]) {
1531 _cleanup_close_ int netns = -1;
3ee897d6 1532 int r, q;
613b411c
LP
1533
1534 assert(netns_storage_socket);
1535 assert(netns_storage_socket[0] >= 0);
1536 assert(netns_storage_socket[1] >= 0);
1537
1538 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1539 * namespace reference fd. Whatever process runs this first
1540 * shall create a new namespace, all others should just join
1541 * it. To serialize that we use a file lock on the socket
1542 * pair.
613b411c
LP
1543 *
1544 * It's a bit crazy, but hey, works great! */
1545
1546 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1547 return -errno;
1548
3ee897d6
LP
1549 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1550 if (netns == -EAGAIN) {
613b411c
LP
1551 /* Nothing stored yet, so let's create a new namespace */
1552
1553 if (unshare(CLONE_NEWNET) < 0) {
1554 r = -errno;
1555 goto fail;
1556 }
1557
1558 loopback_setup();
1559
1560 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1561 if (netns < 0) {
1562 r = -errno;
1563 goto fail;
1564 }
1565
1566 r = 1;
613b411c 1567
3ee897d6
LP
1568 } else if (netns < 0) {
1569 r = netns;
1570 goto fail;
613b411c 1571
3ee897d6
LP
1572 } else {
1573 /* Yay, found something, so let's join the namespace */
613b411c
LP
1574 if (setns(netns, CLONE_NEWNET) < 0) {
1575 r = -errno;
1576 goto fail;
1577 }
1578
1579 r = 0;
1580 }
1581
3ee897d6
LP
1582 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1583 if (q < 0) {
1584 r = q;
613b411c
LP
1585 goto fail;
1586 }
1587
1588fail:
fe048ce5 1589 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1590 return r;
1591}
417116f2 1592
6e2d7c4f
MS
1593bool ns_type_supported(NamespaceType type) {
1594 const char *t, *ns_proc;
1595
0fa5b831
LP
1596 t = namespace_type_to_string(type);
1597 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
1598 return false;
1599
6e2d7c4f 1600 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
1601 return access(ns_proc, F_OK) == 0;
1602}
1603
1b8689f9
LP
1604static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1605 [PROTECT_HOME_NO] = "no",
1606 [PROTECT_HOME_YES] = "yes",
1607 [PROTECT_HOME_READ_ONLY] = "read-only",
e4da7d8c 1608 [PROTECT_HOME_TMPFS] = "tmpfs",
417116f2
LP
1609};
1610
1b8689f9
LP
1611DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1612
5e1c6154
YW
1613ProtectHome parse_protect_home_or_bool(const char *s) {
1614 int r;
1615
1616 r = parse_boolean(s);
1617 if (r > 0)
1618 return PROTECT_HOME_YES;
1619 if (r == 0)
1620 return PROTECT_HOME_NO;
1621
1622 return protect_home_from_string(s);
1623}
1624
1b8689f9
LP
1625static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1626 [PROTECT_SYSTEM_NO] = "no",
1627 [PROTECT_SYSTEM_YES] = "yes",
1628 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1629 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1630};
1631
1632DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
6e2d7c4f 1633
03c791aa
YW
1634ProtectSystem parse_protect_system_or_bool(const char *s) {
1635 int r;
1636
1637 r = parse_boolean(s);
1638 if (r > 0)
1639 return PROTECT_SYSTEM_YES;
1640 if (r == 0)
1641 return PROTECT_SYSTEM_NO;
1642
1643 return protect_system_from_string(s);
1644}
1645
6e2d7c4f
MS
1646static const char* const namespace_type_table[] = {
1647 [NAMESPACE_MOUNT] = "mnt",
1648 [NAMESPACE_CGROUP] = "cgroup",
1649 [NAMESPACE_UTS] = "uts",
1650 [NAMESPACE_IPC] = "ipc",
1651 [NAMESPACE_USER] = "user",
1652 [NAMESPACE_PID] = "pid",
1653 [NAMESPACE_NET] = "net",
1654};
1655
1656DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);