]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
nspawn: remove unnecessary mount option parsing logic
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
15ae422b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
15ae422b 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <errno.h>
07630cea 22#include <sched.h>
15ae422b 23#include <stdio.h>
07630cea
LP
24#include <string.h>
25#include <sys/mount.h>
15ae422b 26#include <sys/stat.h>
07630cea 27#include <unistd.h>
25e870b5 28#include <linux/fs.h>
15ae422b 29
b5efdb8a 30#include "alloc-util.h"
10404d52 31#include "base-filesystem.h"
7f112f50 32#include "dev-setup.h"
3ffd4af2 33#include "fd-util.h"
d944dc95 34#include "fs-util.h"
e908468b 35#include "label.h"
915e6d16 36#include "loop-util.h"
07630cea
LP
37#include "loopback-setup.h"
38#include "missing.h"
39#include "mkdir.h"
4349cd7c 40#include "mount-util.h"
3ffd4af2 41#include "namespace.h"
07630cea 42#include "path-util.h"
d7b8eec7 43#include "selinux-util.h"
2583fbea 44#include "socket-util.h"
36ce7110 45#include "stat-util.h"
8b43440b 46#include "string-table.h"
07630cea
LP
47#include "string-util.h"
48#include "strv.h"
affb60b1 49#include "umask-util.h"
ee104e11 50#include "user-util.h"
07630cea 51#include "util.h"
15ae422b 52
/* Mount flags used for the private /dev tmpfs: passed when it is first mounted and again
 * (together with MS_REMOUNT|MS_RDONLY) when it is remounted read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
54
/* The kind of mount a MountEntry describes.
 *
 * The numeric order matters: mount_path_compare() sorts entries for the same path by ascending mode, and
 * drop_duplicates() keeps the first one, so earlier enumerators take priority over later ones. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
} MountMode;
15ae422b 69
34de407a 70typedef struct MountEntry {
5327c910 71 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 72 MountMode mode:5;
5327c910
LP
73 bool ignore:1; /* Ignore if path does not exist? */
74 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 75 bool read_only:1; /* Shall this mount point be read-only? */
55fe7432 76 char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
d2d6c096
LP
77 const char *source_const; /* The source path, for bind mounts */
78 char *source_malloc;
34de407a 79} MountEntry;
15ae422b 80
5d997827
LP
81/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
82 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
83static const MountEntry apivfs_table[] = {
84 { "/proc", PROCFS, false },
85 { "/dev", BIND_DEV, false },
86 { "/sys", SYSFS, false },
87};
f471b2af 88
11a30cec 89/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 90static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
91 { "/proc/sys", READONLY, false },
92 { "/proc/sysrq-trigger", READONLY, true },
93 { "/proc/latency_stats", READONLY, true },
94 { "/proc/mtrr", READONLY, true },
aa70f38b 95 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
96 { "/proc/acpi", READONLY, true },
97 { "/proc/timer_stats", READONLY, true },
98 { "/proc/asound", READONLY, true },
99 { "/proc/bus", READONLY, true },
100 { "/proc/fs", READONLY, true },
101 { "/proc/irq", READONLY, true },
102 { "/sys", READONLY, false },
103 { "/sys/kernel/debug", READONLY, true },
104 { "/sys/kernel/tracing", READONLY, true },
105 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 106 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
107};
108
c575770b 109/* ProtectKernelModules= option */
34de407a 110static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 111#if HAVE_SPLIT_USR
c6232fb0 112 { "/lib/modules", INACCESSIBLE, true },
c575770b 113#endif
c6232fb0 114 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
115};
116
b6c432ca
DH
117/*
118 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
119 * system should be protected by ProtectSystem=
120 */
34de407a 121static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
122 { "/home", READONLY, true },
123 { "/run/user", READONLY, true },
124 { "/root", READONLY, true },
b6c432ca
DH
125};
126
127/* ProtectHome=yes table */
34de407a 128static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
129 { "/home", INACCESSIBLE, true },
130 { "/run/user", INACCESSIBLE, true },
131 { "/root", INACCESSIBLE, true },
b6c432ca
DH
132};
133
f471b2af 134/* ProtectSystem=yes table */
34de407a 135static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
136 { "/usr", READONLY, false },
137 { "/boot", READONLY, true },
138 { "/efi", READONLY, true },
f471b2af
DH
139};
140
141/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 142static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
143 { "/usr", READONLY, false },
144 { "/boot", READONLY, true },
145 { "/efi", READONLY, true },
146 { "/etc", READONLY, false },
f471b2af
DH
147};
148
149/*
150 * ProtectSystem=strict table. In this strict mode, we mount everything
151 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
152 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
153 * protect those, and these options should be fully orthogonal.
154 * (And of course /home and friends are also left writable, as ProtectHome=
155 * shall manage those, orthogonally).
156 */
34de407a 157static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
158 { "/", READONLY, false },
159 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
160 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
161 { "/dev", READWRITE, false }, /* PrivateDevices= */
162 { "/home", READWRITE, true }, /* ProtectHome= */
163 { "/run/user", READWRITE, true }, /* ProtectHome= */
164 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
165};
166
34de407a 167static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
168 assert(p);
169
5327c910
LP
170 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
171 * otherwise the stack/static ->path field is returned. */
f0a4feb0 172
5327c910 173 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
174}
175
34de407a 176static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
177 assert(p);
178
179 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
180}
181
d2d6c096
LP
182static const char *mount_entry_source(const MountEntry *p) {
183 assert(p);
184
185 return p->source_malloc ?: p->source_const;
186}
187
1eb7e08e
LP
188static void mount_entry_done(MountEntry *p) {
189 assert(p);
190
191 p->path_malloc = mfree(p->path_malloc);
192 p->source_malloc = mfree(p->source_malloc);
193}
194
d18aff04 195static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
196 char **i;
197
613b411c
LP
198 assert(p);
199
5327c910
LP
200 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
201
15ae422b 202 STRV_FOREACH(i, strv) {
5327c910
LP
203 bool ignore = false, needs_prefix = false;
204 const char *e = *i;
15ae422b 205
5327c910
LP
206 /* Look for any prefixes */
207 if (startswith(e, "-")) {
208 e++;
9c94d52e 209 ignore = true;
ea92ae33 210 }
5327c910
LP
211 if (startswith(e, "+")) {
212 e++;
213 needs_prefix = true;
214 }
ea92ae33 215
5327c910 216 if (!path_is_absolute(e))
15ae422b
LP
217 return -EINVAL;
218
34de407a 219 *((*p)++) = (MountEntry) {
5327c910
LP
220 .path_const = e,
221 .mode = mode,
222 .ignore = ignore,
d18aff04 223 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 224 };
15ae422b
LP
225 }
226
227 return 0;
228}
229
6c47cd7d
LP
230static int append_empty_dir_mounts(MountEntry **p, char **strv) {
231 char **i;
232
233 assert(p);
234
235 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
236 * "/private/" boundary directories for DynamicUser=1. */
237
238 STRV_FOREACH(i, strv) {
239
240 *((*p)++) = (MountEntry) {
241 .path_const = *i,
242 .mode = EMPTY_DIR,
243 .ignore = false,
244 .has_prefix = false,
245 .read_only = true,
246 };
247 }
248
249 return 0;
250}
251
d2d6c096
LP
252static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
253 unsigned i;
254
255 assert(p);
256
257 for (i = 0; i < n; i++) {
258 const BindMount *b = binds + i;
259
260 *((*p)++) = (MountEntry) {
261 .path_const = b->destination,
262 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
263 .read_only = b->read_only,
264 .source_const = b->source,
265 };
266 }
267
268 return 0;
269}
270
34de407a 271static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 272 unsigned i;
11a30cec
DH
273
274 assert(p);
f471b2af 275 assert(mounts);
11a30cec 276
5327c910 277 /* Adds a list of static pre-defined entries */
f471b2af 278
5327c910 279 for (i = 0; i < n; i++)
34de407a
LP
280 *((*p)++) = (MountEntry) {
281 .path_const = mount_entry_path(mounts+i),
5327c910
LP
282 .mode = mounts[i].mode,
283 .ignore = mounts[i].ignore || ignore_protect,
284 };
f471b2af
DH
285
286 return 0;
287}
288
34de407a 289static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
290 assert(p);
291
5327c910 292 switch (protect_home) {
b6c432ca 293
5327c910 294 case PROTECT_HOME_NO:
b6c432ca
DH
295 return 0;
296
b6c432ca 297 case PROTECT_HOME_READ_ONLY:
5327c910
LP
298 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
299
b6c432ca 300 case PROTECT_HOME_YES:
5327c910
LP
301 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
302
b6c432ca 303 default:
5327c910 304 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 305 }
b6c432ca
DH
306}
307
34de407a 308static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
309 assert(p);
310
5327c910
LP
311 switch (protect_system) {
312
313 case PROTECT_SYSTEM_NO:
f471b2af
DH
314 return 0;
315
f471b2af 316 case PROTECT_SYSTEM_STRICT:
5327c910
LP
317 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
318
f471b2af 319 case PROTECT_SYSTEM_YES:
5327c910
LP
320 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
321
f471b2af 322 case PROTECT_SYSTEM_FULL:
5327c910
LP
323 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
324
f471b2af 325 default:
5327c910 326 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 327 }
11a30cec
DH
328}
329
c17ec25e 330static int mount_path_compare(const void *a, const void *b) {
34de407a 331 const MountEntry *p = a, *q = b;
a0827e2b 332 int d;
15ae422b 333
6ee1a919 334 /* If the paths are not equal, then order prefixes first */
34de407a 335 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
336 if (d != 0)
337 return d;
15ae422b 338
6ee1a919
LP
339 /* If the paths are equal, check the mode */
340 if (p->mode < q->mode)
341 return -1;
15ae422b 342
6ee1a919
LP
343 if (p->mode > q->mode)
344 return 1;
15ae422b 345
6ee1a919 346 return 0;
15ae422b
LP
347}
348
34de407a 349static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
350 unsigned i;
351
352 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
353 * that. */
354
355 if (!root_directory)
356 return 0;
357
358 for (i = 0; i < n; i++) {
359 char *s;
360
361 if (m[i].has_prefix)
362 continue;
363
34de407a 364 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
365 if (!s)
366 return -ENOMEM;
367
e282f51f 368 free_and_replace(m[i].path_malloc, s);
5327c910
LP
369 m[i].has_prefix = true;
370 }
371
372 return 0;
373}
374
34de407a
LP
375static void drop_duplicates(MountEntry *m, unsigned *n) {
376 MountEntry *f, *t, *previous;
15ae422b 377
c17ec25e 378 assert(m);
15ae422b 379 assert(n);
15ae422b 380
fe3c2583
LP
381 /* Drops duplicate entries. Expects that the array is properly ordered already. */
382
1d54cd5d 383 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 384
fe3c2583
LP
385 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
386 * above. */
34de407a
LP
387 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
388 log_debug("%s is duplicate.", mount_entry_path(f));
389 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 390 mount_entry_done(f);
15ae422b 391 continue;
fe3c2583 392 }
15ae422b 393
e2d7c1a0 394 *t = *f;
15ae422b 395 previous = t;
fe3c2583
LP
396 t++;
397 }
398
399 *n = t - m;
400}
401
34de407a
LP
402static void drop_inaccessible(MountEntry *m, unsigned *n) {
403 MountEntry *f, *t;
fe3c2583
LP
404 const char *clear = NULL;
405
406 assert(m);
407 assert(n);
408
409 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
410 * ordered already. */
411
1d54cd5d 412 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
413
414 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
415 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
416 if (clear && path_startswith(mount_entry_path(f), clear)) {
417 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 418 mount_entry_done(f);
fe3c2583
LP
419 continue;
420 }
15ae422b 421
34de407a 422 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
423
424 *t = *f;
15ae422b
LP
425 t++;
426 }
427
c17ec25e 428 *n = t - m;
15ae422b
LP
429}
430
34de407a
LP
431static void drop_nop(MountEntry *m, unsigned *n) {
432 MountEntry *f, *t;
7648a565
LP
433
434 assert(m);
435 assert(n);
436
437 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
438 * list is ordered by prefixes. */
439
1d54cd5d 440 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
441
442 /* Only suppress such subtrees for READONLY and READWRITE entries */
443 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 444 MountEntry *p;
7648a565
LP
445 bool found = false;
446
447 /* Now let's find the first parent of the entry we are looking at. */
448 for (p = t-1; p >= m; p--) {
34de407a 449 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
450 found = true;
451 break;
452 }
453 }
454
455 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
456 if (found && p->mode == f->mode) {
34de407a 457 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 458 mount_entry_done(f);
7648a565
LP
459 continue;
460 }
461 }
462
463 *t = *f;
464 t++;
465 }
466
467 *n = t - m;
468}
469
34de407a
LP
470static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
471 MountEntry *f, *t;
cd2902c9
LP
472
473 assert(m);
474 assert(n);
475
1d54cd5d 476 /* Nothing to do */
cd2902c9
LP
477 if (!root_directory)
478 return;
479
480 /* Drops all mounts that are outside of the root directory. */
481
1d54cd5d 482 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 483
34de407a
LP
484 if (!path_startswith(mount_entry_path(f), root_directory)) {
485 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 486 mount_entry_done(f);
cd2902c9
LP
487 continue;
488 }
489
490 *t = *f;
491 t++;
492 }
493
494 *n = t - m;
495}
496
/* Creates a copy of the device node 'd' below the directory 'temporary_mount', i.e. at the path
 * temporary_mount + d, with the same mode and device number.
 *
 * Returns 1 if a node was created, 0 if there was nothing to clone (the source does not exist, or it has a
 * zero device number), -EINVAL if the source is neither a block nor a character device, and another negative
 * errno-style value if stat() or mknod() fail. */
static int clone_device_node(const char *d, const char *temporary_mount) {
        const char *dn;
        struct stat st;
        int r;

        if (stat(d, &st) < 0) {
                if (errno == ENOENT)
                        return 0;

                return -errno;
        }

        if (!S_ISBLK(st.st_mode) &&
            !S_ISCHR(st.st_mode))
                return -EINVAL;

        if (st.st_rdev == 0)
                return 0;

        dn = strjoina(temporary_mount, d);

        mac_selinux_create_file_prepare(d, st.st_mode);
        /* Capture errno right away, so that the error we report does not depend on the cleanup call below
         * leaving errno untouched. */
        r = mknod(dn, st.st_mode, st.st_rdev) < 0 ? -errno : 0;
        mac_selinux_create_file_clear();
        if (r < 0)
                return log_debug_errno(r, "mknod failed for %s: %m", d);

        return 1;
}
525
5d997827 526static int mount_private_dev(MountEntry *m) {
7f112f50
LP
527 static const char devnodes[] =
528 "/dev/null\0"
529 "/dev/zero\0"
530 "/dev/full\0"
531 "/dev/random\0"
532 "/dev/urandom\0"
533 "/dev/tty\0";
534
2b85f4e1 535 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 536 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
537 _cleanup_umask_ mode_t u;
538 int r;
539
540 assert(m);
541
542 u = umask(0000);
543
2b85f4e1
LP
544 if (!mkdtemp(temporary_mount))
545 return -errno;
546
63c372cb 547 dev = strjoina(temporary_mount, "/dev");
dc751688 548 (void) mkdir(dev, 0755);
737ba3c8 549 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
550 r = -errno;
551 goto fail;
552 }
553
63c372cb 554 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 555 (void) mkdir(devpts, 0755);
2b85f4e1
LP
556 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
557 r = -errno;
558 goto fail;
559 }
560
414b304b
ДГ
561 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
562 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
563 * thus, in that case make a clone
564 *
565 * in nspawn and other containers it will be a symlink, in that case make it a symlink
566 */
36ce7110
LP
567 r = is_symlink("/dev/ptmx");
568 if (r < 0)
3164e3cb 569 goto fail;
36ce7110 570 if (r > 0) {
414b304b
ДГ
571 devptmx = strjoina(temporary_mount, "/dev/ptmx");
572 if (symlink("pts/ptmx", devptmx) < 0) {
573 r = -errno;
574 goto fail;
575 }
576 } else {
577 r = clone_device_node("/dev/ptmx", temporary_mount);
152c475f
LP
578 if (r < 0)
579 goto fail;
580 if (r == 0) {
581 r = -ENXIO;
414b304b 582 goto fail;
152c475f 583 }
414b304b 584 }
e06b6479 585
63c372cb 586 devshm = strjoina(temporary_mount, "/dev/shm");
8d953682 587 (void) mkdir(devshm, 0755);
2b85f4e1
LP
588 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
589 if (r < 0) {
590 r = -errno;
591 goto fail;
592 }
593
63c372cb 594 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 595 (void) mkdir(devmqueue, 0755);
3164e3cb 596 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 597
63c372cb 598 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 599 (void) mkdir(devhugepages, 0755);
3164e3cb 600 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 601
63c372cb 602 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 603 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 604
7f112f50 605 NULSTR_FOREACH(d, devnodes) {
b5e99f23
ДГ
606 r = clone_device_node(d, temporary_mount);
607 if (r < 0)
2b85f4e1 608 goto fail;
7f112f50
LP
609 }
610
03cfe0d5 611 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 612
ee818b89
AC
613 /* Create the /dev directory if missing. It is more likely to be
614 * missing when the service is started with RootDirectory. This is
615 * consistent with mount units creating the mount points when missing.
616 */
34de407a 617 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 618
9e5f8252 619 /* Unmount everything in old /dev */
34de407a
LP
620 umount_recursive(mount_entry_path(m), 0);
621 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
622 r = -errno;
623 goto fail;
624 }
7f112f50 625
2b85f4e1
LP
626 rmdir(dev);
627 rmdir(temporary_mount);
7f112f50 628
2b85f4e1 629 return 0;
7f112f50 630
2b85f4e1
LP
631fail:
632 if (devpts)
633 umount(devpts);
7f112f50 634
2b85f4e1
LP
635 if (devshm)
636 umount(devshm);
7f112f50 637
2b85f4e1
LP
638 if (devhugepages)
639 umount(devhugepages);
7f112f50 640
2b85f4e1
LP
641 if (devmqueue)
642 umount(devmqueue);
7f112f50 643
d267c5aa
ZJS
644 umount(dev);
645 rmdir(dev);
2b85f4e1 646 rmdir(temporary_mount);
7f112f50 647
2b85f4e1 648 return r;
7f112f50
LP
649}
650
2a2969fd 651static int mount_bind_dev(const MountEntry *m) {
5d997827
LP
652 int r;
653
654 assert(m);
655
656 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
657 * /dev. This is only used when RootDirectory= is set. */
658
645767d6
LP
659 (void) mkdir_p_label(mount_entry_path(m), 0755);
660
5d997827
LP
661 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
662 if (r < 0)
663 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
664 if (r > 0) /* make this a NOP if /dev is already a mount point */
665 return 0;
666
667 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
668 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
669
670 return 1;
671}
672
2a2969fd 673static int mount_sysfs(const MountEntry *m) {
5d997827
LP
674 int r;
675
676 assert(m);
677
645767d6
LP
678 (void) mkdir_p_label(mount_entry_path(m), 0755);
679
5d997827
LP
680 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
681 if (r < 0)
682 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
683 if (r > 0) /* make this a NOP if /sys is already a mount point */
684 return 0;
685
686 /* Bind mount the host's version so that we get all child mounts of it, too. */
687 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
688 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
689
690 return 1;
691}
692
2a2969fd 693static int mount_procfs(const MountEntry *m) {
5d997827
LP
694 int r;
695
696 assert(m);
697
645767d6
LP
698 (void) mkdir_p_label(mount_entry_path(m), 0755);
699
5d997827
LP
700 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
701 if (r < 0)
702 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
703 if (r > 0) /* make this a NOP if /proc is already a mount point */
704 return 0;
705
706 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
707 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
708 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
709
710 return 1;
711}
712
2a2969fd 713static int mount_empty_dir(const MountEntry *m) {
6c47cd7d
LP
714 assert(m);
715
716 /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
717
718 (void) mkdir_p_label(mount_entry_path(m), 0755);
719 (void) umount_recursive(mount_entry_path(m), 0);
720
721 if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
722 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
723
724 return 1;
725}
726
d2d6c096
LP
727static int mount_entry_chase(
728 const char *root_directory,
2a2969fd 729 const MountEntry *m,
d2d6c096
LP
730 const char *path,
731 char **location) {
732
8fceda93
LP
733 char *chased;
734 int r;
f863b1c6 735 unsigned flags = 0;
8fceda93
LP
736
737 assert(m);
738
739 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
740 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
741 * that applies). The result is stored in "location". */
8fceda93 742
f863b1c6
ZJS
743 if (IN_SET(m->mode,
744 BIND_MOUNT,
745 BIND_MOUNT_RECURSIVE,
746 PRIVATE_TMP,
f863b1c6
ZJS
747 PRIVATE_DEV,
748 BIND_DEV,
749 EMPTY_DIR,
750 SYSFS,
751 PROCFS))
752 flags |= CHASE_NONEXISTENT;
753
754 r = chase_symlinks(path, root_directory, flags, &chased);
8fceda93 755 if (r == -ENOENT && m->ignore) {
d2d6c096 756 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
757 return 0;
758 }
759 if (r < 0)
d2d6c096 760 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 761
d2d6c096 762 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 763
d2d6c096
LP
764 free(*location);
765 *location = chased;
8fceda93
LP
766
767 return 1;
768}
769
ac0930c8 770static int apply_mount(
8fceda93 771 const char *root_directory,
89bd586c 772 MountEntry *m) {
ac0930c8 773
a227a4be 774 bool rbind = true, make = false;
15ae422b 775 const char *what;
15ae422b 776 int r;
15ae422b 777
c17ec25e 778 assert(m);
15ae422b 779
d2d6c096 780 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
781 if (r <= 0)
782 return r;
783
34de407a 784 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 785
c17ec25e 786 switch (m->mode) {
15ae422b 787
160cfdbe
LP
788 case INACCESSIBLE: {
789 struct stat target;
6d313367
LP
790
791 /* First, get rid of everything that is below if there
792 * is anything... Then, overmount it with an
c4b41707 793 * inaccessible path. */
34de407a 794 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 795
34de407a
LP
796 if (lstat(mount_entry_path(m), &target) < 0)
797 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 798
c4b41707 799 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
800 if (!what) {
801 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
802 return -ELOOP;
803 }
804 break;
160cfdbe 805 }
fe3c2583 806
15ae422b 807 case READONLY:
15ae422b 808 case READWRITE:
8fceda93 809 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 810 if (r < 0)
34de407a 811 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
812 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
813 return 0;
6b7c9f8b 814 /* This isn't a mount point yet, let's make it one. */
34de407a 815 what = mount_entry_path(m);
6b7c9f8b 816 break;
15ae422b 817
d2d6c096
LP
818 case BIND_MOUNT:
819 rbind = false;
d2d6c096 820
4831981d 821 _fallthrough_;
d2d6c096
LP
822 case BIND_MOUNT_RECURSIVE:
823 /* Also chase the source mount */
5d997827 824
d2d6c096
LP
825 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
826 if (r <= 0)
827 return r;
828
829 what = mount_entry_source(m);
a227a4be 830 make = true;
d2d6c096
LP
831 break;
832
6c47cd7d
LP
833 case EMPTY_DIR:
834 return mount_empty_dir(m);
835
ac0930c8 836 case PRIVATE_TMP:
89bd586c 837 what = mount_entry_source(m);
a227a4be 838 make = true;
15ae422b 839 break;
e364ad06 840
d6797c92 841 case PRIVATE_DEV:
5d997827
LP
842 return mount_private_dev(m);
843
844 case BIND_DEV:
845 return mount_bind_dev(m);
846
847 case SYSFS:
848 return mount_sysfs(m);
849
850 case PROCFS:
851 return mount_procfs(m);
d6797c92 852
e364ad06
LP
853 default:
854 assert_not_reached("Unknown mode");
15ae422b
LP
855 }
856
ac0930c8 857 assert(what);
15ae422b 858
a227a4be
LP
859 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
860 bool try_again = false;
861 r = -errno;
862
863 if (r == -ENOENT && make) {
864 struct stat st;
865
866 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
867
868 if (stat(what, &st) >= 0) {
869
870 (void) mkdir_parents(mount_entry_path(m), 0755);
871
872 if (S_ISDIR(st.st_mode))
873 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
874 else
875 try_again = touch(mount_entry_path(m)) >= 0;
876 }
877 }
878
879 if (try_again) {
880 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
881 r = -errno;
882 else
883 r = 0;
884 }
885
886 if (r < 0)
887 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
888 }
6b7c9f8b 889
34de407a 890 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 891 return 0;
ac0930c8 892}
15ae422b 893
2a2969fd 894static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 895 int r = 0;
15ae422b 896
c17ec25e 897 assert(m);
ac9de0b3 898 assert(proc_self_mountinfo);
ac0930c8 899
34de407a 900 if (mount_entry_read_only(m))
ac9de0b3 901 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
13e785f7 902 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 903 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 904 r = -errno;
737ba3c8 905 } else
6b7c9f8b
LP
906 return 0;
907
908 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
909 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
910 * read-only mounts already applied. */
ac0930c8 911
8fceda93
LP
912 if (r == -ENOENT && m->ignore)
913 r = 0;
5327c910 914
1d54cd5d 915 return r;
d944dc95
LP
916}
917
bb0ff3fb 918static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
5d997827
LP
919 assert(ns_info);
920
9c988f93
DH
921 /*
922 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
923 * since to protect the API VFS mounts, they need to be around in the
924 * first place... and RootDirectory= or RootImage= need to be set.
925 */
5d997827 926
9c988f93
DH
927 /* root_directory should point to a mount point */
928 return root_directory &&
929 (ns_info->mount_apivfs ||
930 ns_info->protect_control_groups ||
931 ns_info->protect_kernel_tunables);
5d997827
LP
932}
933
2652c6c1 934static unsigned namespace_calculate_mounts(
9c988f93 935 const char* root_directory,
bb0ff3fb 936 const NamespaceInfo *ns_info,
2652c6c1
DH
937 char** read_write_paths,
938 char** read_only_paths,
939 char** inaccessible_paths,
6c47cd7d 940 char** empty_directories,
d2d6c096 941 unsigned n_bind_mounts,
2652c6c1
DH
942 const char* tmp_dir,
943 const char* var_tmp_dir,
2652c6c1
DH
944 ProtectHome protect_home,
945 ProtectSystem protect_system) {
946
b6c432ca 947 unsigned protect_home_cnt;
f471b2af
DH
948 unsigned protect_system_cnt =
949 (protect_system == PROTECT_SYSTEM_STRICT ?
950 ELEMENTSOF(protect_system_strict_table) :
951 ((protect_system == PROTECT_SYSTEM_FULL) ?
952 ELEMENTSOF(protect_system_full_table) :
953 ((protect_system == PROTECT_SYSTEM_YES) ?
954 ELEMENTSOF(protect_system_yes_table) : 0)));
955
b6c432ca
DH
956 protect_home_cnt =
957 (protect_home == PROTECT_HOME_YES ?
958 ELEMENTSOF(protect_home_yes_table) :
959 ((protect_home == PROTECT_HOME_READ_ONLY) ?
960 ELEMENTSOF(protect_home_read_only_table) : 0));
961
2652c6c1
DH
962 return !!tmp_dir + !!var_tmp_dir +
963 strv_length(read_write_paths) +
964 strv_length(read_only_paths) +
965 strv_length(inaccessible_paths) +
6c47cd7d 966 strv_length(empty_directories) +
d2d6c096 967 n_bind_mounts +
c575770b
DH
968 ns_info->private_dev +
969 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
970 (ns_info->protect_control_groups ? 1 : 0) +
971 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 972 protect_home_cnt + protect_system_cnt +
9c988f93 973 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
974}
975
613b411c 976int setup_namespace(
ee818b89 977 const char* root_directory,
915e6d16 978 const char* root_image,
bb0ff3fb 979 const NamespaceInfo *ns_info,
2a624c36
AP
980 char** read_write_paths,
981 char** read_only_paths,
982 char** inaccessible_paths,
6c47cd7d 983 char** empty_directories,
d2d6c096
LP
984 const BindMount *bind_mounts,
985 unsigned n_bind_mounts,
a004cb4c
LP
986 const char* tmp_dir,
987 const char* var_tmp_dir,
1b8689f9
LP
988 ProtectHome protect_home,
989 ProtectSystem protect_system,
915e6d16
LP
990 unsigned long mount_flags,
991 DissectImageFlags dissect_image_flags) {
15ae422b 992
915e6d16 993 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 994 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 995 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 996 _cleanup_free_ void *root_hash = NULL;
34de407a 997 MountEntry *m, *mounts = NULL;
78ebe980 998 size_t root_hash_size = 0;
d944dc95 999 bool make_slave = false;
e908468b 1000 const char *root;
f0a4feb0 1001 unsigned n_mounts;
d18aff04 1002 bool require_prefix = false;
c17ec25e 1003 int r = 0;
15ae422b 1004
915e6d16
LP
1005 assert(ns_info);
1006
613b411c 1007 if (mount_flags == 0)
c17ec25e 1008 mount_flags = MS_SHARED;
ac0930c8 1009
915e6d16
LP
1010 if (root_image) {
1011 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1012
1013 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1014 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1015
1016 r = loop_device_make_by_path(root_image,
1017 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1018 &loop_device);
1019 if (r < 0)
1020 return r;
1021
78ebe980
LP
1022 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1023 if (r < 0)
1024 return r;
1025
1026 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1027 if (r < 0)
1028 return r;
1029
1030 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
1031 if (r < 0)
1032 return r;
915e6d16
LP
1033 }
1034
e908468b
LP
1035 if (root_directory)
1036 root = root_directory;
1037 else if (root_image || n_bind_mounts > 0) {
1038
1039 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1040 * the same mount point for all images, which is safe, since they all live in their own namespaces
1041 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1042 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1043 * while we are applying them. */
1044
1045 root = "/run/systemd/unit-root";
1046 (void) mkdir_label(root, 0700);
d18aff04 1047 require_prefix = true;
e908468b
LP
1048 } else
1049 root = NULL;
1050
cfbeb4ef 1051 n_mounts = namespace_calculate_mounts(
e908468b 1052 root,
cfbeb4ef
LP
1053 ns_info,
1054 read_write_paths,
1055 read_only_paths,
1056 inaccessible_paths,
6c47cd7d 1057 empty_directories,
f5c52a77 1058 n_bind_mounts,
cfbeb4ef
LP
1059 tmp_dir, var_tmp_dir,
1060 protect_home, protect_system);
613b411c 1061
2652c6c1 1062 /* Set mount slave mode */
e908468b 1063 if (root || n_mounts > 0)
d944dc95
LP
1064 make_slave = true;
1065
f0a4feb0 1066 if (n_mounts > 0) {
34de407a 1067 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
d18aff04 1068 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1069 if (r < 0)
f0a4feb0 1070 goto finish;
613b411c 1071
d18aff04 1072 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1073 if (r < 0)
f0a4feb0 1074 goto finish;
613b411c 1075
d18aff04 1076 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1077 if (r < 0)
f0a4feb0 1078 goto finish;
7ff7394d 1079
6c47cd7d
LP
1080 r = append_empty_dir_mounts(&m, empty_directories);
1081 if (r < 0)
1082 goto finish;
1083
d2d6c096
LP
1084 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1085 if (r < 0)
1086 goto finish;
1087
613b411c 1088 if (tmp_dir) {
34de407a 1089 *(m++) = (MountEntry) {
5327c910
LP
1090 .path_const = "/tmp",
1091 .mode = PRIVATE_TMP,
89bd586c 1092 .source_const = tmp_dir,
5327c910 1093 };
613b411c 1094 }
7ff7394d 1095
613b411c 1096 if (var_tmp_dir) {
34de407a 1097 *(m++) = (MountEntry) {
5327c910 1098 .path_const = "/var/tmp",
89bd586c
YW
1099 .mode = PRIVATE_TMP,
1100 .source_const = var_tmp_dir,
5327c910 1101 };
7ff7394d 1102 }
ac0930c8 1103
c575770b 1104 if (ns_info->private_dev) {
34de407a 1105 *(m++) = (MountEntry) {
5327c910
LP
1106 .path_const = "/dev",
1107 .mode = PRIVATE_DEV,
1108 };
7f112f50
LP
1109 }
1110
c575770b 1111 if (ns_info->protect_kernel_tunables) {
5327c910 1112 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1113 if (r < 0)
f0a4feb0 1114 goto finish;
c575770b
DH
1115 }
1116
1117 if (ns_info->protect_kernel_modules) {
5327c910 1118 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1119 if (r < 0)
f0a4feb0 1120 goto finish;
c575770b 1121 }
59eeb84b 1122
c575770b 1123 if (ns_info->protect_control_groups) {
34de407a 1124 *(m++) = (MountEntry) {
5327c910
LP
1125 .path_const = "/sys/fs/cgroup",
1126 .mode = READONLY,
1127 };
59eeb84b
LP
1128 }
1129
5327c910 1130 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1131 if (r < 0)
f0a4feb0 1132 goto finish;
417116f2 1133
5327c910 1134 r = append_protect_system(&m, protect_system, false);
f471b2af 1135 if (r < 0)
f0a4feb0 1136 goto finish;
417116f2 1137
e908468b 1138 if (namespace_info_mount_apivfs(root, ns_info)) {
5d997827
LP
1139 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1140 if (r < 0)
1141 goto finish;
1142 }
1143
f0a4feb0 1144 assert(mounts + n_mounts == m);
ac0930c8 1145
5327c910 1146 /* Prepend the root directory where that's necessary */
e908468b 1147 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1148 if (r < 0)
1149 goto finish;
1150
34de407a 1151 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1152
f0a4feb0 1153 drop_duplicates(mounts, &n_mounts);
e908468b 1154 drop_outside_root(root, mounts, &n_mounts);
f0a4feb0
DH
1155 drop_inaccessible(mounts, &n_mounts);
1156 drop_nop(mounts, &n_mounts);
15ae422b
LP
1157 }
1158
d944dc95
LP
1159 if (unshare(CLONE_NEWNS) < 0) {
1160 r = -errno;
1161 goto finish;
1162 }
1e4e94c8 1163
d944dc95 1164 if (make_slave) {
c2c13f2d
LP
1165 /* Remount / as SLAVE so that nothing now mounted in the namespace
1166 shows up in the parent */
d944dc95
LP
1167 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1168 r = -errno;
1169 goto finish;
1170 }
ee818b89
AC
1171 }
1172
915e6d16 1173 if (root_image) {
e908468b 1174 /* A root image is specified, mount it to the right place */
2d3a5a73 1175 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
915e6d16
LP
1176 if (r < 0)
1177 goto finish;
1178
07ce7407
TM
1179 if (decrypted_image) {
1180 r = decrypted_image_relinquish(decrypted_image);
1181 if (r < 0)
1182 goto finish;
1183 }
78ebe980 1184
915e6d16
LP
1185 loop_device_relinquish(loop_device);
1186
1187 } else if (root_directory) {
1188
e908468b
LP
1189 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1190 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1191 if (r < 0)
d944dc95 1192 goto finish;
8f1ad200 1193 if (r == 0) {
e908468b 1194 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
8f1ad200
LP
1195 r = -errno;
1196 goto finish;
1197 }
d944dc95 1198 }
e908468b
LP
1199
1200 } else if (root) {
1201
1202 /* Let's mount the main root directory to the root directory to use */
1203 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1204 r = -errno;
1205 goto finish;
1206 }
ee818b89 1207 }
c2c13f2d 1208
4e0c20de
LP
1209 /* Try to set up the new root directory before mounting anything else there. */
1210 if (root_image || root_directory)
1211 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1212
f0a4feb0 1213 if (n_mounts > 0) {
ac9de0b3 1214 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1215 char **blacklist;
1216 unsigned j;
1217
ac9de0b3
TR
1218 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1219 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1220 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1221 if (!proc_self_mountinfo) {
1222 r = -errno;
1223 goto finish;
1224 }
1225
6b7c9f8b 1226 /* First round, add in all special mounts we need */
f0a4feb0 1227 for (m = mounts; m < mounts + n_mounts; ++m) {
89bd586c 1228 r = apply_mount(root, m);
c2c13f2d 1229 if (r < 0)
d944dc95 1230 goto finish;
c2c13f2d 1231 }
15ae422b 1232
6b7c9f8b 1233 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1234 blacklist = newa(char*, n_mounts+1);
1235 for (j = 0; j < n_mounts; j++)
34de407a 1236 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1237 blacklist[j] = NULL;
1238
1239 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1240 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1241 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1242 if (r < 0)
d944dc95 1243 goto finish;
c2c13f2d 1244 }
15ae422b
LP
1245 }
1246
e908468b 1247 if (root) {
ee818b89 1248 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
e908468b 1249 r = mount_move_root(root);
d944dc95
LP
1250 if (r < 0)
1251 goto finish;
ee818b89
AC
1252 }
1253
55fe7432 1254 /* Remount / as the desired mode. Note that this will not
c2c13f2d
LP
1255 * reestablish propagation from our side to the host, since
1256 * what's disconnected is disconnected. */
d944dc95
LP
1257 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1258 r = -errno;
1259 goto finish;
1260 }
15ae422b 1261
d944dc95 1262 r = 0;
15ae422b 1263
d944dc95 1264finish:
f0a4feb0 1265 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1266 mount_entry_done(m);
613b411c
LP
1267
1268 return r;
1269}
1270
d2d6c096
LP
1271void bind_mount_free_many(BindMount *b, unsigned n) {
1272 unsigned i;
1273
1274 assert(b || n == 0);
1275
1276 for (i = 0; i < n; i++) {
1277 free(b[i].source);
1278 free(b[i].destination);
1279 }
1280
1281 free(b);
1282}
1283
1284int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1285 _cleanup_free_ char *s = NULL, *d = NULL;
1286 BindMount *c;
1287
1288 assert(b);
1289 assert(n);
1290 assert(item);
1291
1292 s = strdup(item->source);
1293 if (!s)
1294 return -ENOMEM;
1295
1296 d = strdup(item->destination);
1297 if (!d)
1298 return -ENOMEM;
1299
1300 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1301 if (!c)
1302 return -ENOMEM;
1303
1304 *b = c;
1305
1306 c[(*n) ++] = (BindMount) {
1307 .source = s,
1308 .destination = d,
1309 .read_only = item->read_only,
1310 .recursive = item->recursive,
1311 .ignore_enoent = item->ignore_enoent,
1312 };
1313
1314 s = d = NULL;
1315 return 0;
1316}
1317
613b411c
LP
1318static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1319 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1320 char bid[SD_ID128_STRING_MAX];
1321 sd_id128_t boot_id;
1322 int r;
613b411c
LP
1323
1324 assert(id);
1325 assert(prefix);
1326 assert(path);
1327
6b46ea73
LP
1328 /* We include the boot id in the directory so that after a
1329 * reboot we can easily identify obsolete directories. */
1330
1331 r = sd_id128_get_boot(&boot_id);
1332 if (r < 0)
1333 return r;
1334
605405c6 1335 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1336 if (!x)
1337 return -ENOMEM;
1338
1339 RUN_WITH_UMASK(0077)
1340 if (!mkdtemp(x))
1341 return -errno;
1342
1343 RUN_WITH_UMASK(0000) {
1344 char *y;
1345
63c372cb 1346 y = strjoina(x, "/tmp");
613b411c
LP
1347
1348 if (mkdir(y, 0777 | S_ISVTX) < 0)
1349 return -errno;
c17ec25e 1350 }
15ae422b 1351
613b411c
LP
1352 *path = x;
1353 x = NULL;
1354
1355 return 0;
1356}
1357
/* Creates the private directories backing /tmp and /var/tmp for the unit identified by id. On success
 * returns 0 and stores the two newly allocated paths in *tmp_dir and *var_tmp_dir. If the second directory
 * cannot be set up, the first one is removed again and the error is returned. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                /* Both are in place, hand them over to the caller */
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* The second one failed: roll back the first one, inner directory first */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1387
1388int setup_netns(int netns_storage_socket[2]) {
1389 _cleanup_close_ int netns = -1;
3ee897d6 1390 int r, q;
613b411c
LP
1391
1392 assert(netns_storage_socket);
1393 assert(netns_storage_socket[0] >= 0);
1394 assert(netns_storage_socket[1] >= 0);
1395
1396 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1397 * namespace reference fd. Whatever process runs this first
1398 * shall create a new namespace, all others should just join
1399 * it. To serialize that we use a file lock on the socket
1400 * pair.
613b411c
LP
1401 *
1402 * It's a bit crazy, but hey, works great! */
1403
1404 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1405 return -errno;
1406
3ee897d6
LP
1407 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1408 if (netns == -EAGAIN) {
613b411c
LP
1409 /* Nothing stored yet, so let's create a new namespace */
1410
1411 if (unshare(CLONE_NEWNET) < 0) {
1412 r = -errno;
1413 goto fail;
1414 }
1415
1416 loopback_setup();
1417
1418 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1419 if (netns < 0) {
1420 r = -errno;
1421 goto fail;
1422 }
1423
1424 r = 1;
613b411c 1425
3ee897d6
LP
1426 } else if (netns < 0) {
1427 r = netns;
1428 goto fail;
613b411c 1429
3ee897d6
LP
1430 } else {
1431 /* Yay, found something, so let's join the namespace */
613b411c
LP
1432 if (setns(netns, CLONE_NEWNET) < 0) {
1433 r = -errno;
1434 goto fail;
1435 }
1436
1437 r = 0;
1438 }
1439
3ee897d6
LP
1440 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1441 if (q < 0) {
1442 r = q;
613b411c
LP
1443 goto fail;
1444 }
1445
1446fail:
fe048ce5 1447 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1448 return r;
1449}
417116f2 1450
6e2d7c4f
MS
1451bool ns_type_supported(NamespaceType type) {
1452 const char *t, *ns_proc;
1453
0fa5b831
LP
1454 t = namespace_type_to_string(type);
1455 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
1456 return false;
1457
6e2d7c4f 1458 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
1459 return access(ns_proc, F_OK) == 0;
1460}
1461
1b8689f9
LP
1462static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1463 [PROTECT_HOME_NO] = "no",
1464 [PROTECT_HOME_YES] = "yes",
1465 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1466};
1467
1b8689f9
LP
1468DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1469
5e1c6154
YW
1470ProtectHome parse_protect_home_or_bool(const char *s) {
1471 int r;
1472
1473 r = parse_boolean(s);
1474 if (r > 0)
1475 return PROTECT_HOME_YES;
1476 if (r == 0)
1477 return PROTECT_HOME_NO;
1478
1479 return protect_home_from_string(s);
1480}
1481
1b8689f9
LP
1482static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1483 [PROTECT_SYSTEM_NO] = "no",
1484 [PROTECT_SYSTEM_YES] = "yes",
1485 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1486 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1487};
1488
1489DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
6e2d7c4f 1490
03c791aa
YW
1491ProtectSystem parse_protect_system_or_bool(const char *s) {
1492 int r;
1493
1494 r = parse_boolean(s);
1495 if (r > 0)
1496 return PROTECT_SYSTEM_YES;
1497 if (r == 0)
1498 return PROTECT_SYSTEM_NO;
1499
1500 return protect_system_from_string(s);
1501}
1502
6e2d7c4f
MS
1503static const char* const namespace_type_table[] = {
1504 [NAMESPACE_MOUNT] = "mnt",
1505 [NAMESPACE_CGROUP] = "cgroup",
1506 [NAMESPACE_UTS] = "uts",
1507 [NAMESPACE_IPC] = "ipc",
1508 [NAMESPACE_USER] = "user",
1509 [NAMESPACE_PID] = "pid",
1510 [NAMESPACE_NET] = "net",
1511};
1512
1513DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);