]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: use is_symlink() helper
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
15ae422b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
15ae422b 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <errno.h>
07630cea 22#include <sched.h>
15ae422b 23#include <stdio.h>
07630cea
LP
24#include <string.h>
25#include <sys/mount.h>
15ae422b 26#include <sys/stat.h>
07630cea 27#include <unistd.h>
25e870b5 28#include <linux/fs.h>
15ae422b 29
b5efdb8a 30#include "alloc-util.h"
10404d52 31#include "base-filesystem.h"
7f112f50 32#include "dev-setup.h"
3ffd4af2 33#include "fd-util.h"
d944dc95 34#include "fs-util.h"
e908468b 35#include "label.h"
915e6d16 36#include "loop-util.h"
07630cea
LP
37#include "loopback-setup.h"
38#include "missing.h"
39#include "mkdir.h"
4349cd7c 40#include "mount-util.h"
3ffd4af2 41#include "namespace.h"
07630cea 42#include "path-util.h"
d7b8eec7 43#include "selinux-util.h"
2583fbea 44#include "socket-util.h"
36ce7110 45#include "stat-util.h"
8b43440b 46#include "string-table.h"
07630cea
LP
47#include "string-util.h"
48#include "strv.h"
affb60b1 49#include "umask-util.h"
ee104e11 50#include "user-util.h"
07630cea 51#include "util.h"
15ae422b 52
/* Mount flags for the private /dev tmpfs instance; also reused when that mount is later remounted read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
54
/* The kinds of mounts that can be established inside a service's mount namespace. */
typedef enum MountMode {
        /* Keep this ordered by priority! For entries referring to the same path, mount_path_compare() sorts the
         * numerically lower mode first. */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
} MountMode;
15ae422b 70
34de407a 71typedef struct MountEntry {
5327c910 72 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 73 MountMode mode:5;
5327c910
LP
74 bool ignore:1; /* Ignore if path does not exist? */
75 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 76 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 77 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
78 const char *source_const; /* The source path, for bind mounts */
79 char *source_malloc;
34de407a 80} MountEntry;
15ae422b 81
5d997827
LP
82/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
83 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
84static const MountEntry apivfs_table[] = {
85 { "/proc", PROCFS, false },
86 { "/dev", BIND_DEV, false },
87 { "/sys", SYSFS, false },
88};
f471b2af 89
11a30cec 90/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 91static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
92 { "/proc/sys", READONLY, false },
93 { "/proc/sysrq-trigger", READONLY, true },
94 { "/proc/latency_stats", READONLY, true },
95 { "/proc/mtrr", READONLY, true },
aa70f38b 96 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
97 { "/proc/acpi", READONLY, true },
98 { "/proc/timer_stats", READONLY, true },
99 { "/proc/asound", READONLY, true },
100 { "/proc/bus", READONLY, true },
101 { "/proc/fs", READONLY, true },
102 { "/proc/irq", READONLY, true },
103 { "/sys", READONLY, false },
104 { "/sys/kernel/debug", READONLY, true },
105 { "/sys/kernel/tracing", READONLY, true },
106 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 107 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
108};
109
c575770b 110/* ProtectKernelModules= option */
34de407a 111static const MountEntry protect_kernel_modules_table[] = {
349cc4a5 112#if HAVE_SPLIT_USR
c6232fb0 113 { "/lib/modules", INACCESSIBLE, true },
c575770b 114#endif
c6232fb0 115 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
116};
117
b6c432ca
DH
118/*
119 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
120 * system should be protected by ProtectSystem=
121 */
34de407a 122static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
123 { "/home", READONLY, true },
124 { "/run/user", READONLY, true },
125 { "/root", READONLY, true },
b6c432ca
DH
126};
127
128/* ProtectHome=yes table */
34de407a 129static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
130 { "/home", INACCESSIBLE, true },
131 { "/run/user", INACCESSIBLE, true },
132 { "/root", INACCESSIBLE, true },
b6c432ca
DH
133};
134
f471b2af 135/* ProtectSystem=yes table */
34de407a 136static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
137 { "/usr", READONLY, false },
138 { "/boot", READONLY, true },
139 { "/efi", READONLY, true },
f471b2af
DH
140};
141
142/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 143static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
144 { "/usr", READONLY, false },
145 { "/boot", READONLY, true },
146 { "/efi", READONLY, true },
147 { "/etc", READONLY, false },
f471b2af
DH
148};
149
150/*
151 * ProtectSystem=strict table. In this strict mode, we mount everything
152 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
153 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
154 * protect those, and these options should be fully orthogonal.
155 * (And of course /home and friends are also left writable, as ProtectHome=
156 * shall manage those, orthogonally).
157 */
34de407a 158static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
159 { "/", READONLY, false },
160 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
161 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
162 { "/dev", READWRITE, false }, /* PrivateDevices= */
163 { "/home", READWRITE, true }, /* ProtectHome= */
164 { "/run/user", READWRITE, true }, /* ProtectHome= */
165 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
166};
167
34de407a 168static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
169 assert(p);
170
5327c910
LP
171 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
172 * otherwise the stack/static ->path field is returned. */
f0a4feb0 173
5327c910 174 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
175}
176
34de407a 177static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
178 assert(p);
179
180 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
181}
182
d2d6c096
LP
183static const char *mount_entry_source(const MountEntry *p) {
184 assert(p);
185
186 return p->source_malloc ?: p->source_const;
187}
188
1eb7e08e
LP
189static void mount_entry_done(MountEntry *p) {
190 assert(p);
191
192 p->path_malloc = mfree(p->path_malloc);
193 p->source_malloc = mfree(p->source_malloc);
194}
195
d18aff04 196static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
15ae422b
LP
197 char **i;
198
613b411c
LP
199 assert(p);
200
5327c910
LP
201 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
202
15ae422b 203 STRV_FOREACH(i, strv) {
5327c910
LP
204 bool ignore = false, needs_prefix = false;
205 const char *e = *i;
15ae422b 206
5327c910
LP
207 /* Look for any prefixes */
208 if (startswith(e, "-")) {
209 e++;
9c94d52e 210 ignore = true;
ea92ae33 211 }
5327c910
LP
212 if (startswith(e, "+")) {
213 e++;
214 needs_prefix = true;
215 }
ea92ae33 216
5327c910 217 if (!path_is_absolute(e))
15ae422b
LP
218 return -EINVAL;
219
34de407a 220 *((*p)++) = (MountEntry) {
5327c910
LP
221 .path_const = e,
222 .mode = mode,
223 .ignore = ignore,
d18aff04 224 .has_prefix = !needs_prefix && !forcibly_require_prefix,
5327c910 225 };
15ae422b
LP
226 }
227
228 return 0;
229}
230
6c47cd7d
LP
231static int append_empty_dir_mounts(MountEntry **p, char **strv) {
232 char **i;
233
234 assert(p);
235
236 /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
237 * "/private/" boundary directories for DynamicUser=1. */
238
239 STRV_FOREACH(i, strv) {
240
241 *((*p)++) = (MountEntry) {
242 .path_const = *i,
243 .mode = EMPTY_DIR,
244 .ignore = false,
245 .has_prefix = false,
246 .read_only = true,
247 };
248 }
249
250 return 0;
251}
252
d2d6c096
LP
253static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
254 unsigned i;
255
256 assert(p);
257
258 for (i = 0; i < n; i++) {
259 const BindMount *b = binds + i;
260
261 *((*p)++) = (MountEntry) {
262 .path_const = b->destination,
263 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
264 .read_only = b->read_only,
265 .source_const = b->source,
266 };
267 }
268
269 return 0;
270}
271
34de407a 272static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 273 unsigned i;
11a30cec
DH
274
275 assert(p);
f471b2af 276 assert(mounts);
11a30cec 277
5327c910 278 /* Adds a list of static pre-defined entries */
f471b2af 279
5327c910 280 for (i = 0; i < n; i++)
34de407a
LP
281 *((*p)++) = (MountEntry) {
282 .path_const = mount_entry_path(mounts+i),
5327c910
LP
283 .mode = mounts[i].mode,
284 .ignore = mounts[i].ignore || ignore_protect,
285 };
f471b2af
DH
286
287 return 0;
288}
289
34de407a 290static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
291 assert(p);
292
5327c910 293 switch (protect_home) {
b6c432ca 294
5327c910 295 case PROTECT_HOME_NO:
b6c432ca
DH
296 return 0;
297
b6c432ca 298 case PROTECT_HOME_READ_ONLY:
5327c910
LP
299 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
300
b6c432ca 301 case PROTECT_HOME_YES:
5327c910
LP
302 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
303
b6c432ca 304 default:
5327c910 305 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 306 }
b6c432ca
DH
307}
308
34de407a 309static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
310 assert(p);
311
5327c910
LP
312 switch (protect_system) {
313
314 case PROTECT_SYSTEM_NO:
f471b2af
DH
315 return 0;
316
f471b2af 317 case PROTECT_SYSTEM_STRICT:
5327c910
LP
318 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
319
f471b2af 320 case PROTECT_SYSTEM_YES:
5327c910
LP
321 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
322
f471b2af 323 case PROTECT_SYSTEM_FULL:
5327c910
LP
324 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
325
f471b2af 326 default:
5327c910 327 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 328 }
11a30cec
DH
329}
330
c17ec25e 331static int mount_path_compare(const void *a, const void *b) {
34de407a 332 const MountEntry *p = a, *q = b;
a0827e2b 333 int d;
15ae422b 334
6ee1a919 335 /* If the paths are not equal, then order prefixes first */
34de407a 336 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
337 if (d != 0)
338 return d;
15ae422b 339
6ee1a919
LP
340 /* If the paths are equal, check the mode */
341 if (p->mode < q->mode)
342 return -1;
15ae422b 343
6ee1a919
LP
344 if (p->mode > q->mode)
345 return 1;
15ae422b 346
6ee1a919 347 return 0;
15ae422b
LP
348}
349
34de407a 350static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
351 unsigned i;
352
353 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
354 * that. */
355
356 if (!root_directory)
357 return 0;
358
359 for (i = 0; i < n; i++) {
360 char *s;
361
362 if (m[i].has_prefix)
363 continue;
364
34de407a 365 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
366 if (!s)
367 return -ENOMEM;
368
369 free(m[i].path_malloc);
370 m[i].path_malloc = s;
371
372 m[i].has_prefix = true;
373 }
374
375 return 0;
376}
377
34de407a
LP
378static void drop_duplicates(MountEntry *m, unsigned *n) {
379 MountEntry *f, *t, *previous;
15ae422b 380
c17ec25e 381 assert(m);
15ae422b 382 assert(n);
15ae422b 383
fe3c2583
LP
384 /* Drops duplicate entries. Expects that the array is properly ordered already. */
385
1d54cd5d 386 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 387
fe3c2583
LP
388 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
389 * above. */
34de407a
LP
390 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
391 log_debug("%s is duplicate.", mount_entry_path(f));
392 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 393 mount_entry_done(f);
15ae422b 394 continue;
fe3c2583 395 }
15ae422b 396
e2d7c1a0 397 *t = *f;
15ae422b 398 previous = t;
fe3c2583
LP
399 t++;
400 }
401
402 *n = t - m;
403}
404
34de407a
LP
405static void drop_inaccessible(MountEntry *m, unsigned *n) {
406 MountEntry *f, *t;
fe3c2583
LP
407 const char *clear = NULL;
408
409 assert(m);
410 assert(n);
411
412 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
413 * ordered already. */
414
1d54cd5d 415 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
416
417 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
418 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
419 if (clear && path_startswith(mount_entry_path(f), clear)) {
420 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 421 mount_entry_done(f);
fe3c2583
LP
422 continue;
423 }
15ae422b 424
34de407a 425 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
426
427 *t = *f;
15ae422b
LP
428 t++;
429 }
430
c17ec25e 431 *n = t - m;
15ae422b
LP
432}
433
34de407a
LP
434static void drop_nop(MountEntry *m, unsigned *n) {
435 MountEntry *f, *t;
7648a565
LP
436
437 assert(m);
438 assert(n);
439
440 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
441 * list is ordered by prefixes. */
442
1d54cd5d 443 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
444
445 /* Only suppress such subtrees for READONLY and READWRITE entries */
446 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 447 MountEntry *p;
7648a565
LP
448 bool found = false;
449
450 /* Now let's find the first parent of the entry we are looking at. */
451 for (p = t-1; p >= m; p--) {
34de407a 452 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
453 found = true;
454 break;
455 }
456 }
457
458 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
459 if (found && p->mode == f->mode) {
34de407a 460 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 461 mount_entry_done(f);
7648a565
LP
462 continue;
463 }
464 }
465
466 *t = *f;
467 t++;
468 }
469
470 *n = t - m;
471}
472
34de407a
LP
473static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
474 MountEntry *f, *t;
cd2902c9
LP
475
476 assert(m);
477 assert(n);
478
1d54cd5d 479 /* Nothing to do */
cd2902c9
LP
480 if (!root_directory)
481 return;
482
483 /* Drops all mounts that are outside of the root directory. */
484
1d54cd5d 485 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 486
34de407a
LP
487 if (!path_startswith(mount_entry_path(f), root_directory)) {
488 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 489 mount_entry_done(f);
cd2902c9
LP
490 continue;
491 }
492
493 *t = *f;
494 t++;
495 }
496
497 *n = t - m;
498}
499
/* Creates a copy of the device node 'd' at the same path below 'temporary_mount'.
 *
 * Returns 1 if the node was created, 0 if 'd' does not exist or its device number is 0 (nothing is created then),
 * -EINVAL if 'd' is neither a block nor a character device, a negative errno if stat() fails otherwise, and the
 * result of log_debug_errno() if mknod() fails. */
static int clone_device_node(const char *d, const char *temporary_mount) {
        const char *target;
        struct stat st;
        int r;

        if (stat(d, &st) < 0)
                return errno == ENOENT ? 0 : -errno;

        if (!(S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)))
                return -EINVAL;

        if (st.st_rdev == 0)
                return 0;

        target = strjoina(temporary_mount, d);

        /* Bracket the creation with the SELinux helpers, so that the new node is labelled like the original */
        mac_selinux_create_file_prepare(d, st.st_mode);
        r = mknod(target, st.st_mode, st.st_rdev);
        mac_selinux_create_file_clear();
        if (r < 0)
                return log_debug_errno(errno, "mknod failed for %s: %m", d);

        return 1;
}
528
5d997827 529static int mount_private_dev(MountEntry *m) {
7f112f50
LP
530 static const char devnodes[] =
531 "/dev/null\0"
532 "/dev/zero\0"
533 "/dev/full\0"
534 "/dev/random\0"
535 "/dev/urandom\0"
536 "/dev/tty\0";
537
2b85f4e1 538 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 539 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
540 _cleanup_umask_ mode_t u;
541 int r;
542
543 assert(m);
544
545 u = umask(0000);
546
2b85f4e1
LP
547 if (!mkdtemp(temporary_mount))
548 return -errno;
549
63c372cb 550 dev = strjoina(temporary_mount, "/dev");
dc751688 551 (void) mkdir(dev, 0755);
737ba3c8 552 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
553 r = -errno;
554 goto fail;
555 }
556
63c372cb 557 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 558 (void) mkdir(devpts, 0755);
2b85f4e1
LP
559 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
560 r = -errno;
561 goto fail;
562 }
563
414b304b
ДГ
564 /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
565 * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
566 * thus, in that case make a clone
567 *
568 * in nspawn and other containers it will be a symlink, in that case make it a symlink
569 */
36ce7110
LP
570 r = is_symlink("/dev/ptmx");
571 if (r < 0)
3164e3cb 572 goto fail;
36ce7110 573 if (r > 0) {
414b304b
ДГ
574 devptmx = strjoina(temporary_mount, "/dev/ptmx");
575 if (symlink("pts/ptmx", devptmx) < 0) {
576 r = -errno;
577 goto fail;
578 }
579 } else {
580 r = clone_device_node("/dev/ptmx", temporary_mount);
98b1d2b8 581 if (r != 1)
414b304b
ДГ
582 goto fail;
583 }
e06b6479 584
63c372cb 585 devshm = strjoina(temporary_mount, "/dev/shm");
8d953682 586 (void) mkdir(devshm, 0755);
2b85f4e1
LP
587 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
588 if (r < 0) {
589 r = -errno;
590 goto fail;
591 }
592
63c372cb 593 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 594 (void) mkdir(devmqueue, 0755);
3164e3cb 595 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 596
63c372cb 597 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 598 (void) mkdir(devhugepages, 0755);
3164e3cb 599 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 600
63c372cb 601 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 602 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 603
7f112f50 604 NULSTR_FOREACH(d, devnodes) {
b5e99f23
ДГ
605 r = clone_device_node(d, temporary_mount);
606 if (r < 0)
2b85f4e1 607 goto fail;
7f112f50
LP
608 }
609
03cfe0d5 610 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 611
ee818b89
AC
612 /* Create the /dev directory if missing. It is more likely to be
613 * missing when the service is started with RootDirectory. This is
614 * consistent with mount units creating the mount points when missing.
615 */
34de407a 616 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 617
9e5f8252 618 /* Unmount everything in old /dev */
34de407a
LP
619 umount_recursive(mount_entry_path(m), 0);
620 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
621 r = -errno;
622 goto fail;
623 }
7f112f50 624
2b85f4e1
LP
625 rmdir(dev);
626 rmdir(temporary_mount);
7f112f50 627
2b85f4e1 628 return 0;
7f112f50 629
2b85f4e1
LP
630fail:
631 if (devpts)
632 umount(devpts);
7f112f50 633
2b85f4e1
LP
634 if (devshm)
635 umount(devshm);
7f112f50 636
2b85f4e1
LP
637 if (devhugepages)
638 umount(devhugepages);
7f112f50 639
2b85f4e1
LP
640 if (devmqueue)
641 umount(devmqueue);
7f112f50 642
d267c5aa
ZJS
643 umount(dev);
644 rmdir(dev);
2b85f4e1 645 rmdir(temporary_mount);
7f112f50 646
2b85f4e1 647 return r;
7f112f50
LP
648}
649
5d997827
LP
650static int mount_bind_dev(MountEntry *m) {
651 int r;
652
653 assert(m);
654
655 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
656 * /dev. This is only used when RootDirectory= is set. */
657
645767d6
LP
658 (void) mkdir_p_label(mount_entry_path(m), 0755);
659
5d997827
LP
660 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
661 if (r < 0)
662 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
663 if (r > 0) /* make this a NOP if /dev is already a mount point */
664 return 0;
665
666 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
667 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
668
669 return 1;
670}
671
672static int mount_sysfs(MountEntry *m) {
673 int r;
674
675 assert(m);
676
645767d6
LP
677 (void) mkdir_p_label(mount_entry_path(m), 0755);
678
5d997827
LP
679 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
680 if (r < 0)
681 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
682 if (r > 0) /* make this a NOP if /sys is already a mount point */
683 return 0;
684
685 /* Bind mount the host's version so that we get all child mounts of it, too. */
686 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
687 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
688
689 return 1;
690}
691
692static int mount_procfs(MountEntry *m) {
693 int r;
694
695 assert(m);
696
645767d6
LP
697 (void) mkdir_p_label(mount_entry_path(m), 0755);
698
5d997827
LP
699 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
700 if (r < 0)
701 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
702 if (r > 0) /* make this a NOP if /proc is already a mount point */
703 return 0;
704
705 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
706 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
707 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
708
709 return 1;
710}
711
6c47cd7d
LP
712static int mount_empty_dir(MountEntry *m) {
713 assert(m);
714
715 /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
716
717 (void) mkdir_p_label(mount_entry_path(m), 0755);
718 (void) umount_recursive(mount_entry_path(m), 0);
719
720 if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
721 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
722
723 return 1;
724}
725
d2d6c096
LP
726static int mount_entry_chase(
727 const char *root_directory,
728 MountEntry *m,
729 const char *path,
730 char **location) {
731
8fceda93
LP
732 char *chased;
733 int r;
734
735 assert(m);
736
737 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
738 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
739 * that applies). The result is stored in "location". */
8fceda93 740
a227a4be
LP
741 r = chase_symlinks(path, root_directory,
742 IN_SET(m->mode, BIND_MOUNT, BIND_MOUNT_RECURSIVE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, BIND_DEV, EMPTY_DIR, SYSFS, PROCFS) ? CHASE_NONEXISTENT : 0,
743 &chased);
8fceda93 744 if (r == -ENOENT && m->ignore) {
d2d6c096 745 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
746 return 0;
747 }
748 if (r < 0)
d2d6c096 749 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 750
d2d6c096 751 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 752
d2d6c096
LP
753 free(*location);
754 *location = chased;
8fceda93
LP
755
756 return 1;
757}
758
ac0930c8 759static int apply_mount(
8fceda93 760 const char *root_directory,
34de407a 761 MountEntry *m,
ac0930c8 762 const char *tmp_dir,
c17ec25e 763 const char *var_tmp_dir) {
ac0930c8 764
a227a4be 765 bool rbind = true, make = false;
15ae422b 766 const char *what;
15ae422b 767 int r;
15ae422b 768
c17ec25e 769 assert(m);
15ae422b 770
d2d6c096 771 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
772 if (r <= 0)
773 return r;
774
34de407a 775 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 776
c17ec25e 777 switch (m->mode) {
15ae422b 778
160cfdbe
LP
779 case INACCESSIBLE: {
780 struct stat target;
6d313367
LP
781
782 /* First, get rid of everything that is below if there
783 * is anything... Then, overmount it with an
c4b41707 784 * inaccessible path. */
34de407a 785 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 786
34de407a
LP
787 if (lstat(mount_entry_path(m), &target) < 0)
788 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 789
c4b41707 790 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
791 if (!what) {
792 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
793 return -ELOOP;
794 }
795 break;
160cfdbe 796 }
fe3c2583 797
15ae422b 798 case READONLY:
15ae422b 799 case READWRITE:
8fceda93 800 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 801 if (r < 0)
34de407a 802 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
803 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
804 return 0;
6b7c9f8b 805 /* This isn't a mount point yet, let's make it one. */
34de407a 806 what = mount_entry_path(m);
6b7c9f8b 807 break;
15ae422b 808
d2d6c096
LP
809 case BIND_MOUNT:
810 rbind = false;
d2d6c096 811
4831981d 812 _fallthrough_;
d2d6c096
LP
813 case BIND_MOUNT_RECURSIVE:
814 /* Also chase the source mount */
5d997827 815
d2d6c096
LP
816 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
817 if (r <= 0)
818 return r;
819
820 what = mount_entry_source(m);
a227a4be 821 make = true;
d2d6c096
LP
822 break;
823
6c47cd7d
LP
824 case EMPTY_DIR:
825 return mount_empty_dir(m);
826
ac0930c8
LP
827 case PRIVATE_TMP:
828 what = tmp_dir;
a227a4be 829 make = true;
ac0930c8
LP
830 break;
831
832 case PRIVATE_VAR_TMP:
833 what = var_tmp_dir;
a227a4be 834 make = true;
15ae422b 835 break;
e364ad06 836
d6797c92 837 case PRIVATE_DEV:
5d997827
LP
838 return mount_private_dev(m);
839
840 case BIND_DEV:
841 return mount_bind_dev(m);
842
843 case SYSFS:
844 return mount_sysfs(m);
845
846 case PROCFS:
847 return mount_procfs(m);
d6797c92 848
e364ad06
LP
849 default:
850 assert_not_reached("Unknown mode");
15ae422b
LP
851 }
852
ac0930c8 853 assert(what);
15ae422b 854
a227a4be
LP
855 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
856 bool try_again = false;
857 r = -errno;
858
859 if (r == -ENOENT && make) {
860 struct stat st;
861
862 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
863
864 if (stat(what, &st) >= 0) {
865
866 (void) mkdir_parents(mount_entry_path(m), 0755);
867
868 if (S_ISDIR(st.st_mode))
869 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
870 else
871 try_again = touch(mount_entry_path(m)) >= 0;
872 }
873 }
874
875 if (try_again) {
876 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
877 r = -errno;
878 else
879 r = 0;
880 }
881
882 if (r < 0)
883 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
884 }
6b7c9f8b 885
34de407a 886 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 887 return 0;
ac0930c8 888}
15ae422b 889
ac9de0b3 890static int make_read_only(MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 891 int r = 0;
15ae422b 892
c17ec25e 893 assert(m);
ac9de0b3 894 assert(proc_self_mountinfo);
ac0930c8 895
34de407a 896 if (mount_entry_read_only(m))
ac9de0b3 897 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
13e785f7 898 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 899 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 900 r = -errno;
737ba3c8 901 } else
6b7c9f8b
LP
902 return 0;
903
904 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
905 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
906 * read-only mounts already applied. */
ac0930c8 907
8fceda93
LP
908 if (r == -ENOENT && m->ignore)
909 r = 0;
5327c910 910
1d54cd5d 911 return r;
d944dc95
LP
912}
913
bb0ff3fb 914static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
5d997827
LP
915 assert(ns_info);
916
9c988f93
DH
917 /*
918 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
919 * since to protect the API VFS mounts, they need to be around in the
920 * first place... and RootDirectory= or RootImage= need to be set.
921 */
5d997827 922
9c988f93
DH
923 /* root_directory should point to a mount point */
924 return root_directory &&
925 (ns_info->mount_apivfs ||
926 ns_info->protect_control_groups ||
927 ns_info->protect_kernel_tunables);
5d997827
LP
928}
929
2652c6c1 930static unsigned namespace_calculate_mounts(
9c988f93 931 const char* root_directory,
bb0ff3fb 932 const NamespaceInfo *ns_info,
2652c6c1
DH
933 char** read_write_paths,
934 char** read_only_paths,
935 char** inaccessible_paths,
6c47cd7d 936 char** empty_directories,
d2d6c096
LP
937 const BindMount *bind_mounts,
938 unsigned n_bind_mounts,
2652c6c1
DH
939 const char* tmp_dir,
940 const char* var_tmp_dir,
2652c6c1
DH
941 ProtectHome protect_home,
942 ProtectSystem protect_system) {
943
b6c432ca 944 unsigned protect_home_cnt;
f471b2af
DH
945 unsigned protect_system_cnt =
946 (protect_system == PROTECT_SYSTEM_STRICT ?
947 ELEMENTSOF(protect_system_strict_table) :
948 ((protect_system == PROTECT_SYSTEM_FULL) ?
949 ELEMENTSOF(protect_system_full_table) :
950 ((protect_system == PROTECT_SYSTEM_YES) ?
951 ELEMENTSOF(protect_system_yes_table) : 0)));
952
b6c432ca
DH
953 protect_home_cnt =
954 (protect_home == PROTECT_HOME_YES ?
955 ELEMENTSOF(protect_home_yes_table) :
956 ((protect_home == PROTECT_HOME_READ_ONLY) ?
957 ELEMENTSOF(protect_home_read_only_table) : 0));
958
2652c6c1
DH
959 return !!tmp_dir + !!var_tmp_dir +
960 strv_length(read_write_paths) +
961 strv_length(read_only_paths) +
962 strv_length(inaccessible_paths) +
6c47cd7d 963 strv_length(empty_directories) +
d2d6c096 964 n_bind_mounts +
c575770b
DH
965 ns_info->private_dev +
966 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
967 (ns_info->protect_control_groups ? 1 : 0) +
968 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 969 protect_home_cnt + protect_system_cnt +
9c988f93 970 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
971}
972
613b411c 973int setup_namespace(
ee818b89 974 const char* root_directory,
915e6d16 975 const char* root_image,
bb0ff3fb 976 const NamespaceInfo *ns_info,
2a624c36
AP
977 char** read_write_paths,
978 char** read_only_paths,
979 char** inaccessible_paths,
6c47cd7d 980 char** empty_directories,
d2d6c096
LP
981 const BindMount *bind_mounts,
982 unsigned n_bind_mounts,
a004cb4c
LP
983 const char* tmp_dir,
984 const char* var_tmp_dir,
1b8689f9
LP
985 ProtectHome protect_home,
986 ProtectSystem protect_system,
915e6d16
LP
987 unsigned long mount_flags,
988 DissectImageFlags dissect_image_flags) {
15ae422b 989
915e6d16 990 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 991 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 992 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 993 _cleanup_free_ void *root_hash = NULL;
34de407a 994 MountEntry *m, *mounts = NULL;
78ebe980 995 size_t root_hash_size = 0;
d944dc95 996 bool make_slave = false;
e908468b 997 const char *root;
f0a4feb0 998 unsigned n_mounts;
d18aff04 999 bool require_prefix = false;
c17ec25e 1000 int r = 0;
15ae422b 1001
915e6d16
LP
1002 assert(ns_info);
1003
613b411c 1004 if (mount_flags == 0)
c17ec25e 1005 mount_flags = MS_SHARED;
ac0930c8 1006
915e6d16
LP
1007 if (root_image) {
1008 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1009
1010 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1011 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1012
1013 r = loop_device_make_by_path(root_image,
1014 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1015 &loop_device);
1016 if (r < 0)
1017 return r;
1018
78ebe980
LP
1019 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1020 if (r < 0)
1021 return r;
1022
1023 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1024 if (r < 0)
1025 return r;
1026
1027 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
1028 if (r < 0)
1029 return r;
915e6d16
LP
1030 }
1031
e908468b
LP
1032 if (root_directory)
1033 root = root_directory;
1034 else if (root_image || n_bind_mounts > 0) {
1035
1036 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1037 * the same mount point for all images, which is safe, since they all live in their own namespaces
1038 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1039 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1040 * while we are applying them. */
1041
1042 root = "/run/systemd/unit-root";
1043 (void) mkdir_label(root, 0700);
d18aff04 1044 require_prefix = true;
e908468b
LP
1045 } else
1046 root = NULL;
1047
cfbeb4ef 1048 n_mounts = namespace_calculate_mounts(
e908468b 1049 root,
cfbeb4ef
LP
1050 ns_info,
1051 read_write_paths,
1052 read_only_paths,
1053 inaccessible_paths,
6c47cd7d 1054 empty_directories,
d2d6c096 1055 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
1056 tmp_dir, var_tmp_dir,
1057 protect_home, protect_system);
613b411c 1058
2652c6c1 1059 /* Set mount slave mode */
e908468b 1060 if (root || n_mounts > 0)
d944dc95
LP
1061 make_slave = true;
1062
f0a4feb0 1063 if (n_mounts > 0) {
34de407a 1064 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
d18aff04 1065 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
613b411c 1066 if (r < 0)
f0a4feb0 1067 goto finish;
613b411c 1068
d18aff04 1069 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
613b411c 1070 if (r < 0)
f0a4feb0 1071 goto finish;
613b411c 1072
d18aff04 1073 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
613b411c 1074 if (r < 0)
f0a4feb0 1075 goto finish;
7ff7394d 1076
6c47cd7d
LP
1077 r = append_empty_dir_mounts(&m, empty_directories);
1078 if (r < 0)
1079 goto finish;
1080
d2d6c096
LP
1081 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1082 if (r < 0)
1083 goto finish;
1084
613b411c 1085 if (tmp_dir) {
34de407a 1086 *(m++) = (MountEntry) {
5327c910
LP
1087 .path_const = "/tmp",
1088 .mode = PRIVATE_TMP,
1089 };
613b411c 1090 }
7ff7394d 1091
613b411c 1092 if (var_tmp_dir) {
34de407a 1093 *(m++) = (MountEntry) {
5327c910
LP
1094 .path_const = "/var/tmp",
1095 .mode = PRIVATE_VAR_TMP,
1096 };
7ff7394d 1097 }
ac0930c8 1098
c575770b 1099 if (ns_info->private_dev) {
34de407a 1100 *(m++) = (MountEntry) {
5327c910
LP
1101 .path_const = "/dev",
1102 .mode = PRIVATE_DEV,
1103 };
7f112f50
LP
1104 }
1105
c575770b 1106 if (ns_info->protect_kernel_tunables) {
5327c910 1107 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1108 if (r < 0)
f0a4feb0 1109 goto finish;
c575770b
DH
1110 }
1111
1112 if (ns_info->protect_kernel_modules) {
5327c910 1113 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1114 if (r < 0)
f0a4feb0 1115 goto finish;
c575770b 1116 }
59eeb84b 1117
c575770b 1118 if (ns_info->protect_control_groups) {
34de407a 1119 *(m++) = (MountEntry) {
5327c910
LP
1120 .path_const = "/sys/fs/cgroup",
1121 .mode = READONLY,
1122 };
59eeb84b
LP
1123 }
1124
5327c910 1125 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1126 if (r < 0)
f0a4feb0 1127 goto finish;
417116f2 1128
5327c910 1129 r = append_protect_system(&m, protect_system, false);
f471b2af 1130 if (r < 0)
f0a4feb0 1131 goto finish;
417116f2 1132
e908468b 1133 if (namespace_info_mount_apivfs(root, ns_info)) {
5d997827
LP
1134 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1135 if (r < 0)
1136 goto finish;
1137 }
1138
f0a4feb0 1139 assert(mounts + n_mounts == m);
ac0930c8 1140
5327c910 1141 /* Prepend the root directory where that's necessary */
e908468b 1142 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1143 if (r < 0)
1144 goto finish;
1145
34de407a 1146 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1147
f0a4feb0 1148 drop_duplicates(mounts, &n_mounts);
e908468b 1149 drop_outside_root(root, mounts, &n_mounts);
f0a4feb0
DH
1150 drop_inaccessible(mounts, &n_mounts);
1151 drop_nop(mounts, &n_mounts);
15ae422b
LP
1152 }
1153
d944dc95
LP
1154 if (unshare(CLONE_NEWNS) < 0) {
1155 r = -errno;
1156 goto finish;
1157 }
1e4e94c8 1158
d944dc95 1159 if (make_slave) {
c2c13f2d
LP
1160 /* Remount / as SLAVE so that nothing now mounted in the namespace
1161 shows up in the parent */
d944dc95
LP
1162 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1163 r = -errno;
1164 goto finish;
1165 }
ee818b89
AC
1166 }
1167
915e6d16 1168 if (root_image) {
e908468b 1169 /* A root image is specified, mount it to the right place */
2d3a5a73 1170 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
915e6d16
LP
1171 if (r < 0)
1172 goto finish;
1173
07ce7407
TM
1174 if (decrypted_image) {
1175 r = decrypted_image_relinquish(decrypted_image);
1176 if (r < 0)
1177 goto finish;
1178 }
78ebe980 1179
915e6d16
LP
1180 loop_device_relinquish(loop_device);
1181
1182 } else if (root_directory) {
1183
e908468b
LP
1184 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1185 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1186 if (r < 0)
d944dc95 1187 goto finish;
8f1ad200 1188 if (r == 0) {
e908468b 1189 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
8f1ad200
LP
1190 r = -errno;
1191 goto finish;
1192 }
d944dc95 1193 }
e908468b
LP
1194
1195 } else if (root) {
1196
1197 /* Let's mount the main root directory to the root directory to use */
1198 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1199 r = -errno;
1200 goto finish;
1201 }
ee818b89 1202 }
c2c13f2d 1203
4e0c20de
LP
1204 /* Try to set up the new root directory before mounting anything else there. */
1205 if (root_image || root_directory)
1206 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1207
f0a4feb0 1208 if (n_mounts > 0) {
ac9de0b3 1209 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1210 char **blacklist;
1211 unsigned j;
1212
ac9de0b3
TR
1213 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1214 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1215 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1216 if (!proc_self_mountinfo) {
1217 r = -errno;
1218 goto finish;
1219 }
1220
6b7c9f8b 1221 /* First round, add in all special mounts we need */
f0a4feb0 1222 for (m = mounts; m < mounts + n_mounts; ++m) {
e908468b 1223 r = apply_mount(root, m, tmp_dir, var_tmp_dir);
c2c13f2d 1224 if (r < 0)
d944dc95 1225 goto finish;
c2c13f2d 1226 }
15ae422b 1227
6b7c9f8b 1228 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1229 blacklist = newa(char*, n_mounts+1);
1230 for (j = 0; j < n_mounts; j++)
34de407a 1231 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1232 blacklist[j] = NULL;
1233
1234 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1235 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1236 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1237 if (r < 0)
d944dc95 1238 goto finish;
c2c13f2d 1239 }
15ae422b
LP
1240 }
1241
e908468b 1242 if (root) {
ee818b89 1243 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
e908468b 1244 r = mount_move_root(root);
d944dc95
LP
1245 if (r < 0)
1246 goto finish;
ee818b89
AC
1247 }
1248
c2c13f2d
LP
1249 /* Remount / as the desired mode. Not that this will not
1250 * reestablish propagation from our side to the host, since
1251 * what's disconnected is disconnected. */
d944dc95
LP
1252 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1253 r = -errno;
1254 goto finish;
1255 }
15ae422b 1256
d944dc95 1257 r = 0;
15ae422b 1258
d944dc95 1259finish:
f0a4feb0 1260 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1261 mount_entry_done(m);
613b411c
LP
1262
1263 return r;
1264}
1265
d2d6c096
LP
1266void bind_mount_free_many(BindMount *b, unsigned n) {
1267 unsigned i;
1268
1269 assert(b || n == 0);
1270
1271 for (i = 0; i < n; i++) {
1272 free(b[i].source);
1273 free(b[i].destination);
1274 }
1275
1276 free(b);
1277}
1278
1279int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1280 _cleanup_free_ char *s = NULL, *d = NULL;
1281 BindMount *c;
1282
1283 assert(b);
1284 assert(n);
1285 assert(item);
1286
1287 s = strdup(item->source);
1288 if (!s)
1289 return -ENOMEM;
1290
1291 d = strdup(item->destination);
1292 if (!d)
1293 return -ENOMEM;
1294
1295 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1296 if (!c)
1297 return -ENOMEM;
1298
1299 *b = c;
1300
1301 c[(*n) ++] = (BindMount) {
1302 .source = s,
1303 .destination = d,
1304 .read_only = item->read_only,
1305 .recursive = item->recursive,
1306 .ignore_enoent = item->ignore_enoent,
1307 };
1308
1309 s = d = NULL;
1310 return 0;
1311}
1312
613b411c
LP
1313static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1314 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1315 char bid[SD_ID128_STRING_MAX];
1316 sd_id128_t boot_id;
1317 int r;
613b411c
LP
1318
1319 assert(id);
1320 assert(prefix);
1321 assert(path);
1322
6b46ea73
LP
1323 /* We include the boot id in the directory so that after a
1324 * reboot we can easily identify obsolete directories. */
1325
1326 r = sd_id128_get_boot(&boot_id);
1327 if (r < 0)
1328 return r;
1329
605405c6 1330 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1331 if (!x)
1332 return -ENOMEM;
1333
1334 RUN_WITH_UMASK(0077)
1335 if (!mkdtemp(x))
1336 return -errno;
1337
1338 RUN_WITH_UMASK(0000) {
1339 char *y;
1340
63c372cb 1341 y = strjoina(x, "/tmp");
613b411c
LP
1342
1343 if (mkdir(y, 0777 | S_ISVTX) < 0)
1344 return -errno;
c17ec25e 1345 }
15ae422b 1346
613b411c
LP
1347 *path = x;
1348 x = NULL;
1349
1350 return 0;
1351}
1352
/* Creates the pair of private temporary directories for the given id, one
 * below /tmp and one below /var/tmp.
 *
 * On success returns 0 and stores the two newly allocated paths in *tmp_dir
 * and *var_tmp_dir. If the second directory cannot be created, the first one
 * is removed again and the error is returned; the output parameters are only
 * written on success. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Second directory failed: undo the first one, inner "tmp" directory first */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1382
1383int setup_netns(int netns_storage_socket[2]) {
1384 _cleanup_close_ int netns = -1;
3ee897d6 1385 int r, q;
613b411c
LP
1386
1387 assert(netns_storage_socket);
1388 assert(netns_storage_socket[0] >= 0);
1389 assert(netns_storage_socket[1] >= 0);
1390
1391 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1392 * namespace reference fd. Whatever process runs this first
1393 * shall create a new namespace, all others should just join
1394 * it. To serialize that we use a file lock on the socket
1395 * pair.
613b411c
LP
1396 *
1397 * It's a bit crazy, but hey, works great! */
1398
1399 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1400 return -errno;
1401
3ee897d6
LP
1402 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1403 if (netns == -EAGAIN) {
613b411c
LP
1404 /* Nothing stored yet, so let's create a new namespace */
1405
1406 if (unshare(CLONE_NEWNET) < 0) {
1407 r = -errno;
1408 goto fail;
1409 }
1410
1411 loopback_setup();
1412
1413 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1414 if (netns < 0) {
1415 r = -errno;
1416 goto fail;
1417 }
1418
1419 r = 1;
613b411c 1420
3ee897d6
LP
1421 } else if (netns < 0) {
1422 r = netns;
1423 goto fail;
613b411c 1424
3ee897d6
LP
1425 } else {
1426 /* Yay, found something, so let's join the namespace */
613b411c
LP
1427 if (setns(netns, CLONE_NEWNET) < 0) {
1428 r = -errno;
1429 goto fail;
1430 }
1431
1432 r = 0;
1433 }
1434
3ee897d6
LP
1435 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1436 if (q < 0) {
1437 r = q;
613b411c
LP
1438 goto fail;
1439 }
1440
1441fail:
fe048ce5 1442 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1443 return r;
1444}
417116f2 1445
6e2d7c4f
MS
1446bool ns_type_supported(NamespaceType type) {
1447 const char *t, *ns_proc;
1448
0fa5b831
LP
1449 t = namespace_type_to_string(type);
1450 if (!t) /* Don't know how to translate this? Then it's not supported */
6e2d7c4f
MS
1451 return false;
1452
6e2d7c4f 1453 ns_proc = strjoina("/proc/self/ns/", t);
6e2d7c4f
MS
1454 return access(ns_proc, F_OK) == 0;
1455}
1456
1b8689f9
LP
1457static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1458 [PROTECT_HOME_NO] = "no",
1459 [PROTECT_HOME_YES] = "yes",
1460 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1461};
1462
1b8689f9
LP
1463DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1464
5e1c6154
YW
1465ProtectHome parse_protect_home_or_bool(const char *s) {
1466 int r;
1467
1468 r = parse_boolean(s);
1469 if (r > 0)
1470 return PROTECT_HOME_YES;
1471 if (r == 0)
1472 return PROTECT_HOME_NO;
1473
1474 return protect_home_from_string(s);
1475}
1476
1b8689f9
LP
1477static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1478 [PROTECT_SYSTEM_NO] = "no",
1479 [PROTECT_SYSTEM_YES] = "yes",
1480 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1481 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1482};
1483
1484DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
6e2d7c4f 1485
03c791aa
YW
1486ProtectSystem parse_protect_system_or_bool(const char *s) {
1487 int r;
1488
1489 r = parse_boolean(s);
1490 if (r > 0)
1491 return PROTECT_SYSTEM_YES;
1492 if (r == 0)
1493 return PROTECT_SYSTEM_NO;
1494
1495 return protect_system_from_string(s);
1496}
1497
6e2d7c4f
MS
1498static const char* const namespace_type_table[] = {
1499 [NAMESPACE_MOUNT] = "mnt",
1500 [NAMESPACE_CGROUP] = "cgroup",
1501 [NAMESPACE_UTS] = "uts",
1502 [NAMESPACE_IPC] = "ipc",
1503 [NAMESPACE_USER] = "user",
1504 [NAMESPACE_PID] = "pid",
1505 [NAMESPACE_NET] = "net",
1506};
1507
1508DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);