]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core: usually our enum's _INVALID and _MAX special values are named after the full...
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
10404d52 30#include "base-filesystem.h"
7f112f50 31#include "dev-setup.h"
3ffd4af2 32#include "fd-util.h"
d944dc95 33#include "fs-util.h"
915e6d16 34#include "loop-util.h"
07630cea
LP
35#include "loopback-setup.h"
36#include "missing.h"
37#include "mkdir.h"
4349cd7c 38#include "mount-util.h"
3ffd4af2 39#include "namespace.h"
07630cea 40#include "path-util.h"
d7b8eec7 41#include "selinux-util.h"
2583fbea 42#include "socket-util.h"
8b43440b 43#include "string-table.h"
07630cea
LP
44#include "string-util.h"
45#include "strv.h"
affb60b1 46#include "umask-util.h"
ee104e11 47#include "user-util.h"
07630cea 48#include "util.h"
15ae422b 49
/* Mount flags used for the tmpfs backing a private /dev, and again when that mount is remounted read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID | MS_STRICTATIME | MS_NOEXEC)
51
typedef enum MountMode {
        /* This is ordered by priority! mount_path_compare() uses the numeric order as tie-breaker for entries
         * that share a path, and drop_duplicates() keeps the first of them. */
        INACCESSIBLE,           /* overmount the path with an inaccessible node */
        BIND_MOUNT,             /* non-recursive bind mount of a source path */
        BIND_MOUNT_RECURSIVE,   /* recursive bind mount of a source path */
        PRIVATE_TMP,            /* bind mount the private temporary directory */
        PRIVATE_VAR_TMP,        /* same, for the second private temporary directory */
        PRIVATE_DEV,            /* freshly populated private /dev, see mount_private_dev() */
        BIND_DEV,               /* bind mount of the host's /dev, see mount_bind_dev() */
        SYSFS,                  /* see mount_sysfs() */
        PROCFS,                 /* see mount_procfs() */
        READONLY,               /* turn the path into a mount point and mark it read-only */
        READWRITE,              /* turn the path into a mount point, leave it writable */
} MountMode;
15ae422b 66
34de407a 67typedef struct MountEntry {
5327c910 68 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 69 MountMode mode:5;
5327c910
LP
70 bool ignore:1; /* Ignore if path does not exist? */
71 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 72 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 73 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
74 const char *source_const; /* The source path, for bind mounts */
75 char *source_malloc;
34de407a 76} MountEntry;
15ae422b 77
5d997827
LP
78/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
79 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
80static const MountEntry apivfs_table[] = {
81 { "/proc", PROCFS, false },
82 { "/dev", BIND_DEV, false },
83 { "/sys", SYSFS, false },
84};
f471b2af 85
11a30cec 86/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 87static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
88 { "/proc/sys", READONLY, false },
89 { "/proc/sysrq-trigger", READONLY, true },
90 { "/proc/latency_stats", READONLY, true },
91 { "/proc/mtrr", READONLY, true },
aa70f38b 92 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
93 { "/proc/acpi", READONLY, true },
94 { "/proc/timer_stats", READONLY, true },
95 { "/proc/asound", READONLY, true },
96 { "/proc/bus", READONLY, true },
97 { "/proc/fs", READONLY, true },
98 { "/proc/irq", READONLY, true },
99 { "/sys", READONLY, false },
100 { "/sys/kernel/debug", READONLY, true },
101 { "/sys/kernel/tracing", READONLY, true },
102 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 103 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
104};
105
c575770b 106/* ProtectKernelModules= option */
34de407a 107static const MountEntry protect_kernel_modules_table[] = {
c575770b 108#ifdef HAVE_SPLIT_USR
c6232fb0 109 { "/lib/modules", INACCESSIBLE, true },
c575770b 110#endif
c6232fb0 111 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
112};
113
b6c432ca
DH
114/*
115 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
116 * system should be protected by ProtectSystem=
117 */
34de407a 118static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
119 { "/home", READONLY, true },
120 { "/run/user", READONLY, true },
121 { "/root", READONLY, true },
b6c432ca
DH
122};
123
124/* ProtectHome=yes table */
34de407a 125static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
126 { "/home", INACCESSIBLE, true },
127 { "/run/user", INACCESSIBLE, true },
128 { "/root", INACCESSIBLE, true },
b6c432ca
DH
129};
130
f471b2af 131/* ProtectSystem=yes table */
34de407a 132static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
133 { "/usr", READONLY, false },
134 { "/boot", READONLY, true },
135 { "/efi", READONLY, true },
f471b2af
DH
136};
137
138/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 139static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
140 { "/usr", READONLY, false },
141 { "/boot", READONLY, true },
142 { "/efi", READONLY, true },
143 { "/etc", READONLY, false },
f471b2af
DH
144};
145
146/*
147 * ProtectSystem=strict table. In this strict mode, we mount everything
148 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
149 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
150 * protect those, and these options should be fully orthogonal.
151 * (And of course /home and friends are also left writable, as ProtectHome=
152 * shall manage those, orthogonally).
153 */
34de407a 154static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
155 { "/", READONLY, false },
156 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
157 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
158 { "/dev", READWRITE, false }, /* PrivateDevices= */
159 { "/home", READWRITE, true }, /* ProtectHome= */
160 { "/run/user", READWRITE, true }, /* ProtectHome= */
161 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
162};
163
34de407a 164static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
165 assert(p);
166
5327c910
LP
167 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
168 * otherwise the stack/static ->path field is returned. */
f0a4feb0 169
5327c910 170 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
171}
172
34de407a 173static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
174 assert(p);
175
176 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
177}
178
d2d6c096
LP
179static const char *mount_entry_source(const MountEntry *p) {
180 assert(p);
181
182 return p->source_malloc ?: p->source_const;
183}
184
1eb7e08e
LP
185static void mount_entry_done(MountEntry *p) {
186 assert(p);
187
188 p->path_malloc = mfree(p->path_malloc);
189 p->source_malloc = mfree(p->source_malloc);
190}
191
34de407a 192static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
193 char **i;
194
613b411c
LP
195 assert(p);
196
5327c910
LP
197 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
198
15ae422b 199 STRV_FOREACH(i, strv) {
5327c910
LP
200 bool ignore = false, needs_prefix = false;
201 const char *e = *i;
15ae422b 202
5327c910
LP
203 /* Look for any prefixes */
204 if (startswith(e, "-")) {
205 e++;
9c94d52e 206 ignore = true;
ea92ae33 207 }
5327c910
LP
208 if (startswith(e, "+")) {
209 e++;
210 needs_prefix = true;
211 }
ea92ae33 212
5327c910 213 if (!path_is_absolute(e))
15ae422b
LP
214 return -EINVAL;
215
34de407a 216 *((*p)++) = (MountEntry) {
5327c910
LP
217 .path_const = e,
218 .mode = mode,
219 .ignore = ignore,
220 .has_prefix = !needs_prefix,
221 };
15ae422b
LP
222 }
223
224 return 0;
225}
226
d2d6c096
LP
227static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
228 unsigned i;
229
230 assert(p);
231
232 for (i = 0; i < n; i++) {
233 const BindMount *b = binds + i;
234
235 *((*p)++) = (MountEntry) {
236 .path_const = b->destination,
237 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
238 .read_only = b->read_only,
239 .source_const = b->source,
240 };
241 }
242
243 return 0;
244}
245
34de407a 246static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 247 unsigned i;
11a30cec
DH
248
249 assert(p);
f471b2af 250 assert(mounts);
11a30cec 251
5327c910 252 /* Adds a list of static pre-defined entries */
f471b2af 253
5327c910 254 for (i = 0; i < n; i++)
34de407a
LP
255 *((*p)++) = (MountEntry) {
256 .path_const = mount_entry_path(mounts+i),
5327c910
LP
257 .mode = mounts[i].mode,
258 .ignore = mounts[i].ignore || ignore_protect,
259 };
f471b2af
DH
260
261 return 0;
262}
263
34de407a 264static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
265 assert(p);
266
5327c910 267 switch (protect_home) {
b6c432ca 268
5327c910 269 case PROTECT_HOME_NO:
b6c432ca
DH
270 return 0;
271
b6c432ca 272 case PROTECT_HOME_READ_ONLY:
5327c910
LP
273 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
274
b6c432ca 275 case PROTECT_HOME_YES:
5327c910
LP
276 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
277
b6c432ca 278 default:
5327c910 279 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 280 }
b6c432ca
DH
281}
282
34de407a 283static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
284 assert(p);
285
5327c910
LP
286 switch (protect_system) {
287
288 case PROTECT_SYSTEM_NO:
f471b2af
DH
289 return 0;
290
f471b2af 291 case PROTECT_SYSTEM_STRICT:
5327c910
LP
292 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
293
f471b2af 294 case PROTECT_SYSTEM_YES:
5327c910
LP
295 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
296
f471b2af 297 case PROTECT_SYSTEM_FULL:
5327c910
LP
298 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
299
f471b2af 300 default:
5327c910 301 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 302 }
11a30cec
DH
303}
304
c17ec25e 305static int mount_path_compare(const void *a, const void *b) {
34de407a 306 const MountEntry *p = a, *q = b;
a0827e2b 307 int d;
15ae422b 308
6ee1a919 309 /* If the paths are not equal, then order prefixes first */
34de407a 310 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
311 if (d != 0)
312 return d;
15ae422b 313
6ee1a919
LP
314 /* If the paths are equal, check the mode */
315 if (p->mode < q->mode)
316 return -1;
15ae422b 317
6ee1a919
LP
318 if (p->mode > q->mode)
319 return 1;
15ae422b 320
6ee1a919 321 return 0;
15ae422b
LP
322}
323
34de407a 324static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
325 unsigned i;
326
327 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
328 * that. */
329
330 if (!root_directory)
331 return 0;
332
333 for (i = 0; i < n; i++) {
334 char *s;
335
336 if (m[i].has_prefix)
337 continue;
338
34de407a 339 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
340 if (!s)
341 return -ENOMEM;
342
343 free(m[i].path_malloc);
344 m[i].path_malloc = s;
345
346 m[i].has_prefix = true;
347 }
348
349 return 0;
350}
351
34de407a
LP
352static void drop_duplicates(MountEntry *m, unsigned *n) {
353 MountEntry *f, *t, *previous;
15ae422b 354
c17ec25e 355 assert(m);
15ae422b 356 assert(n);
15ae422b 357
fe3c2583
LP
358 /* Drops duplicate entries. Expects that the array is properly ordered already. */
359
1d54cd5d 360 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 361
fe3c2583
LP
362 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
363 * above. */
34de407a
LP
364 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
365 log_debug("%s is duplicate.", mount_entry_path(f));
366 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 367 mount_entry_done(f);
15ae422b 368 continue;
fe3c2583 369 }
15ae422b 370
e2d7c1a0 371 *t = *f;
15ae422b 372 previous = t;
fe3c2583
LP
373 t++;
374 }
375
376 *n = t - m;
377}
378
34de407a
LP
379static void drop_inaccessible(MountEntry *m, unsigned *n) {
380 MountEntry *f, *t;
fe3c2583
LP
381 const char *clear = NULL;
382
383 assert(m);
384 assert(n);
385
386 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
387 * ordered already. */
388
1d54cd5d 389 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
390
391 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
392 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
393 if (clear && path_startswith(mount_entry_path(f), clear)) {
394 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 395 mount_entry_done(f);
fe3c2583
LP
396 continue;
397 }
15ae422b 398
34de407a 399 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
400
401 *t = *f;
15ae422b
LP
402 t++;
403 }
404
c17ec25e 405 *n = t - m;
15ae422b
LP
406}
407
34de407a
LP
408static void drop_nop(MountEntry *m, unsigned *n) {
409 MountEntry *f, *t;
7648a565
LP
410
411 assert(m);
412 assert(n);
413
414 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
415 * list is ordered by prefixes. */
416
1d54cd5d 417 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
418
419 /* Only suppress such subtrees for READONLY and READWRITE entries */
420 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 421 MountEntry *p;
7648a565
LP
422 bool found = false;
423
424 /* Now let's find the first parent of the entry we are looking at. */
425 for (p = t-1; p >= m; p--) {
34de407a 426 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
427 found = true;
428 break;
429 }
430 }
431
432 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
433 if (found && p->mode == f->mode) {
34de407a 434 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 435 mount_entry_done(f);
7648a565
LP
436 continue;
437 }
438 }
439
440 *t = *f;
441 t++;
442 }
443
444 *n = t - m;
445}
446
34de407a
LP
447static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
448 MountEntry *f, *t;
cd2902c9
LP
449
450 assert(m);
451 assert(n);
452
1d54cd5d 453 /* Nothing to do */
cd2902c9
LP
454 if (!root_directory)
455 return;
456
457 /* Drops all mounts that are outside of the root directory. */
458
1d54cd5d 459 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 460
34de407a
LP
461 if (!path_startswith(mount_entry_path(f), root_directory)) {
462 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 463 mount_entry_done(f);
cd2902c9
LP
464 continue;
465 }
466
467 *t = *f;
468 t++;
469 }
470
471 *n = t - m;
472}
473
5d997827 474static int mount_private_dev(MountEntry *m) {
7f112f50
LP
475 static const char devnodes[] =
476 "/dev/null\0"
477 "/dev/zero\0"
478 "/dev/full\0"
479 "/dev/random\0"
480 "/dev/urandom\0"
481 "/dev/tty\0";
482
2b85f4e1 483 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 484 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
485 _cleanup_umask_ mode_t u;
486 int r;
487
488 assert(m);
489
490 u = umask(0000);
491
2b85f4e1
LP
492 if (!mkdtemp(temporary_mount))
493 return -errno;
494
63c372cb 495 dev = strjoina(temporary_mount, "/dev");
dc751688 496 (void) mkdir(dev, 0755);
737ba3c8 497 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
498 r = -errno;
499 goto fail;
500 }
501
63c372cb 502 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 503 (void) mkdir(devpts, 0755);
2b85f4e1
LP
504 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
505 r = -errno;
506 goto fail;
507 }
508
63c372cb 509 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
510 if (symlink("pts/ptmx", devptmx) < 0) {
511 r = -errno;
512 goto fail;
513 }
e06b6479 514
63c372cb 515 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 516 (void) mkdir(devshm, 01777);
2b85f4e1
LP
517 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
518 if (r < 0) {
519 r = -errno;
520 goto fail;
521 }
522
63c372cb 523 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 524 (void) mkdir(devmqueue, 0755);
3164e3cb 525 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 526
63c372cb 527 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 528 (void) mkdir(devhugepages, 0755);
3164e3cb 529 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 530
63c372cb 531 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 532 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 533
7f112f50 534 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
535 _cleanup_free_ char *dn = NULL;
536 struct stat st;
537
538 r = stat(d, &st);
7f112f50 539 if (r < 0) {
2b85f4e1
LP
540
541 if (errno == ENOENT)
542 continue;
543
544 r = -errno;
545 goto fail;
7f112f50
LP
546 }
547
2b85f4e1
LP
548 if (!S_ISBLK(st.st_mode) &&
549 !S_ISCHR(st.st_mode)) {
550 r = -EINVAL;
551 goto fail;
552 }
553
554 if (st.st_rdev == 0)
555 continue;
556
557 dn = strappend(temporary_mount, d);
558 if (!dn) {
559 r = -ENOMEM;
560 goto fail;
561 }
562
ecabcf8b 563 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 564 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 565 mac_selinux_create_file_clear();
dd078a1e 566
2b85f4e1
LP
567 if (r < 0) {
568 r = -errno;
569 goto fail;
570 }
7f112f50
LP
571 }
572
03cfe0d5 573 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 574
ee818b89
AC
575 /* Create the /dev directory if missing. It is more likely to be
576 * missing when the service is started with RootDirectory. This is
577 * consistent with mount units creating the mount points when missing.
578 */
34de407a 579 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 580
9e5f8252 581 /* Unmount everything in old /dev */
34de407a
LP
582 umount_recursive(mount_entry_path(m), 0);
583 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
584 r = -errno;
585 goto fail;
586 }
7f112f50 587
2b85f4e1
LP
588 rmdir(dev);
589 rmdir(temporary_mount);
7f112f50 590
2b85f4e1 591 return 0;
7f112f50 592
2b85f4e1
LP
593fail:
594 if (devpts)
595 umount(devpts);
7f112f50 596
2b85f4e1
LP
597 if (devshm)
598 umount(devshm);
7f112f50 599
2b85f4e1
LP
600 if (devhugepages)
601 umount(devhugepages);
7f112f50 602
2b85f4e1
LP
603 if (devmqueue)
604 umount(devmqueue);
7f112f50 605
d267c5aa
ZJS
606 umount(dev);
607 rmdir(dev);
2b85f4e1 608 rmdir(temporary_mount);
7f112f50 609
2b85f4e1 610 return r;
7f112f50
LP
611}
612
5d997827
LP
613static int mount_bind_dev(MountEntry *m) {
614 int r;
615
616 assert(m);
617
618 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
619 * /dev. This is only used when RootDirectory= is set. */
620
621 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
622 if (r < 0)
623 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
624 if (r > 0) /* make this a NOP if /dev is already a mount point */
625 return 0;
626
627 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
628 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
629
630 return 1;
631}
632
633static int mount_sysfs(MountEntry *m) {
634 int r;
635
636 assert(m);
637
638 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
639 if (r < 0)
640 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
641 if (r > 0) /* make this a NOP if /sys is already a mount point */
642 return 0;
643
644 /* Bind mount the host's version so that we get all child mounts of it, too. */
645 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
646 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
647
648 return 1;
649}
650
651static int mount_procfs(MountEntry *m) {
652 int r;
653
654 assert(m);
655
656 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
657 if (r < 0)
658 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
659 if (r > 0) /* make this a NOP if /proc is already a mount point */
660 return 0;
661
662 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
663 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
664 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
665
666 return 1;
667}
668
d2d6c096
LP
669static int mount_entry_chase(
670 const char *root_directory,
671 MountEntry *m,
672 const char *path,
673 char **location) {
674
8fceda93
LP
675 char *chased;
676 int r;
677
678 assert(m);
679
680 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
681 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
682 * that applies). The result is stored in "location". */
8fceda93 683
d2d6c096 684 r = chase_symlinks(path, root_directory, 0, &chased);
8fceda93 685 if (r == -ENOENT && m->ignore) {
d2d6c096 686 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
687 return 0;
688 }
689 if (r < 0)
d2d6c096 690 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 691
d2d6c096 692 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 693
d2d6c096
LP
694 free(*location);
695 *location = chased;
8fceda93
LP
696
697 return 1;
698}
699
ac0930c8 700static int apply_mount(
8fceda93 701 const char *root_directory,
34de407a 702 MountEntry *m,
ac0930c8 703 const char *tmp_dir,
c17ec25e 704 const char *var_tmp_dir) {
ac0930c8 705
15ae422b 706 const char *what;
d2d6c096 707 bool rbind = true;
15ae422b 708 int r;
15ae422b 709
c17ec25e 710 assert(m);
15ae422b 711
d2d6c096 712 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
713 if (r <= 0)
714 return r;
715
34de407a 716 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 717
c17ec25e 718 switch (m->mode) {
15ae422b 719
160cfdbe
LP
720 case INACCESSIBLE: {
721 struct stat target;
6d313367
LP
722
723 /* First, get rid of everything that is below if there
724 * is anything... Then, overmount it with an
c4b41707 725 * inaccessible path. */
34de407a 726 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 727
34de407a
LP
728 if (lstat(mount_entry_path(m), &target) < 0)
729 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 730
c4b41707 731 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
732 if (!what) {
733 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
734 return -ELOOP;
735 }
736 break;
160cfdbe 737 }
fe3c2583 738
15ae422b 739 case READONLY:
15ae422b 740 case READWRITE:
8fceda93 741 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 742 if (r < 0)
34de407a 743 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
744 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
745 return 0;
6b7c9f8b 746 /* This isn't a mount point yet, let's make it one. */
34de407a 747 what = mount_entry_path(m);
6b7c9f8b 748 break;
15ae422b 749
d2d6c096
LP
750 case BIND_MOUNT:
751 rbind = false;
752 /* fallthrough */
753
754 case BIND_MOUNT_RECURSIVE:
755 /* Also chase the source mount */
5d997827 756
d2d6c096
LP
757 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
758 if (r <= 0)
759 return r;
760
761 what = mount_entry_source(m);
762 break;
763
ac0930c8
LP
764 case PRIVATE_TMP:
765 what = tmp_dir;
766 break;
767
768 case PRIVATE_VAR_TMP:
769 what = var_tmp_dir;
15ae422b 770 break;
e364ad06 771
d6797c92 772 case PRIVATE_DEV:
5d997827
LP
773 return mount_private_dev(m);
774
775 case BIND_DEV:
776 return mount_bind_dev(m);
777
778 case SYSFS:
779 return mount_sysfs(m);
780
781 case PROCFS:
782 return mount_procfs(m);
d6797c92 783
e364ad06
LP
784 default:
785 assert_not_reached("Unknown mode");
15ae422b
LP
786 }
787
ac0930c8 788 assert(what);
15ae422b 789
d2d6c096 790 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
34de407a 791 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
6b7c9f8b 792
34de407a 793 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 794 return 0;
ac0930c8 795}
15ae422b 796
ac9de0b3 797static int make_read_only(MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 798 int r = 0;
15ae422b 799
c17ec25e 800 assert(m);
ac9de0b3 801 assert(proc_self_mountinfo);
ac0930c8 802
34de407a 803 if (mount_entry_read_only(m))
ac9de0b3 804 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
13e785f7 805 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 806 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 807 r = -errno;
737ba3c8 808 } else
6b7c9f8b
LP
809 return 0;
810
811 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
812 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
813 * read-only mounts already applied. */
ac0930c8 814
8fceda93
LP
815 if (r == -ENOENT && m->ignore)
816 r = 0;
5327c910 817
1d54cd5d 818 return r;
d944dc95
LP
819}
820
9c988f93 821static bool namespace_info_mount_apivfs(const char *root_directory, const NameSpaceInfo *ns_info) {
5d997827
LP
822 assert(ns_info);
823
9c988f93
DH
824 /*
825 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
826 * since to protect the API VFS mounts, they need to be around in the
827 * first place... and RootDirectory= or RootImage= need to be set.
828 */
5d997827 829
9c988f93
DH
830 /* root_directory should point to a mount point */
831 return root_directory &&
832 (ns_info->mount_apivfs ||
833 ns_info->protect_control_groups ||
834 ns_info->protect_kernel_tunables);
5d997827
LP
835}
836
2652c6c1 837static unsigned namespace_calculate_mounts(
9c988f93 838 const char* root_directory,
c575770b 839 const NameSpaceInfo *ns_info,
2652c6c1
DH
840 char** read_write_paths,
841 char** read_only_paths,
842 char** inaccessible_paths,
d2d6c096
LP
843 const BindMount *bind_mounts,
844 unsigned n_bind_mounts,
2652c6c1
DH
845 const char* tmp_dir,
846 const char* var_tmp_dir,
2652c6c1
DH
847 ProtectHome protect_home,
848 ProtectSystem protect_system) {
849
b6c432ca 850 unsigned protect_home_cnt;
f471b2af
DH
851 unsigned protect_system_cnt =
852 (protect_system == PROTECT_SYSTEM_STRICT ?
853 ELEMENTSOF(protect_system_strict_table) :
854 ((protect_system == PROTECT_SYSTEM_FULL) ?
855 ELEMENTSOF(protect_system_full_table) :
856 ((protect_system == PROTECT_SYSTEM_YES) ?
857 ELEMENTSOF(protect_system_yes_table) : 0)));
858
b6c432ca
DH
859 protect_home_cnt =
860 (protect_home == PROTECT_HOME_YES ?
861 ELEMENTSOF(protect_home_yes_table) :
862 ((protect_home == PROTECT_HOME_READ_ONLY) ?
863 ELEMENTSOF(protect_home_read_only_table) : 0));
864
2652c6c1
DH
865 return !!tmp_dir + !!var_tmp_dir +
866 strv_length(read_write_paths) +
867 strv_length(read_only_paths) +
868 strv_length(inaccessible_paths) +
d2d6c096 869 n_bind_mounts +
c575770b
DH
870 ns_info->private_dev +
871 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
872 (ns_info->protect_control_groups ? 1 : 0) +
873 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 874 protect_home_cnt + protect_system_cnt +
9c988f93 875 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
876}
877
613b411c 878int setup_namespace(
ee818b89 879 const char* root_directory,
915e6d16 880 const char* root_image,
c575770b 881 const NameSpaceInfo *ns_info,
2a624c36
AP
882 char** read_write_paths,
883 char** read_only_paths,
884 char** inaccessible_paths,
d2d6c096
LP
885 const BindMount *bind_mounts,
886 unsigned n_bind_mounts,
a004cb4c
LP
887 const char* tmp_dir,
888 const char* var_tmp_dir,
1b8689f9
LP
889 ProtectHome protect_home,
890 ProtectSystem protect_system,
915e6d16
LP
891 unsigned long mount_flags,
892 DissectImageFlags dissect_image_flags) {
15ae422b 893
915e6d16 894 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 895 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 896 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 897 _cleanup_free_ void *root_hash = NULL;
34de407a 898 MountEntry *m, *mounts = NULL;
78ebe980 899 size_t root_hash_size = 0;
d944dc95 900 bool make_slave = false;
f0a4feb0 901 unsigned n_mounts;
c17ec25e 902 int r = 0;
15ae422b 903
915e6d16
LP
904 assert(ns_info);
905
613b411c 906 if (mount_flags == 0)
c17ec25e 907 mount_flags = MS_SHARED;
ac0930c8 908
915e6d16
LP
909 if (root_image) {
910 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
911
912 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
913 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
914
915 r = loop_device_make_by_path(root_image,
916 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
917 &loop_device);
918 if (r < 0)
919 return r;
920
78ebe980
LP
921 r = root_hash_load(root_image, &root_hash, &root_hash_size);
922 if (r < 0)
923 return r;
924
925 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
926 if (r < 0)
927 return r;
928
929 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
930 if (r < 0)
931 return r;
932
933 if (!root_directory) {
934 /* Create a mount point for the image, if it's still missing. We use the same mount point for
935 * all images, which is safe, since they all live in their own namespaces after all, and hence
936 * won't see each other. */
937 root_directory = "/run/systemd/unit-root";
938 (void) mkdir(root_directory, 0700);
939 }
940 }
941
cfbeb4ef 942 n_mounts = namespace_calculate_mounts(
9c988f93 943 root_directory,
cfbeb4ef
LP
944 ns_info,
945 read_write_paths,
946 read_only_paths,
947 inaccessible_paths,
d2d6c096 948 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
949 tmp_dir, var_tmp_dir,
950 protect_home, protect_system);
613b411c 951
2652c6c1 952 /* Set mount slave mode */
f0a4feb0 953 if (root_directory || n_mounts > 0)
d944dc95
LP
954 make_slave = true;
955
f0a4feb0 956 if (n_mounts > 0) {
34de407a 957 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 958 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 959 if (r < 0)
f0a4feb0 960 goto finish;
613b411c 961
5327c910 962 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 963 if (r < 0)
f0a4feb0 964 goto finish;
613b411c 965
5327c910 966 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 967 if (r < 0)
f0a4feb0 968 goto finish;
7ff7394d 969
d2d6c096
LP
970 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
971 if (r < 0)
972 goto finish;
973
613b411c 974 if (tmp_dir) {
34de407a 975 *(m++) = (MountEntry) {
5327c910
LP
976 .path_const = "/tmp",
977 .mode = PRIVATE_TMP,
978 };
613b411c 979 }
7ff7394d 980
613b411c 981 if (var_tmp_dir) {
34de407a 982 *(m++) = (MountEntry) {
5327c910
LP
983 .path_const = "/var/tmp",
984 .mode = PRIVATE_VAR_TMP,
985 };
7ff7394d 986 }
ac0930c8 987
c575770b 988 if (ns_info->private_dev) {
34de407a 989 *(m++) = (MountEntry) {
5327c910
LP
990 .path_const = "/dev",
991 .mode = PRIVATE_DEV,
992 };
7f112f50
LP
993 }
994
c575770b 995 if (ns_info->protect_kernel_tunables) {
5327c910 996 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 997 if (r < 0)
f0a4feb0 998 goto finish;
c575770b
DH
999 }
1000
1001 if (ns_info->protect_kernel_modules) {
5327c910 1002 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1003 if (r < 0)
f0a4feb0 1004 goto finish;
c575770b 1005 }
59eeb84b 1006
c575770b 1007 if (ns_info->protect_control_groups) {
34de407a 1008 *(m++) = (MountEntry) {
5327c910
LP
1009 .path_const = "/sys/fs/cgroup",
1010 .mode = READONLY,
1011 };
59eeb84b
LP
1012 }
1013
5327c910 1014 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1015 if (r < 0)
f0a4feb0 1016 goto finish;
417116f2 1017
5327c910 1018 r = append_protect_system(&m, protect_system, false);
f471b2af 1019 if (r < 0)
f0a4feb0 1020 goto finish;
417116f2 1021
9c988f93 1022 if (namespace_info_mount_apivfs(root_directory, ns_info)) {
5d997827
LP
1023 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1024 if (r < 0)
1025 goto finish;
1026 }
1027
f0a4feb0 1028 assert(mounts + n_mounts == m);
ac0930c8 1029
5327c910
LP
1030 /* Prepend the root directory where that's necessary */
1031 r = prefix_where_needed(mounts, n_mounts, root_directory);
1032 if (r < 0)
1033 goto finish;
1034
34de407a 1035 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1036
f0a4feb0
DH
1037 drop_duplicates(mounts, &n_mounts);
1038 drop_outside_root(root_directory, mounts, &n_mounts);
1039 drop_inaccessible(mounts, &n_mounts);
1040 drop_nop(mounts, &n_mounts);
15ae422b
LP
1041 }
1042
d944dc95
LP
1043 if (unshare(CLONE_NEWNS) < 0) {
1044 r = -errno;
1045 goto finish;
1046 }
1e4e94c8 1047
d944dc95 1048 if (make_slave) {
c2c13f2d
LP
1049 /* Remount / as SLAVE so that nothing now mounted in the namespace
1050 shows up in the parent */
d944dc95
LP
1051 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1052 r = -errno;
1053 goto finish;
1054 }
ee818b89
AC
1055 }
1056
10404d52
DH
1057 /* Try to set up the new root directory before mounting anything there */
1058 if (root_directory)
1059 (void) base_filesystem_create(root_directory, UID_INVALID, GID_INVALID);
1060
915e6d16
LP
1061 if (root_image) {
1062 r = dissected_image_mount(dissected_image, root_directory, dissect_image_flags);
1063 if (r < 0)
1064 goto finish;
1065
07ce7407
TM
1066 if (decrypted_image) {
1067 r = decrypted_image_relinquish(decrypted_image);
1068 if (r < 0)
1069 goto finish;
1070 }
78ebe980 1071
915e6d16
LP
1072 loop_device_relinquish(loop_device);
1073
1074 } else if (root_directory) {
1075
8f1ad200 1076 /* Turn directory into bind mount, if it isn't one yet */
e1873695 1077 r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1078 if (r < 0)
d944dc95 1079 goto finish;
8f1ad200
LP
1080 if (r == 0) {
1081 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
1082 r = -errno;
1083 goto finish;
1084 }
d944dc95 1085 }
ee818b89 1086 }
c2c13f2d 1087
f0a4feb0 1088 if (n_mounts > 0) {
ac9de0b3 1089 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1090 char **blacklist;
1091 unsigned j;
1092
ac9de0b3
TR
1093 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1094 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1095 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1096 if (!proc_self_mountinfo) {
1097 r = -errno;
1098 goto finish;
1099 }
1100
6b7c9f8b 1101 /* First round, add in all special mounts we need */
f0a4feb0 1102 for (m = mounts; m < mounts + n_mounts; ++m) {
8fceda93 1103 r = apply_mount(root_directory, m, tmp_dir, var_tmp_dir);
c2c13f2d 1104 if (r < 0)
d944dc95 1105 goto finish;
c2c13f2d 1106 }
15ae422b 1107
6b7c9f8b 1108 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1109 blacklist = newa(char*, n_mounts+1);
1110 for (j = 0; j < n_mounts; j++)
34de407a 1111 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1112 blacklist[j] = NULL;
1113
1114 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1115 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1116 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1117 if (r < 0)
d944dc95 1118 goto finish;
c2c13f2d 1119 }
15ae422b
LP
1120 }
1121
ee818b89
AC
1122 if (root_directory) {
1123 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1124 r = mount_move_root(root_directory);
d944dc95
LP
1125 if (r < 0)
1126 goto finish;
ee818b89
AC
1127 }
1128
c2c13f2d
LP
1129 /* Remount / as the desired mode. Note that this will not
1130 * reestablish propagation from our side to the host, since
1131 * what's disconnected is disconnected. */
d944dc95
LP
1132 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1133 r = -errno;
1134 goto finish;
1135 }
15ae422b 1136
d944dc95 1137 r = 0;
15ae422b 1138
d944dc95 1139finish:
f0a4feb0 1140 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1141 mount_entry_done(m);
613b411c
LP
1142
1143 return r;
1144}
1145
d2d6c096
LP
1146void bind_mount_free_many(BindMount *b, unsigned n) {
1147 unsigned i;
1148
1149 assert(b || n == 0);
1150
1151 for (i = 0; i < n; i++) {
1152 free(b[i].source);
1153 free(b[i].destination);
1154 }
1155
1156 free(b);
1157}
1158
1159int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1160 _cleanup_free_ char *s = NULL, *d = NULL;
1161 BindMount *c;
1162
1163 assert(b);
1164 assert(n);
1165 assert(item);
1166
1167 s = strdup(item->source);
1168 if (!s)
1169 return -ENOMEM;
1170
1171 d = strdup(item->destination);
1172 if (!d)
1173 return -ENOMEM;
1174
1175 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1176 if (!c)
1177 return -ENOMEM;
1178
1179 *b = c;
1180
1181 c[(*n) ++] = (BindMount) {
1182 .source = s,
1183 .destination = d,
1184 .read_only = item->read_only,
1185 .recursive = item->recursive,
1186 .ignore_enoent = item->ignore_enoent,
1187 };
1188
1189 s = d = NULL;
1190 return 0;
1191}
1192
613b411c
LP
1193static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1194 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1195 char bid[SD_ID128_STRING_MAX];
1196 sd_id128_t boot_id;
1197 int r;
613b411c
LP
1198
1199 assert(id);
1200 assert(prefix);
1201 assert(path);
1202
6b46ea73
LP
1203 /* We include the boot id in the directory so that after a
1204 * reboot we can easily identify obsolete directories. */
1205
1206 r = sd_id128_get_boot(&boot_id);
1207 if (r < 0)
1208 return r;
1209
605405c6 1210 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1211 if (!x)
1212 return -ENOMEM;
1213
1214 RUN_WITH_UMASK(0077)
1215 if (!mkdtemp(x))
1216 return -errno;
1217
1218 RUN_WITH_UMASK(0000) {
1219 char *y;
1220
63c372cb 1221 y = strjoina(x, "/tmp");
613b411c
LP
1222
1223 if (mkdir(y, 0777 | S_ISVTX) < 0)
1224 return -errno;
c17ec25e 1225 }
15ae422b 1226
613b411c
LP
1227 *path = x;
1228 x = NULL;
1229
1230 return 0;
1231}
1232
/* Sets up a pair of private temporary directories for the given id, one below /tmp and one below /var/tmp,
 * using setup_one_tmp_dir() for each. On success the two newly allocated paths are returned in *tmp_dir and
 * *var_tmp_dir and 0 is returned. If the second directory cannot be created, the first one is removed again
 * and the error is returned; the output parameters are not touched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back: remove the inner "tmp" directory first, then the directory containing it */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1262
1263int setup_netns(int netns_storage_socket[2]) {
1264 _cleanup_close_ int netns = -1;
3ee897d6 1265 int r, q;
613b411c
LP
1266
1267 assert(netns_storage_socket);
1268 assert(netns_storage_socket[0] >= 0);
1269 assert(netns_storage_socket[1] >= 0);
1270
1271 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1272 * namespace reference fd. Whatever process runs this first
1273 * shall create a new namespace, all others should just join
1274 * it. To serialize that we use a file lock on the socket
1275 * pair.
613b411c
LP
1276 *
1277 * It's a bit crazy, but hey, works great! */
1278
1279 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1280 return -errno;
1281
3ee897d6
LP
1282 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1283 if (netns == -EAGAIN) {
613b411c
LP
1284 /* Nothing stored yet, so let's create a new namespace */
1285
1286 if (unshare(CLONE_NEWNET) < 0) {
1287 r = -errno;
1288 goto fail;
1289 }
1290
1291 loopback_setup();
1292
1293 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1294 if (netns < 0) {
1295 r = -errno;
1296 goto fail;
1297 }
1298
1299 r = 1;
613b411c 1300
3ee897d6
LP
1301 } else if (netns < 0) {
1302 r = netns;
1303 goto fail;
613b411c 1304
3ee897d6
LP
1305 } else {
1306 /* Yay, found something, so let's join the namespace */
613b411c
LP
1307 if (setns(netns, CLONE_NEWNET) < 0) {
1308 r = -errno;
1309 goto fail;
1310 }
1311
1312 r = 0;
1313 }
1314
3ee897d6
LP
1315 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1316 if (q < 0) {
1317 r = q;
613b411c
LP
1318 goto fail;
1319 }
1320
1321fail:
fe048ce5 1322 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1323 return r;
1324}
417116f2 1325
1b8689f9
LP
1326static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1327 [PROTECT_HOME_NO] = "no",
1328 [PROTECT_HOME_YES] = "yes",
1329 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1330};
1331
1b8689f9
LP
1332DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1333
1334static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1335 [PROTECT_SYSTEM_NO] = "no",
1336 [PROTECT_SYSTEM_YES] = "yes",
1337 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1338 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1339};
1340
1341DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);