]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: create /dev, /proc, /sys when needed
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
10404d52 30#include "base-filesystem.h"
7f112f50 31#include "dev-setup.h"
3ffd4af2 32#include "fd-util.h"
d944dc95 33#include "fs-util.h"
915e6d16 34#include "loop-util.h"
07630cea
LP
35#include "loopback-setup.h"
36#include "missing.h"
37#include "mkdir.h"
4349cd7c 38#include "mount-util.h"
3ffd4af2 39#include "namespace.h"
07630cea 40#include "path-util.h"
d7b8eec7 41#include "selinux-util.h"
2583fbea 42#include "socket-util.h"
8b43440b 43#include "string-table.h"
07630cea
LP
44#include "string-util.h"
45#include "strv.h"
affb60b1 46#include "umask-util.h"
ee104e11 47#include "user-util.h"
07630cea 48#include "util.h"
15ae422b 49
/* Mount flags for the tmpfs backing a private /dev; mount_private_dev() mounts with them and
 * make_read_only() repeats them when remounting that tmpfs read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID | MS_NOEXEC | MS_STRICTATIME)
51
/* How a MountEntry is to be realized. The declaration order is significant: it is the priority order.
 * mount_path_compare() sorts entries for the same path by ascending mode, and drop_duplicates() then keeps
 * the first of them, so a smaller value wins over a larger one. */
typedef enum MountMode {
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
} MountMode;
15ae422b 66
34de407a 67typedef struct MountEntry {
5327c910 68 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 69 MountMode mode:5;
5327c910
LP
70 bool ignore:1; /* Ignore if path does not exist? */
71 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 72 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 73 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
74 const char *source_const; /* The source path, for bind mounts */
75 char *source_malloc;
34de407a 76} MountEntry;
15ae422b 77
5d997827
LP
78/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
79 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
80static const MountEntry apivfs_table[] = {
81 { "/proc", PROCFS, false },
82 { "/dev", BIND_DEV, false },
83 { "/sys", SYSFS, false },
84};
f471b2af 85
11a30cec 86/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 87static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
88 { "/proc/sys", READONLY, false },
89 { "/proc/sysrq-trigger", READONLY, true },
90 { "/proc/latency_stats", READONLY, true },
91 { "/proc/mtrr", READONLY, true },
aa70f38b 92 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
93 { "/proc/acpi", READONLY, true },
94 { "/proc/timer_stats", READONLY, true },
95 { "/proc/asound", READONLY, true },
96 { "/proc/bus", READONLY, true },
97 { "/proc/fs", READONLY, true },
98 { "/proc/irq", READONLY, true },
99 { "/sys", READONLY, false },
100 { "/sys/kernel/debug", READONLY, true },
101 { "/sys/kernel/tracing", READONLY, true },
102 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 103 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
104};
105
c575770b 106/* ProtectKernelModules= option */
34de407a 107static const MountEntry protect_kernel_modules_table[] = {
c575770b 108#ifdef HAVE_SPLIT_USR
c6232fb0 109 { "/lib/modules", INACCESSIBLE, true },
c575770b 110#endif
c6232fb0 111 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
112};
113
b6c432ca
DH
114/*
115 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
116 * system should be protected by ProtectSystem=
117 */
34de407a 118static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
119 { "/home", READONLY, true },
120 { "/run/user", READONLY, true },
121 { "/root", READONLY, true },
b6c432ca
DH
122};
123
124/* ProtectHome=yes table */
34de407a 125static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
126 { "/home", INACCESSIBLE, true },
127 { "/run/user", INACCESSIBLE, true },
128 { "/root", INACCESSIBLE, true },
b6c432ca
DH
129};
130
f471b2af 131/* ProtectSystem=yes table */
34de407a 132static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
133 { "/usr", READONLY, false },
134 { "/boot", READONLY, true },
135 { "/efi", READONLY, true },
f471b2af
DH
136};
137
138/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 139static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
140 { "/usr", READONLY, false },
141 { "/boot", READONLY, true },
142 { "/efi", READONLY, true },
143 { "/etc", READONLY, false },
f471b2af
DH
144};
145
146/*
147 * ProtectSystem=strict table. In this strict mode, we mount everything
148 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
149 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
150 * protect those, and these options should be fully orthogonal.
151 * (And of course /home and friends are also left writable, as ProtectHome=
152 * shall manage those, orthogonally).
153 */
34de407a 154static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
155 { "/", READONLY, false },
156 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
157 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
158 { "/dev", READWRITE, false }, /* PrivateDevices= */
159 { "/home", READWRITE, true }, /* ProtectHome= */
160 { "/run/user", READWRITE, true }, /* ProtectHome= */
161 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
162};
163
34de407a 164static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
165 assert(p);
166
5327c910
LP
167 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
168 * otherwise the stack/static ->path field is returned. */
f0a4feb0 169
5327c910 170 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
171}
172
34de407a 173static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
174 assert(p);
175
176 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
177}
178
d2d6c096
LP
179static const char *mount_entry_source(const MountEntry *p) {
180 assert(p);
181
182 return p->source_malloc ?: p->source_const;
183}
184
1eb7e08e
LP
185static void mount_entry_done(MountEntry *p) {
186 assert(p);
187
188 p->path_malloc = mfree(p->path_malloc);
189 p->source_malloc = mfree(p->source_malloc);
190}
191
34de407a 192static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
193 char **i;
194
613b411c
LP
195 assert(p);
196
5327c910
LP
197 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
198
15ae422b 199 STRV_FOREACH(i, strv) {
5327c910
LP
200 bool ignore = false, needs_prefix = false;
201 const char *e = *i;
15ae422b 202
5327c910
LP
203 /* Look for any prefixes */
204 if (startswith(e, "-")) {
205 e++;
9c94d52e 206 ignore = true;
ea92ae33 207 }
5327c910
LP
208 if (startswith(e, "+")) {
209 e++;
210 needs_prefix = true;
211 }
ea92ae33 212
5327c910 213 if (!path_is_absolute(e))
15ae422b
LP
214 return -EINVAL;
215
34de407a 216 *((*p)++) = (MountEntry) {
5327c910
LP
217 .path_const = e,
218 .mode = mode,
219 .ignore = ignore,
220 .has_prefix = !needs_prefix,
221 };
15ae422b
LP
222 }
223
224 return 0;
225}
226
d2d6c096
LP
227static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
228 unsigned i;
229
230 assert(p);
231
232 for (i = 0; i < n; i++) {
233 const BindMount *b = binds + i;
234
235 *((*p)++) = (MountEntry) {
236 .path_const = b->destination,
237 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
238 .read_only = b->read_only,
239 .source_const = b->source,
240 };
241 }
242
243 return 0;
244}
245
34de407a 246static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 247 unsigned i;
11a30cec
DH
248
249 assert(p);
f471b2af 250 assert(mounts);
11a30cec 251
5327c910 252 /* Adds a list of static pre-defined entries */
f471b2af 253
5327c910 254 for (i = 0; i < n; i++)
34de407a
LP
255 *((*p)++) = (MountEntry) {
256 .path_const = mount_entry_path(mounts+i),
5327c910
LP
257 .mode = mounts[i].mode,
258 .ignore = mounts[i].ignore || ignore_protect,
259 };
f471b2af
DH
260
261 return 0;
262}
263
34de407a 264static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
265 assert(p);
266
5327c910 267 switch (protect_home) {
b6c432ca 268
5327c910 269 case PROTECT_HOME_NO:
b6c432ca
DH
270 return 0;
271
b6c432ca 272 case PROTECT_HOME_READ_ONLY:
5327c910
LP
273 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
274
b6c432ca 275 case PROTECT_HOME_YES:
5327c910
LP
276 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
277
b6c432ca 278 default:
5327c910 279 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 280 }
b6c432ca
DH
281}
282
34de407a 283static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
284 assert(p);
285
5327c910
LP
286 switch (protect_system) {
287
288 case PROTECT_SYSTEM_NO:
f471b2af
DH
289 return 0;
290
f471b2af 291 case PROTECT_SYSTEM_STRICT:
5327c910
LP
292 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
293
f471b2af 294 case PROTECT_SYSTEM_YES:
5327c910
LP
295 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
296
f471b2af 297 case PROTECT_SYSTEM_FULL:
5327c910
LP
298 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
299
f471b2af 300 default:
5327c910 301 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 302 }
11a30cec
DH
303}
304
c17ec25e 305static int mount_path_compare(const void *a, const void *b) {
34de407a 306 const MountEntry *p = a, *q = b;
a0827e2b 307 int d;
15ae422b 308
6ee1a919 309 /* If the paths are not equal, then order prefixes first */
34de407a 310 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
311 if (d != 0)
312 return d;
15ae422b 313
6ee1a919
LP
314 /* If the paths are equal, check the mode */
315 if (p->mode < q->mode)
316 return -1;
15ae422b 317
6ee1a919
LP
318 if (p->mode > q->mode)
319 return 1;
15ae422b 320
6ee1a919 321 return 0;
15ae422b
LP
322}
323
34de407a 324static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
325 unsigned i;
326
327 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
328 * that. */
329
330 if (!root_directory)
331 return 0;
332
333 for (i = 0; i < n; i++) {
334 char *s;
335
336 if (m[i].has_prefix)
337 continue;
338
34de407a 339 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
340 if (!s)
341 return -ENOMEM;
342
343 free(m[i].path_malloc);
344 m[i].path_malloc = s;
345
346 m[i].has_prefix = true;
347 }
348
349 return 0;
350}
351
34de407a
LP
352static void drop_duplicates(MountEntry *m, unsigned *n) {
353 MountEntry *f, *t, *previous;
15ae422b 354
c17ec25e 355 assert(m);
15ae422b 356 assert(n);
15ae422b 357
fe3c2583
LP
358 /* Drops duplicate entries. Expects that the array is properly ordered already. */
359
1d54cd5d 360 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 361
fe3c2583
LP
362 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
363 * above. */
34de407a
LP
364 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
365 log_debug("%s is duplicate.", mount_entry_path(f));
366 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 367 mount_entry_done(f);
15ae422b 368 continue;
fe3c2583 369 }
15ae422b 370
e2d7c1a0 371 *t = *f;
15ae422b 372 previous = t;
fe3c2583
LP
373 t++;
374 }
375
376 *n = t - m;
377}
378
34de407a
LP
379static void drop_inaccessible(MountEntry *m, unsigned *n) {
380 MountEntry *f, *t;
fe3c2583
LP
381 const char *clear = NULL;
382
383 assert(m);
384 assert(n);
385
386 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
387 * ordered already. */
388
1d54cd5d 389 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
390
391 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
392 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
393 if (clear && path_startswith(mount_entry_path(f), clear)) {
394 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 395 mount_entry_done(f);
fe3c2583
LP
396 continue;
397 }
15ae422b 398
34de407a 399 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
400
401 *t = *f;
15ae422b
LP
402 t++;
403 }
404
c17ec25e 405 *n = t - m;
15ae422b
LP
406}
407
34de407a
LP
408static void drop_nop(MountEntry *m, unsigned *n) {
409 MountEntry *f, *t;
7648a565
LP
410
411 assert(m);
412 assert(n);
413
414 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
415 * list is ordered by prefixes. */
416
1d54cd5d 417 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
418
419 /* Only suppress such subtrees for READONLY and READWRITE entries */
420 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 421 MountEntry *p;
7648a565
LP
422 bool found = false;
423
424 /* Now let's find the first parent of the entry we are looking at. */
425 for (p = t-1; p >= m; p--) {
34de407a 426 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
427 found = true;
428 break;
429 }
430 }
431
432 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
433 if (found && p->mode == f->mode) {
34de407a 434 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 435 mount_entry_done(f);
7648a565
LP
436 continue;
437 }
438 }
439
440 *t = *f;
441 t++;
442 }
443
444 *n = t - m;
445}
446
34de407a
LP
447static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
448 MountEntry *f, *t;
cd2902c9
LP
449
450 assert(m);
451 assert(n);
452
1d54cd5d 453 /* Nothing to do */
cd2902c9
LP
454 if (!root_directory)
455 return;
456
457 /* Drops all mounts that are outside of the root directory. */
458
1d54cd5d 459 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 460
34de407a
LP
461 if (!path_startswith(mount_entry_path(f), root_directory)) {
462 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 463 mount_entry_done(f);
cd2902c9
LP
464 continue;
465 }
466
467 *t = *f;
468 t++;
469 }
470
471 *n = t - m;
472}
473
5d997827 474static int mount_private_dev(MountEntry *m) {
7f112f50
LP
475 static const char devnodes[] =
476 "/dev/null\0"
477 "/dev/zero\0"
478 "/dev/full\0"
479 "/dev/random\0"
480 "/dev/urandom\0"
481 "/dev/tty\0";
482
2b85f4e1 483 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 484 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
485 _cleanup_umask_ mode_t u;
486 int r;
487
488 assert(m);
489
490 u = umask(0000);
491
2b85f4e1
LP
492 if (!mkdtemp(temporary_mount))
493 return -errno;
494
63c372cb 495 dev = strjoina(temporary_mount, "/dev");
dc751688 496 (void) mkdir(dev, 0755);
737ba3c8 497 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
498 r = -errno;
499 goto fail;
500 }
501
63c372cb 502 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 503 (void) mkdir(devpts, 0755);
2b85f4e1
LP
504 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
505 r = -errno;
506 goto fail;
507 }
508
63c372cb 509 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
510 if (symlink("pts/ptmx", devptmx) < 0) {
511 r = -errno;
512 goto fail;
513 }
e06b6479 514
63c372cb 515 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 516 (void) mkdir(devshm, 01777);
2b85f4e1
LP
517 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
518 if (r < 0) {
519 r = -errno;
520 goto fail;
521 }
522
63c372cb 523 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 524 (void) mkdir(devmqueue, 0755);
3164e3cb 525 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 526
63c372cb 527 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 528 (void) mkdir(devhugepages, 0755);
3164e3cb 529 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 530
63c372cb 531 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 532 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 533
7f112f50 534 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
535 _cleanup_free_ char *dn = NULL;
536 struct stat st;
537
538 r = stat(d, &st);
7f112f50 539 if (r < 0) {
2b85f4e1
LP
540
541 if (errno == ENOENT)
542 continue;
543
544 r = -errno;
545 goto fail;
7f112f50
LP
546 }
547
2b85f4e1
LP
548 if (!S_ISBLK(st.st_mode) &&
549 !S_ISCHR(st.st_mode)) {
550 r = -EINVAL;
551 goto fail;
552 }
553
554 if (st.st_rdev == 0)
555 continue;
556
557 dn = strappend(temporary_mount, d);
558 if (!dn) {
559 r = -ENOMEM;
560 goto fail;
561 }
562
ecabcf8b 563 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 564 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 565 mac_selinux_create_file_clear();
dd078a1e 566
2b85f4e1
LP
567 if (r < 0) {
568 r = -errno;
569 goto fail;
570 }
7f112f50
LP
571 }
572
03cfe0d5 573 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 574
ee818b89
AC
575 /* Create the /dev directory if missing. It is more likely to be
576 * missing when the service is started with RootDirectory. This is
577 * consistent with mount units creating the mount points when missing.
578 */
34de407a 579 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 580
9e5f8252 581 /* Unmount everything in old /dev */
34de407a
LP
582 umount_recursive(mount_entry_path(m), 0);
583 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
584 r = -errno;
585 goto fail;
586 }
7f112f50 587
2b85f4e1
LP
588 rmdir(dev);
589 rmdir(temporary_mount);
7f112f50 590
2b85f4e1 591 return 0;
7f112f50 592
2b85f4e1
LP
593fail:
594 if (devpts)
595 umount(devpts);
7f112f50 596
2b85f4e1
LP
597 if (devshm)
598 umount(devshm);
7f112f50 599
2b85f4e1
LP
600 if (devhugepages)
601 umount(devhugepages);
7f112f50 602
2b85f4e1
LP
603 if (devmqueue)
604 umount(devmqueue);
7f112f50 605
d267c5aa
ZJS
606 umount(dev);
607 rmdir(dev);
2b85f4e1 608 rmdir(temporary_mount);
7f112f50 609
2b85f4e1 610 return r;
7f112f50
LP
611}
612
5d997827
LP
613static int mount_bind_dev(MountEntry *m) {
614 int r;
615
616 assert(m);
617
618 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
619 * /dev. This is only used when RootDirectory= is set. */
620
645767d6
LP
621 (void) mkdir_p_label(mount_entry_path(m), 0755);
622
5d997827
LP
623 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
624 if (r < 0)
625 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
626 if (r > 0) /* make this a NOP if /dev is already a mount point */
627 return 0;
628
629 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
630 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
631
632 return 1;
633}
634
635static int mount_sysfs(MountEntry *m) {
636 int r;
637
638 assert(m);
639
645767d6
LP
640 (void) mkdir_p_label(mount_entry_path(m), 0755);
641
5d997827
LP
642 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
643 if (r < 0)
644 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
645 if (r > 0) /* make this a NOP if /sys is already a mount point */
646 return 0;
647
648 /* Bind mount the host's version so that we get all child mounts of it, too. */
649 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
650 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
651
652 return 1;
653}
654
655static int mount_procfs(MountEntry *m) {
656 int r;
657
658 assert(m);
659
645767d6
LP
660 (void) mkdir_p_label(mount_entry_path(m), 0755);
661
5d997827
LP
662 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
663 if (r < 0)
664 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
665 if (r > 0) /* make this a NOP if /proc is already a mount point */
666 return 0;
667
668 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
669 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
670 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
671
672 return 1;
673}
674
d2d6c096
LP
675static int mount_entry_chase(
676 const char *root_directory,
677 MountEntry *m,
678 const char *path,
679 char **location) {
680
8fceda93
LP
681 char *chased;
682 int r;
683
684 assert(m);
685
686 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
687 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
688 * that applies). The result is stored in "location". */
8fceda93 689
d2d6c096 690 r = chase_symlinks(path, root_directory, 0, &chased);
8fceda93 691 if (r == -ENOENT && m->ignore) {
d2d6c096 692 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
693 return 0;
694 }
695 if (r < 0)
d2d6c096 696 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 697
d2d6c096 698 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 699
d2d6c096
LP
700 free(*location);
701 *location = chased;
8fceda93
LP
702
703 return 1;
704}
705
ac0930c8 706static int apply_mount(
8fceda93 707 const char *root_directory,
34de407a 708 MountEntry *m,
ac0930c8 709 const char *tmp_dir,
c17ec25e 710 const char *var_tmp_dir) {
ac0930c8 711
15ae422b 712 const char *what;
d2d6c096 713 bool rbind = true;
15ae422b 714 int r;
15ae422b 715
c17ec25e 716 assert(m);
15ae422b 717
d2d6c096 718 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
719 if (r <= 0)
720 return r;
721
34de407a 722 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 723
c17ec25e 724 switch (m->mode) {
15ae422b 725
160cfdbe
LP
726 case INACCESSIBLE: {
727 struct stat target;
6d313367
LP
728
729 /* First, get rid of everything that is below if there
730 * is anything... Then, overmount it with an
c4b41707 731 * inaccessible path. */
34de407a 732 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 733
34de407a
LP
734 if (lstat(mount_entry_path(m), &target) < 0)
735 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 736
c4b41707 737 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
738 if (!what) {
739 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
740 return -ELOOP;
741 }
742 break;
160cfdbe 743 }
fe3c2583 744
15ae422b 745 case READONLY:
15ae422b 746 case READWRITE:
8fceda93 747 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 748 if (r < 0)
34de407a 749 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
750 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
751 return 0;
6b7c9f8b 752 /* This isn't a mount point yet, let's make it one. */
34de407a 753 what = mount_entry_path(m);
6b7c9f8b 754 break;
15ae422b 755
d2d6c096
LP
756 case BIND_MOUNT:
757 rbind = false;
758 /* fallthrough */
759
760 case BIND_MOUNT_RECURSIVE:
761 /* Also chase the source mount */
5d997827 762
d2d6c096
LP
763 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
764 if (r <= 0)
765 return r;
766
767 what = mount_entry_source(m);
768 break;
769
ac0930c8
LP
770 case PRIVATE_TMP:
771 what = tmp_dir;
772 break;
773
774 case PRIVATE_VAR_TMP:
775 what = var_tmp_dir;
15ae422b 776 break;
e364ad06 777
d6797c92 778 case PRIVATE_DEV:
5d997827
LP
779 return mount_private_dev(m);
780
781 case BIND_DEV:
782 return mount_bind_dev(m);
783
784 case SYSFS:
785 return mount_sysfs(m);
786
787 case PROCFS:
788 return mount_procfs(m);
d6797c92 789
e364ad06
LP
790 default:
791 assert_not_reached("Unknown mode");
15ae422b
LP
792 }
793
ac0930c8 794 assert(what);
15ae422b 795
d2d6c096 796 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
34de407a 797 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
6b7c9f8b 798
34de407a 799 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 800 return 0;
ac0930c8 801}
15ae422b 802
ac9de0b3 803static int make_read_only(MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 804 int r = 0;
15ae422b 805
c17ec25e 806 assert(m);
ac9de0b3 807 assert(proc_self_mountinfo);
ac0930c8 808
34de407a 809 if (mount_entry_read_only(m))
ac9de0b3 810 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
13e785f7 811 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 812 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 813 r = -errno;
737ba3c8 814 } else
6b7c9f8b
LP
815 return 0;
816
817 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
818 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
819 * read-only mounts already applied. */
ac0930c8 820
8fceda93
LP
821 if (r == -ENOENT && m->ignore)
822 r = 0;
5327c910 823
1d54cd5d 824 return r;
d944dc95
LP
825}
826
9c988f93 827static bool namespace_info_mount_apivfs(const char *root_directory, const NameSpaceInfo *ns_info) {
5d997827
LP
828 assert(ns_info);
829
9c988f93
DH
830 /*
831 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
832 * since to protect the API VFS mounts, they need to be around in the
833 * first place... and RootDirectory= or RootImage= need to be set.
834 */
5d997827 835
9c988f93
DH
836 /* root_directory should point to a mount point */
837 return root_directory &&
838 (ns_info->mount_apivfs ||
839 ns_info->protect_control_groups ||
840 ns_info->protect_kernel_tunables);
5d997827
LP
841}
842
2652c6c1 843static unsigned namespace_calculate_mounts(
9c988f93 844 const char* root_directory,
c575770b 845 const NameSpaceInfo *ns_info,
2652c6c1
DH
846 char** read_write_paths,
847 char** read_only_paths,
848 char** inaccessible_paths,
d2d6c096
LP
849 const BindMount *bind_mounts,
850 unsigned n_bind_mounts,
2652c6c1
DH
851 const char* tmp_dir,
852 const char* var_tmp_dir,
2652c6c1
DH
853 ProtectHome protect_home,
854 ProtectSystem protect_system) {
855
b6c432ca 856 unsigned protect_home_cnt;
f471b2af
DH
857 unsigned protect_system_cnt =
858 (protect_system == PROTECT_SYSTEM_STRICT ?
859 ELEMENTSOF(protect_system_strict_table) :
860 ((protect_system == PROTECT_SYSTEM_FULL) ?
861 ELEMENTSOF(protect_system_full_table) :
862 ((protect_system == PROTECT_SYSTEM_YES) ?
863 ELEMENTSOF(protect_system_yes_table) : 0)));
864
b6c432ca
DH
865 protect_home_cnt =
866 (protect_home == PROTECT_HOME_YES ?
867 ELEMENTSOF(protect_home_yes_table) :
868 ((protect_home == PROTECT_HOME_READ_ONLY) ?
869 ELEMENTSOF(protect_home_read_only_table) : 0));
870
2652c6c1
DH
871 return !!tmp_dir + !!var_tmp_dir +
872 strv_length(read_write_paths) +
873 strv_length(read_only_paths) +
874 strv_length(inaccessible_paths) +
d2d6c096 875 n_bind_mounts +
c575770b
DH
876 ns_info->private_dev +
877 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
878 (ns_info->protect_control_groups ? 1 : 0) +
879 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 880 protect_home_cnt + protect_system_cnt +
9c988f93 881 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
882}
883
613b411c 884int setup_namespace(
ee818b89 885 const char* root_directory,
915e6d16 886 const char* root_image,
c575770b 887 const NameSpaceInfo *ns_info,
2a624c36
AP
888 char** read_write_paths,
889 char** read_only_paths,
890 char** inaccessible_paths,
d2d6c096
LP
891 const BindMount *bind_mounts,
892 unsigned n_bind_mounts,
a004cb4c
LP
893 const char* tmp_dir,
894 const char* var_tmp_dir,
1b8689f9
LP
895 ProtectHome protect_home,
896 ProtectSystem protect_system,
915e6d16
LP
897 unsigned long mount_flags,
898 DissectImageFlags dissect_image_flags) {
15ae422b 899
915e6d16 900 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 901 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 902 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 903 _cleanup_free_ void *root_hash = NULL;
34de407a 904 MountEntry *m, *mounts = NULL;
78ebe980 905 size_t root_hash_size = 0;
d944dc95 906 bool make_slave = false;
f0a4feb0 907 unsigned n_mounts;
c17ec25e 908 int r = 0;
15ae422b 909
915e6d16
LP
910 assert(ns_info);
911
613b411c 912 if (mount_flags == 0)
c17ec25e 913 mount_flags = MS_SHARED;
ac0930c8 914
915e6d16
LP
915 if (root_image) {
916 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
917
918 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
919 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
920
921 r = loop_device_make_by_path(root_image,
922 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
923 &loop_device);
924 if (r < 0)
925 return r;
926
78ebe980
LP
927 r = root_hash_load(root_image, &root_hash, &root_hash_size);
928 if (r < 0)
929 return r;
930
931 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
932 if (r < 0)
933 return r;
934
935 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
936 if (r < 0)
937 return r;
938
939 if (!root_directory) {
940 /* Create a mount point for the image, if it's still missing. We use the same mount point for
941 * all images, which is safe, since they all live in their own namespaces after all, and hence
942 * won't see each other. */
943 root_directory = "/run/systemd/unit-root";
944 (void) mkdir(root_directory, 0700);
945 }
946 }
947
cfbeb4ef 948 n_mounts = namespace_calculate_mounts(
9c988f93 949 root_directory,
cfbeb4ef
LP
950 ns_info,
951 read_write_paths,
952 read_only_paths,
953 inaccessible_paths,
d2d6c096 954 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
955 tmp_dir, var_tmp_dir,
956 protect_home, protect_system);
613b411c 957
2652c6c1 958 /* Set mount slave mode */
f0a4feb0 959 if (root_directory || n_mounts > 0)
d944dc95
LP
960 make_slave = true;
961
f0a4feb0 962 if (n_mounts > 0) {
34de407a 963 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 964 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 965 if (r < 0)
f0a4feb0 966 goto finish;
613b411c 967
5327c910 968 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 969 if (r < 0)
f0a4feb0 970 goto finish;
613b411c 971
5327c910 972 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 973 if (r < 0)
f0a4feb0 974 goto finish;
7ff7394d 975
d2d6c096
LP
976 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
977 if (r < 0)
978 goto finish;
979
613b411c 980 if (tmp_dir) {
34de407a 981 *(m++) = (MountEntry) {
5327c910
LP
982 .path_const = "/tmp",
983 .mode = PRIVATE_TMP,
984 };
613b411c 985 }
7ff7394d 986
613b411c 987 if (var_tmp_dir) {
34de407a 988 *(m++) = (MountEntry) {
5327c910
LP
989 .path_const = "/var/tmp",
990 .mode = PRIVATE_VAR_TMP,
991 };
7ff7394d 992 }
ac0930c8 993
c575770b 994 if (ns_info->private_dev) {
34de407a 995 *(m++) = (MountEntry) {
5327c910
LP
996 .path_const = "/dev",
997 .mode = PRIVATE_DEV,
998 };
7f112f50
LP
999 }
1000
c575770b 1001 if (ns_info->protect_kernel_tunables) {
5327c910 1002 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1003 if (r < 0)
f0a4feb0 1004 goto finish;
c575770b
DH
1005 }
1006
1007 if (ns_info->protect_kernel_modules) {
5327c910 1008 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1009 if (r < 0)
f0a4feb0 1010 goto finish;
c575770b 1011 }
59eeb84b 1012
c575770b 1013 if (ns_info->protect_control_groups) {
34de407a 1014 *(m++) = (MountEntry) {
5327c910
LP
1015 .path_const = "/sys/fs/cgroup",
1016 .mode = READONLY,
1017 };
59eeb84b
LP
1018 }
1019
5327c910 1020 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1021 if (r < 0)
f0a4feb0 1022 goto finish;
417116f2 1023
5327c910 1024 r = append_protect_system(&m, protect_system, false);
f471b2af 1025 if (r < 0)
f0a4feb0 1026 goto finish;
417116f2 1027
9c988f93 1028 if (namespace_info_mount_apivfs(root_directory, ns_info)) {
5d997827
LP
1029 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1030 if (r < 0)
1031 goto finish;
1032 }
1033
f0a4feb0 1034 assert(mounts + n_mounts == m);
ac0930c8 1035
5327c910
LP
1036 /* Prepend the root directory where that's necessary */
1037 r = prefix_where_needed(mounts, n_mounts, root_directory);
1038 if (r < 0)
1039 goto finish;
1040
34de407a 1041 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1042
f0a4feb0
DH
1043 drop_duplicates(mounts, &n_mounts);
1044 drop_outside_root(root_directory, mounts, &n_mounts);
1045 drop_inaccessible(mounts, &n_mounts);
1046 drop_nop(mounts, &n_mounts);
15ae422b
LP
1047 }
1048
d944dc95
LP
1049 if (unshare(CLONE_NEWNS) < 0) {
1050 r = -errno;
1051 goto finish;
1052 }
1e4e94c8 1053
d944dc95 1054 if (make_slave) {
c2c13f2d
LP
1055 /* Remount / as SLAVE so that nothing now mounted in the namespace
1056 shows up in the parent */
d944dc95
LP
1057 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1058 r = -errno;
1059 goto finish;
1060 }
ee818b89
AC
1061 }
1062
10404d52
DH
1063 /* Try to set up the new root directory before mounting anything there */
1064 if (root_directory)
1065 (void) base_filesystem_create(root_directory, UID_INVALID, GID_INVALID);
1066
915e6d16
LP
1067 if (root_image) {
1068 r = dissected_image_mount(dissected_image, root_directory, dissect_image_flags);
1069 if (r < 0)
1070 goto finish;
1071
07ce7407
TM
1072 if (decrypted_image) {
1073 r = decrypted_image_relinquish(decrypted_image);
1074 if (r < 0)
1075 goto finish;
1076 }
78ebe980 1077
915e6d16
LP
1078 loop_device_relinquish(loop_device);
1079
1080 } else if (root_directory) {
1081
8f1ad200 1082 /* Turn directory into bind mount, if it isn't one yet */
e1873695 1083 r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1084 if (r < 0)
d944dc95 1085 goto finish;
8f1ad200
LP
1086 if (r == 0) {
1087 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
1088 r = -errno;
1089 goto finish;
1090 }
d944dc95 1091 }
ee818b89 1092 }
c2c13f2d 1093
f0a4feb0 1094 if (n_mounts > 0) {
ac9de0b3 1095 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1096 char **blacklist;
1097 unsigned j;
1098
ac9de0b3
TR
1099 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1100 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1101 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1102 if (!proc_self_mountinfo) {
1103 r = -errno;
1104 goto finish;
1105 }
1106
6b7c9f8b 1107 /* First round, add in all special mounts we need */
f0a4feb0 1108 for (m = mounts; m < mounts + n_mounts; ++m) {
8fceda93 1109 r = apply_mount(root_directory, m, tmp_dir, var_tmp_dir);
c2c13f2d 1110 if (r < 0)
d944dc95 1111 goto finish;
c2c13f2d 1112 }
15ae422b 1113
6b7c9f8b 1114 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1115 blacklist = newa(char*, n_mounts+1);
1116 for (j = 0; j < n_mounts; j++)
34de407a 1117 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1118 blacklist[j] = NULL;
1119
1120 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1121 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1122 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1123 if (r < 0)
d944dc95 1124 goto finish;
c2c13f2d 1125 }
15ae422b
LP
1126 }
1127
ee818b89
AC
1128 if (root_directory) {
1129 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1130 r = mount_move_root(root_directory);
d944dc95
LP
1131 if (r < 0)
1132 goto finish;
ee818b89
AC
1133 }
1134
c2c13f2d
LP
1135 /* Remount / as the desired mode. Note that this will not
1136 * reestablish propagation from our side to the host, since
1137 * what's disconnected is disconnected. */
d944dc95
LP
1138 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1139 r = -errno;
1140 goto finish;
1141 }
15ae422b 1142
d944dc95 1143 r = 0;
15ae422b 1144
d944dc95 1145finish:
f0a4feb0 1146 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1147 mount_entry_done(m);
613b411c
LP
1148
1149 return r;
1150}
1151
d2d6c096
LP
1152void bind_mount_free_many(BindMount *b, unsigned n) {
1153 unsigned i;
1154
1155 assert(b || n == 0);
1156
1157 for (i = 0; i < n; i++) {
1158 free(b[i].source);
1159 free(b[i].destination);
1160 }
1161
1162 free(b);
1163}
1164
1165int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1166 _cleanup_free_ char *s = NULL, *d = NULL;
1167 BindMount *c;
1168
1169 assert(b);
1170 assert(n);
1171 assert(item);
1172
1173 s = strdup(item->source);
1174 if (!s)
1175 return -ENOMEM;
1176
1177 d = strdup(item->destination);
1178 if (!d)
1179 return -ENOMEM;
1180
1181 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1182 if (!c)
1183 return -ENOMEM;
1184
1185 *b = c;
1186
1187 c[(*n) ++] = (BindMount) {
1188 .source = s,
1189 .destination = d,
1190 .read_only = item->read_only,
1191 .recursive = item->recursive,
1192 .ignore_enoent = item->ignore_enoent,
1193 };
1194
1195 s = d = NULL;
1196 return 0;
1197}
1198
613b411c
LP
1199static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1200 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1201 char bid[SD_ID128_STRING_MAX];
1202 sd_id128_t boot_id;
1203 int r;
613b411c
LP
1204
1205 assert(id);
1206 assert(prefix);
1207 assert(path);
1208
6b46ea73
LP
1209 /* We include the boot id in the directory so that after a
1210 * reboot we can easily identify obsolete directories. */
1211
1212 r = sd_id128_get_boot(&boot_id);
1213 if (r < 0)
1214 return r;
1215
605405c6 1216 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1217 if (!x)
1218 return -ENOMEM;
1219
1220 RUN_WITH_UMASK(0077)
1221 if (!mkdtemp(x))
1222 return -errno;
1223
1224 RUN_WITH_UMASK(0000) {
1225 char *y;
1226
63c372cb 1227 y = strjoina(x, "/tmp");
613b411c
LP
1228
1229 if (mkdir(y, 0777 | S_ISVTX) < 0)
1230 return -errno;
c17ec25e 1231 }
15ae422b 1232
613b411c
LP
1233 *path = x;
1234 x = NULL;
1235
1236 return 0;
1237}
1238
/* Sets up the pair of private temporary directories for the given id, one below /tmp and one below
 * /var/tmp, using setup_one_tmp_dir() for each.
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir receive the newly allocated paths. If the
 * second directory cannot be set up, the first one is removed again, its path is freed, and the
 * error is returned; neither output parameter is written in any failure case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the directory below /tmp: first the inner "tmp" subdirectory, then the outer
         * directory itself. Failures here are deliberately ignored. */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1268
1269int setup_netns(int netns_storage_socket[2]) {
1270 _cleanup_close_ int netns = -1;
3ee897d6 1271 int r, q;
613b411c
LP
1272
1273 assert(netns_storage_socket);
1274 assert(netns_storage_socket[0] >= 0);
1275 assert(netns_storage_socket[1] >= 0);
1276
1277 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1278 * namespace reference fd. Whatever process runs this first
1279 * shall create a new namespace, all others should just join
1280 * it. To serialize that we use a file lock on the socket
1281 * pair.
613b411c
LP
1282 *
1283 * It's a bit crazy, but hey, works great! */
1284
1285 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1286 return -errno;
1287
3ee897d6
LP
1288 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1289 if (netns == -EAGAIN) {
613b411c
LP
1290 /* Nothing stored yet, so let's create a new namespace */
1291
1292 if (unshare(CLONE_NEWNET) < 0) {
1293 r = -errno;
1294 goto fail;
1295 }
1296
1297 loopback_setup();
1298
1299 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1300 if (netns < 0) {
1301 r = -errno;
1302 goto fail;
1303 }
1304
1305 r = 1;
613b411c 1306
3ee897d6
LP
1307 } else if (netns < 0) {
1308 r = netns;
1309 goto fail;
613b411c 1310
3ee897d6
LP
1311 } else {
1312 /* Yay, found something, so let's join the namespace */
613b411c
LP
1313 if (setns(netns, CLONE_NEWNET) < 0) {
1314 r = -errno;
1315 goto fail;
1316 }
1317
1318 r = 0;
1319 }
1320
3ee897d6
LP
1321 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1322 if (q < 0) {
1323 r = q;
613b411c
LP
1324 goto fail;
1325 }
1326
1327fail:
fe048ce5 1328 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1329 return r;
1330}
417116f2 1331
1b8689f9
LP
1332static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1333 [PROTECT_HOME_NO] = "no",
1334 [PROTECT_HOME_YES] = "yes",
1335 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1336};
1337
1b8689f9
LP
1338DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1339
1340static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1341 [PROTECT_SYSTEM_NO] = "no",
1342 [PROTECT_SYSTEM_YES] = "yes",
1343 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1344 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1345};
1346
1347DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);