]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: create base-filesystem directories if RootImage= or RootDirectory= are set
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
10404d52 30#include "base-filesystem.h"
7f112f50 31#include "dev-setup.h"
3ffd4af2 32#include "fd-util.h"
d944dc95 33#include "fs-util.h"
915e6d16 34#include "loop-util.h"
07630cea
LP
35#include "loopback-setup.h"
36#include "missing.h"
37#include "mkdir.h"
4349cd7c 38#include "mount-util.h"
3ffd4af2 39#include "namespace.h"
07630cea 40#include "path-util.h"
d7b8eec7 41#include "selinux-util.h"
2583fbea 42#include "socket-util.h"
8b43440b 43#include "string-table.h"
07630cea
LP
44#include "string-util.h"
45#include "strv.h"
affb60b1 46#include "umask-util.h"
ee104e11 47#include "user-util.h"
07630cea 48#include "util.h"
15ae422b 49
/* Mount flags used for the private /dev tmpfs instance: applied when the tmpfs is first mounted and again when it
 * is remounted read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID | MS_NOEXEC | MS_STRICTATIME)
51
/* What shall be done with a path inside the mount namespace we are setting up. */
typedef enum MountMode {
        /* The order matters: entries for the same path are sorted by these values, and the entry with the
         * lowest value is the one that survives duplicate removal. */
        INACCESSIBLE,           /* Overmount the path with an inaccessible node */
        BIND_MOUNT,             /* Non-recursive bind mount of a source path */
        BIND_MOUNT_RECURSIVE,   /* Recursive bind mount of a source path */
        PRIVATE_TMP,            /* Bind mount the private temporary directory over the path */
        PRIVATE_VAR_TMP,        /* Same, but for the second private temporary directory */
        PRIVATE_DEV,            /* Set up a private, minimal /dev */
        BIND_DEV,               /* Bind mount the host's /dev */
        SYSFS,                  /* Bind mount the host's /sys */
        PROCFS,                 /* Mount a new procfs instance */
        READONLY,               /* Make the path a read-only mount */
        READWRITE,              /* Make the path a mount point, without marking it read-only */
} MountMode;
15ae422b 66
34de407a 67typedef struct MountEntry {
5327c910 68 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 69 MountMode mode:5;
5327c910
LP
70 bool ignore:1; /* Ignore if path does not exist? */
71 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 72 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 73 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
74 const char *source_const; /* The source path, for bind mounts */
75 char *source_malloc;
34de407a 76} MountEntry;
15ae422b 77
5d997827
LP
78/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
79 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
80static const MountEntry apivfs_table[] = {
81 { "/proc", PROCFS, false },
82 { "/dev", BIND_DEV, false },
83 { "/sys", SYSFS, false },
84};
f471b2af 85
11a30cec 86/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 87static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
88 { "/proc/sys", READONLY, false },
89 { "/proc/sysrq-trigger", READONLY, true },
90 { "/proc/latency_stats", READONLY, true },
91 { "/proc/mtrr", READONLY, true },
aa70f38b 92 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
93 { "/proc/acpi", READONLY, true },
94 { "/proc/timer_stats", READONLY, true },
95 { "/proc/asound", READONLY, true },
96 { "/proc/bus", READONLY, true },
97 { "/proc/fs", READONLY, true },
98 { "/proc/irq", READONLY, true },
99 { "/sys", READONLY, false },
100 { "/sys/kernel/debug", READONLY, true },
101 { "/sys/kernel/tracing", READONLY, true },
102 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
11a30cec
DH
103};
104
c575770b 105/* ProtectKernelModules= option */
34de407a 106static const MountEntry protect_kernel_modules_table[] = {
c575770b 107#ifdef HAVE_SPLIT_USR
c6232fb0 108 { "/lib/modules", INACCESSIBLE, true },
c575770b 109#endif
c6232fb0 110 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
111};
112
b6c432ca
DH
113/*
114 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
115 * system should be protected by ProtectSystem=
116 */
34de407a 117static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
118 { "/home", READONLY, true },
119 { "/run/user", READONLY, true },
120 { "/root", READONLY, true },
b6c432ca
DH
121};
122
123/* ProtectHome=yes table */
34de407a 124static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
125 { "/home", INACCESSIBLE, true },
126 { "/run/user", INACCESSIBLE, true },
127 { "/root", INACCESSIBLE, true },
b6c432ca
DH
128};
129
f471b2af 130/* ProtectSystem=yes table */
34de407a 131static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
132 { "/usr", READONLY, false },
133 { "/boot", READONLY, true },
134 { "/efi", READONLY, true },
f471b2af
DH
135};
136
137/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 138static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
139 { "/usr", READONLY, false },
140 { "/boot", READONLY, true },
141 { "/efi", READONLY, true },
142 { "/etc", READONLY, false },
f471b2af
DH
143};
144
145/*
146 * ProtectSystem=strict table. In this strict mode, we mount everything
147 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
148 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
149 * protect those, and these options should be fully orthogonal.
150 * (And of course /home and friends are also left writable, as ProtectHome=
151 * shall manage those, orthogonally).
152 */
34de407a 153static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
154 { "/", READONLY, false },
155 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
156 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
157 { "/dev", READWRITE, false }, /* PrivateDevices= */
158 { "/home", READWRITE, true }, /* ProtectHome= */
159 { "/run/user", READWRITE, true }, /* ProtectHome= */
160 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
161};
162
34de407a 163static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
164 assert(p);
165
5327c910
LP
166 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
167 * otherwise the stack/static ->path field is returned. */
f0a4feb0 168
5327c910 169 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
170}
171
34de407a 172static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
173 assert(p);
174
175 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
176}
177
d2d6c096
LP
178static const char *mount_entry_source(const MountEntry *p) {
179 assert(p);
180
181 return p->source_malloc ?: p->source_const;
182}
183
1eb7e08e
LP
184static void mount_entry_done(MountEntry *p) {
185 assert(p);
186
187 p->path_malloc = mfree(p->path_malloc);
188 p->source_malloc = mfree(p->source_malloc);
189}
190
34de407a 191static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
192 char **i;
193
613b411c
LP
194 assert(p);
195
5327c910
LP
196 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
197
15ae422b 198 STRV_FOREACH(i, strv) {
5327c910
LP
199 bool ignore = false, needs_prefix = false;
200 const char *e = *i;
15ae422b 201
5327c910
LP
202 /* Look for any prefixes */
203 if (startswith(e, "-")) {
204 e++;
9c94d52e 205 ignore = true;
ea92ae33 206 }
5327c910
LP
207 if (startswith(e, "+")) {
208 e++;
209 needs_prefix = true;
210 }
ea92ae33 211
5327c910 212 if (!path_is_absolute(e))
15ae422b
LP
213 return -EINVAL;
214
34de407a 215 *((*p)++) = (MountEntry) {
5327c910
LP
216 .path_const = e,
217 .mode = mode,
218 .ignore = ignore,
219 .has_prefix = !needs_prefix,
220 };
15ae422b
LP
221 }
222
223 return 0;
224}
225
d2d6c096
LP
226static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
227 unsigned i;
228
229 assert(p);
230
231 for (i = 0; i < n; i++) {
232 const BindMount *b = binds + i;
233
234 *((*p)++) = (MountEntry) {
235 .path_const = b->destination,
236 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
237 .read_only = b->read_only,
238 .source_const = b->source,
239 };
240 }
241
242 return 0;
243}
244
34de407a 245static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 246 unsigned i;
11a30cec
DH
247
248 assert(p);
f471b2af 249 assert(mounts);
11a30cec 250
5327c910 251 /* Adds a list of static pre-defined entries */
f471b2af 252
5327c910 253 for (i = 0; i < n; i++)
34de407a
LP
254 *((*p)++) = (MountEntry) {
255 .path_const = mount_entry_path(mounts+i),
5327c910
LP
256 .mode = mounts[i].mode,
257 .ignore = mounts[i].ignore || ignore_protect,
258 };
f471b2af
DH
259
260 return 0;
261}
262
34de407a 263static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
264 assert(p);
265
5327c910 266 switch (protect_home) {
b6c432ca 267
5327c910 268 case PROTECT_HOME_NO:
b6c432ca
DH
269 return 0;
270
b6c432ca 271 case PROTECT_HOME_READ_ONLY:
5327c910
LP
272 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
273
b6c432ca 274 case PROTECT_HOME_YES:
5327c910
LP
275 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
276
b6c432ca 277 default:
5327c910 278 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 279 }
b6c432ca
DH
280}
281
34de407a 282static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
283 assert(p);
284
5327c910
LP
285 switch (protect_system) {
286
287 case PROTECT_SYSTEM_NO:
f471b2af
DH
288 return 0;
289
f471b2af 290 case PROTECT_SYSTEM_STRICT:
5327c910
LP
291 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
292
f471b2af 293 case PROTECT_SYSTEM_YES:
5327c910
LP
294 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
295
f471b2af 296 case PROTECT_SYSTEM_FULL:
5327c910
LP
297 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
298
f471b2af 299 default:
5327c910 300 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 301 }
11a30cec
DH
302}
303
c17ec25e 304static int mount_path_compare(const void *a, const void *b) {
34de407a 305 const MountEntry *p = a, *q = b;
a0827e2b 306 int d;
15ae422b 307
6ee1a919 308 /* If the paths are not equal, then order prefixes first */
34de407a 309 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
310 if (d != 0)
311 return d;
15ae422b 312
6ee1a919
LP
313 /* If the paths are equal, check the mode */
314 if (p->mode < q->mode)
315 return -1;
15ae422b 316
6ee1a919
LP
317 if (p->mode > q->mode)
318 return 1;
15ae422b 319
6ee1a919 320 return 0;
15ae422b
LP
321}
322
34de407a 323static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
324 unsigned i;
325
326 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
327 * that. */
328
329 if (!root_directory)
330 return 0;
331
332 for (i = 0; i < n; i++) {
333 char *s;
334
335 if (m[i].has_prefix)
336 continue;
337
34de407a 338 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
339 if (!s)
340 return -ENOMEM;
341
342 free(m[i].path_malloc);
343 m[i].path_malloc = s;
344
345 m[i].has_prefix = true;
346 }
347
348 return 0;
349}
350
34de407a
LP
351static void drop_duplicates(MountEntry *m, unsigned *n) {
352 MountEntry *f, *t, *previous;
15ae422b 353
c17ec25e 354 assert(m);
15ae422b 355 assert(n);
15ae422b 356
fe3c2583
LP
357 /* Drops duplicate entries. Expects that the array is properly ordered already. */
358
1d54cd5d 359 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 360
fe3c2583
LP
361 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
362 * above. */
34de407a
LP
363 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
364 log_debug("%s is duplicate.", mount_entry_path(f));
365 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 366 mount_entry_done(f);
15ae422b 367 continue;
fe3c2583 368 }
15ae422b 369
e2d7c1a0 370 *t = *f;
15ae422b 371 previous = t;
fe3c2583
LP
372 t++;
373 }
374
375 *n = t - m;
376}
377
34de407a
LP
378static void drop_inaccessible(MountEntry *m, unsigned *n) {
379 MountEntry *f, *t;
fe3c2583
LP
380 const char *clear = NULL;
381
382 assert(m);
383 assert(n);
384
385 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
386 * ordered already. */
387
1d54cd5d 388 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
389
390 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
391 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
392 if (clear && path_startswith(mount_entry_path(f), clear)) {
393 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 394 mount_entry_done(f);
fe3c2583
LP
395 continue;
396 }
15ae422b 397
34de407a 398 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
399
400 *t = *f;
15ae422b
LP
401 t++;
402 }
403
c17ec25e 404 *n = t - m;
15ae422b
LP
405}
406
34de407a
LP
407static void drop_nop(MountEntry *m, unsigned *n) {
408 MountEntry *f, *t;
7648a565
LP
409
410 assert(m);
411 assert(n);
412
413 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
414 * list is ordered by prefixes. */
415
1d54cd5d 416 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
417
418 /* Only suppress such subtrees for READONLY and READWRITE entries */
419 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 420 MountEntry *p;
7648a565
LP
421 bool found = false;
422
423 /* Now let's find the first parent of the entry we are looking at. */
424 for (p = t-1; p >= m; p--) {
34de407a 425 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
426 found = true;
427 break;
428 }
429 }
430
431 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
432 if (found && p->mode == f->mode) {
34de407a 433 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 434 mount_entry_done(f);
7648a565
LP
435 continue;
436 }
437 }
438
439 *t = *f;
440 t++;
441 }
442
443 *n = t - m;
444}
445
34de407a
LP
446static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
447 MountEntry *f, *t;
cd2902c9
LP
448
449 assert(m);
450 assert(n);
451
1d54cd5d 452 /* Nothing to do */
cd2902c9
LP
453 if (!root_directory)
454 return;
455
456 /* Drops all mounts that are outside of the root directory. */
457
1d54cd5d 458 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 459
34de407a
LP
460 if (!path_startswith(mount_entry_path(f), root_directory)) {
461 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 462 mount_entry_done(f);
cd2902c9
LP
463 continue;
464 }
465
466 *t = *f;
467 t++;
468 }
469
470 *n = t - m;
471}
472
5d997827 473static int mount_private_dev(MountEntry *m) {
7f112f50
LP
474 static const char devnodes[] =
475 "/dev/null\0"
476 "/dev/zero\0"
477 "/dev/full\0"
478 "/dev/random\0"
479 "/dev/urandom\0"
480 "/dev/tty\0";
481
2b85f4e1 482 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 483 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
484 _cleanup_umask_ mode_t u;
485 int r;
486
487 assert(m);
488
489 u = umask(0000);
490
2b85f4e1
LP
491 if (!mkdtemp(temporary_mount))
492 return -errno;
493
63c372cb 494 dev = strjoina(temporary_mount, "/dev");
dc751688 495 (void) mkdir(dev, 0755);
737ba3c8 496 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
497 r = -errno;
498 goto fail;
499 }
500
63c372cb 501 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 502 (void) mkdir(devpts, 0755);
2b85f4e1
LP
503 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
504 r = -errno;
505 goto fail;
506 }
507
63c372cb 508 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
509 if (symlink("pts/ptmx", devptmx) < 0) {
510 r = -errno;
511 goto fail;
512 }
e06b6479 513
63c372cb 514 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 515 (void) mkdir(devshm, 01777);
2b85f4e1
LP
516 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
517 if (r < 0) {
518 r = -errno;
519 goto fail;
520 }
521
63c372cb 522 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 523 (void) mkdir(devmqueue, 0755);
3164e3cb 524 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 525
63c372cb 526 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 527 (void) mkdir(devhugepages, 0755);
3164e3cb 528 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 529
63c372cb 530 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 531 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 532
7f112f50 533 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
534 _cleanup_free_ char *dn = NULL;
535 struct stat st;
536
537 r = stat(d, &st);
7f112f50 538 if (r < 0) {
2b85f4e1
LP
539
540 if (errno == ENOENT)
541 continue;
542
543 r = -errno;
544 goto fail;
7f112f50
LP
545 }
546
2b85f4e1
LP
547 if (!S_ISBLK(st.st_mode) &&
548 !S_ISCHR(st.st_mode)) {
549 r = -EINVAL;
550 goto fail;
551 }
552
553 if (st.st_rdev == 0)
554 continue;
555
556 dn = strappend(temporary_mount, d);
557 if (!dn) {
558 r = -ENOMEM;
559 goto fail;
560 }
561
ecabcf8b 562 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 563 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 564 mac_selinux_create_file_clear();
dd078a1e 565
2b85f4e1
LP
566 if (r < 0) {
567 r = -errno;
568 goto fail;
569 }
7f112f50
LP
570 }
571
03cfe0d5 572 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 573
ee818b89
AC
574 /* Create the /dev directory if missing. It is more likely to be
575 * missing when the service is started with RootDirectory. This is
576 * consistent with mount units creating the mount points when missing.
577 */
34de407a 578 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 579
9e5f8252 580 /* Unmount everything in old /dev */
34de407a
LP
581 umount_recursive(mount_entry_path(m), 0);
582 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
583 r = -errno;
584 goto fail;
585 }
7f112f50 586
2b85f4e1
LP
587 rmdir(dev);
588 rmdir(temporary_mount);
7f112f50 589
2b85f4e1 590 return 0;
7f112f50 591
2b85f4e1
LP
592fail:
593 if (devpts)
594 umount(devpts);
7f112f50 595
2b85f4e1
LP
596 if (devshm)
597 umount(devshm);
7f112f50 598
2b85f4e1
LP
599 if (devhugepages)
600 umount(devhugepages);
7f112f50 601
2b85f4e1
LP
602 if (devmqueue)
603 umount(devmqueue);
7f112f50 604
d267c5aa
ZJS
605 umount(dev);
606 rmdir(dev);
2b85f4e1 607 rmdir(temporary_mount);
7f112f50 608
2b85f4e1 609 return r;
7f112f50
LP
610}
611
5d997827
LP
612static int mount_bind_dev(MountEntry *m) {
613 int r;
614
615 assert(m);
616
617 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
618 * /dev. This is only used when RootDirectory= is set. */
619
620 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
621 if (r < 0)
622 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
623 if (r > 0) /* make this a NOP if /dev is already a mount point */
624 return 0;
625
626 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
627 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
628
629 return 1;
630}
631
632static int mount_sysfs(MountEntry *m) {
633 int r;
634
635 assert(m);
636
637 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
638 if (r < 0)
639 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
640 if (r > 0) /* make this a NOP if /sys is already a mount point */
641 return 0;
642
643 /* Bind mount the host's version so that we get all child mounts of it, too. */
644 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
645 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
646
647 return 1;
648}
649
650static int mount_procfs(MountEntry *m) {
651 int r;
652
653 assert(m);
654
655 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
656 if (r < 0)
657 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
658 if (r > 0) /* make this a NOP if /proc is already a mount point */
659 return 0;
660
661 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
662 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
663 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
664
665 return 1;
666}
667
d2d6c096
LP
668static int mount_entry_chase(
669 const char *root_directory,
670 MountEntry *m,
671 const char *path,
672 char **location) {
673
8fceda93
LP
674 char *chased;
675 int r;
676
677 assert(m);
678
679 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
680 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
681 * that applies). The result is stored in "location". */
8fceda93 682
d2d6c096 683 r = chase_symlinks(path, root_directory, 0, &chased);
8fceda93 684 if (r == -ENOENT && m->ignore) {
d2d6c096 685 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
686 return 0;
687 }
688 if (r < 0)
d2d6c096 689 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 690
d2d6c096 691 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 692
d2d6c096
LP
693 free(*location);
694 *location = chased;
8fceda93
LP
695
696 return 1;
697}
698
ac0930c8 699static int apply_mount(
8fceda93 700 const char *root_directory,
34de407a 701 MountEntry *m,
ac0930c8 702 const char *tmp_dir,
c17ec25e 703 const char *var_tmp_dir) {
ac0930c8 704
15ae422b 705 const char *what;
d2d6c096 706 bool rbind = true;
15ae422b 707 int r;
15ae422b 708
c17ec25e 709 assert(m);
15ae422b 710
d2d6c096 711 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
712 if (r <= 0)
713 return r;
714
34de407a 715 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 716
c17ec25e 717 switch (m->mode) {
15ae422b 718
160cfdbe
LP
719 case INACCESSIBLE: {
720 struct stat target;
6d313367
LP
721
722 /* First, get rid of everything that is below if there
723 * is anything... Then, overmount it with an
c4b41707 724 * inaccessible path. */
34de407a 725 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 726
34de407a
LP
727 if (lstat(mount_entry_path(m), &target) < 0)
728 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 729
c4b41707 730 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
731 if (!what) {
732 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
733 return -ELOOP;
734 }
735 break;
160cfdbe 736 }
fe3c2583 737
15ae422b 738 case READONLY:
15ae422b 739 case READWRITE:
8fceda93 740 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 741 if (r < 0)
34de407a 742 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
743 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
744 return 0;
6b7c9f8b 745 /* This isn't a mount point yet, let's make it one. */
34de407a 746 what = mount_entry_path(m);
6b7c9f8b 747 break;
15ae422b 748
d2d6c096
LP
749 case BIND_MOUNT:
750 rbind = false;
751 /* fallthrough */
752
753 case BIND_MOUNT_RECURSIVE:
754 /* Also chase the source mount */
5d997827 755
d2d6c096
LP
756 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
757 if (r <= 0)
758 return r;
759
760 what = mount_entry_source(m);
761 break;
762
ac0930c8
LP
763 case PRIVATE_TMP:
764 what = tmp_dir;
765 break;
766
767 case PRIVATE_VAR_TMP:
768 what = var_tmp_dir;
15ae422b 769 break;
e364ad06 770
d6797c92 771 case PRIVATE_DEV:
5d997827
LP
772 return mount_private_dev(m);
773
774 case BIND_DEV:
775 return mount_bind_dev(m);
776
777 case SYSFS:
778 return mount_sysfs(m);
779
780 case PROCFS:
781 return mount_procfs(m);
d6797c92 782
e364ad06
LP
783 default:
784 assert_not_reached("Unknown mode");
15ae422b
LP
785 }
786
ac0930c8 787 assert(what);
15ae422b 788
d2d6c096 789 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
34de407a 790 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
6b7c9f8b 791
34de407a 792 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 793 return 0;
ac0930c8 794}
15ae422b 795
34de407a 796static int make_read_only(MountEntry *m, char **blacklist) {
6b7c9f8b 797 int r = 0;
15ae422b 798
c17ec25e 799 assert(m);
ac0930c8 800
34de407a
LP
801 if (mount_entry_read_only(m))
802 r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
13e785f7 803 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 804 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 805 r = -errno;
737ba3c8 806 } else
6b7c9f8b
LP
807 return 0;
808
809 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
810 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
811 * read-only mounts already applied. */
ac0930c8 812
8fceda93
LP
813 if (r == -ENOENT && m->ignore)
814 r = 0;
5327c910 815
1d54cd5d 816 return r;
d944dc95
LP
817}
818
5d997827
LP
819static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) {
820 assert(ns_info);
821
822 /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts,
823 * they need to be around in the first place... */
824
825 return ns_info->mount_apivfs ||
826 ns_info->protect_control_groups ||
827 ns_info->protect_kernel_tunables;
828}
829
2652c6c1 830static unsigned namespace_calculate_mounts(
c575770b 831 const NameSpaceInfo *ns_info,
2652c6c1
DH
832 char** read_write_paths,
833 char** read_only_paths,
834 char** inaccessible_paths,
d2d6c096
LP
835 const BindMount *bind_mounts,
836 unsigned n_bind_mounts,
2652c6c1
DH
837 const char* tmp_dir,
838 const char* var_tmp_dir,
2652c6c1
DH
839 ProtectHome protect_home,
840 ProtectSystem protect_system) {
841
b6c432ca 842 unsigned protect_home_cnt;
f471b2af
DH
843 unsigned protect_system_cnt =
844 (protect_system == PROTECT_SYSTEM_STRICT ?
845 ELEMENTSOF(protect_system_strict_table) :
846 ((protect_system == PROTECT_SYSTEM_FULL) ?
847 ELEMENTSOF(protect_system_full_table) :
848 ((protect_system == PROTECT_SYSTEM_YES) ?
849 ELEMENTSOF(protect_system_yes_table) : 0)));
850
b6c432ca
DH
851 protect_home_cnt =
852 (protect_home == PROTECT_HOME_YES ?
853 ELEMENTSOF(protect_home_yes_table) :
854 ((protect_home == PROTECT_HOME_READ_ONLY) ?
855 ELEMENTSOF(protect_home_read_only_table) : 0));
856
2652c6c1
DH
857 return !!tmp_dir + !!var_tmp_dir +
858 strv_length(read_write_paths) +
859 strv_length(read_only_paths) +
860 strv_length(inaccessible_paths) +
d2d6c096 861 n_bind_mounts +
c575770b
DH
862 ns_info->private_dev +
863 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
864 (ns_info->protect_control_groups ? 1 : 0) +
865 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827
LP
866 protect_home_cnt + protect_system_cnt +
867 (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
868}
869
613b411c 870int setup_namespace(
ee818b89 871 const char* root_directory,
915e6d16 872 const char* root_image,
c575770b 873 const NameSpaceInfo *ns_info,
2a624c36
AP
874 char** read_write_paths,
875 char** read_only_paths,
876 char** inaccessible_paths,
d2d6c096
LP
877 const BindMount *bind_mounts,
878 unsigned n_bind_mounts,
a004cb4c
LP
879 const char* tmp_dir,
880 const char* var_tmp_dir,
1b8689f9
LP
881 ProtectHome protect_home,
882 ProtectSystem protect_system,
915e6d16
LP
883 unsigned long mount_flags,
884 DissectImageFlags dissect_image_flags) {
15ae422b 885
915e6d16 886 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 887 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 888 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 889 _cleanup_free_ void *root_hash = NULL;
34de407a 890 MountEntry *m, *mounts = NULL;
78ebe980 891 size_t root_hash_size = 0;
d944dc95 892 bool make_slave = false;
f0a4feb0 893 unsigned n_mounts;
c17ec25e 894 int r = 0;
15ae422b 895
915e6d16
LP
896 assert(ns_info);
897
613b411c 898 if (mount_flags == 0)
c17ec25e 899 mount_flags = MS_SHARED;
ac0930c8 900
915e6d16
LP
901 if (root_image) {
902 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
903
904 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
905 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
906
907 r = loop_device_make_by_path(root_image,
908 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
909 &loop_device);
910 if (r < 0)
911 return r;
912
78ebe980
LP
913 r = root_hash_load(root_image, &root_hash, &root_hash_size);
914 if (r < 0)
915 return r;
916
917 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
918 if (r < 0)
919 return r;
920
921 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
922 if (r < 0)
923 return r;
924
925 if (!root_directory) {
926 /* Create a mount point for the image, if it's still missing. We use the same mount point for
927 * all images, which is safe, since they all live in their own namespaces after all, and hence
928 * won't see each other. */
929 root_directory = "/run/systemd/unit-root";
930 (void) mkdir(root_directory, 0700);
931 }
932 }
933
cfbeb4ef
LP
934 n_mounts = namespace_calculate_mounts(
935 ns_info,
936 read_write_paths,
937 read_only_paths,
938 inaccessible_paths,
d2d6c096 939 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
940 tmp_dir, var_tmp_dir,
941 protect_home, protect_system);
613b411c 942
2652c6c1 943 /* Set mount slave mode */
f0a4feb0 944 if (root_directory || n_mounts > 0)
d944dc95
LP
945 make_slave = true;
946
f0a4feb0 947 if (n_mounts > 0) {
34de407a 948 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 949 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 950 if (r < 0)
f0a4feb0 951 goto finish;
613b411c 952
5327c910 953 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 954 if (r < 0)
f0a4feb0 955 goto finish;
613b411c 956
5327c910 957 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 958 if (r < 0)
f0a4feb0 959 goto finish;
7ff7394d 960
d2d6c096
LP
961 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
962 if (r < 0)
963 goto finish;
964
613b411c 965 if (tmp_dir) {
34de407a 966 *(m++) = (MountEntry) {
5327c910
LP
967 .path_const = "/tmp",
968 .mode = PRIVATE_TMP,
969 };
613b411c 970 }
7ff7394d 971
613b411c 972 if (var_tmp_dir) {
34de407a 973 *(m++) = (MountEntry) {
5327c910
LP
974 .path_const = "/var/tmp",
975 .mode = PRIVATE_VAR_TMP,
976 };
7ff7394d 977 }
ac0930c8 978
c575770b 979 if (ns_info->private_dev) {
34de407a 980 *(m++) = (MountEntry) {
5327c910
LP
981 .path_const = "/dev",
982 .mode = PRIVATE_DEV,
983 };
7f112f50
LP
984 }
985
c575770b 986 if (ns_info->protect_kernel_tunables) {
5327c910 987 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 988 if (r < 0)
f0a4feb0 989 goto finish;
c575770b
DH
990 }
991
992 if (ns_info->protect_kernel_modules) {
5327c910 993 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 994 if (r < 0)
f0a4feb0 995 goto finish;
c575770b 996 }
59eeb84b 997
c575770b 998 if (ns_info->protect_control_groups) {
34de407a 999 *(m++) = (MountEntry) {
5327c910
LP
1000 .path_const = "/sys/fs/cgroup",
1001 .mode = READONLY,
1002 };
59eeb84b
LP
1003 }
1004
5327c910 1005 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1006 if (r < 0)
f0a4feb0 1007 goto finish;
417116f2 1008
5327c910 1009 r = append_protect_system(&m, protect_system, false);
f471b2af 1010 if (r < 0)
f0a4feb0 1011 goto finish;
417116f2 1012
5d997827
LP
1013 if (namespace_info_mount_apivfs(ns_info)) {
1014 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1015 if (r < 0)
1016 goto finish;
1017 }
1018
f0a4feb0 1019 assert(mounts + n_mounts == m);
ac0930c8 1020
5327c910
LP
1021 /* Prepend the root directory where that's necessary */
1022 r = prefix_where_needed(mounts, n_mounts, root_directory);
1023 if (r < 0)
1024 goto finish;
1025
34de407a 1026 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1027
f0a4feb0
DH
1028 drop_duplicates(mounts, &n_mounts);
1029 drop_outside_root(root_directory, mounts, &n_mounts);
1030 drop_inaccessible(mounts, &n_mounts);
1031 drop_nop(mounts, &n_mounts);
15ae422b
LP
1032 }
1033
d944dc95
LP
1034 if (unshare(CLONE_NEWNS) < 0) {
1035 r = -errno;
1036 goto finish;
1037 }
1e4e94c8 1038
d944dc95 1039 if (make_slave) {
c2c13f2d
LP
1040 /* Remount / as SLAVE so that nothing now mounted in the namespace
1041 shows up in the parent */
d944dc95
LP
1042 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1043 r = -errno;
1044 goto finish;
1045 }
ee818b89
AC
1046 }
1047
10404d52
DH
1048 /* Try to set up the new root directory before mounting anything there */
1049 if (root_directory)
1050 (void) base_filesystem_create(root_directory, UID_INVALID, GID_INVALID);
1051
915e6d16
LP
1052 if (root_image) {
1053 r = dissected_image_mount(dissected_image, root_directory, dissect_image_flags);
1054 if (r < 0)
1055 goto finish;
1056
78ebe980
LP
1057 r = decrypted_image_relinquish(decrypted_image);
1058 if (r < 0)
1059 goto finish;
1060
915e6d16
LP
1061 loop_device_relinquish(loop_device);
1062
1063 } else if (root_directory) {
1064
8f1ad200 1065 /* Turn directory into bind mount, if it isn't one yet */
e1873695 1066 r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1067 if (r < 0)
d944dc95 1068 goto finish;
8f1ad200
LP
1069 if (r == 0) {
1070 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
1071 r = -errno;
1072 goto finish;
1073 }
d944dc95 1074 }
ee818b89 1075 }
c2c13f2d 1076
f0a4feb0 1077 if (n_mounts > 0) {
6b7c9f8b
LP
1078 char **blacklist;
1079 unsigned j;
1080
1081 /* First round, add in all special mounts we need */
f0a4feb0 1082 for (m = mounts; m < mounts + n_mounts; ++m) {
8fceda93 1083 r = apply_mount(root_directory, m, tmp_dir, var_tmp_dir);
c2c13f2d 1084 if (r < 0)
d944dc95 1085 goto finish;
c2c13f2d 1086 }
15ae422b 1087
6b7c9f8b 1088 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1089 blacklist = newa(char*, n_mounts+1);
1090 for (j = 0; j < n_mounts; j++)
34de407a 1091 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1092 blacklist[j] = NULL;
1093
1094 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1095 for (m = mounts; m < mounts + n_mounts; ++m) {
6b7c9f8b 1096 r = make_read_only(m, blacklist);
c2c13f2d 1097 if (r < 0)
d944dc95 1098 goto finish;
c2c13f2d 1099 }
15ae422b
LP
1100 }
1101
ee818b89
AC
1102 if (root_directory) {
1103 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1104 r = mount_move_root(root_directory);
d944dc95
LP
1105 if (r < 0)
1106 goto finish;
ee818b89
AC
1107 }
1108
c2c13f2d
LP
1109 /* Remount / as the desired mode. Note that this will not
1110 * reestablish propagation from our side to the host, since
1111 * what's disconnected is disconnected. */
d944dc95
LP
1112 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1113 r = -errno;
1114 goto finish;
1115 }
15ae422b 1116
d944dc95 1117 r = 0;
15ae422b 1118
d944dc95 1119finish:
f0a4feb0 1120 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1121 mount_entry_done(m);
613b411c
LP
1122
1123 return r;
1124}
1125
d2d6c096
LP
1126void bind_mount_free_many(BindMount *b, unsigned n) {
1127 unsigned i;
1128
1129 assert(b || n == 0);
1130
1131 for (i = 0; i < n; i++) {
1132 free(b[i].source);
1133 free(b[i].destination);
1134 }
1135
1136 free(b);
1137}
1138
1139int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1140 _cleanup_free_ char *s = NULL, *d = NULL;
1141 BindMount *c;
1142
1143 assert(b);
1144 assert(n);
1145 assert(item);
1146
1147 s = strdup(item->source);
1148 if (!s)
1149 return -ENOMEM;
1150
1151 d = strdup(item->destination);
1152 if (!d)
1153 return -ENOMEM;
1154
1155 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1156 if (!c)
1157 return -ENOMEM;
1158
1159 *b = c;
1160
1161 c[(*n) ++] = (BindMount) {
1162 .source = s,
1163 .destination = d,
1164 .read_only = item->read_only,
1165 .recursive = item->recursive,
1166 .ignore_enoent = item->ignore_enoent,
1167 };
1168
1169 s = d = NULL;
1170 return 0;
1171}
1172
613b411c
LP
1173static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1174 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1175 char bid[SD_ID128_STRING_MAX];
1176 sd_id128_t boot_id;
1177 int r;
613b411c
LP
1178
1179 assert(id);
1180 assert(prefix);
1181 assert(path);
1182
6b46ea73
LP
1183 /* We include the boot id in the directory so that after a
1184 * reboot we can easily identify obsolete directories. */
1185
1186 r = sd_id128_get_boot(&boot_id);
1187 if (r < 0)
1188 return r;
1189
605405c6 1190 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1191 if (!x)
1192 return -ENOMEM;
1193
1194 RUN_WITH_UMASK(0077)
1195 if (!mkdtemp(x))
1196 return -errno;
1197
1198 RUN_WITH_UMASK(0000) {
1199 char *y;
1200
63c372cb 1201 y = strjoina(x, "/tmp");
613b411c
LP
1202
1203 if (mkdir(y, 0777 | S_ISVTX) < 0)
1204 return -errno;
c17ec25e 1205 }
15ae422b 1206
613b411c
LP
1207 *path = x;
1208 x = NULL;
1209
1210 return 0;
1211}
1212
/* Sets up the pair of private temporary directories for the unit identified by "id": one below /tmp and
 * one below /var/tmp, each created by setup_one_tmp_dir().
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir are set to the newly allocated paths. On
 * failure a negative error is returned and neither output is touched; if the second directory could not
 * be created the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* The second directory failed: undo the first one, inner "tmp" sub-directory first */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1242
1243int setup_netns(int netns_storage_socket[2]) {
1244 _cleanup_close_ int netns = -1;
3ee897d6 1245 int r, q;
613b411c
LP
1246
1247 assert(netns_storage_socket);
1248 assert(netns_storage_socket[0] >= 0);
1249 assert(netns_storage_socket[1] >= 0);
1250
1251 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1252 * namespace reference fd. Whatever process runs this first
1253 * shall create a new namespace, all others should just join
1254 * it. To serialize that we use a file lock on the socket
1255 * pair.
613b411c
LP
1256 *
1257 * It's a bit crazy, but hey, works great! */
1258
1259 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1260 return -errno;
1261
3ee897d6
LP
1262 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1263 if (netns == -EAGAIN) {
613b411c
LP
1264 /* Nothing stored yet, so let's create a new namespace */
1265
1266 if (unshare(CLONE_NEWNET) < 0) {
1267 r = -errno;
1268 goto fail;
1269 }
1270
1271 loopback_setup();
1272
1273 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1274 if (netns < 0) {
1275 r = -errno;
1276 goto fail;
1277 }
1278
1279 r = 1;
613b411c 1280
3ee897d6
LP
1281 } else if (netns < 0) {
1282 r = netns;
1283 goto fail;
613b411c 1284
3ee897d6
LP
1285 } else {
1286 /* Yay, found something, so let's join the namespace */
613b411c
LP
1287 if (setns(netns, CLONE_NEWNET) < 0) {
1288 r = -errno;
1289 goto fail;
1290 }
1291
1292 r = 0;
1293 }
1294
3ee897d6
LP
1295 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1296 if (q < 0) {
1297 r = q;
613b411c
LP
1298 goto fail;
1299 }
1300
1301fail:
fe048ce5 1302 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1303 return r;
1304}
417116f2 1305
1b8689f9
LP
/* String names of the ProtectHome enum values, indexed by enum value. The table is handed to
 * DEFINE_STRING_TABLE_LOOKUP() below, which presumably generates the string<->enum conversion helpers
 * for the "protect_home" prefix (the macro is defined elsewhere -- confirm there). */
1306static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1307 [PROTECT_HOME_NO] = "no",
1308 [PROTECT_HOME_YES] = "yes",
1309 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1310};
1311
1b8689f9
LP
1312DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1313
/* String names of the ProtectSystem enum values, indexed by enum value. The table is handed to
 * DEFINE_STRING_TABLE_LOOKUP() below, which presumably generates the string<->enum conversion helpers
 * for the "protect_system" prefix (the macro is defined elsewhere -- confirm there). */
1314static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1315 [PROTECT_SYSTEM_NO] = "no",
1316 [PROTECT_SYSTEM_YES] = "yes",
1317 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1318 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1319};
1320
1321DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);