]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: if we can create the destination of bind and PrivateTmp= mounts
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
10404d52 30#include "base-filesystem.h"
7f112f50 31#include "dev-setup.h"
3ffd4af2 32#include "fd-util.h"
d944dc95 33#include "fs-util.h"
e908468b 34#include "label.h"
915e6d16 35#include "loop-util.h"
07630cea
LP
36#include "loopback-setup.h"
37#include "missing.h"
38#include "mkdir.h"
4349cd7c 39#include "mount-util.h"
3ffd4af2 40#include "namespace.h"
07630cea 41#include "path-util.h"
d7b8eec7 42#include "selinux-util.h"
2583fbea 43#include "socket-util.h"
8b43440b 44#include "string-table.h"
07630cea
LP
45#include "string-util.h"
46#include "strv.h"
affb60b1 47#include "umask-util.h"
ee104e11 48#include "user-util.h"
07630cea 49#include "util.h"
15ae422b 50
/* Mount flags used for the private /dev tmpfs: applied when mount_private_dev() mounts it, and again (together
 * with MS_REMOUNT|MS_RDONLY) when make_read_only() remounts it. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID | MS_STRICTATIME | MS_NOEXEC)
52
/* The kinds of mount an entry can ask for.
 *
 * The numeric order is significant: when two entries refer to the same path, mount_path_compare() sorts the one
 * with the lower value first, and drop_duplicates() keeps only that first one. */
typedef enum MountMode {
        INACCESSIBLE,           /* Overmount the path with an inaccessible node */
        BIND_MOUNT,             /* Non-recursive bind mount of the entry's source */
        BIND_MOUNT_RECURSIVE,   /* Recursive bind mount of the entry's source */
        PRIVATE_TMP,            /* Bind mount of the private tmp directory */
        PRIVATE_VAR_TMP,        /* Bind mount of the private var-tmp directory */
        PRIVATE_DEV,            /* Freshly populated private /dev, see mount_private_dev() */
        BIND_DEV,               /* Host /dev bind mounted, see mount_bind_dev() */
        SYSFS,                  /* Host /sys bind mounted, see mount_sysfs() */
        PROCFS,                 /* New procfs instance, see mount_procfs() */
        READONLY,               /* Path is turned into a mount point (if needed) and made read-only */
        READWRITE,              /* Path is turned into a mount point (if needed) and left writable */
} MountMode;
15ae422b 67
34de407a 68typedef struct MountEntry {
5327c910 69 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 70 MountMode mode:5;
5327c910
LP
71 bool ignore:1; /* Ignore if path does not exist? */
72 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 73 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 74 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
75 const char *source_const; /* The source path, for bind mounts */
76 char *source_malloc;
34de407a 77} MountEntry;
15ae422b 78
5d997827
LP
79/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
80 * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
81static const MountEntry apivfs_table[] = {
82 { "/proc", PROCFS, false },
83 { "/dev", BIND_DEV, false },
84 { "/sys", SYSFS, false },
85};
f471b2af 86
11a30cec 87/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 88static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
89 { "/proc/sys", READONLY, false },
90 { "/proc/sysrq-trigger", READONLY, true },
91 { "/proc/latency_stats", READONLY, true },
92 { "/proc/mtrr", READONLY, true },
aa70f38b 93 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
94 { "/proc/acpi", READONLY, true },
95 { "/proc/timer_stats", READONLY, true },
96 { "/proc/asound", READONLY, true },
97 { "/proc/bus", READONLY, true },
98 { "/proc/fs", READONLY, true },
99 { "/proc/irq", READONLY, true },
100 { "/sys", READONLY, false },
101 { "/sys/kernel/debug", READONLY, true },
102 { "/sys/kernel/tracing", READONLY, true },
103 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
3a0bf6d6 104 { "/sys/fs/selinux", READWRITE, true },
11a30cec
DH
105};
106
c575770b 107/* ProtectKernelModules= option */
34de407a 108static const MountEntry protect_kernel_modules_table[] = {
c575770b 109#ifdef HAVE_SPLIT_USR
c6232fb0 110 { "/lib/modules", INACCESSIBLE, true },
c575770b 111#endif
c6232fb0 112 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
113};
114
b6c432ca
DH
115/*
116 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
117 * system should be protected by ProtectSystem=
118 */
34de407a 119static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
120 { "/home", READONLY, true },
121 { "/run/user", READONLY, true },
122 { "/root", READONLY, true },
b6c432ca
DH
123};
124
125/* ProtectHome=yes table */
34de407a 126static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
127 { "/home", INACCESSIBLE, true },
128 { "/run/user", INACCESSIBLE, true },
129 { "/root", INACCESSIBLE, true },
b6c432ca
DH
130};
131
f471b2af 132/* ProtectSystem=yes table */
34de407a 133static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
134 { "/usr", READONLY, false },
135 { "/boot", READONLY, true },
136 { "/efi", READONLY, true },
f471b2af
DH
137};
138
139/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 140static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
141 { "/usr", READONLY, false },
142 { "/boot", READONLY, true },
143 { "/efi", READONLY, true },
144 { "/etc", READONLY, false },
f471b2af
DH
145};
146
147/*
148 * ProtectSystem=strict table. In this strict mode, we mount everything
149 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
150 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
151 * protect those, and these options should be fully orthogonal.
152 * (And of course /home and friends are also left writable, as ProtectHome=
153 * shall manage those, orthogonally).
154 */
34de407a 155static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
156 { "/", READONLY, false },
157 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
158 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
159 { "/dev", READWRITE, false }, /* PrivateDevices= */
160 { "/home", READWRITE, true }, /* ProtectHome= */
161 { "/run/user", READWRITE, true }, /* ProtectHome= */
162 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
163};
164
34de407a 165static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
166 assert(p);
167
5327c910
LP
168 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
169 * otherwise the stack/static ->path field is returned. */
f0a4feb0 170
5327c910 171 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
172}
173
34de407a 174static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
175 assert(p);
176
177 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
178}
179
d2d6c096
LP
180static const char *mount_entry_source(const MountEntry *p) {
181 assert(p);
182
183 return p->source_malloc ?: p->source_const;
184}
185
1eb7e08e
LP
186static void mount_entry_done(MountEntry *p) {
187 assert(p);
188
189 p->path_malloc = mfree(p->path_malloc);
190 p->source_malloc = mfree(p->source_malloc);
191}
192
34de407a 193static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
194 char **i;
195
613b411c
LP
196 assert(p);
197
5327c910
LP
198 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
199
15ae422b 200 STRV_FOREACH(i, strv) {
5327c910
LP
201 bool ignore = false, needs_prefix = false;
202 const char *e = *i;
15ae422b 203
5327c910
LP
204 /* Look for any prefixes */
205 if (startswith(e, "-")) {
206 e++;
9c94d52e 207 ignore = true;
ea92ae33 208 }
5327c910
LP
209 if (startswith(e, "+")) {
210 e++;
211 needs_prefix = true;
212 }
ea92ae33 213
5327c910 214 if (!path_is_absolute(e))
15ae422b
LP
215 return -EINVAL;
216
34de407a 217 *((*p)++) = (MountEntry) {
5327c910
LP
218 .path_const = e,
219 .mode = mode,
220 .ignore = ignore,
221 .has_prefix = !needs_prefix,
222 };
15ae422b
LP
223 }
224
225 return 0;
226}
227
d2d6c096
LP
228static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
229 unsigned i;
230
231 assert(p);
232
233 for (i = 0; i < n; i++) {
234 const BindMount *b = binds + i;
235
236 *((*p)++) = (MountEntry) {
237 .path_const = b->destination,
238 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
239 .read_only = b->read_only,
240 .source_const = b->source,
241 };
242 }
243
244 return 0;
245}
246
34de407a 247static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 248 unsigned i;
11a30cec
DH
249
250 assert(p);
f471b2af 251 assert(mounts);
11a30cec 252
5327c910 253 /* Adds a list of static pre-defined entries */
f471b2af 254
5327c910 255 for (i = 0; i < n; i++)
34de407a
LP
256 *((*p)++) = (MountEntry) {
257 .path_const = mount_entry_path(mounts+i),
5327c910
LP
258 .mode = mounts[i].mode,
259 .ignore = mounts[i].ignore || ignore_protect,
260 };
f471b2af
DH
261
262 return 0;
263}
264
34de407a 265static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
266 assert(p);
267
5327c910 268 switch (protect_home) {
b6c432ca 269
5327c910 270 case PROTECT_HOME_NO:
b6c432ca
DH
271 return 0;
272
b6c432ca 273 case PROTECT_HOME_READ_ONLY:
5327c910
LP
274 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
275
b6c432ca 276 case PROTECT_HOME_YES:
5327c910
LP
277 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
278
b6c432ca 279 default:
5327c910 280 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 281 }
b6c432ca
DH
282}
283
34de407a 284static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
285 assert(p);
286
5327c910
LP
287 switch (protect_system) {
288
289 case PROTECT_SYSTEM_NO:
f471b2af
DH
290 return 0;
291
f471b2af 292 case PROTECT_SYSTEM_STRICT:
5327c910
LP
293 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
294
f471b2af 295 case PROTECT_SYSTEM_YES:
5327c910
LP
296 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
297
f471b2af 298 case PROTECT_SYSTEM_FULL:
5327c910
LP
299 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
300
f471b2af 301 default:
5327c910 302 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 303 }
11a30cec
DH
304}
305
c17ec25e 306static int mount_path_compare(const void *a, const void *b) {
34de407a 307 const MountEntry *p = a, *q = b;
a0827e2b 308 int d;
15ae422b 309
6ee1a919 310 /* If the paths are not equal, then order prefixes first */
34de407a 311 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
312 if (d != 0)
313 return d;
15ae422b 314
6ee1a919
LP
315 /* If the paths are equal, check the mode */
316 if (p->mode < q->mode)
317 return -1;
15ae422b 318
6ee1a919
LP
319 if (p->mode > q->mode)
320 return 1;
15ae422b 321
6ee1a919 322 return 0;
15ae422b
LP
323}
324
34de407a 325static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
326 unsigned i;
327
328 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
329 * that. */
330
331 if (!root_directory)
332 return 0;
333
334 for (i = 0; i < n; i++) {
335 char *s;
336
337 if (m[i].has_prefix)
338 continue;
339
34de407a 340 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
341 if (!s)
342 return -ENOMEM;
343
344 free(m[i].path_malloc);
345 m[i].path_malloc = s;
346
347 m[i].has_prefix = true;
348 }
349
350 return 0;
351}
352
34de407a
LP
353static void drop_duplicates(MountEntry *m, unsigned *n) {
354 MountEntry *f, *t, *previous;
15ae422b 355
c17ec25e 356 assert(m);
15ae422b 357 assert(n);
15ae422b 358
fe3c2583
LP
359 /* Drops duplicate entries. Expects that the array is properly ordered already. */
360
1d54cd5d 361 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 362
fe3c2583
LP
363 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
364 * above. */
34de407a
LP
365 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
366 log_debug("%s is duplicate.", mount_entry_path(f));
367 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 368 mount_entry_done(f);
15ae422b 369 continue;
fe3c2583 370 }
15ae422b 371
e2d7c1a0 372 *t = *f;
15ae422b 373 previous = t;
fe3c2583
LP
374 t++;
375 }
376
377 *n = t - m;
378}
379
34de407a
LP
380static void drop_inaccessible(MountEntry *m, unsigned *n) {
381 MountEntry *f, *t;
fe3c2583
LP
382 const char *clear = NULL;
383
384 assert(m);
385 assert(n);
386
387 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
388 * ordered already. */
389
1d54cd5d 390 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
391
392 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
393 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
394 if (clear && path_startswith(mount_entry_path(f), clear)) {
395 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 396 mount_entry_done(f);
fe3c2583
LP
397 continue;
398 }
15ae422b 399
34de407a 400 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
401
402 *t = *f;
15ae422b
LP
403 t++;
404 }
405
c17ec25e 406 *n = t - m;
15ae422b
LP
407}
408
34de407a
LP
409static void drop_nop(MountEntry *m, unsigned *n) {
410 MountEntry *f, *t;
7648a565
LP
411
412 assert(m);
413 assert(n);
414
415 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
416 * list is ordered by prefixes. */
417
1d54cd5d 418 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
419
420 /* Only suppress such subtrees for READONLY and READWRITE entries */
421 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 422 MountEntry *p;
7648a565
LP
423 bool found = false;
424
425 /* Now let's find the first parent of the entry we are looking at. */
426 for (p = t-1; p >= m; p--) {
34de407a 427 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
428 found = true;
429 break;
430 }
431 }
432
433 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
434 if (found && p->mode == f->mode) {
34de407a 435 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 436 mount_entry_done(f);
7648a565
LP
437 continue;
438 }
439 }
440
441 *t = *f;
442 t++;
443 }
444
445 *n = t - m;
446}
447
34de407a
LP
448static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
449 MountEntry *f, *t;
cd2902c9
LP
450
451 assert(m);
452 assert(n);
453
1d54cd5d 454 /* Nothing to do */
cd2902c9
LP
455 if (!root_directory)
456 return;
457
458 /* Drops all mounts that are outside of the root directory. */
459
1d54cd5d 460 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 461
34de407a
LP
462 if (!path_startswith(mount_entry_path(f), root_directory)) {
463 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 464 mount_entry_done(f);
cd2902c9
LP
465 continue;
466 }
467
468 *t = *f;
469 t++;
470 }
471
472 *n = t - m;
473}
474
5d997827 475static int mount_private_dev(MountEntry *m) {
7f112f50
LP
476 static const char devnodes[] =
477 "/dev/null\0"
478 "/dev/zero\0"
479 "/dev/full\0"
480 "/dev/random\0"
481 "/dev/urandom\0"
482 "/dev/tty\0";
483
2b85f4e1 484 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 485 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
486 _cleanup_umask_ mode_t u;
487 int r;
488
489 assert(m);
490
491 u = umask(0000);
492
2b85f4e1
LP
493 if (!mkdtemp(temporary_mount))
494 return -errno;
495
63c372cb 496 dev = strjoina(temporary_mount, "/dev");
dc751688 497 (void) mkdir(dev, 0755);
737ba3c8 498 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
499 r = -errno;
500 goto fail;
501 }
502
63c372cb 503 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 504 (void) mkdir(devpts, 0755);
2b85f4e1
LP
505 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
506 r = -errno;
507 goto fail;
508 }
509
63c372cb 510 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
511 if (symlink("pts/ptmx", devptmx) < 0) {
512 r = -errno;
513 goto fail;
514 }
e06b6479 515
63c372cb 516 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 517 (void) mkdir(devshm, 01777);
2b85f4e1
LP
518 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
519 if (r < 0) {
520 r = -errno;
521 goto fail;
522 }
523
63c372cb 524 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 525 (void) mkdir(devmqueue, 0755);
3164e3cb 526 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 527
63c372cb 528 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 529 (void) mkdir(devhugepages, 0755);
3164e3cb 530 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 531
63c372cb 532 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 533 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 534
7f112f50 535 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
536 _cleanup_free_ char *dn = NULL;
537 struct stat st;
538
539 r = stat(d, &st);
7f112f50 540 if (r < 0) {
2b85f4e1
LP
541
542 if (errno == ENOENT)
543 continue;
544
545 r = -errno;
546 goto fail;
7f112f50
LP
547 }
548
2b85f4e1
LP
549 if (!S_ISBLK(st.st_mode) &&
550 !S_ISCHR(st.st_mode)) {
551 r = -EINVAL;
552 goto fail;
553 }
554
555 if (st.st_rdev == 0)
556 continue;
557
558 dn = strappend(temporary_mount, d);
559 if (!dn) {
560 r = -ENOMEM;
561 goto fail;
562 }
563
ecabcf8b 564 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 565 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 566 mac_selinux_create_file_clear();
dd078a1e 567
2b85f4e1
LP
568 if (r < 0) {
569 r = -errno;
570 goto fail;
571 }
7f112f50
LP
572 }
573
03cfe0d5 574 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 575
ee818b89
AC
576 /* Create the /dev directory if missing. It is more likely to be
577 * missing when the service is started with RootDirectory. This is
578 * consistent with mount units creating the mount points when missing.
579 */
34de407a 580 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 581
9e5f8252 582 /* Unmount everything in old /dev */
34de407a
LP
583 umount_recursive(mount_entry_path(m), 0);
584 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
585 r = -errno;
586 goto fail;
587 }
7f112f50 588
2b85f4e1
LP
589 rmdir(dev);
590 rmdir(temporary_mount);
7f112f50 591
2b85f4e1 592 return 0;
7f112f50 593
2b85f4e1
LP
594fail:
595 if (devpts)
596 umount(devpts);
7f112f50 597
2b85f4e1
LP
598 if (devshm)
599 umount(devshm);
7f112f50 600
2b85f4e1
LP
601 if (devhugepages)
602 umount(devhugepages);
7f112f50 603
2b85f4e1
LP
604 if (devmqueue)
605 umount(devmqueue);
7f112f50 606
d267c5aa
ZJS
607 umount(dev);
608 rmdir(dev);
2b85f4e1 609 rmdir(temporary_mount);
7f112f50 610
2b85f4e1 611 return r;
7f112f50
LP
612}
613
5d997827
LP
614static int mount_bind_dev(MountEntry *m) {
615 int r;
616
617 assert(m);
618
619 /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
620 * /dev. This is only used when RootDirectory= is set. */
621
645767d6
LP
622 (void) mkdir_p_label(mount_entry_path(m), 0755);
623
5d997827
LP
624 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
625 if (r < 0)
626 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
627 if (r > 0) /* make this a NOP if /dev is already a mount point */
628 return 0;
629
630 if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
631 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
632
633 return 1;
634}
635
636static int mount_sysfs(MountEntry *m) {
637 int r;
638
639 assert(m);
640
645767d6
LP
641 (void) mkdir_p_label(mount_entry_path(m), 0755);
642
5d997827
LP
643 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
644 if (r < 0)
645 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
646 if (r > 0) /* make this a NOP if /sys is already a mount point */
647 return 0;
648
649 /* Bind mount the host's version so that we get all child mounts of it, too. */
650 if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
651 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
652
653 return 1;
654}
655
656static int mount_procfs(MountEntry *m) {
657 int r;
658
659 assert(m);
660
645767d6
LP
661 (void) mkdir_p_label(mount_entry_path(m), 0755);
662
5d997827
LP
663 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
664 if (r < 0)
665 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
666 if (r > 0) /* make this a NOP if /proc is already a mount point */
667 return 0;
668
669 /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
670 if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
671 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
672
673 return 1;
674}
675
d2d6c096
LP
676static int mount_entry_chase(
677 const char *root_directory,
678 MountEntry *m,
679 const char *path,
680 char **location) {
681
8fceda93
LP
682 char *chased;
683 int r;
684
685 assert(m);
686
687 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
688 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
689 * that applies). The result is stored in "location". */
8fceda93 690
a227a4be
LP
691 r = chase_symlinks(path, root_directory,
692 IN_SET(m->mode, BIND_MOUNT, BIND_MOUNT_RECURSIVE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, BIND_DEV, EMPTY_DIR, SYSFS, PROCFS) ? CHASE_NONEXISTENT : 0,
693 &chased);
8fceda93 694 if (r == -ENOENT && m->ignore) {
d2d6c096 695 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
696 return 0;
697 }
698 if (r < 0)
d2d6c096 699 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 700
d2d6c096 701 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 702
d2d6c096
LP
703 free(*location);
704 *location = chased;
8fceda93
LP
705
706 return 1;
707}
708
ac0930c8 709static int apply_mount(
8fceda93 710 const char *root_directory,
34de407a 711 MountEntry *m,
ac0930c8 712 const char *tmp_dir,
c17ec25e 713 const char *var_tmp_dir) {
ac0930c8 714
a227a4be 715 bool rbind = true, make = false;
15ae422b 716 const char *what;
15ae422b 717 int r;
15ae422b 718
c17ec25e 719 assert(m);
15ae422b 720
d2d6c096 721 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
722 if (r <= 0)
723 return r;
724
34de407a 725 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 726
c17ec25e 727 switch (m->mode) {
15ae422b 728
160cfdbe
LP
729 case INACCESSIBLE: {
730 struct stat target;
6d313367
LP
731
732 /* First, get rid of everything that is below if there
733 * is anything... Then, overmount it with an
c4b41707 734 * inaccessible path. */
34de407a 735 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 736
34de407a
LP
737 if (lstat(mount_entry_path(m), &target) < 0)
738 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 739
c4b41707 740 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
741 if (!what) {
742 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
743 return -ELOOP;
744 }
745 break;
160cfdbe 746 }
fe3c2583 747
15ae422b 748 case READONLY:
15ae422b 749 case READWRITE:
8fceda93 750 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 751 if (r < 0)
34de407a 752 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
753 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
754 return 0;
6b7c9f8b 755 /* This isn't a mount point yet, let's make it one. */
34de407a 756 what = mount_entry_path(m);
6b7c9f8b 757 break;
15ae422b 758
d2d6c096
LP
759 case BIND_MOUNT:
760 rbind = false;
761 /* fallthrough */
762
763 case BIND_MOUNT_RECURSIVE:
764 /* Also chase the source mount */
5d997827 765
d2d6c096
LP
766 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
767 if (r <= 0)
768 return r;
769
770 what = mount_entry_source(m);
a227a4be 771 make = true;
d2d6c096
LP
772 break;
773
ac0930c8
LP
774 case PRIVATE_TMP:
775 what = tmp_dir;
a227a4be 776 make = true;
ac0930c8
LP
777 break;
778
779 case PRIVATE_VAR_TMP:
780 what = var_tmp_dir;
a227a4be 781 make = true;
15ae422b 782 break;
e364ad06 783
d6797c92 784 case PRIVATE_DEV:
5d997827
LP
785 return mount_private_dev(m);
786
787 case BIND_DEV:
788 return mount_bind_dev(m);
789
790 case SYSFS:
791 return mount_sysfs(m);
792
793 case PROCFS:
794 return mount_procfs(m);
d6797c92 795
e364ad06
LP
796 default:
797 assert_not_reached("Unknown mode");
15ae422b
LP
798 }
799
ac0930c8 800 assert(what);
15ae422b 801
a227a4be
LP
802 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
803 bool try_again = false;
804 r = -errno;
805
806 if (r == -ENOENT && make) {
807 struct stat st;
808
809 /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
810
811 if (stat(what, &st) >= 0) {
812
813 (void) mkdir_parents(mount_entry_path(m), 0755);
814
815 if (S_ISDIR(st.st_mode))
816 try_again = mkdir(mount_entry_path(m), 0755) >= 0;
817 else
818 try_again = touch(mount_entry_path(m)) >= 0;
819 }
820 }
821
822 if (try_again) {
823 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
824 r = -errno;
825 else
826 r = 0;
827 }
828
829 if (r < 0)
830 return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
831 }
6b7c9f8b 832
34de407a 833 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 834 return 0;
ac0930c8 835}
15ae422b 836
ac9de0b3 837static int make_read_only(MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
6b7c9f8b 838 int r = 0;
15ae422b 839
c17ec25e 840 assert(m);
ac9de0b3 841 assert(proc_self_mountinfo);
ac0930c8 842
34de407a 843 if (mount_entry_read_only(m))
ac9de0b3 844 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
13e785f7 845 else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
34de407a 846 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 847 r = -errno;
737ba3c8 848 } else
6b7c9f8b
LP
849 return 0;
850
851 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
852 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
853 * read-only mounts already applied. */
ac0930c8 854
8fceda93
LP
855 if (r == -ENOENT && m->ignore)
856 r = 0;
5327c910 857
1d54cd5d 858 return r;
d944dc95
LP
859}
860
9c988f93 861static bool namespace_info_mount_apivfs(const char *root_directory, const NameSpaceInfo *ns_info) {
5d997827
LP
862 assert(ns_info);
863
9c988f93
DH
864 /*
865 * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
866 * since to protect the API VFS mounts, they need to be around in the
867 * first place... and RootDirectory= or RootImage= need to be set.
868 */
5d997827 869
9c988f93
DH
870 /* root_directory should point to a mount point */
871 return root_directory &&
872 (ns_info->mount_apivfs ||
873 ns_info->protect_control_groups ||
874 ns_info->protect_kernel_tunables);
5d997827
LP
875}
876
2652c6c1 877static unsigned namespace_calculate_mounts(
9c988f93 878 const char* root_directory,
c575770b 879 const NameSpaceInfo *ns_info,
2652c6c1
DH
880 char** read_write_paths,
881 char** read_only_paths,
882 char** inaccessible_paths,
d2d6c096
LP
883 const BindMount *bind_mounts,
884 unsigned n_bind_mounts,
2652c6c1
DH
885 const char* tmp_dir,
886 const char* var_tmp_dir,
2652c6c1
DH
887 ProtectHome protect_home,
888 ProtectSystem protect_system) {
889
b6c432ca 890 unsigned protect_home_cnt;
f471b2af
DH
891 unsigned protect_system_cnt =
892 (protect_system == PROTECT_SYSTEM_STRICT ?
893 ELEMENTSOF(protect_system_strict_table) :
894 ((protect_system == PROTECT_SYSTEM_FULL) ?
895 ELEMENTSOF(protect_system_full_table) :
896 ((protect_system == PROTECT_SYSTEM_YES) ?
897 ELEMENTSOF(protect_system_yes_table) : 0)));
898
b6c432ca
DH
899 protect_home_cnt =
900 (protect_home == PROTECT_HOME_YES ?
901 ELEMENTSOF(protect_home_yes_table) :
902 ((protect_home == PROTECT_HOME_READ_ONLY) ?
903 ELEMENTSOF(protect_home_read_only_table) : 0));
904
2652c6c1
DH
905 return !!tmp_dir + !!var_tmp_dir +
906 strv_length(read_write_paths) +
907 strv_length(read_only_paths) +
908 strv_length(inaccessible_paths) +
d2d6c096 909 n_bind_mounts +
c575770b
DH
910 ns_info->private_dev +
911 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
912 (ns_info->protect_control_groups ? 1 : 0) +
913 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
5d997827 914 protect_home_cnt + protect_system_cnt +
9c988f93 915 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
2652c6c1
DH
916}
917
613b411c 918int setup_namespace(
ee818b89 919 const char* root_directory,
915e6d16 920 const char* root_image,
c575770b 921 const NameSpaceInfo *ns_info,
2a624c36
AP
922 char** read_write_paths,
923 char** read_only_paths,
924 char** inaccessible_paths,
d2d6c096
LP
925 const BindMount *bind_mounts,
926 unsigned n_bind_mounts,
a004cb4c
LP
927 const char* tmp_dir,
928 const char* var_tmp_dir,
1b8689f9
LP
929 ProtectHome protect_home,
930 ProtectSystem protect_system,
915e6d16
LP
931 unsigned long mount_flags,
932 DissectImageFlags dissect_image_flags) {
15ae422b 933
915e6d16 934 _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
78ebe980 935 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
915e6d16 936 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
78ebe980 937 _cleanup_free_ void *root_hash = NULL;
34de407a 938 MountEntry *m, *mounts = NULL;
78ebe980 939 size_t root_hash_size = 0;
d944dc95 940 bool make_slave = false;
e908468b 941 const char *root;
f0a4feb0 942 unsigned n_mounts;
c17ec25e 943 int r = 0;
15ae422b 944
915e6d16
LP
945 assert(ns_info);
946
613b411c 947 if (mount_flags == 0)
c17ec25e 948 mount_flags = MS_SHARED;
ac0930c8 949
915e6d16
LP
950 if (root_image) {
951 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
952
953 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
954 dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
955
956 r = loop_device_make_by_path(root_image,
957 dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
958 &loop_device);
959 if (r < 0)
960 return r;
961
78ebe980
LP
962 r = root_hash_load(root_image, &root_hash, &root_hash_size);
963 if (r < 0)
964 return r;
965
966 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
967 if (r < 0)
968 return r;
969
970 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
915e6d16
LP
971 if (r < 0)
972 return r;
915e6d16
LP
973 }
974
e908468b
LP
975 if (root_directory)
976 root = root_directory;
977 else if (root_image || n_bind_mounts > 0) {
978
979 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
980 * the same mount point for all images, which is safe, since they all live in their own namespaces
981 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
982 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
983 * while we are applying them. */
984
985 root = "/run/systemd/unit-root";
986 (void) mkdir_label(root, 0700);
987 } else
988 root = NULL;
989
cfbeb4ef 990 n_mounts = namespace_calculate_mounts(
e908468b 991 root,
cfbeb4ef
LP
992 ns_info,
993 read_write_paths,
994 read_only_paths,
995 inaccessible_paths,
d2d6c096 996 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
997 tmp_dir, var_tmp_dir,
998 protect_home, protect_system);
613b411c 999
2652c6c1 1000 /* Set mount slave mode */
e908468b 1001 if (root || n_mounts > 0)
d944dc95
LP
1002 make_slave = true;
1003
f0a4feb0 1004 if (n_mounts > 0) {
34de407a 1005 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 1006 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 1007 if (r < 0)
f0a4feb0 1008 goto finish;
613b411c 1009
5327c910 1010 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 1011 if (r < 0)
f0a4feb0 1012 goto finish;
613b411c 1013
5327c910 1014 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 1015 if (r < 0)
f0a4feb0 1016 goto finish;
7ff7394d 1017
d2d6c096
LP
1018 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1019 if (r < 0)
1020 goto finish;
1021
613b411c 1022 if (tmp_dir) {
34de407a 1023 *(m++) = (MountEntry) {
5327c910
LP
1024 .path_const = "/tmp",
1025 .mode = PRIVATE_TMP,
1026 };
613b411c 1027 }
7ff7394d 1028
613b411c 1029 if (var_tmp_dir) {
34de407a 1030 *(m++) = (MountEntry) {
5327c910
LP
1031 .path_const = "/var/tmp",
1032 .mode = PRIVATE_VAR_TMP,
1033 };
7ff7394d 1034 }
ac0930c8 1035
c575770b 1036 if (ns_info->private_dev) {
34de407a 1037 *(m++) = (MountEntry) {
5327c910
LP
1038 .path_const = "/dev",
1039 .mode = PRIVATE_DEV,
1040 };
7f112f50
LP
1041 }
1042
c575770b 1043 if (ns_info->protect_kernel_tunables) {
5327c910 1044 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 1045 if (r < 0)
f0a4feb0 1046 goto finish;
c575770b
DH
1047 }
1048
1049 if (ns_info->protect_kernel_modules) {
5327c910 1050 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 1051 if (r < 0)
f0a4feb0 1052 goto finish;
c575770b 1053 }
59eeb84b 1054
c575770b 1055 if (ns_info->protect_control_groups) {
34de407a 1056 *(m++) = (MountEntry) {
5327c910
LP
1057 .path_const = "/sys/fs/cgroup",
1058 .mode = READONLY,
1059 };
59eeb84b
LP
1060 }
1061
5327c910 1062 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 1063 if (r < 0)
f0a4feb0 1064 goto finish;
417116f2 1065
5327c910 1066 r = append_protect_system(&m, protect_system, false);
f471b2af 1067 if (r < 0)
f0a4feb0 1068 goto finish;
417116f2 1069
e908468b 1070 if (namespace_info_mount_apivfs(root, ns_info)) {
5d997827
LP
1071 r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1072 if (r < 0)
1073 goto finish;
1074 }
1075
f0a4feb0 1076 assert(mounts + n_mounts == m);
ac0930c8 1077
5327c910 1078 /* Prepend the root directory where that's necessary */
e908468b 1079 r = prefix_where_needed(mounts, n_mounts, root);
5327c910
LP
1080 if (r < 0)
1081 goto finish;
1082
34de407a 1083 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 1084
f0a4feb0 1085 drop_duplicates(mounts, &n_mounts);
e908468b 1086 drop_outside_root(root, mounts, &n_mounts);
f0a4feb0
DH
1087 drop_inaccessible(mounts, &n_mounts);
1088 drop_nop(mounts, &n_mounts);
15ae422b
LP
1089 }
1090
d944dc95
LP
1091 if (unshare(CLONE_NEWNS) < 0) {
1092 r = -errno;
1093 goto finish;
1094 }
1e4e94c8 1095
d944dc95 1096 if (make_slave) {
c2c13f2d
LP
1097 /* Remount / as SLAVE so that nothing now mounted in the namespace
1098 shows up in the parent */
d944dc95
LP
1099 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1100 r = -errno;
1101 goto finish;
1102 }
ee818b89
AC
1103 }
1104
10404d52 1105 /* Try to set up the new root directory before mounting anything there */
e908468b
LP
1106 if (root)
1107 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
10404d52 1108
915e6d16 1109 if (root_image) {
e908468b
LP
1110 /* A root image is specified, mount it to the right place */
1111 r = dissected_image_mount(dissected_image, root, dissect_image_flags);
915e6d16
LP
1112 if (r < 0)
1113 goto finish;
1114
07ce7407
TM
1115 if (decrypted_image) {
1116 r = decrypted_image_relinquish(decrypted_image);
1117 if (r < 0)
1118 goto finish;
1119 }
78ebe980 1120
915e6d16
LP
1121 loop_device_relinquish(loop_device);
1122
1123 } else if (root_directory) {
1124
e908468b
LP
1125 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1126 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 1127 if (r < 0)
d944dc95 1128 goto finish;
8f1ad200 1129 if (r == 0) {
e908468b 1130 if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
8f1ad200
LP
1131 r = -errno;
1132 goto finish;
1133 }
d944dc95 1134 }
e908468b
LP
1135
1136 } else if (root) {
1137
1138 /* Let's mount the main root directory to the root directory to use */
1139 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1140 r = -errno;
1141 goto finish;
1142 }
ee818b89 1143 }
c2c13f2d 1144
f0a4feb0 1145 if (n_mounts > 0) {
ac9de0b3 1146 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
6b7c9f8b
LP
1147 char **blacklist;
1148 unsigned j;
1149
ac9de0b3
TR
1150 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1151 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1152 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1153 if (!proc_self_mountinfo) {
1154 r = -errno;
1155 goto finish;
1156 }
1157
6b7c9f8b 1158 /* First round, add in all special mounts we need */
f0a4feb0 1159 for (m = mounts; m < mounts + n_mounts; ++m) {
e908468b 1160 r = apply_mount(root, m, tmp_dir, var_tmp_dir);
c2c13f2d 1161 if (r < 0)
d944dc95 1162 goto finish;
c2c13f2d 1163 }
15ae422b 1164
6b7c9f8b 1165 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
1166 blacklist = newa(char*, n_mounts+1);
1167 for (j = 0; j < n_mounts; j++)
34de407a 1168 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
1169 blacklist[j] = NULL;
1170
1171 /* Second round, flip the ro bits if necessary. */
f0a4feb0 1172 for (m = mounts; m < mounts + n_mounts; ++m) {
ac9de0b3 1173 r = make_read_only(m, blacklist, proc_self_mountinfo);
c2c13f2d 1174 if (r < 0)
d944dc95 1175 goto finish;
c2c13f2d 1176 }
15ae422b
LP
1177 }
1178
e908468b 1179 if (root) {
ee818b89 1180 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
e908468b 1181 r = mount_move_root(root);
d944dc95
LP
1182 if (r < 0)
1183 goto finish;
ee818b89
AC
1184 }
1185
c2c13f2d
LP
1186 /* Remount / as the desired mode. Not that this will not
1187 * reestablish propagation from our side to the host, since
1188 * what's disconnected is disconnected. */
d944dc95
LP
1189 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1190 r = -errno;
1191 goto finish;
1192 }
15ae422b 1193
d944dc95 1194 r = 0;
15ae422b 1195
d944dc95 1196finish:
f0a4feb0 1197 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 1198 mount_entry_done(m);
613b411c
LP
1199
1200 return r;
1201}
1202
d2d6c096
LP
1203void bind_mount_free_many(BindMount *b, unsigned n) {
1204 unsigned i;
1205
1206 assert(b || n == 0);
1207
1208 for (i = 0; i < n; i++) {
1209 free(b[i].source);
1210 free(b[i].destination);
1211 }
1212
1213 free(b);
1214}
1215
1216int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1217 _cleanup_free_ char *s = NULL, *d = NULL;
1218 BindMount *c;
1219
1220 assert(b);
1221 assert(n);
1222 assert(item);
1223
1224 s = strdup(item->source);
1225 if (!s)
1226 return -ENOMEM;
1227
1228 d = strdup(item->destination);
1229 if (!d)
1230 return -ENOMEM;
1231
1232 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1233 if (!c)
1234 return -ENOMEM;
1235
1236 *b = c;
1237
1238 c[(*n) ++] = (BindMount) {
1239 .source = s,
1240 .destination = d,
1241 .read_only = item->read_only,
1242 .recursive = item->recursive,
1243 .ignore_enoent = item->ignore_enoent,
1244 };
1245
1246 s = d = NULL;
1247 return 0;
1248}
1249
613b411c
LP
1250static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1251 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1252 char bid[SD_ID128_STRING_MAX];
1253 sd_id128_t boot_id;
1254 int r;
613b411c
LP
1255
1256 assert(id);
1257 assert(prefix);
1258 assert(path);
1259
6b46ea73
LP
1260 /* We include the boot id in the directory so that after a
1261 * reboot we can easily identify obsolete directories. */
1262
1263 r = sd_id128_get_boot(&boot_id);
1264 if (r < 0)
1265 return r;
1266
605405c6 1267 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1268 if (!x)
1269 return -ENOMEM;
1270
1271 RUN_WITH_UMASK(0077)
1272 if (!mkdtemp(x))
1273 return -errno;
1274
1275 RUN_WITH_UMASK(0000) {
1276 char *y;
1277
63c372cb 1278 y = strjoina(x, "/tmp");
613b411c
LP
1279
1280 if (mkdir(y, 0777 | S_ISVTX) < 0)
1281 return -errno;
c17ec25e 1282 }
15ae422b 1283
613b411c
LP
1284 *path = x;
1285 x = NULL;
1286
1287 return 0;
1288}
1289
/* Creates the pair of private temporary directories for 'id', one below /tmp and one below /var/tmp.
 *
 * On success both output pointers are set to newly allocated paths and 0 is returned. If the second
 * directory cannot be set up, the first one is removed again (inner "tmp" directory first, then the
 * outer one), its path is freed and the error is returned; the output pointers are then left untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1319
1320int setup_netns(int netns_storage_socket[2]) {
1321 _cleanup_close_ int netns = -1;
3ee897d6 1322 int r, q;
613b411c
LP
1323
1324 assert(netns_storage_socket);
1325 assert(netns_storage_socket[0] >= 0);
1326 assert(netns_storage_socket[1] >= 0);
1327
1328 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1329 * namespace reference fd. Whatever process runs this first
1330 * shall create a new namespace, all others should just join
1331 * it. To serialize that we use a file lock on the socket
1332 * pair.
613b411c
LP
1333 *
1334 * It's a bit crazy, but hey, works great! */
1335
1336 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1337 return -errno;
1338
3ee897d6
LP
1339 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1340 if (netns == -EAGAIN) {
613b411c
LP
1341 /* Nothing stored yet, so let's create a new namespace */
1342
1343 if (unshare(CLONE_NEWNET) < 0) {
1344 r = -errno;
1345 goto fail;
1346 }
1347
1348 loopback_setup();
1349
1350 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1351 if (netns < 0) {
1352 r = -errno;
1353 goto fail;
1354 }
1355
1356 r = 1;
613b411c 1357
3ee897d6
LP
1358 } else if (netns < 0) {
1359 r = netns;
1360 goto fail;
613b411c 1361
3ee897d6
LP
1362 } else {
1363 /* Yay, found something, so let's join the namespace */
613b411c
LP
1364 if (setns(netns, CLONE_NEWNET) < 0) {
1365 r = -errno;
1366 goto fail;
1367 }
1368
1369 r = 0;
1370 }
1371
3ee897d6
LP
1372 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1373 if (q < 0) {
1374 r = q;
613b411c
LP
1375 goto fail;
1376 }
1377
1378fail:
fe048ce5 1379 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1380 return r;
1381}
417116f2 1382
1b8689f9
LP
1383static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1384 [PROTECT_HOME_NO] = "no",
1385 [PROTECT_HOME_YES] = "yes",
1386 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1387};
1388
1b8689f9
LP
1389DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1390
1391static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1392 [PROTECT_SYSTEM_NO] = "no",
1393 [PROTECT_SYSTEM_YES] = "yes",
1394 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1395 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1396};
1397
1398DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);