]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: chase symlinks for mounts to set up in userspace
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
7f112f50 30#include "dev-setup.h"
3ffd4af2 31#include "fd-util.h"
d944dc95 32#include "fs-util.h"
07630cea
LP
33#include "loopback-setup.h"
34#include "missing.h"
35#include "mkdir.h"
4349cd7c 36#include "mount-util.h"
3ffd4af2 37#include "namespace.h"
07630cea 38#include "path-util.h"
d7b8eec7 39#include "selinux-util.h"
2583fbea 40#include "socket-util.h"
8b43440b 41#include "string-table.h"
07630cea
LP
42#include "string-util.h"
43#include "strv.h"
affb60b1 44#include "umask-util.h"
ee104e11 45#include "user-util.h"
07630cea 46#include "util.h"
15ae422b 47
/* Mount flags shared by the tmpfs that mount_dev() sets up as private /dev and by the read-only remount of
 * that tmpfs in make_read_only(). */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
49
/* The treatment requested for a path inside the new mount namespace.
 *
 * The numeric order is significant: mount_path_compare() sorts entries with equal paths by ascending mode,
 * and drop_duplicates() keeps the first of them, so a smaller value wins over a larger one. */
typedef enum MountMode {
        INACCESSIBLE    = 0,
        READONLY        = 1,
        PRIVATE_TMP     = 2,
        PRIVATE_VAR_TMP = 3,
        PRIVATE_DEV     = 4,
        READWRITE       = 5,
} MountMode;
15ae422b 59
c17ec25e 60typedef struct BindMount {
d944dc95
LP
61 const char *path; /* stack memory, doesn't need to be freed explicitly */
62 char *chased; /* malloc()ed memory, needs to be freed */
c17ec25e 63 MountMode mode;
ea92ae33 64 bool ignore;
c17ec25e 65} BindMount;
15ae422b 66
c17ec25e 67static int append_mounts(BindMount **p, char **strv, MountMode mode) {
15ae422b
LP
68 char **i;
69
613b411c
LP
70 assert(p);
71
15ae422b
LP
72 STRV_FOREACH(i, strv) {
73
ea92ae33
MW
74 (*p)->ignore = false;
75
94828d2d 76 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
ea92ae33
MW
77 (*p)->ignore = true;
78 (*i)++;
79 }
80
15ae422b
LP
81 if (!path_is_absolute(*i))
82 return -EINVAL;
83
84 (*p)->path = *i;
85 (*p)->mode = mode;
86 (*p)++;
87 }
88
89 return 0;
90}
91
c17ec25e
MS
92static int mount_path_compare(const void *a, const void *b) {
93 const BindMount *p = a, *q = b;
a0827e2b 94 int d;
15ae422b 95
6ee1a919 96 /* If the paths are not equal, then order prefixes first */
a0827e2b 97 d = path_compare(p->path, q->path);
6ee1a919
LP
98 if (d != 0)
99 return d;
15ae422b 100
6ee1a919
LP
101 /* If the paths are equal, check the mode */
102 if (p->mode < q->mode)
103 return -1;
15ae422b 104
6ee1a919
LP
105 if (p->mode > q->mode)
106 return 1;
15ae422b 107
6ee1a919 108 return 0;
15ae422b
LP
109}
110
c17ec25e
MS
111static void drop_duplicates(BindMount *m, unsigned *n) {
112 BindMount *f, *t, *previous;
15ae422b 113
c17ec25e 114 assert(m);
15ae422b 115 assert(n);
15ae422b 116
fe3c2583
LP
117 /* Drops duplicate entries. Expects that the array is properly ordered already. */
118
c17ec25e 119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
15ae422b 120
fe3c2583
LP
121 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
122 * above. */
123 if (previous && path_equal(f->path, previous->path)) {
124 log_debug("%s is duplicate.", f->path);
15ae422b 125 continue;
fe3c2583 126 }
15ae422b 127
e2d7c1a0 128 *t = *f;
15ae422b 129 previous = t;
fe3c2583
LP
130 t++;
131 }
132
133 *n = t - m;
134}
135
136static void drop_inaccessible(BindMount *m, unsigned *n) {
137 BindMount *f, *t;
138 const char *clear = NULL;
139
140 assert(m);
141 assert(n);
142
143 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
144 * ordered already. */
145
146 for (f = m, t = m; f < m+*n; f++) {
147
148 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
149 * it, as inaccessible paths really should drop the entire subtree. */
150 if (clear && path_startswith(f->path, clear)) {
151 log_debug("%s is masked by %s.", f->path, clear);
152 continue;
153 }
15ae422b 154
fe3c2583
LP
155 clear = f->mode == INACCESSIBLE ? f->path : NULL;
156
157 *t = *f;
15ae422b
LP
158 t++;
159 }
160
c17ec25e 161 *n = t - m;
15ae422b
LP
162}
163
7648a565
LP
164static void drop_nop(BindMount *m, unsigned *n) {
165 BindMount *f, *t;
166
167 assert(m);
168 assert(n);
169
170 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
171 * list is ordered by prefixes. */
172
173 for (f = m, t = m; f < m+*n; f++) {
174
175 /* Only suppress such subtrees for READONLY and READWRITE entries */
176 if (IN_SET(f->mode, READONLY, READWRITE)) {
177 BindMount *p;
178 bool found = false;
179
180 /* Now let's find the first parent of the entry we are looking at. */
181 for (p = t-1; p >= m; p--) {
182 if (path_startswith(f->path, p->path)) {
183 found = true;
184 break;
185 }
186 }
187
188 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
189 if (found && p->mode == f->mode) {
190 log_debug("%s is redundant by %s", f->path, p->path);
191 continue;
192 }
193 }
194
195 *t = *f;
196 t++;
197 }
198
199 *n = t - m;
200}
201
7f112f50
LP
202static int mount_dev(BindMount *m) {
203 static const char devnodes[] =
204 "/dev/null\0"
205 "/dev/zero\0"
206 "/dev/full\0"
207 "/dev/random\0"
208 "/dev/urandom\0"
209 "/dev/tty\0";
210
2b85f4e1 211 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 212 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
213 _cleanup_umask_ mode_t u;
214 int r;
215
216 assert(m);
217
218 u = umask(0000);
219
2b85f4e1
LP
220 if (!mkdtemp(temporary_mount))
221 return -errno;
222
63c372cb 223 dev = strjoina(temporary_mount, "/dev");
dc751688 224 (void) mkdir(dev, 0755);
737ba3c8 225 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
226 r = -errno;
227 goto fail;
228 }
229
63c372cb 230 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 231 (void) mkdir(devpts, 0755);
2b85f4e1
LP
232 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
233 r = -errno;
234 goto fail;
235 }
236
63c372cb 237 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
238 if (symlink("pts/ptmx", devptmx) < 0) {
239 r = -errno;
240 goto fail;
241 }
e06b6479 242
63c372cb 243 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 244 (void) mkdir(devshm, 01777);
2b85f4e1
LP
245 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
246 if (r < 0) {
247 r = -errno;
248 goto fail;
249 }
250
63c372cb 251 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 252 (void) mkdir(devmqueue, 0755);
3164e3cb 253 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 254
63c372cb 255 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 256 (void) mkdir(devhugepages, 0755);
3164e3cb 257 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 258
63c372cb 259 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 260 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 261
7f112f50 262 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
263 _cleanup_free_ char *dn = NULL;
264 struct stat st;
265
266 r = stat(d, &st);
7f112f50 267 if (r < 0) {
2b85f4e1
LP
268
269 if (errno == ENOENT)
270 continue;
271
272 r = -errno;
273 goto fail;
7f112f50
LP
274 }
275
2b85f4e1
LP
276 if (!S_ISBLK(st.st_mode) &&
277 !S_ISCHR(st.st_mode)) {
278 r = -EINVAL;
279 goto fail;
280 }
281
282 if (st.st_rdev == 0)
283 continue;
284
285 dn = strappend(temporary_mount, d);
286 if (!dn) {
287 r = -ENOMEM;
288 goto fail;
289 }
290
ecabcf8b 291 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 292 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 293 mac_selinux_create_file_clear();
dd078a1e 294
2b85f4e1
LP
295 if (r < 0) {
296 r = -errno;
297 goto fail;
298 }
7f112f50
LP
299 }
300
03cfe0d5 301 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 302
ee818b89
AC
303 /* Create the /dev directory if missing. It is more likely to be
304 * missing when the service is started with RootDirectory. This is
305 * consistent with mount units creating the mount points when missing.
306 */
307 (void) mkdir_p_label(m->path, 0755);
308
9e5f8252 309 /* Unmount everything in old /dev */
310 umount_recursive(m->path, 0);
ee818b89 311 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
312 r = -errno;
313 goto fail;
314 }
7f112f50 315
2b85f4e1
LP
316 rmdir(dev);
317 rmdir(temporary_mount);
7f112f50 318
2b85f4e1 319 return 0;
7f112f50 320
2b85f4e1
LP
321fail:
322 if (devpts)
323 umount(devpts);
7f112f50 324
2b85f4e1
LP
325 if (devshm)
326 umount(devshm);
7f112f50 327
2b85f4e1
LP
328 if (devhugepages)
329 umount(devhugepages);
7f112f50 330
2b85f4e1
LP
331 if (devmqueue)
332 umount(devmqueue);
7f112f50 333
d267c5aa
ZJS
334 umount(dev);
335 rmdir(dev);
2b85f4e1 336 rmdir(temporary_mount);
7f112f50 337
2b85f4e1 338 return r;
7f112f50
LP
339}
340
ac0930c8 341static int apply_mount(
c17ec25e 342 BindMount *m,
ac0930c8 343 const char *tmp_dir,
c17ec25e 344 const char *var_tmp_dir) {
ac0930c8 345
15ae422b 346 const char *what;
15ae422b 347 int r;
15ae422b 348
c17ec25e 349 assert(m);
15ae422b 350
fe3c2583
LP
351 log_debug("Applying namespace mount on %s", m->path);
352
c17ec25e 353 switch (m->mode) {
15ae422b 354
160cfdbe
LP
355 case INACCESSIBLE: {
356 struct stat target;
6d313367
LP
357
358 /* First, get rid of everything that is below if there
359 * is anything... Then, overmount it with an
c4b41707 360 * inaccessible path. */
fe3c2583 361 (void) umount_recursive(m->path, 0);
6d313367 362
d944dc95 363 if (lstat(m->path, &target) < 0)
160cfdbe 364 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
15ae422b 365
c4b41707 366 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
367 if (!what) {
368 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
369 return -ELOOP;
370 }
371 break;
160cfdbe 372 }
fe3c2583 373
15ae422b 374 case READONLY:
15ae422b 375 case READWRITE:
6b7c9f8b
LP
376
377 r = path_is_mount_point(m->path, 0);
d944dc95 378 if (r < 0)
6b7c9f8b 379 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
6b7c9f8b
LP
380 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
381 return 0;
382
383 /* This isn't a mount point yet, let's make it one. */
384 what = m->path;
385 break;
15ae422b 386
ac0930c8
LP
387 case PRIVATE_TMP:
388 what = tmp_dir;
389 break;
390
391 case PRIVATE_VAR_TMP:
392 what = var_tmp_dir;
15ae422b 393 break;
e364ad06 394
d6797c92
LP
395 case PRIVATE_DEV:
396 return mount_dev(m);
397
e364ad06
LP
398 default:
399 assert_not_reached("Unknown mode");
15ae422b
LP
400 }
401
ac0930c8 402 assert(what);
15ae422b 403
d944dc95 404 if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
5fd7cf6f 405 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
6b7c9f8b
LP
406
407 log_debug("Successfully mounted %s to %s", what, m->path);
408 return 0;
ac0930c8 409}
15ae422b 410
6b7c9f8b
LP
411static int make_read_only(BindMount *m, char **blacklist) {
412 int r = 0;
15ae422b 413
c17ec25e 414 assert(m);
ac0930c8 415
d6797c92 416 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
6b7c9f8b
LP
417 r = bind_remount_recursive(m->path, true, blacklist);
418 else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
419 if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
420 r = -errno;
737ba3c8 421 } else
6b7c9f8b
LP
422 return 0;
423
424 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
425 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
426 * read-only mounts already applied. */
ac0930c8 427
d6797c92 428 return r;
15ae422b
LP
429}
430
d944dc95
LP
431static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
432 BindMount *f, *t;
433 int r;
434
435 assert(m);
436 assert(n);
437
438 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
439 * chase the symlinks on our own first. This call wil do so for all entries and remove all entries where we
440 * can't resolve the path, and which have been marked for such removal. */
441
442 for (f = m, t = m; f < m+*n; f++) {
443
444 r = chase_symlinks(f->path, root_directory, &f->chased);
445 if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
446 continue;
447 if (r < 0)
448 return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);
449
450 if (path_equal(f->path, f->chased))
451 f->chased = mfree(f->chased);
452 else {
453 log_debug("Chased %s → %s", f->path, f->chased);
454 f->path = f->chased;
455 }
456
457 *t = *f;
458 t++;
459 }
460
461 *n = t - m;
462 return 0;
463}
464
613b411c 465int setup_namespace(
ee818b89 466 const char* root_directory,
2a624c36
AP
467 char** read_write_paths,
468 char** read_only_paths,
469 char** inaccessible_paths,
a004cb4c
LP
470 const char* tmp_dir,
471 const char* var_tmp_dir,
7f112f50 472 bool private_dev,
59eeb84b
LP
473 bool protect_sysctl,
474 bool protect_cgroups,
1b8689f9
LP
475 ProtectHome protect_home,
476 ProtectSystem protect_system,
e6547662 477 unsigned long mount_flags) {
15ae422b 478
7ff7394d 479 BindMount *m, *mounts = NULL;
d944dc95 480 bool make_slave = false;
613b411c 481 unsigned n;
c17ec25e 482 int r = 0;
15ae422b 483
613b411c 484 if (mount_flags == 0)
c17ec25e 485 mount_flags = MS_SHARED;
ac0930c8 486
9ca6ff50 487 n = !!tmp_dir + !!var_tmp_dir +
2a624c36
AP
488 strv_length(read_write_paths) +
489 strv_length(read_only_paths) +
490 strv_length(inaccessible_paths) +
417116f2 491 private_dev +
59eeb84b
LP
492 (protect_sysctl ? 3 : 0) +
493 (protect_cgroups != protect_sysctl) +
3f815163
LP
494 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
495 (protect_system == PROTECT_SYSTEM_STRICT ?
496 (2 + !private_dev + !protect_sysctl) :
497 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
498 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
613b411c 499
d944dc95
LP
500 if (root_directory || n > 0)
501 make_slave = true;
502
613b411c 503 if (n > 0) {
002b2268 504 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
2a624c36 505 r = append_mounts(&m, read_write_paths, READWRITE);
613b411c
LP
506 if (r < 0)
507 return r;
508
2a624c36 509 r = append_mounts(&m, read_only_paths, READONLY);
613b411c
LP
510 if (r < 0)
511 return r;
512
2a624c36 513 r = append_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 514 if (r < 0)
7ff7394d
ZJS
515 return r;
516
613b411c 517 if (tmp_dir) {
ee818b89 518 m->path = prefix_roota(root_directory, "/tmp");
7ff7394d
ZJS
519 m->mode = PRIVATE_TMP;
520 m++;
613b411c 521 }
7ff7394d 522
613b411c 523 if (var_tmp_dir) {
ee818b89 524 m->path = prefix_roota(root_directory, "/var/tmp");
7ff7394d
ZJS
525 m->mode = PRIVATE_VAR_TMP;
526 m++;
527 }
ac0930c8 528
7f112f50 529 if (private_dev) {
ee818b89 530 m->path = prefix_roota(root_directory, "/dev");
7f112f50
LP
531 m->mode = PRIVATE_DEV;
532 m++;
533 }
534
59eeb84b
LP
535 if (protect_sysctl) {
536 m->path = prefix_roota(root_directory, "/proc/sys");
537 m->mode = READONLY;
538 m++;
539
540 m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
541 m->mode = READONLY;
542 m->ignore = true; /* Not always compiled into the kernel */
543 m++;
544
545 m->path = prefix_roota(root_directory, "/sys");
546 m->mode = READONLY;
547 m++;
548 }
549
550 if (protect_cgroups != protect_sysctl) {
551 m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
552 m->mode = protect_cgroups ? READONLY : READWRITE;
553 m++;
554 }
555
3f815163 556 if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
ee818b89
AC
557 const char *home_dir, *run_user_dir, *root_dir;
558
3f815163
LP
559 /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
560 * strict system protection mode, then also add entries for these directories, but mark them
561 * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
562
ee818b89
AC
563 home_dir = prefix_roota(root_directory, "/home");
564 home_dir = strjoina("-", home_dir);
565 run_user_dir = prefix_roota(root_directory, "/run/user");
566 run_user_dir = strjoina("-", run_user_dir);
567 root_dir = prefix_roota(root_directory, "/root");
568 root_dir = strjoina("-", root_dir);
569
570 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
3f815163
LP
571 protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
572 protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
417116f2
LP
573 if (r < 0)
574 return r;
575 }
576
3f815163
LP
577 if (protect_system == PROTECT_SYSTEM_STRICT) {
578 /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
579 * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
580 * protect those, and these options should be fully orthogonal. (And of course /home and
581 * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
582 * above). */
583
584 m->path = prefix_roota(root_directory, "/");
585 m->mode = READONLY;
586 m++;
587
588 m->path = prefix_roota(root_directory, "/proc");
589 m->mode = READWRITE;
590 m++;
591
592 if (!private_dev) {
593 m->path = prefix_roota(root_directory, "/dev");
594 m->mode = READWRITE;
595 m++;
596 }
597 if (!protect_sysctl) {
598 m->path = prefix_roota(root_directory, "/sys");
599 m->mode = READWRITE;
600 m++;
601 }
602
603 } else if (protect_system != PROTECT_SYSTEM_NO) {
604 const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
605
606 /* In any other mode we simply mark the relevant three directories ready-only. */
ee818b89 607
d38e01dc 608 usr_dir = prefix_roota(root_directory, "/usr");
ee818b89
AC
609 boot_dir = prefix_roota(root_directory, "/boot");
610 boot_dir = strjoina("-", boot_dir);
3f815163
LP
611 efi_dir = prefix_roota(root_directory, "/efi");
612 efi_dir = strjoina("-", efi_dir);
ee818b89
AC
613 etc_dir = prefix_roota(root_directory, "/etc");
614
615 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
3f815163
LP
616 ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
617 : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
417116f2
LP
618 if (r < 0)
619 return r;
620 }
621
7ff7394d 622 assert(mounts + n == m);
ac0930c8 623
d944dc95
LP
624 /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
625 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
626 * racy. */
627 r = chase_all_symlinks(root_directory, mounts, &n);
628 if (r < 0)
629 goto finish;
630
7ff7394d 631 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
fe3c2583 632
7ff7394d 633 drop_duplicates(mounts, &n);
fe3c2583 634 drop_inaccessible(mounts, &n);
7648a565 635 drop_nop(mounts, &n);
15ae422b
LP
636 }
637
d944dc95
LP
638 if (unshare(CLONE_NEWNS) < 0) {
639 r = -errno;
640 goto finish;
641 }
1e4e94c8 642
d944dc95 643 if (make_slave) {
c2c13f2d
LP
644 /* Remount / as SLAVE so that nothing now mounted in the namespace
645 shows up in the parent */
d944dc95
LP
646 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
647 r = -errno;
648 goto finish;
649 }
ee818b89
AC
650 }
651
652 if (root_directory) {
653 /* Turn directory into bind mount */
d944dc95
LP
654 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
655 r = -errno;
656 goto finish;
657 }
ee818b89 658 }
c2c13f2d 659
ee818b89 660 if (n > 0) {
6b7c9f8b
LP
661 char **blacklist;
662 unsigned j;
663
664 /* First round, add in all special mounts we need */
c2c13f2d
LP
665 for (m = mounts; m < mounts + n; ++m) {
666 r = apply_mount(m, tmp_dir, var_tmp_dir);
667 if (r < 0)
d944dc95 668 goto finish;
c2c13f2d 669 }
15ae422b 670
6b7c9f8b
LP
671 /* Create a blacklist we can pass to bind_mount_recursive() */
672 blacklist = newa(char*, n+1);
673 for (j = 0; j < n; j++)
674 blacklist[j] = (char*) mounts[j].path;
675 blacklist[j] = NULL;
676
677 /* Second round, flip the ro bits if necessary. */
c2c13f2d 678 for (m = mounts; m < mounts + n; ++m) {
6b7c9f8b 679 r = make_read_only(m, blacklist);
c2c13f2d 680 if (r < 0)
d944dc95 681 goto finish;
c2c13f2d 682 }
15ae422b
LP
683 }
684
ee818b89
AC
685 if (root_directory) {
686 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
687 r = mount_move_root(root_directory);
d944dc95
LP
688 if (r < 0)
689 goto finish;
ee818b89
AC
690 }
691
c2c13f2d
LP
692 /* Remount / as the desired mode. Not that this will not
693 * reestablish propagation from our side to the host, since
694 * what's disconnected is disconnected. */
d944dc95
LP
695 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
696 r = -errno;
697 goto finish;
698 }
15ae422b 699
d944dc95 700 r = 0;
15ae422b 701
d944dc95
LP
702finish:
703 for (m = mounts; m < mounts + n; m++)
704 free(m->chased);
613b411c
LP
705
706 return r;
707}
708
709static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
710 _cleanup_free_ char *x = NULL;
6b46ea73
LP
711 char bid[SD_ID128_STRING_MAX];
712 sd_id128_t boot_id;
713 int r;
613b411c
LP
714
715 assert(id);
716 assert(prefix);
717 assert(path);
718
6b46ea73
LP
719 /* We include the boot id in the directory so that after a
720 * reboot we can easily identify obsolete directories. */
721
722 r = sd_id128_get_boot(&boot_id);
723 if (r < 0)
724 return r;
725
726 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
613b411c
LP
727 if (!x)
728 return -ENOMEM;
729
730 RUN_WITH_UMASK(0077)
731 if (!mkdtemp(x))
732 return -errno;
733
734 RUN_WITH_UMASK(0000) {
735 char *y;
736
63c372cb 737 y = strjoina(x, "/tmp");
613b411c
LP
738
739 if (mkdir(y, 0777 | S_ISVTX) < 0)
740 return -errno;
c17ec25e 741 }
15ae422b 742
613b411c
LP
743 *path = x;
744 x = NULL;
745
746 return 0;
747}
748
/* Creates the private directories for /tmp and /var/tmp through setup_one_tmp_dir().
 *
 * On success 0 is returned and the two paths are stored in *tmp_dir and *var_tmp_dir; both strings are
 * malloc()ed and owned by the caller. On failure a negative errno-style error is returned and neither
 * output is written; if only the second directory could not be created, the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* The second directory failed: roll back the first one, inner directory first */
        {
                char *inner;

                inner = strjoina(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
778
779int setup_netns(int netns_storage_socket[2]) {
780 _cleanup_close_ int netns = -1;
3ee897d6 781 int r, q;
613b411c
LP
782
783 assert(netns_storage_socket);
784 assert(netns_storage_socket[0] >= 0);
785 assert(netns_storage_socket[1] >= 0);
786
787 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
788 * namespace reference fd. Whatever process runs this first
789 * shall create a new namespace, all others should just join
790 * it. To serialize that we use a file lock on the socket
791 * pair.
613b411c
LP
792 *
793 * It's a bit crazy, but hey, works great! */
794
795 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
796 return -errno;
797
3ee897d6
LP
798 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
799 if (netns == -EAGAIN) {
613b411c
LP
800 /* Nothing stored yet, so let's create a new namespace */
801
802 if (unshare(CLONE_NEWNET) < 0) {
803 r = -errno;
804 goto fail;
805 }
806
807 loopback_setup();
808
809 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
810 if (netns < 0) {
811 r = -errno;
812 goto fail;
813 }
814
815 r = 1;
613b411c 816
3ee897d6
LP
817 } else if (netns < 0) {
818 r = netns;
819 goto fail;
613b411c 820
3ee897d6
LP
821 } else {
822 /* Yay, found something, so let's join the namespace */
613b411c
LP
823 if (setns(netns, CLONE_NEWNET) < 0) {
824 r = -errno;
825 goto fail;
826 }
827
828 r = 0;
829 }
830
3ee897d6
LP
831 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
832 if (q < 0) {
833 r = q;
613b411c
LP
834 goto fail;
835 }
836
837fail:
fe048ce5 838 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
839 return r;
840}
417116f2 841
1b8689f9
LP
842static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
843 [PROTECT_HOME_NO] = "no",
844 [PROTECT_HOME_YES] = "yes",
845 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
846};
847
1b8689f9
LP
848DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
849
850static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
851 [PROTECT_SYSTEM_NO] = "no",
852 [PROTECT_SYSTEM_YES] = "yes",
853 [PROTECT_SYSTEM_FULL] = "full",
3f815163 854 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
855};
856
857DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);