]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core:namespace: minor improvements to append_mounts()
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
7f112f50 30#include "dev-setup.h"
3ffd4af2 31#include "fd-util.h"
d944dc95 32#include "fs-util.h"
07630cea
LP
33#include "loopback-setup.h"
34#include "missing.h"
35#include "mkdir.h"
4349cd7c 36#include "mount-util.h"
3ffd4af2 37#include "namespace.h"
07630cea 38#include "path-util.h"
d7b8eec7 39#include "selinux-util.h"
2583fbea 40#include "socket-util.h"
8b43440b 41#include "string-table.h"
07630cea
LP
42#include "string-util.h"
43#include "strv.h"
affb60b1 44#include "umask-util.h"
ee104e11 45#include "user-util.h"
07630cea 46#include "util.h"
15ae422b 47
/* Mount flags shared by the private /dev tmpfs: used when it is first mounted in mount_dev() and again when it
 * is remounted read-only in make_read_only(). */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
49
/* What kind of mount to establish on a path.
 *
 * The numeric order is significant: mount_path_compare() sorts entries for the same path by ascending mode and
 * drop_duplicates() keeps the first one, so a lower value wins over a higher one. Do not reorder. */
typedef enum MountMode {
        INACCESSIBLE,           /* overmount with an inaccessible node */
        READONLY,               /* bind mount, then flip to read-only */
        PRIVATE_TMP,            /* bind the private tmp directory here */
        PRIVATE_VAR_TMP,        /* bind the private var-tmp directory here */
        PRIVATE_DEV,            /* replace with a freshly populated /dev */
        READWRITE,              /* make it a mount point, leave it writable */
} MountMode;
15ae422b 59
c17ec25e 60typedef struct BindMount {
d944dc95
LP
61 const char *path; /* stack memory, doesn't need to be freed explicitly */
62 char *chased; /* malloc()ed memory, needs to be freed */
c17ec25e 63 MountMode mode;
ea92ae33 64 bool ignore;
c17ec25e 65} BindMount;
15ae422b 66
c17ec25e 67static int append_mounts(BindMount **p, char **strv, MountMode mode) {
15ae422b
LP
68 char **i;
69
613b411c
LP
70 assert(p);
71
15ae422b 72 STRV_FOREACH(i, strv) {
9c94d52e 73 bool ignore = false;
15ae422b 74
9c94d52e 75 if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) {
ea92ae33 76 (*i)++;
9c94d52e 77 ignore = true;
ea92ae33
MW
78 }
79
15ae422b
LP
80 if (!path_is_absolute(*i))
81 return -EINVAL;
82
83 (*p)->path = *i;
84 (*p)->mode = mode;
9c94d52e 85 (*p)->ignore = ignore;
15ae422b
LP
86 (*p)++;
87 }
88
89 return 0;
90}
91
c17ec25e
MS
92static int mount_path_compare(const void *a, const void *b) {
93 const BindMount *p = a, *q = b;
a0827e2b 94 int d;
15ae422b 95
6ee1a919 96 /* If the paths are not equal, then order prefixes first */
a0827e2b 97 d = path_compare(p->path, q->path);
6ee1a919
LP
98 if (d != 0)
99 return d;
15ae422b 100
6ee1a919
LP
101 /* If the paths are equal, check the mode */
102 if (p->mode < q->mode)
103 return -1;
15ae422b 104
6ee1a919
LP
105 if (p->mode > q->mode)
106 return 1;
15ae422b 107
6ee1a919 108 return 0;
15ae422b
LP
109}
110
c17ec25e
MS
111static void drop_duplicates(BindMount *m, unsigned *n) {
112 BindMount *f, *t, *previous;
15ae422b 113
c17ec25e 114 assert(m);
15ae422b 115 assert(n);
15ae422b 116
fe3c2583
LP
117 /* Drops duplicate entries. Expects that the array is properly ordered already. */
118
c17ec25e 119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
15ae422b 120
fe3c2583
LP
121 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
122 * above. */
123 if (previous && path_equal(f->path, previous->path)) {
124 log_debug("%s is duplicate.", f->path);
15ae422b 125 continue;
fe3c2583 126 }
15ae422b 127
e2d7c1a0 128 *t = *f;
15ae422b 129 previous = t;
fe3c2583
LP
130 t++;
131 }
132
133 *n = t - m;
134}
135
136static void drop_inaccessible(BindMount *m, unsigned *n) {
137 BindMount *f, *t;
138 const char *clear = NULL;
139
140 assert(m);
141 assert(n);
142
143 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
144 * ordered already. */
145
146 for (f = m, t = m; f < m+*n; f++) {
147
148 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
149 * it, as inaccessible paths really should drop the entire subtree. */
150 if (clear && path_startswith(f->path, clear)) {
151 log_debug("%s is masked by %s.", f->path, clear);
152 continue;
153 }
15ae422b 154
fe3c2583
LP
155 clear = f->mode == INACCESSIBLE ? f->path : NULL;
156
157 *t = *f;
15ae422b
LP
158 t++;
159 }
160
c17ec25e 161 *n = t - m;
15ae422b
LP
162}
163
7648a565
LP
164static void drop_nop(BindMount *m, unsigned *n) {
165 BindMount *f, *t;
166
167 assert(m);
168 assert(n);
169
170 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
171 * list is ordered by prefixes. */
172
173 for (f = m, t = m; f < m+*n; f++) {
174
175 /* Only suppress such subtrees for READONLY and READWRITE entries */
176 if (IN_SET(f->mode, READONLY, READWRITE)) {
177 BindMount *p;
178 bool found = false;
179
180 /* Now let's find the first parent of the entry we are looking at. */
181 for (p = t-1; p >= m; p--) {
182 if (path_startswith(f->path, p->path)) {
183 found = true;
184 break;
185 }
186 }
187
188 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
189 if (found && p->mode == f->mode) {
190 log_debug("%s is redundant by %s", f->path, p->path);
191 continue;
192 }
193 }
194
195 *t = *f;
196 t++;
197 }
198
199 *n = t - m;
200}
201
cd2902c9
LP
202static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) {
203 BindMount *f, *t;
204
205 assert(m);
206 assert(n);
207
208 if (!root_directory)
209 return;
210
211 /* Drops all mounts that are outside of the root directory. */
212
213 for (f = m, t = m; f < m+*n; f++) {
214
215 if (!path_startswith(f->path, root_directory)) {
216 log_debug("%s is outside of root directory.", f->path);
217 continue;
218 }
219
220 *t = *f;
221 t++;
222 }
223
224 *n = t - m;
225}
226
7f112f50
LP
227static int mount_dev(BindMount *m) {
228 static const char devnodes[] =
229 "/dev/null\0"
230 "/dev/zero\0"
231 "/dev/full\0"
232 "/dev/random\0"
233 "/dev/urandom\0"
234 "/dev/tty\0";
235
2b85f4e1 236 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 237 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
238 _cleanup_umask_ mode_t u;
239 int r;
240
241 assert(m);
242
243 u = umask(0000);
244
2b85f4e1
LP
245 if (!mkdtemp(temporary_mount))
246 return -errno;
247
63c372cb 248 dev = strjoina(temporary_mount, "/dev");
dc751688 249 (void) mkdir(dev, 0755);
737ba3c8 250 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
251 r = -errno;
252 goto fail;
253 }
254
63c372cb 255 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 256 (void) mkdir(devpts, 0755);
2b85f4e1
LP
257 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
258 r = -errno;
259 goto fail;
260 }
261
63c372cb 262 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
263 if (symlink("pts/ptmx", devptmx) < 0) {
264 r = -errno;
265 goto fail;
266 }
e06b6479 267
63c372cb 268 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 269 (void) mkdir(devshm, 01777);
2b85f4e1
LP
270 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
271 if (r < 0) {
272 r = -errno;
273 goto fail;
274 }
275
63c372cb 276 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 277 (void) mkdir(devmqueue, 0755);
3164e3cb 278 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 279
63c372cb 280 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 281 (void) mkdir(devhugepages, 0755);
3164e3cb 282 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 283
63c372cb 284 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 285 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 286
7f112f50 287 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
288 _cleanup_free_ char *dn = NULL;
289 struct stat st;
290
291 r = stat(d, &st);
7f112f50 292 if (r < 0) {
2b85f4e1
LP
293
294 if (errno == ENOENT)
295 continue;
296
297 r = -errno;
298 goto fail;
7f112f50
LP
299 }
300
2b85f4e1
LP
301 if (!S_ISBLK(st.st_mode) &&
302 !S_ISCHR(st.st_mode)) {
303 r = -EINVAL;
304 goto fail;
305 }
306
307 if (st.st_rdev == 0)
308 continue;
309
310 dn = strappend(temporary_mount, d);
311 if (!dn) {
312 r = -ENOMEM;
313 goto fail;
314 }
315
ecabcf8b 316 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 317 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 318 mac_selinux_create_file_clear();
dd078a1e 319
2b85f4e1
LP
320 if (r < 0) {
321 r = -errno;
322 goto fail;
323 }
7f112f50
LP
324 }
325
03cfe0d5 326 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 327
ee818b89
AC
328 /* Create the /dev directory if missing. It is more likely to be
329 * missing when the service is started with RootDirectory. This is
330 * consistent with mount units creating the mount points when missing.
331 */
332 (void) mkdir_p_label(m->path, 0755);
333
9e5f8252 334 /* Unmount everything in old /dev */
335 umount_recursive(m->path, 0);
ee818b89 336 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
337 r = -errno;
338 goto fail;
339 }
7f112f50 340
2b85f4e1
LP
341 rmdir(dev);
342 rmdir(temporary_mount);
7f112f50 343
2b85f4e1 344 return 0;
7f112f50 345
2b85f4e1
LP
346fail:
347 if (devpts)
348 umount(devpts);
7f112f50 349
2b85f4e1
LP
350 if (devshm)
351 umount(devshm);
7f112f50 352
2b85f4e1
LP
353 if (devhugepages)
354 umount(devhugepages);
7f112f50 355
2b85f4e1
LP
356 if (devmqueue)
357 umount(devmqueue);
7f112f50 358
d267c5aa
ZJS
359 umount(dev);
360 rmdir(dev);
2b85f4e1 361 rmdir(temporary_mount);
7f112f50 362
2b85f4e1 363 return r;
7f112f50
LP
364}
365
ac0930c8 366static int apply_mount(
c17ec25e 367 BindMount *m,
ac0930c8 368 const char *tmp_dir,
c17ec25e 369 const char *var_tmp_dir) {
ac0930c8 370
15ae422b 371 const char *what;
15ae422b 372 int r;
15ae422b 373
c17ec25e 374 assert(m);
15ae422b 375
fe3c2583
LP
376 log_debug("Applying namespace mount on %s", m->path);
377
c17ec25e 378 switch (m->mode) {
15ae422b 379
160cfdbe
LP
380 case INACCESSIBLE: {
381 struct stat target;
6d313367
LP
382
383 /* First, get rid of everything that is below if there
384 * is anything... Then, overmount it with an
c4b41707 385 * inaccessible path. */
fe3c2583 386 (void) umount_recursive(m->path, 0);
6d313367 387
d944dc95 388 if (lstat(m->path, &target) < 0)
160cfdbe 389 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
15ae422b 390
c4b41707 391 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
392 if (!what) {
393 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
394 return -ELOOP;
395 }
396 break;
160cfdbe 397 }
fe3c2583 398
15ae422b 399 case READONLY:
15ae422b 400 case READWRITE:
6b7c9f8b
LP
401
402 r = path_is_mount_point(m->path, 0);
d944dc95 403 if (r < 0)
6b7c9f8b 404 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
6b7c9f8b
LP
405 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
406 return 0;
407
408 /* This isn't a mount point yet, let's make it one. */
409 what = m->path;
410 break;
15ae422b 411
ac0930c8
LP
412 case PRIVATE_TMP:
413 what = tmp_dir;
414 break;
415
416 case PRIVATE_VAR_TMP:
417 what = var_tmp_dir;
15ae422b 418 break;
e364ad06 419
d6797c92
LP
420 case PRIVATE_DEV:
421 return mount_dev(m);
422
e364ad06
LP
423 default:
424 assert_not_reached("Unknown mode");
15ae422b
LP
425 }
426
ac0930c8 427 assert(what);
15ae422b 428
d944dc95 429 if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
5fd7cf6f 430 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
6b7c9f8b
LP
431
432 log_debug("Successfully mounted %s to %s", what, m->path);
433 return 0;
ac0930c8 434}
15ae422b 435
6b7c9f8b
LP
436static int make_read_only(BindMount *m, char **blacklist) {
437 int r = 0;
15ae422b 438
c17ec25e 439 assert(m);
ac0930c8 440
d6797c92 441 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
6b7c9f8b
LP
442 r = bind_remount_recursive(m->path, true, blacklist);
443 else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
444 if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
445 r = -errno;
737ba3c8 446 } else
6b7c9f8b
LP
447 return 0;
448
449 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
450 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
451 * read-only mounts already applied. */
ac0930c8 452
d6797c92 453 return r;
15ae422b
LP
454}
455
d944dc95
LP
456static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
457 BindMount *f, *t;
458 int r;
459
460 assert(m);
461 assert(n);
462
463 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
464 * chase the symlinks on our own first. This call wil do so for all entries and remove all entries where we
465 * can't resolve the path, and which have been marked for such removal. */
466
467 for (f = m, t = m; f < m+*n; f++) {
468
469 r = chase_symlinks(f->path, root_directory, &f->chased);
470 if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
471 continue;
472 if (r < 0)
473 return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);
474
475 if (path_equal(f->path, f->chased))
476 f->chased = mfree(f->chased);
477 else {
478 log_debug("Chased %s → %s", f->path, f->chased);
479 f->path = f->chased;
480 }
481
482 *t = *f;
483 t++;
484 }
485
486 *n = t - m;
487 return 0;
488}
489
613b411c 490int setup_namespace(
ee818b89 491 const char* root_directory,
2a624c36
AP
492 char** read_write_paths,
493 char** read_only_paths,
494 char** inaccessible_paths,
a004cb4c
LP
495 const char* tmp_dir,
496 const char* var_tmp_dir,
7f112f50 497 bool private_dev,
59eeb84b
LP
498 bool protect_sysctl,
499 bool protect_cgroups,
1b8689f9
LP
500 ProtectHome protect_home,
501 ProtectSystem protect_system,
e6547662 502 unsigned long mount_flags) {
15ae422b 503
7ff7394d 504 BindMount *m, *mounts = NULL;
d944dc95 505 bool make_slave = false;
613b411c 506 unsigned n;
c17ec25e 507 int r = 0;
15ae422b 508
613b411c 509 if (mount_flags == 0)
c17ec25e 510 mount_flags = MS_SHARED;
ac0930c8 511
9ca6ff50 512 n = !!tmp_dir + !!var_tmp_dir +
2a624c36
AP
513 strv_length(read_write_paths) +
514 strv_length(read_only_paths) +
515 strv_length(inaccessible_paths) +
417116f2 516 private_dev +
59eeb84b
LP
517 (protect_sysctl ? 3 : 0) +
518 (protect_cgroups != protect_sysctl) +
3f815163
LP
519 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
520 (protect_system == PROTECT_SYSTEM_STRICT ?
521 (2 + !private_dev + !protect_sysctl) :
522 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
523 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
613b411c 524
d944dc95
LP
525 if (root_directory || n > 0)
526 make_slave = true;
527
613b411c 528 if (n > 0) {
002b2268 529 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
2a624c36 530 r = append_mounts(&m, read_write_paths, READWRITE);
613b411c
LP
531 if (r < 0)
532 return r;
533
2a624c36 534 r = append_mounts(&m, read_only_paths, READONLY);
613b411c
LP
535 if (r < 0)
536 return r;
537
2a624c36 538 r = append_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 539 if (r < 0)
7ff7394d
ZJS
540 return r;
541
613b411c 542 if (tmp_dir) {
ee818b89 543 m->path = prefix_roota(root_directory, "/tmp");
7ff7394d
ZJS
544 m->mode = PRIVATE_TMP;
545 m++;
613b411c 546 }
7ff7394d 547
613b411c 548 if (var_tmp_dir) {
ee818b89 549 m->path = prefix_roota(root_directory, "/var/tmp");
7ff7394d
ZJS
550 m->mode = PRIVATE_VAR_TMP;
551 m++;
552 }
ac0930c8 553
7f112f50 554 if (private_dev) {
ee818b89 555 m->path = prefix_roota(root_directory, "/dev");
7f112f50
LP
556 m->mode = PRIVATE_DEV;
557 m++;
558 }
559
59eeb84b
LP
560 if (protect_sysctl) {
561 m->path = prefix_roota(root_directory, "/proc/sys");
562 m->mode = READONLY;
563 m++;
564
565 m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
566 m->mode = READONLY;
567 m->ignore = true; /* Not always compiled into the kernel */
568 m++;
569
570 m->path = prefix_roota(root_directory, "/sys");
571 m->mode = READONLY;
572 m++;
573 }
574
575 if (protect_cgroups != protect_sysctl) {
576 m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
577 m->mode = protect_cgroups ? READONLY : READWRITE;
578 m++;
579 }
580
3f815163 581 if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
ee818b89
AC
582 const char *home_dir, *run_user_dir, *root_dir;
583
3f815163
LP
584 /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
585 * strict system protection mode, then also add entries for these directories, but mark them
586 * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
587
ee818b89
AC
588 home_dir = prefix_roota(root_directory, "/home");
589 home_dir = strjoina("-", home_dir);
590 run_user_dir = prefix_roota(root_directory, "/run/user");
591 run_user_dir = strjoina("-", run_user_dir);
592 root_dir = prefix_roota(root_directory, "/root");
593 root_dir = strjoina("-", root_dir);
594
595 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
3f815163
LP
596 protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
597 protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
417116f2
LP
598 if (r < 0)
599 return r;
600 }
601
3f815163
LP
602 if (protect_system == PROTECT_SYSTEM_STRICT) {
603 /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
604 * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
605 * protect those, and these options should be fully orthogonal. (And of course /home and
606 * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
607 * above). */
608
609 m->path = prefix_roota(root_directory, "/");
610 m->mode = READONLY;
611 m++;
612
613 m->path = prefix_roota(root_directory, "/proc");
614 m->mode = READWRITE;
615 m++;
616
617 if (!private_dev) {
618 m->path = prefix_roota(root_directory, "/dev");
619 m->mode = READWRITE;
620 m++;
621 }
622 if (!protect_sysctl) {
623 m->path = prefix_roota(root_directory, "/sys");
624 m->mode = READWRITE;
625 m++;
626 }
627
628 } else if (protect_system != PROTECT_SYSTEM_NO) {
629 const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
630
631 /* In any other mode we simply mark the relevant three directories ready-only. */
ee818b89 632
d38e01dc 633 usr_dir = prefix_roota(root_directory, "/usr");
ee818b89
AC
634 boot_dir = prefix_roota(root_directory, "/boot");
635 boot_dir = strjoina("-", boot_dir);
3f815163
LP
636 efi_dir = prefix_roota(root_directory, "/efi");
637 efi_dir = strjoina("-", efi_dir);
ee818b89
AC
638 etc_dir = prefix_roota(root_directory, "/etc");
639
640 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
3f815163
LP
641 ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
642 : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
417116f2
LP
643 if (r < 0)
644 return r;
645 }
646
7ff7394d 647 assert(mounts + n == m);
ac0930c8 648
d944dc95
LP
649 /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
650 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
651 * racy. */
652 r = chase_all_symlinks(root_directory, mounts, &n);
653 if (r < 0)
654 goto finish;
655
7ff7394d 656 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
fe3c2583 657
7ff7394d 658 drop_duplicates(mounts, &n);
cd2902c9 659 drop_outside_root(root_directory, mounts, &n);
fe3c2583 660 drop_inaccessible(mounts, &n);
7648a565 661 drop_nop(mounts, &n);
15ae422b
LP
662 }
663
d944dc95
LP
664 if (unshare(CLONE_NEWNS) < 0) {
665 r = -errno;
666 goto finish;
667 }
1e4e94c8 668
d944dc95 669 if (make_slave) {
c2c13f2d
LP
670 /* Remount / as SLAVE so that nothing now mounted in the namespace
671 shows up in the parent */
d944dc95
LP
672 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
673 r = -errno;
674 goto finish;
675 }
ee818b89
AC
676 }
677
678 if (root_directory) {
8f1ad200
LP
679 /* Turn directory into bind mount, if it isn't one yet */
680 r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW);
681 if (r < 0)
d944dc95 682 goto finish;
8f1ad200
LP
683 if (r == 0) {
684 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
685 r = -errno;
686 goto finish;
687 }
d944dc95 688 }
ee818b89 689 }
c2c13f2d 690
ee818b89 691 if (n > 0) {
6b7c9f8b
LP
692 char **blacklist;
693 unsigned j;
694
695 /* First round, add in all special mounts we need */
c2c13f2d
LP
696 for (m = mounts; m < mounts + n; ++m) {
697 r = apply_mount(m, tmp_dir, var_tmp_dir);
698 if (r < 0)
d944dc95 699 goto finish;
c2c13f2d 700 }
15ae422b 701
6b7c9f8b
LP
702 /* Create a blacklist we can pass to bind_mount_recursive() */
703 blacklist = newa(char*, n+1);
704 for (j = 0; j < n; j++)
705 blacklist[j] = (char*) mounts[j].path;
706 blacklist[j] = NULL;
707
708 /* Second round, flip the ro bits if necessary. */
c2c13f2d 709 for (m = mounts; m < mounts + n; ++m) {
6b7c9f8b 710 r = make_read_only(m, blacklist);
c2c13f2d 711 if (r < 0)
d944dc95 712 goto finish;
c2c13f2d 713 }
15ae422b
LP
714 }
715
ee818b89
AC
716 if (root_directory) {
717 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
718 r = mount_move_root(root_directory);
d944dc95
LP
719 if (r < 0)
720 goto finish;
ee818b89
AC
721 }
722
c2c13f2d
LP
723 /* Remount / as the desired mode. Not that this will not
724 * reestablish propagation from our side to the host, since
725 * what's disconnected is disconnected. */
d944dc95
LP
726 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
727 r = -errno;
728 goto finish;
729 }
15ae422b 730
d944dc95 731 r = 0;
15ae422b 732
d944dc95
LP
733finish:
734 for (m = mounts; m < mounts + n; m++)
735 free(m->chased);
613b411c
LP
736
737 return r;
738}
739
740static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
741 _cleanup_free_ char *x = NULL;
6b46ea73
LP
742 char bid[SD_ID128_STRING_MAX];
743 sd_id128_t boot_id;
744 int r;
613b411c
LP
745
746 assert(id);
747 assert(prefix);
748 assert(path);
749
6b46ea73
LP
750 /* We include the boot id in the directory so that after a
751 * reboot we can easily identify obsolete directories. */
752
753 r = sd_id128_get_boot(&boot_id);
754 if (r < 0)
755 return r;
756
757 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
613b411c
LP
758 if (!x)
759 return -ENOMEM;
760
761 RUN_WITH_UMASK(0077)
762 if (!mkdtemp(x))
763 return -errno;
764
765 RUN_WITH_UMASK(0000) {
766 char *y;
767
63c372cb 768 y = strjoina(x, "/tmp");
613b411c
LP
769
770 if (mkdir(y, 0777 | S_ISVTX) < 0)
771 return -errno;
c17ec25e 772 }
15ae422b 773
613b411c
LP
774 *path = x;
775 x = NULL;
776
777 return 0;
778}
779
/* Creates the pair of private temporary directories for 'id', one below /tmp and one below /var/tmp, see
 * setup_one_tmp_dir().
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir receive the newly allocated paths. If the second
 * directory cannot be created the first one is removed again, and the negative errno-style error is returned
 * without touching the output parameters. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Second directory failed: undo the first one, inner directory first */
        {
                char *inner;

                inner = strjoina(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
809
810int setup_netns(int netns_storage_socket[2]) {
811 _cleanup_close_ int netns = -1;
3ee897d6 812 int r, q;
613b411c
LP
813
814 assert(netns_storage_socket);
815 assert(netns_storage_socket[0] >= 0);
816 assert(netns_storage_socket[1] >= 0);
817
818 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
819 * namespace reference fd. Whatever process runs this first
820 * shall create a new namespace, all others should just join
821 * it. To serialize that we use a file lock on the socket
822 * pair.
613b411c
LP
823 *
824 * It's a bit crazy, but hey, works great! */
825
826 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
827 return -errno;
828
3ee897d6
LP
829 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
830 if (netns == -EAGAIN) {
613b411c
LP
831 /* Nothing stored yet, so let's create a new namespace */
832
833 if (unshare(CLONE_NEWNET) < 0) {
834 r = -errno;
835 goto fail;
836 }
837
838 loopback_setup();
839
840 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
841 if (netns < 0) {
842 r = -errno;
843 goto fail;
844 }
845
846 r = 1;
613b411c 847
3ee897d6
LP
848 } else if (netns < 0) {
849 r = netns;
850 goto fail;
613b411c 851
3ee897d6
LP
852 } else {
853 /* Yay, found something, so let's join the namespace */
613b411c
LP
854 if (setns(netns, CLONE_NEWNET) < 0) {
855 r = -errno;
856 goto fail;
857 }
858
859 r = 0;
860 }
861
3ee897d6
LP
862 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
863 if (q < 0) {
864 r = q;
613b411c
LP
865 goto fail;
866 }
867
868fail:
fe048ce5 869 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
870 return r;
871}
417116f2 872
1b8689f9
LP
873static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
874 [PROTECT_HOME_NO] = "no",
875 [PROTECT_HOME_YES] = "yes",
876 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
877};
878
1b8689f9
LP
879DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
880
881static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
882 [PROTECT_SYSTEM_NO] = "no",
883 [PROTECT_SYSTEM_YES] = "yes",
884 [PROTECT_SYSTEM_FULL] = "full",
3f815163 885 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
886};
887
888DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);