]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core:namespace: put paths protected by ProtectKernelTunables= in
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
7f112f50 30#include "dev-setup.h"
3ffd4af2 31#include "fd-util.h"
d944dc95 32#include "fs-util.h"
07630cea
LP
33#include "loopback-setup.h"
34#include "missing.h"
35#include "mkdir.h"
4349cd7c 36#include "mount-util.h"
3ffd4af2 37#include "namespace.h"
07630cea 38#include "path-util.h"
d7b8eec7 39#include "selinux-util.h"
2583fbea 40#include "socket-util.h"
8b43440b 41#include "string-table.h"
07630cea
LP
42#include "string-util.h"
43#include "strv.h"
affb60b1 44#include "umask-util.h"
ee104e11 45#include "user-util.h"
07630cea 46#include "util.h"
15ae422b 47
/* Mount flags used for the private /dev tmpfs, and again when that mount is remounted read-only. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
49
typedef enum MountMode {
        /* The numeric order doubles as a priority: when two entries name the same path, the sort
         * comparator puts the lower value first and the duplicate filter keeps that one. Do not
         * reorder. */
        INACCESSIBLE    = 0,
        READONLY        = 1,
        PRIVATE_TMP     = 2,
        PRIVATE_VAR_TMP = 3,
        PRIVATE_DEV     = 4,
        READWRITE       = 5,
} MountMode;
15ae422b 59
c17ec25e 60typedef struct BindMount {
d944dc95
LP
61 const char *path; /* stack memory, doesn't need to be freed explicitly */
62 char *chased; /* malloc()ed memory, needs to be freed */
c17ec25e 63 MountMode mode;
11a30cec 64 bool ignore; /* Ignore if path does not exist */
c17ec25e 65} BindMount;
15ae422b 66
11a30cec
DH
67typedef struct TargetMount {
68 const char *path;
69 MountMode mode;
70 bool ignore; /* Ignore if path does not exist */
71} TargetMount;
72
73/* ProtectKernelTunables= option and the related filesystem APIs */
74static const TargetMount protect_kernel_tunables_table[] = {
75 { "/proc/sys", READONLY, false },
76 { "/proc/sysrq-trigger", READONLY, true },
77 { "/sys", READONLY, false },
78 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
79};
80
c17ec25e 81static int append_mounts(BindMount **p, char **strv, MountMode mode) {
15ae422b
LP
82 char **i;
83
613b411c
LP
84 assert(p);
85
15ae422b 86 STRV_FOREACH(i, strv) {
9c94d52e 87 bool ignore = false;
15ae422b 88
9c94d52e 89 if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) {
ea92ae33 90 (*i)++;
9c94d52e 91 ignore = true;
ea92ae33
MW
92 }
93
15ae422b
LP
94 if (!path_is_absolute(*i))
95 return -EINVAL;
96
97 (*p)->path = *i;
98 (*p)->mode = mode;
9c94d52e 99 (*p)->ignore = ignore;
15ae422b
LP
100 (*p)++;
101 }
102
103 return 0;
104}
105
11a30cec
DH
/* Appends one BindMount entry per row of protect_kernel_tunables_table to the array cursor *p,
 * prefixing each path with root_directory, and advances *p past the entries written.
 *
 * This is deliberately a statement macro rather than a function: prefix_roota() hands out stack
 * (alloca) memory, and BindMount.path keeps pointing at it after we are done. Inside a helper
 * function that memory would be released as soon as the helper returned, leaving dangling paths
 * whenever a root directory is in use. Expanded as a macro the allocation lands in the frame of the
 * caller, which is where the BindMount array is consumed.
 *
 * The local names carry a prefix so they can neither shadow the caller's variables nor clash
 * with the temporaries prefix_roota() declares internally. Call it like a function:
 *
 *         append_protect_kernel_tunables(&m, root_directory);
 */
#define append_protect_kernel_tunables(p, root_directory)                                      \
        do {                                                                                   \
                BindMount **_tunables_cursor = (p);                                            \
                const char *_tunables_rootdir = (root_directory);                              \
                unsigned _tunables_idx;                                                        \
                                                                                               \
                assert(_tunables_cursor);                                                      \
                                                                                               \
                for (_tunables_idx = 0;                                                        \
                     _tunables_idx < ELEMENTSOF(protect_kernel_tunables_table);                \
                     _tunables_idx++) {                                                        \
                        const TargetMount *_tunables_row =                                     \
                                &protect_kernel_tunables_table[_tunables_idx];                 \
                                                                                               \
                        (*_tunables_cursor)->path =                                            \
                                prefix_roota(_tunables_rootdir, _tunables_row->path);          \
                        (*_tunables_cursor)->mode = _tunables_row->mode;                       \
                        (*_tunables_cursor)->ignore = _tunables_row->ignore;                   \
                        (*_tunables_cursor)++;                                                 \
                }                                                                              \
        } while (0)
119
c17ec25e
MS
120static int mount_path_compare(const void *a, const void *b) {
121 const BindMount *p = a, *q = b;
a0827e2b 122 int d;
15ae422b 123
6ee1a919 124 /* If the paths are not equal, then order prefixes first */
a0827e2b 125 d = path_compare(p->path, q->path);
6ee1a919
LP
126 if (d != 0)
127 return d;
15ae422b 128
6ee1a919
LP
129 /* If the paths are equal, check the mode */
130 if (p->mode < q->mode)
131 return -1;
15ae422b 132
6ee1a919
LP
133 if (p->mode > q->mode)
134 return 1;
15ae422b 135
6ee1a919 136 return 0;
15ae422b
LP
137}
138
c17ec25e
MS
139static void drop_duplicates(BindMount *m, unsigned *n) {
140 BindMount *f, *t, *previous;
15ae422b 141
c17ec25e 142 assert(m);
15ae422b 143 assert(n);
15ae422b 144
fe3c2583
LP
145 /* Drops duplicate entries. Expects that the array is properly ordered already. */
146
c17ec25e 147 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
15ae422b 148
fe3c2583
LP
149 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
150 * above. */
151 if (previous && path_equal(f->path, previous->path)) {
152 log_debug("%s is duplicate.", f->path);
15ae422b 153 continue;
fe3c2583 154 }
15ae422b 155
e2d7c1a0 156 *t = *f;
15ae422b 157 previous = t;
fe3c2583
LP
158 t++;
159 }
160
161 *n = t - m;
162}
163
164static void drop_inaccessible(BindMount *m, unsigned *n) {
165 BindMount *f, *t;
166 const char *clear = NULL;
167
168 assert(m);
169 assert(n);
170
171 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
172 * ordered already. */
173
174 for (f = m, t = m; f < m+*n; f++) {
175
176 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
177 * it, as inaccessible paths really should drop the entire subtree. */
178 if (clear && path_startswith(f->path, clear)) {
179 log_debug("%s is masked by %s.", f->path, clear);
180 continue;
181 }
15ae422b 182
fe3c2583
LP
183 clear = f->mode == INACCESSIBLE ? f->path : NULL;
184
185 *t = *f;
15ae422b
LP
186 t++;
187 }
188
c17ec25e 189 *n = t - m;
15ae422b
LP
190}
191
7648a565
LP
192static void drop_nop(BindMount *m, unsigned *n) {
193 BindMount *f, *t;
194
195 assert(m);
196 assert(n);
197
198 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
199 * list is ordered by prefixes. */
200
201 for (f = m, t = m; f < m+*n; f++) {
202
203 /* Only suppress such subtrees for READONLY and READWRITE entries */
204 if (IN_SET(f->mode, READONLY, READWRITE)) {
205 BindMount *p;
206 bool found = false;
207
208 /* Now let's find the first parent of the entry we are looking at. */
209 for (p = t-1; p >= m; p--) {
210 if (path_startswith(f->path, p->path)) {
211 found = true;
212 break;
213 }
214 }
215
216 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
217 if (found && p->mode == f->mode) {
218 log_debug("%s is redundant by %s", f->path, p->path);
219 continue;
220 }
221 }
222
223 *t = *f;
224 t++;
225 }
226
227 *n = t - m;
228}
229
cd2902c9
LP
230static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) {
231 BindMount *f, *t;
232
233 assert(m);
234 assert(n);
235
236 if (!root_directory)
237 return;
238
239 /* Drops all mounts that are outside of the root directory. */
240
241 for (f = m, t = m; f < m+*n; f++) {
242
243 if (!path_startswith(f->path, root_directory)) {
244 log_debug("%s is outside of root directory.", f->path);
245 continue;
246 }
247
248 *t = *f;
249 t++;
250 }
251
252 *n = t - m;
253}
254
7f112f50
LP
255static int mount_dev(BindMount *m) {
256 static const char devnodes[] =
257 "/dev/null\0"
258 "/dev/zero\0"
259 "/dev/full\0"
260 "/dev/random\0"
261 "/dev/urandom\0"
262 "/dev/tty\0";
263
2b85f4e1 264 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 265 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
266 _cleanup_umask_ mode_t u;
267 int r;
268
269 assert(m);
270
271 u = umask(0000);
272
2b85f4e1
LP
273 if (!mkdtemp(temporary_mount))
274 return -errno;
275
63c372cb 276 dev = strjoina(temporary_mount, "/dev");
dc751688 277 (void) mkdir(dev, 0755);
737ba3c8 278 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
279 r = -errno;
280 goto fail;
281 }
282
63c372cb 283 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 284 (void) mkdir(devpts, 0755);
2b85f4e1
LP
285 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
286 r = -errno;
287 goto fail;
288 }
289
63c372cb 290 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
291 if (symlink("pts/ptmx", devptmx) < 0) {
292 r = -errno;
293 goto fail;
294 }
e06b6479 295
63c372cb 296 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 297 (void) mkdir(devshm, 01777);
2b85f4e1
LP
298 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
299 if (r < 0) {
300 r = -errno;
301 goto fail;
302 }
303
63c372cb 304 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 305 (void) mkdir(devmqueue, 0755);
3164e3cb 306 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 307
63c372cb 308 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 309 (void) mkdir(devhugepages, 0755);
3164e3cb 310 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 311
63c372cb 312 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 313 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 314
7f112f50 315 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
316 _cleanup_free_ char *dn = NULL;
317 struct stat st;
318
319 r = stat(d, &st);
7f112f50 320 if (r < 0) {
2b85f4e1
LP
321
322 if (errno == ENOENT)
323 continue;
324
325 r = -errno;
326 goto fail;
7f112f50
LP
327 }
328
2b85f4e1
LP
329 if (!S_ISBLK(st.st_mode) &&
330 !S_ISCHR(st.st_mode)) {
331 r = -EINVAL;
332 goto fail;
333 }
334
335 if (st.st_rdev == 0)
336 continue;
337
338 dn = strappend(temporary_mount, d);
339 if (!dn) {
340 r = -ENOMEM;
341 goto fail;
342 }
343
ecabcf8b 344 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 345 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 346 mac_selinux_create_file_clear();
dd078a1e 347
2b85f4e1
LP
348 if (r < 0) {
349 r = -errno;
350 goto fail;
351 }
7f112f50
LP
352 }
353
03cfe0d5 354 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 355
ee818b89
AC
356 /* Create the /dev directory if missing. It is more likely to be
357 * missing when the service is started with RootDirectory. This is
358 * consistent with mount units creating the mount points when missing.
359 */
360 (void) mkdir_p_label(m->path, 0755);
361
9e5f8252 362 /* Unmount everything in old /dev */
363 umount_recursive(m->path, 0);
ee818b89 364 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
365 r = -errno;
366 goto fail;
367 }
7f112f50 368
2b85f4e1
LP
369 rmdir(dev);
370 rmdir(temporary_mount);
7f112f50 371
2b85f4e1 372 return 0;
7f112f50 373
2b85f4e1
LP
374fail:
375 if (devpts)
376 umount(devpts);
7f112f50 377
2b85f4e1
LP
378 if (devshm)
379 umount(devshm);
7f112f50 380
2b85f4e1
LP
381 if (devhugepages)
382 umount(devhugepages);
7f112f50 383
2b85f4e1
LP
384 if (devmqueue)
385 umount(devmqueue);
7f112f50 386
d267c5aa
ZJS
387 umount(dev);
388 rmdir(dev);
2b85f4e1 389 rmdir(temporary_mount);
7f112f50 390
2b85f4e1 391 return r;
7f112f50
LP
392}
393
ac0930c8 394static int apply_mount(
c17ec25e 395 BindMount *m,
ac0930c8 396 const char *tmp_dir,
c17ec25e 397 const char *var_tmp_dir) {
ac0930c8 398
15ae422b 399 const char *what;
15ae422b 400 int r;
15ae422b 401
c17ec25e 402 assert(m);
15ae422b 403
fe3c2583
LP
404 log_debug("Applying namespace mount on %s", m->path);
405
c17ec25e 406 switch (m->mode) {
15ae422b 407
160cfdbe
LP
408 case INACCESSIBLE: {
409 struct stat target;
6d313367
LP
410
411 /* First, get rid of everything that is below if there
412 * is anything... Then, overmount it with an
c4b41707 413 * inaccessible path. */
fe3c2583 414 (void) umount_recursive(m->path, 0);
6d313367 415
d944dc95 416 if (lstat(m->path, &target) < 0)
160cfdbe 417 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
15ae422b 418
c4b41707 419 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
420 if (!what) {
421 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
422 return -ELOOP;
423 }
424 break;
160cfdbe 425 }
fe3c2583 426
15ae422b 427 case READONLY:
15ae422b 428 case READWRITE:
6b7c9f8b
LP
429
430 r = path_is_mount_point(m->path, 0);
d944dc95 431 if (r < 0)
6b7c9f8b 432 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
6b7c9f8b
LP
433 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
434 return 0;
435
436 /* This isn't a mount point yet, let's make it one. */
437 what = m->path;
438 break;
15ae422b 439
ac0930c8
LP
440 case PRIVATE_TMP:
441 what = tmp_dir;
442 break;
443
444 case PRIVATE_VAR_TMP:
445 what = var_tmp_dir;
15ae422b 446 break;
e364ad06 447
d6797c92
LP
448 case PRIVATE_DEV:
449 return mount_dev(m);
450
e364ad06
LP
451 default:
452 assert_not_reached("Unknown mode");
15ae422b
LP
453 }
454
ac0930c8 455 assert(what);
15ae422b 456
d944dc95 457 if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
5fd7cf6f 458 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
6b7c9f8b
LP
459
460 log_debug("Successfully mounted %s to %s", what, m->path);
461 return 0;
ac0930c8 462}
15ae422b 463
6b7c9f8b
LP
464static int make_read_only(BindMount *m, char **blacklist) {
465 int r = 0;
15ae422b 466
c17ec25e 467 assert(m);
ac0930c8 468
d6797c92 469 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
6b7c9f8b
LP
470 r = bind_remount_recursive(m->path, true, blacklist);
471 else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
472 if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
473 r = -errno;
737ba3c8 474 } else
6b7c9f8b
LP
475 return 0;
476
477 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
478 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
479 * read-only mounts already applied. */
ac0930c8 480
d6797c92 481 return r;
15ae422b
LP
482}
483
d944dc95
LP
484static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
485 BindMount *f, *t;
486 int r;
487
488 assert(m);
489 assert(n);
490
491 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
492 * chase the symlinks on our own first. This call wil do so for all entries and remove all entries where we
493 * can't resolve the path, and which have been marked for such removal. */
494
495 for (f = m, t = m; f < m+*n; f++) {
496
497 r = chase_symlinks(f->path, root_directory, &f->chased);
498 if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
499 continue;
500 if (r < 0)
501 return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);
502
503 if (path_equal(f->path, f->chased))
504 f->chased = mfree(f->chased);
505 else {
506 log_debug("Chased %s → %s", f->path, f->chased);
507 f->path = f->chased;
508 }
509
510 *t = *f;
511 t++;
512 }
513
514 *n = t - m;
515 return 0;
516}
517
613b411c 518int setup_namespace(
ee818b89 519 const char* root_directory,
2a624c36
AP
520 char** read_write_paths,
521 char** read_only_paths,
522 char** inaccessible_paths,
a004cb4c
LP
523 const char* tmp_dir,
524 const char* var_tmp_dir,
7f112f50 525 bool private_dev,
59eeb84b
LP
526 bool protect_sysctl,
527 bool protect_cgroups,
1b8689f9
LP
528 ProtectHome protect_home,
529 ProtectSystem protect_system,
e6547662 530 unsigned long mount_flags) {
15ae422b 531
7ff7394d 532 BindMount *m, *mounts = NULL;
d944dc95 533 bool make_slave = false;
613b411c 534 unsigned n;
c17ec25e 535 int r = 0;
15ae422b 536
613b411c 537 if (mount_flags == 0)
c17ec25e 538 mount_flags = MS_SHARED;
ac0930c8 539
9ca6ff50 540 n = !!tmp_dir + !!var_tmp_dir +
2a624c36
AP
541 strv_length(read_write_paths) +
542 strv_length(read_only_paths) +
543 strv_length(inaccessible_paths) +
417116f2 544 private_dev +
11a30cec
DH
545 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
546 (protect_cgroups ? 1 : 0) +
3f815163
LP
547 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
548 (protect_system == PROTECT_SYSTEM_STRICT ?
549 (2 + !private_dev + !protect_sysctl) :
550 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
551 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
613b411c 552
d944dc95
LP
553 if (root_directory || n > 0)
554 make_slave = true;
555
613b411c 556 if (n > 0) {
002b2268 557 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
2a624c36 558 r = append_mounts(&m, read_write_paths, READWRITE);
613b411c
LP
559 if (r < 0)
560 return r;
561
2a624c36 562 r = append_mounts(&m, read_only_paths, READONLY);
613b411c
LP
563 if (r < 0)
564 return r;
565
2a624c36 566 r = append_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 567 if (r < 0)
7ff7394d
ZJS
568 return r;
569
613b411c 570 if (tmp_dir) {
ee818b89 571 m->path = prefix_roota(root_directory, "/tmp");
7ff7394d
ZJS
572 m->mode = PRIVATE_TMP;
573 m++;
613b411c 574 }
7ff7394d 575
613b411c 576 if (var_tmp_dir) {
ee818b89 577 m->path = prefix_roota(root_directory, "/var/tmp");
7ff7394d
ZJS
578 m->mode = PRIVATE_VAR_TMP;
579 m++;
580 }
ac0930c8 581
7f112f50 582 if (private_dev) {
ee818b89 583 m->path = prefix_roota(root_directory, "/dev");
7f112f50
LP
584 m->mode = PRIVATE_DEV;
585 m++;
586 }
587
11a30cec
DH
588 if (protect_sysctl)
589 append_protect_kernel_tunables(&m, root_directory);
59eeb84b 590
11a30cec 591 if (protect_cgroups) {
59eeb84b 592 m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
11a30cec 593 m->mode = READONLY;
59eeb84b
LP
594 m++;
595 }
596
3f815163 597 if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
ee818b89
AC
598 const char *home_dir, *run_user_dir, *root_dir;
599
3f815163
LP
600 /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
601 * strict system protection mode, then also add entries for these directories, but mark them
602 * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
603
ee818b89
AC
604 home_dir = prefix_roota(root_directory, "/home");
605 home_dir = strjoina("-", home_dir);
606 run_user_dir = prefix_roota(root_directory, "/run/user");
607 run_user_dir = strjoina("-", run_user_dir);
608 root_dir = prefix_roota(root_directory, "/root");
609 root_dir = strjoina("-", root_dir);
610
611 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
3f815163
LP
612 protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
613 protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
417116f2
LP
614 if (r < 0)
615 return r;
616 }
617
3f815163
LP
618 if (protect_system == PROTECT_SYSTEM_STRICT) {
619 /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
620 * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
621 * protect those, and these options should be fully orthogonal. (And of course /home and
622 * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
623 * above). */
624
625 m->path = prefix_roota(root_directory, "/");
626 m->mode = READONLY;
627 m++;
628
629 m->path = prefix_roota(root_directory, "/proc");
630 m->mode = READWRITE;
631 m++;
632
633 if (!private_dev) {
634 m->path = prefix_roota(root_directory, "/dev");
635 m->mode = READWRITE;
636 m++;
637 }
638 if (!protect_sysctl) {
639 m->path = prefix_roota(root_directory, "/sys");
640 m->mode = READWRITE;
641 m++;
642 }
643
644 } else if (protect_system != PROTECT_SYSTEM_NO) {
645 const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
646
647 /* In any other mode we simply mark the relevant three directories ready-only. */
ee818b89 648
d38e01dc 649 usr_dir = prefix_roota(root_directory, "/usr");
ee818b89
AC
650 boot_dir = prefix_roota(root_directory, "/boot");
651 boot_dir = strjoina("-", boot_dir);
3f815163
LP
652 efi_dir = prefix_roota(root_directory, "/efi");
653 efi_dir = strjoina("-", efi_dir);
ee818b89
AC
654 etc_dir = prefix_roota(root_directory, "/etc");
655
656 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
3f815163
LP
657 ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
658 : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
417116f2
LP
659 if (r < 0)
660 return r;
661 }
662
7ff7394d 663 assert(mounts + n == m);
ac0930c8 664
d944dc95
LP
665 /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
666 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
667 * racy. */
668 r = chase_all_symlinks(root_directory, mounts, &n);
669 if (r < 0)
670 goto finish;
671
7ff7394d 672 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
fe3c2583 673
7ff7394d 674 drop_duplicates(mounts, &n);
cd2902c9 675 drop_outside_root(root_directory, mounts, &n);
fe3c2583 676 drop_inaccessible(mounts, &n);
7648a565 677 drop_nop(mounts, &n);
15ae422b
LP
678 }
679
d944dc95
LP
680 if (unshare(CLONE_NEWNS) < 0) {
681 r = -errno;
682 goto finish;
683 }
1e4e94c8 684
d944dc95 685 if (make_slave) {
c2c13f2d
LP
686 /* Remount / as SLAVE so that nothing now mounted in the namespace
687 shows up in the parent */
d944dc95
LP
688 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
689 r = -errno;
690 goto finish;
691 }
ee818b89
AC
692 }
693
694 if (root_directory) {
8f1ad200
LP
695 /* Turn directory into bind mount, if it isn't one yet */
696 r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW);
697 if (r < 0)
d944dc95 698 goto finish;
8f1ad200
LP
699 if (r == 0) {
700 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
701 r = -errno;
702 goto finish;
703 }
d944dc95 704 }
ee818b89 705 }
c2c13f2d 706
ee818b89 707 if (n > 0) {
6b7c9f8b
LP
708 char **blacklist;
709 unsigned j;
710
711 /* First round, add in all special mounts we need */
c2c13f2d
LP
712 for (m = mounts; m < mounts + n; ++m) {
713 r = apply_mount(m, tmp_dir, var_tmp_dir);
714 if (r < 0)
d944dc95 715 goto finish;
c2c13f2d 716 }
15ae422b 717
6b7c9f8b
LP
718 /* Create a blacklist we can pass to bind_mount_recursive() */
719 blacklist = newa(char*, n+1);
720 for (j = 0; j < n; j++)
721 blacklist[j] = (char*) mounts[j].path;
722 blacklist[j] = NULL;
723
724 /* Second round, flip the ro bits if necessary. */
c2c13f2d 725 for (m = mounts; m < mounts + n; ++m) {
6b7c9f8b 726 r = make_read_only(m, blacklist);
c2c13f2d 727 if (r < 0)
d944dc95 728 goto finish;
c2c13f2d 729 }
15ae422b
LP
730 }
731
ee818b89
AC
732 if (root_directory) {
733 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
734 r = mount_move_root(root_directory);
d944dc95
LP
735 if (r < 0)
736 goto finish;
ee818b89
AC
737 }
738
c2c13f2d
LP
739 /* Remount / as the desired mode. Not that this will not
740 * reestablish propagation from our side to the host, since
741 * what's disconnected is disconnected. */
d944dc95
LP
742 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
743 r = -errno;
744 goto finish;
745 }
15ae422b 746
d944dc95 747 r = 0;
15ae422b 748
d944dc95
LP
749finish:
750 for (m = mounts; m < mounts + n; m++)
751 free(m->chased);
613b411c
LP
752
753 return r;
754}
755
756static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
757 _cleanup_free_ char *x = NULL;
6b46ea73
LP
758 char bid[SD_ID128_STRING_MAX];
759 sd_id128_t boot_id;
760 int r;
613b411c
LP
761
762 assert(id);
763 assert(prefix);
764 assert(path);
765
6b46ea73
LP
766 /* We include the boot id in the directory so that after a
767 * reboot we can easily identify obsolete directories. */
768
769 r = sd_id128_get_boot(&boot_id);
770 if (r < 0)
771 return r;
772
773 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
613b411c
LP
774 if (!x)
775 return -ENOMEM;
776
777 RUN_WITH_UMASK(0077)
778 if (!mkdtemp(x))
779 return -errno;
780
781 RUN_WITH_UMASK(0000) {
782 char *y;
783
63c372cb 784 y = strjoina(x, "/tmp");
613b411c
LP
785
786 if (mkdir(y, 0777 | S_ISVTX) < 0)
787 return -errno;
c17ec25e 788 }
15ae422b 789
613b411c
LP
790 *path = x;
791 x = NULL;
792
793 return 0;
794}
795
/* Creates the private temporary directories below /tmp and /var/tmp for the given id.
 *
 * On success 0 is returned and the newly allocated paths are stored in *tmp_dir and *var_tmp_dir.
 * On failure a negative errno-style error is returned and neither output is written; if only the
 * second directory could not be created, the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *first, *second;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &first);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &second);
        if (r >= 0) {
                *tmp_dir = first;
                *var_tmp_dir = second;
                return 0;
        }

        /* Roll back the first directory: the inner "tmp" has to go before the outer one. */
        inner = strjoina(first, "/tmp");
        rmdir(inner);
        rmdir(first);

        free(first);
        return r;
}
825
826int setup_netns(int netns_storage_socket[2]) {
827 _cleanup_close_ int netns = -1;
3ee897d6 828 int r, q;
613b411c
LP
829
830 assert(netns_storage_socket);
831 assert(netns_storage_socket[0] >= 0);
832 assert(netns_storage_socket[1] >= 0);
833
834 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
835 * namespace reference fd. Whatever process runs this first
836 * shall create a new namespace, all others should just join
837 * it. To serialize that we use a file lock on the socket
838 * pair.
613b411c
LP
839 *
840 * It's a bit crazy, but hey, works great! */
841
842 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
843 return -errno;
844
3ee897d6
LP
845 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
846 if (netns == -EAGAIN) {
613b411c
LP
847 /* Nothing stored yet, so let's create a new namespace */
848
849 if (unshare(CLONE_NEWNET) < 0) {
850 r = -errno;
851 goto fail;
852 }
853
854 loopback_setup();
855
856 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
857 if (netns < 0) {
858 r = -errno;
859 goto fail;
860 }
861
862 r = 1;
613b411c 863
3ee897d6
LP
864 } else if (netns < 0) {
865 r = netns;
866 goto fail;
613b411c 867
3ee897d6
LP
868 } else {
869 /* Yay, found something, so let's join the namespace */
613b411c
LP
870 if (setns(netns, CLONE_NEWNET) < 0) {
871 r = -errno;
872 goto fail;
873 }
874
875 r = 0;
876 }
877
3ee897d6
LP
878 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
879 if (q < 0) {
880 r = q;
613b411c
LP
881 goto fail;
882 }
883
884fail:
fe048ce5 885 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
886 return r;
887}
417116f2 888
1b8689f9
LP
889static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
890 [PROTECT_HOME_NO] = "no",
891 [PROTECT_HOME_YES] = "yes",
892 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
893};
894
1b8689f9
LP
895DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
896
897static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
898 [PROTECT_SYSTEM_NO] = "no",
899 [PROTECT_SYSTEM_YES] = "yes",
900 [PROTECT_SYSTEM_FULL] = "full",
3f815163 901 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
902};
903
904DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);