]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core: rename BindMount structure → MountEntry
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
7f112f50 30#include "dev-setup.h"
3ffd4af2 31#include "fd-util.h"
d944dc95 32#include "fs-util.h"
07630cea
LP
33#include "loopback-setup.h"
34#include "missing.h"
35#include "mkdir.h"
4349cd7c 36#include "mount-util.h"
3ffd4af2 37#include "namespace.h"
07630cea 38#include "path-util.h"
d7b8eec7 39#include "selinux-util.h"
2583fbea 40#include "socket-util.h"
8b43440b 41#include "string-table.h"
07630cea
LP
42#include "string-util.h"
43#include "strv.h"
affb60b1 44#include "umask-util.h"
ee104e11 45#include "user-util.h"
07630cea 46#include "util.h"
15ae422b 47
/* Mount flags for the private /dev tmpfs: used when it is first mounted in mount_dev() and again when it is
 * remounted read-only in make_read_only(). */
#define DEV_MOUNT_OPTIONS (MS_NOEXEC|MS_STRICTATIME|MS_NOSUID)
49
/* How a path is treated inside the namespace. The numeric order doubles as a priority: for two entries with
 * the same path, mount_path_compare() sorts the one with the lower value first. */
typedef enum MountMode {
        INACCESSIBLE    = 0,
        READONLY        = 1,
        PRIVATE_TMP     = 2,
        PRIVATE_VAR_TMP = 3,
        PRIVATE_DEV     = 4,
        READWRITE       = 5,
} MountMode;
15ae422b 59
34de407a 60typedef struct MountEntry {
5327c910 61 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 62 MountMode mode:5;
5327c910
LP
63 bool ignore:1; /* Ignore if path does not exist? */
64 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 65 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 66 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
34de407a 67} MountEntry;
15ae422b 68
f471b2af
DH
/*
 * The following Protect tables protect paths by marking some of them READONLY.
 * If a path is covered by an option from another table, it is marked READWRITE
 * in the current one and the more restrictive mode is applied from that other
 * table. This way all options can be combined in a safe and comprehensible way
 * for users.
 */
76
11a30cec 77/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 78static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
79 { "/proc/sys", READONLY, false },
80 { "/proc/sysrq-trigger", READONLY, true },
81 { "/proc/latency_stats", READONLY, true },
82 { "/proc/mtrr", READONLY, true },
aa70f38b 83 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
84 { "/proc/acpi", READONLY, true },
85 { "/proc/timer_stats", READONLY, true },
86 { "/proc/asound", READONLY, true },
87 { "/proc/bus", READONLY, true },
88 { "/proc/fs", READONLY, true },
89 { "/proc/irq", READONLY, true },
90 { "/sys", READONLY, false },
91 { "/sys/kernel/debug", READONLY, true },
92 { "/sys/kernel/tracing", READONLY, true },
93 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
11a30cec
DH
94};
95
c575770b 96/* ProtectKernelModules= option */
34de407a 97static const MountEntry protect_kernel_modules_table[] = {
c575770b 98#ifdef HAVE_SPLIT_USR
c6232fb0 99 { "/lib/modules", INACCESSIBLE, true },
c575770b 100#endif
c6232fb0 101 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
102};
103
b6c432ca
DH
104/*
105 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
106 * system should be protected by ProtectSystem=
107 */
34de407a 108static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
109 { "/home", READONLY, true },
110 { "/run/user", READONLY, true },
111 { "/root", READONLY, true },
b6c432ca
DH
112};
113
114/* ProtectHome=yes table */
34de407a 115static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
116 { "/home", INACCESSIBLE, true },
117 { "/run/user", INACCESSIBLE, true },
118 { "/root", INACCESSIBLE, true },
b6c432ca
DH
119};
120
f471b2af 121/* ProtectSystem=yes table */
34de407a 122static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
123 { "/usr", READONLY, false },
124 { "/boot", READONLY, true },
125 { "/efi", READONLY, true },
f471b2af
DH
126};
127
128/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 129static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
130 { "/usr", READONLY, false },
131 { "/boot", READONLY, true },
132 { "/efi", READONLY, true },
133 { "/etc", READONLY, false },
f471b2af
DH
134};
135
136/*
137 * ProtectSystem=strict table. In this strict mode, we mount everything
138 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
139 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
140 * protect those, and these options should be fully orthogonal.
141 * (And of course /home and friends are also left writable, as ProtectHome=
142 * shall manage those, orthogonally).
143 */
34de407a 144static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
145 { "/", READONLY, false },
146 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
147 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
148 { "/dev", READWRITE, false }, /* PrivateDevices= */
149 { "/home", READWRITE, true }, /* ProtectHome= */
150 { "/run/user", READWRITE, true }, /* ProtectHome= */
151 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
152};
153
34de407a 154static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
155 assert(p);
156
5327c910
LP
157 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
158 * otherwise the stack/static ->path field is returned. */
f0a4feb0 159
5327c910 160 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
161}
162
34de407a 163static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
164 assert(p);
165
166 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
167}
168
34de407a 169static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
170 char **i;
171
613b411c
LP
172 assert(p);
173
5327c910
LP
174 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
175
15ae422b 176 STRV_FOREACH(i, strv) {
5327c910
LP
177 bool ignore = false, needs_prefix = false;
178 const char *e = *i;
15ae422b 179
5327c910
LP
180 /* Look for any prefixes */
181 if (startswith(e, "-")) {
182 e++;
9c94d52e 183 ignore = true;
ea92ae33 184 }
5327c910
LP
185 if (startswith(e, "+")) {
186 e++;
187 needs_prefix = true;
188 }
ea92ae33 189
5327c910 190 if (!path_is_absolute(e))
15ae422b
LP
191 return -EINVAL;
192
34de407a 193 *((*p)++) = (MountEntry) {
5327c910
LP
194 .path_const = e,
195 .mode = mode,
196 .ignore = ignore,
197 .has_prefix = !needs_prefix,
198 };
15ae422b
LP
199 }
200
201 return 0;
202}
203
34de407a 204static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 205 unsigned i;
11a30cec
DH
206
207 assert(p);
f471b2af 208 assert(mounts);
11a30cec 209
5327c910 210 /* Adds a list of static pre-defined entries */
f471b2af 211
5327c910 212 for (i = 0; i < n; i++)
34de407a
LP
213 *((*p)++) = (MountEntry) {
214 .path_const = mount_entry_path(mounts+i),
5327c910
LP
215 .mode = mounts[i].mode,
216 .ignore = mounts[i].ignore || ignore_protect,
217 };
f471b2af
DH
218
219 return 0;
220}
221
34de407a 222static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
223 assert(p);
224
5327c910 225 switch (protect_home) {
b6c432ca 226
5327c910 227 case PROTECT_HOME_NO:
b6c432ca
DH
228 return 0;
229
b6c432ca 230 case PROTECT_HOME_READ_ONLY:
5327c910
LP
231 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
232
b6c432ca 233 case PROTECT_HOME_YES:
5327c910
LP
234 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
235
b6c432ca 236 default:
5327c910 237 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 238 }
b6c432ca
DH
239}
240
34de407a 241static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
242 assert(p);
243
5327c910
LP
244 switch (protect_system) {
245
246 case PROTECT_SYSTEM_NO:
f471b2af
DH
247 return 0;
248
f471b2af 249 case PROTECT_SYSTEM_STRICT:
5327c910
LP
250 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
251
f471b2af 252 case PROTECT_SYSTEM_YES:
5327c910
LP
253 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
254
f471b2af 255 case PROTECT_SYSTEM_FULL:
5327c910
LP
256 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
257
f471b2af 258 default:
5327c910 259 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 260 }
11a30cec
DH
261}
262
c17ec25e 263static int mount_path_compare(const void *a, const void *b) {
34de407a 264 const MountEntry *p = a, *q = b;
a0827e2b 265 int d;
15ae422b 266
6ee1a919 267 /* If the paths are not equal, then order prefixes first */
34de407a 268 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
269 if (d != 0)
270 return d;
15ae422b 271
6ee1a919
LP
272 /* If the paths are equal, check the mode */
273 if (p->mode < q->mode)
274 return -1;
15ae422b 275
6ee1a919
LP
276 if (p->mode > q->mode)
277 return 1;
15ae422b 278
6ee1a919 279 return 0;
15ae422b
LP
280}
281
34de407a 282static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
283 unsigned i;
284
285 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
286 * that. */
287
288 if (!root_directory)
289 return 0;
290
291 for (i = 0; i < n; i++) {
292 char *s;
293
294 if (m[i].has_prefix)
295 continue;
296
34de407a 297 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
298 if (!s)
299 return -ENOMEM;
300
301 free(m[i].path_malloc);
302 m[i].path_malloc = s;
303
304 m[i].has_prefix = true;
305 }
306
307 return 0;
308}
309
34de407a
LP
310static void drop_duplicates(MountEntry *m, unsigned *n) {
311 MountEntry *f, *t, *previous;
15ae422b 312
c17ec25e 313 assert(m);
15ae422b 314 assert(n);
15ae422b 315
fe3c2583
LP
316 /* Drops duplicate entries. Expects that the array is properly ordered already. */
317
1d54cd5d 318 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 319
fe3c2583
LP
320 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
321 * above. */
34de407a
LP
322 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
323 log_debug("%s is duplicate.", mount_entry_path(f));
324 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
5327c910 325 f->path_malloc = mfree(f->path_malloc);
15ae422b 326 continue;
fe3c2583 327 }
15ae422b 328
e2d7c1a0 329 *t = *f;
15ae422b 330 previous = t;
fe3c2583
LP
331 t++;
332 }
333
334 *n = t - m;
335}
336
34de407a
LP
337static void drop_inaccessible(MountEntry *m, unsigned *n) {
338 MountEntry *f, *t;
fe3c2583
LP
339 const char *clear = NULL;
340
341 assert(m);
342 assert(n);
343
344 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
345 * ordered already. */
346
1d54cd5d 347 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
348
349 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
350 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
351 if (clear && path_startswith(mount_entry_path(f), clear)) {
352 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
5327c910 353 f->path_malloc = mfree(f->path_malloc);
fe3c2583
LP
354 continue;
355 }
15ae422b 356
34de407a 357 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
358
359 *t = *f;
15ae422b
LP
360 t++;
361 }
362
c17ec25e 363 *n = t - m;
15ae422b
LP
364}
365
34de407a
LP
366static void drop_nop(MountEntry *m, unsigned *n) {
367 MountEntry *f, *t;
7648a565
LP
368
369 assert(m);
370 assert(n);
371
372 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
373 * list is ordered by prefixes. */
374
1d54cd5d 375 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
376
377 /* Only suppress such subtrees for READONLY and READWRITE entries */
378 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 379 MountEntry *p;
7648a565
LP
380 bool found = false;
381
382 /* Now let's find the first parent of the entry we are looking at. */
383 for (p = t-1; p >= m; p--) {
34de407a 384 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
385 found = true;
386 break;
387 }
388 }
389
390 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
391 if (found && p->mode == f->mode) {
34de407a 392 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
5327c910 393 f->path_malloc = mfree(f->path_malloc);
7648a565
LP
394 continue;
395 }
396 }
397
398 *t = *f;
399 t++;
400 }
401
402 *n = t - m;
403}
404
34de407a
LP
405static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
406 MountEntry *f, *t;
cd2902c9
LP
407
408 assert(m);
409 assert(n);
410
1d54cd5d 411 /* Nothing to do */
cd2902c9
LP
412 if (!root_directory)
413 return;
414
415 /* Drops all mounts that are outside of the root directory. */
416
1d54cd5d 417 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 418
34de407a
LP
419 if (!path_startswith(mount_entry_path(f), root_directory)) {
420 log_debug("%s is outside of root directory.", mount_entry_path(f));
5327c910 421 f->path_malloc = mfree(f->path_malloc);
cd2902c9
LP
422 continue;
423 }
424
425 *t = *f;
426 t++;
427 }
428
429 *n = t - m;
430}
431
34de407a 432static int mount_dev(MountEntry *m) {
7f112f50
LP
433 static const char devnodes[] =
434 "/dev/null\0"
435 "/dev/zero\0"
436 "/dev/full\0"
437 "/dev/random\0"
438 "/dev/urandom\0"
439 "/dev/tty\0";
440
2b85f4e1 441 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 442 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
443 _cleanup_umask_ mode_t u;
444 int r;
445
446 assert(m);
447
448 u = umask(0000);
449
2b85f4e1
LP
450 if (!mkdtemp(temporary_mount))
451 return -errno;
452
63c372cb 453 dev = strjoina(temporary_mount, "/dev");
dc751688 454 (void) mkdir(dev, 0755);
737ba3c8 455 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
456 r = -errno;
457 goto fail;
458 }
459
63c372cb 460 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 461 (void) mkdir(devpts, 0755);
2b85f4e1
LP
462 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
463 r = -errno;
464 goto fail;
465 }
466
63c372cb 467 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
468 if (symlink("pts/ptmx", devptmx) < 0) {
469 r = -errno;
470 goto fail;
471 }
e06b6479 472
63c372cb 473 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 474 (void) mkdir(devshm, 01777);
2b85f4e1
LP
475 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
476 if (r < 0) {
477 r = -errno;
478 goto fail;
479 }
480
63c372cb 481 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 482 (void) mkdir(devmqueue, 0755);
3164e3cb 483 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 484
63c372cb 485 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 486 (void) mkdir(devhugepages, 0755);
3164e3cb 487 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 488
63c372cb 489 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 490 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 491
7f112f50 492 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
493 _cleanup_free_ char *dn = NULL;
494 struct stat st;
495
496 r = stat(d, &st);
7f112f50 497 if (r < 0) {
2b85f4e1
LP
498
499 if (errno == ENOENT)
500 continue;
501
502 r = -errno;
503 goto fail;
7f112f50
LP
504 }
505
2b85f4e1
LP
506 if (!S_ISBLK(st.st_mode) &&
507 !S_ISCHR(st.st_mode)) {
508 r = -EINVAL;
509 goto fail;
510 }
511
512 if (st.st_rdev == 0)
513 continue;
514
515 dn = strappend(temporary_mount, d);
516 if (!dn) {
517 r = -ENOMEM;
518 goto fail;
519 }
520
ecabcf8b 521 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 522 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 523 mac_selinux_create_file_clear();
dd078a1e 524
2b85f4e1
LP
525 if (r < 0) {
526 r = -errno;
527 goto fail;
528 }
7f112f50
LP
529 }
530
03cfe0d5 531 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 532
ee818b89
AC
533 /* Create the /dev directory if missing. It is more likely to be
534 * missing when the service is started with RootDirectory. This is
535 * consistent with mount units creating the mount points when missing.
536 */
34de407a 537 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 538
9e5f8252 539 /* Unmount everything in old /dev */
34de407a
LP
540 umount_recursive(mount_entry_path(m), 0);
541 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
542 r = -errno;
543 goto fail;
544 }
7f112f50 545
2b85f4e1
LP
546 rmdir(dev);
547 rmdir(temporary_mount);
7f112f50 548
2b85f4e1 549 return 0;
7f112f50 550
2b85f4e1
LP
551fail:
552 if (devpts)
553 umount(devpts);
7f112f50 554
2b85f4e1
LP
555 if (devshm)
556 umount(devshm);
7f112f50 557
2b85f4e1
LP
558 if (devhugepages)
559 umount(devhugepages);
7f112f50 560
2b85f4e1
LP
561 if (devmqueue)
562 umount(devmqueue);
7f112f50 563
d267c5aa
ZJS
564 umount(dev);
565 rmdir(dev);
2b85f4e1 566 rmdir(temporary_mount);
7f112f50 567
2b85f4e1 568 return r;
7f112f50
LP
569}
570
ac0930c8 571static int apply_mount(
34de407a 572 MountEntry *m,
ac0930c8 573 const char *tmp_dir,
c17ec25e 574 const char *var_tmp_dir) {
ac0930c8 575
15ae422b 576 const char *what;
15ae422b 577 int r;
15ae422b 578
c17ec25e 579 assert(m);
15ae422b 580
34de407a 581 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 582
c17ec25e 583 switch (m->mode) {
15ae422b 584
160cfdbe
LP
585 case INACCESSIBLE: {
586 struct stat target;
6d313367
LP
587
588 /* First, get rid of everything that is below if there
589 * is anything... Then, overmount it with an
c4b41707 590 * inaccessible path. */
34de407a 591 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 592
34de407a
LP
593 if (lstat(mount_entry_path(m), &target) < 0)
594 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 595
c4b41707 596 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
597 if (!what) {
598 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
599 return -ELOOP;
600 }
601 break;
160cfdbe 602 }
fe3c2583 603
15ae422b 604 case READONLY:
15ae422b 605 case READWRITE:
6b7c9f8b 606
34de407a 607 r = path_is_mount_point(mount_entry_path(m), NULL, 0);
d944dc95 608 if (r < 0)
34de407a 609 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
610 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
611 return 0;
6b7c9f8b 612 /* This isn't a mount point yet, let's make it one. */
34de407a 613 what = mount_entry_path(m);
6b7c9f8b 614 break;
15ae422b 615
ac0930c8
LP
616 case PRIVATE_TMP:
617 what = tmp_dir;
618 break;
619
620 case PRIVATE_VAR_TMP:
621 what = var_tmp_dir;
15ae422b 622 break;
e364ad06 623
d6797c92
LP
624 case PRIVATE_DEV:
625 return mount_dev(m);
626
e364ad06
LP
627 default:
628 assert_not_reached("Unknown mode");
15ae422b
LP
629 }
630
ac0930c8 631 assert(what);
15ae422b 632
34de407a
LP
633 if (mount(what, mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
634 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
6b7c9f8b 635
34de407a 636 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 637 return 0;
ac0930c8 638}
15ae422b 639
34de407a 640static int make_read_only(MountEntry *m, char **blacklist) {
6b7c9f8b 641 int r = 0;
15ae422b 642
c17ec25e 643 assert(m);
ac0930c8 644
34de407a
LP
645 if (mount_entry_read_only(m))
646 r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
6b7c9f8b 647 else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
34de407a 648 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 649 r = -errno;
737ba3c8 650 } else
6b7c9f8b
LP
651 return 0;
652
653 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
654 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
655 * read-only mounts already applied. */
ac0930c8 656
d6797c92 657 return r;
15ae422b
LP
658}
659
1d54cd5d 660/* Chase symlinks and remove failed paths from mounts */
34de407a
LP
661static int chase_all_symlinks(const char *root_directory, MountEntry *m, unsigned *n) {
662 MountEntry *f, *t;
1d54cd5d 663 int r = 0;
d944dc95
LP
664
665 assert(m);
666 assert(n);
667
668 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
669 * chase the symlinks on our own first. This call wil do so for all entries and remove all entries where we
670 * can't resolve the path, and which have been marked for such removal. */
671
49fedb40 672 for (f = m, t = m; f < m + *n; f++) {
f0a4feb0 673 _cleanup_free_ char *chased = NULL;
5327c910 674 int k;
49fedb40 675
34de407a 676 k = chase_symlinks(mount_entry_path(f), root_directory, 0, &chased);
1d54cd5d
DH
677 if (k < 0) {
678 /* Get only real errors */
679 if (r >= 0 && (k != -ENOENT || !f->ignore))
680 r = k;
681
1d54cd5d 682 /* Doesn't exist or failed? Then remove it and continue! */
34de407a 683 log_debug_errno(k, "Failed to chase symlinks for %s: %m", mount_entry_path(f));
5327c910 684 f->path_malloc = mfree(f->path_malloc);
d944dc95 685 continue;
f0a4feb0 686 }
d944dc95 687
34de407a
LP
688 if (!path_equal(mount_entry_path(f), chased)) {
689 log_debug("Chased %s → %s", mount_entry_path(f), chased);
5327c910
LP
690
691 free(f->path_malloc);
692 f->path_malloc = chased;
693 chased = NULL;
d944dc95
LP
694 }
695
696 *t = *f;
697 t++;
698 }
699
700 *n = t - m;
1d54cd5d 701 return r;
d944dc95
LP
702}
703
2652c6c1 704static unsigned namespace_calculate_mounts(
c575770b 705 const NameSpaceInfo *ns_info,
2652c6c1
DH
706 char** read_write_paths,
707 char** read_only_paths,
708 char** inaccessible_paths,
709 const char* tmp_dir,
710 const char* var_tmp_dir,
2652c6c1
DH
711 ProtectHome protect_home,
712 ProtectSystem protect_system) {
713
b6c432ca 714 unsigned protect_home_cnt;
f471b2af
DH
715 unsigned protect_system_cnt =
716 (protect_system == PROTECT_SYSTEM_STRICT ?
717 ELEMENTSOF(protect_system_strict_table) :
718 ((protect_system == PROTECT_SYSTEM_FULL) ?
719 ELEMENTSOF(protect_system_full_table) :
720 ((protect_system == PROTECT_SYSTEM_YES) ?
721 ELEMENTSOF(protect_system_yes_table) : 0)));
722
b6c432ca
DH
723 protect_home_cnt =
724 (protect_home == PROTECT_HOME_YES ?
725 ELEMENTSOF(protect_home_yes_table) :
726 ((protect_home == PROTECT_HOME_READ_ONLY) ?
727 ELEMENTSOF(protect_home_read_only_table) : 0));
728
2652c6c1
DH
729 return !!tmp_dir + !!var_tmp_dir +
730 strv_length(read_write_paths) +
731 strv_length(read_only_paths) +
732 strv_length(inaccessible_paths) +
c575770b
DH
733 ns_info->private_dev +
734 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
735 (ns_info->protect_control_groups ? 1 : 0) +
736 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
b6c432ca 737 protect_home_cnt + protect_system_cnt;
2652c6c1
DH
738}
739
613b411c 740int setup_namespace(
ee818b89 741 const char* root_directory,
c575770b 742 const NameSpaceInfo *ns_info,
2a624c36
AP
743 char** read_write_paths,
744 char** read_only_paths,
745 char** inaccessible_paths,
a004cb4c
LP
746 const char* tmp_dir,
747 const char* var_tmp_dir,
1b8689f9
LP
748 ProtectHome protect_home,
749 ProtectSystem protect_system,
e6547662 750 unsigned long mount_flags) {
15ae422b 751
34de407a 752 MountEntry *m, *mounts = NULL;
d944dc95 753 bool make_slave = false;
f0a4feb0 754 unsigned n_mounts;
c17ec25e 755 int r = 0;
15ae422b 756
613b411c 757 if (mount_flags == 0)
c17ec25e 758 mount_flags = MS_SHARED;
ac0930c8 759
cfbeb4ef
LP
760 n_mounts = namespace_calculate_mounts(
761 ns_info,
762 read_write_paths,
763 read_only_paths,
764 inaccessible_paths,
765 tmp_dir, var_tmp_dir,
766 protect_home, protect_system);
613b411c 767
2652c6c1 768 /* Set mount slave mode */
f0a4feb0 769 if (root_directory || n_mounts > 0)
d944dc95
LP
770 make_slave = true;
771
f0a4feb0 772 if (n_mounts > 0) {
34de407a 773 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 774 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 775 if (r < 0)
f0a4feb0 776 goto finish;
613b411c 777
5327c910 778 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 779 if (r < 0)
f0a4feb0 780 goto finish;
613b411c 781
5327c910 782 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 783 if (r < 0)
f0a4feb0 784 goto finish;
7ff7394d 785
613b411c 786 if (tmp_dir) {
34de407a 787 *(m++) = (MountEntry) {
5327c910
LP
788 .path_const = "/tmp",
789 .mode = PRIVATE_TMP,
790 };
613b411c 791 }
7ff7394d 792
613b411c 793 if (var_tmp_dir) {
34de407a 794 *(m++) = (MountEntry) {
5327c910
LP
795 .path_const = "/var/tmp",
796 .mode = PRIVATE_VAR_TMP,
797 };
7ff7394d 798 }
ac0930c8 799
c575770b 800 if (ns_info->private_dev) {
34de407a 801 *(m++) = (MountEntry) {
5327c910
LP
802 .path_const = "/dev",
803 .mode = PRIVATE_DEV,
804 };
7f112f50
LP
805 }
806
c575770b 807 if (ns_info->protect_kernel_tunables) {
5327c910 808 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 809 if (r < 0)
f0a4feb0 810 goto finish;
c575770b
DH
811 }
812
813 if (ns_info->protect_kernel_modules) {
5327c910 814 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 815 if (r < 0)
f0a4feb0 816 goto finish;
c575770b 817 }
59eeb84b 818
c575770b 819 if (ns_info->protect_control_groups) {
34de407a 820 *(m++) = (MountEntry) {
5327c910
LP
821 .path_const = "/sys/fs/cgroup",
822 .mode = READONLY,
823 };
59eeb84b
LP
824 }
825
5327c910 826 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 827 if (r < 0)
f0a4feb0 828 goto finish;
417116f2 829
5327c910 830 r = append_protect_system(&m, protect_system, false);
f471b2af 831 if (r < 0)
f0a4feb0 832 goto finish;
417116f2 833
f0a4feb0 834 assert(mounts + n_mounts == m);
ac0930c8 835
5327c910
LP
836 /* Prepend the root directory where that's necessary */
837 r = prefix_where_needed(mounts, n_mounts, root_directory);
838 if (r < 0)
839 goto finish;
840
d944dc95
LP
841 /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
842 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
843 * racy. */
f0a4feb0 844 r = chase_all_symlinks(root_directory, mounts, &n_mounts);
d944dc95
LP
845 if (r < 0)
846 goto finish;
847
34de407a 848 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 849
f0a4feb0
DH
850 drop_duplicates(mounts, &n_mounts);
851 drop_outside_root(root_directory, mounts, &n_mounts);
852 drop_inaccessible(mounts, &n_mounts);
853 drop_nop(mounts, &n_mounts);
15ae422b
LP
854 }
855
d944dc95
LP
856 if (unshare(CLONE_NEWNS) < 0) {
857 r = -errno;
858 goto finish;
859 }
1e4e94c8 860
d944dc95 861 if (make_slave) {
c2c13f2d
LP
862 /* Remount / as SLAVE so that nothing now mounted in the namespace
863 shows up in the parent */
d944dc95
LP
864 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
865 r = -errno;
866 goto finish;
867 }
ee818b89
AC
868 }
869
870 if (root_directory) {
8f1ad200 871 /* Turn directory into bind mount, if it isn't one yet */
e1873695 872 r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 873 if (r < 0)
d944dc95 874 goto finish;
8f1ad200
LP
875 if (r == 0) {
876 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
877 r = -errno;
878 goto finish;
879 }
d944dc95 880 }
ee818b89 881 }
c2c13f2d 882
f0a4feb0 883 if (n_mounts > 0) {
6b7c9f8b
LP
884 char **blacklist;
885 unsigned j;
886
887 /* First round, add in all special mounts we need */
f0a4feb0 888 for (m = mounts; m < mounts + n_mounts; ++m) {
c2c13f2d
LP
889 r = apply_mount(m, tmp_dir, var_tmp_dir);
890 if (r < 0)
d944dc95 891 goto finish;
c2c13f2d 892 }
15ae422b 893
6b7c9f8b 894 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
895 blacklist = newa(char*, n_mounts+1);
896 for (j = 0; j < n_mounts; j++)
34de407a 897 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
898 blacklist[j] = NULL;
899
900 /* Second round, flip the ro bits if necessary. */
f0a4feb0 901 for (m = mounts; m < mounts + n_mounts; ++m) {
6b7c9f8b 902 r = make_read_only(m, blacklist);
c2c13f2d 903 if (r < 0)
d944dc95 904 goto finish;
c2c13f2d 905 }
15ae422b
LP
906 }
907
ee818b89
AC
908 if (root_directory) {
909 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
910 r = mount_move_root(root_directory);
d944dc95
LP
911 if (r < 0)
912 goto finish;
ee818b89
AC
913 }
914
c2c13f2d
LP
915 /* Remount / as the desired mode. Not that this will not
916 * reestablish propagation from our side to the host, since
917 * what's disconnected is disconnected. */
d944dc95
LP
918 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
919 r = -errno;
920 goto finish;
921 }
15ae422b 922
d944dc95 923 r = 0;
15ae422b 924
d944dc95 925finish:
f0a4feb0 926 for (m = mounts; m < mounts + n_mounts; m++)
5327c910 927 free(m->path_malloc);
613b411c
LP
928
929 return r;
930}
931
932static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
933 _cleanup_free_ char *x = NULL;
6b46ea73
LP
934 char bid[SD_ID128_STRING_MAX];
935 sd_id128_t boot_id;
936 int r;
613b411c
LP
937
938 assert(id);
939 assert(prefix);
940 assert(path);
941
6b46ea73
LP
942 /* We include the boot id in the directory so that after a
943 * reboot we can easily identify obsolete directories. */
944
945 r = sd_id128_get_boot(&boot_id);
946 if (r < 0)
947 return r;
948
605405c6 949 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
950 if (!x)
951 return -ENOMEM;
952
953 RUN_WITH_UMASK(0077)
954 if (!mkdtemp(x))
955 return -errno;
956
957 RUN_WITH_UMASK(0000) {
958 char *y;
959
63c372cb 960 y = strjoina(x, "/tmp");
613b411c
LP
961
962 if (mkdir(y, 0777 | S_ISVTX) < 0)
963 return -errno;
c17ec25e 964 }
15ae422b 965
613b411c
LP
966 *path = x;
967 x = NULL;
968
969 return 0;
970}
971
/* Creates the private directories for /tmp and /var/tmp via setup_one_tmp_dir().
 *
 * On success the two newly allocated paths are stored in *tmp_dir and *var_tmp_dir and 0 is returned. If the
 * second directory cannot be created, the first one is removed again and the error is returned; the output
 * parameters are not touched in any failure case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: the inner "tmp" first, then the directory itself */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1001
1002int setup_netns(int netns_storage_socket[2]) {
1003 _cleanup_close_ int netns = -1;
3ee897d6 1004 int r, q;
613b411c
LP
1005
1006 assert(netns_storage_socket);
1007 assert(netns_storage_socket[0] >= 0);
1008 assert(netns_storage_socket[1] >= 0);
1009
1010 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1011 * namespace reference fd. Whatever process runs this first
1012 * shall create a new namespace, all others should just join
1013 * it. To serialize that we use a file lock on the socket
1014 * pair.
613b411c
LP
1015 *
1016 * It's a bit crazy, but hey, works great! */
1017
1018 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1019 return -errno;
1020
3ee897d6
LP
1021 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1022 if (netns == -EAGAIN) {
613b411c
LP
1023 /* Nothing stored yet, so let's create a new namespace */
1024
1025 if (unshare(CLONE_NEWNET) < 0) {
1026 r = -errno;
1027 goto fail;
1028 }
1029
1030 loopback_setup();
1031
1032 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1033 if (netns < 0) {
1034 r = -errno;
1035 goto fail;
1036 }
1037
1038 r = 1;
613b411c 1039
3ee897d6
LP
1040 } else if (netns < 0) {
1041 r = netns;
1042 goto fail;
613b411c 1043
3ee897d6
LP
1044 } else {
1045 /* Yay, found something, so let's join the namespace */
613b411c
LP
1046 if (setns(netns, CLONE_NEWNET) < 0) {
1047 r = -errno;
1048 goto fail;
1049 }
1050
1051 r = 0;
1052 }
1053
3ee897d6
LP
1054 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1055 if (q < 0) {
1056 r = q;
613b411c
LP
1057 goto fail;
1058 }
1059
1060fail:
fe048ce5 1061 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1062 return r;
1063}
417116f2 1064
1b8689f9
LP
1065static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1066 [PROTECT_HOME_NO] = "no",
1067 [PROTECT_HOME_YES] = "yes",
1068 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1069};
1070
1b8689f9
LP
1071DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1072
1073static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1074 [PROTECT_SYSTEM_NO] = "no",
1075 [PROTECT_SYSTEM_YES] = "yes",
1076 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1077 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1078};
1079
1080DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);