]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: instead of chasing mount symlinks a priori, do so as-we-go
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
7f112f50 30#include "dev-setup.h"
3ffd4af2 31#include "fd-util.h"
d944dc95 32#include "fs-util.h"
07630cea
LP
33#include "loopback-setup.h"
34#include "missing.h"
35#include "mkdir.h"
4349cd7c 36#include "mount-util.h"
3ffd4af2 37#include "namespace.h"
07630cea 38#include "path-util.h"
d7b8eec7 39#include "selinux-util.h"
2583fbea 40#include "socket-util.h"
8b43440b 41#include "string-table.h"
07630cea
LP
42#include "string-util.h"
43#include "strv.h"
affb60b1 44#include "umask-util.h"
ee104e11 45#include "user-util.h"
07630cea 46#include "util.h"
15ae422b 47
/* Mount flags shared by the tmpfs mount that backs the private /dev and by its later read-only remount. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
49
/* How a single path shall be treated inside the namespace. */
typedef enum MountMode {
        /* This is ordered by priority! mount_path_compare() sorts entries for the same path by ascending mode,
         * and drop_duplicates() keeps the first one, so smaller values win. */
        INACCESSIBLE,
        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        READWRITE,
} MountMode;
15ae422b 59
34de407a 60typedef struct MountEntry {
5327c910 61 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 62 MountMode mode:5;
5327c910
LP
63 bool ignore:1; /* Ignore if path does not exist? */
64 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 65 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 66 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
34de407a 67} MountEntry;
15ae422b 68
/*
 * The following Protect tables protect paths and mark some of them READONLY.
 * If a path is also covered by an option from another table, it is marked
 * READWRITE in the current table, and the more restrictive mode is applied
 * from that other table. This way all options can be combined in a safe and
 * comprehensible way for users.
 */
76
11a30cec 77/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 78static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
79 { "/proc/sys", READONLY, false },
80 { "/proc/sysrq-trigger", READONLY, true },
81 { "/proc/latency_stats", READONLY, true },
82 { "/proc/mtrr", READONLY, true },
aa70f38b 83 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
84 { "/proc/acpi", READONLY, true },
85 { "/proc/timer_stats", READONLY, true },
86 { "/proc/asound", READONLY, true },
87 { "/proc/bus", READONLY, true },
88 { "/proc/fs", READONLY, true },
89 { "/proc/irq", READONLY, true },
90 { "/sys", READONLY, false },
91 { "/sys/kernel/debug", READONLY, true },
92 { "/sys/kernel/tracing", READONLY, true },
93 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
11a30cec
DH
94};
95
c575770b 96/* ProtectKernelModules= option */
34de407a 97static const MountEntry protect_kernel_modules_table[] = {
c575770b 98#ifdef HAVE_SPLIT_USR
c6232fb0 99 { "/lib/modules", INACCESSIBLE, true },
c575770b 100#endif
c6232fb0 101 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
102};
103
b6c432ca
DH
104/*
105 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
106 * system should be protected by ProtectSystem=
107 */
34de407a 108static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
109 { "/home", READONLY, true },
110 { "/run/user", READONLY, true },
111 { "/root", READONLY, true },
b6c432ca
DH
112};
113
114/* ProtectHome=yes table */
34de407a 115static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
116 { "/home", INACCESSIBLE, true },
117 { "/run/user", INACCESSIBLE, true },
118 { "/root", INACCESSIBLE, true },
b6c432ca
DH
119};
120
f471b2af 121/* ProtectSystem=yes table */
34de407a 122static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
123 { "/usr", READONLY, false },
124 { "/boot", READONLY, true },
125 { "/efi", READONLY, true },
f471b2af
DH
126};
127
128/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 129static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
130 { "/usr", READONLY, false },
131 { "/boot", READONLY, true },
132 { "/efi", READONLY, true },
133 { "/etc", READONLY, false },
f471b2af
DH
134};
135
136/*
137 * ProtectSystem=strict table. In this strict mode, we mount everything
138 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
139 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
140 * protect those, and these options should be fully orthogonal.
141 * (And of course /home and friends are also left writable, as ProtectHome=
142 * shall manage those, orthogonally).
143 */
34de407a 144static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
145 { "/", READONLY, false },
146 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
147 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
148 { "/dev", READWRITE, false }, /* PrivateDevices= */
149 { "/home", READWRITE, true }, /* ProtectHome= */
150 { "/run/user", READWRITE, true }, /* ProtectHome= */
151 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
152};
153
34de407a 154static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
155 assert(p);
156
5327c910
LP
157 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
158 * otherwise the stack/static ->path field is returned. */
f0a4feb0 159
5327c910 160 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
161}
162
34de407a 163static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
164 assert(p);
165
166 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
167}
168
34de407a 169static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
170 char **i;
171
613b411c
LP
172 assert(p);
173
5327c910
LP
174 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
175
15ae422b 176 STRV_FOREACH(i, strv) {
5327c910
LP
177 bool ignore = false, needs_prefix = false;
178 const char *e = *i;
15ae422b 179
5327c910
LP
180 /* Look for any prefixes */
181 if (startswith(e, "-")) {
182 e++;
9c94d52e 183 ignore = true;
ea92ae33 184 }
5327c910
LP
185 if (startswith(e, "+")) {
186 e++;
187 needs_prefix = true;
188 }
ea92ae33 189
5327c910 190 if (!path_is_absolute(e))
15ae422b
LP
191 return -EINVAL;
192
34de407a 193 *((*p)++) = (MountEntry) {
5327c910
LP
194 .path_const = e,
195 .mode = mode,
196 .ignore = ignore,
197 .has_prefix = !needs_prefix,
198 };
15ae422b
LP
199 }
200
201 return 0;
202}
203
34de407a 204static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 205 unsigned i;
11a30cec
DH
206
207 assert(p);
f471b2af 208 assert(mounts);
11a30cec 209
5327c910 210 /* Adds a list of static pre-defined entries */
f471b2af 211
5327c910 212 for (i = 0; i < n; i++)
34de407a
LP
213 *((*p)++) = (MountEntry) {
214 .path_const = mount_entry_path(mounts+i),
5327c910
LP
215 .mode = mounts[i].mode,
216 .ignore = mounts[i].ignore || ignore_protect,
217 };
f471b2af
DH
218
219 return 0;
220}
221
34de407a 222static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
223 assert(p);
224
5327c910 225 switch (protect_home) {
b6c432ca 226
5327c910 227 case PROTECT_HOME_NO:
b6c432ca
DH
228 return 0;
229
b6c432ca 230 case PROTECT_HOME_READ_ONLY:
5327c910
LP
231 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
232
b6c432ca 233 case PROTECT_HOME_YES:
5327c910
LP
234 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
235
b6c432ca 236 default:
5327c910 237 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 238 }
b6c432ca
DH
239}
240
34de407a 241static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
242 assert(p);
243
5327c910
LP
244 switch (protect_system) {
245
246 case PROTECT_SYSTEM_NO:
f471b2af
DH
247 return 0;
248
f471b2af 249 case PROTECT_SYSTEM_STRICT:
5327c910
LP
250 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
251
f471b2af 252 case PROTECT_SYSTEM_YES:
5327c910
LP
253 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
254
f471b2af 255 case PROTECT_SYSTEM_FULL:
5327c910
LP
256 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
257
f471b2af 258 default:
5327c910 259 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 260 }
11a30cec
DH
261}
262
c17ec25e 263static int mount_path_compare(const void *a, const void *b) {
34de407a 264 const MountEntry *p = a, *q = b;
a0827e2b 265 int d;
15ae422b 266
6ee1a919 267 /* If the paths are not equal, then order prefixes first */
34de407a 268 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
269 if (d != 0)
270 return d;
15ae422b 271
6ee1a919
LP
272 /* If the paths are equal, check the mode */
273 if (p->mode < q->mode)
274 return -1;
15ae422b 275
6ee1a919
LP
276 if (p->mode > q->mode)
277 return 1;
15ae422b 278
6ee1a919 279 return 0;
15ae422b
LP
280}
281
34de407a 282static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
283 unsigned i;
284
285 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
286 * that. */
287
288 if (!root_directory)
289 return 0;
290
291 for (i = 0; i < n; i++) {
292 char *s;
293
294 if (m[i].has_prefix)
295 continue;
296
34de407a 297 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
298 if (!s)
299 return -ENOMEM;
300
301 free(m[i].path_malloc);
302 m[i].path_malloc = s;
303
304 m[i].has_prefix = true;
305 }
306
307 return 0;
308}
309
34de407a
LP
310static void drop_duplicates(MountEntry *m, unsigned *n) {
311 MountEntry *f, *t, *previous;
15ae422b 312
c17ec25e 313 assert(m);
15ae422b 314 assert(n);
15ae422b 315
fe3c2583
LP
316 /* Drops duplicate entries. Expects that the array is properly ordered already. */
317
1d54cd5d 318 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 319
fe3c2583
LP
320 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
321 * above. */
34de407a
LP
322 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
323 log_debug("%s is duplicate.", mount_entry_path(f));
324 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
5327c910 325 f->path_malloc = mfree(f->path_malloc);
15ae422b 326 continue;
fe3c2583 327 }
15ae422b 328
e2d7c1a0 329 *t = *f;
15ae422b 330 previous = t;
fe3c2583
LP
331 t++;
332 }
333
334 *n = t - m;
335}
336
34de407a
LP
337static void drop_inaccessible(MountEntry *m, unsigned *n) {
338 MountEntry *f, *t;
fe3c2583
LP
339 const char *clear = NULL;
340
341 assert(m);
342 assert(n);
343
344 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
345 * ordered already. */
346
1d54cd5d 347 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
348
349 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
350 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
351 if (clear && path_startswith(mount_entry_path(f), clear)) {
352 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
5327c910 353 f->path_malloc = mfree(f->path_malloc);
fe3c2583
LP
354 continue;
355 }
15ae422b 356
34de407a 357 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
358
359 *t = *f;
15ae422b
LP
360 t++;
361 }
362
c17ec25e 363 *n = t - m;
15ae422b
LP
364}
365
34de407a
LP
366static void drop_nop(MountEntry *m, unsigned *n) {
367 MountEntry *f, *t;
7648a565
LP
368
369 assert(m);
370 assert(n);
371
372 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
373 * list is ordered by prefixes. */
374
1d54cd5d 375 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
376
377 /* Only suppress such subtrees for READONLY and READWRITE entries */
378 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 379 MountEntry *p;
7648a565
LP
380 bool found = false;
381
382 /* Now let's find the first parent of the entry we are looking at. */
383 for (p = t-1; p >= m; p--) {
34de407a 384 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
385 found = true;
386 break;
387 }
388 }
389
390 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
391 if (found && p->mode == f->mode) {
34de407a 392 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
5327c910 393 f->path_malloc = mfree(f->path_malloc);
7648a565
LP
394 continue;
395 }
396 }
397
398 *t = *f;
399 t++;
400 }
401
402 *n = t - m;
403}
404
34de407a
LP
405static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
406 MountEntry *f, *t;
cd2902c9
LP
407
408 assert(m);
409 assert(n);
410
1d54cd5d 411 /* Nothing to do */
cd2902c9
LP
412 if (!root_directory)
413 return;
414
415 /* Drops all mounts that are outside of the root directory. */
416
1d54cd5d 417 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 418
34de407a
LP
419 if (!path_startswith(mount_entry_path(f), root_directory)) {
420 log_debug("%s is outside of root directory.", mount_entry_path(f));
5327c910 421 f->path_malloc = mfree(f->path_malloc);
cd2902c9
LP
422 continue;
423 }
424
425 *t = *f;
426 t++;
427 }
428
429 *n = t - m;
430}
431
34de407a 432static int mount_dev(MountEntry *m) {
7f112f50
LP
433 static const char devnodes[] =
434 "/dev/null\0"
435 "/dev/zero\0"
436 "/dev/full\0"
437 "/dev/random\0"
438 "/dev/urandom\0"
439 "/dev/tty\0";
440
2b85f4e1 441 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 442 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
443 _cleanup_umask_ mode_t u;
444 int r;
445
446 assert(m);
447
448 u = umask(0000);
449
2b85f4e1
LP
450 if (!mkdtemp(temporary_mount))
451 return -errno;
452
63c372cb 453 dev = strjoina(temporary_mount, "/dev");
dc751688 454 (void) mkdir(dev, 0755);
737ba3c8 455 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
456 r = -errno;
457 goto fail;
458 }
459
63c372cb 460 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 461 (void) mkdir(devpts, 0755);
2b85f4e1
LP
462 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
463 r = -errno;
464 goto fail;
465 }
466
63c372cb 467 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
468 if (symlink("pts/ptmx", devptmx) < 0) {
469 r = -errno;
470 goto fail;
471 }
e06b6479 472
63c372cb 473 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 474 (void) mkdir(devshm, 01777);
2b85f4e1
LP
475 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
476 if (r < 0) {
477 r = -errno;
478 goto fail;
479 }
480
63c372cb 481 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 482 (void) mkdir(devmqueue, 0755);
3164e3cb 483 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 484
63c372cb 485 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 486 (void) mkdir(devhugepages, 0755);
3164e3cb 487 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 488
63c372cb 489 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 490 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 491
7f112f50 492 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
493 _cleanup_free_ char *dn = NULL;
494 struct stat st;
495
496 r = stat(d, &st);
7f112f50 497 if (r < 0) {
2b85f4e1
LP
498
499 if (errno == ENOENT)
500 continue;
501
502 r = -errno;
503 goto fail;
7f112f50
LP
504 }
505
2b85f4e1
LP
506 if (!S_ISBLK(st.st_mode) &&
507 !S_ISCHR(st.st_mode)) {
508 r = -EINVAL;
509 goto fail;
510 }
511
512 if (st.st_rdev == 0)
513 continue;
514
515 dn = strappend(temporary_mount, d);
516 if (!dn) {
517 r = -ENOMEM;
518 goto fail;
519 }
520
ecabcf8b 521 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 522 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 523 mac_selinux_create_file_clear();
dd078a1e 524
2b85f4e1
LP
525 if (r < 0) {
526 r = -errno;
527 goto fail;
528 }
7f112f50
LP
529 }
530
03cfe0d5 531 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 532
ee818b89
AC
533 /* Create the /dev directory if missing. It is more likely to be
534 * missing when the service is started with RootDirectory. This is
535 * consistent with mount units creating the mount points when missing.
536 */
34de407a 537 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 538
9e5f8252 539 /* Unmount everything in old /dev */
34de407a
LP
540 umount_recursive(mount_entry_path(m), 0);
541 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
542 r = -errno;
543 goto fail;
544 }
7f112f50 545
2b85f4e1
LP
546 rmdir(dev);
547 rmdir(temporary_mount);
7f112f50 548
2b85f4e1 549 return 0;
7f112f50 550
2b85f4e1
LP
551fail:
552 if (devpts)
553 umount(devpts);
7f112f50 554
2b85f4e1
LP
555 if (devshm)
556 umount(devshm);
7f112f50 557
2b85f4e1
LP
558 if (devhugepages)
559 umount(devhugepages);
7f112f50 560
2b85f4e1
LP
561 if (devmqueue)
562 umount(devmqueue);
7f112f50 563
d267c5aa
ZJS
564 umount(dev);
565 rmdir(dev);
2b85f4e1 566 rmdir(temporary_mount);
7f112f50 567
2b85f4e1 568 return r;
7f112f50
LP
569}
570
8fceda93
LP
571static int mount_entry_chase(MountEntry *m, const char *root_directory) {
572 char *chased;
573 int r;
574
575 assert(m);
576
577 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
578 * chase the symlinks on our own first. */
579
580 r = chase_symlinks(mount_entry_path(m), root_directory, 0, &chased);
581 if (r == -ENOENT && m->ignore) {
582 log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_path(m));
583 return 0;
584 }
585 if (r < 0)
586 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_path(m));
587
588 log_debug("Followed symlinks %s → %s.", mount_entry_path(m), chased);
589
590 free(m->path_malloc);
591 m->path_malloc = chased;
592
593 return 1;
594}
595
ac0930c8 596static int apply_mount(
8fceda93 597 const char *root_directory,
34de407a 598 MountEntry *m,
ac0930c8 599 const char *tmp_dir,
c17ec25e 600 const char *var_tmp_dir) {
ac0930c8 601
15ae422b 602 const char *what;
15ae422b 603 int r;
15ae422b 604
c17ec25e 605 assert(m);
15ae422b 606
8fceda93
LP
607 r = mount_entry_chase(m, root_directory);
608 if (r <= 0)
609 return r;
610
34de407a 611 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 612
c17ec25e 613 switch (m->mode) {
15ae422b 614
160cfdbe
LP
615 case INACCESSIBLE: {
616 struct stat target;
6d313367
LP
617
618 /* First, get rid of everything that is below if there
619 * is anything... Then, overmount it with an
c4b41707 620 * inaccessible path. */
34de407a 621 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 622
34de407a
LP
623 if (lstat(mount_entry_path(m), &target) < 0)
624 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 625
c4b41707 626 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
627 if (!what) {
628 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
629 return -ELOOP;
630 }
631 break;
160cfdbe 632 }
fe3c2583 633
15ae422b 634 case READONLY:
15ae422b 635 case READWRITE:
6b7c9f8b 636
8fceda93 637 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 638 if (r < 0)
34de407a 639 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
640 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
641 return 0;
6b7c9f8b 642 /* This isn't a mount point yet, let's make it one. */
34de407a 643 what = mount_entry_path(m);
6b7c9f8b 644 break;
15ae422b 645
ac0930c8
LP
646 case PRIVATE_TMP:
647 what = tmp_dir;
648 break;
649
650 case PRIVATE_VAR_TMP:
651 what = var_tmp_dir;
15ae422b 652 break;
e364ad06 653
d6797c92
LP
654 case PRIVATE_DEV:
655 return mount_dev(m);
656
e364ad06
LP
657 default:
658 assert_not_reached("Unknown mode");
15ae422b
LP
659 }
660
ac0930c8 661 assert(what);
15ae422b 662
34de407a
LP
663 if (mount(what, mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
664 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
6b7c9f8b 665
34de407a 666 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 667 return 0;
ac0930c8 668}
15ae422b 669
34de407a 670static int make_read_only(MountEntry *m, char **blacklist) {
6b7c9f8b 671 int r = 0;
15ae422b 672
c17ec25e 673 assert(m);
ac0930c8 674
34de407a
LP
675 if (mount_entry_read_only(m))
676 r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
6b7c9f8b 677 else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
34de407a 678 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 679 r = -errno;
737ba3c8 680 } else
6b7c9f8b
LP
681 return 0;
682
683 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
684 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
685 * read-only mounts already applied. */
ac0930c8 686
8fceda93
LP
687 if (r == -ENOENT && m->ignore)
688 r = 0;
5327c910 689
1d54cd5d 690 return r;
d944dc95
LP
691}
692
2652c6c1 693static unsigned namespace_calculate_mounts(
c575770b 694 const NameSpaceInfo *ns_info,
2652c6c1
DH
695 char** read_write_paths,
696 char** read_only_paths,
697 char** inaccessible_paths,
698 const char* tmp_dir,
699 const char* var_tmp_dir,
2652c6c1
DH
700 ProtectHome protect_home,
701 ProtectSystem protect_system) {
702
b6c432ca 703 unsigned protect_home_cnt;
f471b2af
DH
704 unsigned protect_system_cnt =
705 (protect_system == PROTECT_SYSTEM_STRICT ?
706 ELEMENTSOF(protect_system_strict_table) :
707 ((protect_system == PROTECT_SYSTEM_FULL) ?
708 ELEMENTSOF(protect_system_full_table) :
709 ((protect_system == PROTECT_SYSTEM_YES) ?
710 ELEMENTSOF(protect_system_yes_table) : 0)));
711
b6c432ca
DH
712 protect_home_cnt =
713 (protect_home == PROTECT_HOME_YES ?
714 ELEMENTSOF(protect_home_yes_table) :
715 ((protect_home == PROTECT_HOME_READ_ONLY) ?
716 ELEMENTSOF(protect_home_read_only_table) : 0));
717
2652c6c1
DH
718 return !!tmp_dir + !!var_tmp_dir +
719 strv_length(read_write_paths) +
720 strv_length(read_only_paths) +
721 strv_length(inaccessible_paths) +
c575770b
DH
722 ns_info->private_dev +
723 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
724 (ns_info->protect_control_groups ? 1 : 0) +
725 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
b6c432ca 726 protect_home_cnt + protect_system_cnt;
2652c6c1
DH
727}
728
613b411c 729int setup_namespace(
ee818b89 730 const char* root_directory,
c575770b 731 const NameSpaceInfo *ns_info,
2a624c36
AP
732 char** read_write_paths,
733 char** read_only_paths,
734 char** inaccessible_paths,
a004cb4c
LP
735 const char* tmp_dir,
736 const char* var_tmp_dir,
1b8689f9
LP
737 ProtectHome protect_home,
738 ProtectSystem protect_system,
e6547662 739 unsigned long mount_flags) {
15ae422b 740
34de407a 741 MountEntry *m, *mounts = NULL;
d944dc95 742 bool make_slave = false;
f0a4feb0 743 unsigned n_mounts;
c17ec25e 744 int r = 0;
15ae422b 745
613b411c 746 if (mount_flags == 0)
c17ec25e 747 mount_flags = MS_SHARED;
ac0930c8 748
cfbeb4ef
LP
749 n_mounts = namespace_calculate_mounts(
750 ns_info,
751 read_write_paths,
752 read_only_paths,
753 inaccessible_paths,
754 tmp_dir, var_tmp_dir,
755 protect_home, protect_system);
613b411c 756
2652c6c1 757 /* Set mount slave mode */
f0a4feb0 758 if (root_directory || n_mounts > 0)
d944dc95
LP
759 make_slave = true;
760
f0a4feb0 761 if (n_mounts > 0) {
34de407a 762 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 763 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 764 if (r < 0)
f0a4feb0 765 goto finish;
613b411c 766
5327c910 767 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 768 if (r < 0)
f0a4feb0 769 goto finish;
613b411c 770
5327c910 771 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 772 if (r < 0)
f0a4feb0 773 goto finish;
7ff7394d 774
613b411c 775 if (tmp_dir) {
34de407a 776 *(m++) = (MountEntry) {
5327c910
LP
777 .path_const = "/tmp",
778 .mode = PRIVATE_TMP,
779 };
613b411c 780 }
7ff7394d 781
613b411c 782 if (var_tmp_dir) {
34de407a 783 *(m++) = (MountEntry) {
5327c910
LP
784 .path_const = "/var/tmp",
785 .mode = PRIVATE_VAR_TMP,
786 };
7ff7394d 787 }
ac0930c8 788
c575770b 789 if (ns_info->private_dev) {
34de407a 790 *(m++) = (MountEntry) {
5327c910
LP
791 .path_const = "/dev",
792 .mode = PRIVATE_DEV,
793 };
7f112f50
LP
794 }
795
c575770b 796 if (ns_info->protect_kernel_tunables) {
5327c910 797 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 798 if (r < 0)
f0a4feb0 799 goto finish;
c575770b
DH
800 }
801
802 if (ns_info->protect_kernel_modules) {
5327c910 803 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 804 if (r < 0)
f0a4feb0 805 goto finish;
c575770b 806 }
59eeb84b 807
c575770b 808 if (ns_info->protect_control_groups) {
34de407a 809 *(m++) = (MountEntry) {
5327c910
LP
810 .path_const = "/sys/fs/cgroup",
811 .mode = READONLY,
812 };
59eeb84b
LP
813 }
814
5327c910 815 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 816 if (r < 0)
f0a4feb0 817 goto finish;
417116f2 818
5327c910 819 r = append_protect_system(&m, protect_system, false);
f471b2af 820 if (r < 0)
f0a4feb0 821 goto finish;
417116f2 822
f0a4feb0 823 assert(mounts + n_mounts == m);
ac0930c8 824
5327c910
LP
825 /* Prepend the root directory where that's necessary */
826 r = prefix_where_needed(mounts, n_mounts, root_directory);
827 if (r < 0)
828 goto finish;
829
34de407a 830 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 831
f0a4feb0
DH
832 drop_duplicates(mounts, &n_mounts);
833 drop_outside_root(root_directory, mounts, &n_mounts);
834 drop_inaccessible(mounts, &n_mounts);
835 drop_nop(mounts, &n_mounts);
15ae422b
LP
836 }
837
d944dc95
LP
838 if (unshare(CLONE_NEWNS) < 0) {
839 r = -errno;
840 goto finish;
841 }
1e4e94c8 842
d944dc95 843 if (make_slave) {
c2c13f2d
LP
844 /* Remount / as SLAVE so that nothing now mounted in the namespace
845 shows up in the parent */
d944dc95
LP
846 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
847 r = -errno;
848 goto finish;
849 }
ee818b89
AC
850 }
851
852 if (root_directory) {
8f1ad200 853 /* Turn directory into bind mount, if it isn't one yet */
e1873695 854 r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 855 if (r < 0)
d944dc95 856 goto finish;
8f1ad200
LP
857 if (r == 0) {
858 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
859 r = -errno;
860 goto finish;
861 }
d944dc95 862 }
ee818b89 863 }
c2c13f2d 864
f0a4feb0 865 if (n_mounts > 0) {
6b7c9f8b
LP
866 char **blacklist;
867 unsigned j;
868
869 /* First round, add in all special mounts we need */
f0a4feb0 870 for (m = mounts; m < mounts + n_mounts; ++m) {
8fceda93 871 r = apply_mount(root_directory, m, tmp_dir, var_tmp_dir);
c2c13f2d 872 if (r < 0)
d944dc95 873 goto finish;
c2c13f2d 874 }
15ae422b 875
6b7c9f8b 876 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
877 blacklist = newa(char*, n_mounts+1);
878 for (j = 0; j < n_mounts; j++)
34de407a 879 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
880 blacklist[j] = NULL;
881
882 /* Second round, flip the ro bits if necessary. */
f0a4feb0 883 for (m = mounts; m < mounts + n_mounts; ++m) {
6b7c9f8b 884 r = make_read_only(m, blacklist);
c2c13f2d 885 if (r < 0)
d944dc95 886 goto finish;
c2c13f2d 887 }
15ae422b
LP
888 }
889
ee818b89
AC
890 if (root_directory) {
891 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
892 r = mount_move_root(root_directory);
d944dc95
LP
893 if (r < 0)
894 goto finish;
ee818b89
AC
895 }
896
c2c13f2d
LP
897 /* Remount / as the desired mode. Not that this will not
898 * reestablish propagation from our side to the host, since
899 * what's disconnected is disconnected. */
d944dc95
LP
900 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
901 r = -errno;
902 goto finish;
903 }
15ae422b 904
d944dc95 905 r = 0;
15ae422b 906
d944dc95 907finish:
f0a4feb0 908 for (m = mounts; m < mounts + n_mounts; m++)
5327c910 909 free(m->path_malloc);
613b411c
LP
910
911 return r;
912}
913
914static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
915 _cleanup_free_ char *x = NULL;
6b46ea73
LP
916 char bid[SD_ID128_STRING_MAX];
917 sd_id128_t boot_id;
918 int r;
613b411c
LP
919
920 assert(id);
921 assert(prefix);
922 assert(path);
923
6b46ea73
LP
924 /* We include the boot id in the directory so that after a
925 * reboot we can easily identify obsolete directories. */
926
927 r = sd_id128_get_boot(&boot_id);
928 if (r < 0)
929 return r;
930
605405c6 931 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
932 if (!x)
933 return -ENOMEM;
934
935 RUN_WITH_UMASK(0077)
936 if (!mkdtemp(x))
937 return -errno;
938
939 RUN_WITH_UMASK(0000) {
940 char *y;
941
63c372cb 942 y = strjoina(x, "/tmp");
613b411c
LP
943
944 if (mkdir(y, 0777 | S_ISVTX) < 0)
945 return -errno;
c17ec25e 946 }
15ae422b 947
613b411c
LP
948 *path = x;
949 x = NULL;
950
951 return 0;
952}
953
/* Creates the private directories below /tmp and /var/tmp for the unit identified by id. On success returns 0 and
 * stores the two newly allocated paths in *tmp_dir and *var_tmp_dir (the caller owns them). On failure returns a
 * negative errno-style value and leaves both output parameters untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *a, *b;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &a);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &b);
        if (r < 0) {
                char *t;

                /* Roll back the directory we already created below /tmp */
                t = strjoina(a, "/tmp");
                (void) rmdir(t);
                (void) rmdir(a);

                free(a);
                return r;
        }

        *tmp_dir = a;
        *var_tmp_dir = b;

        return 0;
}
983
984int setup_netns(int netns_storage_socket[2]) {
985 _cleanup_close_ int netns = -1;
3ee897d6 986 int r, q;
613b411c
LP
987
988 assert(netns_storage_socket);
989 assert(netns_storage_socket[0] >= 0);
990 assert(netns_storage_socket[1] >= 0);
991
992 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
993 * namespace reference fd. Whatever process runs this first
994 * shall create a new namespace, all others should just join
995 * it. To serialize that we use a file lock on the socket
996 * pair.
613b411c
LP
997 *
998 * It's a bit crazy, but hey, works great! */
999
1000 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1001 return -errno;
1002
3ee897d6
LP
1003 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1004 if (netns == -EAGAIN) {
613b411c
LP
1005 /* Nothing stored yet, so let's create a new namespace */
1006
1007 if (unshare(CLONE_NEWNET) < 0) {
1008 r = -errno;
1009 goto fail;
1010 }
1011
1012 loopback_setup();
1013
1014 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1015 if (netns < 0) {
1016 r = -errno;
1017 goto fail;
1018 }
1019
1020 r = 1;
613b411c 1021
3ee897d6
LP
1022 } else if (netns < 0) {
1023 r = netns;
1024 goto fail;
613b411c 1025
3ee897d6
LP
1026 } else {
1027 /* Yay, found something, so let's join the namespace */
613b411c
LP
1028 if (setns(netns, CLONE_NEWNET) < 0) {
1029 r = -errno;
1030 goto fail;
1031 }
1032
1033 r = 0;
1034 }
1035
3ee897d6
LP
1036 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1037 if (q < 0) {
1038 r = q;
613b411c
LP
1039 goto fail;
1040 }
1041
1042fail:
fe048ce5 1043 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1044 return r;
1045}
417116f2 1046
1b8689f9
LP
1047static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1048 [PROTECT_HOME_NO] = "no",
1049 [PROTECT_HOME_YES] = "yes",
1050 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1051};
1052
1b8689f9
LP
1053DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1054
1055static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1056 [PROTECT_SYSTEM_NO] = "no",
1057 [PROTECT_SYSTEM_YES] = "yes",
1058 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1059 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1060};
1061
1062DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);