]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
core: fix minor memleak in namespace.c
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
15ae422b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
15ae422b 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#include <errno.h>
07630cea 21#include <sched.h>
15ae422b 22#include <stdio.h>
07630cea
LP
23#include <string.h>
24#include <sys/mount.h>
15ae422b 25#include <sys/stat.h>
07630cea 26#include <unistd.h>
25e870b5 27#include <linux/fs.h>
15ae422b 28
b5efdb8a 29#include "alloc-util.h"
7f112f50 30#include "dev-setup.h"
3ffd4af2 31#include "fd-util.h"
d944dc95 32#include "fs-util.h"
07630cea
LP
33#include "loopback-setup.h"
34#include "missing.h"
35#include "mkdir.h"
4349cd7c 36#include "mount-util.h"
3ffd4af2 37#include "namespace.h"
07630cea 38#include "path-util.h"
d7b8eec7 39#include "selinux-util.h"
2583fbea 40#include "socket-util.h"
8b43440b 41#include "string-table.h"
07630cea
LP
42#include "string-util.h"
43#include "strv.h"
affb60b1 44#include "umask-util.h"
ee104e11 45#include "user-util.h"
07630cea 46#include "util.h"
15ae422b 47
737ba3c8 48#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
49
/* Kind of mount to establish for a MountEntry.
 *
 * The numeric order is significant: mount_path_compare() sorts entries with the same path by ascending mode, and
 * drop_duplicates() keeps the first one. Do not reorder. */
typedef enum MountMode {
        INACCESSIBLE,          /* overmount the path with an inaccessible node */
        BIND_MOUNT,            /* non-recursive bind mount from a source path */
        BIND_MOUNT_RECURSIVE,  /* recursive bind mount from a source path */
        READONLY,              /* make the path a read-only mount */
        PRIVATE_TMP,           /* private /tmp */
        PRIVATE_VAR_TMP,       /* private /var/tmp */
        PRIVATE_DEV,           /* private /dev */
        READWRITE,             /* keep the path writable */
} MountMode;
15ae422b 61
34de407a 62typedef struct MountEntry {
5327c910 63 const char *path_const; /* Memory allocated on stack or static */
cfbeb4ef 64 MountMode mode:5;
5327c910
LP
65 bool ignore:1; /* Ignore if path does not exist? */
66 bool has_prefix:1; /* Already is prefixed by the root dir? */
cfbeb4ef 67 bool read_only:1; /* Shall this mount point be read-only? */
5327c910 68 char *path_malloc; /* Use this instead of 'path' if we had to allocate memory */
d2d6c096
LP
69 const char *source_const; /* The source path, for bind mounts */
70 char *source_malloc;
34de407a 71} MountEntry;
15ae422b 72
f471b2af
DH
73/*
74 * The following Protect tables are to protect paths and mark some of them
75 * READONLY, in case a path is covered by an option from another table, then
76 * it is marked READWRITE in the current one, and the more restrictive mode is
77 * applied from that other table. This way all options can be combined in a
78 * safe and comprehensible way for users.
79 */
80
11a30cec 81/* ProtectKernelTunables= option and the related filesystem APIs */
34de407a 82static const MountEntry protect_kernel_tunables_table[] = {
c6232fb0
LP
83 { "/proc/sys", READONLY, false },
84 { "/proc/sysrq-trigger", READONLY, true },
85 { "/proc/latency_stats", READONLY, true },
86 { "/proc/mtrr", READONLY, true },
aa70f38b 87 { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
c6232fb0
LP
88 { "/proc/acpi", READONLY, true },
89 { "/proc/timer_stats", READONLY, true },
90 { "/proc/asound", READONLY, true },
91 { "/proc/bus", READONLY, true },
92 { "/proc/fs", READONLY, true },
93 { "/proc/irq", READONLY, true },
94 { "/sys", READONLY, false },
95 { "/sys/kernel/debug", READONLY, true },
96 { "/sys/kernel/tracing", READONLY, true },
97 { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
11a30cec
DH
98};
99
c575770b 100/* ProtectKernelModules= option */
34de407a 101static const MountEntry protect_kernel_modules_table[] = {
c575770b 102#ifdef HAVE_SPLIT_USR
c6232fb0 103 { "/lib/modules", INACCESSIBLE, true },
c575770b 104#endif
c6232fb0 105 { "/usr/lib/modules", INACCESSIBLE, true },
c575770b
DH
106};
107
b6c432ca
DH
108/*
109 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
110 * system should be protected by ProtectSystem=
111 */
34de407a 112static const MountEntry protect_home_read_only_table[] = {
c6232fb0
LP
113 { "/home", READONLY, true },
114 { "/run/user", READONLY, true },
115 { "/root", READONLY, true },
b6c432ca
DH
116};
117
118/* ProtectHome=yes table */
34de407a 119static const MountEntry protect_home_yes_table[] = {
c6232fb0
LP
120 { "/home", INACCESSIBLE, true },
121 { "/run/user", INACCESSIBLE, true },
122 { "/root", INACCESSIBLE, true },
b6c432ca
DH
123};
124
f471b2af 125/* ProtectSystem=yes table */
34de407a 126static const MountEntry protect_system_yes_table[] = {
c6232fb0
LP
127 { "/usr", READONLY, false },
128 { "/boot", READONLY, true },
129 { "/efi", READONLY, true },
f471b2af
DH
130};
131
132/* ProtectSystem=full includes ProtectSystem=yes */
34de407a 133static const MountEntry protect_system_full_table[] = {
c6232fb0
LP
134 { "/usr", READONLY, false },
135 { "/boot", READONLY, true },
136 { "/efi", READONLY, true },
137 { "/etc", READONLY, false },
f471b2af
DH
138};
139
140/*
141 * ProtectSystem=strict table. In this strict mode, we mount everything
142 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
143 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
144 * protect those, and these options should be fully orthogonal.
145 * (And of course /home and friends are also left writable, as ProtectHome=
146 * shall manage those, orthogonally).
147 */
34de407a 148static const MountEntry protect_system_strict_table[] = {
ddbe0412
LP
149 { "/", READONLY, false },
150 { "/proc", READWRITE, false }, /* ProtectKernelTunables= */
151 { "/sys", READWRITE, false }, /* ProtectKernelTunables= */
152 { "/dev", READWRITE, false }, /* PrivateDevices= */
153 { "/home", READWRITE, true }, /* ProtectHome= */
154 { "/run/user", READWRITE, true }, /* ProtectHome= */
155 { "/root", READWRITE, true }, /* ProtectHome= */
f471b2af
DH
156};
157
34de407a 158static const char *mount_entry_path(const MountEntry *p) {
f0a4feb0
DH
159 assert(p);
160
5327c910
LP
161 /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
162 * otherwise the stack/static ->path field is returned. */
f0a4feb0 163
5327c910 164 return p->path_malloc ?: p->path_const;
f0a4feb0
DH
165}
166
34de407a 167static bool mount_entry_read_only(const MountEntry *p) {
cfbeb4ef
LP
168 assert(p);
169
170 return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
171}
172
d2d6c096
LP
173static const char *mount_entry_source(const MountEntry *p) {
174 assert(p);
175
176 return p->source_malloc ?: p->source_const;
177}
178
1eb7e08e
LP
179static void mount_entry_done(MountEntry *p) {
180 assert(p);
181
182 p->path_malloc = mfree(p->path_malloc);
183 p->source_malloc = mfree(p->source_malloc);
184}
185
34de407a 186static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
15ae422b
LP
187 char **i;
188
613b411c
LP
189 assert(p);
190
5327c910
LP
191 /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
192
15ae422b 193 STRV_FOREACH(i, strv) {
5327c910
LP
194 bool ignore = false, needs_prefix = false;
195 const char *e = *i;
15ae422b 196
5327c910
LP
197 /* Look for any prefixes */
198 if (startswith(e, "-")) {
199 e++;
9c94d52e 200 ignore = true;
ea92ae33 201 }
5327c910
LP
202 if (startswith(e, "+")) {
203 e++;
204 needs_prefix = true;
205 }
ea92ae33 206
5327c910 207 if (!path_is_absolute(e))
15ae422b
LP
208 return -EINVAL;
209
34de407a 210 *((*p)++) = (MountEntry) {
5327c910
LP
211 .path_const = e,
212 .mode = mode,
213 .ignore = ignore,
214 .has_prefix = !needs_prefix,
215 };
15ae422b
LP
216 }
217
218 return 0;
219}
220
d2d6c096
LP
221static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
222 unsigned i;
223
224 assert(p);
225
226 for (i = 0; i < n; i++) {
227 const BindMount *b = binds + i;
228
229 *((*p)++) = (MountEntry) {
230 .path_const = b->destination,
231 .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
232 .read_only = b->read_only,
233 .source_const = b->source,
234 };
235 }
236
237 return 0;
238}
239
34de407a 240static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
f471b2af 241 unsigned i;
11a30cec
DH
242
243 assert(p);
f471b2af 244 assert(mounts);
11a30cec 245
5327c910 246 /* Adds a list of static pre-defined entries */
f471b2af 247
5327c910 248 for (i = 0; i < n; i++)
34de407a
LP
249 *((*p)++) = (MountEntry) {
250 .path_const = mount_entry_path(mounts+i),
5327c910
LP
251 .mode = mounts[i].mode,
252 .ignore = mounts[i].ignore || ignore_protect,
253 };
f471b2af
DH
254
255 return 0;
256}
257
34de407a 258static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
c575770b
DH
259 assert(p);
260
5327c910 261 switch (protect_home) {
b6c432ca 262
5327c910 263 case PROTECT_HOME_NO:
b6c432ca
DH
264 return 0;
265
b6c432ca 266 case PROTECT_HOME_READ_ONLY:
5327c910
LP
267 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
268
b6c432ca 269 case PROTECT_HOME_YES:
5327c910
LP
270 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
271
b6c432ca 272 default:
5327c910 273 assert_not_reached("Unexpected ProtectHome= value");
b6c432ca 274 }
b6c432ca
DH
275}
276
34de407a 277static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
f471b2af
DH
278 assert(p);
279
5327c910
LP
280 switch (protect_system) {
281
282 case PROTECT_SYSTEM_NO:
f471b2af
DH
283 return 0;
284
f471b2af 285 case PROTECT_SYSTEM_STRICT:
5327c910
LP
286 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
287
f471b2af 288 case PROTECT_SYSTEM_YES:
5327c910
LP
289 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
290
f471b2af 291 case PROTECT_SYSTEM_FULL:
5327c910
LP
292 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
293
f471b2af 294 default:
5327c910 295 assert_not_reached("Unexpected ProtectSystem= value");
f471b2af 296 }
11a30cec
DH
297}
298
c17ec25e 299static int mount_path_compare(const void *a, const void *b) {
34de407a 300 const MountEntry *p = a, *q = b;
a0827e2b 301 int d;
15ae422b 302
6ee1a919 303 /* If the paths are not equal, then order prefixes first */
34de407a 304 d = path_compare(mount_entry_path(p), mount_entry_path(q));
6ee1a919
LP
305 if (d != 0)
306 return d;
15ae422b 307
6ee1a919
LP
308 /* If the paths are equal, check the mode */
309 if (p->mode < q->mode)
310 return -1;
15ae422b 311
6ee1a919
LP
312 if (p->mode > q->mode)
313 return 1;
15ae422b 314
6ee1a919 315 return 0;
15ae422b
LP
316}
317
34de407a 318static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
5327c910
LP
319 unsigned i;
320
321 /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
322 * that. */
323
324 if (!root_directory)
325 return 0;
326
327 for (i = 0; i < n; i++) {
328 char *s;
329
330 if (m[i].has_prefix)
331 continue;
332
34de407a 333 s = prefix_root(root_directory, mount_entry_path(m+i));
5327c910
LP
334 if (!s)
335 return -ENOMEM;
336
337 free(m[i].path_malloc);
338 m[i].path_malloc = s;
339
340 m[i].has_prefix = true;
341 }
342
343 return 0;
344}
345
34de407a
LP
346static void drop_duplicates(MountEntry *m, unsigned *n) {
347 MountEntry *f, *t, *previous;
15ae422b 348
c17ec25e 349 assert(m);
15ae422b 350 assert(n);
15ae422b 351
fe3c2583
LP
352 /* Drops duplicate entries. Expects that the array is properly ordered already. */
353
1d54cd5d 354 for (f = m, t = m, previous = NULL; f < m + *n; f++) {
15ae422b 355
fe3c2583
LP
356 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
357 * above. */
34de407a
LP
358 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
359 log_debug("%s is duplicate.", mount_entry_path(f));
360 previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
1eb7e08e 361 mount_entry_done(f);
15ae422b 362 continue;
fe3c2583 363 }
15ae422b 364
e2d7c1a0 365 *t = *f;
15ae422b 366 previous = t;
fe3c2583
LP
367 t++;
368 }
369
370 *n = t - m;
371}
372
34de407a
LP
373static void drop_inaccessible(MountEntry *m, unsigned *n) {
374 MountEntry *f, *t;
fe3c2583
LP
375 const char *clear = NULL;
376
377 assert(m);
378 assert(n);
379
380 /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
381 * ordered already. */
382
1d54cd5d 383 for (f = m, t = m; f < m + *n; f++) {
fe3c2583
LP
384
385 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
386 * it, as inaccessible paths really should drop the entire subtree. */
34de407a
LP
387 if (clear && path_startswith(mount_entry_path(f), clear)) {
388 log_debug("%s is masked by %s.", mount_entry_path(f), clear);
1eb7e08e 389 mount_entry_done(f);
fe3c2583
LP
390 continue;
391 }
15ae422b 392
34de407a 393 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
fe3c2583
LP
394
395 *t = *f;
15ae422b
LP
396 t++;
397 }
398
c17ec25e 399 *n = t - m;
15ae422b
LP
400}
401
34de407a
LP
402static void drop_nop(MountEntry *m, unsigned *n) {
403 MountEntry *f, *t;
7648a565
LP
404
405 assert(m);
406 assert(n);
407
408 /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
409 * list is ordered by prefixes. */
410
1d54cd5d 411 for (f = m, t = m; f < m + *n; f++) {
7648a565
LP
412
413 /* Only suppress such subtrees for READONLY and READWRITE entries */
414 if (IN_SET(f->mode, READONLY, READWRITE)) {
34de407a 415 MountEntry *p;
7648a565
LP
416 bool found = false;
417
418 /* Now let's find the first parent of the entry we are looking at. */
419 for (p = t-1; p >= m; p--) {
34de407a 420 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
7648a565
LP
421 found = true;
422 break;
423 }
424 }
425
426 /* We found it, let's see if it's the same mode, if so, we can drop this entry */
427 if (found && p->mode == f->mode) {
34de407a 428 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
1eb7e08e 429 mount_entry_done(f);
7648a565
LP
430 continue;
431 }
432 }
433
434 *t = *f;
435 t++;
436 }
437
438 *n = t - m;
439}
440
34de407a
LP
441static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
442 MountEntry *f, *t;
cd2902c9
LP
443
444 assert(m);
445 assert(n);
446
1d54cd5d 447 /* Nothing to do */
cd2902c9
LP
448 if (!root_directory)
449 return;
450
451 /* Drops all mounts that are outside of the root directory. */
452
1d54cd5d 453 for (f = m, t = m; f < m + *n; f++) {
cd2902c9 454
34de407a
LP
455 if (!path_startswith(mount_entry_path(f), root_directory)) {
456 log_debug("%s is outside of root directory.", mount_entry_path(f));
1eb7e08e 457 mount_entry_done(f);
cd2902c9
LP
458 continue;
459 }
460
461 *t = *f;
462 t++;
463 }
464
465 *n = t - m;
466}
467
34de407a 468static int mount_dev(MountEntry *m) {
7f112f50
LP
469 static const char devnodes[] =
470 "/dev/null\0"
471 "/dev/zero\0"
472 "/dev/full\0"
473 "/dev/random\0"
474 "/dev/urandom\0"
475 "/dev/tty\0";
476
2b85f4e1 477 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
63cc4c31 478 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
479 _cleanup_umask_ mode_t u;
480 int r;
481
482 assert(m);
483
484 u = umask(0000);
485
2b85f4e1
LP
486 if (!mkdtemp(temporary_mount))
487 return -errno;
488
63c372cb 489 dev = strjoina(temporary_mount, "/dev");
dc751688 490 (void) mkdir(dev, 0755);
737ba3c8 491 if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
2b85f4e1
LP
492 r = -errno;
493 goto fail;
494 }
495
63c372cb 496 devpts = strjoina(temporary_mount, "/dev/pts");
dc751688 497 (void) mkdir(devpts, 0755);
2b85f4e1
LP
498 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
499 r = -errno;
500 goto fail;
501 }
502
63c372cb 503 devptmx = strjoina(temporary_mount, "/dev/ptmx");
3164e3cb
ZJS
504 if (symlink("pts/ptmx", devptmx) < 0) {
505 r = -errno;
506 goto fail;
507 }
e06b6479 508
63c372cb 509 devshm = strjoina(temporary_mount, "/dev/shm");
dc751688 510 (void) mkdir(devshm, 01777);
2b85f4e1
LP
511 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
512 if (r < 0) {
513 r = -errno;
514 goto fail;
515 }
516
63c372cb 517 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
dc751688 518 (void) mkdir(devmqueue, 0755);
3164e3cb 519 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
2b85f4e1 520
63c372cb 521 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
dc751688 522 (void) mkdir(devhugepages, 0755);
3164e3cb 523 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
2b85f4e1 524
63c372cb 525 devlog = strjoina(temporary_mount, "/dev/log");
3164e3cb 526 (void) symlink("/run/systemd/journal/dev-log", devlog);
82d25240 527
7f112f50 528 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
529 _cleanup_free_ char *dn = NULL;
530 struct stat st;
531
532 r = stat(d, &st);
7f112f50 533 if (r < 0) {
2b85f4e1
LP
534
535 if (errno == ENOENT)
536 continue;
537
538 r = -errno;
539 goto fail;
7f112f50
LP
540 }
541
2b85f4e1
LP
542 if (!S_ISBLK(st.st_mode) &&
543 !S_ISCHR(st.st_mode)) {
544 r = -EINVAL;
545 goto fail;
546 }
547
548 if (st.st_rdev == 0)
549 continue;
550
551 dn = strappend(temporary_mount, d);
552 if (!dn) {
553 r = -ENOMEM;
554 goto fail;
555 }
556
ecabcf8b 557 mac_selinux_create_file_prepare(d, st.st_mode);
2b85f4e1 558 r = mknod(dn, st.st_mode, st.st_rdev);
ecabcf8b 559 mac_selinux_create_file_clear();
dd078a1e 560
2b85f4e1
LP
561 if (r < 0) {
562 r = -errno;
563 goto fail;
564 }
7f112f50
LP
565 }
566
03cfe0d5 567 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
7f112f50 568
ee818b89
AC
569 /* Create the /dev directory if missing. It is more likely to be
570 * missing when the service is started with RootDirectory. This is
571 * consistent with mount units creating the mount points when missing.
572 */
34de407a 573 (void) mkdir_p_label(mount_entry_path(m), 0755);
ee818b89 574
9e5f8252 575 /* Unmount everything in old /dev */
34de407a
LP
576 umount_recursive(mount_entry_path(m), 0);
577 if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
2b85f4e1
LP
578 r = -errno;
579 goto fail;
580 }
7f112f50 581
2b85f4e1
LP
582 rmdir(dev);
583 rmdir(temporary_mount);
7f112f50 584
2b85f4e1 585 return 0;
7f112f50 586
2b85f4e1
LP
587fail:
588 if (devpts)
589 umount(devpts);
7f112f50 590
2b85f4e1
LP
591 if (devshm)
592 umount(devshm);
7f112f50 593
2b85f4e1
LP
594 if (devhugepages)
595 umount(devhugepages);
7f112f50 596
2b85f4e1
LP
597 if (devmqueue)
598 umount(devmqueue);
7f112f50 599
d267c5aa
ZJS
600 umount(dev);
601 rmdir(dev);
2b85f4e1 602 rmdir(temporary_mount);
7f112f50 603
2b85f4e1 604 return r;
7f112f50
LP
605}
606
d2d6c096
LP
607static int mount_entry_chase(
608 const char *root_directory,
609 MountEntry *m,
610 const char *path,
611 char **location) {
612
8fceda93
LP
613 char *chased;
614 int r;
615
616 assert(m);
617
618 /* Since mount() will always follow symlinks and we need to take the different root directory into account we
d2d6c096
LP
619 * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
620 * that applies). The result is stored in "location". */
8fceda93 621
d2d6c096 622 r = chase_symlinks(path, root_directory, 0, &chased);
8fceda93 623 if (r == -ENOENT && m->ignore) {
d2d6c096 624 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
8fceda93
LP
625 return 0;
626 }
627 if (r < 0)
d2d6c096 628 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
8fceda93 629
d2d6c096 630 log_debug("Followed symlinks %s → %s.", path, chased);
8fceda93 631
d2d6c096
LP
632 free(*location);
633 *location = chased;
8fceda93
LP
634
635 return 1;
636}
637
ac0930c8 638static int apply_mount(
8fceda93 639 const char *root_directory,
34de407a 640 MountEntry *m,
ac0930c8 641 const char *tmp_dir,
c17ec25e 642 const char *var_tmp_dir) {
ac0930c8 643
15ae422b 644 const char *what;
d2d6c096 645 bool rbind = true;
15ae422b 646 int r;
15ae422b 647
c17ec25e 648 assert(m);
15ae422b 649
d2d6c096 650 r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
8fceda93
LP
651 if (r <= 0)
652 return r;
653
34de407a 654 log_debug("Applying namespace mount on %s", mount_entry_path(m));
fe3c2583 655
c17ec25e 656 switch (m->mode) {
15ae422b 657
160cfdbe
LP
658 case INACCESSIBLE: {
659 struct stat target;
6d313367
LP
660
661 /* First, get rid of everything that is below if there
662 * is anything... Then, overmount it with an
c4b41707 663 * inaccessible path. */
34de407a 664 (void) umount_recursive(mount_entry_path(m), 0);
6d313367 665
34de407a
LP
666 if (lstat(mount_entry_path(m), &target) < 0)
667 return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
15ae422b 668
c4b41707 669 what = mode_to_inaccessible_node(target.st_mode);
5fd7cf6f
LP
670 if (!what) {
671 log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
c4b41707
AP
672 return -ELOOP;
673 }
674 break;
160cfdbe 675 }
fe3c2583 676
15ae422b 677 case READONLY:
15ae422b 678 case READWRITE:
8fceda93 679 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
d944dc95 680 if (r < 0)
34de407a 681 return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
6b7c9f8b
LP
682 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
683 return 0;
6b7c9f8b 684 /* This isn't a mount point yet, let's make it one. */
34de407a 685 what = mount_entry_path(m);
6b7c9f8b 686 break;
15ae422b 687
d2d6c096
LP
688 case BIND_MOUNT:
689 rbind = false;
690 /* fallthrough */
691
692 case BIND_MOUNT_RECURSIVE:
693 /* Also chase the source mount */
694 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
695 if (r <= 0)
696 return r;
697
698 what = mount_entry_source(m);
699 break;
700
ac0930c8
LP
701 case PRIVATE_TMP:
702 what = tmp_dir;
703 break;
704
705 case PRIVATE_VAR_TMP:
706 what = var_tmp_dir;
15ae422b 707 break;
e364ad06 708
d6797c92
LP
709 case PRIVATE_DEV:
710 return mount_dev(m);
711
e364ad06
LP
712 default:
713 assert_not_reached("Unknown mode");
15ae422b
LP
714 }
715
ac0930c8 716 assert(what);
15ae422b 717
d2d6c096 718 if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
34de407a 719 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
6b7c9f8b 720
34de407a 721 log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
6b7c9f8b 722 return 0;
ac0930c8 723}
15ae422b 724
34de407a 725static int make_read_only(MountEntry *m, char **blacklist) {
6b7c9f8b 726 int r = 0;
15ae422b 727
c17ec25e 728 assert(m);
ac0930c8 729
34de407a
LP
730 if (mount_entry_read_only(m))
731 r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
6b7c9f8b 732 else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
34de407a 733 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
6b7c9f8b 734 r = -errno;
737ba3c8 735 } else
6b7c9f8b
LP
736 return 0;
737
738 /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
739 * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
740 * read-only mounts already applied. */
ac0930c8 741
8fceda93
LP
742 if (r == -ENOENT && m->ignore)
743 r = 0;
5327c910 744
1d54cd5d 745 return r;
d944dc95
LP
746}
747
2652c6c1 748static unsigned namespace_calculate_mounts(
c575770b 749 const NameSpaceInfo *ns_info,
2652c6c1
DH
750 char** read_write_paths,
751 char** read_only_paths,
752 char** inaccessible_paths,
d2d6c096
LP
753 const BindMount *bind_mounts,
754 unsigned n_bind_mounts,
2652c6c1
DH
755 const char* tmp_dir,
756 const char* var_tmp_dir,
2652c6c1
DH
757 ProtectHome protect_home,
758 ProtectSystem protect_system) {
759
b6c432ca 760 unsigned protect_home_cnt;
f471b2af
DH
761 unsigned protect_system_cnt =
762 (protect_system == PROTECT_SYSTEM_STRICT ?
763 ELEMENTSOF(protect_system_strict_table) :
764 ((protect_system == PROTECT_SYSTEM_FULL) ?
765 ELEMENTSOF(protect_system_full_table) :
766 ((protect_system == PROTECT_SYSTEM_YES) ?
767 ELEMENTSOF(protect_system_yes_table) : 0)));
768
b6c432ca
DH
769 protect_home_cnt =
770 (protect_home == PROTECT_HOME_YES ?
771 ELEMENTSOF(protect_home_yes_table) :
772 ((protect_home == PROTECT_HOME_READ_ONLY) ?
773 ELEMENTSOF(protect_home_read_only_table) : 0));
774
2652c6c1
DH
775 return !!tmp_dir + !!var_tmp_dir +
776 strv_length(read_write_paths) +
777 strv_length(read_only_paths) +
778 strv_length(inaccessible_paths) +
d2d6c096 779 n_bind_mounts +
c575770b
DH
780 ns_info->private_dev +
781 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
782 (ns_info->protect_control_groups ? 1 : 0) +
783 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
b6c432ca 784 protect_home_cnt + protect_system_cnt;
2652c6c1
DH
785}
786
613b411c 787int setup_namespace(
ee818b89 788 const char* root_directory,
c575770b 789 const NameSpaceInfo *ns_info,
2a624c36
AP
790 char** read_write_paths,
791 char** read_only_paths,
792 char** inaccessible_paths,
d2d6c096
LP
793 const BindMount *bind_mounts,
794 unsigned n_bind_mounts,
a004cb4c
LP
795 const char* tmp_dir,
796 const char* var_tmp_dir,
1b8689f9
LP
797 ProtectHome protect_home,
798 ProtectSystem protect_system,
e6547662 799 unsigned long mount_flags) {
15ae422b 800
34de407a 801 MountEntry *m, *mounts = NULL;
d944dc95 802 bool make_slave = false;
f0a4feb0 803 unsigned n_mounts;
c17ec25e 804 int r = 0;
15ae422b 805
613b411c 806 if (mount_flags == 0)
c17ec25e 807 mount_flags = MS_SHARED;
ac0930c8 808
cfbeb4ef
LP
809 n_mounts = namespace_calculate_mounts(
810 ns_info,
811 read_write_paths,
812 read_only_paths,
813 inaccessible_paths,
d2d6c096 814 bind_mounts, n_bind_mounts,
cfbeb4ef
LP
815 tmp_dir, var_tmp_dir,
816 protect_home, protect_system);
613b411c 817
2652c6c1 818 /* Set mount slave mode */
f0a4feb0 819 if (root_directory || n_mounts > 0)
d944dc95
LP
820 make_slave = true;
821
f0a4feb0 822 if (n_mounts > 0) {
34de407a 823 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
5327c910 824 r = append_access_mounts(&m, read_write_paths, READWRITE);
613b411c 825 if (r < 0)
f0a4feb0 826 goto finish;
613b411c 827
5327c910 828 r = append_access_mounts(&m, read_only_paths, READONLY);
613b411c 829 if (r < 0)
f0a4feb0 830 goto finish;
613b411c 831
5327c910 832 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE);
613b411c 833 if (r < 0)
f0a4feb0 834 goto finish;
7ff7394d 835
d2d6c096
LP
836 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
837 if (r < 0)
838 goto finish;
839
613b411c 840 if (tmp_dir) {
34de407a 841 *(m++) = (MountEntry) {
5327c910
LP
842 .path_const = "/tmp",
843 .mode = PRIVATE_TMP,
844 };
613b411c 845 }
7ff7394d 846
613b411c 847 if (var_tmp_dir) {
34de407a 848 *(m++) = (MountEntry) {
5327c910
LP
849 .path_const = "/var/tmp",
850 .mode = PRIVATE_VAR_TMP,
851 };
7ff7394d 852 }
ac0930c8 853
c575770b 854 if (ns_info->private_dev) {
34de407a 855 *(m++) = (MountEntry) {
5327c910
LP
856 .path_const = "/dev",
857 .mode = PRIVATE_DEV,
858 };
7f112f50
LP
859 }
860
c575770b 861 if (ns_info->protect_kernel_tunables) {
5327c910 862 r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
c575770b 863 if (r < 0)
f0a4feb0 864 goto finish;
c575770b
DH
865 }
866
867 if (ns_info->protect_kernel_modules) {
5327c910 868 r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
c575770b 869 if (r < 0)
f0a4feb0 870 goto finish;
c575770b 871 }
59eeb84b 872
c575770b 873 if (ns_info->protect_control_groups) {
34de407a 874 *(m++) = (MountEntry) {
5327c910
LP
875 .path_const = "/sys/fs/cgroup",
876 .mode = READONLY,
877 };
59eeb84b
LP
878 }
879
5327c910 880 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
b6c432ca 881 if (r < 0)
f0a4feb0 882 goto finish;
417116f2 883
5327c910 884 r = append_protect_system(&m, protect_system, false);
f471b2af 885 if (r < 0)
f0a4feb0 886 goto finish;
417116f2 887
f0a4feb0 888 assert(mounts + n_mounts == m);
ac0930c8 889
5327c910
LP
890 /* Prepend the root directory where that's necessary */
891 r = prefix_where_needed(mounts, n_mounts, root_directory);
892 if (r < 0)
893 goto finish;
894
34de407a 895 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
fe3c2583 896
f0a4feb0
DH
897 drop_duplicates(mounts, &n_mounts);
898 drop_outside_root(root_directory, mounts, &n_mounts);
899 drop_inaccessible(mounts, &n_mounts);
900 drop_nop(mounts, &n_mounts);
15ae422b
LP
901 }
902
d944dc95
LP
903 if (unshare(CLONE_NEWNS) < 0) {
904 r = -errno;
905 goto finish;
906 }
1e4e94c8 907
d944dc95 908 if (make_slave) {
c2c13f2d
LP
909 /* Remount / as SLAVE so that nothing now mounted in the namespace
910 shows up in the parent */
d944dc95
LP
911 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
912 r = -errno;
913 goto finish;
914 }
ee818b89
AC
915 }
916
917 if (root_directory) {
8f1ad200 918 /* Turn directory into bind mount, if it isn't one yet */
e1873695 919 r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
8f1ad200 920 if (r < 0)
d944dc95 921 goto finish;
8f1ad200
LP
922 if (r == 0) {
923 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
924 r = -errno;
925 goto finish;
926 }
d944dc95 927 }
ee818b89 928 }
c2c13f2d 929
f0a4feb0 930 if (n_mounts > 0) {
6b7c9f8b
LP
931 char **blacklist;
932 unsigned j;
933
934 /* First round, add in all special mounts we need */
f0a4feb0 935 for (m = mounts; m < mounts + n_mounts; ++m) {
8fceda93 936 r = apply_mount(root_directory, m, tmp_dir, var_tmp_dir);
c2c13f2d 937 if (r < 0)
d944dc95 938 goto finish;
c2c13f2d 939 }
15ae422b 940
6b7c9f8b 941 /* Create a blacklist we can pass to bind_mount_recursive() */
f0a4feb0
DH
942 blacklist = newa(char*, n_mounts+1);
943 for (j = 0; j < n_mounts; j++)
34de407a 944 blacklist[j] = (char*) mount_entry_path(mounts+j);
6b7c9f8b
LP
945 blacklist[j] = NULL;
946
947 /* Second round, flip the ro bits if necessary. */
f0a4feb0 948 for (m = mounts; m < mounts + n_mounts; ++m) {
6b7c9f8b 949 r = make_read_only(m, blacklist);
c2c13f2d 950 if (r < 0)
d944dc95 951 goto finish;
c2c13f2d 952 }
15ae422b
LP
953 }
954
ee818b89
AC
955 if (root_directory) {
956 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
957 r = mount_move_root(root_directory);
d944dc95
LP
958 if (r < 0)
959 goto finish;
ee818b89
AC
960 }
961
c2c13f2d
LP
962 /* Remount / as the desired mode. Not that this will not
963 * reestablish propagation from our side to the host, since
964 * what's disconnected is disconnected. */
d944dc95
LP
965 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
966 r = -errno;
967 goto finish;
968 }
15ae422b 969
d944dc95 970 r = 0;
15ae422b 971
d944dc95 972finish:
f0a4feb0 973 for (m = mounts; m < mounts + n_mounts; m++)
1eb7e08e 974 mount_entry_done(m);
613b411c
LP
975
976 return r;
977}
978
d2d6c096
LP
979void bind_mount_free_many(BindMount *b, unsigned n) {
980 unsigned i;
981
982 assert(b || n == 0);
983
984 for (i = 0; i < n; i++) {
985 free(b[i].source);
986 free(b[i].destination);
987 }
988
989 free(b);
990}
991
992int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
993 _cleanup_free_ char *s = NULL, *d = NULL;
994 BindMount *c;
995
996 assert(b);
997 assert(n);
998 assert(item);
999
1000 s = strdup(item->source);
1001 if (!s)
1002 return -ENOMEM;
1003
1004 d = strdup(item->destination);
1005 if (!d)
1006 return -ENOMEM;
1007
1008 c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1009 if (!c)
1010 return -ENOMEM;
1011
1012 *b = c;
1013
1014 c[(*n) ++] = (BindMount) {
1015 .source = s,
1016 .destination = d,
1017 .read_only = item->read_only,
1018 .recursive = item->recursive,
1019 .ignore_enoent = item->ignore_enoent,
1020 };
1021
1022 s = d = NULL;
1023 return 0;
1024}
1025
613b411c
LP
1026static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1027 _cleanup_free_ char *x = NULL;
6b46ea73
LP
1028 char bid[SD_ID128_STRING_MAX];
1029 sd_id128_t boot_id;
1030 int r;
613b411c
LP
1031
1032 assert(id);
1033 assert(prefix);
1034 assert(path);
1035
6b46ea73
LP
1036 /* We include the boot id in the directory so that after a
1037 * reboot we can easily identify obsolete directories. */
1038
1039 r = sd_id128_get_boot(&boot_id);
1040 if (r < 0)
1041 return r;
1042
605405c6 1043 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
613b411c
LP
1044 if (!x)
1045 return -ENOMEM;
1046
1047 RUN_WITH_UMASK(0077)
1048 if (!mkdtemp(x))
1049 return -errno;
1050
1051 RUN_WITH_UMASK(0000) {
1052 char *y;
1053
63c372cb 1054 y = strjoina(x, "/tmp");
613b411c
LP
1055
1056 if (mkdir(y, 0777 | S_ISVTX) < 0)
1057 return -errno;
c17ec25e 1058 }
15ae422b 1059
613b411c
LP
1060 *path = x;
1061 x = NULL;
1062
1063 return 0;
1064}
1065
/* Sets up a pair of private temporary directories for id, one below /tmp and one below /var/tmp.
 *
 * On success the two newly allocated paths are returned in *tmp_dir and *var_tmp_dir and 0 is
 * returned. If the second directory cannot be set up, the first one is removed again, and the
 * error is returned without touching the output parameters. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: the inner "tmp" dir first, then its parent */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1095
1096int setup_netns(int netns_storage_socket[2]) {
1097 _cleanup_close_ int netns = -1;
3ee897d6 1098 int r, q;
613b411c
LP
1099
1100 assert(netns_storage_socket);
1101 assert(netns_storage_socket[0] >= 0);
1102 assert(netns_storage_socket[1] >= 0);
1103
1104 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
1105 * namespace reference fd. Whatever process runs this first
1106 * shall create a new namespace, all others should just join
1107 * it. To serialize that we use a file lock on the socket
1108 * pair.
613b411c
LP
1109 *
1110 * It's a bit crazy, but hey, works great! */
1111
1112 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1113 return -errno;
1114
3ee897d6
LP
1115 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1116 if (netns == -EAGAIN) {
613b411c
LP
1117 /* Nothing stored yet, so let's create a new namespace */
1118
1119 if (unshare(CLONE_NEWNET) < 0) {
1120 r = -errno;
1121 goto fail;
1122 }
1123
1124 loopback_setup();
1125
1126 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1127 if (netns < 0) {
1128 r = -errno;
1129 goto fail;
1130 }
1131
1132 r = 1;
613b411c 1133
3ee897d6
LP
1134 } else if (netns < 0) {
1135 r = netns;
1136 goto fail;
613b411c 1137
3ee897d6
LP
1138 } else {
1139 /* Yay, found something, so let's join the namespace */
613b411c
LP
1140 if (setns(netns, CLONE_NEWNET) < 0) {
1141 r = -errno;
1142 goto fail;
1143 }
1144
1145 r = 0;
1146 }
1147
3ee897d6
LP
1148 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1149 if (q < 0) {
1150 r = q;
613b411c
LP
1151 goto fail;
1152 }
1153
1154fail:
fe048ce5 1155 (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
15ae422b
LP
1156 return r;
1157}
417116f2 1158
1b8689f9
LP
1159static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1160 [PROTECT_HOME_NO] = "no",
1161 [PROTECT_HOME_YES] = "yes",
1162 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
1163};
1164
1b8689f9
LP
1165DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1166
1167static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1168 [PROTECT_SYSTEM_NO] = "no",
1169 [PROTECT_SYSTEM_YES] = "yes",
1170 [PROTECT_SYSTEM_FULL] = "full",
3f815163 1171 [PROTECT_SYSTEM_STRICT] = "strict",
1b8689f9
LP
1172};
1173
1174DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);