]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
namespace: when setting up an inaccessible mount point, unmounting everything below
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
21
22#include <errno.h>
23#include <sys/mount.h>
24#include <string.h>
25#include <stdio.h>
26#include <unistd.h>
27#include <sys/stat.h>
28#include <sys/types.h>
29#include <sched.h>
30#include <sys/syscall.h>
31#include <limits.h>
25e870b5 32#include <linux/fs.h>
613b411c 33#include <sys/file.h>
15ae422b
LP
34
35#include "strv.h"
36#include "util.h"
9eb977db 37#include "path-util.h"
15ae422b
LP
38#include "namespace.h"
39#include "missing.h"
c17ec25e 40#include "execute.h"
613b411c 41#include "loopback-setup.h"
7f112f50
LP
42#include "mkdir.h"
43#include "dev-setup.h"
44#include "def.h"
15ae422b 45
/* How a configured path is treated inside the mount namespace.
 *
 * The numeric order is significant: mount_path_compare() sorts entries
 * that refer to the same path by this value, and drop_duplicates() then
 * keeps only the first one, so a lower value takes precedence. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE    = 0,
        READONLY        = 1,
        PRIVATE_TMP     = 2,
        PRIVATE_VAR_TMP = 3,
        PRIVATE_DEV     = 4,
        READWRITE       = 5
} MountMode;
15ae422b 55
c17ec25e 56typedef struct BindMount {
15ae422b 57 const char *path;
c17ec25e 58 MountMode mode;
ac0930c8 59 bool done;
ea92ae33 60 bool ignore;
c17ec25e 61} BindMount;
15ae422b 62
c17ec25e 63static int append_mounts(BindMount **p, char **strv, MountMode mode) {
15ae422b
LP
64 char **i;
65
613b411c
LP
66 assert(p);
67
15ae422b
LP
68 STRV_FOREACH(i, strv) {
69
ea92ae33
MW
70 (*p)->ignore = false;
71
94828d2d 72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
ea92ae33
MW
73 (*p)->ignore = true;
74 (*i)++;
75 }
76
15ae422b
LP
77 if (!path_is_absolute(*i))
78 return -EINVAL;
79
80 (*p)->path = *i;
81 (*p)->mode = mode;
82 (*p)++;
83 }
84
85 return 0;
86}
87
c17ec25e
MS
88static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
15ae422b
LP
90
91 if (path_equal(p->path, q->path)) {
92
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
95 return -1;
96
97 if (p->mode > q->mode)
98 return 1;
99
100 return 0;
101 }
102
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
105 return 1;
106
107 if (path_startswith(q->path, p->path))
108 return -1;
109
110 return 0;
111}
112
c17ec25e
MS
113static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
15ae422b 115
c17ec25e 116 assert(m);
15ae422b 117 assert(n);
15ae422b 118
c17ec25e 119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
15ae422b 120
ac0930c8 121 /* The first one wins */
15ae422b
LP
122 if (previous && path_equal(f->path, previous->path))
123 continue;
124
125 t->path = f->path;
126 t->mode = f->mode;
127
15ae422b
LP
128 previous = t;
129
130 t++;
131 }
132
c17ec25e 133 *n = t - m;
15ae422b
LP
134}
135
7f112f50
LP
136static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
138 "/dev/null\0"
139 "/dev/zero\0"
140 "/dev/full\0"
141 "/dev/random\0"
142 "/dev/urandom\0"
143 "/dev/tty\0";
144
2b85f4e1 145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
e06b6479 146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
7f112f50
LP
147 _cleanup_umask_ mode_t u;
148 int r;
149
150 assert(m);
151
152 u = umask(0000);
153
2b85f4e1
LP
154 if (!mkdtemp(temporary_mount))
155 return -errno;
156
157 dev = strappenda(temporary_mount, "/dev");
158 mkdir(dev, 0755);
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
160 r = -errno;
161 goto fail;
162 }
163
164 devpts = strappenda(temporary_mount, "/dev/pts");
165 mkdir(devpts, 0755);
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
167 r = -errno;
168 goto fail;
169 }
170
e06b6479
LP
171 devptmx = strappenda(temporary_mount, "/dev/ptmx");
172 symlink("pts/ptmx", devptmx);
173
2b85f4e1
LP
174 devshm = strappenda(temporary_mount, "/dev/shm");
175 mkdir(devshm, 01777);
176 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
177 if (r < 0) {
178 r = -errno;
179 goto fail;
180 }
181
182 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
183 mkdir(devmqueue, 0755);
184 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
185
186 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
187 mkdir(devkdbus, 0755);
188 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
189
190 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
191 mkdir(devhugepages, 0755);
192 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
193
82d25240
LP
194 devlog = strappenda(temporary_mount, "/dev/log");
195 symlink("/run/systemd/journal/dev-log", devlog);
196
7f112f50 197 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
198 _cleanup_free_ char *dn = NULL;
199 struct stat st;
200
201 r = stat(d, &st);
7f112f50 202 if (r < 0) {
2b85f4e1
LP
203
204 if (errno == ENOENT)
205 continue;
206
207 r = -errno;
208 goto fail;
7f112f50
LP
209 }
210
2b85f4e1
LP
211 if (!S_ISBLK(st.st_mode) &&
212 !S_ISCHR(st.st_mode)) {
213 r = -EINVAL;
214 goto fail;
215 }
216
217 if (st.st_rdev == 0)
218 continue;
219
220 dn = strappend(temporary_mount, d);
221 if (!dn) {
222 r = -ENOMEM;
223 goto fail;
224 }
225
226 r = mknod(dn, st.st_mode, st.st_rdev);
227 if (r < 0) {
228 r = -errno;
229 goto fail;
230 }
7f112f50
LP
231 }
232
2b85f4e1 233 dev_setup(temporary_mount);
7f112f50 234
2b85f4e1
LP
235 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
236 r = -errno;
237 goto fail;
238 }
7f112f50 239
2b85f4e1
LP
240 rmdir(dev);
241 rmdir(temporary_mount);
7f112f50 242
2b85f4e1 243 return 0;
7f112f50 244
2b85f4e1
LP
245fail:
246 if (devpts)
247 umount(devpts);
7f112f50 248
2b85f4e1
LP
249 if (devshm)
250 umount(devshm);
7f112f50 251
2b85f4e1
LP
252 if (devkdbus)
253 umount(devkdbus);
7f112f50 254
2b85f4e1
LP
255 if (devhugepages)
256 umount(devhugepages);
7f112f50 257
2b85f4e1
LP
258 if (devmqueue)
259 umount(devmqueue);
7f112f50 260
2b85f4e1
LP
261 if (dev) {
262 umount(dev);
263 rmdir(dev);
7f112f50
LP
264 }
265
2b85f4e1 266 rmdir(temporary_mount);
7f112f50 267
2b85f4e1 268 return r;
7f112f50
LP
269}
270
ac0930c8 271static int apply_mount(
c17ec25e 272 BindMount *m,
ac0930c8 273 const char *tmp_dir,
c17ec25e 274 const char *var_tmp_dir) {
ac0930c8 275
15ae422b 276 const char *what;
15ae422b 277 int r;
15ae422b 278
c17ec25e 279 assert(m);
15ae422b 280
c17ec25e 281 switch (m->mode) {
15ae422b 282
7f112f50
LP
283 case PRIVATE_DEV:
284 return mount_dev(m);
285
15ae422b 286 case INACCESSIBLE:
6d313367
LP
287
288 /* First, get rid of everything that is below if there
289 * is anything... Then, overmount it with an
290 * inaccessible directory. */
291 umount_recursive(m->path, 0);
292
c17ec25e 293 what = "/run/systemd/inaccessible";
15ae422b
LP
294 break;
295
296 case READONLY:
15ae422b 297 case READWRITE:
c17ec25e 298 what = m->path;
15ae422b
LP
299 break;
300
ac0930c8
LP
301 case PRIVATE_TMP:
302 what = tmp_dir;
303 break;
304
305 case PRIVATE_VAR_TMP:
306 what = var_tmp_dir;
15ae422b 307 break;
e364ad06
LP
308
309 default:
310 assert_not_reached("Unknown mode");
15ae422b
LP
311 }
312
ac0930c8 313 assert(what);
15ae422b 314
c17ec25e 315 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
ac0930c8 316 if (r >= 0)
c17ec25e 317 log_debug("Successfully mounted %s to %s", what, m->path);
ea92ae33
MW
318 else if (m->ignore && errno == ENOENT)
319 r = 0;
15ae422b 320
ac0930c8
LP
321 return r;
322}
15ae422b 323
c17ec25e 324static int make_read_only(BindMount *m) {
ac0930c8 325 int r;
15ae422b 326
c17ec25e 327 assert(m);
ac0930c8 328
c17ec25e 329 if (m->mode != INACCESSIBLE && m->mode != READONLY)
ac0930c8
LP
330 return 0;
331
c17ec25e 332 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
ea92ae33 333 if (r < 0 && !(m->ignore && errno == ENOENT))
ac0930c8
LP
334 return -errno;
335
336 return 0;
15ae422b
LP
337}
338
613b411c
LP
339int setup_namespace(
340 char** read_write_dirs,
341 char** read_only_dirs,
342 char** inaccessible_dirs,
343 char* tmp_dir,
344 char* var_tmp_dir,
7f112f50 345 bool private_dev,
1b8689f9
LP
346 ProtectHome protect_home,
347 ProtectSystem protect_system,
613b411c 348 unsigned mount_flags) {
15ae422b 349
7ff7394d 350 BindMount *m, *mounts = NULL;
613b411c 351 unsigned n;
c17ec25e 352 int r = 0;
15ae422b 353
613b411c 354 if (mount_flags == 0)
c17ec25e 355 mount_flags = MS_SHARED;
ac0930c8 356
d5a3f0ea
ZJS
357 if (unshare(CLONE_NEWNS) < 0)
358 return -errno;
15ae422b 359
613b411c
LP
360 n = !!tmp_dir + !!var_tmp_dir +
361 strv_length(read_write_dirs) +
362 strv_length(read_only_dirs) +
7f112f50 363 strv_length(inaccessible_dirs) +
417116f2 364 private_dev +
1b8689f9 365 (protect_home != PROTECT_HOME_NO ? 2 : 0) +
5331194c 366 (protect_system != PROTECT_SYSTEM_NO ? 1 : 0) +
1b8689f9 367 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
613b411c
LP
368
369 if (n > 0) {
7ff7394d 370 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
613b411c
LP
371 r = append_mounts(&m, read_write_dirs, READWRITE);
372 if (r < 0)
373 return r;
374
375 r = append_mounts(&m, read_only_dirs, READONLY);
376 if (r < 0)
377 return r;
378
379 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
380 if (r < 0)
7ff7394d
ZJS
381 return r;
382
613b411c 383 if (tmp_dir) {
7ff7394d
ZJS
384 m->path = "/tmp";
385 m->mode = PRIVATE_TMP;
386 m++;
613b411c 387 }
7ff7394d 388
613b411c 389 if (var_tmp_dir) {
7ff7394d
ZJS
390 m->path = "/var/tmp";
391 m->mode = PRIVATE_VAR_TMP;
392 m++;
393 }
ac0930c8 394
7f112f50
LP
395 if (private_dev) {
396 m->path = "/dev";
397 m->mode = PRIVATE_DEV;
398 m++;
399 }
400
1b8689f9
LP
401 if (protect_home != PROTECT_HOME_NO) {
402 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
417116f2
LP
403 if (r < 0)
404 return r;
405 }
406
1b8689f9 407 if (protect_system != PROTECT_SYSTEM_NO) {
5331194c 408 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "/etc") : STRV_MAKE("/usr"), READONLY);
417116f2
LP
409 if (r < 0)
410 return r;
411 }
412
7ff7394d 413 assert(mounts + n == m);
ac0930c8 414
7ff7394d
ZJS
415 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
416 drop_duplicates(mounts, &n);
15ae422b
LP
417 }
418
c2c13f2d
LP
419 if (n > 0) {
420 /* Remount / as SLAVE so that nothing now mounted in the namespace
421 shows up in the parent */
422 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
423 return -errno;
424
425 for (m = mounts; m < mounts + n; ++m) {
426 r = apply_mount(m, tmp_dir, var_tmp_dir);
427 if (r < 0)
428 goto fail;
429 }
15ae422b 430
c2c13f2d
LP
431 for (m = mounts; m < mounts + n; ++m) {
432 r = make_read_only(m);
433 if (r < 0)
434 goto fail;
435 }
15ae422b
LP
436 }
437
c2c13f2d
LP
438 /* Remount / as the desired mode. Not that this will not
439 * reestablish propagation from our side to the host, since
440 * what's disconnected is disconnected. */
c17ec25e 441 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
15ae422b 442 r = -errno;
613b411c 443 goto fail;
15ae422b
LP
444 }
445
15ae422b
LP
446 return 0;
447
613b411c 448fail:
c2c13f2d
LP
449 if (n > 0) {
450 for (m = mounts; m < mounts + n; ++m)
451 if (m->done)
452 umount2(m->path, MNT_DETACH);
453 }
613b411c
LP
454
455 return r;
456}
457
458static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
459 _cleanup_free_ char *x = NULL;
6b46ea73
LP
460 char bid[SD_ID128_STRING_MAX];
461 sd_id128_t boot_id;
462 int r;
613b411c
LP
463
464 assert(id);
465 assert(prefix);
466 assert(path);
467
6b46ea73
LP
468 /* We include the boot id in the directory so that after a
469 * reboot we can easily identify obsolete directories. */
470
471 r = sd_id128_get_boot(&boot_id);
472 if (r < 0)
473 return r;
474
475 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
613b411c
LP
476 if (!x)
477 return -ENOMEM;
478
479 RUN_WITH_UMASK(0077)
480 if (!mkdtemp(x))
481 return -errno;
482
483 RUN_WITH_UMASK(0000) {
484 char *y;
485
486 y = strappenda(x, "/tmp");
487
488 if (mkdir(y, 0777 | S_ISVTX) < 0)
489 return -errno;
c17ec25e 490 }
15ae422b 491
613b411c
LP
492 *path = x;
493 x = NULL;
494
495 return 0;
496}
497
/* Creates the pair of private temporary directories for a unit, one below
 * /tmp and one below /var/tmp.
 *
 * id:          identifier passed on to setup_one_tmp_dir().
 * tmp_dir:     receives the allocated path of the directory below /tmp.
 * var_tmp_dir: receives the allocated path of the one below /var/tmp.
 *
 * Returns 0 on success. On failure a negative value is returned, the
 * output parameters are left untouched and a directory already created
 * below /tmp is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second directory failed: roll back the first one */
        inner = strappenda(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
527
528int setup_netns(int netns_storage_socket[2]) {
529 _cleanup_close_ int netns = -1;
530 union {
531 struct cmsghdr cmsghdr;
532 uint8_t buf[CMSG_SPACE(sizeof(int))];
533 } control = {};
534 struct msghdr mh = {
535 .msg_control = &control,
536 .msg_controllen = sizeof(control),
537 };
538 struct cmsghdr *cmsg;
539 int r;
540
541 assert(netns_storage_socket);
542 assert(netns_storage_socket[0] >= 0);
543 assert(netns_storage_socket[1] >= 0);
544
545 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
546 * namespace reference fd. Whatever process runs this first
547 * shall create a new namespace, all others should just join
548 * it. To serialize that we use a file lock on the socket
549 * pair.
613b411c
LP
550 *
551 * It's a bit crazy, but hey, works great! */
552
553 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
554 return -errno;
555
556 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
557 if (errno != EAGAIN) {
558 r = -errno;
559 goto fail;
560 }
561
562 /* Nothing stored yet, so let's create a new namespace */
563
564 if (unshare(CLONE_NEWNET) < 0) {
565 r = -errno;
566 goto fail;
567 }
568
569 loopback_setup();
570
571 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
572 if (netns < 0) {
573 r = -errno;
574 goto fail;
575 }
576
577 r = 1;
578 } else {
579 /* Yay, found something, so let's join the namespace */
580
581 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
582 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
583 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
584 netns = *(int*) CMSG_DATA(cmsg);
585 }
586 }
587
588 if (setns(netns, CLONE_NEWNET) < 0) {
589 r = -errno;
590 goto fail;
591 }
592
593 r = 0;
594 }
595
596 cmsg = CMSG_FIRSTHDR(&mh);
597 cmsg->cmsg_level = SOL_SOCKET;
598 cmsg->cmsg_type = SCM_RIGHTS;
599 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
600 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
601 mh.msg_controllen = cmsg->cmsg_len;
602
603 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
604 r = -errno;
605 goto fail;
606 }
607
608fail:
609 lockf(netns_storage_socket[0], F_ULOCK, 0);
610
15ae422b
LP
611 return r;
612}
417116f2 613
1b8689f9
LP
614static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
615 [PROTECT_HOME_NO] = "no",
616 [PROTECT_HOME_YES] = "yes",
617 [PROTECT_HOME_READ_ONLY] = "read-only",
417116f2
LP
618};
619
1b8689f9
LP
620DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
621
622static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
623 [PROTECT_SYSTEM_NO] = "no",
624 [PROTECT_SYSTEM_YES] = "yes",
625 [PROTECT_SYSTEM_FULL] = "full",
626};
627
628DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);