]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
namespace: add support for custom kdbus endpoint
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45 #include "label.h"
46
/* The kind of operation a BindMount entry requests.
 *
 * The declaration order doubles as the priority order: entries that
 * refer to the same path are sorted by this value, so do not reorder
 * the enumerators. */
typedef enum MountMode {
        INACCESSIBLE,           /* overmount with an inaccessible directory */
        READONLY,               /* remount read-only */
        PRIVATE_TMP,            /* bind-mount the private /tmp */
        PRIVATE_VAR_TMP,        /* bind-mount the private /var/tmp */
        PRIVATE_DEV,            /* replace /dev with a minimal private copy */
        PRIVATE_BUS_ENDPOINT,   /* mount a custom kdbus endpoint */
        READWRITE               /* remount read-write */
} MountMode;
57
58 typedef struct BindMount {
59 const char *path;
60 MountMode mode;
61 bool done;
62 bool ignore;
63 } BindMount;
64
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
66 char **i;
67
68 assert(p);
69
70 STRV_FOREACH(i, strv) {
71
72 (*p)->ignore = false;
73 (*p)->done = false;
74
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
76 (*p)->ignore = true;
77 (*i)++;
78 }
79
80 if (!path_is_absolute(*i))
81 return -EINVAL;
82
83 (*p)->path = *i;
84 (*p)->mode = mode;
85 (*p)++;
86 }
87
88 return 0;
89 }
90
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
93
94 if (path_equal(p->path, q->path)) {
95
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
98 return -1;
99
100 if (p->mode > q->mode)
101 return 1;
102
103 return 0;
104 }
105
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
108 return 1;
109
110 if (path_startswith(q->path, p->path))
111 return -1;
112
113 return 0;
114 }
115
116 static void drop_duplicates(BindMount *m, unsigned *n) {
117 BindMount *f, *t, *previous;
118
119 assert(m);
120 assert(n);
121
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
123
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
126 continue;
127
128 *t = *f;
129
130 previous = t;
131
132 t++;
133 }
134
135 *n = t - m;
136 }
137
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
140 "/dev/null\0"
141 "/dev/zero\0"
142 "/dev/full\0"
143 "/dev/random\0"
144 "/dev/urandom\0"
145 "/dev/tty\0";
146
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
150 int r;
151
152 assert(m);
153
154 u = umask(0000);
155
156 if (!mkdtemp(temporary_mount))
157 return -errno;
158
159 dev = strappenda(temporary_mount, "/dev");
160 mkdir(dev, 0755);
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
162 r = -errno;
163 goto fail;
164 }
165
166 devpts = strappenda(temporary_mount, "/dev/pts");
167 mkdir(devpts, 0755);
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
169 r = -errno;
170 goto fail;
171 }
172
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
174 symlink("pts/ptmx", devptmx);
175
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
179 if (r < 0) {
180 r = -errno;
181 goto fail;
182 }
183
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
187
188 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
189 mkdir(devkdbus, 0755);
190 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
191
192 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
193 mkdir(devhugepages, 0755);
194 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
195
196 devlog = strappenda(temporary_mount, "/dev/log");
197 symlink("/run/systemd/journal/dev-log", devlog);
198
199 NULSTR_FOREACH(d, devnodes) {
200 _cleanup_free_ char *dn = NULL;
201 struct stat st;
202
203 r = stat(d, &st);
204 if (r < 0) {
205
206 if (errno == ENOENT)
207 continue;
208
209 r = -errno;
210 goto fail;
211 }
212
213 if (!S_ISBLK(st.st_mode) &&
214 !S_ISCHR(st.st_mode)) {
215 r = -EINVAL;
216 goto fail;
217 }
218
219 if (st.st_rdev == 0)
220 continue;
221
222 dn = strappend(temporary_mount, d);
223 if (!dn) {
224 r = -ENOMEM;
225 goto fail;
226 }
227
228 label_context_set(d, st.st_mode);
229 r = mknod(dn, st.st_mode, st.st_rdev);
230 label_context_clear();
231
232 if (r < 0) {
233 r = -errno;
234 goto fail;
235 }
236 }
237
238 dev_setup(temporary_mount);
239
240 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
241 r = -errno;
242 goto fail;
243 }
244
245 rmdir(dev);
246 rmdir(temporary_mount);
247
248 return 0;
249
250 fail:
251 if (devpts)
252 umount(devpts);
253
254 if (devshm)
255 umount(devshm);
256
257 if (devkdbus)
258 umount(devkdbus);
259
260 if (devhugepages)
261 umount(devhugepages);
262
263 if (devmqueue)
264 umount(devmqueue);
265
266 if (dev) {
267 umount(dev);
268 rmdir(dev);
269 }
270
271 rmdir(temporary_mount);
272
273 return r;
274 }
275
276 static int mount_kdbus(BindMount *m) {
277
278 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
279 _cleanup_free_ char *basepath = NULL;
280 _cleanup_umask_ mode_t u;
281 char *busnode, *root;
282 struct stat st;
283 int r;
284
285 assert(m);
286
287 u = umask(0000);
288
289 if (!mkdtemp(temporary_mount)) {
290 log_error("Failed create temp dir: %m");
291 return -errno;
292 }
293
294 root = strappenda(temporary_mount, "/kdbus");
295 mkdir(root, 0755);
296 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
297 r = -errno;
298 goto fail;
299 }
300
301 /* create a new /dev/null dev node copy so we have some fodder to
302 * bind-mount the custom endpoint over. */
303 if (stat("/dev/null", &st) < 0) {
304 log_error("Failed to stat /dev/null: %m");
305 r = -errno;
306 goto fail;
307 }
308
309 busnode = strappenda(root, "/bus");
310 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
311 log_error("mknod() for %s failed: %m", busnode);
312 r = -errno;
313 goto fail;
314 }
315
316 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
317 if (r < 0) {
318 log_error("bind mount of %s failed: %m", m->path);
319 r = -errno;
320 goto fail;
321 }
322
323 basepath = dirname_malloc(m->path);
324 if (!basepath) {
325 r = -ENOMEM;
326 goto fail;
327 }
328
329 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
330 log_error("bind mount of %s failed: %m", basepath);
331 r = -errno;
332 goto fail;
333 }
334
335 rmdir(temporary_mount);
336 return 0;
337
338 fail:
339 if (busnode) {
340 umount(busnode);
341 unlink(busnode);
342 }
343
344 if (root) {
345 umount(root);
346 rmdir(root);
347 }
348
349 rmdir(temporary_mount);
350
351 return r;
352 }
353
354 static int apply_mount(
355 BindMount *m,
356 const char *tmp_dir,
357 const char *var_tmp_dir) {
358
359 const char *what;
360 int r;
361
362 assert(m);
363
364 switch (m->mode) {
365
366 case INACCESSIBLE:
367
368 /* First, get rid of everything that is below if there
369 * is anything... Then, overmount it with an
370 * inaccessible directory. */
371 umount_recursive(m->path, 0);
372
373 what = "/run/systemd/inaccessible";
374 break;
375
376 case READONLY:
377 case READWRITE:
378 /* Nothing to mount here, we just later toggle the
379 * MS_RDONLY bit for the mount point */
380 return 0;
381
382 case PRIVATE_TMP:
383 what = tmp_dir;
384 break;
385
386 case PRIVATE_VAR_TMP:
387 what = var_tmp_dir;
388 break;
389
390 case PRIVATE_DEV:
391 return mount_dev(m);
392
393 case PRIVATE_BUS_ENDPOINT:
394 return mount_kdbus(m);
395
396 default:
397 assert_not_reached("Unknown mode");
398 }
399
400 assert(what);
401
402 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
403 if (r >= 0)
404 log_debug("Successfully mounted %s to %s", what, m->path);
405 else if (m->ignore && errno == ENOENT)
406 return 0;
407
408 return r;
409 }
410
411 static int make_read_only(BindMount *m) {
412 int r;
413
414 assert(m);
415
416 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
417 r = bind_remount_recursive(m->path, true);
418 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
419 r = bind_remount_recursive(m->path, false);
420 else
421 r = 0;
422
423 if (m->ignore && r == -ENOENT)
424 return 0;
425
426 return r;
427 }
428
429 int setup_namespace(
430 char** read_write_dirs,
431 char** read_only_dirs,
432 char** inaccessible_dirs,
433 char* tmp_dir,
434 char* var_tmp_dir,
435 char* bus_endpoint_path,
436 bool private_dev,
437 ProtectHome protect_home,
438 ProtectSystem protect_system,
439 unsigned mount_flags) {
440
441 BindMount *m, *mounts = NULL;
442 unsigned n;
443 int r = 0;
444
445 if (mount_flags == 0)
446 mount_flags = MS_SHARED;
447
448 if (unshare(CLONE_NEWNS) < 0)
449 return -errno;
450
451 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
452 strv_length(read_write_dirs) +
453 strv_length(read_only_dirs) +
454 strv_length(inaccessible_dirs) +
455 private_dev +
456 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
457 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
458 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
459
460 if (n > 0) {
461 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
462 r = append_mounts(&m, read_write_dirs, READWRITE);
463 if (r < 0)
464 return r;
465
466 r = append_mounts(&m, read_only_dirs, READONLY);
467 if (r < 0)
468 return r;
469
470 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
471 if (r < 0)
472 return r;
473
474 if (tmp_dir) {
475 m->path = "/tmp";
476 m->mode = PRIVATE_TMP;
477 m++;
478 }
479
480 if (var_tmp_dir) {
481 m->path = "/var/tmp";
482 m->mode = PRIVATE_VAR_TMP;
483 m++;
484 }
485
486 if (private_dev) {
487 m->path = "/dev";
488 m->mode = PRIVATE_DEV;
489 m++;
490 }
491
492 if (bus_endpoint_path) {
493 m->path = bus_endpoint_path;
494 m->mode = PRIVATE_BUS_ENDPOINT;
495 m++;
496 }
497
498 if (protect_home != PROTECT_HOME_NO) {
499 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
500 if (r < 0)
501 return r;
502 }
503
504 if (protect_system != PROTECT_SYSTEM_NO) {
505 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
506 if (r < 0)
507 return r;
508 }
509
510 assert(mounts + n == m);
511
512 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
513 drop_duplicates(mounts, &n);
514 }
515
516 if (n > 0) {
517 /* Remount / as SLAVE so that nothing now mounted in the namespace
518 shows up in the parent */
519 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
520 return -errno;
521
522 for (m = mounts; m < mounts + n; ++m) {
523 r = apply_mount(m, tmp_dir, var_tmp_dir);
524 if (r < 0)
525 goto fail;
526 }
527
528 for (m = mounts; m < mounts + n; ++m) {
529 r = make_read_only(m);
530 if (r < 0)
531 goto fail;
532 }
533 }
534
535 /* Remount / as the desired mode. Not that this will not
536 * reestablish propagation from our side to the host, since
537 * what's disconnected is disconnected. */
538 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
539 r = -errno;
540 goto fail;
541 }
542
543 return 0;
544
545 fail:
546 if (n > 0) {
547 for (m = mounts; m < mounts + n; ++m)
548 if (m->done)
549 umount2(m->path, MNT_DETACH);
550 }
551
552 return r;
553 }
554
555 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
556 _cleanup_free_ char *x = NULL;
557 char bid[SD_ID128_STRING_MAX];
558 sd_id128_t boot_id;
559 int r;
560
561 assert(id);
562 assert(prefix);
563 assert(path);
564
565 /* We include the boot id in the directory so that after a
566 * reboot we can easily identify obsolete directories. */
567
568 r = sd_id128_get_boot(&boot_id);
569 if (r < 0)
570 return r;
571
572 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
573 if (!x)
574 return -ENOMEM;
575
576 RUN_WITH_UMASK(0077)
577 if (!mkdtemp(x))
578 return -errno;
579
580 RUN_WITH_UMASK(0000) {
581 char *y;
582
583 y = strappenda(x, "/tmp");
584
585 if (mkdir(y, 0777 | S_ISVTX) < 0)
586 return -errno;
587 }
588
589 *path = x;
590 x = NULL;
591
592 return 0;
593 }
594
/* Creates the pair of private temporary directories for id, one below
 * /tmp and one below /var/tmp.
 *
 * On success both newly allocated paths are stored in *tmp_dir and
 * *var_tmp_dir and 0 is returned. If the second directory cannot be
 * created, the first one is removed again and the error is returned;
 * the output parameters are not touched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the /tmp half: inner directory first, then the
         * outer one, then the string itself. */
        inner = strappenda(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
624
625 int setup_netns(int netns_storage_socket[2]) {
626 _cleanup_close_ int netns = -1;
627 union {
628 struct cmsghdr cmsghdr;
629 uint8_t buf[CMSG_SPACE(sizeof(int))];
630 } control = {};
631 struct msghdr mh = {
632 .msg_control = &control,
633 .msg_controllen = sizeof(control),
634 };
635 struct cmsghdr *cmsg;
636 int r;
637
638 assert(netns_storage_socket);
639 assert(netns_storage_socket[0] >= 0);
640 assert(netns_storage_socket[1] >= 0);
641
642 /* We use the passed socketpair as a storage buffer for our
643 * namespace reference fd. Whatever process runs this first
644 * shall create a new namespace, all others should just join
645 * it. To serialize that we use a file lock on the socket
646 * pair.
647 *
648 * It's a bit crazy, but hey, works great! */
649
650 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
651 return -errno;
652
653 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
654 if (errno != EAGAIN) {
655 r = -errno;
656 goto fail;
657 }
658
659 /* Nothing stored yet, so let's create a new namespace */
660
661 if (unshare(CLONE_NEWNET) < 0) {
662 r = -errno;
663 goto fail;
664 }
665
666 loopback_setup();
667
668 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
669 if (netns < 0) {
670 r = -errno;
671 goto fail;
672 }
673
674 r = 1;
675 } else {
676 /* Yay, found something, so let's join the namespace */
677
678 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
679 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
680 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
681 netns = *(int*) CMSG_DATA(cmsg);
682 }
683 }
684
685 if (setns(netns, CLONE_NEWNET) < 0) {
686 r = -errno;
687 goto fail;
688 }
689
690 r = 0;
691 }
692
693 cmsg = CMSG_FIRSTHDR(&mh);
694 cmsg->cmsg_level = SOL_SOCKET;
695 cmsg->cmsg_type = SCM_RIGHTS;
696 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
697 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
698 mh.msg_controllen = cmsg->cmsg_len;
699
700 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
701 r = -errno;
702 goto fail;
703 }
704
705 fail:
706 lockf(netns_storage_socket[0], F_ULOCK, 0);
707
708 return r;
709 }
710
711 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
712 [PROTECT_HOME_NO] = "no",
713 [PROTECT_HOME_YES] = "yes",
714 [PROTECT_HOME_READ_ONLY] = "read-only",
715 };
716
717 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
718
719 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
720 [PROTECT_SYSTEM_NO] = "no",
721 [PROTECT_SYSTEM_YES] = "yes",
722 [PROTECT_SYSTEM_FULL] = "full",
723 };
724
725 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);