]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
Merge pull request #143 from teg/networkd-packets-per-slave-mode
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sched.h>
29 #include <linux/fs.h>
30
31 #include "strv.h"
32 #include "util.h"
33 #include "path-util.h"
34 #include "missing.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
39 #include "mkdir.h"
40
/* How a single path is treated when the mount namespace is set up. */
typedef enum MountMode {
        /* This is ordered by priority! Entries for the same path are
         * sorted by this value and only the first one is kept, so a
         * lower value overrides a higher one. */

        /* Overmounted with /run/systemd/inaccessible, then made read-only */
        INACCESSIBLE,

        /* Left in place, recursively remounted read-only */
        READONLY,

        /* Private temporary directory bind mounted over /tmp */
        PRIVATE_TMP,

        /* Private temporary directory bind mounted over /var/tmp */
        PRIVATE_VAR_TMP,

        /* Freshly populated tmpfs moved over /dev */
        PRIVATE_DEV,

        /* Custom kdbus endpoint mounted in place of the bus node */
        PRIVATE_BUS_ENDPOINT,

        /* Left in place, recursively remounted writable */
        READWRITE
} MountMode;
51
52 typedef struct BindMount {
53 const char *path;
54 MountMode mode;
55 bool done;
56 bool ignore;
57 } BindMount;
58
59 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
60 char **i;
61
62 assert(p);
63
64 STRV_FOREACH(i, strv) {
65
66 (*p)->ignore = false;
67 (*p)->done = false;
68
69 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
70 (*p)->ignore = true;
71 (*i)++;
72 }
73
74 if (!path_is_absolute(*i))
75 return -EINVAL;
76
77 (*p)->path = *i;
78 (*p)->mode = mode;
79 (*p)++;
80 }
81
82 return 0;
83 }
84
85 static int mount_path_compare(const void *a, const void *b) {
86 const BindMount *p = a, *q = b;
87 int d;
88
89 d = path_compare(p->path, q->path);
90
91 if (d == 0) {
92 /* If the paths are equal, check the mode */
93 if (p->mode < q->mode)
94 return -1;
95
96 if (p->mode > q->mode)
97 return 1;
98
99 return 0;
100 }
101
102 /* If the paths are not equal, then order prefixes first */
103 return d;
104 }
105
106 static void drop_duplicates(BindMount *m, unsigned *n) {
107 BindMount *f, *t, *previous;
108
109 assert(m);
110 assert(n);
111
112 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
113
114 /* The first one wins */
115 if (previous && path_equal(f->path, previous->path))
116 continue;
117
118 *t = *f;
119
120 previous = t;
121
122 t++;
123 }
124
125 *n = t - m;
126 }
127
128 static int mount_dev(BindMount *m) {
129 static const char devnodes[] =
130 "/dev/null\0"
131 "/dev/zero\0"
132 "/dev/full\0"
133 "/dev/random\0"
134 "/dev/urandom\0"
135 "/dev/tty\0";
136
137 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
138 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
139 _cleanup_umask_ mode_t u;
140 int r;
141
142 assert(m);
143
144 u = umask(0000);
145
146 if (!mkdtemp(temporary_mount))
147 return -errno;
148
149 dev = strjoina(temporary_mount, "/dev");
150 (void) mkdir(dev, 0755);
151 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
152 r = -errno;
153 goto fail;
154 }
155
156 devpts = strjoina(temporary_mount, "/dev/pts");
157 (void) mkdir(devpts, 0755);
158 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
159 r = -errno;
160 goto fail;
161 }
162
163 devptmx = strjoina(temporary_mount, "/dev/ptmx");
164 if (symlink("pts/ptmx", devptmx) < 0) {
165 r = -errno;
166 goto fail;
167 }
168
169 devshm = strjoina(temporary_mount, "/dev/shm");
170 (void) mkdir(devshm, 01777);
171 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
172 if (r < 0) {
173 r = -errno;
174 goto fail;
175 }
176
177 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
178 (void) mkdir(devmqueue, 0755);
179 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
180
181 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
182 (void) mkdir(devhugepages, 0755);
183 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
184
185 devlog = strjoina(temporary_mount, "/dev/log");
186 (void) symlink("/run/systemd/journal/dev-log", devlog);
187
188 NULSTR_FOREACH(d, devnodes) {
189 _cleanup_free_ char *dn = NULL;
190 struct stat st;
191
192 r = stat(d, &st);
193 if (r < 0) {
194
195 if (errno == ENOENT)
196 continue;
197
198 r = -errno;
199 goto fail;
200 }
201
202 if (!S_ISBLK(st.st_mode) &&
203 !S_ISCHR(st.st_mode)) {
204 r = -EINVAL;
205 goto fail;
206 }
207
208 if (st.st_rdev == 0)
209 continue;
210
211 dn = strappend(temporary_mount, d);
212 if (!dn) {
213 r = -ENOMEM;
214 goto fail;
215 }
216
217 mac_selinux_create_file_prepare(d, st.st_mode);
218 r = mknod(dn, st.st_mode, st.st_rdev);
219 mac_selinux_create_file_clear();
220
221 if (r < 0) {
222 r = -errno;
223 goto fail;
224 }
225 }
226
227 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
228
229 /* Create the /dev directory if missing. It is more likely to be
230 * missing when the service is started with RootDirectory. This is
231 * consistent with mount units creating the mount points when missing.
232 */
233 (void) mkdir_p_label(m->path, 0755);
234
235 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
236 r = -errno;
237 goto fail;
238 }
239
240 rmdir(dev);
241 rmdir(temporary_mount);
242
243 return 0;
244
245 fail:
246 if (devpts)
247 umount(devpts);
248
249 if (devshm)
250 umount(devshm);
251
252 if (devhugepages)
253 umount(devhugepages);
254
255 if (devmqueue)
256 umount(devmqueue);
257
258 umount(dev);
259 rmdir(dev);
260 rmdir(temporary_mount);
261
262 return r;
263 }
264
265 static int mount_kdbus(BindMount *m) {
266
267 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
268 _cleanup_free_ char *basepath = NULL;
269 _cleanup_umask_ mode_t u;
270 char *busnode = NULL, *root;
271 struct stat st;
272 int r;
273
274 assert(m);
275
276 u = umask(0000);
277
278 if (!mkdtemp(temporary_mount))
279 return log_error_errno(errno, "Failed create temp dir: %m");
280
281 root = strjoina(temporary_mount, "/kdbus");
282 (void) mkdir(root, 0755);
283 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
284 r = -errno;
285 goto fail;
286 }
287
288 /* create a new /dev/null dev node copy so we have some fodder to
289 * bind-mount the custom endpoint over. */
290 if (stat("/dev/null", &st) < 0) {
291 log_error_errno(errno, "Failed to stat /dev/null: %m");
292 r = -errno;
293 goto fail;
294 }
295
296 busnode = strjoina(root, "/bus");
297 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
298 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
299 r = -errno;
300 goto fail;
301 }
302
303 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
304 if (r < 0) {
305 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
306 r = -errno;
307 goto fail;
308 }
309
310 basepath = dirname_malloc(m->path);
311 if (!basepath) {
312 r = -ENOMEM;
313 goto fail;
314 }
315
316 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
317 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
318 r = -errno;
319 goto fail;
320 }
321
322 rmdir(temporary_mount);
323 return 0;
324
325 fail:
326 if (busnode) {
327 umount(busnode);
328 unlink(busnode);
329 }
330
331 umount(root);
332 rmdir(root);
333 rmdir(temporary_mount);
334
335 return r;
336 }
337
338 static int apply_mount(
339 BindMount *m,
340 const char *tmp_dir,
341 const char *var_tmp_dir) {
342
343 const char *what;
344 int r;
345
346 assert(m);
347
348 switch (m->mode) {
349
350 case INACCESSIBLE:
351
352 /* First, get rid of everything that is below if there
353 * is anything... Then, overmount it with an
354 * inaccessible directory. */
355 umount_recursive(m->path, 0);
356
357 what = "/run/systemd/inaccessible";
358 break;
359
360 case READONLY:
361 case READWRITE:
362 /* Nothing to mount here, we just later toggle the
363 * MS_RDONLY bit for the mount point */
364 return 0;
365
366 case PRIVATE_TMP:
367 what = tmp_dir;
368 break;
369
370 case PRIVATE_VAR_TMP:
371 what = var_tmp_dir;
372 break;
373
374 case PRIVATE_DEV:
375 return mount_dev(m);
376
377 case PRIVATE_BUS_ENDPOINT:
378 return mount_kdbus(m);
379
380 default:
381 assert_not_reached("Unknown mode");
382 }
383
384 assert(what);
385
386 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
387 if (r >= 0)
388 log_debug("Successfully mounted %s to %s", what, m->path);
389 else if (m->ignore && errno == ENOENT)
390 return 0;
391
392 return r;
393 }
394
395 static int make_read_only(BindMount *m) {
396 int r;
397
398 assert(m);
399
400 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
401 r = bind_remount_recursive(m->path, true);
402 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
403 r = bind_remount_recursive(m->path, false);
404 else
405 r = 0;
406
407 if (m->ignore && r == -ENOENT)
408 return 0;
409
410 return r;
411 }
412
413 int setup_namespace(
414 const char* root_directory,
415 char** read_write_dirs,
416 char** read_only_dirs,
417 char** inaccessible_dirs,
418 const char* tmp_dir,
419 const char* var_tmp_dir,
420 const char* bus_endpoint_path,
421 bool private_dev,
422 ProtectHome protect_home,
423 ProtectSystem protect_system,
424 unsigned long mount_flags) {
425
426 BindMount *m, *mounts = NULL;
427 unsigned n;
428 int r = 0;
429
430 if (mount_flags == 0)
431 mount_flags = MS_SHARED;
432
433 if (unshare(CLONE_NEWNS) < 0)
434 return -errno;
435
436 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
437 strv_length(read_write_dirs) +
438 strv_length(read_only_dirs) +
439 strv_length(inaccessible_dirs) +
440 private_dev +
441 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
442 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
443 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
444
445 if (n > 0) {
446 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
447 r = append_mounts(&m, read_write_dirs, READWRITE);
448 if (r < 0)
449 return r;
450
451 r = append_mounts(&m, read_only_dirs, READONLY);
452 if (r < 0)
453 return r;
454
455 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
456 if (r < 0)
457 return r;
458
459 if (tmp_dir) {
460 m->path = prefix_roota(root_directory, "/tmp");
461 m->mode = PRIVATE_TMP;
462 m++;
463 }
464
465 if (var_tmp_dir) {
466 m->path = prefix_roota(root_directory, "/var/tmp");
467 m->mode = PRIVATE_VAR_TMP;
468 m++;
469 }
470
471 if (private_dev) {
472 m->path = prefix_roota(root_directory, "/dev");
473 m->mode = PRIVATE_DEV;
474 m++;
475 }
476
477 if (bus_endpoint_path) {
478 m->path = prefix_roota(root_directory, bus_endpoint_path);
479 m->mode = PRIVATE_BUS_ENDPOINT;
480 m++;
481 }
482
483 if (protect_home != PROTECT_HOME_NO) {
484 const char *home_dir, *run_user_dir, *root_dir;
485
486 home_dir = prefix_roota(root_directory, "/home");
487 home_dir = strjoina("-", home_dir);
488 run_user_dir = prefix_roota(root_directory, "/run/user");
489 run_user_dir = strjoina("-", run_user_dir);
490 root_dir = prefix_roota(root_directory, "/root");
491 root_dir = strjoina("-", root_dir);
492
493 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
494 protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
495 if (r < 0)
496 return r;
497 }
498
499 if (protect_system != PROTECT_SYSTEM_NO) {
500 const char *usr_dir, *boot_dir, *etc_dir;
501
502 usr_dir = prefix_roota(root_directory, "/usr");
503 boot_dir = prefix_roota(root_directory, "/boot");
504 boot_dir = strjoina("-", boot_dir);
505 etc_dir = prefix_roota(root_directory, "/etc");
506
507 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
508 ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
509 : STRV_MAKE(usr_dir, boot_dir), READONLY);
510 if (r < 0)
511 return r;
512 }
513
514 assert(mounts + n == m);
515
516 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
517 drop_duplicates(mounts, &n);
518 }
519
520 if (n > 0 || root_directory) {
521 /* Remount / as SLAVE so that nothing now mounted in the namespace
522 shows up in the parent */
523 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
524 return -errno;
525 }
526
527 if (root_directory) {
528 /* Turn directory into bind mount */
529 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
530 return -errno;
531 }
532
533 if (n > 0) {
534 for (m = mounts; m < mounts + n; ++m) {
535 r = apply_mount(m, tmp_dir, var_tmp_dir);
536 if (r < 0)
537 goto fail;
538 }
539
540 for (m = mounts; m < mounts + n; ++m) {
541 r = make_read_only(m);
542 if (r < 0)
543 goto fail;
544 }
545 }
546
547 if (root_directory) {
548 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
549 r = mount_move_root(root_directory);
550
551 /* at this point, we cannot rollback */
552 if (r < 0)
553 return r;
554 }
555
556 /* Remount / as the desired mode. Not that this will not
557 * reestablish propagation from our side to the host, since
558 * what's disconnected is disconnected. */
559 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
560 /* at this point, we cannot rollback */
561 return -errno;
562 }
563
564 return 0;
565
566 fail:
567 if (n > 0) {
568 for (m = mounts; m < mounts + n; ++m)
569 if (m->done)
570 (void) umount2(m->path, MNT_DETACH);
571 }
572
573 return r;
574 }
575
576 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
577 _cleanup_free_ char *x = NULL;
578 char bid[SD_ID128_STRING_MAX];
579 sd_id128_t boot_id;
580 int r;
581
582 assert(id);
583 assert(prefix);
584 assert(path);
585
586 /* We include the boot id in the directory so that after a
587 * reboot we can easily identify obsolete directories. */
588
589 r = sd_id128_get_boot(&boot_id);
590 if (r < 0)
591 return r;
592
593 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
594 if (!x)
595 return -ENOMEM;
596
597 RUN_WITH_UMASK(0077)
598 if (!mkdtemp(x))
599 return -errno;
600
601 RUN_WITH_UMASK(0000) {
602 char *y;
603
604 y = strjoina(x, "/tmp");
605
606 if (mkdir(y, 0777 | S_ISVTX) < 0)
607 return -errno;
608 }
609
610 *path = x;
611 x = NULL;
612
613 return 0;
614 }
615
/* Creates the private directories that stand in for /tmp and /var/tmp.
 *
 * id           identifier that becomes part of the directory names
 * tmp_dir      receives the name of the directory created below /tmp
 * var_tmp_dir  receives the name of the directory created below /var/tmp
 *
 * Both names are newly allocated and are only returned if both
 * directories could be created; if the second one fails, the first one
 * is removed again.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: the inner "tmp" has to be
         * removed before its parent. */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
645
646 int setup_netns(int netns_storage_socket[2]) {
647 _cleanup_close_ int netns = -1;
648 union {
649 struct cmsghdr cmsghdr;
650 uint8_t buf[CMSG_SPACE(sizeof(int))];
651 } control = {};
652 struct msghdr mh = {
653 .msg_control = &control,
654 .msg_controllen = sizeof(control),
655 };
656 struct cmsghdr *cmsg;
657 int r;
658
659 assert(netns_storage_socket);
660 assert(netns_storage_socket[0] >= 0);
661 assert(netns_storage_socket[1] >= 0);
662
663 /* We use the passed socketpair as a storage buffer for our
664 * namespace reference fd. Whatever process runs this first
665 * shall create a new namespace, all others should just join
666 * it. To serialize that we use a file lock on the socket
667 * pair.
668 *
669 * It's a bit crazy, but hey, works great! */
670
671 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
672 return -errno;
673
674 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
675 if (errno != EAGAIN) {
676 r = -errno;
677 goto fail;
678 }
679
680 /* Nothing stored yet, so let's create a new namespace */
681
682 if (unshare(CLONE_NEWNET) < 0) {
683 r = -errno;
684 goto fail;
685 }
686
687 loopback_setup();
688
689 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
690 if (netns < 0) {
691 r = -errno;
692 goto fail;
693 }
694
695 r = 1;
696 } else {
697 /* Yay, found something, so let's join the namespace */
698
699 CMSG_FOREACH(cmsg, &mh)
700 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
701 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
702 netns = *(int*) CMSG_DATA(cmsg);
703 }
704
705 if (setns(netns, CLONE_NEWNET) < 0) {
706 r = -errno;
707 goto fail;
708 }
709
710 r = 0;
711 }
712
713 cmsg = CMSG_FIRSTHDR(&mh);
714 cmsg->cmsg_level = SOL_SOCKET;
715 cmsg->cmsg_type = SCM_RIGHTS;
716 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
717 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
718 mh.msg_controllen = cmsg->cmsg_len;
719
720 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
721 r = -errno;
722 goto fail;
723 }
724
725 fail:
726 lockf(netns_storage_socket[0], F_ULOCK, 0);
727
728 return r;
729 }
730
731 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
732 [PROTECT_HOME_NO] = "no",
733 [PROTECT_HOME_YES] = "yes",
734 [PROTECT_HOME_READ_ONLY] = "read-only",
735 };
736
737 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
738
739 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
740 [PROTECT_SYSTEM_NO] = "no",
741 [PROTECT_SYSTEM_YES] = "yes",
742 [PROTECT_SYSTEM_FULL] = "full",
743 };
744
745 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);