]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
treewide: use log_*_errno whenever %m is in the format string
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45 #include "label.h"
46
/* How a given path is treated inside the new mount namespace.
 *
 * The numeric order is the priority order: when the same path is listed
 * with several modes, mount_path_compare() sorts the smaller value first
 * and drop_duplicates() keeps only that first entry. */
typedef enum MountMode {
        INACCESSIBLE         = 0,
        READONLY             = 1,
        PRIVATE_TMP          = 2,
        PRIVATE_VAR_TMP      = 3,
        PRIVATE_DEV          = 4,
        PRIVATE_BUS_ENDPOINT = 5,
        READWRITE            = 6
} MountMode;
57
58 typedef struct BindMount {
59 const char *path;
60 MountMode mode;
61 bool done;
62 bool ignore;
63 } BindMount;
64
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
66 char **i;
67
68 assert(p);
69
70 STRV_FOREACH(i, strv) {
71
72 (*p)->ignore = false;
73 (*p)->done = false;
74
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
76 (*p)->ignore = true;
77 (*i)++;
78 }
79
80 if (!path_is_absolute(*i))
81 return -EINVAL;
82
83 (*p)->path = *i;
84 (*p)->mode = mode;
85 (*p)++;
86 }
87
88 return 0;
89 }
90
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
93
94 if (path_equal(p->path, q->path)) {
95
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
98 return -1;
99
100 if (p->mode > q->mode)
101 return 1;
102
103 return 0;
104 }
105
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
108 return 1;
109
110 if (path_startswith(q->path, p->path))
111 return -1;
112
113 return 0;
114 }
115
116 static void drop_duplicates(BindMount *m, unsigned *n) {
117 BindMount *f, *t, *previous;
118
119 assert(m);
120 assert(n);
121
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
123
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
126 continue;
127
128 *t = *f;
129
130 previous = t;
131
132 t++;
133 }
134
135 *n = t - m;
136 }
137
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
140 "/dev/null\0"
141 "/dev/zero\0"
142 "/dev/full\0"
143 "/dev/random\0"
144 "/dev/urandom\0"
145 "/dev/tty\0";
146
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
150 int r;
151
152 assert(m);
153
154 u = umask(0000);
155
156 if (!mkdtemp(temporary_mount))
157 return -errno;
158
159 dev = strappenda(temporary_mount, "/dev");
160 (void)mkdir(dev, 0755);
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
162 r = -errno;
163 goto fail;
164 }
165
166 devpts = strappenda(temporary_mount, "/dev/pts");
167 (void)mkdir(devpts, 0755);
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
169 r = -errno;
170 goto fail;
171 }
172
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
174 symlink("pts/ptmx", devptmx);
175
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 (void)mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
179 if (r < 0) {
180 r = -errno;
181 goto fail;
182 }
183
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 (void)mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
187
188 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
189 (void)mkdir(devhugepages, 0755);
190 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
191
192 devlog = strappenda(temporary_mount, "/dev/log");
193 symlink("/run/systemd/journal/dev-log", devlog);
194
195 NULSTR_FOREACH(d, devnodes) {
196 _cleanup_free_ char *dn = NULL;
197 struct stat st;
198
199 r = stat(d, &st);
200 if (r < 0) {
201
202 if (errno == ENOENT)
203 continue;
204
205 r = -errno;
206 goto fail;
207 }
208
209 if (!S_ISBLK(st.st_mode) &&
210 !S_ISCHR(st.st_mode)) {
211 r = -EINVAL;
212 goto fail;
213 }
214
215 if (st.st_rdev == 0)
216 continue;
217
218 dn = strappend(temporary_mount, d);
219 if (!dn) {
220 r = -ENOMEM;
221 goto fail;
222 }
223
224 mac_selinux_create_file_prepare(d, st.st_mode);
225 r = mknod(dn, st.st_mode, st.st_rdev);
226 mac_selinux_create_file_clear();
227
228 if (r < 0) {
229 r = -errno;
230 goto fail;
231 }
232 }
233
234 dev_setup(temporary_mount);
235
236 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
237 r = -errno;
238 goto fail;
239 }
240
241 rmdir(dev);
242 rmdir(temporary_mount);
243
244 return 0;
245
246 fail:
247 if (devpts)
248 umount(devpts);
249
250 if (devshm)
251 umount(devshm);
252
253 if (devhugepages)
254 umount(devhugepages);
255
256 if (devmqueue)
257 umount(devmqueue);
258
259 umount(dev);
260 rmdir(dev);
261 rmdir(temporary_mount);
262
263 return r;
264 }
265
266 static int mount_kdbus(BindMount *m) {
267
268 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
269 _cleanup_free_ char *basepath = NULL;
270 _cleanup_umask_ mode_t u;
271 char *busnode = NULL, *root;
272 struct stat st;
273 int r;
274
275 assert(m);
276
277 u = umask(0000);
278
279 if (!mkdtemp(temporary_mount)) {
280 log_error_errno(errno, "Failed create temp dir: %m");
281 return -errno;
282 }
283
284 root = strappenda(temporary_mount, "/kdbus");
285 (void)mkdir(root, 0755);
286 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
287 r = -errno;
288 goto fail;
289 }
290
291 /* create a new /dev/null dev node copy so we have some fodder to
292 * bind-mount the custom endpoint over. */
293 if (stat("/dev/null", &st) < 0) {
294 log_error_errno(errno, "Failed to stat /dev/null: %m");
295 r = -errno;
296 goto fail;
297 }
298
299 busnode = strappenda(root, "/bus");
300 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
301 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
302 r = -errno;
303 goto fail;
304 }
305
306 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
307 if (r < 0) {
308 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
309 r = -errno;
310 goto fail;
311 }
312
313 basepath = dirname_malloc(m->path);
314 if (!basepath) {
315 r = -ENOMEM;
316 goto fail;
317 }
318
319 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
320 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
321 r = -errno;
322 goto fail;
323 }
324
325 rmdir(temporary_mount);
326 return 0;
327
328 fail:
329 if (busnode) {
330 umount(busnode);
331 unlink(busnode);
332 }
333
334 umount(root);
335 rmdir(root);
336 rmdir(temporary_mount);
337
338 return r;
339 }
340
341 static int apply_mount(
342 BindMount *m,
343 const char *tmp_dir,
344 const char *var_tmp_dir) {
345
346 const char *what;
347 int r;
348
349 assert(m);
350
351 switch (m->mode) {
352
353 case INACCESSIBLE:
354
355 /* First, get rid of everything that is below if there
356 * is anything... Then, overmount it with an
357 * inaccessible directory. */
358 umount_recursive(m->path, 0);
359
360 what = "/run/systemd/inaccessible";
361 break;
362
363 case READONLY:
364 case READWRITE:
365 /* Nothing to mount here, we just later toggle the
366 * MS_RDONLY bit for the mount point */
367 return 0;
368
369 case PRIVATE_TMP:
370 what = tmp_dir;
371 break;
372
373 case PRIVATE_VAR_TMP:
374 what = var_tmp_dir;
375 break;
376
377 case PRIVATE_DEV:
378 return mount_dev(m);
379
380 case PRIVATE_BUS_ENDPOINT:
381 return mount_kdbus(m);
382
383 default:
384 assert_not_reached("Unknown mode");
385 }
386
387 assert(what);
388
389 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
390 if (r >= 0)
391 log_debug("Successfully mounted %s to %s", what, m->path);
392 else if (m->ignore && errno == ENOENT)
393 return 0;
394
395 return r;
396 }
397
398 static int make_read_only(BindMount *m) {
399 int r;
400
401 assert(m);
402
403 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
404 r = bind_remount_recursive(m->path, true);
405 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
406 r = bind_remount_recursive(m->path, false);
407 else
408 r = 0;
409
410 if (m->ignore && r == -ENOENT)
411 return 0;
412
413 return r;
414 }
415
416 int setup_namespace(
417 char** read_write_dirs,
418 char** read_only_dirs,
419 char** inaccessible_dirs,
420 const char* tmp_dir,
421 const char* var_tmp_dir,
422 const char* bus_endpoint_path,
423 bool private_dev,
424 ProtectHome protect_home,
425 ProtectSystem protect_system,
426 unsigned mount_flags) {
427
428 BindMount *m, *mounts = NULL;
429 unsigned n;
430 int r = 0;
431
432 if (mount_flags == 0)
433 mount_flags = MS_SHARED;
434
435 if (unshare(CLONE_NEWNS) < 0)
436 return -errno;
437
438 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
439 strv_length(read_write_dirs) +
440 strv_length(read_only_dirs) +
441 strv_length(inaccessible_dirs) +
442 private_dev +
443 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
444 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
445 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
446
447 if (n > 0) {
448 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
449 r = append_mounts(&m, read_write_dirs, READWRITE);
450 if (r < 0)
451 return r;
452
453 r = append_mounts(&m, read_only_dirs, READONLY);
454 if (r < 0)
455 return r;
456
457 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
458 if (r < 0)
459 return r;
460
461 if (tmp_dir) {
462 m->path = "/tmp";
463 m->mode = PRIVATE_TMP;
464 m++;
465 }
466
467 if (var_tmp_dir) {
468 m->path = "/var/tmp";
469 m->mode = PRIVATE_VAR_TMP;
470 m++;
471 }
472
473 if (private_dev) {
474 m->path = "/dev";
475 m->mode = PRIVATE_DEV;
476 m++;
477 }
478
479 if (bus_endpoint_path) {
480 m->path = bus_endpoint_path;
481 m->mode = PRIVATE_BUS_ENDPOINT;
482 m++;
483 }
484
485 if (protect_home != PROTECT_HOME_NO) {
486 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
487 if (r < 0)
488 return r;
489 }
490
491 if (protect_system != PROTECT_SYSTEM_NO) {
492 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
493 if (r < 0)
494 return r;
495 }
496
497 assert(mounts + n == m);
498
499 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
500 drop_duplicates(mounts, &n);
501 }
502
503 if (n > 0) {
504 /* Remount / as SLAVE so that nothing now mounted in the namespace
505 shows up in the parent */
506 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
507 return -errno;
508
509 for (m = mounts; m < mounts + n; ++m) {
510 r = apply_mount(m, tmp_dir, var_tmp_dir);
511 if (r < 0)
512 goto fail;
513 }
514
515 for (m = mounts; m < mounts + n; ++m) {
516 r = make_read_only(m);
517 if (r < 0)
518 goto fail;
519 }
520 }
521
522 /* Remount / as the desired mode. Not that this will not
523 * reestablish propagation from our side to the host, since
524 * what's disconnected is disconnected. */
525 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
526 r = -errno;
527 goto fail;
528 }
529
530 return 0;
531
532 fail:
533 if (n > 0) {
534 for (m = mounts; m < mounts + n; ++m)
535 if (m->done)
536 umount2(m->path, MNT_DETACH);
537 }
538
539 return r;
540 }
541
542 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
543 _cleanup_free_ char *x = NULL;
544 char bid[SD_ID128_STRING_MAX];
545 sd_id128_t boot_id;
546 int r;
547
548 assert(id);
549 assert(prefix);
550 assert(path);
551
552 /* We include the boot id in the directory so that after a
553 * reboot we can easily identify obsolete directories. */
554
555 r = sd_id128_get_boot(&boot_id);
556 if (r < 0)
557 return r;
558
559 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
560 if (!x)
561 return -ENOMEM;
562
563 RUN_WITH_UMASK(0077)
564 if (!mkdtemp(x))
565 return -errno;
566
567 RUN_WITH_UMASK(0000) {
568 char *y;
569
570 y = strappenda(x, "/tmp");
571
572 if (mkdir(y, 0777 | S_ISVTX) < 0)
573 return -errno;
574 }
575
576 *path = x;
577 x = NULL;
578
579 return 0;
580 }
581
/* Creates the pair of private temporary directories for the unit
 * identified by id, one below /tmp and one below /var/tmp.
 *
 * On success returns 0 and stores two newly allocated paths in *tmp_dir
 * and *var_tmp_dir. If the second directory cannot be created, the first
 * one is removed again and the error is returned; the output parameters
 * are then left untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the
         * directory itself, then the string holding its name. */
        {
                char *inner;

                inner = strappenda(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);

                free(private_tmp);
        }

        return r;
}
611
612 int setup_netns(int netns_storage_socket[2]) {
613 _cleanup_close_ int netns = -1;
614 union {
615 struct cmsghdr cmsghdr;
616 uint8_t buf[CMSG_SPACE(sizeof(int))];
617 } control = {};
618 struct msghdr mh = {
619 .msg_control = &control,
620 .msg_controllen = sizeof(control),
621 };
622 struct cmsghdr *cmsg;
623 int r;
624
625 assert(netns_storage_socket);
626 assert(netns_storage_socket[0] >= 0);
627 assert(netns_storage_socket[1] >= 0);
628
629 /* We use the passed socketpair as a storage buffer for our
630 * namespace reference fd. Whatever process runs this first
631 * shall create a new namespace, all others should just join
632 * it. To serialize that we use a file lock on the socket
633 * pair.
634 *
635 * It's a bit crazy, but hey, works great! */
636
637 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
638 return -errno;
639
640 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
641 if (errno != EAGAIN) {
642 r = -errno;
643 goto fail;
644 }
645
646 /* Nothing stored yet, so let's create a new namespace */
647
648 if (unshare(CLONE_NEWNET) < 0) {
649 r = -errno;
650 goto fail;
651 }
652
653 loopback_setup();
654
655 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
656 if (netns < 0) {
657 r = -errno;
658 goto fail;
659 }
660
661 r = 1;
662 } else {
663 /* Yay, found something, so let's join the namespace */
664
665 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
666 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
667 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
668 netns = *(int*) CMSG_DATA(cmsg);
669 }
670 }
671
672 if (setns(netns, CLONE_NEWNET) < 0) {
673 r = -errno;
674 goto fail;
675 }
676
677 r = 0;
678 }
679
680 cmsg = CMSG_FIRSTHDR(&mh);
681 cmsg->cmsg_level = SOL_SOCKET;
682 cmsg->cmsg_type = SCM_RIGHTS;
683 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
684 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
685 mh.msg_controllen = cmsg->cmsg_len;
686
687 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
688 r = -errno;
689 goto fail;
690 }
691
692 fail:
693 lockf(netns_storage_socket[0], F_ULOCK, 0);
694
695 return r;
696 }
697
698 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
699 [PROTECT_HOME_NO] = "no",
700 [PROTECT_HOME_YES] = "yes",
701 [PROTECT_HOME_READ_ONLY] = "read-only",
702 };
703
704 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
705
706 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
707 [PROTECT_SYSTEM_NO] = "no",
708 [PROTECT_SYSTEM_YES] = "yes",
709 [PROTECT_SYSTEM_FULL] = "full",
710 };
711
712 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);