]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
tree-wide: make use of log_error_errno() return value
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sched.h>
29 #include <linux/fs.h>
30
31 #include "strv.h"
32 #include "util.h"
33 #include "path-util.h"
34 #include "missing.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
39 #include "mkdir.h"
40
/* How a path inside the new mount namespace is to be treated. When several
 * entries name the same path, mount_path_compare() sorts the lower enumerator
 * first and drop_duplicates() keeps only that first entry, so earlier values
 * take precedence over later ones. */
41 typedef enum MountMode {
42 /* This is ordered by priority! */
43 INACCESSIBLE, /* overmounted with /run/systemd/inaccessible, then remounted read-only */
44 READONLY, /* nothing mounted; the existing mount is remounted read-only */
45 PRIVATE_TMP, /* bind mount of the tmp_dir passed to setup_namespace() */
46 PRIVATE_VAR_TMP, /* bind mount of the var_tmp_dir passed to setup_namespace() */
47 PRIVATE_DEV, /* freshly populated tmpfs, see mount_dev() */
48 PRIVATE_BUS_ENDPOINT, /* custom bus endpoint node, see mount_kdbus() */
49 READWRITE /* nothing mounted; the existing mount is remounted without the read-only flag */
50 } MountMode;
51
/* One entry of the mount table that setup_namespace() builds, sorts and applies. */
52 typedef struct BindMount {
53 const char *path; /* mount point; append_mounts() rejects paths that are not absolute */
54 MountMode mode; /* how the path is to be treated */
55 bool done; /* cleared by append_mounts(); only read by the rollback loop in setup_namespace() */
56 bool ignore; /* set when the configured path had a '-' prefix: ENOENT on it is tolerated */
57 } BindMount;
58
59 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
60 char **i;
61
62 assert(p);
63
64 STRV_FOREACH(i, strv) {
65
66 (*p)->ignore = false;
67 (*p)->done = false;
68
69 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
70 (*p)->ignore = true;
71 (*i)++;
72 }
73
74 if (!path_is_absolute(*i))
75 return -EINVAL;
76
77 (*p)->path = *i;
78 (*p)->mode = mode;
79 (*p)++;
80 }
81
82 return 0;
83 }
84
85 static int mount_path_compare(const void *a, const void *b) {
86 const BindMount *p = a, *q = b;
87 int d;
88
89 d = path_compare(p->path, q->path);
90
91 if (d == 0) {
92 /* If the paths are equal, check the mode */
93 if (p->mode < q->mode)
94 return -1;
95
96 if (p->mode > q->mode)
97 return 1;
98
99 return 0;
100 }
101
102 /* If the paths are not equal, then order prefixes first */
103 return d;
104 }
105
106 static void drop_duplicates(BindMount *m, unsigned *n) {
107 BindMount *f, *t, *previous;
108
109 assert(m);
110 assert(n);
111
112 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
113
114 /* The first one wins */
115 if (previous && path_equal(f->path, previous->path))
116 continue;
117
118 *t = *f;
119
120 previous = t;
121
122 t++;
123 }
124
125 *n = t - m;
126 }
127
128 static int mount_dev(BindMount *m) {
129 static const char devnodes[] =
130 "/dev/null\0"
131 "/dev/zero\0"
132 "/dev/full\0"
133 "/dev/random\0"
134 "/dev/urandom\0"
135 "/dev/tty\0";
136
137 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
138 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
139 _cleanup_umask_ mode_t u;
140 int r;
141
142 assert(m);
143
144 u = umask(0000);
145
146 if (!mkdtemp(temporary_mount))
147 return -errno;
148
149 dev = strjoina(temporary_mount, "/dev");
150 (void) mkdir(dev, 0755);
151 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
152 r = -errno;
153 goto fail;
154 }
155
156 devpts = strjoina(temporary_mount, "/dev/pts");
157 (void) mkdir(devpts, 0755);
158 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
159 r = -errno;
160 goto fail;
161 }
162
163 devptmx = strjoina(temporary_mount, "/dev/ptmx");
164 if (symlink("pts/ptmx", devptmx) < 0) {
165 r = -errno;
166 goto fail;
167 }
168
169 devshm = strjoina(temporary_mount, "/dev/shm");
170 (void) mkdir(devshm, 01777);
171 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
172 if (r < 0) {
173 r = -errno;
174 goto fail;
175 }
176
177 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
178 (void) mkdir(devmqueue, 0755);
179 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
180
181 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
182 (void) mkdir(devhugepages, 0755);
183 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
184
185 devlog = strjoina(temporary_mount, "/dev/log");
186 (void) symlink("/run/systemd/journal/dev-log", devlog);
187
188 NULSTR_FOREACH(d, devnodes) {
189 _cleanup_free_ char *dn = NULL;
190 struct stat st;
191
192 r = stat(d, &st);
193 if (r < 0) {
194
195 if (errno == ENOENT)
196 continue;
197
198 r = -errno;
199 goto fail;
200 }
201
202 if (!S_ISBLK(st.st_mode) &&
203 !S_ISCHR(st.st_mode)) {
204 r = -EINVAL;
205 goto fail;
206 }
207
208 if (st.st_rdev == 0)
209 continue;
210
211 dn = strappend(temporary_mount, d);
212 if (!dn) {
213 r = -ENOMEM;
214 goto fail;
215 }
216
217 mac_selinux_create_file_prepare(d, st.st_mode);
218 r = mknod(dn, st.st_mode, st.st_rdev);
219 mac_selinux_create_file_clear();
220
221 if (r < 0) {
222 r = -errno;
223 goto fail;
224 }
225 }
226
227 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
228
229 /* Create the /dev directory if missing. It is more likely to be
230 * missing when the service is started with RootDirectory. This is
231 * consistent with mount units creating the mount points when missing.
232 */
233 (void) mkdir_p_label(m->path, 0755);
234
235 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
236 r = -errno;
237 goto fail;
238 }
239
240 rmdir(dev);
241 rmdir(temporary_mount);
242
243 return 0;
244
245 fail:
246 if (devpts)
247 umount(devpts);
248
249 if (devshm)
250 umount(devshm);
251
252 if (devhugepages)
253 umount(devhugepages);
254
255 if (devmqueue)
256 umount(devmqueue);
257
258 umount(dev);
259 rmdir(dev);
260 rmdir(temporary_mount);
261
262 return r;
263 }
264
265 static int mount_kdbus(BindMount *m) {
266
267 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
268 _cleanup_free_ char *basepath = NULL;
269 _cleanup_umask_ mode_t u;
270 char *busnode = NULL, *root;
271 struct stat st;
272 int r;
273
274 assert(m);
275
276 u = umask(0000);
277
278 if (!mkdtemp(temporary_mount))
279 return log_error_errno(errno, "Failed create temp dir: %m");
280
281 root = strjoina(temporary_mount, "/kdbus");
282 (void) mkdir(root, 0755);
283 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
284 r = -errno;
285 goto fail;
286 }
287
288 /* create a new /dev/null dev node copy so we have some fodder to
289 * bind-mount the custom endpoint over. */
290 if (stat("/dev/null", &st) < 0) {
291 r = log_error_errno(errno, "Failed to stat /dev/null: %m");
292 goto fail;
293 }
294
295 busnode = strjoina(root, "/bus");
296 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
297 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
298 r = -errno;
299 goto fail;
300 }
301
302 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
303 if (r < 0) {
304 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
305 r = -errno;
306 goto fail;
307 }
308
309 basepath = dirname_malloc(m->path);
310 if (!basepath) {
311 r = -ENOMEM;
312 goto fail;
313 }
314
315 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
316 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
317 r = -errno;
318 goto fail;
319 }
320
321 rmdir(temporary_mount);
322 return 0;
323
324 fail:
325 if (busnode) {
326 umount(busnode);
327 unlink(busnode);
328 }
329
330 umount(root);
331 rmdir(root);
332 rmdir(temporary_mount);
333
334 return r;
335 }
336
337 static int apply_mount(
338 BindMount *m,
339 const char *tmp_dir,
340 const char *var_tmp_dir) {
341
342 const char *what;
343 int r;
344
345 assert(m);
346
347 switch (m->mode) {
348
349 case INACCESSIBLE:
350
351 /* First, get rid of everything that is below if there
352 * is anything... Then, overmount it with an
353 * inaccessible directory. */
354 umount_recursive(m->path, 0);
355
356 what = "/run/systemd/inaccessible";
357 break;
358
359 case READONLY:
360 case READWRITE:
361 /* Nothing to mount here, we just later toggle the
362 * MS_RDONLY bit for the mount point */
363 return 0;
364
365 case PRIVATE_TMP:
366 what = tmp_dir;
367 break;
368
369 case PRIVATE_VAR_TMP:
370 what = var_tmp_dir;
371 break;
372
373 case PRIVATE_DEV:
374 return mount_dev(m);
375
376 case PRIVATE_BUS_ENDPOINT:
377 return mount_kdbus(m);
378
379 default:
380 assert_not_reached("Unknown mode");
381 }
382
383 assert(what);
384
385 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
386 if (r >= 0)
387 log_debug("Successfully mounted %s to %s", what, m->path);
388 else if (m->ignore && errno == ENOENT)
389 return 0;
390
391 return r;
392 }
393
394 static int make_read_only(BindMount *m) {
395 int r;
396
397 assert(m);
398
399 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
400 r = bind_remount_recursive(m->path, true);
401 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
402 r = bind_remount_recursive(m->path, false);
403 else
404 r = 0;
405
406 if (m->ignore && r == -ENOENT)
407 return 0;
408
409 return r;
410 }
411
412 int setup_namespace(
413 const char* root_directory,
414 char** read_write_dirs,
415 char** read_only_dirs,
416 char** inaccessible_dirs,
417 const char* tmp_dir,
418 const char* var_tmp_dir,
419 const char* bus_endpoint_path,
420 bool private_dev,
421 ProtectHome protect_home,
422 ProtectSystem protect_system,
423 unsigned long mount_flags) {
424
425 BindMount *m, *mounts = NULL;
426 unsigned n;
427 int r = 0;
428
429 if (mount_flags == 0)
430 mount_flags = MS_SHARED;
431
432 if (unshare(CLONE_NEWNS) < 0)
433 return -errno;
434
435 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
436 strv_length(read_write_dirs) +
437 strv_length(read_only_dirs) +
438 strv_length(inaccessible_dirs) +
439 private_dev +
440 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
441 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
442 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
443
444 if (n > 0) {
445 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
446 r = append_mounts(&m, read_write_dirs, READWRITE);
447 if (r < 0)
448 return r;
449
450 r = append_mounts(&m, read_only_dirs, READONLY);
451 if (r < 0)
452 return r;
453
454 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
455 if (r < 0)
456 return r;
457
458 if (tmp_dir) {
459 m->path = prefix_roota(root_directory, "/tmp");
460 m->mode = PRIVATE_TMP;
461 m++;
462 }
463
464 if (var_tmp_dir) {
465 m->path = prefix_roota(root_directory, "/var/tmp");
466 m->mode = PRIVATE_VAR_TMP;
467 m++;
468 }
469
470 if (private_dev) {
471 m->path = prefix_roota(root_directory, "/dev");
472 m->mode = PRIVATE_DEV;
473 m++;
474 }
475
476 if (bus_endpoint_path) {
477 m->path = prefix_roota(root_directory, bus_endpoint_path);
478 m->mode = PRIVATE_BUS_ENDPOINT;
479 m++;
480 }
481
482 if (protect_home != PROTECT_HOME_NO) {
483 const char *home_dir, *run_user_dir, *root_dir;
484
485 home_dir = prefix_roota(root_directory, "/home");
486 home_dir = strjoina("-", home_dir);
487 run_user_dir = prefix_roota(root_directory, "/run/user");
488 run_user_dir = strjoina("-", run_user_dir);
489 root_dir = prefix_roota(root_directory, "/root");
490 root_dir = strjoina("-", root_dir);
491
492 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
493 protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
494 if (r < 0)
495 return r;
496 }
497
498 if (protect_system != PROTECT_SYSTEM_NO) {
499 const char *usr_dir, *boot_dir, *etc_dir;
500
501 usr_dir = prefix_roota(root_directory, "/usr");
502 boot_dir = prefix_roota(root_directory, "/boot");
503 boot_dir = strjoina("-", boot_dir);
504 etc_dir = prefix_roota(root_directory, "/etc");
505
506 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
507 ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
508 : STRV_MAKE(usr_dir, boot_dir), READONLY);
509 if (r < 0)
510 return r;
511 }
512
513 assert(mounts + n == m);
514
515 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
516 drop_duplicates(mounts, &n);
517 }
518
519 if (n > 0 || root_directory) {
520 /* Remount / as SLAVE so that nothing now mounted in the namespace
521 shows up in the parent */
522 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
523 return -errno;
524 }
525
526 if (root_directory) {
527 /* Turn directory into bind mount */
528 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
529 return -errno;
530 }
531
532 if (n > 0) {
533 for (m = mounts; m < mounts + n; ++m) {
534 r = apply_mount(m, tmp_dir, var_tmp_dir);
535 if (r < 0)
536 goto fail;
537 }
538
539 for (m = mounts; m < mounts + n; ++m) {
540 r = make_read_only(m);
541 if (r < 0)
542 goto fail;
543 }
544 }
545
546 if (root_directory) {
547 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
548 r = mount_move_root(root_directory);
549
550 /* at this point, we cannot rollback */
551 if (r < 0)
552 return r;
553 }
554
555 /* Remount / as the desired mode. Not that this will not
556 * reestablish propagation from our side to the host, since
557 * what's disconnected is disconnected. */
558 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
559 /* at this point, we cannot rollback */
560 return -errno;
561 }
562
563 return 0;
564
565 fail:
566 if (n > 0) {
567 for (m = mounts; m < mounts + n; ++m)
568 if (m->done)
569 (void) umount2(m->path, MNT_DETACH);
570 }
571
572 return r;
573 }
574
575 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
576 _cleanup_free_ char *x = NULL;
577 char bid[SD_ID128_STRING_MAX];
578 sd_id128_t boot_id;
579 int r;
580
581 assert(id);
582 assert(prefix);
583 assert(path);
584
585 /* We include the boot id in the directory so that after a
586 * reboot we can easily identify obsolete directories. */
587
588 r = sd_id128_get_boot(&boot_id);
589 if (r < 0)
590 return r;
591
592 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
593 if (!x)
594 return -ENOMEM;
595
596 RUN_WITH_UMASK(0077)
597 if (!mkdtemp(x))
598 return -errno;
599
600 RUN_WITH_UMASK(0000) {
601 char *y;
602
603 y = strjoina(x, "/tmp");
604
605 if (mkdir(y, 0777 | S_ISVTX) < 0)
606 return -errno;
607 }
608
609 *path = x;
610 x = NULL;
611
612 return 0;
613 }
614
/* Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp, via setup_one_tmp_dir().
 *
 * On success both names are stored in *tmp_dir and *var_tmp_dir (newly
 * allocated, owned by the caller) and 0 is returned. If the second directory
 * cannot be created, the first one is removed again and the error returned;
 * the output arguments are then left untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second directory failed: take the first one apart again, inner
         * directory first */
        {
                char *inner;

                inner = strjoina(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
644
645 int setup_netns(int netns_storage_socket[2]) {
646 _cleanup_close_ int netns = -1;
647 union {
648 struct cmsghdr cmsghdr;
649 uint8_t buf[CMSG_SPACE(sizeof(int))];
650 } control = {};
651 struct msghdr mh = {
652 .msg_control = &control,
653 .msg_controllen = sizeof(control),
654 };
655 struct cmsghdr *cmsg;
656 int r;
657
658 assert(netns_storage_socket);
659 assert(netns_storage_socket[0] >= 0);
660 assert(netns_storage_socket[1] >= 0);
661
662 /* We use the passed socketpair as a storage buffer for our
663 * namespace reference fd. Whatever process runs this first
664 * shall create a new namespace, all others should just join
665 * it. To serialize that we use a file lock on the socket
666 * pair.
667 *
668 * It's a bit crazy, but hey, works great! */
669
670 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
671 return -errno;
672
673 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
674 if (errno != EAGAIN) {
675 r = -errno;
676 goto fail;
677 }
678
679 /* Nothing stored yet, so let's create a new namespace */
680
681 if (unshare(CLONE_NEWNET) < 0) {
682 r = -errno;
683 goto fail;
684 }
685
686 loopback_setup();
687
688 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
689 if (netns < 0) {
690 r = -errno;
691 goto fail;
692 }
693
694 r = 1;
695 } else {
696 /* Yay, found something, so let's join the namespace */
697
698 CMSG_FOREACH(cmsg, &mh)
699 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
700 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
701 netns = *(int*) CMSG_DATA(cmsg);
702 }
703
704 if (setns(netns, CLONE_NEWNET) < 0) {
705 r = -errno;
706 goto fail;
707 }
708
709 r = 0;
710 }
711
712 cmsg = CMSG_FIRSTHDR(&mh);
713 cmsg->cmsg_level = SOL_SOCKET;
714 cmsg->cmsg_type = SCM_RIGHTS;
715 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
716 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
717 mh.msg_controllen = cmsg->cmsg_len;
718
719 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
720 r = -errno;
721 goto fail;
722 }
723
724 fail:
725 lockf(netns_storage_socket[0], F_ULOCK, 0);
726
727 return r;
728 }
729
/* Names of the ProtectHome values, indexed by enumerator. The table is handed
 * to DEFINE_STRING_TABLE_LOOKUP() below, which presumably generates the
 * string<->enum conversion helpers for the "protect_home" prefix (confirm
 * against the macro's definition). */
730 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
731 [PROTECT_HOME_NO] = "no",
732 [PROTECT_HOME_YES] = "yes",
733 [PROTECT_HOME_READ_ONLY] = "read-only",
734 };
735
736 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
737
/* Names of the ProtectSystem values, indexed by enumerator. The table is
 * handed to DEFINE_STRING_TABLE_LOOKUP() below, which presumably generates the
 * string<->enum conversion helpers for the "protect_system" prefix (confirm
 * against the macro's definition). */
738 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
739 [PROTECT_SYSTEM_NO] = "no",
740 [PROTECT_SYSTEM_YES] = "yes",
741 [PROTECT_SYSTEM_FULL] = "full",
742 };
743
744 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);