]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
util-lib: split our string related calls from util.[ch] into its own file string...
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sched.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <sys/mount.h>
27 #include <sys/stat.h>
28 #include <unistd.h>
29 #include <linux/fs.h>
30
31 #include "dev-setup.h"
32 #include "loopback-setup.h"
33 #include "missing.h"
34 #include "mkdir.h"
35 #include "path-util.h"
36 #include "selinux-util.h"
37 #include "string-util.h"
38 #include "strv.h"
39 #include "util.h"
40 #include "namespace.h"
41
/* What kind of mount a BindMount entry asks for.
 *
 * The numeric order matters: mount_path_compare() uses it as the tie-breaker
 * for entries that name the same path, and drop_duplicates() then keeps only
 * the first of those, so the enumerator with the smaller value wins. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,           /* overmounted with /run/systemd/inaccessible */
        READONLY,               /* left in place, remounted read-only */
        PRIVATE_TMP,            /* bind mount of the private tmp directory */
        PRIVATE_VAR_TMP,        /* bind mount of the private var-tmp directory */
        PRIVATE_DEV,            /* freshly populated device directory, see mount_dev() */
        PRIVATE_BUS_ENDPOINT,   /* custom bus endpoint, see mount_kdbus() */
        READWRITE,              /* left in place, remounted writable */
} MountMode;
52
/* One entry of the mount list that setup_namespace() builds, sorts and applies. */
typedef struct BindMount {
        const char *path;       /* mount point the entry applies to */
        MountMode mode;         /* what to do at that mount point */
        bool done;              /* consulted by setup_namespace()'s failure path before it unmounts the path */
        bool ignore;            /* set when the configured path had a leading '-'; ENOENT is then tolerated */
} BindMount;
59
60 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
61 char **i;
62
63 assert(p);
64
65 STRV_FOREACH(i, strv) {
66
67 (*p)->ignore = false;
68 (*p)->done = false;
69
70 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
71 (*p)->ignore = true;
72 (*i)++;
73 }
74
75 if (!path_is_absolute(*i))
76 return -EINVAL;
77
78 (*p)->path = *i;
79 (*p)->mode = mode;
80 (*p)++;
81 }
82
83 return 0;
84 }
85
86 static int mount_path_compare(const void *a, const void *b) {
87 const BindMount *p = a, *q = b;
88 int d;
89
90 d = path_compare(p->path, q->path);
91
92 if (d == 0) {
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
95 return -1;
96
97 if (p->mode > q->mode)
98 return 1;
99
100 return 0;
101 }
102
103 /* If the paths are not equal, then order prefixes first */
104 return d;
105 }
106
107 static void drop_duplicates(BindMount *m, unsigned *n) {
108 BindMount *f, *t, *previous;
109
110 assert(m);
111 assert(n);
112
113 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
114
115 /* The first one wins */
116 if (previous && path_equal(f->path, previous->path))
117 continue;
118
119 *t = *f;
120
121 previous = t;
122
123 t++;
124 }
125
126 *n = t - m;
127 }
128
129 static int mount_dev(BindMount *m) {
130 static const char devnodes[] =
131 "/dev/null\0"
132 "/dev/zero\0"
133 "/dev/full\0"
134 "/dev/random\0"
135 "/dev/urandom\0"
136 "/dev/tty\0";
137
138 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
139 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
140 _cleanup_umask_ mode_t u;
141 int r;
142
143 assert(m);
144
145 u = umask(0000);
146
147 if (!mkdtemp(temporary_mount))
148 return -errno;
149
150 dev = strjoina(temporary_mount, "/dev");
151 (void) mkdir(dev, 0755);
152 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
153 r = -errno;
154 goto fail;
155 }
156
157 devpts = strjoina(temporary_mount, "/dev/pts");
158 (void) mkdir(devpts, 0755);
159 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
160 r = -errno;
161 goto fail;
162 }
163
164 devptmx = strjoina(temporary_mount, "/dev/ptmx");
165 if (symlink("pts/ptmx", devptmx) < 0) {
166 r = -errno;
167 goto fail;
168 }
169
170 devshm = strjoina(temporary_mount, "/dev/shm");
171 (void) mkdir(devshm, 01777);
172 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
173 if (r < 0) {
174 r = -errno;
175 goto fail;
176 }
177
178 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
179 (void) mkdir(devmqueue, 0755);
180 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
181
182 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
183 (void) mkdir(devhugepages, 0755);
184 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
185
186 devlog = strjoina(temporary_mount, "/dev/log");
187 (void) symlink("/run/systemd/journal/dev-log", devlog);
188
189 NULSTR_FOREACH(d, devnodes) {
190 _cleanup_free_ char *dn = NULL;
191 struct stat st;
192
193 r = stat(d, &st);
194 if (r < 0) {
195
196 if (errno == ENOENT)
197 continue;
198
199 r = -errno;
200 goto fail;
201 }
202
203 if (!S_ISBLK(st.st_mode) &&
204 !S_ISCHR(st.st_mode)) {
205 r = -EINVAL;
206 goto fail;
207 }
208
209 if (st.st_rdev == 0)
210 continue;
211
212 dn = strappend(temporary_mount, d);
213 if (!dn) {
214 r = -ENOMEM;
215 goto fail;
216 }
217
218 mac_selinux_create_file_prepare(d, st.st_mode);
219 r = mknod(dn, st.st_mode, st.st_rdev);
220 mac_selinux_create_file_clear();
221
222 if (r < 0) {
223 r = -errno;
224 goto fail;
225 }
226 }
227
228 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
229
230 /* Create the /dev directory if missing. It is more likely to be
231 * missing when the service is started with RootDirectory. This is
232 * consistent with mount units creating the mount points when missing.
233 */
234 (void) mkdir_p_label(m->path, 0755);
235
236 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
237 r = -errno;
238 goto fail;
239 }
240
241 rmdir(dev);
242 rmdir(temporary_mount);
243
244 return 0;
245
246 fail:
247 if (devpts)
248 umount(devpts);
249
250 if (devshm)
251 umount(devshm);
252
253 if (devhugepages)
254 umount(devhugepages);
255
256 if (devmqueue)
257 umount(devmqueue);
258
259 umount(dev);
260 rmdir(dev);
261 rmdir(temporary_mount);
262
263 return r;
264 }
265
266 static int mount_kdbus(BindMount *m) {
267
268 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
269 _cleanup_free_ char *basepath = NULL;
270 _cleanup_umask_ mode_t u;
271 char *busnode = NULL, *root;
272 struct stat st;
273 int r;
274
275 assert(m);
276
277 u = umask(0000);
278
279 if (!mkdtemp(temporary_mount))
280 return log_error_errno(errno, "Failed create temp dir: %m");
281
282 root = strjoina(temporary_mount, "/kdbus");
283 (void) mkdir(root, 0755);
284 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
285 r = -errno;
286 goto fail;
287 }
288
289 /* create a new /dev/null dev node copy so we have some fodder to
290 * bind-mount the custom endpoint over. */
291 if (stat("/dev/null", &st) < 0) {
292 r = log_error_errno(errno, "Failed to stat /dev/null: %m");
293 goto fail;
294 }
295
296 busnode = strjoina(root, "/bus");
297 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
298 r = log_error_errno(errno, "mknod() for %s failed: %m",
299 busnode);
300 goto fail;
301 }
302
303 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
304 if (r < 0) {
305 r = log_error_errno(errno, "bind mount of %s failed: %m",
306 m->path);
307 goto fail;
308 }
309
310 basepath = dirname_malloc(m->path);
311 if (!basepath) {
312 r = -ENOMEM;
313 goto fail;
314 }
315
316 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
317 r = log_error_errno(errno, "bind mount of %s failed: %m",
318 basepath);
319 goto fail;
320 }
321
322 rmdir(temporary_mount);
323 return 0;
324
325 fail:
326 if (busnode) {
327 umount(busnode);
328 unlink(busnode);
329 }
330
331 umount(root);
332 rmdir(root);
333 rmdir(temporary_mount);
334
335 return r;
336 }
337
338 static int apply_mount(
339 BindMount *m,
340 const char *tmp_dir,
341 const char *var_tmp_dir) {
342
343 const char *what;
344 int r;
345
346 assert(m);
347
348 switch (m->mode) {
349
350 case INACCESSIBLE:
351
352 /* First, get rid of everything that is below if there
353 * is anything... Then, overmount it with an
354 * inaccessible directory. */
355 umount_recursive(m->path, 0);
356
357 what = "/run/systemd/inaccessible";
358 break;
359
360 case READONLY:
361 case READWRITE:
362 /* Nothing to mount here, we just later toggle the
363 * MS_RDONLY bit for the mount point */
364 return 0;
365
366 case PRIVATE_TMP:
367 what = tmp_dir;
368 break;
369
370 case PRIVATE_VAR_TMP:
371 what = var_tmp_dir;
372 break;
373
374 case PRIVATE_DEV:
375 return mount_dev(m);
376
377 case PRIVATE_BUS_ENDPOINT:
378 return mount_kdbus(m);
379
380 default:
381 assert_not_reached("Unknown mode");
382 }
383
384 assert(what);
385
386 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
387 if (r >= 0)
388 log_debug("Successfully mounted %s to %s", what, m->path);
389 else if (m->ignore && errno == ENOENT)
390 return 0;
391
392 return r;
393 }
394
395 static int make_read_only(BindMount *m) {
396 int r;
397
398 assert(m);
399
400 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
401 r = bind_remount_recursive(m->path, true);
402 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
403 r = bind_remount_recursive(m->path, false);
404 else
405 r = 0;
406
407 if (m->ignore && r == -ENOENT)
408 return 0;
409
410 return r;
411 }
412
/* Detaches the calling process into a new mount namespace and arranges the
 * requested mounts in it.
 *
 *   root_directory     if set, all paths generated here are prefixed with it
 *                      and it becomes the new root via mount_move_root()
 *   read_write_dirs    paths to add as READWRITE entries
 *   read_only_dirs     paths to add as READONLY entries
 *   inaccessible_dirs  paths to add as INACCESSIBLE entries
 *   tmp_dir            if set, source for a PRIVATE_TMP entry on /tmp
 *   var_tmp_dir        if set, source for a PRIVATE_VAR_TMP entry on /var/tmp
 *   bus_endpoint_path  if set, path of a PRIVATE_BUS_ENDPOINT entry
 *   private_dev        if true, adds a PRIVATE_DEV entry on /dev
 *   protect_home       unless PROTECT_HOME_NO, adds /home, /run/user and
 *                      /root (all optional) as READONLY or INACCESSIBLE
 *   protect_system     unless PROTECT_SYSTEM_NO, adds /usr and (optional)
 *                      /boot as READONLY, plus /etc for PROTECT_SYSTEM_FULL
 *   mount_flags        propagation flags applied to / at the end; 0 selects
 *                      MS_SHARED
 *
 * Returns 0 on success and a negative value on failure. */
int setup_namespace(
                const char* root_directory,
                char** read_write_dirs,
                char** read_only_dirs,
                char** inaccessible_dirs,
                const char* tmp_dir,
                const char* var_tmp_dir,
                const char* bus_endpoint_path,
                bool private_dev,
                ProtectHome protect_home,
                ProtectSystem protect_system,
                unsigned long mount_flags) {

        BindMount *m, *mounts = NULL;
        unsigned n;
        int r = 0;

        if (mount_flags == 0)
                mount_flags = MS_SHARED;

        if (unshare(CLONE_NEWNS) < 0)
                return -errno;

        /* Number of entries needed. This has to match exactly what is
         * appended below, see the assert() after the last append. */
        n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
                strv_length(read_write_dirs) +
                strv_length(read_only_dirs) +
                strv_length(inaccessible_dirs) +
                private_dev +
                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);

        if (n > 0) {
                /* The array lives on the stack and is zero-initialized; m is
                 * the write cursor that every append advances. */
                m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
                r = append_mounts(&m, read_write_dirs, READWRITE);
                if (r < 0)
                        return r;

                r = append_mounts(&m, read_only_dirs, READONLY);
                if (r < 0)
                        return r;

                r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
                if (r < 0)
                        return r;

                if (tmp_dir) {
                        m->path = prefix_roota(root_directory, "/tmp");
                        m->mode = PRIVATE_TMP;
                        m++;
                }

                if (var_tmp_dir) {
                        m->path = prefix_roota(root_directory, "/var/tmp");
                        m->mode = PRIVATE_VAR_TMP;
                        m++;
                }

                if (private_dev) {
                        m->path = prefix_roota(root_directory, "/dev");
                        m->mode = PRIVATE_DEV;
                        m++;
                }

                if (bus_endpoint_path) {
                        m->path = prefix_roota(root_directory, bus_endpoint_path);
                        m->mode = PRIVATE_BUS_ENDPOINT;
                        m++;
                }

                if (protect_home != PROTECT_HOME_NO) {
                        const char *home_dir, *run_user_dir, *root_dir;

                        /* The "-" prefix makes append_mounts() flag the entries as optional */
                        home_dir = prefix_roota(root_directory, "/home");
                        home_dir = strjoina("-", home_dir);
                        run_user_dir = prefix_roota(root_directory, "/run/user");
                        run_user_dir = strjoina("-", run_user_dir);
                        root_dir = prefix_roota(root_directory, "/root");
                        root_dir = strjoina("-", root_dir);

                        r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
                        if (r < 0)
                                return r;
                }

                if (protect_system != PROTECT_SYSTEM_NO) {
                        const char *usr_dir, *boot_dir, *etc_dir;

                        /* Only /boot is optional here */
                        usr_dir = prefix_roota(root_directory, "/usr");
                        boot_dir = prefix_roota(root_directory, "/boot");
                        boot_dir = strjoina("-", boot_dir);
                        etc_dir = prefix_roota(root_directory, "/etc");

                        r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
                                ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
                                : STRV_MAKE(usr_dir, boot_dir), READONLY);
                        if (r < 0)
                                return r;
                }

                assert(mounts + n == m);

                /* Sort by path, then by mode, and keep only the first entry
                 * for every path. */
                qsort(mounts, n, sizeof(BindMount), mount_path_compare);
                drop_duplicates(mounts, &n);
        }

        if (n > 0 || root_directory) {
                /* Remount / as SLAVE so that nothing now mounted in the namespace
                   shows up in the parent */
                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
                        return -errno;
        }

        if (root_directory) {
                /* Turn directory into bind mount */
                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
                        return -errno;
        }

        if (n > 0) {
                /* First pass: establish all mounts */
                for (m = mounts; m < mounts + n; ++m) {
                        r = apply_mount(m, tmp_dir, var_tmp_dir);
                        if (r < 0)
                                goto fail;
                }

                /* Second pass: adjust the read-only state of each entry */
                for (m = mounts; m < mounts + n; ++m) {
                        r = make_read_only(m);
                        if (r < 0)
                                goto fail;
                }
        }

        if (root_directory) {
                /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                r = mount_move_root(root_directory);

                /* at this point, we cannot rollback */
                if (r < 0)
                        return r;
        }

        /* Remount / as the desired mode. Note that this will not
         * reestablish propagation from our side to the host, since
         * what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
                /* at this point, we cannot rollback */
                return -errno;

        return 0;

fail:
        /* Detach whatever is flagged as done */
        if (n > 0) {
                for (m = mounts; m < mounts + n; ++m)
                        if (m->done)
                                (void) umount2(m->path, MNT_DETACH);
        }

        return r;
}
574
575 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
576 _cleanup_free_ char *x = NULL;
577 char bid[SD_ID128_STRING_MAX];
578 sd_id128_t boot_id;
579 int r;
580
581 assert(id);
582 assert(prefix);
583 assert(path);
584
585 /* We include the boot id in the directory so that after a
586 * reboot we can easily identify obsolete directories. */
587
588 r = sd_id128_get_boot(&boot_id);
589 if (r < 0)
590 return r;
591
592 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
593 if (!x)
594 return -ENOMEM;
595
596 RUN_WITH_UMASK(0077)
597 if (!mkdtemp(x))
598 return -errno;
599
600 RUN_WITH_UMASK(0000) {
601 char *y;
602
603 y = strjoina(x, "/tmp");
604
605 if (mkdir(y, 0777 | S_ISVTX) < 0)
606 return -errno;
607 }
608
609 *path = x;
610 x = NULL;
611
612 return 0;
613 }
614
/* Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp.
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir receive newly
 * allocated paths owned by the caller. If the second directory cannot be
 * created the first one is removed again, the error of the failing step is
 * returned and neither output is written. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the outer one */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
644
645 int setup_netns(int netns_storage_socket[2]) {
646 _cleanup_close_ int netns = -1;
647 int r, q;
648
649 assert(netns_storage_socket);
650 assert(netns_storage_socket[0] >= 0);
651 assert(netns_storage_socket[1] >= 0);
652
653 /* We use the passed socketpair as a storage buffer for our
654 * namespace reference fd. Whatever process runs this first
655 * shall create a new namespace, all others should just join
656 * it. To serialize that we use a file lock on the socket
657 * pair.
658 *
659 * It's a bit crazy, but hey, works great! */
660
661 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
662 return -errno;
663
664 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
665 if (netns == -EAGAIN) {
666 /* Nothing stored yet, so let's create a new namespace */
667
668 if (unshare(CLONE_NEWNET) < 0) {
669 r = -errno;
670 goto fail;
671 }
672
673 loopback_setup();
674
675 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
676 if (netns < 0) {
677 r = -errno;
678 goto fail;
679 }
680
681 r = 1;
682
683 } else if (netns < 0) {
684 r = netns;
685 goto fail;
686
687 } else {
688 /* Yay, found something, so let's join the namespace */
689 if (setns(netns, CLONE_NEWNET) < 0) {
690 r = -errno;
691 goto fail;
692 }
693
694 r = 0;
695 }
696
697 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
698 if (q < 0) {
699 r = q;
700 goto fail;
701 }
702
703 fail:
704 lockf(netns_storage_socket[0], F_ULOCK, 0);
705 return r;
706 }
707
/* String form of each ProtectHome value, indexed by the enum value. The
 * macro invocation below generates the lookup helpers for this table. */
static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
        [PROTECT_HOME_NO] = "no",
        [PROTECT_HOME_YES] = "yes",
        [PROTECT_HOME_READ_ONLY] = "read-only",
};

DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
715
/* String form of each ProtectSystem value, indexed by the enum value. The
 * macro invocation below generates the lookup helpers for this table. */
static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
        [PROTECT_SYSTEM_NO] = "no",
        [PROTECT_SYSTEM_YES] = "yes",
        [PROTECT_SYSTEM_FULL] = "full",
};

DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);