]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
util-lib: move string table stuff into its own string-table.[ch]
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sched.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <sys/mount.h>
27 #include <sys/stat.h>
28 #include <unistd.h>
29 #include <linux/fs.h>
30
31 #include "dev-setup.h"
32 #include "fd-util.h"
33 #include "loopback-setup.h"
34 #include "missing.h"
35 #include "mkdir.h"
36 #include "mount-util.h"
37 #include "namespace.h"
38 #include "path-util.h"
39 #include "selinux-util.h"
40 #include "socket-util.h"
41 #include "string-table.h"
42 #include "string-util.h"
43 #include "strv.h"
44 #include "util.h"
45
/* How a path is to be treated inside the new mount namespace. */
typedef enum MountMode {
        /* Keep the enumerators in priority order: when two entries name the
         * same path, mount_path_compare() sorts the numerically smaller mode
         * first and drop_duplicates() keeps only that first entry. */
        INACCESSIBLE,           /* overmounted with an inaccessible directory */
        READONLY,               /* remounted read-only */
        PRIVATE_TMP,            /* private instance bind-mounted over /tmp */
        PRIVATE_VAR_TMP,        /* private instance bind-mounted over /var/tmp */
        PRIVATE_DEV,            /* private, minimal /dev */
        PRIVATE_BUS_ENDPOINT,   /* custom kdbus endpoint */
        READWRITE               /* remounted read-write */
} MountMode;
56
57 typedef struct BindMount {
58 const char *path;
59 MountMode mode;
60 bool done;
61 bool ignore;
62 } BindMount;
63
64 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
65 char **i;
66
67 assert(p);
68
69 STRV_FOREACH(i, strv) {
70
71 (*p)->ignore = false;
72 (*p)->done = false;
73
74 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
75 (*p)->ignore = true;
76 (*i)++;
77 }
78
79 if (!path_is_absolute(*i))
80 return -EINVAL;
81
82 (*p)->path = *i;
83 (*p)->mode = mode;
84 (*p)++;
85 }
86
87 return 0;
88 }
89
90 static int mount_path_compare(const void *a, const void *b) {
91 const BindMount *p = a, *q = b;
92 int d;
93
94 d = path_compare(p->path, q->path);
95
96 if (d == 0) {
97 /* If the paths are equal, check the mode */
98 if (p->mode < q->mode)
99 return -1;
100
101 if (p->mode > q->mode)
102 return 1;
103
104 return 0;
105 }
106
107 /* If the paths are not equal, then order prefixes first */
108 return d;
109 }
110
111 static void drop_duplicates(BindMount *m, unsigned *n) {
112 BindMount *f, *t, *previous;
113
114 assert(m);
115 assert(n);
116
117 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
118
119 /* The first one wins */
120 if (previous && path_equal(f->path, previous->path))
121 continue;
122
123 *t = *f;
124
125 previous = t;
126
127 t++;
128 }
129
130 *n = t - m;
131 }
132
133 static int mount_dev(BindMount *m) {
134 static const char devnodes[] =
135 "/dev/null\0"
136 "/dev/zero\0"
137 "/dev/full\0"
138 "/dev/random\0"
139 "/dev/urandom\0"
140 "/dev/tty\0";
141
142 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
143 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
144 _cleanup_umask_ mode_t u;
145 int r;
146
147 assert(m);
148
149 u = umask(0000);
150
151 if (!mkdtemp(temporary_mount))
152 return -errno;
153
154 dev = strjoina(temporary_mount, "/dev");
155 (void) mkdir(dev, 0755);
156 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
157 r = -errno;
158 goto fail;
159 }
160
161 devpts = strjoina(temporary_mount, "/dev/pts");
162 (void) mkdir(devpts, 0755);
163 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
164 r = -errno;
165 goto fail;
166 }
167
168 devptmx = strjoina(temporary_mount, "/dev/ptmx");
169 if (symlink("pts/ptmx", devptmx) < 0) {
170 r = -errno;
171 goto fail;
172 }
173
174 devshm = strjoina(temporary_mount, "/dev/shm");
175 (void) mkdir(devshm, 01777);
176 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
177 if (r < 0) {
178 r = -errno;
179 goto fail;
180 }
181
182 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
183 (void) mkdir(devmqueue, 0755);
184 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
185
186 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
187 (void) mkdir(devhugepages, 0755);
188 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
189
190 devlog = strjoina(temporary_mount, "/dev/log");
191 (void) symlink("/run/systemd/journal/dev-log", devlog);
192
193 NULSTR_FOREACH(d, devnodes) {
194 _cleanup_free_ char *dn = NULL;
195 struct stat st;
196
197 r = stat(d, &st);
198 if (r < 0) {
199
200 if (errno == ENOENT)
201 continue;
202
203 r = -errno;
204 goto fail;
205 }
206
207 if (!S_ISBLK(st.st_mode) &&
208 !S_ISCHR(st.st_mode)) {
209 r = -EINVAL;
210 goto fail;
211 }
212
213 if (st.st_rdev == 0)
214 continue;
215
216 dn = strappend(temporary_mount, d);
217 if (!dn) {
218 r = -ENOMEM;
219 goto fail;
220 }
221
222 mac_selinux_create_file_prepare(d, st.st_mode);
223 r = mknod(dn, st.st_mode, st.st_rdev);
224 mac_selinux_create_file_clear();
225
226 if (r < 0) {
227 r = -errno;
228 goto fail;
229 }
230 }
231
232 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
233
234 /* Create the /dev directory if missing. It is more likely to be
235 * missing when the service is started with RootDirectory. This is
236 * consistent with mount units creating the mount points when missing.
237 */
238 (void) mkdir_p_label(m->path, 0755);
239
240 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
241 r = -errno;
242 goto fail;
243 }
244
245 rmdir(dev);
246 rmdir(temporary_mount);
247
248 return 0;
249
250 fail:
251 if (devpts)
252 umount(devpts);
253
254 if (devshm)
255 umount(devshm);
256
257 if (devhugepages)
258 umount(devhugepages);
259
260 if (devmqueue)
261 umount(devmqueue);
262
263 umount(dev);
264 rmdir(dev);
265 rmdir(temporary_mount);
266
267 return r;
268 }
269
270 static int mount_kdbus(BindMount *m) {
271
272 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
273 _cleanup_free_ char *basepath = NULL;
274 _cleanup_umask_ mode_t u;
275 char *busnode = NULL, *root;
276 struct stat st;
277 int r;
278
279 assert(m);
280
281 u = umask(0000);
282
283 if (!mkdtemp(temporary_mount))
284 return log_error_errno(errno, "Failed create temp dir: %m");
285
286 root = strjoina(temporary_mount, "/kdbus");
287 (void) mkdir(root, 0755);
288 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
289 r = -errno;
290 goto fail;
291 }
292
293 /* create a new /dev/null dev node copy so we have some fodder to
294 * bind-mount the custom endpoint over. */
295 if (stat("/dev/null", &st) < 0) {
296 r = log_error_errno(errno, "Failed to stat /dev/null: %m");
297 goto fail;
298 }
299
300 busnode = strjoina(root, "/bus");
301 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
302 r = log_error_errno(errno, "mknod() for %s failed: %m",
303 busnode);
304 goto fail;
305 }
306
307 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
308 if (r < 0) {
309 r = log_error_errno(errno, "bind mount of %s failed: %m",
310 m->path);
311 goto fail;
312 }
313
314 basepath = dirname_malloc(m->path);
315 if (!basepath) {
316 r = -ENOMEM;
317 goto fail;
318 }
319
320 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
321 r = log_error_errno(errno, "bind mount of %s failed: %m",
322 basepath);
323 goto fail;
324 }
325
326 rmdir(temporary_mount);
327 return 0;
328
329 fail:
330 if (busnode) {
331 umount(busnode);
332 unlink(busnode);
333 }
334
335 umount(root);
336 rmdir(root);
337 rmdir(temporary_mount);
338
339 return r;
340 }
341
342 static int apply_mount(
343 BindMount *m,
344 const char *tmp_dir,
345 const char *var_tmp_dir) {
346
347 const char *what;
348 int r;
349
350 assert(m);
351
352 switch (m->mode) {
353
354 case INACCESSIBLE:
355
356 /* First, get rid of everything that is below if there
357 * is anything... Then, overmount it with an
358 * inaccessible directory. */
359 umount_recursive(m->path, 0);
360
361 what = "/run/systemd/inaccessible";
362 break;
363
364 case READONLY:
365 case READWRITE:
366 /* Nothing to mount here, we just later toggle the
367 * MS_RDONLY bit for the mount point */
368 return 0;
369
370 case PRIVATE_TMP:
371 what = tmp_dir;
372 break;
373
374 case PRIVATE_VAR_TMP:
375 what = var_tmp_dir;
376 break;
377
378 case PRIVATE_DEV:
379 return mount_dev(m);
380
381 case PRIVATE_BUS_ENDPOINT:
382 return mount_kdbus(m);
383
384 default:
385 assert_not_reached("Unknown mode");
386 }
387
388 assert(what);
389
390 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
391 if (r >= 0)
392 log_debug("Successfully mounted %s to %s", what, m->path);
393 else if (m->ignore && errno == ENOENT)
394 return 0;
395
396 return r;
397 }
398
399 static int make_read_only(BindMount *m) {
400 int r;
401
402 assert(m);
403
404 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
405 r = bind_remount_recursive(m->path, true);
406 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
407 r = bind_remount_recursive(m->path, false);
408 else
409 r = 0;
410
411 if (m->ignore && r == -ENOENT)
412 return 0;
413
414 return r;
415 }
416
417 int setup_namespace(
418 const char* root_directory,
419 char** read_write_dirs,
420 char** read_only_dirs,
421 char** inaccessible_dirs,
422 const char* tmp_dir,
423 const char* var_tmp_dir,
424 const char* bus_endpoint_path,
425 bool private_dev,
426 ProtectHome protect_home,
427 ProtectSystem protect_system,
428 unsigned long mount_flags) {
429
430 BindMount *m, *mounts = NULL;
431 unsigned n;
432 int r = 0;
433
434 if (mount_flags == 0)
435 mount_flags = MS_SHARED;
436
437 if (unshare(CLONE_NEWNS) < 0)
438 return -errno;
439
440 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
441 strv_length(read_write_dirs) +
442 strv_length(read_only_dirs) +
443 strv_length(inaccessible_dirs) +
444 private_dev +
445 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
446 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
447 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
448
449 if (n > 0) {
450 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
451 r = append_mounts(&m, read_write_dirs, READWRITE);
452 if (r < 0)
453 return r;
454
455 r = append_mounts(&m, read_only_dirs, READONLY);
456 if (r < 0)
457 return r;
458
459 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
460 if (r < 0)
461 return r;
462
463 if (tmp_dir) {
464 m->path = prefix_roota(root_directory, "/tmp");
465 m->mode = PRIVATE_TMP;
466 m++;
467 }
468
469 if (var_tmp_dir) {
470 m->path = prefix_roota(root_directory, "/var/tmp");
471 m->mode = PRIVATE_VAR_TMP;
472 m++;
473 }
474
475 if (private_dev) {
476 m->path = prefix_roota(root_directory, "/dev");
477 m->mode = PRIVATE_DEV;
478 m++;
479 }
480
481 if (bus_endpoint_path) {
482 m->path = prefix_roota(root_directory, bus_endpoint_path);
483 m->mode = PRIVATE_BUS_ENDPOINT;
484 m++;
485 }
486
487 if (protect_home != PROTECT_HOME_NO) {
488 const char *home_dir, *run_user_dir, *root_dir;
489
490 home_dir = prefix_roota(root_directory, "/home");
491 home_dir = strjoina("-", home_dir);
492 run_user_dir = prefix_roota(root_directory, "/run/user");
493 run_user_dir = strjoina("-", run_user_dir);
494 root_dir = prefix_roota(root_directory, "/root");
495 root_dir = strjoina("-", root_dir);
496
497 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
498 protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
499 if (r < 0)
500 return r;
501 }
502
503 if (protect_system != PROTECT_SYSTEM_NO) {
504 const char *usr_dir, *boot_dir, *etc_dir;
505
506 usr_dir = prefix_roota(root_directory, "/usr");
507 boot_dir = prefix_roota(root_directory, "/boot");
508 boot_dir = strjoina("-", boot_dir);
509 etc_dir = prefix_roota(root_directory, "/etc");
510
511 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
512 ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
513 : STRV_MAKE(usr_dir, boot_dir), READONLY);
514 if (r < 0)
515 return r;
516 }
517
518 assert(mounts + n == m);
519
520 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
521 drop_duplicates(mounts, &n);
522 }
523
524 if (n > 0 || root_directory) {
525 /* Remount / as SLAVE so that nothing now mounted in the namespace
526 shows up in the parent */
527 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
528 return -errno;
529 }
530
531 if (root_directory) {
532 /* Turn directory into bind mount */
533 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
534 return -errno;
535 }
536
537 if (n > 0) {
538 for (m = mounts; m < mounts + n; ++m) {
539 r = apply_mount(m, tmp_dir, var_tmp_dir);
540 if (r < 0)
541 goto fail;
542 }
543
544 for (m = mounts; m < mounts + n; ++m) {
545 r = make_read_only(m);
546 if (r < 0)
547 goto fail;
548 }
549 }
550
551 if (root_directory) {
552 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
553 r = mount_move_root(root_directory);
554
555 /* at this point, we cannot rollback */
556 if (r < 0)
557 return r;
558 }
559
560 /* Remount / as the desired mode. Not that this will not
561 * reestablish propagation from our side to the host, since
562 * what's disconnected is disconnected. */
563 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
564 /* at this point, we cannot rollback */
565 return -errno;
566
567 return 0;
568
569 fail:
570 if (n > 0) {
571 for (m = mounts; m < mounts + n; ++m)
572 if (m->done)
573 (void) umount2(m->path, MNT_DETACH);
574 }
575
576 return r;
577 }
578
579 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
580 _cleanup_free_ char *x = NULL;
581 char bid[SD_ID128_STRING_MAX];
582 sd_id128_t boot_id;
583 int r;
584
585 assert(id);
586 assert(prefix);
587 assert(path);
588
589 /* We include the boot id in the directory so that after a
590 * reboot we can easily identify obsolete directories. */
591
592 r = sd_id128_get_boot(&boot_id);
593 if (r < 0)
594 return r;
595
596 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
597 if (!x)
598 return -ENOMEM;
599
600 RUN_WITH_UMASK(0077)
601 if (!mkdtemp(x))
602 return -errno;
603
604 RUN_WITH_UMASK(0000) {
605 char *y;
606
607 y = strjoina(x, "/tmp");
608
609 if (mkdir(y, 0777 | S_ISVTX) < 0)
610 return -errno;
611 }
612
613 *path = x;
614 x = NULL;
615
616 return 0;
617 }
618
/*
 * Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp.
 *
 * On success returns 0 and stores the two newly allocated paths in *tmp_dir
 * and *var_tmp_dir; the caller has to free both. On failure a negative
 * errno-style value is returned, neither output is written, and a /tmp
 * directory that had already been created is removed again.
 */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *tmp_path, *var_tmp_path, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &tmp_path);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &var_tmp_path);
        if (r >= 0) {
                *tmp_dir = tmp_path;
                *var_tmp_dir = var_tmp_path;
                return 0;
        }

        /* The second directory could not be created: undo the first one,
         * inner "tmp" subdirectory first. */
        inner = strjoina(tmp_path, "/tmp");
        rmdir(inner);
        rmdir(tmp_path);

        free(tmp_path);
        return r;
}
648
649 int setup_netns(int netns_storage_socket[2]) {
650 _cleanup_close_ int netns = -1;
651 int r, q;
652
653 assert(netns_storage_socket);
654 assert(netns_storage_socket[0] >= 0);
655 assert(netns_storage_socket[1] >= 0);
656
657 /* We use the passed socketpair as a storage buffer for our
658 * namespace reference fd. Whatever process runs this first
659 * shall create a new namespace, all others should just join
660 * it. To serialize that we use a file lock on the socket
661 * pair.
662 *
663 * It's a bit crazy, but hey, works great! */
664
665 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
666 return -errno;
667
668 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
669 if (netns == -EAGAIN) {
670 /* Nothing stored yet, so let's create a new namespace */
671
672 if (unshare(CLONE_NEWNET) < 0) {
673 r = -errno;
674 goto fail;
675 }
676
677 loopback_setup();
678
679 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
680 if (netns < 0) {
681 r = -errno;
682 goto fail;
683 }
684
685 r = 1;
686
687 } else if (netns < 0) {
688 r = netns;
689 goto fail;
690
691 } else {
692 /* Yay, found something, so let's join the namespace */
693 if (setns(netns, CLONE_NEWNET) < 0) {
694 r = -errno;
695 goto fail;
696 }
697
698 r = 0;
699 }
700
701 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
702 if (q < 0) {
703 r = q;
704 goto fail;
705 }
706
707 fail:
708 lockf(netns_storage_socket[0], F_ULOCK, 0);
709 return r;
710 }
711
712 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
713 [PROTECT_HOME_NO] = "no",
714 [PROTECT_HOME_YES] = "yes",
715 [PROTECT_HOME_READ_ONLY] = "read-only",
716 };
717
718 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
719
720 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
721 [PROTECT_SYSTEM_NO] = "no",
722 [PROTECT_SYSTEM_YES] = "yes",
723 [PROTECT_SYSTEM_FULL] = "full",
724 };
725
726 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);