]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
util-lib: split out umask-related code to umask-util.h
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sched.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <sys/mount.h>
27 #include <sys/stat.h>
28 #include <unistd.h>
29 #include <linux/fs.h>
30
31 #include "dev-setup.h"
32 #include "fd-util.h"
33 #include "loopback-setup.h"
34 #include "missing.h"
35 #include "mkdir.h"
36 #include "mount-util.h"
37 #include "namespace.h"
38 #include "path-util.h"
39 #include "selinux-util.h"
40 #include "socket-util.h"
41 #include "string-table.h"
42 #include "string-util.h"
43 #include "strv.h"
44 #include "umask-util.h"
45 #include "util.h"
46
/* Kind of mount to establish for a single path inside the new mount
 * namespace.
 *
 * The numeric order matters: mount_path_compare() sorts entries that
 * share a path by ascending mode, and drop_duplicates() then keeps only
 * the first entry per path, so the enumerator listed earlier wins. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        PRIVATE_BUS_ENDPOINT,
        READWRITE
} MountMode;
57
/* One entry of the mount table assembled by setup_namespace(). */
typedef struct BindMount {
        const char *path;       /* absolute path of the mount point; not owned by this struct */
        MountMode mode;         /* what to do with the path, see MountMode */
        bool done;              /* checked by the rollback path of setup_namespace(): entries with this set are lazily unmounted */
        bool ignore;            /* path was given with a leading '-': ENOENT on this entry is not treated as an error */
} BindMount;
64
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
66 char **i;
67
68 assert(p);
69
70 STRV_FOREACH(i, strv) {
71
72 (*p)->ignore = false;
73 (*p)->done = false;
74
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
76 (*p)->ignore = true;
77 (*i)++;
78 }
79
80 if (!path_is_absolute(*i))
81 return -EINVAL;
82
83 (*p)->path = *i;
84 (*p)->mode = mode;
85 (*p)++;
86 }
87
88 return 0;
89 }
90
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
93 int d;
94
95 d = path_compare(p->path, q->path);
96
97 if (d == 0) {
98 /* If the paths are equal, check the mode */
99 if (p->mode < q->mode)
100 return -1;
101
102 if (p->mode > q->mode)
103 return 1;
104
105 return 0;
106 }
107
108 /* If the paths are not equal, then order prefixes first */
109 return d;
110 }
111
112 static void drop_duplicates(BindMount *m, unsigned *n) {
113 BindMount *f, *t, *previous;
114
115 assert(m);
116 assert(n);
117
118 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
119
120 /* The first one wins */
121 if (previous && path_equal(f->path, previous->path))
122 continue;
123
124 *t = *f;
125
126 previous = t;
127
128 t++;
129 }
130
131 *n = t - m;
132 }
133
/* Builds a private, minimal /dev in a temporary directory and moves it
 * onto m->path.
 *
 * The tree consists of a fresh tmpfs containing bind mounts of the
 * host's /dev/pts and /dev/shm (mandatory), /dev/mqueue and
 * /dev/hugepages (best effort), a ptmx symlink, a /dev/log symlink and
 * copies of the device nodes listed in devnodes[].
 *
 * Returns 0 on success or a negative errno-style value. On failure the
 * partially built tree is unmounted and the temporary directories are
 * removed on a best-effort basis. */
static int mount_dev(BindMount *m) {
        /* NUL-separated list of the device nodes to replicate */
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        _cleanup_umask_ mode_t u; /* presumably restores the previous umask when leaving the function */
        int r;

        assert(m);

        /* Clear the umask so that the modes passed to mkdir()/mknod() below apply unmodified */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return -errno;

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
                r = -errno;
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        devptmx = strjoina(temporary_mount, "/dev/ptmx");
        if (symlink("pts/ptmx", devptmx) < 0) {
                r = -errno;
                goto fail;
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 01777);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        /* The following two bind mounts are optional: failures are deliberately ignored */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);

        devlog = strjoina(temporary_mount, "/dev/log");
        (void) symlink("/run/systemd/journal/dev-log", devlog);

        NULSTR_FOREACH(d, devnodes) {
                _cleanup_free_ char *dn = NULL;
                struct stat st;

                r = stat(d, &st);
                if (r < 0) {

                        /* A node that does not exist on the host is simply skipped */
                        if (errno == ENOENT)
                                continue;

                        r = -errno;
                        goto fail;
                }

                /* Refuse to copy anything that is not actually a device node */
                if (!S_ISBLK(st.st_mode) &&
                    !S_ISCHR(st.st_mode)) {
                        r = -EINVAL;
                        goto fail;
                }

                /* Nodes with device number 0 are skipped as well */
                if (st.st_rdev == 0)
                        continue;

                /* d is absolute ("/dev/..."), so this yields <temporary_mount>/dev/... */
                dn = strappend(temporary_mount, d);
                if (!dn) {
                        r = -ENOMEM;
                        goto fail;
                }

                mac_selinux_create_file_prepare(d, st.st_mode);
                r = mknod(dn, st.st_mode, st.st_rdev);
                mac_selinux_create_file_clear();

                if (r < 0) {
                        r = -errno;
                        goto fail;
                }
        }

        dev_setup(temporary_mount, UID_INVALID, GID_INVALID);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(m->path, 0755);

        if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* The tree now lives at m->path; remove the empty staging directories */
        rmdir(dev);
        rmdir(temporary_mount);

        return 0;

fail:
        /* Best-effort teardown. A pointer being set only means the path was
         * computed, not that the mount succeeded, so some of these calls may
         * fail harmlessly. */
        if (devpts)
                umount(devpts);

        if (devshm)
                umount(devshm);

        if (devhugepages)
                umount(devhugepages);

        if (devmqueue)
                umount(devmqueue);

        umount(dev);
        rmdir(dev);
        rmdir(temporary_mount);

        return r;
}
270
271 static int mount_kdbus(BindMount *m) {
272
273 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
274 _cleanup_free_ char *basepath = NULL;
275 _cleanup_umask_ mode_t u;
276 char *busnode = NULL, *root;
277 struct stat st;
278 int r;
279
280 assert(m);
281
282 u = umask(0000);
283
284 if (!mkdtemp(temporary_mount))
285 return log_error_errno(errno, "Failed create temp dir: %m");
286
287 root = strjoina(temporary_mount, "/kdbus");
288 (void) mkdir(root, 0755);
289 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
290 r = -errno;
291 goto fail;
292 }
293
294 /* create a new /dev/null dev node copy so we have some fodder to
295 * bind-mount the custom endpoint over. */
296 if (stat("/dev/null", &st) < 0) {
297 r = log_error_errno(errno, "Failed to stat /dev/null: %m");
298 goto fail;
299 }
300
301 busnode = strjoina(root, "/bus");
302 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
303 r = log_error_errno(errno, "mknod() for %s failed: %m",
304 busnode);
305 goto fail;
306 }
307
308 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
309 if (r < 0) {
310 r = log_error_errno(errno, "bind mount of %s failed: %m",
311 m->path);
312 goto fail;
313 }
314
315 basepath = dirname_malloc(m->path);
316 if (!basepath) {
317 r = -ENOMEM;
318 goto fail;
319 }
320
321 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
322 r = log_error_errno(errno, "bind mount of %s failed: %m",
323 basepath);
324 goto fail;
325 }
326
327 rmdir(temporary_mount);
328 return 0;
329
330 fail:
331 if (busnode) {
332 umount(busnode);
333 unlink(busnode);
334 }
335
336 umount(root);
337 rmdir(root);
338 rmdir(temporary_mount);
339
340 return r;
341 }
342
343 static int apply_mount(
344 BindMount *m,
345 const char *tmp_dir,
346 const char *var_tmp_dir) {
347
348 const char *what;
349 int r;
350
351 assert(m);
352
353 switch (m->mode) {
354
355 case INACCESSIBLE:
356
357 /* First, get rid of everything that is below if there
358 * is anything... Then, overmount it with an
359 * inaccessible directory. */
360 umount_recursive(m->path, 0);
361
362 what = "/run/systemd/inaccessible";
363 break;
364
365 case READONLY:
366 case READWRITE:
367 /* Nothing to mount here, we just later toggle the
368 * MS_RDONLY bit for the mount point */
369 return 0;
370
371 case PRIVATE_TMP:
372 what = tmp_dir;
373 break;
374
375 case PRIVATE_VAR_TMP:
376 what = var_tmp_dir;
377 break;
378
379 case PRIVATE_DEV:
380 return mount_dev(m);
381
382 case PRIVATE_BUS_ENDPOINT:
383 return mount_kdbus(m);
384
385 default:
386 assert_not_reached("Unknown mode");
387 }
388
389 assert(what);
390
391 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
392 if (r >= 0)
393 log_debug("Successfully mounted %s to %s", what, m->path);
394 else if (m->ignore && errno == ENOENT)
395 return 0;
396
397 return r;
398 }
399
400 static int make_read_only(BindMount *m) {
401 int r;
402
403 assert(m);
404
405 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
406 r = bind_remount_recursive(m->path, true);
407 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
408 r = bind_remount_recursive(m->path, false);
409 else
410 r = 0;
411
412 if (m->ignore && r == -ENOENT)
413 return 0;
414
415 return r;
416 }
417
/* Unshares the mount namespace of the calling process and populates it
 * according to the arguments.
 *
 *   root_directory     if non-NULL, prefix for all generated paths; it is
 *                      turned into a bind mount and finally made the new
 *                      root via mount_move_root()
 *   read_write_dirs,
 *   read_only_dirs,
 *   inaccessible_dirs  string vectors of absolute paths, each optionally
 *                      prefixed with '-' (see append_mounts())
 *   tmp_dir,
 *   var_tmp_dir        if non-NULL, directories to bind mount over /tmp
 *                      and /var/tmp
 *   bus_endpoint_path  if non-NULL, custom kdbus endpoint to install
 *   private_dev        whether to set up a private /dev
 *   protect_home,
 *   protect_system     select the additional entries for /home, /run/user,
 *                      /root and for /usr, /boot, /etc
 *   mount_flags        propagation flags applied to / at the end;
 *                      0 means MS_SHARED
 *
 * Returns 0 on success or a negative errno-style value. */
int setup_namespace(
                const char* root_directory,
                char** read_write_dirs,
                char** read_only_dirs,
                char** inaccessible_dirs,
                const char* tmp_dir,
                const char* var_tmp_dir,
                const char* bus_endpoint_path,
                bool private_dev,
                ProtectHome protect_home,
                ProtectSystem protect_system,
                unsigned long mount_flags) {

        BindMount *m, *mounts = NULL;
        unsigned n;
        int r = 0;

        if (mount_flags == 0)
                mount_flags = MS_SHARED;

        if (unshare(CLONE_NEWNS) < 0)
                return -errno;

        /* Number of table entries needed. The constants must match the
         * number of paths appended for protect_home (3) and protect_system
         * (2, or 3 for "full") below; this is asserted once the table has
         * been filled. */
        n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
                strv_length(read_write_dirs) +
                strv_length(read_only_dirs) +
                strv_length(inaccessible_dirs) +
                private_dev +
                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);

        if (n > 0) {
                /* The table is allocated with alloca0(); the entries written
                 * directly below rely on it for their done/ignore fields,
                 * which are not set explicitly. */
                m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
                r = append_mounts(&m, read_write_dirs, READWRITE);
                if (r < 0)
                        return r;

                r = append_mounts(&m, read_only_dirs, READONLY);
                if (r < 0)
                        return r;

                r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
                if (r < 0)
                        return r;

                if (tmp_dir) {
                        m->path = prefix_roota(root_directory, "/tmp");
                        m->mode = PRIVATE_TMP;
                        m++;
                }

                if (var_tmp_dir) {
                        m->path = prefix_roota(root_directory, "/var/tmp");
                        m->mode = PRIVATE_VAR_TMP;
                        m++;
                }

                if (private_dev) {
                        m->path = prefix_roota(root_directory, "/dev");
                        m->mode = PRIVATE_DEV;
                        m++;
                }

                if (bus_endpoint_path) {
                        m->path = prefix_roota(root_directory, bus_endpoint_path);
                        m->mode = PRIVATE_BUS_ENDPOINT;
                        m++;
                }

                if (protect_home != PROTECT_HOME_NO) {
                        const char *home_dir, *run_user_dir, *root_dir;

                        /* The "-" prefix makes append_mounts() mark these
                         * entries as ignorable if the directory is missing */
                        home_dir = prefix_roota(root_directory, "/home");
                        home_dir = strjoina("-", home_dir);
                        run_user_dir = prefix_roota(root_directory, "/run/user");
                        run_user_dir = strjoina("-", run_user_dir);
                        root_dir = prefix_roota(root_directory, "/root");
                        root_dir = strjoina("-", root_dir);

                        r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
                                          protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
                        if (r < 0)
                                return r;
                }

                if (protect_system != PROTECT_SYSTEM_NO) {
                        const char *usr_dir, *boot_dir, *etc_dir;

                        /* Only /boot is optional here; /usr and /etc carry no "-" prefix */
                        usr_dir = prefix_roota(root_directory, "/usr");
                        boot_dir = prefix_roota(root_directory, "/boot");
                        boot_dir = strjoina("-", boot_dir);
                        etc_dir = prefix_roota(root_directory, "/etc");

                        r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
                                          ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
                                          : STRV_MAKE(usr_dir, boot_dir), READONLY);
                        if (r < 0)
                                return r;
                }

                /* Every reserved slot must have been filled exactly */
                assert(mounts + n == m);

                /* Sort by path, then by mode priority, and keep only the
                 * first entry for each path */
                qsort(mounts, n, sizeof(BindMount), mount_path_compare);
                drop_duplicates(mounts, &n);
        }

        if (n > 0 || root_directory) {
                /* Remount / as SLAVE so that nothing now mounted in the namespace
                   shows up in the parent */
                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
                        return -errno;
        }

        if (root_directory) {
                /* Turn directory into bind mount */
                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
                        return -errno;
        }

        if (n > 0) {
                /* First pass: establish all mounts */
                for (m = mounts; m < mounts + n; ++m) {
                        r = apply_mount(m, tmp_dir, var_tmp_dir);
                        if (r < 0)
                                goto fail;
                }

                /* Second pass: set the read-only state of each mount */
                for (m = mounts; m < mounts + n; ++m) {
                        r = make_read_only(m);
                        if (r < 0)
                                goto fail;
                }
        }

        if (root_directory) {
                /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                r = mount_move_root(root_directory);

                /* at this point, we cannot rollback */
                if (r < 0)
                        return r;
        }

        /* Remount / as the desired mode. Note that this will not
         * reestablish propagation from our side to the host, since
         * what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
                /* at this point, we cannot rollback */
                return -errno;

        return 0;

fail:
        /* Rollback: lazily unmount every entry flagged as done */
        if (n > 0) {
                for (m = mounts; m < mounts + n; ++m)
                        if (m->done)
                                (void) umount2(m->path, MNT_DETACH);
        }

        return r;
}
579
580 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
581 _cleanup_free_ char *x = NULL;
582 char bid[SD_ID128_STRING_MAX];
583 sd_id128_t boot_id;
584 int r;
585
586 assert(id);
587 assert(prefix);
588 assert(path);
589
590 /* We include the boot id in the directory so that after a
591 * reboot we can easily identify obsolete directories. */
592
593 r = sd_id128_get_boot(&boot_id);
594 if (r < 0)
595 return r;
596
597 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
598 if (!x)
599 return -ENOMEM;
600
601 RUN_WITH_UMASK(0077)
602 if (!mkdtemp(x))
603 return -errno;
604
605 RUN_WITH_UMASK(0000) {
606 char *y;
607
608 y = strjoina(x, "/tmp");
609
610 if (mkdir(y, 0777 | S_ISVTX) < 0)
611 return -errno;
612 }
613
614 *path = x;
615 x = NULL;
616
617 return 0;
618 }
619
/* Creates the pair of private temporary directories for the given id, one
 * below /tmp and one below /var/tmp, via setup_one_tmp_dir().
 *
 * On success returns 0 and stores the two newly allocated paths in
 * *tmp_dir and *var_tmp_dir. If the second directory cannot be created,
 * the first one is removed again and the error is returned; the output
 * parameters are left untouched on any failure. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Undo the /tmp half: first the inner "tmp" directory, then the outer one */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
649
650 int setup_netns(int netns_storage_socket[2]) {
651 _cleanup_close_ int netns = -1;
652 int r, q;
653
654 assert(netns_storage_socket);
655 assert(netns_storage_socket[0] >= 0);
656 assert(netns_storage_socket[1] >= 0);
657
658 /* We use the passed socketpair as a storage buffer for our
659 * namespace reference fd. Whatever process runs this first
660 * shall create a new namespace, all others should just join
661 * it. To serialize that we use a file lock on the socket
662 * pair.
663 *
664 * It's a bit crazy, but hey, works great! */
665
666 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
667 return -errno;
668
669 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
670 if (netns == -EAGAIN) {
671 /* Nothing stored yet, so let's create a new namespace */
672
673 if (unshare(CLONE_NEWNET) < 0) {
674 r = -errno;
675 goto fail;
676 }
677
678 loopback_setup();
679
680 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
681 if (netns < 0) {
682 r = -errno;
683 goto fail;
684 }
685
686 r = 1;
687
688 } else if (netns < 0) {
689 r = netns;
690 goto fail;
691
692 } else {
693 /* Yay, found something, so let's join the namespace */
694 if (setns(netns, CLONE_NEWNET) < 0) {
695 r = -errno;
696 goto fail;
697 }
698
699 r = 0;
700 }
701
702 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
703 if (q < 0) {
704 r = q;
705 goto fail;
706 }
707
708 fail:
709 lockf(netns_storage_socket[0], F_ULOCK, 0);
710 return r;
711 }
712
/* String names of the ProtectHome values, indexed by enum value. The
 * macro below presumably generates the to-string/from-string lookup
 * helpers for this table (defined in string-table.h — confirm). */
static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
        [PROTECT_HOME_NO] = "no",
        [PROTECT_HOME_YES] = "yes",
        [PROTECT_HOME_READ_ONLY] = "read-only",
};

DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
720
/* String names of the ProtectSystem values, indexed by enum value. The
 * macro below presumably generates the to-string/from-string lookup
 * helpers for this table (defined in string-table.h — confirm). */
static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
        [PROTECT_SYSTEM_NO] = "no",
        [PROTECT_SYSTEM_YES] = "yes",
        [PROTECT_SYSTEM_FULL] = "full",
};

DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);