]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
Merge pull request #1562 from michich/overlinking
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sched.h>
29 #include <linux/fs.h>
30
31 #include "strv.h"
32 #include "util.h"
33 #include "path-util.h"
34 #include "missing.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
39 #include "mkdir.h"
40
/* The kinds of per-path operations the namespace setup knows about.
 *
 * The declaration order is significant: mount_path_compare() uses the
 * numeric value as a tie-breaker for entries with the same path, and
 * drop_duplicates() keeps the first entry of each run, so for a given
 * path the mode with the lowest value is the one that survives. */
typedef enum MountMode {
        INACCESSIBLE,
        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        PRIVATE_BUS_ENDPOINT,
        READWRITE,
} MountMode;
51
52 typedef struct BindMount {
53 const char *path;
54 MountMode mode;
55 bool done;
56 bool ignore;
57 } BindMount;
58
59 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
60 char **i;
61
62 assert(p);
63
64 STRV_FOREACH(i, strv) {
65
66 (*p)->ignore = false;
67 (*p)->done = false;
68
69 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
70 (*p)->ignore = true;
71 (*i)++;
72 }
73
74 if (!path_is_absolute(*i))
75 return -EINVAL;
76
77 (*p)->path = *i;
78 (*p)->mode = mode;
79 (*p)++;
80 }
81
82 return 0;
83 }
84
85 static int mount_path_compare(const void *a, const void *b) {
86 const BindMount *p = a, *q = b;
87 int d;
88
89 d = path_compare(p->path, q->path);
90
91 if (d == 0) {
92 /* If the paths are equal, check the mode */
93 if (p->mode < q->mode)
94 return -1;
95
96 if (p->mode > q->mode)
97 return 1;
98
99 return 0;
100 }
101
102 /* If the paths are not equal, then order prefixes first */
103 return d;
104 }
105
106 static void drop_duplicates(BindMount *m, unsigned *n) {
107 BindMount *f, *t, *previous;
108
109 assert(m);
110 assert(n);
111
112 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
113
114 /* The first one wins */
115 if (previous && path_equal(f->path, previous->path))
116 continue;
117
118 *t = *f;
119
120 previous = t;
121
122 t++;
123 }
124
125 *n = t - m;
126 }
127
128 static int mount_dev(BindMount *m) {
129 static const char devnodes[] =
130 "/dev/null\0"
131 "/dev/zero\0"
132 "/dev/full\0"
133 "/dev/random\0"
134 "/dev/urandom\0"
135 "/dev/tty\0";
136
137 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
138 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
139 _cleanup_umask_ mode_t u;
140 int r;
141
142 assert(m);
143
144 u = umask(0000);
145
146 if (!mkdtemp(temporary_mount))
147 return -errno;
148
149 dev = strjoina(temporary_mount, "/dev");
150 (void) mkdir(dev, 0755);
151 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
152 r = -errno;
153 goto fail;
154 }
155
156 devpts = strjoina(temporary_mount, "/dev/pts");
157 (void) mkdir(devpts, 0755);
158 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
159 r = -errno;
160 goto fail;
161 }
162
163 devptmx = strjoina(temporary_mount, "/dev/ptmx");
164 if (symlink("pts/ptmx", devptmx) < 0) {
165 r = -errno;
166 goto fail;
167 }
168
169 devshm = strjoina(temporary_mount, "/dev/shm");
170 (void) mkdir(devshm, 01777);
171 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
172 if (r < 0) {
173 r = -errno;
174 goto fail;
175 }
176
177 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
178 (void) mkdir(devmqueue, 0755);
179 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
180
181 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
182 (void) mkdir(devhugepages, 0755);
183 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
184
185 devlog = strjoina(temporary_mount, "/dev/log");
186 (void) symlink("/run/systemd/journal/dev-log", devlog);
187
188 NULSTR_FOREACH(d, devnodes) {
189 _cleanup_free_ char *dn = NULL;
190 struct stat st;
191
192 r = stat(d, &st);
193 if (r < 0) {
194
195 if (errno == ENOENT)
196 continue;
197
198 r = -errno;
199 goto fail;
200 }
201
202 if (!S_ISBLK(st.st_mode) &&
203 !S_ISCHR(st.st_mode)) {
204 r = -EINVAL;
205 goto fail;
206 }
207
208 if (st.st_rdev == 0)
209 continue;
210
211 dn = strappend(temporary_mount, d);
212 if (!dn) {
213 r = -ENOMEM;
214 goto fail;
215 }
216
217 mac_selinux_create_file_prepare(d, st.st_mode);
218 r = mknod(dn, st.st_mode, st.st_rdev);
219 mac_selinux_create_file_clear();
220
221 if (r < 0) {
222 r = -errno;
223 goto fail;
224 }
225 }
226
227 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
228
229 /* Create the /dev directory if missing. It is more likely to be
230 * missing when the service is started with RootDirectory. This is
231 * consistent with mount units creating the mount points when missing.
232 */
233 (void) mkdir_p_label(m->path, 0755);
234
235 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
236 r = -errno;
237 goto fail;
238 }
239
240 rmdir(dev);
241 rmdir(temporary_mount);
242
243 return 0;
244
245 fail:
246 if (devpts)
247 umount(devpts);
248
249 if (devshm)
250 umount(devshm);
251
252 if (devhugepages)
253 umount(devhugepages);
254
255 if (devmqueue)
256 umount(devmqueue);
257
258 umount(dev);
259 rmdir(dev);
260 rmdir(temporary_mount);
261
262 return r;
263 }
264
265 static int mount_kdbus(BindMount *m) {
266
267 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
268 _cleanup_free_ char *basepath = NULL;
269 _cleanup_umask_ mode_t u;
270 char *busnode = NULL, *root;
271 struct stat st;
272 int r;
273
274 assert(m);
275
276 u = umask(0000);
277
278 if (!mkdtemp(temporary_mount))
279 return log_error_errno(errno, "Failed create temp dir: %m");
280
281 root = strjoina(temporary_mount, "/kdbus");
282 (void) mkdir(root, 0755);
283 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
284 r = -errno;
285 goto fail;
286 }
287
288 /* create a new /dev/null dev node copy so we have some fodder to
289 * bind-mount the custom endpoint over. */
290 if (stat("/dev/null", &st) < 0) {
291 r = log_error_errno(errno, "Failed to stat /dev/null: %m");
292 goto fail;
293 }
294
295 busnode = strjoina(root, "/bus");
296 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
297 r = log_error_errno(errno, "mknod() for %s failed: %m",
298 busnode);
299 goto fail;
300 }
301
302 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
303 if (r < 0) {
304 r = log_error_errno(errno, "bind mount of %s failed: %m",
305 m->path);
306 goto fail;
307 }
308
309 basepath = dirname_malloc(m->path);
310 if (!basepath) {
311 r = -ENOMEM;
312 goto fail;
313 }
314
315 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
316 r = log_error_errno(errno, "bind mount of %s failed: %m",
317 basepath);
318 goto fail;
319 }
320
321 rmdir(temporary_mount);
322 return 0;
323
324 fail:
325 if (busnode) {
326 umount(busnode);
327 unlink(busnode);
328 }
329
330 umount(root);
331 rmdir(root);
332 rmdir(temporary_mount);
333
334 return r;
335 }
336
337 static int apply_mount(
338 BindMount *m,
339 const char *tmp_dir,
340 const char *var_tmp_dir) {
341
342 const char *what;
343 int r;
344
345 assert(m);
346
347 switch (m->mode) {
348
349 case INACCESSIBLE:
350
351 /* First, get rid of everything that is below if there
352 * is anything... Then, overmount it with an
353 * inaccessible directory. */
354 umount_recursive(m->path, 0);
355
356 what = "/run/systemd/inaccessible";
357 break;
358
359 case READONLY:
360 case READWRITE:
361 /* Nothing to mount here, we just later toggle the
362 * MS_RDONLY bit for the mount point */
363 return 0;
364
365 case PRIVATE_TMP:
366 what = tmp_dir;
367 break;
368
369 case PRIVATE_VAR_TMP:
370 what = var_tmp_dir;
371 break;
372
373 case PRIVATE_DEV:
374 return mount_dev(m);
375
376 case PRIVATE_BUS_ENDPOINT:
377 return mount_kdbus(m);
378
379 default:
380 assert_not_reached("Unknown mode");
381 }
382
383 assert(what);
384
385 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
386 if (r >= 0)
387 log_debug("Successfully mounted %s to %s", what, m->path);
388 else if (m->ignore && errno == ENOENT)
389 return 0;
390
391 return r;
392 }
393
394 static int make_read_only(BindMount *m) {
395 int r;
396
397 assert(m);
398
399 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
400 r = bind_remount_recursive(m->path, true);
401 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
402 r = bind_remount_recursive(m->path, false);
403 else
404 r = 0;
405
406 if (m->ignore && r == -ENOENT)
407 return 0;
408
409 return r;
410 }
411
412 int setup_namespace(
413 const char* root_directory,
414 char** read_write_dirs,
415 char** read_only_dirs,
416 char** inaccessible_dirs,
417 const char* tmp_dir,
418 const char* var_tmp_dir,
419 const char* bus_endpoint_path,
420 bool private_dev,
421 ProtectHome protect_home,
422 ProtectSystem protect_system,
423 unsigned long mount_flags) {
424
425 BindMount *m, *mounts = NULL;
426 unsigned n;
427 int r = 0;
428
429 if (mount_flags == 0)
430 mount_flags = MS_SHARED;
431
432 if (unshare(CLONE_NEWNS) < 0)
433 return -errno;
434
435 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
436 strv_length(read_write_dirs) +
437 strv_length(read_only_dirs) +
438 strv_length(inaccessible_dirs) +
439 private_dev +
440 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
441 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
442 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
443
444 if (n > 0) {
445 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
446 r = append_mounts(&m, read_write_dirs, READWRITE);
447 if (r < 0)
448 return r;
449
450 r = append_mounts(&m, read_only_dirs, READONLY);
451 if (r < 0)
452 return r;
453
454 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
455 if (r < 0)
456 return r;
457
458 if (tmp_dir) {
459 m->path = prefix_roota(root_directory, "/tmp");
460 m->mode = PRIVATE_TMP;
461 m++;
462 }
463
464 if (var_tmp_dir) {
465 m->path = prefix_roota(root_directory, "/var/tmp");
466 m->mode = PRIVATE_VAR_TMP;
467 m++;
468 }
469
470 if (private_dev) {
471 m->path = prefix_roota(root_directory, "/dev");
472 m->mode = PRIVATE_DEV;
473 m++;
474 }
475
476 if (bus_endpoint_path) {
477 m->path = prefix_roota(root_directory, bus_endpoint_path);
478 m->mode = PRIVATE_BUS_ENDPOINT;
479 m++;
480 }
481
482 if (protect_home != PROTECT_HOME_NO) {
483 const char *home_dir, *run_user_dir, *root_dir;
484
485 home_dir = prefix_roota(root_directory, "/home");
486 home_dir = strjoina("-", home_dir);
487 run_user_dir = prefix_roota(root_directory, "/run/user");
488 run_user_dir = strjoina("-", run_user_dir);
489 root_dir = prefix_roota(root_directory, "/root");
490 root_dir = strjoina("-", root_dir);
491
492 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
493 protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
494 if (r < 0)
495 return r;
496 }
497
498 if (protect_system != PROTECT_SYSTEM_NO) {
499 const char *usr_dir, *boot_dir, *etc_dir;
500
501 usr_dir = prefix_roota(root_directory, "/usr");
502 boot_dir = prefix_roota(root_directory, "/boot");
503 boot_dir = strjoina("-", boot_dir);
504 etc_dir = prefix_roota(root_directory, "/etc");
505
506 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
507 ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
508 : STRV_MAKE(usr_dir, boot_dir), READONLY);
509 if (r < 0)
510 return r;
511 }
512
513 assert(mounts + n == m);
514
515 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
516 drop_duplicates(mounts, &n);
517 }
518
519 if (n > 0 || root_directory) {
520 /* Remount / as SLAVE so that nothing now mounted in the namespace
521 shows up in the parent */
522 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
523 return -errno;
524 }
525
526 if (root_directory) {
527 /* Turn directory into bind mount */
528 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
529 return -errno;
530 }
531
532 if (n > 0) {
533 for (m = mounts; m < mounts + n; ++m) {
534 r = apply_mount(m, tmp_dir, var_tmp_dir);
535 if (r < 0)
536 goto fail;
537 }
538
539 for (m = mounts; m < mounts + n; ++m) {
540 r = make_read_only(m);
541 if (r < 0)
542 goto fail;
543 }
544 }
545
546 if (root_directory) {
547 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
548 r = mount_move_root(root_directory);
549
550 /* at this point, we cannot rollback */
551 if (r < 0)
552 return r;
553 }
554
555 /* Remount / as the desired mode. Not that this will not
556 * reestablish propagation from our side to the host, since
557 * what's disconnected is disconnected. */
558 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
559 /* at this point, we cannot rollback */
560 return -errno;
561
562 return 0;
563
564 fail:
565 if (n > 0) {
566 for (m = mounts; m < mounts + n; ++m)
567 if (m->done)
568 (void) umount2(m->path, MNT_DETACH);
569 }
570
571 return r;
572 }
573
574 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
575 _cleanup_free_ char *x = NULL;
576 char bid[SD_ID128_STRING_MAX];
577 sd_id128_t boot_id;
578 int r;
579
580 assert(id);
581 assert(prefix);
582 assert(path);
583
584 /* We include the boot id in the directory so that after a
585 * reboot we can easily identify obsolete directories. */
586
587 r = sd_id128_get_boot(&boot_id);
588 if (r < 0)
589 return r;
590
591 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
592 if (!x)
593 return -ENOMEM;
594
595 RUN_WITH_UMASK(0077)
596 if (!mkdtemp(x))
597 return -errno;
598
599 RUN_WITH_UMASK(0000) {
600 char *y;
601
602 y = strjoina(x, "/tmp");
603
604 if (mkdir(y, 0777 | S_ISVTX) < 0)
605 return -errno;
606 }
607
608 *path = x;
609 x = NULL;
610
611 return 0;
612 }
613
/* Creates the private directory trees for /tmp and /var/tmp.
 *
 * id           Identifier embedded in the directory names.
 * tmp_dir      Receives the allocated path of the tree below /tmp.
 * var_tmp_dir  Receives the allocated path of the tree below /var/tmp.
 *
 * Returns 0 on success, in which case both output pointers are set and
 * owned by the caller. On failure a negative errno-style value is
 * returned, the outputs are left untouched, and a tree already created
 * below /tmp is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *tmp_path, *var_tmp_path;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &tmp_path);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &var_tmp_path);
        if (r >= 0) {
                *tmp_dir = tmp_path;
                *var_tmp_dir = var_tmp_path;

                return 0;
        }

        /* The second tree failed: take the first one down again, inner
         * directory first. */
        inner = strjoina(tmp_path, "/tmp");
        rmdir(inner);
        rmdir(tmp_path);

        free(tmp_path);
        return r;
}
643
644 int setup_netns(int netns_storage_socket[2]) {
645 _cleanup_close_ int netns = -1;
646 int r, q;
647
648 assert(netns_storage_socket);
649 assert(netns_storage_socket[0] >= 0);
650 assert(netns_storage_socket[1] >= 0);
651
652 /* We use the passed socketpair as a storage buffer for our
653 * namespace reference fd. Whatever process runs this first
654 * shall create a new namespace, all others should just join
655 * it. To serialize that we use a file lock on the socket
656 * pair.
657 *
658 * It's a bit crazy, but hey, works great! */
659
660 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
661 return -errno;
662
663 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
664 if (netns == -EAGAIN) {
665 /* Nothing stored yet, so let's create a new namespace */
666
667 if (unshare(CLONE_NEWNET) < 0) {
668 r = -errno;
669 goto fail;
670 }
671
672 loopback_setup();
673
674 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
675 if (netns < 0) {
676 r = -errno;
677 goto fail;
678 }
679
680 r = 1;
681
682 } else if (netns < 0) {
683 r = netns;
684 goto fail;
685
686 } else {
687 /* Yay, found something, so let's join the namespace */
688 if (setns(netns, CLONE_NEWNET) < 0) {
689 r = -errno;
690 goto fail;
691 }
692
693 r = 0;
694 }
695
696 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
697 if (q < 0) {
698 r = q;
699 goto fail;
700 }
701
702 fail:
703 lockf(netns_storage_socket[0], F_ULOCK, 0);
704 return r;
705 }
706
707 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
708 [PROTECT_HOME_NO] = "no",
709 [PROTECT_HOME_YES] = "yes",
710 [PROTECT_HOME_READ_ONLY] = "read-only",
711 };
712
713 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
714
715 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
716 [PROTECT_SYSTEM_NO] = "no",
717 [PROTECT_SYSTEM_YES] = "yes",
718 [PROTECT_SYSTEM_FULL] = "full",
719 };
720
721 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);