]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
util: rework strappenda(), and rename it strjoina()
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "missing.h"
39 #include "execute.h"
40 #include "loopback-setup.h"
41 #include "mkdir.h"
42 #include "dev-setup.h"
43 #include "def.h"
44 #include "label.h"
45 #include "selinux-util.h"
46 #include "namespace.h"
47
/* The kind of treatment a path receives inside the new mount namespace.
 *
 * The numeric order matters: mount_path_compare() sorts entries for the
 * same path by ascending mode and drop_duplicates() keeps only the first
 * one, so the mode with the lowest value wins when a path is listed
 * more than once. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,           /* overmounted with /run/systemd/inaccessible, then made read-only */
        READONLY,               /* remounted read-only */
        PRIVATE_TMP,            /* the caller-provided tmp_dir is bind-mounted here */
        PRIVATE_VAR_TMP,        /* the caller-provided var_tmp_dir is bind-mounted here */
        PRIVATE_DEV,            /* replaced by the tmpfs populated in mount_dev() */
        PRIVATE_BUS_ENDPOINT,   /* handled by mount_kdbus() */
        READWRITE               /* remounted read-write */
} MountMode;
58
/* One entry of the mount table assembled by setup_namespace(). */
typedef struct BindMount {
        const char *path;       /* absolute path the entry applies to; not owned by this struct */
        MountMode mode;         /* what to do with the path */
        bool done;              /* consulted by setup_namespace()'s failure path to decide whether to unmount */
        bool ignore;            /* set for "-"-prefixed paths: ENOENT on this path is not an error */
} BindMount;
65
66 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
67 char **i;
68
69 assert(p);
70
71 STRV_FOREACH(i, strv) {
72
73 (*p)->ignore = false;
74 (*p)->done = false;
75
76 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
77 (*p)->ignore = true;
78 (*i)++;
79 }
80
81 if (!path_is_absolute(*i))
82 return -EINVAL;
83
84 (*p)->path = *i;
85 (*p)->mode = mode;
86 (*p)++;
87 }
88
89 return 0;
90 }
91
92 static int mount_path_compare(const void *a, const void *b) {
93 const BindMount *p = a, *q = b;
94
95 if (path_equal(p->path, q->path)) {
96
97 /* If the paths are equal, check the mode */
98 if (p->mode < q->mode)
99 return -1;
100
101 if (p->mode > q->mode)
102 return 1;
103
104 return 0;
105 }
106
107 /* If the paths are not equal, then order prefixes first */
108 if (path_startswith(p->path, q->path))
109 return 1;
110
111 if (path_startswith(q->path, p->path))
112 return -1;
113
114 return 0;
115 }
116
117 static void drop_duplicates(BindMount *m, unsigned *n) {
118 BindMount *f, *t, *previous;
119
120 assert(m);
121 assert(n);
122
123 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
124
125 /* The first one wins */
126 if (previous && path_equal(f->path, previous->path))
127 continue;
128
129 *t = *f;
130
131 previous = t;
132
133 t++;
134 }
135
136 *n = t - m;
137 }
138
139 static int mount_dev(BindMount *m) {
140 static const char devnodes[] =
141 "/dev/null\0"
142 "/dev/zero\0"
143 "/dev/full\0"
144 "/dev/random\0"
145 "/dev/urandom\0"
146 "/dev/tty\0";
147
148 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
149 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
150 _cleanup_umask_ mode_t u;
151 int r;
152
153 assert(m);
154
155 u = umask(0000);
156
157 if (!mkdtemp(temporary_mount))
158 return -errno;
159
160 dev = strjoina(temporary_mount, "/dev");
161 (void)mkdir(dev, 0755);
162 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
163 r = -errno;
164 goto fail;
165 }
166
167 devpts = strjoina(temporary_mount, "/dev/pts");
168 (void)mkdir(devpts, 0755);
169 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
170 r = -errno;
171 goto fail;
172 }
173
174 devptmx = strjoina(temporary_mount, "/dev/ptmx");
175 symlink("pts/ptmx", devptmx);
176
177 devshm = strjoina(temporary_mount, "/dev/shm");
178 (void)mkdir(devshm, 01777);
179 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
180 if (r < 0) {
181 r = -errno;
182 goto fail;
183 }
184
185 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
186 (void)mkdir(devmqueue, 0755);
187 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188
189 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
190 (void)mkdir(devhugepages, 0755);
191 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
192
193 devlog = strjoina(temporary_mount, "/dev/log");
194 symlink("/run/systemd/journal/dev-log", devlog);
195
196 NULSTR_FOREACH(d, devnodes) {
197 _cleanup_free_ char *dn = NULL;
198 struct stat st;
199
200 r = stat(d, &st);
201 if (r < 0) {
202
203 if (errno == ENOENT)
204 continue;
205
206 r = -errno;
207 goto fail;
208 }
209
210 if (!S_ISBLK(st.st_mode) &&
211 !S_ISCHR(st.st_mode)) {
212 r = -EINVAL;
213 goto fail;
214 }
215
216 if (st.st_rdev == 0)
217 continue;
218
219 dn = strappend(temporary_mount, d);
220 if (!dn) {
221 r = -ENOMEM;
222 goto fail;
223 }
224
225 mac_selinux_create_file_prepare(d, st.st_mode);
226 r = mknod(dn, st.st_mode, st.st_rdev);
227 mac_selinux_create_file_clear();
228
229 if (r < 0) {
230 r = -errno;
231 goto fail;
232 }
233 }
234
235 dev_setup(temporary_mount);
236
237 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
238 r = -errno;
239 goto fail;
240 }
241
242 rmdir(dev);
243 rmdir(temporary_mount);
244
245 return 0;
246
247 fail:
248 if (devpts)
249 umount(devpts);
250
251 if (devshm)
252 umount(devshm);
253
254 if (devhugepages)
255 umount(devhugepages);
256
257 if (devmqueue)
258 umount(devmqueue);
259
260 umount(dev);
261 rmdir(dev);
262 rmdir(temporary_mount);
263
264 return r;
265 }
266
267 static int mount_kdbus(BindMount *m) {
268
269 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
270 _cleanup_free_ char *basepath = NULL;
271 _cleanup_umask_ mode_t u;
272 char *busnode = NULL, *root;
273 struct stat st;
274 int r;
275
276 assert(m);
277
278 u = umask(0000);
279
280 if (!mkdtemp(temporary_mount))
281 return log_error_errno(errno, "Failed create temp dir: %m");
282
283 root = strjoina(temporary_mount, "/kdbus");
284 (void)mkdir(root, 0755);
285 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
286 r = -errno;
287 goto fail;
288 }
289
290 /* create a new /dev/null dev node copy so we have some fodder to
291 * bind-mount the custom endpoint over. */
292 if (stat("/dev/null", &st) < 0) {
293 log_error_errno(errno, "Failed to stat /dev/null: %m");
294 r = -errno;
295 goto fail;
296 }
297
298 busnode = strjoina(root, "/bus");
299 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
300 log_error_errno(errno, "mknod() for %s failed: %m", busnode);
301 r = -errno;
302 goto fail;
303 }
304
305 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
306 if (r < 0) {
307 log_error_errno(errno, "bind mount of %s failed: %m", m->path);
308 r = -errno;
309 goto fail;
310 }
311
312 basepath = dirname_malloc(m->path);
313 if (!basepath) {
314 r = -ENOMEM;
315 goto fail;
316 }
317
318 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
319 log_error_errno(errno, "bind mount of %s failed: %m", basepath);
320 r = -errno;
321 goto fail;
322 }
323
324 rmdir(temporary_mount);
325 return 0;
326
327 fail:
328 if (busnode) {
329 umount(busnode);
330 unlink(busnode);
331 }
332
333 umount(root);
334 rmdir(root);
335 rmdir(temporary_mount);
336
337 return r;
338 }
339
340 static int apply_mount(
341 BindMount *m,
342 const char *tmp_dir,
343 const char *var_tmp_dir) {
344
345 const char *what;
346 int r;
347
348 assert(m);
349
350 switch (m->mode) {
351
352 case INACCESSIBLE:
353
354 /* First, get rid of everything that is below if there
355 * is anything... Then, overmount it with an
356 * inaccessible directory. */
357 umount_recursive(m->path, 0);
358
359 what = "/run/systemd/inaccessible";
360 break;
361
362 case READONLY:
363 case READWRITE:
364 /* Nothing to mount here, we just later toggle the
365 * MS_RDONLY bit for the mount point */
366 return 0;
367
368 case PRIVATE_TMP:
369 what = tmp_dir;
370 break;
371
372 case PRIVATE_VAR_TMP:
373 what = var_tmp_dir;
374 break;
375
376 case PRIVATE_DEV:
377 return mount_dev(m);
378
379 case PRIVATE_BUS_ENDPOINT:
380 return mount_kdbus(m);
381
382 default:
383 assert_not_reached("Unknown mode");
384 }
385
386 assert(what);
387
388 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
389 if (r >= 0)
390 log_debug("Successfully mounted %s to %s", what, m->path);
391 else if (m->ignore && errno == ENOENT)
392 return 0;
393
394 return r;
395 }
396
397 static int make_read_only(BindMount *m) {
398 int r;
399
400 assert(m);
401
402 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
403 r = bind_remount_recursive(m->path, true);
404 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
405 r = bind_remount_recursive(m->path, false);
406 else
407 r = 0;
408
409 if (m->ignore && r == -ENOENT)
410 return 0;
411
412 return r;
413 }
414
415 int setup_namespace(
416 char** read_write_dirs,
417 char** read_only_dirs,
418 char** inaccessible_dirs,
419 const char* tmp_dir,
420 const char* var_tmp_dir,
421 const char* bus_endpoint_path,
422 bool private_dev,
423 ProtectHome protect_home,
424 ProtectSystem protect_system,
425 unsigned long mount_flags) {
426
427 BindMount *m, *mounts = NULL;
428 unsigned n;
429 int r = 0;
430
431 if (mount_flags == 0)
432 mount_flags = MS_SHARED;
433
434 if (unshare(CLONE_NEWNS) < 0)
435 return -errno;
436
437 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
438 strv_length(read_write_dirs) +
439 strv_length(read_only_dirs) +
440 strv_length(inaccessible_dirs) +
441 private_dev +
442 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
443 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
444 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
445
446 if (n > 0) {
447 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
448 r = append_mounts(&m, read_write_dirs, READWRITE);
449 if (r < 0)
450 return r;
451
452 r = append_mounts(&m, read_only_dirs, READONLY);
453 if (r < 0)
454 return r;
455
456 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
457 if (r < 0)
458 return r;
459
460 if (tmp_dir) {
461 m->path = "/tmp";
462 m->mode = PRIVATE_TMP;
463 m++;
464 }
465
466 if (var_tmp_dir) {
467 m->path = "/var/tmp";
468 m->mode = PRIVATE_VAR_TMP;
469 m++;
470 }
471
472 if (private_dev) {
473 m->path = "/dev";
474 m->mode = PRIVATE_DEV;
475 m++;
476 }
477
478 if (bus_endpoint_path) {
479 m->path = bus_endpoint_path;
480 m->mode = PRIVATE_BUS_ENDPOINT;
481 m++;
482 }
483
484 if (protect_home != PROTECT_HOME_NO) {
485 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
486 if (r < 0)
487 return r;
488 }
489
490 if (protect_system != PROTECT_SYSTEM_NO) {
491 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
492 if (r < 0)
493 return r;
494 }
495
496 assert(mounts + n == m);
497
498 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
499 drop_duplicates(mounts, &n);
500 }
501
502 if (n > 0) {
503 /* Remount / as SLAVE so that nothing now mounted in the namespace
504 shows up in the parent */
505 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
506 return -errno;
507
508 for (m = mounts; m < mounts + n; ++m) {
509 r = apply_mount(m, tmp_dir, var_tmp_dir);
510 if (r < 0)
511 goto fail;
512 }
513
514 for (m = mounts; m < mounts + n; ++m) {
515 r = make_read_only(m);
516 if (r < 0)
517 goto fail;
518 }
519 }
520
521 /* Remount / as the desired mode. Not that this will not
522 * reestablish propagation from our side to the host, since
523 * what's disconnected is disconnected. */
524 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
525 r = -errno;
526 goto fail;
527 }
528
529 return 0;
530
531 fail:
532 if (n > 0) {
533 for (m = mounts; m < mounts + n; ++m)
534 if (m->done)
535 umount2(m->path, MNT_DETACH);
536 }
537
538 return r;
539 }
540
541 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
542 _cleanup_free_ char *x = NULL;
543 char bid[SD_ID128_STRING_MAX];
544 sd_id128_t boot_id;
545 int r;
546
547 assert(id);
548 assert(prefix);
549 assert(path);
550
551 /* We include the boot id in the directory so that after a
552 * reboot we can easily identify obsolete directories. */
553
554 r = sd_id128_get_boot(&boot_id);
555 if (r < 0)
556 return r;
557
558 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
559 if (!x)
560 return -ENOMEM;
561
562 RUN_WITH_UMASK(0077)
563 if (!mkdtemp(x))
564 return -errno;
565
566 RUN_WITH_UMASK(0000) {
567 char *y;
568
569 y = strjoina(x, "/tmp");
570
571 if (mkdir(y, 0777 | S_ISVTX) < 0)
572 return -errno;
573 }
574
575 *path = x;
576 x = NULL;
577
578 return 0;
579 }
580
/* Creates the pair of private temporary directories for the given id,
 * one below /tmp and one below /var/tmp.
 *
 * On success 0 is returned and *tmp_dir and *var_tmp_dir are set to
 * newly allocated path strings. If the second directory cannot be set
 * up the first one is removed again, the error is returned and the
 * output parameters are left untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the
         * directory itself */
        {
                char *inner;

                inner = strjoina(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);

                free(private_tmp);
        }

        return r;
}
610
611 int setup_netns(int netns_storage_socket[2]) {
612 _cleanup_close_ int netns = -1;
613 union {
614 struct cmsghdr cmsghdr;
615 uint8_t buf[CMSG_SPACE(sizeof(int))];
616 } control = {};
617 struct msghdr mh = {
618 .msg_control = &control,
619 .msg_controllen = sizeof(control),
620 };
621 struct cmsghdr *cmsg;
622 int r;
623
624 assert(netns_storage_socket);
625 assert(netns_storage_socket[0] >= 0);
626 assert(netns_storage_socket[1] >= 0);
627
628 /* We use the passed socketpair as a storage buffer for our
629 * namespace reference fd. Whatever process runs this first
630 * shall create a new namespace, all others should just join
631 * it. To serialize that we use a file lock on the socket
632 * pair.
633 *
634 * It's a bit crazy, but hey, works great! */
635
636 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
637 return -errno;
638
639 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
640 if (errno != EAGAIN) {
641 r = -errno;
642 goto fail;
643 }
644
645 /* Nothing stored yet, so let's create a new namespace */
646
647 if (unshare(CLONE_NEWNET) < 0) {
648 r = -errno;
649 goto fail;
650 }
651
652 loopback_setup();
653
654 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
655 if (netns < 0) {
656 r = -errno;
657 goto fail;
658 }
659
660 r = 1;
661 } else {
662 /* Yay, found something, so let's join the namespace */
663
664 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
665 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
666 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
667 netns = *(int*) CMSG_DATA(cmsg);
668 }
669 }
670
671 if (setns(netns, CLONE_NEWNET) < 0) {
672 r = -errno;
673 goto fail;
674 }
675
676 r = 0;
677 }
678
679 cmsg = CMSG_FIRSTHDR(&mh);
680 cmsg->cmsg_level = SOL_SOCKET;
681 cmsg->cmsg_type = SCM_RIGHTS;
682 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
683 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
684 mh.msg_controllen = cmsg->cmsg_len;
685
686 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
687 r = -errno;
688 goto fail;
689 }
690
691 fail:
692 lockf(netns_storage_socket[0], F_ULOCK, 0);
693
694 return r;
695 }
696
/* String names of the ProtectHome values, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP() is defined elsewhere; presumably it
 * generates the string <-> enum conversion functions for this table
 * (verify against its definition). */
static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
        [PROTECT_HOME_NO] = "no",
        [PROTECT_HOME_YES] = "yes",
        [PROTECT_HOME_READ_ONLY] = "read-only",
};

DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
704
/* String names of the ProtectSystem values, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP() is defined elsewhere; presumably it
 * generates the string <-> enum conversion functions for this table
 * (verify against its definition). */
static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
        [PROTECT_SYSTEM_NO] = "no",
        [PROTECT_SYSTEM_YES] = "yes",
        [PROTECT_SYSTEM_FULL] = "full",
};

DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);