]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
namespace: add missing 'const' to parameters
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45 #include "label.h"
46
/* What to do with a path when the mount namespace is set up. */
typedef enum MountMode {
        /* This is ordered by priority! When the same path is listed more
         * than once, entries are sorted by this value and only the first
         * (lowest) one is kept. */
        INACCESSIBLE,           /* overmounted with /run/systemd/inaccessible */
        READONLY,               /* remounted read-only */
        PRIVATE_TMP,            /* private directory bind mounted over /tmp */
        PRIVATE_VAR_TMP,        /* private directory bind mounted over /var/tmp */
        PRIVATE_DEV,            /* freshly populated tmpfs moved over /dev */
        PRIVATE_BUS_ENDPOINT,   /* custom bus endpoint node */
        READWRITE               /* remounted writable */
} MountMode;
57
58 typedef struct BindMount {
59 const char *path;
60 MountMode mode;
61 bool done;
62 bool ignore;
63 } BindMount;
64
65 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
66 char **i;
67
68 assert(p);
69
70 STRV_FOREACH(i, strv) {
71
72 (*p)->ignore = false;
73 (*p)->done = false;
74
75 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
76 (*p)->ignore = true;
77 (*i)++;
78 }
79
80 if (!path_is_absolute(*i))
81 return -EINVAL;
82
83 (*p)->path = *i;
84 (*p)->mode = mode;
85 (*p)++;
86 }
87
88 return 0;
89 }
90
91 static int mount_path_compare(const void *a, const void *b) {
92 const BindMount *p = a, *q = b;
93
94 if (path_equal(p->path, q->path)) {
95
96 /* If the paths are equal, check the mode */
97 if (p->mode < q->mode)
98 return -1;
99
100 if (p->mode > q->mode)
101 return 1;
102
103 return 0;
104 }
105
106 /* If the paths are not equal, then order prefixes first */
107 if (path_startswith(p->path, q->path))
108 return 1;
109
110 if (path_startswith(q->path, p->path))
111 return -1;
112
113 return 0;
114 }
115
116 static void drop_duplicates(BindMount *m, unsigned *n) {
117 BindMount *f, *t, *previous;
118
119 assert(m);
120 assert(n);
121
122 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
123
124 /* The first one wins */
125 if (previous && path_equal(f->path, previous->path))
126 continue;
127
128 *t = *f;
129
130 previous = t;
131
132 t++;
133 }
134
135 *n = t - m;
136 }
137
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
140 "/dev/null\0"
141 "/dev/zero\0"
142 "/dev/full\0"
143 "/dev/random\0"
144 "/dev/urandom\0"
145 "/dev/tty\0";
146
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
150 int r;
151
152 assert(m);
153
154 u = umask(0000);
155
156 if (!mkdtemp(temporary_mount))
157 return -errno;
158
159 dev = strappenda(temporary_mount, "/dev");
160 mkdir(dev, 0755);
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
162 r = -errno;
163 goto fail;
164 }
165
166 devpts = strappenda(temporary_mount, "/dev/pts");
167 mkdir(devpts, 0755);
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
169 r = -errno;
170 goto fail;
171 }
172
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
174 symlink("pts/ptmx", devptmx);
175
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
179 if (r < 0) {
180 r = -errno;
181 goto fail;
182 }
183
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
187
188 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
189 mkdir(devkdbus, 0755);
190 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
191
192 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
193 mkdir(devhugepages, 0755);
194 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
195
196 devlog = strappenda(temporary_mount, "/dev/log");
197 symlink("/run/systemd/journal/dev-log", devlog);
198
199 NULSTR_FOREACH(d, devnodes) {
200 _cleanup_free_ char *dn = NULL;
201 struct stat st;
202
203 r = stat(d, &st);
204 if (r < 0) {
205
206 if (errno == ENOENT)
207 continue;
208
209 r = -errno;
210 goto fail;
211 }
212
213 if (!S_ISBLK(st.st_mode) &&
214 !S_ISCHR(st.st_mode)) {
215 r = -EINVAL;
216 goto fail;
217 }
218
219 if (st.st_rdev == 0)
220 continue;
221
222 dn = strappend(temporary_mount, d);
223 if (!dn) {
224 r = -ENOMEM;
225 goto fail;
226 }
227
228 label_context_set(d, st.st_mode);
229 r = mknod(dn, st.st_mode, st.st_rdev);
230 label_context_clear();
231
232 if (r < 0) {
233 r = -errno;
234 goto fail;
235 }
236 }
237
238 dev_setup(temporary_mount);
239
240 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
241 r = -errno;
242 goto fail;
243 }
244
245 rmdir(dev);
246 rmdir(temporary_mount);
247
248 return 0;
249
250 fail:
251 if (devpts)
252 umount(devpts);
253
254 if (devshm)
255 umount(devshm);
256
257 if (devkdbus)
258 umount(devkdbus);
259
260 if (devhugepages)
261 umount(devhugepages);
262
263 if (devmqueue)
264 umount(devmqueue);
265
266 umount(dev);
267 rmdir(dev);
268 rmdir(temporary_mount);
269
270 return r;
271 }
272
273 static int mount_kdbus(BindMount *m) {
274
275 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
276 _cleanup_free_ char *basepath = NULL;
277 _cleanup_umask_ mode_t u;
278 char *busnode = NULL, *root;
279 struct stat st;
280 int r;
281
282 assert(m);
283
284 u = umask(0000);
285
286 if (!mkdtemp(temporary_mount)) {
287 log_error("Failed create temp dir: %m");
288 return -errno;
289 }
290
291 root = strappenda(temporary_mount, "/kdbus");
292 mkdir(root, 0755);
293 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
294 r = -errno;
295 goto fail;
296 }
297
298 /* create a new /dev/null dev node copy so we have some fodder to
299 * bind-mount the custom endpoint over. */
300 if (stat("/dev/null", &st) < 0) {
301 log_error("Failed to stat /dev/null: %m");
302 r = -errno;
303 goto fail;
304 }
305
306 busnode = strappenda(root, "/bus");
307 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
308 log_error("mknod() for %s failed: %m", busnode);
309 r = -errno;
310 goto fail;
311 }
312
313 r = mount(m->path, busnode, "bind", MS_BIND, NULL);
314 if (r < 0) {
315 log_error("bind mount of %s failed: %m", m->path);
316 r = -errno;
317 goto fail;
318 }
319
320 basepath = dirname_malloc(m->path);
321 if (!basepath) {
322 r = -ENOMEM;
323 goto fail;
324 }
325
326 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
327 log_error("bind mount of %s failed: %m", basepath);
328 r = -errno;
329 goto fail;
330 }
331
332 rmdir(temporary_mount);
333 return 0;
334
335 fail:
336 if (busnode) {
337 umount(busnode);
338 unlink(busnode);
339 }
340
341 umount(root);
342 rmdir(root);
343 rmdir(temporary_mount);
344
345 return r;
346 }
347
348 static int apply_mount(
349 BindMount *m,
350 const char *tmp_dir,
351 const char *var_tmp_dir) {
352
353 const char *what;
354 int r;
355
356 assert(m);
357
358 switch (m->mode) {
359
360 case INACCESSIBLE:
361
362 /* First, get rid of everything that is below if there
363 * is anything... Then, overmount it with an
364 * inaccessible directory. */
365 umount_recursive(m->path, 0);
366
367 what = "/run/systemd/inaccessible";
368 break;
369
370 case READONLY:
371 case READWRITE:
372 /* Nothing to mount here, we just later toggle the
373 * MS_RDONLY bit for the mount point */
374 return 0;
375
376 case PRIVATE_TMP:
377 what = tmp_dir;
378 break;
379
380 case PRIVATE_VAR_TMP:
381 what = var_tmp_dir;
382 break;
383
384 case PRIVATE_DEV:
385 return mount_dev(m);
386
387 case PRIVATE_BUS_ENDPOINT:
388 return mount_kdbus(m);
389
390 default:
391 assert_not_reached("Unknown mode");
392 }
393
394 assert(what);
395
396 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
397 if (r >= 0)
398 log_debug("Successfully mounted %s to %s", what, m->path);
399 else if (m->ignore && errno == ENOENT)
400 return 0;
401
402 return r;
403 }
404
405 static int make_read_only(BindMount *m) {
406 int r;
407
408 assert(m);
409
410 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
411 r = bind_remount_recursive(m->path, true);
412 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
413 r = bind_remount_recursive(m->path, false);
414 else
415 r = 0;
416
417 if (m->ignore && r == -ENOENT)
418 return 0;
419
420 return r;
421 }
422
423 int setup_namespace(
424 char** read_write_dirs,
425 char** read_only_dirs,
426 char** inaccessible_dirs,
427 const char* tmp_dir,
428 const char* var_tmp_dir,
429 const char* bus_endpoint_path,
430 bool private_dev,
431 ProtectHome protect_home,
432 ProtectSystem protect_system,
433 unsigned mount_flags) {
434
435 BindMount *m, *mounts = NULL;
436 unsigned n;
437 int r = 0;
438
439 if (mount_flags == 0)
440 mount_flags = MS_SHARED;
441
442 if (unshare(CLONE_NEWNS) < 0)
443 return -errno;
444
445 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
446 strv_length(read_write_dirs) +
447 strv_length(read_only_dirs) +
448 strv_length(inaccessible_dirs) +
449 private_dev +
450 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
451 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
452 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
453
454 if (n > 0) {
455 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
456 r = append_mounts(&m, read_write_dirs, READWRITE);
457 if (r < 0)
458 return r;
459
460 r = append_mounts(&m, read_only_dirs, READONLY);
461 if (r < 0)
462 return r;
463
464 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
465 if (r < 0)
466 return r;
467
468 if (tmp_dir) {
469 m->path = "/tmp";
470 m->mode = PRIVATE_TMP;
471 m++;
472 }
473
474 if (var_tmp_dir) {
475 m->path = "/var/tmp";
476 m->mode = PRIVATE_VAR_TMP;
477 m++;
478 }
479
480 if (private_dev) {
481 m->path = "/dev";
482 m->mode = PRIVATE_DEV;
483 m++;
484 }
485
486 if (bus_endpoint_path) {
487 m->path = bus_endpoint_path;
488 m->mode = PRIVATE_BUS_ENDPOINT;
489 m++;
490 }
491
492 if (protect_home != PROTECT_HOME_NO) {
493 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
494 if (r < 0)
495 return r;
496 }
497
498 if (protect_system != PROTECT_SYSTEM_NO) {
499 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
500 if (r < 0)
501 return r;
502 }
503
504 assert(mounts + n == m);
505
506 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
507 drop_duplicates(mounts, &n);
508 }
509
510 if (n > 0) {
511 /* Remount / as SLAVE so that nothing now mounted in the namespace
512 shows up in the parent */
513 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
514 return -errno;
515
516 for (m = mounts; m < mounts + n; ++m) {
517 r = apply_mount(m, tmp_dir, var_tmp_dir);
518 if (r < 0)
519 goto fail;
520 }
521
522 for (m = mounts; m < mounts + n; ++m) {
523 r = make_read_only(m);
524 if (r < 0)
525 goto fail;
526 }
527 }
528
529 /* Remount / as the desired mode. Not that this will not
530 * reestablish propagation from our side to the host, since
531 * what's disconnected is disconnected. */
532 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
533 r = -errno;
534 goto fail;
535 }
536
537 return 0;
538
539 fail:
540 if (n > 0) {
541 for (m = mounts; m < mounts + n; ++m)
542 if (m->done)
543 umount2(m->path, MNT_DETACH);
544 }
545
546 return r;
547 }
548
549 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
550 _cleanup_free_ char *x = NULL;
551 char bid[SD_ID128_STRING_MAX];
552 sd_id128_t boot_id;
553 int r;
554
555 assert(id);
556 assert(prefix);
557 assert(path);
558
559 /* We include the boot id in the directory so that after a
560 * reboot we can easily identify obsolete directories. */
561
562 r = sd_id128_get_boot(&boot_id);
563 if (r < 0)
564 return r;
565
566 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
567 if (!x)
568 return -ENOMEM;
569
570 RUN_WITH_UMASK(0077)
571 if (!mkdtemp(x))
572 return -errno;
573
574 RUN_WITH_UMASK(0000) {
575 char *y;
576
577 y = strappenda(x, "/tmp");
578
579 if (mkdir(y, 0777 | S_ISVTX) < 0)
580 return -errno;
581 }
582
583 *path = x;
584 x = NULL;
585
586 return 0;
587 }
588
/*
 * Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp.
 *
 * On success returns 0 and stores the two newly allocated paths in *tmp_dir
 * and *var_tmp_dir. If the second directory cannot be created, the first
 * one is removed again and the error is returned; the output parameters are
 * not touched in that case.
 */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second directory failed: roll back the first one, inner
         * directory first. */
        {
                char *inner;

                inner = strappenda(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
618
619 int setup_netns(int netns_storage_socket[2]) {
620 _cleanup_close_ int netns = -1;
621 union {
622 struct cmsghdr cmsghdr;
623 uint8_t buf[CMSG_SPACE(sizeof(int))];
624 } control = {};
625 struct msghdr mh = {
626 .msg_control = &control,
627 .msg_controllen = sizeof(control),
628 };
629 struct cmsghdr *cmsg;
630 int r;
631
632 assert(netns_storage_socket);
633 assert(netns_storage_socket[0] >= 0);
634 assert(netns_storage_socket[1] >= 0);
635
636 /* We use the passed socketpair as a storage buffer for our
637 * namespace reference fd. Whatever process runs this first
638 * shall create a new namespace, all others should just join
639 * it. To serialize that we use a file lock on the socket
640 * pair.
641 *
642 * It's a bit crazy, but hey, works great! */
643
644 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
645 return -errno;
646
647 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
648 if (errno != EAGAIN) {
649 r = -errno;
650 goto fail;
651 }
652
653 /* Nothing stored yet, so let's create a new namespace */
654
655 if (unshare(CLONE_NEWNET) < 0) {
656 r = -errno;
657 goto fail;
658 }
659
660 loopback_setup();
661
662 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
663 if (netns < 0) {
664 r = -errno;
665 goto fail;
666 }
667
668 r = 1;
669 } else {
670 /* Yay, found something, so let's join the namespace */
671
672 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
673 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
674 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
675 netns = *(int*) CMSG_DATA(cmsg);
676 }
677 }
678
679 if (setns(netns, CLONE_NEWNET) < 0) {
680 r = -errno;
681 goto fail;
682 }
683
684 r = 0;
685 }
686
687 cmsg = CMSG_FIRSTHDR(&mh);
688 cmsg->cmsg_level = SOL_SOCKET;
689 cmsg->cmsg_type = SCM_RIGHTS;
690 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
691 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
692 mh.msg_controllen = cmsg->cmsg_len;
693
694 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
695 r = -errno;
696 goto fail;
697 }
698
699 fail:
700 lockf(netns_storage_socket[0], F_ULOCK, 0);
701
702 return r;
703 }
704
705 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
706 [PROTECT_HOME_NO] = "no",
707 [PROTECT_HOME_YES] = "yes",
708 [PROTECT_HOME_READ_ONLY] = "read-only",
709 };
710
711 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
712
713 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
714 [PROTECT_SYSTEM_NO] = "no",
715 [PROTECT_SYSTEM_YES] = "yes",
716 [PROTECT_SYSTEM_FULL] = "full",
717 };
718
719 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);