git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
namespace: make sure /tmp, /var/tmp and /dev are writable in namespaces we set up
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45 #include "label.h"
46
/* What to do with a path inside the namespace.
 *
 * The numeric order is significant: mount_path_compare() sorts entries
 * for the same path by ascending mode and drop_duplicates() keeps the
 * first one, so a lower value takes priority over a higher one. */
enum MountMode {
        INACCESSIBLE,
        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        READWRITE
};

typedef enum MountMode MountMode;
56
57 typedef struct BindMount {
58 const char *path;
59 MountMode mode;
60 bool done;
61 bool ignore;
62 } BindMount;
63
64 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
65 char **i;
66
67 assert(p);
68
69 STRV_FOREACH(i, strv) {
70
71 (*p)->ignore = false;
72 (*p)->done = false;
73
74 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
75 (*p)->ignore = true;
76 (*i)++;
77 }
78
79 if (!path_is_absolute(*i))
80 return -EINVAL;
81
82 (*p)->path = *i;
83 (*p)->mode = mode;
84 (*p)++;
85 }
86
87 return 0;
88 }
89
90 static int mount_path_compare(const void *a, const void *b) {
91 const BindMount *p = a, *q = b;
92
93 if (path_equal(p->path, q->path)) {
94
95 /* If the paths are equal, check the mode */
96 if (p->mode < q->mode)
97 return -1;
98
99 if (p->mode > q->mode)
100 return 1;
101
102 return 0;
103 }
104
105 /* If the paths are not equal, then order prefixes first */
106 if (path_startswith(p->path, q->path))
107 return 1;
108
109 if (path_startswith(q->path, p->path))
110 return -1;
111
112 return 0;
113 }
114
115 static void drop_duplicates(BindMount *m, unsigned *n) {
116 BindMount *f, *t, *previous;
117
118 assert(m);
119 assert(n);
120
121 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
122
123 /* The first one wins */
124 if (previous && path_equal(f->path, previous->path))
125 continue;
126
127 t->path = f->path;
128 t->mode = f->mode;
129
130 previous = t;
131
132 t++;
133 }
134
135 *n = t - m;
136 }
137
138 static int mount_dev(BindMount *m) {
139 static const char devnodes[] =
140 "/dev/null\0"
141 "/dev/zero\0"
142 "/dev/full\0"
143 "/dev/random\0"
144 "/dev/urandom\0"
145 "/dev/tty\0";
146
147 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
148 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
149 _cleanup_umask_ mode_t u;
150 int r;
151
152 assert(m);
153
154 u = umask(0000);
155
156 if (!mkdtemp(temporary_mount))
157 return -errno;
158
159 dev = strappenda(temporary_mount, "/dev");
160 mkdir(dev, 0755);
161 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
162 r = -errno;
163 goto fail;
164 }
165
166 devpts = strappenda(temporary_mount, "/dev/pts");
167 mkdir(devpts, 0755);
168 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
169 r = -errno;
170 goto fail;
171 }
172
173 devptmx = strappenda(temporary_mount, "/dev/ptmx");
174 symlink("pts/ptmx", devptmx);
175
176 devshm = strappenda(temporary_mount, "/dev/shm");
177 mkdir(devshm, 01777);
178 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
179 if (r < 0) {
180 r = -errno;
181 goto fail;
182 }
183
184 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
185 mkdir(devmqueue, 0755);
186 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
187
188 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
189 mkdir(devkdbus, 0755);
190 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
191
192 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
193 mkdir(devhugepages, 0755);
194 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
195
196 devlog = strappenda(temporary_mount, "/dev/log");
197 symlink("/run/systemd/journal/dev-log", devlog);
198
199 NULSTR_FOREACH(d, devnodes) {
200 _cleanup_free_ char *dn = NULL;
201 struct stat st;
202
203 r = stat(d, &st);
204 if (r < 0) {
205
206 if (errno == ENOENT)
207 continue;
208
209 r = -errno;
210 goto fail;
211 }
212
213 if (!S_ISBLK(st.st_mode) &&
214 !S_ISCHR(st.st_mode)) {
215 r = -EINVAL;
216 goto fail;
217 }
218
219 if (st.st_rdev == 0)
220 continue;
221
222 dn = strappend(temporary_mount, d);
223 if (!dn) {
224 r = -ENOMEM;
225 goto fail;
226 }
227
228 label_context_set(d, st.st_mode);
229 r = mknod(dn, st.st_mode, st.st_rdev);
230 label_context_clear();
231
232 if (r < 0) {
233 r = -errno;
234 goto fail;
235 }
236 }
237
238 dev_setup(temporary_mount);
239
240 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
241 r = -errno;
242 goto fail;
243 }
244
245 rmdir(dev);
246 rmdir(temporary_mount);
247
248 return 0;
249
250 fail:
251 if (devpts)
252 umount(devpts);
253
254 if (devshm)
255 umount(devshm);
256
257 if (devkdbus)
258 umount(devkdbus);
259
260 if (devhugepages)
261 umount(devhugepages);
262
263 if (devmqueue)
264 umount(devmqueue);
265
266 if (dev) {
267 umount(dev);
268 rmdir(dev);
269 }
270
271 rmdir(temporary_mount);
272
273 return r;
274 }
275
276 static int apply_mount(
277 BindMount *m,
278 const char *tmp_dir,
279 const char *var_tmp_dir) {
280
281 const char *what;
282 int r;
283
284 assert(m);
285
286 switch (m->mode) {
287
288 case INACCESSIBLE:
289
290 /* First, get rid of everything that is below if there
291 * is anything... Then, overmount it with an
292 * inaccessible directory. */
293 umount_recursive(m->path, 0);
294
295 what = "/run/systemd/inaccessible";
296 break;
297
298 case READONLY:
299 case READWRITE:
300 /* Nothing to mount here, we just later toggle the
301 * MS_RDONLY bit for the mount point */
302 return 0;
303
304 case PRIVATE_TMP:
305 what = tmp_dir;
306 break;
307
308 case PRIVATE_VAR_TMP:
309 what = var_tmp_dir;
310 break;
311
312 case PRIVATE_DEV:
313 return mount_dev(m);
314
315 default:
316 assert_not_reached("Unknown mode");
317 }
318
319 assert(what);
320
321 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
322 if (r >= 0)
323 log_debug("Successfully mounted %s to %s", what, m->path);
324 else if (m->ignore && errno == ENOENT)
325 return 0;
326
327 return r;
328 }
329
330 static int make_read_only(BindMount *m) {
331 int r;
332
333 assert(m);
334
335 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
336 r = bind_remount_recursive(m->path, true);
337 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
338 r = bind_remount_recursive(m->path, false);
339 else
340 r = 0;
341
342 if (m->ignore && r == -ENOENT)
343 return 0;
344
345 return r;
346 }
347
348 int setup_namespace(
349 char** read_write_dirs,
350 char** read_only_dirs,
351 char** inaccessible_dirs,
352 char* tmp_dir,
353 char* var_tmp_dir,
354 bool private_dev,
355 ProtectHome protect_home,
356 ProtectSystem protect_system,
357 unsigned mount_flags) {
358
359 BindMount *m, *mounts = NULL;
360 unsigned n;
361 int r = 0;
362
363 if (mount_flags == 0)
364 mount_flags = MS_SHARED;
365
366 if (unshare(CLONE_NEWNS) < 0)
367 return -errno;
368
369 n = !!tmp_dir + !!var_tmp_dir +
370 strv_length(read_write_dirs) +
371 strv_length(read_only_dirs) +
372 strv_length(inaccessible_dirs) +
373 private_dev +
374 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
375 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
376 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
377
378 if (n > 0) {
379 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
380 r = append_mounts(&m, read_write_dirs, READWRITE);
381 if (r < 0)
382 return r;
383
384 r = append_mounts(&m, read_only_dirs, READONLY);
385 if (r < 0)
386 return r;
387
388 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
389 if (r < 0)
390 return r;
391
392 if (tmp_dir) {
393 m->path = "/tmp";
394 m->mode = PRIVATE_TMP;
395 m++;
396 }
397
398 if (var_tmp_dir) {
399 m->path = "/var/tmp";
400 m->mode = PRIVATE_VAR_TMP;
401 m++;
402 }
403
404 if (private_dev) {
405 m->path = "/dev";
406 m->mode = PRIVATE_DEV;
407 m++;
408 }
409
410 if (protect_home != PROTECT_HOME_NO) {
411 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
412 if (r < 0)
413 return r;
414 }
415
416 if (protect_system != PROTECT_SYSTEM_NO) {
417 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
418 if (r < 0)
419 return r;
420 }
421
422 assert(mounts + n == m);
423
424 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
425 drop_duplicates(mounts, &n);
426 }
427
428 if (n > 0) {
429 /* Remount / as SLAVE so that nothing now mounted in the namespace
430 shows up in the parent */
431 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
432 return -errno;
433
434 for (m = mounts; m < mounts + n; ++m) {
435 r = apply_mount(m, tmp_dir, var_tmp_dir);
436 if (r < 0)
437 goto fail;
438 }
439
440 for (m = mounts; m < mounts + n; ++m) {
441 r = make_read_only(m);
442 if (r < 0)
443 goto fail;
444 }
445 }
446
447 /* Remount / as the desired mode. Not that this will not
448 * reestablish propagation from our side to the host, since
449 * what's disconnected is disconnected. */
450 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
451 r = -errno;
452 goto fail;
453 }
454
455 return 0;
456
457 fail:
458 if (n > 0) {
459 for (m = mounts; m < mounts + n; ++m)
460 if (m->done)
461 umount2(m->path, MNT_DETACH);
462 }
463
464 return r;
465 }
466
467 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
468 _cleanup_free_ char *x = NULL;
469 char bid[SD_ID128_STRING_MAX];
470 sd_id128_t boot_id;
471 int r;
472
473 assert(id);
474 assert(prefix);
475 assert(path);
476
477 /* We include the boot id in the directory so that after a
478 * reboot we can easily identify obsolete directories. */
479
480 r = sd_id128_get_boot(&boot_id);
481 if (r < 0)
482 return r;
483
484 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
485 if (!x)
486 return -ENOMEM;
487
488 RUN_WITH_UMASK(0077)
489 if (!mkdtemp(x))
490 return -errno;
491
492 RUN_WITH_UMASK(0000) {
493 char *y;
494
495 y = strappenda(x, "/tmp");
496
497 if (mkdir(y, 0777 | S_ISVTX) < 0)
498 return -errno;
499 }
500
501 *path = x;
502 x = NULL;
503
504 return 0;
505 }
506
/* Creates the pair of private temporary directories for id, one below
 * /tmp and one below /var/tmp, using setup_one_tmp_dir().
 *
 * On success 0 is returned and *tmp_dir / *var_tmp_dir receive the two
 * newly allocated directory names. If the second directory cannot be
 * created, the first one (and its "tmp" subdirectory) is removed again,
 * its name is freed and the error is returned; the output parameters are
 * not written in any failure case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the
         * directory itself */
        inner = strappenda(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
536
537 int setup_netns(int netns_storage_socket[2]) {
538 _cleanup_close_ int netns = -1;
539 union {
540 struct cmsghdr cmsghdr;
541 uint8_t buf[CMSG_SPACE(sizeof(int))];
542 } control = {};
543 struct msghdr mh = {
544 .msg_control = &control,
545 .msg_controllen = sizeof(control),
546 };
547 struct cmsghdr *cmsg;
548 int r;
549
550 assert(netns_storage_socket);
551 assert(netns_storage_socket[0] >= 0);
552 assert(netns_storage_socket[1] >= 0);
553
554 /* We use the passed socketpair as a storage buffer for our
555 * namespace reference fd. Whatever process runs this first
556 * shall create a new namespace, all others should just join
557 * it. To serialize that we use a file lock on the socket
558 * pair.
559 *
560 * It's a bit crazy, but hey, works great! */
561
562 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
563 return -errno;
564
565 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
566 if (errno != EAGAIN) {
567 r = -errno;
568 goto fail;
569 }
570
571 /* Nothing stored yet, so let's create a new namespace */
572
573 if (unshare(CLONE_NEWNET) < 0) {
574 r = -errno;
575 goto fail;
576 }
577
578 loopback_setup();
579
580 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
581 if (netns < 0) {
582 r = -errno;
583 goto fail;
584 }
585
586 r = 1;
587 } else {
588 /* Yay, found something, so let's join the namespace */
589
590 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
591 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
592 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
593 netns = *(int*) CMSG_DATA(cmsg);
594 }
595 }
596
597 if (setns(netns, CLONE_NEWNET) < 0) {
598 r = -errno;
599 goto fail;
600 }
601
602 r = 0;
603 }
604
605 cmsg = CMSG_FIRSTHDR(&mh);
606 cmsg->cmsg_level = SOL_SOCKET;
607 cmsg->cmsg_type = SCM_RIGHTS;
608 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
609 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
610 mh.msg_controllen = cmsg->cmsg_len;
611
612 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
613 r = -errno;
614 goto fail;
615 }
616
617 fail:
618 lockf(netns_storage_socket[0], F_ULOCK, 0);
619
620 return r;
621 }
622
623 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
624 [PROTECT_HOME_NO] = "no",
625 [PROTECT_HOME_YES] = "yes",
626 [PROTECT_HOME_READ_ONLY] = "read-only",
627 };
628
629 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
630
631 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
632 [PROTECT_SYSTEM_NO] = "no",
633 [PROTECT_SYSTEM_YES] = "yes",
634 [PROTECT_SYSTEM_FULL] = "full",
635 };
636
637 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);