]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
core: rename ReadOnlySystem= to ProtectSystem= and add a third value for also mountin...
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45
/* How a single path is treated inside the private mount namespace.
 *
 * The declaration order is significant: when two entries refer to the same
 * path, mount_path_compare() sorts the numerically smaller mode first and
 * drop_duplicates() then keeps only that first entry. */
typedef enum MountMode {
        /* This is ordered by priority! */

        /* "/run/systemd/inaccessible" is bind mounted over the path, then
         * the mount is remounted read-only */
        INACCESSIBLE,

        /* The path is bind mounted onto itself and remounted read-only */
        READONLY,

        /* The private temporary directory is bind mounted over the path */
        PRIVATE_TMP,

        /* The private "var" temporary directory is bind mounted over the path */
        PRIVATE_VAR_TMP,

        /* A freshly populated tmpfs is moved over /dev */
        PRIVATE_DEV,

        /* The path is bind mounted onto itself and left writable */
        READWRITE
} MountMode;
55
56 typedef struct BindMount {
57 const char *path;
58 MountMode mode;
59 bool done;
60 bool ignore;
61 } BindMount;
62
63 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
64 char **i;
65
66 assert(p);
67
68 STRV_FOREACH(i, strv) {
69
70 (*p)->ignore = false;
71
72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
73 (*p)->ignore = true;
74 (*i)++;
75 }
76
77 if (!path_is_absolute(*i))
78 return -EINVAL;
79
80 (*p)->path = *i;
81 (*p)->mode = mode;
82 (*p)++;
83 }
84
85 return 0;
86 }
87
88 static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
90
91 if (path_equal(p->path, q->path)) {
92
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
95 return -1;
96
97 if (p->mode > q->mode)
98 return 1;
99
100 return 0;
101 }
102
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
105 return 1;
106
107 if (path_startswith(q->path, p->path))
108 return -1;
109
110 return 0;
111 }
112
113 static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
115
116 assert(m);
117 assert(n);
118
119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
120
121 /* The first one wins */
122 if (previous && path_equal(f->path, previous->path))
123 continue;
124
125 t->path = f->path;
126 t->mode = f->mode;
127
128 previous = t;
129
130 t++;
131 }
132
133 *n = t - m;
134 }
135
136 static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
138 "/dev/null\0"
139 "/dev/zero\0"
140 "/dev/full\0"
141 "/dev/random\0"
142 "/dev/urandom\0"
143 "/dev/tty\0";
144
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
147 _cleanup_umask_ mode_t u;
148 int r;
149
150 assert(m);
151
152 u = umask(0000);
153
154 if (!mkdtemp(temporary_mount))
155 return -errno;
156
157 dev = strappenda(temporary_mount, "/dev");
158 mkdir(dev, 0755);
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
160 r = -errno;
161 goto fail;
162 }
163
164 devpts = strappenda(temporary_mount, "/dev/pts");
165 mkdir(devpts, 0755);
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
167 r = -errno;
168 goto fail;
169 }
170
171 devptmx = strappenda(temporary_mount, "/dev/ptmx");
172 symlink("pts/ptmx", devptmx);
173
174 devshm = strappenda(temporary_mount, "/dev/shm");
175 mkdir(devshm, 01777);
176 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
177 if (r < 0) {
178 r = -errno;
179 goto fail;
180 }
181
182 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
183 mkdir(devmqueue, 0755);
184 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
185
186 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
187 mkdir(devkdbus, 0755);
188 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
189
190 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
191 mkdir(devhugepages, 0755);
192 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
193
194 devlog = strappenda(temporary_mount, "/dev/log");
195 symlink("/run/systemd/journal/dev-log", devlog);
196
197 NULSTR_FOREACH(d, devnodes) {
198 _cleanup_free_ char *dn = NULL;
199 struct stat st;
200
201 r = stat(d, &st);
202 if (r < 0) {
203
204 if (errno == ENOENT)
205 continue;
206
207 r = -errno;
208 goto fail;
209 }
210
211 if (!S_ISBLK(st.st_mode) &&
212 !S_ISCHR(st.st_mode)) {
213 r = -EINVAL;
214 goto fail;
215 }
216
217 if (st.st_rdev == 0)
218 continue;
219
220 dn = strappend(temporary_mount, d);
221 if (!dn) {
222 r = -ENOMEM;
223 goto fail;
224 }
225
226 r = mknod(dn, st.st_mode, st.st_rdev);
227 if (r < 0) {
228 r = -errno;
229 goto fail;
230 }
231 }
232
233 dev_setup(temporary_mount);
234
235 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
236 r = -errno;
237 goto fail;
238 }
239
240 rmdir(dev);
241 rmdir(temporary_mount);
242
243 return 0;
244
245 fail:
246 if (devpts)
247 umount(devpts);
248
249 if (devshm)
250 umount(devshm);
251
252 if (devkdbus)
253 umount(devkdbus);
254
255 if (devhugepages)
256 umount(devhugepages);
257
258 if (devmqueue)
259 umount(devmqueue);
260
261 if (dev) {
262 umount(dev);
263 rmdir(dev);
264 }
265
266 rmdir(temporary_mount);
267
268 return r;
269 }
270
271 static int apply_mount(
272 BindMount *m,
273 const char *tmp_dir,
274 const char *var_tmp_dir) {
275
276 const char *what;
277 int r;
278
279 assert(m);
280
281 switch (m->mode) {
282
283 case PRIVATE_DEV:
284 return mount_dev(m);
285
286 case INACCESSIBLE:
287 what = "/run/systemd/inaccessible";
288 break;
289
290 case READONLY:
291 case READWRITE:
292 what = m->path;
293 break;
294
295 case PRIVATE_TMP:
296 what = tmp_dir;
297 break;
298
299 case PRIVATE_VAR_TMP:
300 what = var_tmp_dir;
301 break;
302
303 default:
304 assert_not_reached("Unknown mode");
305 }
306
307 assert(what);
308
309 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
310 if (r >= 0)
311 log_debug("Successfully mounted %s to %s", what, m->path);
312 else if (m->ignore && errno == ENOENT)
313 r = 0;
314
315 return r;
316 }
317
318 static int make_read_only(BindMount *m) {
319 int r;
320
321 assert(m);
322
323 if (m->mode != INACCESSIBLE && m->mode != READONLY)
324 return 0;
325
326 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
327 if (r < 0 && !(m->ignore && errno == ENOENT))
328 return -errno;
329
330 return 0;
331 }
332
333 int setup_namespace(
334 char** read_write_dirs,
335 char** read_only_dirs,
336 char** inaccessible_dirs,
337 char* tmp_dir,
338 char* var_tmp_dir,
339 bool private_dev,
340 ProtectHome protect_home,
341 ProtectSystem protect_system,
342 unsigned mount_flags) {
343
344 BindMount *m, *mounts = NULL;
345 unsigned n;
346 int r = 0;
347
348 if (mount_flags == 0)
349 mount_flags = MS_SHARED;
350
351 if (unshare(CLONE_NEWNS) < 0)
352 return -errno;
353
354 n = !!tmp_dir + !!var_tmp_dir +
355 strv_length(read_write_dirs) +
356 strv_length(read_only_dirs) +
357 strv_length(inaccessible_dirs) +
358 private_dev +
359 (protect_home != PROTECT_HOME_NO ? 2 : 0) +
360 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
361 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
362
363 if (n > 0) {
364 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
365 r = append_mounts(&m, read_write_dirs, READWRITE);
366 if (r < 0)
367 return r;
368
369 r = append_mounts(&m, read_only_dirs, READONLY);
370 if (r < 0)
371 return r;
372
373 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
374 if (r < 0)
375 return r;
376
377 if (tmp_dir) {
378 m->path = "/tmp";
379 m->mode = PRIVATE_TMP;
380 m++;
381 }
382
383 if (var_tmp_dir) {
384 m->path = "/var/tmp";
385 m->mode = PRIVATE_VAR_TMP;
386 m++;
387 }
388
389 if (private_dev) {
390 m->path = "/dev";
391 m->mode = PRIVATE_DEV;
392 m++;
393 }
394
395 if (protect_home != PROTECT_HOME_NO) {
396 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
397 if (r < 0)
398 return r;
399 }
400
401 if (protect_system != PROTECT_SYSTEM_NO) {
402 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "/etc", "-/boot") : STRV_MAKE("/usr", "-/boot"), READONLY);
403 if (r < 0)
404 return r;
405 }
406
407 assert(mounts + n == m);
408
409 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
410 drop_duplicates(mounts, &n);
411 }
412
413 if (n > 0) {
414 /* Remount / as SLAVE so that nothing now mounted in the namespace
415 shows up in the parent */
416 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
417 return -errno;
418
419 for (m = mounts; m < mounts + n; ++m) {
420 r = apply_mount(m, tmp_dir, var_tmp_dir);
421 if (r < 0)
422 goto fail;
423 }
424
425 for (m = mounts; m < mounts + n; ++m) {
426 r = make_read_only(m);
427 if (r < 0)
428 goto fail;
429 }
430 }
431
432 /* Remount / as the desired mode. Not that this will not
433 * reestablish propagation from our side to the host, since
434 * what's disconnected is disconnected. */
435 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
436 r = -errno;
437 goto fail;
438 }
439
440 return 0;
441
442 fail:
443 if (n > 0) {
444 for (m = mounts; m < mounts + n; ++m)
445 if (m->done)
446 umount2(m->path, MNT_DETACH);
447 }
448
449 return r;
450 }
451
452 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
453 _cleanup_free_ char *x = NULL;
454 char bid[SD_ID128_STRING_MAX];
455 sd_id128_t boot_id;
456 int r;
457
458 assert(id);
459 assert(prefix);
460 assert(path);
461
462 /* We include the boot id in the directory so that after a
463 * reboot we can easily identify obsolete directories. */
464
465 r = sd_id128_get_boot(&boot_id);
466 if (r < 0)
467 return r;
468
469 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
470 if (!x)
471 return -ENOMEM;
472
473 RUN_WITH_UMASK(0077)
474 if (!mkdtemp(x))
475 return -errno;
476
477 RUN_WITH_UMASK(0000) {
478 char *y;
479
480 y = strappenda(x, "/tmp");
481
482 if (mkdir(y, 0777 | S_ISVTX) < 0)
483 return -errno;
484 }
485
486 *path = x;
487 x = NULL;
488
489 return 0;
490 }
491
/* Creates the pair of private temporary directories for "id", one below
 * /tmp and one below /var/tmp.
 *
 * On success 0 is returned, and *tmp_dir and *var_tmp_dir receive newly
 * allocated paths. On failure a negative error code is returned and the
 * output parameters are not touched; if only the second directory could not
 * be set up, the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second directory failed: undo the first one, inner directory first */
        {
                char *inner;

                inner = strappenda(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
521
522 int setup_netns(int netns_storage_socket[2]) {
523 _cleanup_close_ int netns = -1;
524 union {
525 struct cmsghdr cmsghdr;
526 uint8_t buf[CMSG_SPACE(sizeof(int))];
527 } control = {};
528 struct msghdr mh = {
529 .msg_control = &control,
530 .msg_controllen = sizeof(control),
531 };
532 struct cmsghdr *cmsg;
533 int r;
534
535 assert(netns_storage_socket);
536 assert(netns_storage_socket[0] >= 0);
537 assert(netns_storage_socket[1] >= 0);
538
539 /* We use the passed socketpair as a storage buffer for our
540 * namespace reference fd. Whatever process runs this first
541 * shall create a new namespace, all others should just join
542 * it. To serialize that we use a file lock on the socket
543 * pair.
544 *
545 * It's a bit crazy, but hey, works great! */
546
547 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
548 return -errno;
549
550 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
551 if (errno != EAGAIN) {
552 r = -errno;
553 goto fail;
554 }
555
556 /* Nothing stored yet, so let's create a new namespace */
557
558 if (unshare(CLONE_NEWNET) < 0) {
559 r = -errno;
560 goto fail;
561 }
562
563 loopback_setup();
564
565 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
566 if (netns < 0) {
567 r = -errno;
568 goto fail;
569 }
570
571 r = 1;
572 } else {
573 /* Yay, found something, so let's join the namespace */
574
575 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
576 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
577 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
578 netns = *(int*) CMSG_DATA(cmsg);
579 }
580 }
581
582 if (setns(netns, CLONE_NEWNET) < 0) {
583 r = -errno;
584 goto fail;
585 }
586
587 r = 0;
588 }
589
590 cmsg = CMSG_FIRSTHDR(&mh);
591 cmsg->cmsg_level = SOL_SOCKET;
592 cmsg->cmsg_type = SCM_RIGHTS;
593 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
594 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
595 mh.msg_controllen = cmsg->cmsg_len;
596
597 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
598 r = -errno;
599 goto fail;
600 }
601
602 fail:
603 lockf(netns_storage_socket[0], F_ULOCK, 0);
604
605 return r;
606 }
607
608 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
609 [PROTECT_HOME_NO] = "no",
610 [PROTECT_HOME_YES] = "yes",
611 [PROTECT_HOME_READ_ONLY] = "read-only",
612 };
613
614 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
615
616 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
617 [PROTECT_SYSTEM_NO] = "no",
618 [PROTECT_SYSTEM_YES] = "yes",
619 [PROTECT_SYSTEM_FULL] = "full",
620 };
621
622 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);