]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
drop_duplicates: copy full BindMount struct
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45 #include "label.h"
46
/* How a path is treated inside the private mount namespace.
 *
 * This is ordered by priority! When the same path is listed more
 * than once, the mount table is sorted by ascending mode value and
 * only the first entry per path is kept, so a smaller value wins. */
typedef enum MountMode {
        INACCESSIBLE = 0,
        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        READWRITE
} MountMode;
56
57 typedef struct BindMount {
58 const char *path;
59 MountMode mode;
60 bool done;
61 bool ignore;
62 } BindMount;
63
64 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
65 char **i;
66
67 assert(p);
68
69 STRV_FOREACH(i, strv) {
70
71 (*p)->ignore = false;
72 (*p)->done = false;
73
74 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
75 (*p)->ignore = true;
76 (*i)++;
77 }
78
79 if (!path_is_absolute(*i))
80 return -EINVAL;
81
82 (*p)->path = *i;
83 (*p)->mode = mode;
84 (*p)++;
85 }
86
87 return 0;
88 }
89
90 static int mount_path_compare(const void *a, const void *b) {
91 const BindMount *p = a, *q = b;
92
93 if (path_equal(p->path, q->path)) {
94
95 /* If the paths are equal, check the mode */
96 if (p->mode < q->mode)
97 return -1;
98
99 if (p->mode > q->mode)
100 return 1;
101
102 return 0;
103 }
104
105 /* If the paths are not equal, then order prefixes first */
106 if (path_startswith(p->path, q->path))
107 return 1;
108
109 if (path_startswith(q->path, p->path))
110 return -1;
111
112 return 0;
113 }
114
115 static void drop_duplicates(BindMount *m, unsigned *n) {
116 BindMount *f, *t, *previous;
117
118 assert(m);
119 assert(n);
120
121 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
122
123 /* The first one wins */
124 if (previous && path_equal(f->path, previous->path))
125 continue;
126
127 *t = *f;
128
129 previous = t;
130
131 t++;
132 }
133
134 *n = t - m;
135 }
136
137 static int mount_dev(BindMount *m) {
138 static const char devnodes[] =
139 "/dev/null\0"
140 "/dev/zero\0"
141 "/dev/full\0"
142 "/dev/random\0"
143 "/dev/urandom\0"
144 "/dev/tty\0";
145
146 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
147 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
148 _cleanup_umask_ mode_t u;
149 int r;
150
151 assert(m);
152
153 u = umask(0000);
154
155 if (!mkdtemp(temporary_mount))
156 return -errno;
157
158 dev = strappenda(temporary_mount, "/dev");
159 mkdir(dev, 0755);
160 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
161 r = -errno;
162 goto fail;
163 }
164
165 devpts = strappenda(temporary_mount, "/dev/pts");
166 mkdir(devpts, 0755);
167 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
168 r = -errno;
169 goto fail;
170 }
171
172 devptmx = strappenda(temporary_mount, "/dev/ptmx");
173 symlink("pts/ptmx", devptmx);
174
175 devshm = strappenda(temporary_mount, "/dev/shm");
176 mkdir(devshm, 01777);
177 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
178 if (r < 0) {
179 r = -errno;
180 goto fail;
181 }
182
183 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
184 mkdir(devmqueue, 0755);
185 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
186
187 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
188 mkdir(devkdbus, 0755);
189 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
190
191 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
192 mkdir(devhugepages, 0755);
193 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
194
195 devlog = strappenda(temporary_mount, "/dev/log");
196 symlink("/run/systemd/journal/dev-log", devlog);
197
198 NULSTR_FOREACH(d, devnodes) {
199 _cleanup_free_ char *dn = NULL;
200 struct stat st;
201
202 r = stat(d, &st);
203 if (r < 0) {
204
205 if (errno == ENOENT)
206 continue;
207
208 r = -errno;
209 goto fail;
210 }
211
212 if (!S_ISBLK(st.st_mode) &&
213 !S_ISCHR(st.st_mode)) {
214 r = -EINVAL;
215 goto fail;
216 }
217
218 if (st.st_rdev == 0)
219 continue;
220
221 dn = strappend(temporary_mount, d);
222 if (!dn) {
223 r = -ENOMEM;
224 goto fail;
225 }
226
227 label_context_set(d, st.st_mode);
228 r = mknod(dn, st.st_mode, st.st_rdev);
229 label_context_clear();
230
231 if (r < 0) {
232 r = -errno;
233 goto fail;
234 }
235 }
236
237 dev_setup(temporary_mount);
238
239 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
240 r = -errno;
241 goto fail;
242 }
243
244 rmdir(dev);
245 rmdir(temporary_mount);
246
247 return 0;
248
249 fail:
250 if (devpts)
251 umount(devpts);
252
253 if (devshm)
254 umount(devshm);
255
256 if (devkdbus)
257 umount(devkdbus);
258
259 if (devhugepages)
260 umount(devhugepages);
261
262 if (devmqueue)
263 umount(devmqueue);
264
265 if (dev) {
266 umount(dev);
267 rmdir(dev);
268 }
269
270 rmdir(temporary_mount);
271
272 return r;
273 }
274
275 static int apply_mount(
276 BindMount *m,
277 const char *tmp_dir,
278 const char *var_tmp_dir) {
279
280 const char *what;
281 int r;
282
283 assert(m);
284
285 switch (m->mode) {
286
287 case INACCESSIBLE:
288
289 /* First, get rid of everything that is below if there
290 * is anything... Then, overmount it with an
291 * inaccessible directory. */
292 umount_recursive(m->path, 0);
293
294 what = "/run/systemd/inaccessible";
295 break;
296
297 case READONLY:
298 case READWRITE:
299 /* Nothing to mount here, we just later toggle the
300 * MS_RDONLY bit for the mount point */
301 return 0;
302
303 case PRIVATE_TMP:
304 what = tmp_dir;
305 break;
306
307 case PRIVATE_VAR_TMP:
308 what = var_tmp_dir;
309 break;
310
311 case PRIVATE_DEV:
312 return mount_dev(m);
313
314 default:
315 assert_not_reached("Unknown mode");
316 }
317
318 assert(what);
319
320 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
321 if (r >= 0)
322 log_debug("Successfully mounted %s to %s", what, m->path);
323 else if (m->ignore && errno == ENOENT)
324 return 0;
325
326 return r;
327 }
328
329 static int make_read_only(BindMount *m) {
330 int r;
331
332 assert(m);
333
334 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
335 r = bind_remount_recursive(m->path, true);
336 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
337 r = bind_remount_recursive(m->path, false);
338 else
339 r = 0;
340
341 if (m->ignore && r == -ENOENT)
342 return 0;
343
344 return r;
345 }
346
347 int setup_namespace(
348 char** read_write_dirs,
349 char** read_only_dirs,
350 char** inaccessible_dirs,
351 char* tmp_dir,
352 char* var_tmp_dir,
353 bool private_dev,
354 ProtectHome protect_home,
355 ProtectSystem protect_system,
356 unsigned mount_flags) {
357
358 BindMount *m, *mounts = NULL;
359 unsigned n;
360 int r = 0;
361
362 if (mount_flags == 0)
363 mount_flags = MS_SHARED;
364
365 if (unshare(CLONE_NEWNS) < 0)
366 return -errno;
367
368 n = !!tmp_dir + !!var_tmp_dir +
369 strv_length(read_write_dirs) +
370 strv_length(read_only_dirs) +
371 strv_length(inaccessible_dirs) +
372 private_dev +
373 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
374 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
375 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
376
377 if (n > 0) {
378 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
379 r = append_mounts(&m, read_write_dirs, READWRITE);
380 if (r < 0)
381 return r;
382
383 r = append_mounts(&m, read_only_dirs, READONLY);
384 if (r < 0)
385 return r;
386
387 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
388 if (r < 0)
389 return r;
390
391 if (tmp_dir) {
392 m->path = "/tmp";
393 m->mode = PRIVATE_TMP;
394 m++;
395 }
396
397 if (var_tmp_dir) {
398 m->path = "/var/tmp";
399 m->mode = PRIVATE_VAR_TMP;
400 m++;
401 }
402
403 if (private_dev) {
404 m->path = "/dev";
405 m->mode = PRIVATE_DEV;
406 m++;
407 }
408
409 if (protect_home != PROTECT_HOME_NO) {
410 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
411 if (r < 0)
412 return r;
413 }
414
415 if (protect_system != PROTECT_SYSTEM_NO) {
416 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
417 if (r < 0)
418 return r;
419 }
420
421 assert(mounts + n == m);
422
423 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
424 drop_duplicates(mounts, &n);
425 }
426
427 if (n > 0) {
428 /* Remount / as SLAVE so that nothing now mounted in the namespace
429 shows up in the parent */
430 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
431 return -errno;
432
433 for (m = mounts; m < mounts + n; ++m) {
434 r = apply_mount(m, tmp_dir, var_tmp_dir);
435 if (r < 0)
436 goto fail;
437 }
438
439 for (m = mounts; m < mounts + n; ++m) {
440 r = make_read_only(m);
441 if (r < 0)
442 goto fail;
443 }
444 }
445
446 /* Remount / as the desired mode. Not that this will not
447 * reestablish propagation from our side to the host, since
448 * what's disconnected is disconnected. */
449 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
450 r = -errno;
451 goto fail;
452 }
453
454 return 0;
455
456 fail:
457 if (n > 0) {
458 for (m = mounts; m < mounts + n; ++m)
459 if (m->done)
460 umount2(m->path, MNT_DETACH);
461 }
462
463 return r;
464 }
465
466 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
467 _cleanup_free_ char *x = NULL;
468 char bid[SD_ID128_STRING_MAX];
469 sd_id128_t boot_id;
470 int r;
471
472 assert(id);
473 assert(prefix);
474 assert(path);
475
476 /* We include the boot id in the directory so that after a
477 * reboot we can easily identify obsolete directories. */
478
479 r = sd_id128_get_boot(&boot_id);
480 if (r < 0)
481 return r;
482
483 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
484 if (!x)
485 return -ENOMEM;
486
487 RUN_WITH_UMASK(0077)
488 if (!mkdtemp(x))
489 return -errno;
490
491 RUN_WITH_UMASK(0000) {
492 char *y;
493
494 y = strappenda(x, "/tmp");
495
496 if (mkdir(y, 0777 | S_ISVTX) < 0)
497 return -errno;
498 }
499
500 *path = x;
501 x = NULL;
502
503 return 0;
504 }
505
/* Creates the pair of private temporary directories for id, one
 * below /tmp and one below /var/tmp.
 *
 * tmp_dir, var_tmp_dir - on success, receive the paths produced by
 *                        setup_one_tmp_dir(); they are only written
 *                        when both directories could be created
 *
 * Returns 0 on success, a negative errno-style value on failure. If
 * the second directory fails, the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the /tmp one: inner "tmp" directory first, then
         * the directory itself, then the path string */
        inner = strappenda(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
535
536 int setup_netns(int netns_storage_socket[2]) {
537 _cleanup_close_ int netns = -1;
538 union {
539 struct cmsghdr cmsghdr;
540 uint8_t buf[CMSG_SPACE(sizeof(int))];
541 } control = {};
542 struct msghdr mh = {
543 .msg_control = &control,
544 .msg_controllen = sizeof(control),
545 };
546 struct cmsghdr *cmsg;
547 int r;
548
549 assert(netns_storage_socket);
550 assert(netns_storage_socket[0] >= 0);
551 assert(netns_storage_socket[1] >= 0);
552
553 /* We use the passed socketpair as a storage buffer for our
554 * namespace reference fd. Whatever process runs this first
555 * shall create a new namespace, all others should just join
556 * it. To serialize that we use a file lock on the socket
557 * pair.
558 *
559 * It's a bit crazy, but hey, works great! */
560
561 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
562 return -errno;
563
564 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
565 if (errno != EAGAIN) {
566 r = -errno;
567 goto fail;
568 }
569
570 /* Nothing stored yet, so let's create a new namespace */
571
572 if (unshare(CLONE_NEWNET) < 0) {
573 r = -errno;
574 goto fail;
575 }
576
577 loopback_setup();
578
579 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
580 if (netns < 0) {
581 r = -errno;
582 goto fail;
583 }
584
585 r = 1;
586 } else {
587 /* Yay, found something, so let's join the namespace */
588
589 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
590 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
591 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
592 netns = *(int*) CMSG_DATA(cmsg);
593 }
594 }
595
596 if (setns(netns, CLONE_NEWNET) < 0) {
597 r = -errno;
598 goto fail;
599 }
600
601 r = 0;
602 }
603
604 cmsg = CMSG_FIRSTHDR(&mh);
605 cmsg->cmsg_level = SOL_SOCKET;
606 cmsg->cmsg_type = SCM_RIGHTS;
607 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
608 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
609 mh.msg_controllen = cmsg->cmsg_len;
610
611 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
612 r = -errno;
613 goto fail;
614 }
615
616 fail:
617 lockf(netns_storage_socket[0], F_ULOCK, 0);
618
619 return r;
620 }
621
622 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
623 [PROTECT_HOME_NO] = "no",
624 [PROTECT_HOME_YES] = "yes",
625 [PROTECT_HOME_READ_ONLY] = "read-only",
626 };
627
628 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
629
630 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
631 [PROTECT_SYSTEM_NO] = "no",
632 [PROTECT_SYSTEM_YES] = "yes",
633 [PROTECT_SYSTEM_FULL] = "full",
634 };
635
636 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);