]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
unit: turn off mount propagation for udevd
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45
/* How a path is treated inside the mount namespace.
 *
 * The numeric order matters: when the same path is requested with
 * several modes, mount_path_compare() sorts the smaller value first
 * and drop_duplicates() keeps only that first entry, so an earlier
 * enumerator takes priority over a later one. */
typedef enum MountMode {
        INACCESSIBLE,    /* covered by a bind mount of /run/systemd/inaccessible, then remounted read-only */
        READONLY,        /* bind-mounted onto itself, then remounted read-only */
        PRIVATE_TMP,     /* covered by a bind mount of the caller-supplied tmp directory */
        PRIVATE_VAR_TMP, /* covered by a bind mount of the caller-supplied var-tmp directory */
        PRIVATE_DEV,     /* replaced by the tmpfs assembled in mount_dev() */
        READWRITE        /* bind-mounted onto itself and left writable */
} MountMode;
55
56 typedef struct BindMount {
57 const char *path;
58 MountMode mode;
59 bool done;
60 bool ignore;
61 } BindMount;
62
63 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
64 char **i;
65
66 assert(p);
67
68 STRV_FOREACH(i, strv) {
69
70 (*p)->ignore = false;
71
72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
73 (*p)->ignore = true;
74 (*i)++;
75 }
76
77 if (!path_is_absolute(*i))
78 return -EINVAL;
79
80 (*p)->path = *i;
81 (*p)->mode = mode;
82 (*p)++;
83 }
84
85 return 0;
86 }
87
88 static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
90
91 if (path_equal(p->path, q->path)) {
92
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
95 return -1;
96
97 if (p->mode > q->mode)
98 return 1;
99
100 return 0;
101 }
102
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
105 return 1;
106
107 if (path_startswith(q->path, p->path))
108 return -1;
109
110 return 0;
111 }
112
113 static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
115
116 assert(m);
117 assert(n);
118
119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
120
121 /* The first one wins */
122 if (previous && path_equal(f->path, previous->path))
123 continue;
124
125 t->path = f->path;
126 t->mode = f->mode;
127
128 previous = t;
129
130 t++;
131 }
132
133 *n = t - m;
134 }
135
136 static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
138 "/dev/null\0"
139 "/dev/zero\0"
140 "/dev/full\0"
141 "/dev/random\0"
142 "/dev/urandom\0"
143 "/dev/tty\0";
144
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL;
147 _cleanup_umask_ mode_t u;
148 int r;
149
150 assert(m);
151
152 u = umask(0000);
153
154 if (!mkdtemp(temporary_mount))
155 return -errno;
156
157 dev = strappenda(temporary_mount, "/dev");
158 mkdir(dev, 0755);
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
160 r = -errno;
161 goto fail;
162 }
163
164 devpts = strappenda(temporary_mount, "/dev/pts");
165 mkdir(devpts, 0755);
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
167 r = -errno;
168 goto fail;
169 }
170
171 devshm = strappenda(temporary_mount, "/dev/shm");
172 mkdir(devshm, 01777);
173 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
174 if (r < 0) {
175 r = -errno;
176 goto fail;
177 }
178
179 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
180 mkdir(devmqueue, 0755);
181 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
182
183 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
184 mkdir(devkdbus, 0755);
185 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
186
187 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
188 mkdir(devhugepages, 0755);
189 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
190
191 NULSTR_FOREACH(d, devnodes) {
192 _cleanup_free_ char *dn = NULL;
193 struct stat st;
194
195 r = stat(d, &st);
196 if (r < 0) {
197
198 if (errno == ENOENT)
199 continue;
200
201 r = -errno;
202 goto fail;
203 }
204
205 if (!S_ISBLK(st.st_mode) &&
206 !S_ISCHR(st.st_mode)) {
207 r = -EINVAL;
208 goto fail;
209 }
210
211 if (st.st_rdev == 0)
212 continue;
213
214 dn = strappend(temporary_mount, d);
215 if (!dn) {
216 r = -ENOMEM;
217 goto fail;
218 }
219
220 r = mknod(dn, st.st_mode, st.st_rdev);
221 if (r < 0) {
222 r = -errno;
223 goto fail;
224 }
225 }
226
227 dev_setup(temporary_mount);
228
229 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
230 r = -errno;
231 goto fail;
232 }
233
234 rmdir(dev);
235 rmdir(temporary_mount);
236
237 return 0;
238
239 fail:
240 if (devpts)
241 umount(devpts);
242
243 if (devshm)
244 umount(devshm);
245
246 if (devkdbus)
247 umount(devkdbus);
248
249 if (devhugepages)
250 umount(devhugepages);
251
252 if (devmqueue)
253 umount(devmqueue);
254
255 if (dev) {
256 umount(dev);
257 rmdir(dev);
258 }
259
260 rmdir(temporary_mount);
261
262 return r;
263 }
264
265 static int apply_mount(
266 BindMount *m,
267 const char *tmp_dir,
268 const char *var_tmp_dir) {
269
270 const char *what;
271 int r;
272
273 assert(m);
274
275 switch (m->mode) {
276
277 case PRIVATE_DEV:
278 return mount_dev(m);
279
280 case INACCESSIBLE:
281 what = "/run/systemd/inaccessible";
282 break;
283
284 case READONLY:
285 case READWRITE:
286 what = m->path;
287 break;
288
289 case PRIVATE_TMP:
290 what = tmp_dir;
291 break;
292
293 case PRIVATE_VAR_TMP:
294 what = var_tmp_dir;
295 break;
296
297 default:
298 assert_not_reached("Unknown mode");
299 }
300
301 assert(what);
302
303 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
304 if (r >= 0)
305 log_debug("Successfully mounted %s to %s", what, m->path);
306 else if (m->ignore && errno == ENOENT)
307 r = 0;
308
309 return r;
310 }
311
312 static int make_read_only(BindMount *m) {
313 int r;
314
315 assert(m);
316
317 if (m->mode != INACCESSIBLE && m->mode != READONLY)
318 return 0;
319
320 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
321 if (r < 0 && !(m->ignore && errno == ENOENT))
322 return -errno;
323
324 return 0;
325 }
326
327 int setup_namespace(
328 char** read_write_dirs,
329 char** read_only_dirs,
330 char** inaccessible_dirs,
331 char* tmp_dir,
332 char* var_tmp_dir,
333 bool private_dev,
334 unsigned mount_flags) {
335
336 BindMount *m, *mounts = NULL;
337 unsigned n;
338 int r = 0;
339
340 if (mount_flags == 0)
341 mount_flags = MS_SHARED;
342
343 if (unshare(CLONE_NEWNS) < 0)
344 return -errno;
345
346 n = !!tmp_dir + !!var_tmp_dir +
347 strv_length(read_write_dirs) +
348 strv_length(read_only_dirs) +
349 strv_length(inaccessible_dirs) +
350 private_dev;
351
352 if (n > 0) {
353 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
354 r = append_mounts(&m, read_write_dirs, READWRITE);
355 if (r < 0)
356 return r;
357
358 r = append_mounts(&m, read_only_dirs, READONLY);
359 if (r < 0)
360 return r;
361
362 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
363 if (r < 0)
364 return r;
365
366 if (tmp_dir) {
367 m->path = "/tmp";
368 m->mode = PRIVATE_TMP;
369 m++;
370 }
371
372 if (var_tmp_dir) {
373 m->path = "/var/tmp";
374 m->mode = PRIVATE_VAR_TMP;
375 m++;
376 }
377
378 if (private_dev) {
379 m->path = "/dev";
380 m->mode = PRIVATE_DEV;
381 m++;
382 }
383
384 assert(mounts + n == m);
385
386 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
387 drop_duplicates(mounts, &n);
388 }
389
390 if (n > 0) {
391 /* Remount / as SLAVE so that nothing now mounted in the namespace
392 shows up in the parent */
393 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
394 return -errno;
395
396 for (m = mounts; m < mounts + n; ++m) {
397 r = apply_mount(m, tmp_dir, var_tmp_dir);
398 if (r < 0)
399 goto fail;
400 }
401
402 for (m = mounts; m < mounts + n; ++m) {
403 r = make_read_only(m);
404 if (r < 0)
405 goto fail;
406 }
407 }
408
409 /* Remount / as the desired mode. Not that this will not
410 * reestablish propagation from our side to the host, since
411 * what's disconnected is disconnected. */
412 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
413 r = -errno;
414 goto fail;
415 }
416
417 return 0;
418
419 fail:
420 if (n > 0) {
421 for (m = mounts; m < mounts + n; ++m)
422 if (m->done)
423 umount2(m->path, MNT_DETACH);
424 }
425
426 return r;
427 }
428
429 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
430 _cleanup_free_ char *x = NULL;
431 char bid[SD_ID128_STRING_MAX];
432 sd_id128_t boot_id;
433 int r;
434
435 assert(id);
436 assert(prefix);
437 assert(path);
438
439 /* We include the boot id in the directory so that after a
440 * reboot we can easily identify obsolete directories. */
441
442 r = sd_id128_get_boot(&boot_id);
443 if (r < 0)
444 return r;
445
446 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
447 if (!x)
448 return -ENOMEM;
449
450 RUN_WITH_UMASK(0077)
451 if (!mkdtemp(x))
452 return -errno;
453
454 RUN_WITH_UMASK(0000) {
455 char *y;
456
457 y = strappenda(x, "/tmp");
458
459 if (mkdir(y, 0777 | S_ISVTX) < 0)
460 return -errno;
461 }
462
463 *path = x;
464 x = NULL;
465
466 return 0;
467 }
468
/* Creates the private directories for a unit below /tmp and /var/tmp
 * via setup_one_tmp_dir().
 *
 * On success stores the two newly allocated paths in '*tmp_dir' and
 * '*var_tmp_dir' and returns 0. If the second directory cannot be
 * created, the first one is removed again and the error is returned;
 * the output parameters are not written in any failure case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the /tmp half: inner directory first, then the
         * outer one, then the string holding its name. */
        {
                char *inner;

                inner = strappenda(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);

                free(private_tmp);
        }

        return r;
}
498
499 int setup_netns(int netns_storage_socket[2]) {
500 _cleanup_close_ int netns = -1;
501 union {
502 struct cmsghdr cmsghdr;
503 uint8_t buf[CMSG_SPACE(sizeof(int))];
504 } control = {};
505 struct msghdr mh = {
506 .msg_control = &control,
507 .msg_controllen = sizeof(control),
508 };
509 struct cmsghdr *cmsg;
510 int r;
511
512 assert(netns_storage_socket);
513 assert(netns_storage_socket[0] >= 0);
514 assert(netns_storage_socket[1] >= 0);
515
516 /* We use the passed socketpair as a storage buffer for our
517 * namespace reference fd. Whatever process runs this first
518 * shall create a new namespace, all others should just join
519 * it. To serialize that we use a file lock on the socket
520 * pair.
521 *
522 * It's a bit crazy, but hey, works great! */
523
524 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
525 return -errno;
526
527 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
528 if (errno != EAGAIN) {
529 r = -errno;
530 goto fail;
531 }
532
533 /* Nothing stored yet, so let's create a new namespace */
534
535 if (unshare(CLONE_NEWNET) < 0) {
536 r = -errno;
537 goto fail;
538 }
539
540 loopback_setup();
541
542 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
543 if (netns < 0) {
544 r = -errno;
545 goto fail;
546 }
547
548 r = 1;
549 } else {
550 /* Yay, found something, so let's join the namespace */
551
552 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
553 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
554 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
555 netns = *(int*) CMSG_DATA(cmsg);
556 }
557 }
558
559 if (setns(netns, CLONE_NEWNET) < 0) {
560 r = -errno;
561 goto fail;
562 }
563
564 r = 0;
565 }
566
567 cmsg = CMSG_FIRSTHDR(&mh);
568 cmsg->cmsg_level = SOL_SOCKET;
569 cmsg->cmsg_type = SCM_RIGHTS;
570 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
571 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
572 mh.msg_controllen = cmsg->cmsg_len;
573
574 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
575 r = -errno;
576 goto fail;
577 }
578
579 fail:
580 lockf(netns_storage_socket[0], F_ULOCK, 0);
581
582 return r;
583 }