]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/namespace.c
initctl: move /dev/initctl fifo into /run, replace it by symlink
[thirdparty/systemd.git] / src / core / namespace.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
15ae422b
LP
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
15ae422b
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
15ae422b 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
15ae422b
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <errno.h>
23#include <sys/mount.h>
24#include <string.h>
25#include <stdio.h>
26#include <unistd.h>
27#include <sys/stat.h>
28#include <sys/types.h>
29#include <sched.h>
30#include <sys/syscall.h>
31#include <limits.h>
25e870b5 32#include <linux/fs.h>
613b411c 33#include <sys/file.h>
15ae422b
LP
34
35#include "strv.h"
36#include "util.h"
9eb977db 37#include "path-util.h"
15ae422b
LP
38#include "namespace.h"
39#include "missing.h"
c17ec25e 40#include "execute.h"
613b411c 41#include "loopback-setup.h"
7f112f50
LP
42#include "mkdir.h"
43#include "dev-setup.h"
44#include "def.h"
15ae422b 45
/* How a path is treated when the mount namespace is assembled.
 *
 * The numeric order matters: mount_path_compare() sorts entries for the
 * same path by this value and drop_duplicates() keeps the first one, so
 * the enumerator listed earliest wins. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
        READWRITE
} MountMode;
15ae422b 55
c17ec25e 56typedef struct BindMount {
15ae422b 57 const char *path;
c17ec25e 58 MountMode mode;
ac0930c8 59 bool done;
ea92ae33 60 bool ignore;
c17ec25e 61} BindMount;
15ae422b 62
c17ec25e 63static int append_mounts(BindMount **p, char **strv, MountMode mode) {
15ae422b
LP
64 char **i;
65
613b411c
LP
66 assert(p);
67
15ae422b
LP
68 STRV_FOREACH(i, strv) {
69
ea92ae33
MW
70 (*p)->ignore = false;
71
94828d2d 72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
ea92ae33
MW
73 (*p)->ignore = true;
74 (*i)++;
75 }
76
15ae422b
LP
77 if (!path_is_absolute(*i))
78 return -EINVAL;
79
80 (*p)->path = *i;
81 (*p)->mode = mode;
82 (*p)++;
83 }
84
85 return 0;
86}
87
c17ec25e
MS
88static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
15ae422b
LP
90
91 if (path_equal(p->path, q->path)) {
92
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
95 return -1;
96
97 if (p->mode > q->mode)
98 return 1;
99
100 return 0;
101 }
102
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
105 return 1;
106
107 if (path_startswith(q->path, p->path))
108 return -1;
109
110 return 0;
111}
112
c17ec25e
MS
113static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
15ae422b 115
c17ec25e 116 assert(m);
15ae422b 117 assert(n);
15ae422b 118
c17ec25e 119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
15ae422b 120
ac0930c8 121 /* The first one wins */
15ae422b
LP
122 if (previous && path_equal(f->path, previous->path))
123 continue;
124
125 t->path = f->path;
126 t->mode = f->mode;
127
15ae422b
LP
128 previous = t;
129
130 t++;
131 }
132
c17ec25e 133 *n = t - m;
15ae422b
LP
134}
135
7f112f50
LP
136static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
138 "/dev/null\0"
139 "/dev/zero\0"
140 "/dev/full\0"
141 "/dev/random\0"
142 "/dev/urandom\0"
143 "/dev/tty\0";
144
2b85f4e1
LP
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL;
7f112f50
LP
147 _cleanup_umask_ mode_t u;
148 int r;
149
150 assert(m);
151
152 u = umask(0000);
153
2b85f4e1
LP
154 if (!mkdtemp(temporary_mount))
155 return -errno;
156
157 dev = strappenda(temporary_mount, "/dev");
158 mkdir(dev, 0755);
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
160 r = -errno;
161 goto fail;
162 }
163
164 devpts = strappenda(temporary_mount, "/dev/pts");
165 mkdir(devpts, 0755);
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
167 r = -errno;
168 goto fail;
169 }
170
171 devshm = strappenda(temporary_mount, "/dev/shm");
172 mkdir(devshm, 01777);
173 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
174 if (r < 0) {
175 r = -errno;
176 goto fail;
177 }
178
179 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
180 mkdir(devmqueue, 0755);
181 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
182
183 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
184 mkdir(devkdbus, 0755);
185 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
186
187 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
188 mkdir(devhugepages, 0755);
189 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
190
7f112f50 191 NULSTR_FOREACH(d, devnodes) {
2b85f4e1
LP
192 _cleanup_free_ char *dn = NULL;
193 struct stat st;
194
195 r = stat(d, &st);
7f112f50 196 if (r < 0) {
2b85f4e1
LP
197
198 if (errno == ENOENT)
199 continue;
200
201 r = -errno;
202 goto fail;
7f112f50
LP
203 }
204
2b85f4e1
LP
205 if (!S_ISBLK(st.st_mode) &&
206 !S_ISCHR(st.st_mode)) {
207 r = -EINVAL;
208 goto fail;
209 }
210
211 if (st.st_rdev == 0)
212 continue;
213
214 dn = strappend(temporary_mount, d);
215 if (!dn) {
216 r = -ENOMEM;
217 goto fail;
218 }
219
220 r = mknod(dn, st.st_mode, st.st_rdev);
221 if (r < 0) {
222 r = -errno;
223 goto fail;
224 }
7f112f50
LP
225 }
226
2b85f4e1 227 dev_setup(temporary_mount);
7f112f50 228
2b85f4e1
LP
229 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
230 r = -errno;
231 goto fail;
232 }
7f112f50 233
2b85f4e1
LP
234 rmdir(dev);
235 rmdir(temporary_mount);
7f112f50 236
2b85f4e1 237 return 0;
7f112f50 238
2b85f4e1
LP
239fail:
240 if (devpts)
241 umount(devpts);
7f112f50 242
2b85f4e1
LP
243 if (devshm)
244 umount(devshm);
7f112f50 245
2b85f4e1
LP
246 if (devkdbus)
247 umount(devkdbus);
7f112f50 248
2b85f4e1
LP
249 if (devhugepages)
250 umount(devhugepages);
7f112f50 251
2b85f4e1
LP
252 if (devmqueue)
253 umount(devmqueue);
7f112f50 254
2b85f4e1
LP
255 if (dev) {
256 umount(dev);
257 rmdir(dev);
7f112f50
LP
258 }
259
2b85f4e1 260 rmdir(temporary_mount);
7f112f50 261
2b85f4e1 262 return r;
7f112f50
LP
263}
264
ac0930c8 265static int apply_mount(
c17ec25e 266 BindMount *m,
ac0930c8 267 const char *tmp_dir,
c17ec25e 268 const char *var_tmp_dir) {
ac0930c8 269
15ae422b 270 const char *what;
15ae422b 271 int r;
15ae422b 272
c17ec25e 273 assert(m);
15ae422b 274
c17ec25e 275 switch (m->mode) {
15ae422b 276
7f112f50
LP
277 case PRIVATE_DEV:
278 return mount_dev(m);
279
15ae422b 280 case INACCESSIBLE:
c17ec25e 281 what = "/run/systemd/inaccessible";
15ae422b
LP
282 break;
283
284 case READONLY:
15ae422b 285 case READWRITE:
c17ec25e 286 what = m->path;
15ae422b
LP
287 break;
288
ac0930c8
LP
289 case PRIVATE_TMP:
290 what = tmp_dir;
291 break;
292
293 case PRIVATE_VAR_TMP:
294 what = var_tmp_dir;
15ae422b 295 break;
e364ad06
LP
296
297 default:
298 assert_not_reached("Unknown mode");
15ae422b
LP
299 }
300
ac0930c8 301 assert(what);
15ae422b 302
c17ec25e 303 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
ac0930c8 304 if (r >= 0)
c17ec25e 305 log_debug("Successfully mounted %s to %s", what, m->path);
ea92ae33
MW
306 else if (m->ignore && errno == ENOENT)
307 r = 0;
15ae422b 308
ac0930c8
LP
309 return r;
310}
15ae422b 311
c17ec25e 312static int make_read_only(BindMount *m) {
ac0930c8 313 int r;
15ae422b 314
c17ec25e 315 assert(m);
ac0930c8 316
c17ec25e 317 if (m->mode != INACCESSIBLE && m->mode != READONLY)
ac0930c8
LP
318 return 0;
319
c17ec25e 320 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
ea92ae33 321 if (r < 0 && !(m->ignore && errno == ENOENT))
ac0930c8
LP
322 return -errno;
323
324 return 0;
15ae422b
LP
325}
326
613b411c
LP
327int setup_namespace(
328 char** read_write_dirs,
329 char** read_only_dirs,
330 char** inaccessible_dirs,
331 char* tmp_dir,
332 char* var_tmp_dir,
7f112f50 333 bool private_dev,
417116f2
LP
334 ProtectedHome protected_home,
335 bool read_only_system,
613b411c 336 unsigned mount_flags) {
15ae422b 337
7ff7394d 338 BindMount *m, *mounts = NULL;
613b411c 339 unsigned n;
c17ec25e 340 int r = 0;
15ae422b 341
613b411c 342 if (mount_flags == 0)
c17ec25e 343 mount_flags = MS_SHARED;
ac0930c8 344
d5a3f0ea
ZJS
345 if (unshare(CLONE_NEWNS) < 0)
346 return -errno;
15ae422b 347
613b411c
LP
348 n = !!tmp_dir + !!var_tmp_dir +
349 strv_length(read_write_dirs) +
350 strv_length(read_only_dirs) +
7f112f50 351 strv_length(inaccessible_dirs) +
417116f2
LP
352 private_dev +
353 (protected_home != PROTECTED_HOME_NO ? 2 : 0) +
354 (read_only_system ? 2 : 0);
613b411c
LP
355
356 if (n > 0) {
7ff7394d 357 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
613b411c
LP
358 r = append_mounts(&m, read_write_dirs, READWRITE);
359 if (r < 0)
360 return r;
361
362 r = append_mounts(&m, read_only_dirs, READONLY);
363 if (r < 0)
364 return r;
365
366 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
367 if (r < 0)
7ff7394d
ZJS
368 return r;
369
613b411c 370 if (tmp_dir) {
7ff7394d
ZJS
371 m->path = "/tmp";
372 m->mode = PRIVATE_TMP;
373 m++;
613b411c 374 }
7ff7394d 375
613b411c 376 if (var_tmp_dir) {
7ff7394d
ZJS
377 m->path = "/var/tmp";
378 m->mode = PRIVATE_VAR_TMP;
379 m++;
380 }
ac0930c8 381
7f112f50
LP
382 if (private_dev) {
383 m->path = "/dev";
384 m->mode = PRIVATE_DEV;
385 m++;
386 }
387
417116f2
LP
388 if (protected_home != PROTECTED_HOME_NO) {
389 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user"), protected_home == PROTECTED_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
390 if (r < 0)
391 return r;
392 }
393
394 if (read_only_system) {
395 r = append_mounts(&m, STRV_MAKE("/usr", "-/boot"), READONLY);
396 if (r < 0)
397 return r;
398 }
399
7ff7394d 400 assert(mounts + n == m);
ac0930c8 401
7ff7394d
ZJS
402 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
403 drop_duplicates(mounts, &n);
15ae422b
LP
404 }
405
c2c13f2d
LP
406 if (n > 0) {
407 /* Remount / as SLAVE so that nothing now mounted in the namespace
408 shows up in the parent */
409 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
410 return -errno;
411
412 for (m = mounts; m < mounts + n; ++m) {
413 r = apply_mount(m, tmp_dir, var_tmp_dir);
414 if (r < 0)
415 goto fail;
416 }
15ae422b 417
c2c13f2d
LP
418 for (m = mounts; m < mounts + n; ++m) {
419 r = make_read_only(m);
420 if (r < 0)
421 goto fail;
422 }
15ae422b
LP
423 }
424
c2c13f2d
LP
425 /* Remount / as the desired mode. Not that this will not
426 * reestablish propagation from our side to the host, since
427 * what's disconnected is disconnected. */
c17ec25e 428 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
15ae422b 429 r = -errno;
613b411c 430 goto fail;
15ae422b
LP
431 }
432
15ae422b
LP
433 return 0;
434
613b411c 435fail:
c2c13f2d
LP
436 if (n > 0) {
437 for (m = mounts; m < mounts + n; ++m)
438 if (m->done)
439 umount2(m->path, MNT_DETACH);
440 }
613b411c
LP
441
442 return r;
443}
444
445static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
446 _cleanup_free_ char *x = NULL;
6b46ea73
LP
447 char bid[SD_ID128_STRING_MAX];
448 sd_id128_t boot_id;
449 int r;
613b411c
LP
450
451 assert(id);
452 assert(prefix);
453 assert(path);
454
6b46ea73
LP
455 /* We include the boot id in the directory so that after a
456 * reboot we can easily identify obsolete directories. */
457
458 r = sd_id128_get_boot(&boot_id);
459 if (r < 0)
460 return r;
461
462 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
613b411c
LP
463 if (!x)
464 return -ENOMEM;
465
466 RUN_WITH_UMASK(0077)
467 if (!mkdtemp(x))
468 return -errno;
469
470 RUN_WITH_UMASK(0000) {
471 char *y;
472
473 y = strappenda(x, "/tmp");
474
475 if (mkdir(y, 0777 | S_ISVTX) < 0)
476 return -errno;
c17ec25e 477 }
15ae422b 478
613b411c
LP
479 *path = x;
480 x = NULL;
481
482 return 0;
483}
484
/* Creates the private directories below /tmp and /var/tmp for the unit
 * identified by id, via setup_one_tmp_dir().
 *
 * On success stores the two newly allocated paths in *tmp_dir and
 * *var_tmp_dir and returns 0. If the second directory cannot be created
 * the first one is removed again and the error is returned; the output
 * parameters are only written on success. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *tmp_path, *var_tmp_path;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &tmp_path);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &var_tmp_path);
        if (r >= 0) {
                *tmp_dir = tmp_path;
                *var_tmp_dir = var_tmp_path;
                return 0;
        }

        /* Roll back the /tmp half: inner directory first, then the outer one */
        {
                char *inner;

                inner = strappenda(tmp_path, "/tmp");
                rmdir(inner);
                rmdir(tmp_path);
        }

        free(tmp_path);
        return r;
}
514
515int setup_netns(int netns_storage_socket[2]) {
516 _cleanup_close_ int netns = -1;
517 union {
518 struct cmsghdr cmsghdr;
519 uint8_t buf[CMSG_SPACE(sizeof(int))];
520 } control = {};
521 struct msghdr mh = {
522 .msg_control = &control,
523 .msg_controllen = sizeof(control),
524 };
525 struct cmsghdr *cmsg;
526 int r;
527
528 assert(netns_storage_socket);
529 assert(netns_storage_socket[0] >= 0);
530 assert(netns_storage_socket[1] >= 0);
531
532 /* We use the passed socketpair as a storage buffer for our
76cd584b
LP
533 * namespace reference fd. Whatever process runs this first
534 * shall create a new namespace, all others should just join
535 * it. To serialize that we use a file lock on the socket
536 * pair.
613b411c
LP
537 *
538 * It's a bit crazy, but hey, works great! */
539
540 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
541 return -errno;
542
543 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
544 if (errno != EAGAIN) {
545 r = -errno;
546 goto fail;
547 }
548
549 /* Nothing stored yet, so let's create a new namespace */
550
551 if (unshare(CLONE_NEWNET) < 0) {
552 r = -errno;
553 goto fail;
554 }
555
556 loopback_setup();
557
558 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
559 if (netns < 0) {
560 r = -errno;
561 goto fail;
562 }
563
564 r = 1;
565 } else {
566 /* Yay, found something, so let's join the namespace */
567
568 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
569 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
570 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
571 netns = *(int*) CMSG_DATA(cmsg);
572 }
573 }
574
575 if (setns(netns, CLONE_NEWNET) < 0) {
576 r = -errno;
577 goto fail;
578 }
579
580 r = 0;
581 }
582
583 cmsg = CMSG_FIRSTHDR(&mh);
584 cmsg->cmsg_level = SOL_SOCKET;
585 cmsg->cmsg_type = SCM_RIGHTS;
586 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
587 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
588 mh.msg_controllen = cmsg->cmsg_len;
589
590 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
591 r = -errno;
592 goto fail;
593 }
594
595fail:
596 lockf(netns_storage_socket[0], F_ULOCK, 0);
597
15ae422b
LP
598 return r;
599}
417116f2
LP
600
601static const char *const protected_home_table[_PROTECTED_HOME_MAX] = {
602 [PROTECTED_HOME_NO] = "no",
603 [PROTECTED_HOME_YES] = "yes",
604 [PROTECTED_HOME_READ_ONLY] = "read-only",
605};
606
607DEFINE_STRING_TABLE_LOOKUP(protected_home, ProtectedHome);