]>
Commit | Line | Data |
---|---|---|
d6c9574f | 1 | /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ |
15ae422b LP |
2 | |
3 | /*** | |
4 | This file is part of systemd. | |
5 | ||
6 | Copyright 2010 Lennart Poettering | |
7 | ||
8 | systemd is free software; you can redistribute it and/or modify it | |
5430f7f2 LP |
9 | under the terms of the GNU Lesser General Public License as published by |
10 | the Free Software Foundation; either version 2.1 of the License, or | |
15ae422b LP |
11 | (at your option) any later version. |
12 | ||
13 | systemd is distributed in the hope that it will be useful, but | |
14 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
5430f7f2 | 16 | Lesser General Public License for more details. |
15ae422b | 17 | |
5430f7f2 | 18 | You should have received a copy of the GNU Lesser General Public License |
15ae422b LP |
19 | along with systemd; If not, see <http://www.gnu.org/licenses/>. |
20 | ***/ | |
21 | ||
22 | #include <errno.h> | |
23 | #include <sys/mount.h> | |
24 | #include <string.h> | |
25 | #include <stdio.h> | |
26 | #include <unistd.h> | |
27 | #include <sys/stat.h> | |
28 | #include <sys/types.h> | |
29 | #include <sched.h> | |
30 | #include <sys/syscall.h> | |
31 | #include <limits.h> | |
25e870b5 | 32 | #include <linux/fs.h> |
613b411c | 33 | #include <sys/file.h> |
15ae422b LP |
34 | |
35 | #include "strv.h" | |
36 | #include "util.h" | |
9eb977db | 37 | #include "path-util.h" |
15ae422b LP |
38 | #include "namespace.h" |
39 | #include "missing.h" | |
c17ec25e | 40 | #include "execute.h" |
613b411c | 41 | #include "loopback-setup.h" |
7f112f50 LP |
42 | #include "mkdir.h" |
43 | #include "dev-setup.h" | |
44 | #include "def.h" | |
15ae422b | 45 | |
/* The treatment requested for a single mount point.
 *
 * The declaration order is significant: it is the priority order. When the
 * mount table is sorted, entries that name the same path are ordered by this
 * value, and only the first (numerically smallest) one is kept. */
typedef enum MountMode {
        INACCESSIBLE,    /* covered by the inaccessible placeholder directory */
        READONLY,        /* bind mounted onto itself and remounted read-only */
        PRIVATE_TMP,     /* /tmp, backed by a private directory */
        PRIVATE_VAR_TMP, /* /var/tmp, backed by a private directory */
        PRIVATE_DEV,     /* /dev, replaced by a freshly built device tree */
        READWRITE,       /* bind mounted onto itself, left writable */
} MountMode;
15ae422b | 55 | |
c17ec25e | 56 | typedef struct BindMount { |
15ae422b | 57 | const char *path; |
c17ec25e | 58 | MountMode mode; |
ac0930c8 | 59 | bool done; |
ea92ae33 | 60 | bool ignore; |
c17ec25e | 61 | } BindMount; |
15ae422b | 62 | |
c17ec25e | 63 | static int append_mounts(BindMount **p, char **strv, MountMode mode) { |
15ae422b LP |
64 | char **i; |
65 | ||
613b411c LP |
66 | assert(p); |
67 | ||
15ae422b LP |
68 | STRV_FOREACH(i, strv) { |
69 | ||
ea92ae33 MW |
70 | (*p)->ignore = false; |
71 | ||
94828d2d | 72 | if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') { |
ea92ae33 MW |
73 | (*p)->ignore = true; |
74 | (*i)++; | |
75 | } | |
76 | ||
15ae422b LP |
77 | if (!path_is_absolute(*i)) |
78 | return -EINVAL; | |
79 | ||
80 | (*p)->path = *i; | |
81 | (*p)->mode = mode; | |
82 | (*p)++; | |
83 | } | |
84 | ||
85 | return 0; | |
86 | } | |
87 | ||
c17ec25e MS |
88 | static int mount_path_compare(const void *a, const void *b) { |
89 | const BindMount *p = a, *q = b; | |
15ae422b LP |
90 | |
91 | if (path_equal(p->path, q->path)) { | |
92 | ||
93 | /* If the paths are equal, check the mode */ | |
94 | if (p->mode < q->mode) | |
95 | return -1; | |
96 | ||
97 | if (p->mode > q->mode) | |
98 | return 1; | |
99 | ||
100 | return 0; | |
101 | } | |
102 | ||
103 | /* If the paths are not equal, then order prefixes first */ | |
104 | if (path_startswith(p->path, q->path)) | |
105 | return 1; | |
106 | ||
107 | if (path_startswith(q->path, p->path)) | |
108 | return -1; | |
109 | ||
110 | return 0; | |
111 | } | |
112 | ||
c17ec25e MS |
113 | static void drop_duplicates(BindMount *m, unsigned *n) { |
114 | BindMount *f, *t, *previous; | |
15ae422b | 115 | |
c17ec25e | 116 | assert(m); |
15ae422b | 117 | assert(n); |
15ae422b | 118 | |
c17ec25e | 119 | for (f = m, t = m, previous = NULL; f < m+*n; f++) { |
15ae422b | 120 | |
ac0930c8 | 121 | /* The first one wins */ |
15ae422b LP |
122 | if (previous && path_equal(f->path, previous->path)) |
123 | continue; | |
124 | ||
125 | t->path = f->path; | |
126 | t->mode = f->mode; | |
127 | ||
15ae422b LP |
128 | previous = t; |
129 | ||
130 | t++; | |
131 | } | |
132 | ||
c17ec25e | 133 | *n = t - m; |
15ae422b LP |
134 | } |
135 | ||
7f112f50 LP |
136 | static int mount_dev(BindMount *m) { |
137 | static const char devnodes[] = | |
138 | "/dev/null\0" | |
139 | "/dev/zero\0" | |
140 | "/dev/full\0" | |
141 | "/dev/random\0" | |
142 | "/dev/urandom\0" | |
143 | "/dev/tty\0"; | |
144 | ||
2b85f4e1 LP |
145 | char temporary_mount[] = "/tmp/namespace-dev-XXXXXX"; |
146 | const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL; | |
7f112f50 LP |
147 | _cleanup_umask_ mode_t u; |
148 | int r; | |
149 | ||
150 | assert(m); | |
151 | ||
152 | u = umask(0000); | |
153 | ||
2b85f4e1 LP |
154 | if (!mkdtemp(temporary_mount)) |
155 | return -errno; | |
156 | ||
157 | dev = strappenda(temporary_mount, "/dev"); | |
158 | mkdir(dev, 0755); | |
159 | if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) { | |
160 | r = -errno; | |
161 | goto fail; | |
162 | } | |
163 | ||
164 | devpts = strappenda(temporary_mount, "/dev/pts"); | |
165 | mkdir(devpts, 0755); | |
166 | if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) { | |
167 | r = -errno; | |
168 | goto fail; | |
169 | } | |
170 | ||
171 | devshm = strappenda(temporary_mount, "/dev/shm"); | |
172 | mkdir(devshm, 01777); | |
173 | r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL); | |
174 | if (r < 0) { | |
175 | r = -errno; | |
176 | goto fail; | |
177 | } | |
178 | ||
179 | devmqueue = strappenda(temporary_mount, "/dev/mqueue"); | |
180 | mkdir(devmqueue, 0755); | |
181 | mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL); | |
182 | ||
183 | devkdbus = strappenda(temporary_mount, "/dev/kdbus"); | |
184 | mkdir(devkdbus, 0755); | |
185 | mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL); | |
186 | ||
187 | devhugepages = strappenda(temporary_mount, "/dev/hugepages"); | |
188 | mkdir(devhugepages, 0755); | |
189 | mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL); | |
190 | ||
7f112f50 | 191 | NULSTR_FOREACH(d, devnodes) { |
2b85f4e1 LP |
192 | _cleanup_free_ char *dn = NULL; |
193 | struct stat st; | |
194 | ||
195 | r = stat(d, &st); | |
7f112f50 | 196 | if (r < 0) { |
2b85f4e1 LP |
197 | |
198 | if (errno == ENOENT) | |
199 | continue; | |
200 | ||
201 | r = -errno; | |
202 | goto fail; | |
7f112f50 LP |
203 | } |
204 | ||
2b85f4e1 LP |
205 | if (!S_ISBLK(st.st_mode) && |
206 | !S_ISCHR(st.st_mode)) { | |
207 | r = -EINVAL; | |
208 | goto fail; | |
209 | } | |
210 | ||
211 | if (st.st_rdev == 0) | |
212 | continue; | |
213 | ||
214 | dn = strappend(temporary_mount, d); | |
215 | if (!dn) { | |
216 | r = -ENOMEM; | |
217 | goto fail; | |
218 | } | |
219 | ||
220 | r = mknod(dn, st.st_mode, st.st_rdev); | |
221 | if (r < 0) { | |
222 | r = -errno; | |
223 | goto fail; | |
224 | } | |
7f112f50 LP |
225 | } |
226 | ||
2b85f4e1 | 227 | dev_setup(temporary_mount); |
7f112f50 | 228 | |
2b85f4e1 LP |
229 | if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) { |
230 | r = -errno; | |
231 | goto fail; | |
232 | } | |
7f112f50 | 233 | |
2b85f4e1 LP |
234 | rmdir(dev); |
235 | rmdir(temporary_mount); | |
7f112f50 | 236 | |
2b85f4e1 | 237 | return 0; |
7f112f50 | 238 | |
2b85f4e1 LP |
239 | fail: |
240 | if (devpts) | |
241 | umount(devpts); | |
7f112f50 | 242 | |
2b85f4e1 LP |
243 | if (devshm) |
244 | umount(devshm); | |
7f112f50 | 245 | |
2b85f4e1 LP |
246 | if (devkdbus) |
247 | umount(devkdbus); | |
7f112f50 | 248 | |
2b85f4e1 LP |
249 | if (devhugepages) |
250 | umount(devhugepages); | |
7f112f50 | 251 | |
2b85f4e1 LP |
252 | if (devmqueue) |
253 | umount(devmqueue); | |
7f112f50 | 254 | |
2b85f4e1 LP |
255 | if (dev) { |
256 | umount(dev); | |
257 | rmdir(dev); | |
7f112f50 LP |
258 | } |
259 | ||
2b85f4e1 | 260 | rmdir(temporary_mount); |
7f112f50 | 261 | |
2b85f4e1 | 262 | return r; |
7f112f50 LP |
263 | } |
264 | ||
ac0930c8 | 265 | static int apply_mount( |
c17ec25e | 266 | BindMount *m, |
ac0930c8 | 267 | const char *tmp_dir, |
c17ec25e | 268 | const char *var_tmp_dir) { |
ac0930c8 | 269 | |
15ae422b | 270 | const char *what; |
15ae422b | 271 | int r; |
15ae422b | 272 | |
c17ec25e | 273 | assert(m); |
15ae422b | 274 | |
c17ec25e | 275 | switch (m->mode) { |
15ae422b | 276 | |
7f112f50 LP |
277 | case PRIVATE_DEV: |
278 | return mount_dev(m); | |
279 | ||
15ae422b | 280 | case INACCESSIBLE: |
c17ec25e | 281 | what = "/run/systemd/inaccessible"; |
15ae422b LP |
282 | break; |
283 | ||
284 | case READONLY: | |
15ae422b | 285 | case READWRITE: |
c17ec25e | 286 | what = m->path; |
15ae422b LP |
287 | break; |
288 | ||
ac0930c8 LP |
289 | case PRIVATE_TMP: |
290 | what = tmp_dir; | |
291 | break; | |
292 | ||
293 | case PRIVATE_VAR_TMP: | |
294 | what = var_tmp_dir; | |
15ae422b | 295 | break; |
e364ad06 LP |
296 | |
297 | default: | |
298 | assert_not_reached("Unknown mode"); | |
15ae422b LP |
299 | } |
300 | ||
ac0930c8 | 301 | assert(what); |
15ae422b | 302 | |
c17ec25e | 303 | r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL); |
ac0930c8 | 304 | if (r >= 0) |
c17ec25e | 305 | log_debug("Successfully mounted %s to %s", what, m->path); |
ea92ae33 MW |
306 | else if (m->ignore && errno == ENOENT) |
307 | r = 0; | |
15ae422b | 308 | |
ac0930c8 LP |
309 | return r; |
310 | } | |
15ae422b | 311 | |
c17ec25e | 312 | static int make_read_only(BindMount *m) { |
ac0930c8 | 313 | int r; |
15ae422b | 314 | |
c17ec25e | 315 | assert(m); |
ac0930c8 | 316 | |
c17ec25e | 317 | if (m->mode != INACCESSIBLE && m->mode != READONLY) |
ac0930c8 LP |
318 | return 0; |
319 | ||
c17ec25e | 320 | r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL); |
ea92ae33 | 321 | if (r < 0 && !(m->ignore && errno == ENOENT)) |
ac0930c8 LP |
322 | return -errno; |
323 | ||
324 | return 0; | |
15ae422b LP |
325 | } |
326 | ||
613b411c LP |
327 | int setup_namespace( |
328 | char** read_write_dirs, | |
329 | char** read_only_dirs, | |
330 | char** inaccessible_dirs, | |
331 | char* tmp_dir, | |
332 | char* var_tmp_dir, | |
7f112f50 | 333 | bool private_dev, |
613b411c | 334 | unsigned mount_flags) { |
15ae422b | 335 | |
7ff7394d | 336 | BindMount *m, *mounts = NULL; |
613b411c | 337 | unsigned n; |
c17ec25e | 338 | int r = 0; |
15ae422b | 339 | |
613b411c | 340 | if (mount_flags == 0) |
c17ec25e | 341 | mount_flags = MS_SHARED; |
ac0930c8 | 342 | |
d5a3f0ea ZJS |
343 | if (unshare(CLONE_NEWNS) < 0) |
344 | return -errno; | |
15ae422b | 345 | |
613b411c LP |
346 | n = !!tmp_dir + !!var_tmp_dir + |
347 | strv_length(read_write_dirs) + | |
348 | strv_length(read_only_dirs) + | |
7f112f50 LP |
349 | strv_length(inaccessible_dirs) + |
350 | private_dev; | |
613b411c LP |
351 | |
352 | if (n > 0) { | |
7ff7394d | 353 | m = mounts = (BindMount *) alloca(n * sizeof(BindMount)); |
613b411c LP |
354 | r = append_mounts(&m, read_write_dirs, READWRITE); |
355 | if (r < 0) | |
356 | return r; | |
357 | ||
358 | r = append_mounts(&m, read_only_dirs, READONLY); | |
359 | if (r < 0) | |
360 | return r; | |
361 | ||
362 | r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE); | |
363 | if (r < 0) | |
7ff7394d ZJS |
364 | return r; |
365 | ||
613b411c | 366 | if (tmp_dir) { |
7ff7394d ZJS |
367 | m->path = "/tmp"; |
368 | m->mode = PRIVATE_TMP; | |
369 | m++; | |
613b411c | 370 | } |
7ff7394d | 371 | |
613b411c | 372 | if (var_tmp_dir) { |
7ff7394d ZJS |
373 | m->path = "/var/tmp"; |
374 | m->mode = PRIVATE_VAR_TMP; | |
375 | m++; | |
376 | } | |
ac0930c8 | 377 | |
7f112f50 LP |
378 | if (private_dev) { |
379 | m->path = "/dev"; | |
380 | m->mode = PRIVATE_DEV; | |
381 | m++; | |
382 | } | |
383 | ||
7ff7394d | 384 | assert(mounts + n == m); |
ac0930c8 | 385 | |
7ff7394d ZJS |
386 | qsort(mounts, n, sizeof(BindMount), mount_path_compare); |
387 | drop_duplicates(mounts, &n); | |
15ae422b LP |
388 | } |
389 | ||
c2c13f2d LP |
390 | if (n > 0) { |
391 | /* Remount / as SLAVE so that nothing now mounted in the namespace | |
392 | shows up in the parent */ | |
393 | if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) | |
394 | return -errno; | |
395 | ||
396 | for (m = mounts; m < mounts + n; ++m) { | |
397 | r = apply_mount(m, tmp_dir, var_tmp_dir); | |
398 | if (r < 0) | |
399 | goto fail; | |
400 | } | |
15ae422b | 401 | |
c2c13f2d LP |
402 | for (m = mounts; m < mounts + n; ++m) { |
403 | r = make_read_only(m); | |
404 | if (r < 0) | |
405 | goto fail; | |
406 | } | |
15ae422b LP |
407 | } |
408 | ||
c2c13f2d LP |
409 | /* Remount / as the desired mode. Not that this will not |
410 | * reestablish propagation from our side to the host, since | |
411 | * what's disconnected is disconnected. */ | |
c17ec25e | 412 | if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) { |
15ae422b | 413 | r = -errno; |
613b411c | 414 | goto fail; |
15ae422b LP |
415 | } |
416 | ||
15ae422b LP |
417 | return 0; |
418 | ||
613b411c | 419 | fail: |
c2c13f2d LP |
420 | if (n > 0) { |
421 | for (m = mounts; m < mounts + n; ++m) | |
422 | if (m->done) | |
423 | umount2(m->path, MNT_DETACH); | |
424 | } | |
613b411c LP |
425 | |
426 | return r; | |
427 | } | |
428 | ||
429 | static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) { | |
430 | _cleanup_free_ char *x = NULL; | |
6b46ea73 LP |
431 | char bid[SD_ID128_STRING_MAX]; |
432 | sd_id128_t boot_id; | |
433 | int r; | |
613b411c LP |
434 | |
435 | assert(id); | |
436 | assert(prefix); | |
437 | assert(path); | |
438 | ||
6b46ea73 LP |
439 | /* We include the boot id in the directory so that after a |
440 | * reboot we can easily identify obsolete directories. */ | |
441 | ||
442 | r = sd_id128_get_boot(&boot_id); | |
443 | if (r < 0) | |
444 | return r; | |
445 | ||
446 | x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL); | |
613b411c LP |
447 | if (!x) |
448 | return -ENOMEM; | |
449 | ||
450 | RUN_WITH_UMASK(0077) | |
451 | if (!mkdtemp(x)) | |
452 | return -errno; | |
453 | ||
454 | RUN_WITH_UMASK(0000) { | |
455 | char *y; | |
456 | ||
457 | y = strappenda(x, "/tmp"); | |
458 | ||
459 | if (mkdir(y, 0777 | S_ISVTX) < 0) | |
460 | return -errno; | |
c17ec25e | 461 | } |
15ae422b | 462 | |
613b411c LP |
463 | *path = x; |
464 | x = NULL; | |
465 | ||
466 | return 0; | |
467 | } | |
468 | ||
/* Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp (see setup_one_tmp_dir()).
 *
 * On success returns 0 and stores the two newly allocated paths in *tmp_dir
 * and *var_tmp_dir. If the second directory cannot be created, the first
 * one is removed again, the negative error code is returned and neither
 * output parameter is written. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the
         * directory itself, then the string holding its name. */
        inner = strappenda(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);
        free(private_tmp);

        return r;
}
498 | ||
499 | int setup_netns(int netns_storage_socket[2]) { | |
500 | _cleanup_close_ int netns = -1; | |
501 | union { | |
502 | struct cmsghdr cmsghdr; | |
503 | uint8_t buf[CMSG_SPACE(sizeof(int))]; | |
504 | } control = {}; | |
505 | struct msghdr mh = { | |
506 | .msg_control = &control, | |
507 | .msg_controllen = sizeof(control), | |
508 | }; | |
509 | struct cmsghdr *cmsg; | |
510 | int r; | |
511 | ||
512 | assert(netns_storage_socket); | |
513 | assert(netns_storage_socket[0] >= 0); | |
514 | assert(netns_storage_socket[1] >= 0); | |
515 | ||
516 | /* We use the passed socketpair as a storage buffer for our | |
76cd584b LP |
517 | * namespace reference fd. Whatever process runs this first |
518 | * shall create a new namespace, all others should just join | |
519 | * it. To serialize that we use a file lock on the socket | |
520 | * pair. | |
613b411c LP |
521 | * |
522 | * It's a bit crazy, but hey, works great! */ | |
523 | ||
524 | if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) | |
525 | return -errno; | |
526 | ||
527 | if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) { | |
528 | if (errno != EAGAIN) { | |
529 | r = -errno; | |
530 | goto fail; | |
531 | } | |
532 | ||
533 | /* Nothing stored yet, so let's create a new namespace */ | |
534 | ||
535 | if (unshare(CLONE_NEWNET) < 0) { | |
536 | r = -errno; | |
537 | goto fail; | |
538 | } | |
539 | ||
540 | loopback_setup(); | |
541 | ||
542 | netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); | |
543 | if (netns < 0) { | |
544 | r = -errno; | |
545 | goto fail; | |
546 | } | |
547 | ||
548 | r = 1; | |
549 | } else { | |
550 | /* Yay, found something, so let's join the namespace */ | |
551 | ||
552 | for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) { | |
553 | if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { | |
554 | assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); | |
555 | netns = *(int*) CMSG_DATA(cmsg); | |
556 | } | |
557 | } | |
558 | ||
559 | if (setns(netns, CLONE_NEWNET) < 0) { | |
560 | r = -errno; | |
561 | goto fail; | |
562 | } | |
563 | ||
564 | r = 0; | |
565 | } | |
566 | ||
567 | cmsg = CMSG_FIRSTHDR(&mh); | |
568 | cmsg->cmsg_level = SOL_SOCKET; | |
569 | cmsg->cmsg_type = SCM_RIGHTS; | |
570 | cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | |
571 | memcpy(CMSG_DATA(cmsg), &netns, sizeof(int)); | |
572 | mh.msg_controllen = cmsg->cmsg_len; | |
573 | ||
574 | if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) { | |
575 | r = -errno; | |
576 | goto fail; | |
577 | } | |
578 | ||
579 | fail: | |
580 | lockf(netns_storage_socket[0], F_ULOCK, 0); | |
581 | ||
15ae422b LP |
582 | return r; |
583 | } |