]>
Commit | Line | Data |
---|---|---|
d6c9574f | 1 | /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ |
15ae422b LP |
2 | |
3 | /*** | |
4 | This file is part of systemd. | |
5 | ||
6 | Copyright 2010 Lennart Poettering | |
7 | ||
8 | systemd is free software; you can redistribute it and/or modify it | |
5430f7f2 LP |
9 | under the terms of the GNU Lesser General Public License as published by |
10 | the Free Software Foundation; either version 2.1 of the License, or | |
15ae422b LP |
11 | (at your option) any later version. |
12 | ||
13 | systemd is distributed in the hope that it will be useful, but | |
14 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
5430f7f2 | 16 | Lesser General Public License for more details. |
15ae422b | 17 | |
5430f7f2 | 18 | You should have received a copy of the GNU Lesser General Public License |
15ae422b LP |
19 | along with systemd; If not, see <http://www.gnu.org/licenses/>. |
20 | ***/ | |
21 | ||
22 | #include <errno.h> | |
23 | #include <sys/mount.h> | |
24 | #include <string.h> | |
25 | #include <stdio.h> | |
26 | #include <unistd.h> | |
27 | #include <sys/stat.h> | |
28 | #include <sys/types.h> | |
29 | #include <sched.h> | |
30 | #include <sys/syscall.h> | |
31 | #include <limits.h> | |
25e870b5 | 32 | #include <linux/fs.h> |
613b411c | 33 | #include <sys/file.h> |
15ae422b LP |
34 | |
35 | #include "strv.h" | |
36 | #include "util.h" | |
9eb977db | 37 | #include "path-util.h" |
15ae422b LP |
38 | #include "namespace.h" |
39 | #include "missing.h" | |
c17ec25e | 40 | #include "execute.h" |
613b411c | 41 | #include "loopback-setup.h" |
7f112f50 LP |
42 | #include "mkdir.h" |
43 | #include "dev-setup.h" | |
44 | #include "def.h" | |
15ae422b | 45 | |
/* What to do with a given path inside the new mount namespace.
 *
 * The enumerators are listed in priority order: mount_path_compare() compares
 * them numerically when two entries refer to the same path, and
 * drop_duplicates() keeps the first entry, so the lowest value wins. Do not
 * reorder them. */
typedef enum MountMode {
        INACCESSIBLE,           /* covered by a bind mount of /run/systemd/inaccessible, then made read-only */
        READONLY,               /* bind mounted onto itself, then remounted read-only */
        PRIVATE_TMP,            /* covered by the private tmp directory */
        PRIVATE_VAR_TMP,        /* covered by the private var-tmp directory */
        PRIVATE_DEV,            /* replaced by a freshly populated tmpfs, see mount_dev() */
        READWRITE               /* bind mounted onto itself and left writable */
} MountMode;
15ae422b | 55 | |
c17ec25e | 56 | typedef struct BindMount { |
15ae422b | 57 | const char *path; |
c17ec25e | 58 | MountMode mode; |
ac0930c8 | 59 | bool done; |
ea92ae33 | 60 | bool ignore; |
c17ec25e | 61 | } BindMount; |
15ae422b | 62 | |
c17ec25e | 63 | static int append_mounts(BindMount **p, char **strv, MountMode mode) { |
15ae422b LP |
64 | char **i; |
65 | ||
613b411c LP |
66 | assert(p); |
67 | ||
15ae422b LP |
68 | STRV_FOREACH(i, strv) { |
69 | ||
ea92ae33 MW |
70 | (*p)->ignore = false; |
71 | ||
94828d2d | 72 | if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') { |
ea92ae33 MW |
73 | (*p)->ignore = true; |
74 | (*i)++; | |
75 | } | |
76 | ||
15ae422b LP |
77 | if (!path_is_absolute(*i)) |
78 | return -EINVAL; | |
79 | ||
80 | (*p)->path = *i; | |
81 | (*p)->mode = mode; | |
82 | (*p)++; | |
83 | } | |
84 | ||
85 | return 0; | |
86 | } | |
87 | ||
c17ec25e MS |
88 | static int mount_path_compare(const void *a, const void *b) { |
89 | const BindMount *p = a, *q = b; | |
15ae422b LP |
90 | |
91 | if (path_equal(p->path, q->path)) { | |
92 | ||
93 | /* If the paths are equal, check the mode */ | |
94 | if (p->mode < q->mode) | |
95 | return -1; | |
96 | ||
97 | if (p->mode > q->mode) | |
98 | return 1; | |
99 | ||
100 | return 0; | |
101 | } | |
102 | ||
103 | /* If the paths are not equal, then order prefixes first */ | |
104 | if (path_startswith(p->path, q->path)) | |
105 | return 1; | |
106 | ||
107 | if (path_startswith(q->path, p->path)) | |
108 | return -1; | |
109 | ||
110 | return 0; | |
111 | } | |
112 | ||
c17ec25e MS |
113 | static void drop_duplicates(BindMount *m, unsigned *n) { |
114 | BindMount *f, *t, *previous; | |
15ae422b | 115 | |
c17ec25e | 116 | assert(m); |
15ae422b | 117 | assert(n); |
15ae422b | 118 | |
c17ec25e | 119 | for (f = m, t = m, previous = NULL; f < m+*n; f++) { |
15ae422b | 120 | |
ac0930c8 | 121 | /* The first one wins */ |
15ae422b LP |
122 | if (previous && path_equal(f->path, previous->path)) |
123 | continue; | |
124 | ||
125 | t->path = f->path; | |
126 | t->mode = f->mode; | |
127 | ||
15ae422b LP |
128 | previous = t; |
129 | ||
130 | t++; | |
131 | } | |
132 | ||
c17ec25e | 133 | *n = t - m; |
15ae422b LP |
134 | } |
135 | ||
7f112f50 LP |
136 | static int mount_dev(BindMount *m) { |
137 | static const char devnodes[] = | |
138 | "/dev/null\0" | |
139 | "/dev/zero\0" | |
140 | "/dev/full\0" | |
141 | "/dev/random\0" | |
142 | "/dev/urandom\0" | |
143 | "/dev/tty\0"; | |
144 | ||
145 | struct stat devnodes_stat[6] = {}; | |
146 | const char *d; | |
147 | unsigned n = 0; | |
148 | _cleanup_umask_ mode_t u; | |
149 | int r; | |
150 | ||
151 | assert(m); | |
152 | ||
153 | u = umask(0000); | |
154 | ||
155 | /* First: record device mode_t and dev_t */ | |
156 | NULSTR_FOREACH(d, devnodes) { | |
157 | r = stat(d, &devnodes_stat[n]); | |
158 | if (r < 0) { | |
159 | if (errno != ENOENT) | |
160 | return -errno; | |
161 | } else { | |
162 | if (!S_ISBLK(devnodes_stat[n].st_mode) && | |
163 | !S_ISCHR(devnodes_stat[n].st_mode)) | |
164 | return -EINVAL; | |
165 | } | |
166 | ||
167 | n++; | |
168 | } | |
169 | ||
170 | assert(n == ELEMENTSOF(devnodes_stat)); | |
171 | ||
172 | r = mount("tmpfs", "/dev", "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755"); | |
173 | if (r < 0) | |
174 | return m->ignore ? 0 : -errno; | |
175 | ||
176 | ||
177 | mkdir_p("/dev/pts", 0755); | |
178 | ||
179 | r = mount("devpts", "/dev/pts", "devpts", MS_NOSUID|MS_NOEXEC, "newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID)); | |
180 | if (r < 0) | |
181 | return m->ignore ? 0 : -errno; | |
182 | ||
183 | mkdir_p("/dev/shm", 0755); | |
184 | ||
185 | r = mount("tmpfs", "/dev/shm", "tmpfs", MS_NOSUID|MS_NODEV|MS_STRICTATIME, "mode=1777"); | |
186 | if (r < 0) | |
187 | return m->ignore ? 0 : -errno; | |
188 | ||
189 | /* Second: actually create it */ | |
190 | n = 0; | |
191 | NULSTR_FOREACH(d, devnodes) { | |
192 | if (devnodes_stat[n].st_rdev == 0) | |
193 | continue; | |
194 | ||
195 | r = mknod(d, devnodes_stat[n].st_mode, devnodes_stat[n].st_rdev); | |
196 | if (r < 0) | |
197 | return m->ignore ? 0 : -errno; | |
198 | ||
199 | n++; | |
200 | } | |
201 | ||
202 | dev_setup(NULL); | |
203 | ||
204 | return 0; | |
205 | } | |
206 | ||
ac0930c8 | 207 | static int apply_mount( |
c17ec25e | 208 | BindMount *m, |
ac0930c8 | 209 | const char *tmp_dir, |
c17ec25e | 210 | const char *var_tmp_dir) { |
ac0930c8 | 211 | |
15ae422b | 212 | const char *what; |
15ae422b | 213 | int r; |
15ae422b | 214 | |
c17ec25e | 215 | assert(m); |
15ae422b | 216 | |
c17ec25e | 217 | switch (m->mode) { |
15ae422b | 218 | |
7f112f50 LP |
219 | case PRIVATE_DEV: |
220 | return mount_dev(m); | |
221 | ||
15ae422b | 222 | case INACCESSIBLE: |
c17ec25e | 223 | what = "/run/systemd/inaccessible"; |
15ae422b LP |
224 | break; |
225 | ||
226 | case READONLY: | |
15ae422b | 227 | case READWRITE: |
c17ec25e | 228 | what = m->path; |
15ae422b LP |
229 | break; |
230 | ||
ac0930c8 LP |
231 | case PRIVATE_TMP: |
232 | what = tmp_dir; | |
233 | break; | |
234 | ||
235 | case PRIVATE_VAR_TMP: | |
236 | what = var_tmp_dir; | |
15ae422b | 237 | break; |
e364ad06 LP |
238 | |
239 | default: | |
240 | assert_not_reached("Unknown mode"); | |
15ae422b LP |
241 | } |
242 | ||
ac0930c8 | 243 | assert(what); |
15ae422b | 244 | |
c17ec25e | 245 | r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL); |
ac0930c8 | 246 | if (r >= 0) |
c17ec25e | 247 | log_debug("Successfully mounted %s to %s", what, m->path); |
ea92ae33 MW |
248 | else if (m->ignore && errno == ENOENT) |
249 | r = 0; | |
15ae422b | 250 | |
ac0930c8 LP |
251 | return r; |
252 | } | |
15ae422b | 253 | |
c17ec25e | 254 | static int make_read_only(BindMount *m) { |
ac0930c8 | 255 | int r; |
15ae422b | 256 | |
c17ec25e | 257 | assert(m); |
ac0930c8 | 258 | |
c17ec25e | 259 | if (m->mode != INACCESSIBLE && m->mode != READONLY) |
ac0930c8 LP |
260 | return 0; |
261 | ||
c17ec25e | 262 | r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL); |
ea92ae33 | 263 | if (r < 0 && !(m->ignore && errno == ENOENT)) |
ac0930c8 LP |
264 | return -errno; |
265 | ||
266 | return 0; | |
15ae422b LP |
267 | } |
268 | ||
613b411c LP |
269 | int setup_namespace( |
270 | char** read_write_dirs, | |
271 | char** read_only_dirs, | |
272 | char** inaccessible_dirs, | |
273 | char* tmp_dir, | |
274 | char* var_tmp_dir, | |
7f112f50 | 275 | bool private_dev, |
613b411c | 276 | unsigned mount_flags) { |
15ae422b | 277 | |
7ff7394d | 278 | BindMount *m, *mounts = NULL; |
613b411c | 279 | unsigned n; |
c17ec25e | 280 | int r = 0; |
15ae422b | 281 | |
613b411c | 282 | if (mount_flags == 0) |
c17ec25e | 283 | mount_flags = MS_SHARED; |
ac0930c8 | 284 | |
d5a3f0ea ZJS |
285 | if (unshare(CLONE_NEWNS) < 0) |
286 | return -errno; | |
15ae422b | 287 | |
613b411c LP |
288 | n = !!tmp_dir + !!var_tmp_dir + |
289 | strv_length(read_write_dirs) + | |
290 | strv_length(read_only_dirs) + | |
7f112f50 LP |
291 | strv_length(inaccessible_dirs) + |
292 | private_dev; | |
613b411c LP |
293 | |
294 | if (n > 0) { | |
7ff7394d | 295 | m = mounts = (BindMount *) alloca(n * sizeof(BindMount)); |
613b411c LP |
296 | r = append_mounts(&m, read_write_dirs, READWRITE); |
297 | if (r < 0) | |
298 | return r; | |
299 | ||
300 | r = append_mounts(&m, read_only_dirs, READONLY); | |
301 | if (r < 0) | |
302 | return r; | |
303 | ||
304 | r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE); | |
305 | if (r < 0) | |
7ff7394d ZJS |
306 | return r; |
307 | ||
613b411c | 308 | if (tmp_dir) { |
7ff7394d ZJS |
309 | m->path = "/tmp"; |
310 | m->mode = PRIVATE_TMP; | |
311 | m++; | |
613b411c | 312 | } |
7ff7394d | 313 | |
613b411c | 314 | if (var_tmp_dir) { |
7ff7394d ZJS |
315 | m->path = "/var/tmp"; |
316 | m->mode = PRIVATE_VAR_TMP; | |
317 | m++; | |
318 | } | |
ac0930c8 | 319 | |
7f112f50 LP |
320 | if (private_dev) { |
321 | m->path = "/dev"; | |
322 | m->mode = PRIVATE_DEV; | |
323 | m++; | |
324 | } | |
325 | ||
7ff7394d | 326 | assert(mounts + n == m); |
ac0930c8 | 327 | |
7ff7394d ZJS |
328 | qsort(mounts, n, sizeof(BindMount), mount_path_compare); |
329 | drop_duplicates(mounts, &n); | |
15ae422b LP |
330 | } |
331 | ||
ac0930c8 | 332 | /* Remount / as SLAVE so that nothing now mounted in the namespace |
dc4b0200 | 333 | shows up in the parent */ |
d5a3f0ea ZJS |
334 | if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) |
335 | return -errno; | |
15ae422b | 336 | |
c17ec25e MS |
337 | for (m = mounts; m < mounts + n; ++m) { |
338 | r = apply_mount(m, tmp_dir, var_tmp_dir); | |
c1d70f7c | 339 | if (r < 0) |
613b411c | 340 | goto fail; |
c1d70f7c | 341 | } |
15ae422b | 342 | |
c17ec25e MS |
343 | for (m = mounts; m < mounts + n; ++m) { |
344 | r = make_read_only(m); | |
ac0930c8 | 345 | if (r < 0) |
613b411c | 346 | goto fail; |
15ae422b LP |
347 | } |
348 | ||
ac0930c8 | 349 | /* Remount / as the desired mode */ |
c17ec25e | 350 | if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) { |
15ae422b | 351 | r = -errno; |
613b411c | 352 | goto fail; |
15ae422b LP |
353 | } |
354 | ||
15ae422b LP |
355 | return 0; |
356 | ||
613b411c LP |
357 | fail: |
358 | for (m = mounts; m < mounts + n; ++m) | |
c17ec25e MS |
359 | if (m->done) |
360 | umount2(m->path, MNT_DETACH); | |
613b411c LP |
361 | |
362 | return r; | |
363 | } | |
364 | ||
365 | static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) { | |
366 | _cleanup_free_ char *x = NULL; | |
6b46ea73 LP |
367 | char bid[SD_ID128_STRING_MAX]; |
368 | sd_id128_t boot_id; | |
369 | int r; | |
613b411c LP |
370 | |
371 | assert(id); | |
372 | assert(prefix); | |
373 | assert(path); | |
374 | ||
6b46ea73 LP |
375 | /* We include the boot id in the directory so that after a |
376 | * reboot we can easily identify obsolete directories. */ | |
377 | ||
378 | r = sd_id128_get_boot(&boot_id); | |
379 | if (r < 0) | |
380 | return r; | |
381 | ||
382 | x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL); | |
613b411c LP |
383 | if (!x) |
384 | return -ENOMEM; | |
385 | ||
386 | RUN_WITH_UMASK(0077) | |
387 | if (!mkdtemp(x)) | |
388 | return -errno; | |
389 | ||
390 | RUN_WITH_UMASK(0000) { | |
391 | char *y; | |
392 | ||
393 | y = strappenda(x, "/tmp"); | |
394 | ||
395 | if (mkdir(y, 0777 | S_ISVTX) < 0) | |
396 | return -errno; | |
c17ec25e | 397 | } |
15ae422b | 398 | |
613b411c LP |
399 | *path = x; |
400 | x = NULL; | |
401 | ||
402 | return 0; | |
403 | } | |
404 | ||
/* Creates the pair of private temporary directories for id, one below /tmp
 * and one below /var/tmp.
 *
 * On success *tmp_dir and *var_tmp_dir receive the newly allocated directory
 * names. If the second directory cannot be created, the first one is removed
 * again and neither output is written.
 *
 * Returns 0 on success or a negative errno. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: inner "tmp" first, then the outer one */
        {
                char *inner;

                inner = strappenda(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
434 | ||
435 | int setup_netns(int netns_storage_socket[2]) { | |
436 | _cleanup_close_ int netns = -1; | |
437 | union { | |
438 | struct cmsghdr cmsghdr; | |
439 | uint8_t buf[CMSG_SPACE(sizeof(int))]; | |
440 | } control = {}; | |
441 | struct msghdr mh = { | |
442 | .msg_control = &control, | |
443 | .msg_controllen = sizeof(control), | |
444 | }; | |
445 | struct cmsghdr *cmsg; | |
446 | int r; | |
447 | ||
448 | assert(netns_storage_socket); | |
449 | assert(netns_storage_socket[0] >= 0); | |
450 | assert(netns_storage_socket[1] >= 0); | |
451 | ||
452 | /* We use the passed socketpair as a storage buffer for our | |
76cd584b LP |
453 | * namespace reference fd. Whatever process runs this first |
454 | * shall create a new namespace, all others should just join | |
455 | * it. To serialize that we use a file lock on the socket | |
456 | * pair. | |
613b411c LP |
457 | * |
458 | * It's a bit crazy, but hey, works great! */ | |
459 | ||
460 | if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) | |
461 | return -errno; | |
462 | ||
463 | if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) { | |
464 | if (errno != EAGAIN) { | |
465 | r = -errno; | |
466 | goto fail; | |
467 | } | |
468 | ||
469 | /* Nothing stored yet, so let's create a new namespace */ | |
470 | ||
471 | if (unshare(CLONE_NEWNET) < 0) { | |
472 | r = -errno; | |
473 | goto fail; | |
474 | } | |
475 | ||
476 | loopback_setup(); | |
477 | ||
478 | netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); | |
479 | if (netns < 0) { | |
480 | r = -errno; | |
481 | goto fail; | |
482 | } | |
483 | ||
484 | r = 1; | |
485 | } else { | |
486 | /* Yay, found something, so let's join the namespace */ | |
487 | ||
488 | for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) { | |
489 | if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { | |
490 | assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); | |
491 | netns = *(int*) CMSG_DATA(cmsg); | |
492 | } | |
493 | } | |
494 | ||
495 | if (setns(netns, CLONE_NEWNET) < 0) { | |
496 | r = -errno; | |
497 | goto fail; | |
498 | } | |
499 | ||
500 | r = 0; | |
501 | } | |
502 | ||
503 | cmsg = CMSG_FIRSTHDR(&mh); | |
504 | cmsg->cmsg_level = SOL_SOCKET; | |
505 | cmsg->cmsg_type = SCM_RIGHTS; | |
506 | cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | |
507 | memcpy(CMSG_DATA(cmsg), &netns, sizeof(int)); | |
508 | mh.msg_controllen = cmsg->cmsg_len; | |
509 | ||
510 | if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) { | |
511 | r = -errno; | |
512 | goto fail; | |
513 | } | |
514 | ||
515 | fail: | |
516 | lockf(netns_storage_socket[0], F_ULOCK, 0); | |
517 | ||
15ae422b LP |
518 | return r; |
519 | } |