]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
conf-parser: config_parse_path_strv() is not generic, so let's move it into load...
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42 #include "mkdir.h"
43 #include "dev-setup.h"
44 #include "def.h"
45
/* The kinds of mount setup_namespace() can establish on a path. */
typedef enum MountMode {
        /* The numeric order doubles as a priority: entries for the same path
         * are sorted by this value and only the first (lowest) one is kept. */
        INACCESSIBLE,    /* bind-mount /run/systemd/inaccessible over the path */
        READONLY,        /* bind-mount the path onto itself, then remount read-only */
        PRIVATE_TMP,     /* bind-mount the private tmp directory over the path */
        PRIVATE_VAR_TMP, /* bind-mount the private var-tmp directory over the path */
        PRIVATE_DEV,     /* replace the path with a minimal private /dev */
        READWRITE        /* bind-mount the path onto itself, left writable */
} MountMode;
55
56 typedef struct BindMount {
57 const char *path;
58 MountMode mode;
59 bool done;
60 bool ignore;
61 } BindMount;
62
63 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
64 char **i;
65
66 assert(p);
67
68 STRV_FOREACH(i, strv) {
69
70 (*p)->ignore = false;
71
72 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
73 (*p)->ignore = true;
74 (*i)++;
75 }
76
77 if (!path_is_absolute(*i))
78 return -EINVAL;
79
80 (*p)->path = *i;
81 (*p)->mode = mode;
82 (*p)++;
83 }
84
85 return 0;
86 }
87
88 static int mount_path_compare(const void *a, const void *b) {
89 const BindMount *p = a, *q = b;
90
91 if (path_equal(p->path, q->path)) {
92
93 /* If the paths are equal, check the mode */
94 if (p->mode < q->mode)
95 return -1;
96
97 if (p->mode > q->mode)
98 return 1;
99
100 return 0;
101 }
102
103 /* If the paths are not equal, then order prefixes first */
104 if (path_startswith(p->path, q->path))
105 return 1;
106
107 if (path_startswith(q->path, p->path))
108 return -1;
109
110 return 0;
111 }
112
113 static void drop_duplicates(BindMount *m, unsigned *n) {
114 BindMount *f, *t, *previous;
115
116 assert(m);
117 assert(n);
118
119 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
120
121 /* The first one wins */
122 if (previous && path_equal(f->path, previous->path))
123 continue;
124
125 t->path = f->path;
126 t->mode = f->mode;
127
128 previous = t;
129
130 t++;
131 }
132
133 *n = t - m;
134 }
135
136 static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
138 "/dev/null\0"
139 "/dev/zero\0"
140 "/dev/full\0"
141 "/dev/random\0"
142 "/dev/urandom\0"
143 "/dev/tty\0";
144
145 struct stat devnodes_stat[6] = {};
146 const char *d;
147 unsigned n = 0;
148 _cleanup_umask_ mode_t u;
149 int r;
150
151 assert(m);
152
153 u = umask(0000);
154
155 /* First: record device mode_t and dev_t */
156 NULSTR_FOREACH(d, devnodes) {
157 r = stat(d, &devnodes_stat[n]);
158 if (r < 0) {
159 if (errno != ENOENT)
160 return -errno;
161 } else {
162 if (!S_ISBLK(devnodes_stat[n].st_mode) &&
163 !S_ISCHR(devnodes_stat[n].st_mode))
164 return -EINVAL;
165 }
166
167 n++;
168 }
169
170 assert(n == ELEMENTSOF(devnodes_stat));
171
172 r = mount("tmpfs", "/dev", "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755");
173 if (r < 0)
174 return m->ignore ? 0 : -errno;
175
176
177 mkdir_p("/dev/pts", 0755);
178
179 r = mount("devpts", "/dev/pts", "devpts", MS_NOSUID|MS_NOEXEC, "newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID));
180 if (r < 0)
181 return m->ignore ? 0 : -errno;
182
183 mkdir_p("/dev/shm", 0755);
184
185 r = mount("tmpfs", "/dev/shm", "tmpfs", MS_NOSUID|MS_NODEV|MS_STRICTATIME, "mode=1777");
186 if (r < 0)
187 return m->ignore ? 0 : -errno;
188
189 /* Second: actually create it */
190 n = 0;
191 NULSTR_FOREACH(d, devnodes) {
192 if (devnodes_stat[n].st_rdev == 0)
193 continue;
194
195 r = mknod(d, devnodes_stat[n].st_mode, devnodes_stat[n].st_rdev);
196 if (r < 0)
197 return m->ignore ? 0 : -errno;
198
199 n++;
200 }
201
202 dev_setup(NULL);
203
204 return 0;
205 }
206
207 static int apply_mount(
208 BindMount *m,
209 const char *tmp_dir,
210 const char *var_tmp_dir) {
211
212 const char *what;
213 int r;
214
215 assert(m);
216
217 switch (m->mode) {
218
219 case PRIVATE_DEV:
220 return mount_dev(m);
221
222 case INACCESSIBLE:
223 what = "/run/systemd/inaccessible";
224 break;
225
226 case READONLY:
227 case READWRITE:
228 what = m->path;
229 break;
230
231 case PRIVATE_TMP:
232 what = tmp_dir;
233 break;
234
235 case PRIVATE_VAR_TMP:
236 what = var_tmp_dir;
237 break;
238
239 default:
240 assert_not_reached("Unknown mode");
241 }
242
243 assert(what);
244
245 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
246 if (r >= 0)
247 log_debug("Successfully mounted %s to %s", what, m->path);
248 else if (m->ignore && errno == ENOENT)
249 r = 0;
250
251 return r;
252 }
253
254 static int make_read_only(BindMount *m) {
255 int r;
256
257 assert(m);
258
259 if (m->mode != INACCESSIBLE && m->mode != READONLY)
260 return 0;
261
262 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
263 if (r < 0 && !(m->ignore && errno == ENOENT))
264 return -errno;
265
266 return 0;
267 }
268
269 int setup_namespace(
270 char** read_write_dirs,
271 char** read_only_dirs,
272 char** inaccessible_dirs,
273 char* tmp_dir,
274 char* var_tmp_dir,
275 bool private_dev,
276 unsigned mount_flags) {
277
278 BindMount *m, *mounts = NULL;
279 unsigned n;
280 int r = 0;
281
282 if (mount_flags == 0)
283 mount_flags = MS_SHARED;
284
285 if (unshare(CLONE_NEWNS) < 0)
286 return -errno;
287
288 n = !!tmp_dir + !!var_tmp_dir +
289 strv_length(read_write_dirs) +
290 strv_length(read_only_dirs) +
291 strv_length(inaccessible_dirs) +
292 private_dev;
293
294 if (n > 0) {
295 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
296 r = append_mounts(&m, read_write_dirs, READWRITE);
297 if (r < 0)
298 return r;
299
300 r = append_mounts(&m, read_only_dirs, READONLY);
301 if (r < 0)
302 return r;
303
304 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
305 if (r < 0)
306 return r;
307
308 if (tmp_dir) {
309 m->path = "/tmp";
310 m->mode = PRIVATE_TMP;
311 m++;
312 }
313
314 if (var_tmp_dir) {
315 m->path = "/var/tmp";
316 m->mode = PRIVATE_VAR_TMP;
317 m++;
318 }
319
320 if (private_dev) {
321 m->path = "/dev";
322 m->mode = PRIVATE_DEV;
323 m++;
324 }
325
326 assert(mounts + n == m);
327
328 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
329 drop_duplicates(mounts, &n);
330 }
331
332 /* Remount / as SLAVE so that nothing now mounted in the namespace
333 shows up in the parent */
334 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
335 return -errno;
336
337 for (m = mounts; m < mounts + n; ++m) {
338 r = apply_mount(m, tmp_dir, var_tmp_dir);
339 if (r < 0)
340 goto fail;
341 }
342
343 for (m = mounts; m < mounts + n; ++m) {
344 r = make_read_only(m);
345 if (r < 0)
346 goto fail;
347 }
348
349 /* Remount / as the desired mode */
350 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
351 r = -errno;
352 goto fail;
353 }
354
355 return 0;
356
357 fail:
358 for (m = mounts; m < mounts + n; ++m)
359 if (m->done)
360 umount2(m->path, MNT_DETACH);
361
362 return r;
363 }
364
365 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
366 _cleanup_free_ char *x = NULL;
367 char bid[SD_ID128_STRING_MAX];
368 sd_id128_t boot_id;
369 int r;
370
371 assert(id);
372 assert(prefix);
373 assert(path);
374
375 /* We include the boot id in the directory so that after a
376 * reboot we can easily identify obsolete directories. */
377
378 r = sd_id128_get_boot(&boot_id);
379 if (r < 0)
380 return r;
381
382 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
383 if (!x)
384 return -ENOMEM;
385
386 RUN_WITH_UMASK(0077)
387 if (!mkdtemp(x))
388 return -errno;
389
390 RUN_WITH_UMASK(0000) {
391 char *y;
392
393 y = strappenda(x, "/tmp");
394
395 if (mkdir(y, 0777 | S_ISVTX) < 0)
396 return -errno;
397 }
398
399 *path = x;
400 x = NULL;
401
402 return 0;
403 }
404
/* Creates the pair of private temporary directories for 'id': one below
 * /tmp and one below /var/tmp.
 *
 * id:          identifier embedded in the directory names.
 * tmp_dir:     on success receives the allocated path of the directory
 *              created below /tmp.
 * var_tmp_dir: on success receives the allocated path of the directory
 *              created below /var/tmp.
 *
 * Returns 0 on success and a negative errno-style code on failure. If the
 * second directory cannot be created, the first one is removed again and
 * neither output is written. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second directory failed: roll back the first one, inner
         * directory first. */
        {
                char *inner;

                inner = strappenda(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
434
435 int setup_netns(int netns_storage_socket[2]) {
436 _cleanup_close_ int netns = -1;
437 union {
438 struct cmsghdr cmsghdr;
439 uint8_t buf[CMSG_SPACE(sizeof(int))];
440 } control = {};
441 struct msghdr mh = {
442 .msg_control = &control,
443 .msg_controllen = sizeof(control),
444 };
445 struct cmsghdr *cmsg;
446 int r;
447
448 assert(netns_storage_socket);
449 assert(netns_storage_socket[0] >= 0);
450 assert(netns_storage_socket[1] >= 0);
451
452 /* We use the passed socketpair as a storage buffer for our
453 * namespace reference fd. Whatever process runs this first
454 * shall create a new namespace, all others should just join
455 * it. To serialize that we use a file lock on the socket
456 * pair.
457 *
458 * It's a bit crazy, but hey, works great! */
459
460 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
461 return -errno;
462
463 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
464 if (errno != EAGAIN) {
465 r = -errno;
466 goto fail;
467 }
468
469 /* Nothing stored yet, so let's create a new namespace */
470
471 if (unshare(CLONE_NEWNET) < 0) {
472 r = -errno;
473 goto fail;
474 }
475
476 loopback_setup();
477
478 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
479 if (netns < 0) {
480 r = -errno;
481 goto fail;
482 }
483
484 r = 1;
485 } else {
486 /* Yay, found something, so let's join the namespace */
487
488 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
489 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
490 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
491 netns = *(int*) CMSG_DATA(cmsg);
492 }
493 }
494
495 if (setns(netns, CLONE_NEWNET) < 0) {
496 r = -errno;
497 goto fail;
498 }
499
500 r = 0;
501 }
502
503 cmsg = CMSG_FIRSTHDR(&mh);
504 cmsg->cmsg_level = SOL_SOCKET;
505 cmsg->cmsg_type = SCM_RIGHTS;
506 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
507 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
508 mh.msg_controllen = cmsg->cmsg_len;
509
510 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
511 r = -errno;
512 goto fail;
513 }
514
515 fail:
516 lockf(netns_storage_socket[0], F_ULOCK, 0);
517
518 return r;
519 }