]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
Merge pull request #11472 from poettering/sd-bus-ref-tweak
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <sys/mount.h>
4 #include <linux/magic.h>
5
6 #include "alloc-util.h"
7 #include "escape.h"
8 #include "fd-util.h"
9 #include "fs-util.h"
10 #include "label.h"
11 #include "mkdir.h"
12 #include "mount-util.h"
13 #include "mountpoint-util.h"
14 #include "nspawn-mount.h"
15 #include "parse-util.h"
16 #include "path-util.h"
17 #include "rm-rf.h"
18 #include "set.h"
19 #include "stat-util.h"
20 #include "string-util.h"
21 #include "strv.h"
22 #include "tmpfile-util.h"
23 #include "user-util.h"
24 #include "util.h"
25
26 CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
27 CustomMount *c, *ret;
28
29 assert(l);
30 assert(n);
31 assert(t >= 0);
32 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
33
34 c = reallocarray(*l, *n + 1, sizeof(CustomMount));
35 if (!c)
36 return NULL;
37
38 *l = c;
39 ret = *l + *n;
40 (*n)++;
41
42 *ret = (CustomMount) { .type = t };
43
44 return ret;
45 }
46
47 void custom_mount_free_all(CustomMount *l, size_t n) {
48 size_t i;
49
50 for (i = 0; i < n; i++) {
51 CustomMount *m = l + i;
52
53 free(m->source);
54 free(m->destination);
55 free(m->options);
56
57 if (m->work_dir) {
58 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
59 free(m->work_dir);
60 }
61
62 if (m->rm_rf_tmpdir) {
63 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
64 free(m->rm_rf_tmpdir);
65 }
66
67 strv_free(m->lower);
68 }
69
70 free(l);
71 }
72
73 static int custom_mount_compare(const CustomMount *a, const CustomMount *b) {
74 int r;
75
76 r = path_compare(a->destination, b->destination);
77 if (r != 0)
78 return r;
79
80 return CMP(a->type, b->type);
81 }
82
/* Returns true if 'p' is an absolute path, optionally preceded by a single '+' (the prefix that
 * resolve_source_path() interprets as "relative to the container root"). */
static bool source_path_is_valid(const char *p) {
        assert(p);

        /* Skip the optional '+' marker before checking the path proper */
        return path_is_absolute(p[0] == '+' ? p + 1 : p);
}
91
/* Turns a user-specified source path into a newly allocated path: a path starting with '+' has the '+'
 * dropped and is prefixed with 'dest', any other path is duplicated as is. Returns NULL if 'source' is NULL
 * or if the underlying allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {

        if (!source)
                return NULL;

        if (source[0] != '+')
                return strdup(source);

        return prefix_root(dest, source + 1);
}
102
103 int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
104 size_t i;
105 int r;
106
107 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
108 * parent process, so that we know the temporary directories to remove on exit before we fork off the
109 * children. */
110
111 assert(l || n == 0);
112
113 /* Order the custom mounts, and make sure we have a working directory */
114 typesafe_qsort(l, n, custom_mount_compare);
115
116 for (i = 0; i < n; i++) {
117 CustomMount *m = l + i;
118
119 if (m->source) {
120 char *s;
121
122 s = resolve_source_path(dest, m->source);
123 if (!s)
124 return log_oom();
125
126 free_and_replace(m->source, s);
127 } else {
128 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
129
130 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
131 if (!m->rm_rf_tmpdir)
132 return log_oom();
133
134 if (!mkdtemp(m->rm_rf_tmpdir)) {
135 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
136 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
137 }
138
139 m->source = strjoin(m->rm_rf_tmpdir, "/src");
140 if (!m->source)
141 return log_oom();
142
143 if (mkdir(m->source, 0755) < 0)
144 return log_error_errno(errno, "Failed to create %s: %m", m->source);
145 }
146
147 if (m->type == CUSTOM_MOUNT_OVERLAY) {
148 char **j;
149
150 STRV_FOREACH(j, m->lower) {
151 char *s;
152
153 s = resolve_source_path(dest, *j);
154 if (!s)
155 return log_oom();
156
157 free_and_replace(*j, s);
158 }
159
160 if (m->work_dir) {
161 char *s;
162
163 s = resolve_source_path(dest, m->work_dir);
164 if (!s)
165 return log_oom();
166
167 free_and_replace(m->work_dir, s);
168 } else {
169 assert(m->source);
170
171 r = tempfn_random(m->source, NULL, &m->work_dir);
172 if (r < 0)
173 return log_error_errno(r, "Failed to acquire working directory: %m");
174 }
175
176 (void) mkdir_label(m->work_dir, 0700);
177 }
178 }
179
180 return 0;
181 }
182
183 int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
184 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
185 const char *p = s;
186 CustomMount *m;
187 int r;
188
189 assert(l);
190 assert(n);
191
192 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
193 if (r < 0)
194 return r;
195 if (r == 0)
196 return -EINVAL;
197 if (r == 1) {
198 destination = strdup(source[0] == '+' ? source+1 : source);
199 if (!destination)
200 return -ENOMEM;
201 }
202 if (r == 2 && !isempty(p)) {
203 opts = strdup(p);
204 if (!opts)
205 return -ENOMEM;
206 }
207
208 if (isempty(source))
209 source = NULL;
210 else if (!source_path_is_valid(source))
211 return -EINVAL;
212
213 if (!path_is_absolute(destination))
214 return -EINVAL;
215 if (empty_or_root(destination))
216 return -EINVAL;
217
218 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
219 if (!m)
220 return -ENOMEM;
221
222 m->source = source;
223 m->destination = destination;
224 m->read_only = read_only;
225 m->options = opts;
226
227 source = destination = opts = NULL;
228 return 0;
229 }
230
231 int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
232 _cleanup_free_ char *path = NULL, *opts = NULL;
233 const char *p = s;
234 CustomMount *m;
235 int r;
236
237 assert(l);
238 assert(n);
239 assert(s);
240
241 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
242 if (r < 0)
243 return r;
244 if (r == 0)
245 return -EINVAL;
246
247 if (isempty(p))
248 opts = strdup("mode=0755");
249 else
250 opts = strdup(p);
251 if (!opts)
252 return -ENOMEM;
253
254 if (!path_is_absolute(path))
255 return -EINVAL;
256 if (empty_or_root(path))
257 return -EINVAL;
258
259 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
260 if (!m)
261 return -ENOMEM;
262
263 m->destination = TAKE_PTR(path);
264 m->options = TAKE_PTR(opts);
265
266 return 0;
267 }
268
269 int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
270 _cleanup_free_ char *upper = NULL, *destination = NULL;
271 _cleanup_strv_free_ char **lower = NULL;
272 CustomMount *m;
273 int k;
274
275 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
276 if (k < 0)
277 return k;
278 if (k < 2)
279 return -EADDRNOTAVAIL;
280 if (k == 2) {
281 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
282 * we'll also define the destination mount point the same as the upper. */
283
284 if (!source_path_is_valid(lower[0]) ||
285 !source_path_is_valid(lower[1]))
286 return -EINVAL;
287
288 upper = TAKE_PTR(lower[1]);
289
290 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
291 if (!destination)
292 return -ENOMEM;
293 } else {
294 char **i;
295
296 /* If more than two parameters are specified, the last one is the destination, the second to last one
297 * the "upper", and all before that the "lower" directories. */
298
299 destination = lower[k - 1];
300 upper = TAKE_PTR(lower[k - 2]);
301
302 STRV_FOREACH(i, lower)
303 if (!source_path_is_valid(*i))
304 return -EINVAL;
305
306 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
307 * in /var/tmp */
308 if (isempty(upper))
309 upper = NULL;
310 else if (!source_path_is_valid(upper))
311 return -EINVAL;
312
313 if (!path_is_absolute(destination))
314 return -EINVAL;
315 }
316
317 if (empty_or_root(destination))
318 return -EINVAL;
319
320 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
321 if (!m)
322 return -ENOMEM;
323
324 m->destination = TAKE_PTR(destination);
325 m->source = TAKE_PTR(upper);
326 m->lower = TAKE_PTR(lower);
327 m->read_only = read_only;
328
329 return 0;
330 }
331
332 int tmpfs_patch_options(
333 const char *options,
334 uid_t uid_shift,
335 const char *selinux_apifs_context,
336 char **ret) {
337
338 char *buf = NULL;
339
340 if (uid_shift != UID_INVALID) {
341 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
342 strempty(options), options ? "," : "",
343 uid_shift, uid_shift) < 0)
344 return -ENOMEM;
345
346 options = buf;
347 }
348
349 #if HAVE_SELINUX
350 if (selinux_apifs_context) {
351 char *t;
352
353 t = strjoin(strempty(options), options ? "," : "",
354 "context=\"", selinux_apifs_context, "\"");
355 free(buf);
356 if (!t)
357 return -ENOMEM;
358
359 buf = t;
360 }
361 #endif
362
363 if (!buf && options) {
364 buf = strdup(options);
365 if (!buf)
366 return -ENOMEM;
367 }
368 *ret = buf;
369
370 return !!buf;
371 }
372
373 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
374 const char *full, *top, *x;
375 int r;
376 unsigned long extra_flags = 0;
377
378 top = prefix_roota(dest, "/sys");
379 r = path_is_fs_type(top, SYSFS_MAGIC);
380 if (r < 0)
381 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
382 /* /sys might already be mounted as sysfs by the outer child in the
383 * !netns case. In this case, it's all good. Don't touch it because we
384 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
385 */
386 if (r > 0)
387 return 0;
388
389 full = prefix_roota(top, "/full");
390
391 (void) mkdir(full, 0755);
392
393 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
394 extra_flags |= MS_RDONLY;
395
396 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
397 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
398 if (r < 0)
399 return r;
400
401 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
402 _cleanup_free_ char *from = NULL, *to = NULL;
403
404 from = prefix_root(full, x);
405 if (!from)
406 return log_oom();
407
408 to = prefix_root(top, x);
409 if (!to)
410 return log_oom();
411
412 (void) mkdir(to, 0755);
413
414 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
415 if (r < 0)
416 return r;
417
418 r = mount_verbose(LOG_ERR, NULL, to, NULL,
419 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
420 if (r < 0)
421 return r;
422 }
423
424 r = umount_verbose(full);
425 if (r < 0)
426 return r;
427
428 if (rmdir(full) < 0)
429 return log_error_errno(errno, "Failed to remove %s: %m", full);
430
431 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
432 * remount /sys read-only.
433 */
434 x = prefix_roota(top, "/fs/cgroup");
435 (void) mkdir_p(x, 0755);
436
437 return mount_verbose(LOG_ERR, NULL, top, NULL,
438 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
439 }
440
441 static int mkdir_userns(const char *path, mode_t mode, uid_t uid_shift) {
442 int r;
443
444 assert(path);
445
446 r = mkdir_errno_wrapper(path, mode);
447 if (r < 0 && r != -EEXIST)
448 return r;
449
450 if (uid_shift == UID_INVALID)
451 return 0;
452
453 if (lchown(path, uid_shift, uid_shift) < 0)
454 return -errno;
455
456 return 0;
457 }
458
/* Like mkdir_userns(), but also creates all missing parent directories of 'path'. If 'prefix' is set, 'path'
 * must be located below it (otherwise -ENOTDIR is returned), and parents that are part of 'prefix' are
 * not touched. Returns 0 on success, a negative errno-style value on failure. */
static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, uid_t uid_shift) {
        const char *cursor, *component_end;
        int r;

        assert(path);

        if (prefix && !path_startswith(path, prefix))
                return -ENOTDIR;

        /* create every parent directory in the path, except the last component */
        cursor = path + strspn(path, "/");
        for (;;) {
                char parent[strlen(path) + 1];
                size_t parent_len;

                /* Advance over the current component and the slashes following it */
                component_end = cursor + strcspn(cursor, "/");
                cursor = component_end + strspn(component_end, "/");

                /* Is this the last component? If so, then we're done */
                if (*cursor == 0)
                        break;

                parent_len = component_end - path;
                memcpy(parent, path, parent_len);
                parent[parent_len] = 0;

                /* Parents that lie within the prefix are left alone */
                if (prefix && path_startswith(prefix, parent))
                        continue;

                r = mkdir_userns(parent, mode, uid_shift);
                if (r < 0)
                        return r;
        }

        return mkdir_userns(path, mode, uid_shift);
}
493
494 int mount_all(const char *dest,
495 MountSettingsMask mount_settings,
496 uid_t uid_shift,
497 const char *selinux_apifs_context) {
498
499 #define PROC_INACCESSIBLE(path) \
500 { NULL, (path), NULL, NULL, MS_BIND, \
501 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
502 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
503 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
504
505 #define PROC_READ_ONLY(path) \
506 { (path), (path), NULL, NULL, MS_BIND, \
507 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
508 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
509 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
510
511 typedef struct MountPoint {
512 const char *what;
513 const char *where;
514 const char *type;
515 const char *options;
516 unsigned long flags;
517 MountSettingsMask mount_settings;
518 } MountPoint;
519
520 static const MountPoint mount_table[] = {
521 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
522 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
523 MOUNT_FATAL|MOUNT_IN_USERNS },
524
525 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
526 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
527
528 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
529 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
530
531 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
532 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
533
534 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
535 * internals or the host's execution environment to the container */
536 PROC_INACCESSIBLE("/proc/kallsyms"),
537 PROC_INACCESSIBLE("/proc/kcore"),
538 PROC_INACCESSIBLE("/proc/keys"),
539 PROC_INACCESSIBLE("/proc/sysrq-trigger"),
540 PROC_INACCESSIBLE("/proc/timer_list"),
541
542 /* Make these directories read-only to container payloads: they show hardware information, and in some
543 * cases contain tunables the container really shouldn't have access to. */
544 PROC_READ_ONLY("/proc/acpi"),
545 PROC_READ_ONLY("/proc/apm"),
546 PROC_READ_ONLY("/proc/asound"),
547 PROC_READ_ONLY("/proc/bus"),
548 PROC_READ_ONLY("/proc/fs"),
549 PROC_READ_ONLY("/proc/irq"),
550 PROC_READ_ONLY("/proc/scsi"),
551
552 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
554 MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP },
555 { "tmpfs", "/sys", "tmpfs", "mode=555", MS_NOSUID|MS_NOEXEC|MS_NODEV,
556 MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
557 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
558 MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
559 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
560 MOUNT_FATAL }, /* skipped if above was mounted */
561 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
562 MOUNT_FATAL },
563 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
564 MOUNT_FATAL },
565 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
566 MOUNT_FATAL },
567 { "mqueue", "/dev/mqueue", "mqueue", NULL, 0,
568 MOUNT_FATAL },
569
570 #if HAVE_SELINUX
571 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
572 0 }, /* Bind mount first */
573 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
574 0 }, /* Then, make it r/o */
575 #endif
576 };
577
578 _cleanup_(unlink_and_freep) char *inaccessible = NULL;
579 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
580 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
581 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
582 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
583 bool tmpfs_tmp = (mount_settings & MOUNT_APPLY_TMPFS_TMP);
584 size_t k;
585 int r;
586
587 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
588 _cleanup_free_ char *where = NULL, *options = NULL;
589 const char *o, *what;
590 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
591
592 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
593 continue;
594
595 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
596 continue;
597
598 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
599 continue;
600
601 if (!tmpfs_tmp && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_TMPFS_TMP))
602 continue;
603
604 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
605 if (r < 0)
606 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
607
608 if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
609
610 if (!inaccessible) {
611 _cleanup_free_ char *np = NULL;
612
613 r = tempfn_random_child(NULL, "inaccessible", &np);
614 if (r < 0)
615 return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
616
617 r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
618 if (r < 0)
619 return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
620
621 inaccessible = TAKE_PTR(np);
622 }
623
624 what = inaccessible;
625 } else
626 what = mount_table[k].what;
627
628 r = path_is_mount_point(where, NULL, 0);
629 if (r < 0 && r != -ENOENT)
630 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
631
632 /* Skip this entry if it is not a remount. */
633 if (what && r > 0)
634 continue;
635
636 r = mkdir_userns_p(dest, where, 0755, (use_userns && !in_userns) ? uid_shift : UID_INVALID);
637 if (r < 0 && r != -EEXIST) {
638 if (fatal && r != -EROFS)
639 return log_error_errno(r, "Failed to create directory %s: %m", where);
640
641 log_debug_errno(r, "Failed to create directory %s: %m", where);
642 /* If we failed mkdir() or chown() due to the root
643 * directory being read only, attempt to mount this fs
644 * anyway and let mount_verbose log any errors */
645 if (r != -EROFS)
646 continue;
647 }
648
649 o = mount_table[k].options;
650 if (streq_ptr(mount_table[k].type, "tmpfs")) {
651 r = tmpfs_patch_options(o, in_userns ? 0 : uid_shift, selinux_apifs_context, &options);
652 if (r < 0)
653 return log_oom();
654 if (r > 0)
655 o = options;
656 }
657
658 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
659 what,
660 where,
661 mount_table[k].type,
662 mount_table[k].flags,
663 o);
664 if (r < 0 && fatal)
665 return r;
666 }
667
668 return 0;
669 }
670
671 static int mount_bind(const char *dest, CustomMount *m) {
672
673 _cleanup_free_ char *where = NULL;
674 struct stat source_st, dest_st;
675 int r;
676
677 assert(dest);
678 assert(m);
679
680 if (stat(m->source, &source_st) < 0)
681 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
682
683 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
684 if (r < 0)
685 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
686 if (r > 0) { /* Path exists already? */
687
688 if (stat(where, &dest_st) < 0)
689 return log_error_errno(errno, "Failed to stat %s: %m", where);
690
691 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
692 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
693 "Cannot bind mount directory %s on file %s.",
694 m->source, where);
695
696 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode))
697 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
698 "Cannot bind mount file %s on directory %s.",
699 m->source, where);
700
701 } else { /* Path doesn't exist yet? */
702 r = mkdir_parents_label(where, 0755);
703 if (r < 0)
704 return log_error_errno(r, "Failed to make parents of %s: %m", where);
705
706 /* Create the mount point. Any non-directory file can be
707 * mounted on any non-directory file (regular, fifo, socket,
708 * char, block).
709 */
710 if (S_ISDIR(source_st.st_mode))
711 r = mkdir_label(where, 0755);
712 else
713 r = touch(where);
714 if (r < 0)
715 return log_error_errno(r, "Failed to create mount point %s: %m", where);
716
717 }
718
719 r = mount_verbose(LOG_ERR, m->source, where, NULL, MS_BIND | MS_REC, m->options);
720 if (r < 0)
721 return r;
722
723 if (m->read_only) {
724 r = bind_remount_recursive(where, true, NULL);
725 if (r < 0)
726 return log_error_errno(r, "Read-only bind mount failed: %m");
727 }
728
729 return 0;
730 }
731
732 static int mount_tmpfs(
733 const char *dest,
734 CustomMount *m,
735 bool userns, uid_t uid_shift, uid_t uid_range,
736 const char *selinux_apifs_context) {
737
738 const char *options;
739 _cleanup_free_ char *buf = NULL, *where = NULL;
740 int r;
741
742 assert(dest);
743 assert(m);
744
745 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
746 if (r < 0)
747 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
748 if (r == 0) { /* Doesn't exist yet? */
749 r = mkdir_p_label(where, 0755);
750 if (r < 0)
751 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
752 }
753
754 r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
755 if (r < 0)
756 return log_oom();
757 options = r > 0 ? buf : m->options;
758
759 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
760 }
761
762 static char *joined_and_escaped_lower_dirs(char **lower) {
763 _cleanup_strv_free_ char **sv = NULL;
764
765 sv = strv_copy(lower);
766 if (!sv)
767 return NULL;
768
769 strv_reverse(sv);
770
771 if (!strv_shell_escape(sv, ",:"))
772 return NULL;
773
774 return strv_join(sv, ":");
775 }
776
777 static int mount_overlay(const char *dest, CustomMount *m) {
778
779 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
780 const char *options;
781 int r;
782
783 assert(dest);
784 assert(m);
785
786 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
787 if (r < 0)
788 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
789 if (r == 0) { /* Doesn't exist yet? */
790 r = mkdir_label(where, 0755);
791 if (r < 0)
792 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
793 }
794
795 (void) mkdir_p_label(m->source, 0755);
796
797 lower = joined_and_escaped_lower_dirs(m->lower);
798 if (!lower)
799 return log_oom();
800
801 escaped_source = shell_escape(m->source, ",:");
802 if (!escaped_source)
803 return log_oom();
804
805 if (m->read_only)
806 options = strjoina("lowerdir=", escaped_source, ":", lower);
807 else {
808 _cleanup_free_ char *escaped_work_dir = NULL;
809
810 escaped_work_dir = shell_escape(m->work_dir, ",:");
811 if (!escaped_work_dir)
812 return log_oom();
813
814 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
815 }
816
817 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
818 }
819
820 int mount_custom(
821 const char *dest,
822 CustomMount *mounts, size_t n,
823 bool userns, uid_t uid_shift, uid_t uid_range,
824 const char *selinux_apifs_context) {
825
826 size_t i;
827 int r;
828
829 assert(dest);
830
831 for (i = 0; i < n; i++) {
832 CustomMount *m = mounts + i;
833
834 switch (m->type) {
835
836 case CUSTOM_MOUNT_BIND:
837 r = mount_bind(dest, m);
838 break;
839
840 case CUSTOM_MOUNT_TMPFS:
841 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
842 break;
843
844 case CUSTOM_MOUNT_OVERLAY:
845 r = mount_overlay(dest, m);
846 break;
847
848 default:
849 assert_not_reached("Unknown custom mount type");
850 }
851
852 if (r < 0)
853 return r;
854 }
855
856 return 0;
857 }
858
859 static int setup_volatile_state(
860 const char *directory,
861 bool userns, uid_t uid_shift, uid_t uid_range,
862 const char *selinux_apifs_context) {
863
864 _cleanup_free_ char *buf = NULL;
865 const char *p, *options;
866 int r;
867
868 assert(directory);
869
870 /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
871
872 r = bind_remount_recursive(directory, true, NULL);
873 if (r < 0)
874 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
875
876 p = prefix_roota(directory, "/var");
877 r = mkdir(p, 0755);
878 if (r < 0 && errno != EEXIST)
879 return log_error_errno(errno, "Failed to create %s: %m", directory);
880
881 options = "mode=755";
882 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
883 if (r < 0)
884 return log_oom();
885 if (r > 0)
886 options = buf;
887
888 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
889 }
890
891 static int setup_volatile_yes(
892 const char *directory,
893 bool userns, uid_t uid_shift, uid_t uid_range,
894 const char *selinux_apifs_context) {
895
896 bool tmpfs_mounted = false, bind_mounted = false;
897 char template[] = "/tmp/nspawn-volatile-XXXXXX";
898 _cleanup_free_ char *buf = NULL;
899 const char *f, *t, *options;
900 int r;
901
902 assert(directory);
903
904 /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and that
905 read-only. */
906
907 if (!mkdtemp(template))
908 return log_error_errno(errno, "Failed to create temporary directory: %m");
909
910 options = "mode=755";
911 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
912 if (r < 0)
913 goto fail;
914 if (r > 0)
915 options = buf;
916
917 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
918 if (r < 0)
919 goto fail;
920
921 tmpfs_mounted = true;
922
923 f = prefix_roota(directory, "/usr");
924 t = prefix_roota(template, "/usr");
925
926 r = mkdir(t, 0755);
927 if (r < 0 && errno != EEXIST) {
928 r = log_error_errno(errno, "Failed to create %s: %m", t);
929 goto fail;
930 }
931
932 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
933 if (r < 0)
934 goto fail;
935
936 bind_mounted = true;
937
938 r = bind_remount_recursive(t, true, NULL);
939 if (r < 0) {
940 log_error_errno(r, "Failed to remount %s read-only: %m", t);
941 goto fail;
942 }
943
944 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
945 if (r < 0)
946 goto fail;
947
948 (void) rmdir(template);
949
950 return 0;
951
952 fail:
953 if (bind_mounted)
954 (void) umount_verbose(t);
955
956 if (tmpfs_mounted)
957 (void) umount_verbose(template);
958 (void) rmdir(template);
959 return r;
960 }
961
962 static int setup_volatile_overlay(
963 const char *directory,
964 bool userns, uid_t uid_shift, uid_t uid_range,
965 const char *selinux_apifs_context) {
966
967 _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
968 char template[] = "/tmp/nspawn-volatile-XXXXXX";
969 const char *upper, *work, *options;
970 bool tmpfs_mounted = false;
971 int r;
972
973 assert(directory);
974
975 /* --volatile=overlay means we mount an overlayfs to the root dir. */
976
977 if (!mkdtemp(template))
978 return log_error_errno(errno, "Failed to create temporary directory: %m");
979
980 options = "mode=755";
981 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
982 if (r < 0)
983 goto finish;
984 if (r > 0)
985 options = buf;
986
987 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
988 if (r < 0)
989 goto finish;
990
991 tmpfs_mounted = true;
992
993 upper = strjoina(template, "/upper");
994 work = strjoina(template, "/work");
995
996 if (mkdir(upper, 0755) < 0) {
997 r = log_error_errno(errno, "Failed to create %s: %m", upper);
998 goto finish;
999 }
1000 if (mkdir(work, 0755) < 0) {
1001 r = log_error_errno(errno, "Failed to create %s: %m", work);
1002 goto finish;
1003 }
1004
1005 /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
1006 * that the kernel allows us to do that without going through some mount point rearrangements. */
1007
1008 escaped_directory = shell_escape(directory, ",:");
1009 escaped_upper = shell_escape(upper, ",:");
1010 escaped_work = shell_escape(work, ",:");
1011 if (!escaped_directory || !escaped_upper || !escaped_work) {
1012 r = -ENOMEM;
1013 goto finish;
1014 }
1015
1016 options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work);
1017 r = mount_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options);
1018
1019 finish:
1020 if (tmpfs_mounted)
1021 (void) umount_verbose(template);
1022
1023 (void) rmdir(template);
1024 return r;
1025 }
1026
1027 int setup_volatile_mode(
1028 const char *directory,
1029 VolatileMode mode,
1030 bool userns, uid_t uid_shift, uid_t uid_range,
1031 const char *selinux_apifs_context) {
1032
1033 switch (mode) {
1034
1035 case VOLATILE_YES:
1036 return setup_volatile_yes(directory, userns, uid_shift, uid_range, selinux_apifs_context);
1037
1038 case VOLATILE_STATE:
1039 return setup_volatile_state(directory, userns, uid_shift, uid_range, selinux_apifs_context);
1040
1041 case VOLATILE_OVERLAY:
1042 return setup_volatile_overlay(directory, userns, uid_shift, uid_range, selinux_apifs_context);
1043
1044 default:
1045 return 0;
1046 }
1047 }
1048
1049 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1050 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1051 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1052 const char *p = s;
1053 int r;
1054
1055 assert(pivot_root_new);
1056 assert(pivot_root_old);
1057
1058 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1059 if (r < 0)
1060 return r;
1061 if (r == 0)
1062 return -EINVAL;
1063
1064 if (isempty(p))
1065 root_old = NULL;
1066 else {
1067 root_old = strdup(p);
1068 if (!root_old)
1069 return -ENOMEM;
1070 }
1071
1072 if (!path_is_absolute(root_new))
1073 return -EINVAL;
1074 if (root_old && !path_is_absolute(root_old))
1075 return -EINVAL;
1076
1077 free_and_replace(*pivot_root_new, root_new);
1078 free_and_replace(*pivot_root_old, root_old);
1079
1080 return 0;
1081 }
1082
1083 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1084 _cleanup_free_ char *directory_pivot_root_new = NULL;
1085 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1086 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1087 bool remove_pivot_tmp = false;
1088 int r;
1089
1090 assert(directory);
1091
1092 if (!pivot_root_new)
1093 return 0;
1094
1095 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1096 * If pivot_root_old is NULL, the existing / disappears.
1097 * This requires a temporary directory, pivot_tmp, which is
1098 * not a child of either.
1099 *
1100 * This is typically used for OSTree-style containers, where
1101 * the root partition contains several sysroots which could be
1102 * run. Normally, one would be chosen by the bootloader and
1103 * pivoted to / by initramfs.
1104 *
1105 * For example, for an OSTree deployment, pivot_root_new
1106 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1107 * code doesn’t do the /var mount which OSTree expects: use
1108 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1109 *
1110 * So in the OSTree case, we’ll end up with something like:
1111 * - directory = /tmp/nspawn-root-123456
1112 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1113 * - pivot_root_old = /sysroot
1114 * - directory_pivot_root_new =
1115 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1116 * - pivot_tmp = /tmp/nspawn-pivot-123456
1117 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1118 *
1119 * Requires all file systems at directory and below to be mounted
1120 * MS_PRIVATE or MS_SLAVE so they can be moved.
1121 */
1122 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1123
1124 /* Remount directory_pivot_root_new to make it movable. */
1125 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1126 if (r < 0)
1127 goto done;
1128
1129 if (pivot_root_old) {
1130 if (!mkdtemp(pivot_tmp)) {
1131 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1132 goto done;
1133 }
1134
1135 remove_pivot_tmp = true;
1136 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1137
1138 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1139 if (r < 0)
1140 goto done;
1141
1142 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1143 if (r < 0)
1144 goto done;
1145
1146 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1147 if (r < 0)
1148 goto done;
1149 } else {
1150 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1151 if (r < 0)
1152 goto done;
1153 }
1154
1155 done:
1156 if (remove_pivot_tmp)
1157 (void) rmdir(pivot_tmp);
1158
1159 return r;
1160 }