]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
tree-wide: remove Lennart's copyright lines
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <sys/mount.h>
4 #include <linux/magic.h>
5
6 #include "alloc-util.h"
7 #include "escape.h"
8 #include "fd-util.h"
9 #include "fileio.h"
10 #include "fs-util.h"
11 #include "label.h"
12 #include "mkdir.h"
13 #include "mount-util.h"
14 #include "nspawn-mount.h"
15 #include "parse-util.h"
16 #include "path-util.h"
17 #include "rm-rf.h"
18 #include "set.h"
19 #include "stat-util.h"
20 #include "string-util.h"
21 #include "strv.h"
22 #include "user-util.h"
23 #include "util.h"
24
25 CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
26 CustomMount *c, *ret;
27
28 assert(l);
29 assert(n);
30 assert(t >= 0);
31 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
32
33 c = reallocarray(*l, *n + 1, sizeof(CustomMount));
34 if (!c)
35 return NULL;
36
37 *l = c;
38 ret = *l + *n;
39 (*n)++;
40
41 *ret = (CustomMount) { .type = t };
42
43 return ret;
44 }
45
46 void custom_mount_free_all(CustomMount *l, size_t n) {
47 size_t i;
48
49 for (i = 0; i < n; i++) {
50 CustomMount *m = l + i;
51
52 free(m->source);
53 free(m->destination);
54 free(m->options);
55
56 if (m->work_dir) {
57 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
58 free(m->work_dir);
59 }
60
61 if (m->rm_rf_tmpdir) {
62 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
63 free(m->rm_rf_tmpdir);
64 }
65
66 strv_free(m->lower);
67 }
68
69 free(l);
70 }
71
72 static int custom_mount_compare(const void *a, const void *b) {
73 const CustomMount *x = a, *y = b;
74 int r;
75
76 r = path_compare(x->destination, y->destination);
77 if (r != 0)
78 return r;
79
80 if (x->type < y->type)
81 return -1;
82 if (x->type > y->type)
83 return 1;
84
85 return 0;
86 }
87
/* Returns true if p is an absolute path, optionally preceded by a single '+' character. */
static bool source_path_is_valid(const char *p) {
        assert(p);

        return path_is_absolute(p[0] == '+' ? p + 1 : p);
}
96
/* Returns a newly allocated copy of source. If source starts with '+', the remainder is prefixed with dest
 * instead. Returns NULL if source is NULL or if the allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {

        if (!source)
                return NULL;

        return source[0] == '+' ? prefix_root(dest, source + 1) : strdup(source);
}
107
108 int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
109 size_t i;
110 int r;
111
112 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
113 * parent process, so that we know the temporary directories to remove on exit before we fork off the
114 * children. */
115
116 assert(l || n == 0);
117
118 /* Order the custom mounts, and make sure we have a working directory */
119 qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
120
121 for (i = 0; i < n; i++) {
122 CustomMount *m = l + i;
123
124 if (m->source) {
125 char *s;
126
127 s = resolve_source_path(dest, m->source);
128 if (!s)
129 return log_oom();
130
131 free_and_replace(m->source, s);
132 } else {
133 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
134
135 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
136 if (!m->rm_rf_tmpdir)
137 return log_oom();
138
139 if (!mkdtemp(m->rm_rf_tmpdir)) {
140 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
141 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
142 }
143
144 m->source = strjoin(m->rm_rf_tmpdir, "/src");
145 if (!m->source)
146 return log_oom();
147
148 if (mkdir(m->source, 0755) < 0)
149 return log_error_errno(errno, "Failed to create %s: %m", m->source);
150 }
151
152 if (m->type == CUSTOM_MOUNT_OVERLAY) {
153 char **j;
154
155 STRV_FOREACH(j, m->lower) {
156 char *s;
157
158 s = resolve_source_path(dest, *j);
159 if (!s)
160 return log_oom();
161
162 free_and_replace(*j, s);
163 }
164
165 if (m->work_dir) {
166 char *s;
167
168 s = resolve_source_path(dest, m->work_dir);
169 if (!s)
170 return log_oom();
171
172 free_and_replace(m->work_dir, s);
173 } else {
174 assert(m->source);
175
176 r = tempfn_random(m->source, NULL, &m->work_dir);
177 if (r < 0)
178 return log_error_errno(r, "Failed to acquire working directory: %m");
179 }
180
181 (void) mkdir_label(m->work_dir, 0700);
182 }
183 }
184
185 return 0;
186 }
187
188 int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
189 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
190 const char *p = s;
191 CustomMount *m;
192 int r;
193
194 assert(l);
195 assert(n);
196
197 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
198 if (r < 0)
199 return r;
200 if (r == 0)
201 return -EINVAL;
202 if (r == 1) {
203 destination = strdup(source[0] == '+' ? source+1 : source);
204 if (!destination)
205 return -ENOMEM;
206 }
207 if (r == 2 && !isempty(p)) {
208 opts = strdup(p);
209 if (!opts)
210 return -ENOMEM;
211 }
212
213 if (isempty(source))
214 source = NULL;
215 else if (!source_path_is_valid(source))
216 return -EINVAL;
217
218 if (!path_is_absolute(destination))
219 return -EINVAL;
220
221 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
222 if (!m)
223 return -ENOMEM;
224
225 m->source = source;
226 m->destination = destination;
227 m->read_only = read_only;
228 m->options = opts;
229
230 source = destination = opts = NULL;
231 return 0;
232 }
233
234 int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
235 _cleanup_free_ char *path = NULL, *opts = NULL;
236 const char *p = s;
237 CustomMount *m;
238 int r;
239
240 assert(l);
241 assert(n);
242 assert(s);
243
244 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
245 if (r < 0)
246 return r;
247 if (r == 0)
248 return -EINVAL;
249
250 if (isempty(p))
251 opts = strdup("mode=0755");
252 else
253 opts = strdup(p);
254 if (!opts)
255 return -ENOMEM;
256
257 if (!path_is_absolute(path))
258 return -EINVAL;
259
260 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
261 if (!m)
262 return -ENOMEM;
263
264 m->destination = TAKE_PTR(path);
265 m->options = TAKE_PTR(opts);
266
267 return 0;
268 }
269
270 int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
271 _cleanup_free_ char *upper = NULL, *destination = NULL;
272 _cleanup_strv_free_ char **lower = NULL;
273 CustomMount *m;
274 int k;
275
276 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
277 if (k < 0)
278 return k;
279 if (k < 2)
280 return -EADDRNOTAVAIL;
281 if (k == 2) {
282 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
283 * we'll also define the destination mount point the same as the upper. */
284
285 if (!source_path_is_valid(lower[0]) ||
286 !source_path_is_valid(lower[1]))
287 return -EINVAL;
288
289 upper = TAKE_PTR(lower[1]);
290
291 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
292 if (!destination)
293 return -ENOMEM;
294 } else {
295 char **i;
296
297 /* If more than two parameters are specified, the last one is the destination, the second to last one
298 * the "upper", and all before that the "lower" directories. */
299
300 destination = lower[k - 1];
301 upper = TAKE_PTR(lower[k - 2]);
302
303 STRV_FOREACH(i, lower)
304 if (!source_path_is_valid(*i))
305 return -EINVAL;
306
307 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
308 * in /var/tmp */
309 if (isempty(upper))
310 upper = NULL;
311 else if (!source_path_is_valid(upper))
312 return -EINVAL;
313
314 if (!path_is_absolute(destination))
315 return -EINVAL;
316 }
317
318 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
319 if (!m)
320 return -ENOMEM;
321
322 m->destination = TAKE_PTR(destination);
323 m->source = TAKE_PTR(upper);
324 m->lower = TAKE_PTR(lower);
325 m->read_only = read_only;
326
327 return 0;
328 }
329
330 static int tmpfs_patch_options(
331 const char *options,
332 bool userns,
333 uid_t uid_shift, uid_t uid_range,
334 bool patch_ids,
335 const char *selinux_apifs_context,
336 char **ret) {
337
338 char *buf = NULL;
339
340 if ((userns && uid_shift != 0) || patch_ids) {
341 assert(uid_shift != UID_INVALID);
342
343 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
344 strempty(options), options ? "," : "",
345 uid_shift, uid_shift) < 0)
346 return -ENOMEM;
347
348 options = buf;
349 }
350
351 #if HAVE_SELINUX
352 if (selinux_apifs_context) {
353 char *t;
354
355 t = strjoin(strempty(options), options ? "," : "",
356 "context=\"", selinux_apifs_context, "\"");
357 free(buf);
358 if (!t)
359 return -ENOMEM;
360
361 buf = t;
362 }
363 #endif
364
365 if (!buf && options) {
366 buf = strdup(options);
367 if (!buf)
368 return -ENOMEM;
369 }
370 *ret = buf;
371
372 return !!buf;
373 }
374
375 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
376 const char *full, *top, *x;
377 int r;
378 unsigned long extra_flags = 0;
379
380 top = prefix_roota(dest, "/sys");
381 r = path_is_fs_type(top, SYSFS_MAGIC);
382 if (r < 0)
383 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
384 /* /sys might already be mounted as sysfs by the outer child in the
385 * !netns case. In this case, it's all good. Don't touch it because we
386 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
387 */
388 if (r > 0)
389 return 0;
390
391 full = prefix_roota(top, "/full");
392
393 (void) mkdir(full, 0755);
394
395 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
396 extra_flags |= MS_RDONLY;
397
398 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
399 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
400 if (r < 0)
401 return r;
402
403 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
404 _cleanup_free_ char *from = NULL, *to = NULL;
405
406 from = prefix_root(full, x);
407 if (!from)
408 return log_oom();
409
410 to = prefix_root(top, x);
411 if (!to)
412 return log_oom();
413
414 (void) mkdir(to, 0755);
415
416 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
417 if (r < 0)
418 return r;
419
420 r = mount_verbose(LOG_ERR, NULL, to, NULL,
421 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
422 if (r < 0)
423 return r;
424 }
425
426 r = umount_verbose(full);
427 if (r < 0)
428 return r;
429
430 if (rmdir(full) < 0)
431 return log_error_errno(errno, "Failed to remove %s: %m", full);
432
433 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
434 * remount /sys read-only.
435 */
436 if (cg_ns_supported()) {
437 x = prefix_roota(top, "/fs/cgroup");
438 (void) mkdir_p(x, 0755);
439 }
440
441 return mount_verbose(LOG_ERR, NULL, top, NULL,
442 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
443 }
444
445 static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
446 int r;
447
448 assert(path);
449
450 r = mkdir_errno_wrapper(path, mode);
451 if (r < 0 && r != -EEXIST)
452 return r;
453
454 if ((mask & MOUNT_USE_USERNS) == 0)
455 return 0;
456
457 if (mask & MOUNT_IN_USERNS)
458 return 0;
459
460 if (lchown(path, uid_shift, uid_shift) < 0)
461 return -errno;
462
463 return 0;
464 }
465
466 static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
467 const char *p, *e;
468 int r;
469
470 assert(path);
471
472 if (prefix && !path_startswith(path, prefix))
473 return -ENOTDIR;
474
475 /* create every parent directory in the path, except the last component */
476 p = path + strspn(path, "/");
477 for (;;) {
478 char t[strlen(path) + 1];
479
480 e = p + strcspn(p, "/");
481 p = e + strspn(e, "/");
482
483 /* Is this the last component? If so, then we're done */
484 if (*p == 0)
485 break;
486
487 memcpy(t, path, e - path);
488 t[e-path] = 0;
489
490 if (prefix && path_startswith(prefix, t))
491 continue;
492
493 r = mkdir_userns(t, mode, mask, uid_shift);
494 if (r < 0)
495 return r;
496 }
497
498 return mkdir_userns(path, mode, mask, uid_shift);
499 }
500
501 int mount_all(const char *dest,
502 MountSettingsMask mount_settings,
503 uid_t uid_shift, uid_t uid_range,
504 const char *selinux_apifs_context) {
505
506 #define PROC_INACCESSIBLE(path) \
507 { NULL, (path), NULL, NULL, MS_BIND, \
508 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
509 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
510 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
511
512 #define PROC_READ_ONLY(path) \
513 { (path), (path), NULL, NULL, MS_BIND, \
514 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
515 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
516 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
517
518 typedef struct MountPoint {
519 const char *what;
520 const char *where;
521 const char *type;
522 const char *options;
523 unsigned long flags;
524 MountSettingsMask mount_settings;
525 } MountPoint;
526
527 static const MountPoint mount_table[] = {
528 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
529 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
530 MOUNT_FATAL|MOUNT_IN_USERNS },
531
532 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
533 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
534
535 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
536 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
537
538 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
539 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
540
541 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
542 * internals or the host's execution environment to the container */
543 PROC_INACCESSIBLE("/proc/kallsyms"),
544 PROC_INACCESSIBLE("/proc/kcore"),
545 PROC_INACCESSIBLE("/proc/keys"),
546 PROC_INACCESSIBLE("/proc/sysrq-trigger"),
547 PROC_INACCESSIBLE("/proc/timer_list"),
548
549 /* Make these directories read-only to container payloads: they show hardware information, and in some
550 * cases contain tunables the container really shouldn't have access to. */
551 PROC_READ_ONLY("/proc/acpi"),
552 PROC_READ_ONLY("/proc/apm"),
553 PROC_READ_ONLY("/proc/asound"),
554 PROC_READ_ONLY("/proc/bus"),
555 PROC_READ_ONLY("/proc/fs"),
556 PROC_READ_ONLY("/proc/irq"),
557 PROC_READ_ONLY("/proc/scsi"),
558
559 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
560 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
561 MOUNT_FATAL },
562 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV,
563 MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
564 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
565 MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
566 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
567 MOUNT_FATAL }, /* skipped if above was mounted */
568 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
569 MOUNT_FATAL },
570 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
571 MOUNT_FATAL },
572 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
573 MOUNT_FATAL },
574
575 #if HAVE_SELINUX
576 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
577 0 }, /* Bind mount first */
578 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
579 0 }, /* Then, make it r/o */
580 #endif
581 };
582
583 _cleanup_(unlink_and_freep) char *inaccessible = NULL;
584 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
585 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
586 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
587 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
588 size_t k;
589 int r;
590
591 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
592 _cleanup_free_ char *where = NULL, *options = NULL;
593 const char *o, *what;
594 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
595
596 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
597 continue;
598
599 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
600 continue;
601
602 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
603 continue;
604
605 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
606 if (r < 0)
607 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
608
609 if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
610
611 if (!inaccessible) {
612 _cleanup_free_ char *np = NULL;
613
614 r = tempfn_random_child(NULL, "inaccessible", &np);
615 if (r < 0)
616 return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
617
618 r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
619 if (r < 0)
620 return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
621
622 inaccessible = TAKE_PTR(np);
623 }
624
625 what = inaccessible;
626 } else
627 what = mount_table[k].what;
628
629 r = path_is_mount_point(where, NULL, 0);
630 if (r < 0 && r != -ENOENT)
631 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
632
633 /* Skip this entry if it is not a remount. */
634 if (what && r > 0)
635 continue;
636
637 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
638 if (r < 0 && r != -EEXIST) {
639 if (fatal && r != -EROFS)
640 return log_error_errno(r, "Failed to create directory %s: %m", where);
641
642 log_debug_errno(r, "Failed to create directory %s: %m", where);
643 /* If we failed mkdir() or chown() due to the root
644 * directory being read only, attempt to mount this fs
645 * anyway and let mount_verbose log any errors */
646 if (r != -EROFS)
647 continue;
648 }
649
650 o = mount_table[k].options;
651 if (streq_ptr(mount_table[k].type, "tmpfs")) {
652 if (in_userns)
653 r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
654 else
655 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
656 if (r < 0)
657 return log_oom();
658 if (r > 0)
659 o = options;
660 }
661
662 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
663 what,
664 where,
665 mount_table[k].type,
666 mount_table[k].flags,
667 o);
668 if (r < 0 && fatal)
669 return r;
670 }
671
672 return 0;
673 }
674
675 static int mount_bind(const char *dest, CustomMount *m) {
676
677 _cleanup_free_ char *where = NULL;
678 struct stat source_st, dest_st;
679 int r;
680
681 assert(dest);
682 assert(m);
683
684 if (stat(m->source, &source_st) < 0)
685 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
686
687 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
688 if (r < 0)
689 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
690 if (r > 0) { /* Path exists already? */
691
692 if (stat(where, &dest_st) < 0)
693 return log_error_errno(errno, "Failed to stat %s: %m", where);
694
695 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
696 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
697 return -EINVAL;
698 }
699
700 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
701 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
702 return -EINVAL;
703 }
704
705 } else { /* Path doesn't exist yet? */
706 r = mkdir_parents_label(where, 0755);
707 if (r < 0)
708 return log_error_errno(r, "Failed to make parents of %s: %m", where);
709
710 /* Create the mount point. Any non-directory file can be
711 * mounted on any non-directory file (regular, fifo, socket,
712 * char, block).
713 */
714 if (S_ISDIR(source_st.st_mode))
715 r = mkdir_label(where, 0755);
716 else
717 r = touch(where);
718 if (r < 0)
719 return log_error_errno(r, "Failed to create mount point %s: %m", where);
720
721 }
722
723 r = mount_verbose(LOG_ERR, m->source, where, NULL, MS_BIND | MS_REC, m->options);
724 if (r < 0)
725 return r;
726
727 if (m->read_only) {
728 r = bind_remount_recursive(where, true, NULL);
729 if (r < 0)
730 return log_error_errno(r, "Read-only bind mount failed: %m");
731 }
732
733 return 0;
734 }
735
736 static int mount_tmpfs(
737 const char *dest,
738 CustomMount *m,
739 bool userns, uid_t uid_shift, uid_t uid_range,
740 const char *selinux_apifs_context) {
741
742 const char *options;
743 _cleanup_free_ char *buf = NULL, *where = NULL;
744 int r;
745
746 assert(dest);
747 assert(m);
748
749 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
750 if (r < 0)
751 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
752 if (r == 0) { /* Doesn't exist yet? */
753 r = mkdir_p_label(where, 0755);
754 if (r < 0)
755 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
756 }
757
758 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
759 if (r < 0)
760 return log_oom();
761 options = r > 0 ? buf : m->options;
762
763 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
764 }
765
766 static char *joined_and_escaped_lower_dirs(char **lower) {
767 _cleanup_strv_free_ char **sv = NULL;
768
769 sv = strv_copy(lower);
770 if (!sv)
771 return NULL;
772
773 strv_reverse(sv);
774
775 if (!strv_shell_escape(sv, ",:"))
776 return NULL;
777
778 return strv_join(sv, ":");
779 }
780
781 static int mount_overlay(const char *dest, CustomMount *m) {
782
783 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
784 const char *options;
785 int r;
786
787 assert(dest);
788 assert(m);
789
790 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
791 if (r < 0)
792 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
793 if (r == 0) { /* Doesn't exist yet? */
794 r = mkdir_label(where, 0755);
795 if (r < 0)
796 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
797 }
798
799 (void) mkdir_p_label(m->source, 0755);
800
801 lower = joined_and_escaped_lower_dirs(m->lower);
802 if (!lower)
803 return log_oom();
804
805 escaped_source = shell_escape(m->source, ",:");
806 if (!escaped_source)
807 return log_oom();
808
809 if (m->read_only)
810 options = strjoina("lowerdir=", escaped_source, ":", lower);
811 else {
812 _cleanup_free_ char *escaped_work_dir = NULL;
813
814 escaped_work_dir = shell_escape(m->work_dir, ",:");
815 if (!escaped_work_dir)
816 return log_oom();
817
818 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
819 }
820
821 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
822 }
823
824 int mount_custom(
825 const char *dest,
826 CustomMount *mounts, size_t n,
827 bool userns, uid_t uid_shift, uid_t uid_range,
828 const char *selinux_apifs_context) {
829
830 size_t i;
831 int r;
832
833 assert(dest);
834
835 for (i = 0; i < n; i++) {
836 CustomMount *m = mounts + i;
837
838 switch (m->type) {
839
840 case CUSTOM_MOUNT_BIND:
841 r = mount_bind(dest, m);
842 break;
843
844 case CUSTOM_MOUNT_TMPFS:
845 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
846 break;
847
848 case CUSTOM_MOUNT_OVERLAY:
849 r = mount_overlay(dest, m);
850 break;
851
852 default:
853 assert_not_reached("Unknown custom mount type");
854 }
855
856 if (r < 0)
857 return r;
858 }
859
860 return 0;
861 }
862
863 /* Retrieve existing subsystems. This function is called in a new cgroup
864 * namespace.
865 */
866 static int get_process_controllers(Set **ret) {
867 _cleanup_set_free_free_ Set *controllers = NULL;
868 _cleanup_fclose_ FILE *f = NULL;
869 int r;
870
871 assert(ret);
872
873 controllers = set_new(&string_hash_ops);
874 if (!controllers)
875 return -ENOMEM;
876
877 f = fopen("/proc/self/cgroup", "re");
878 if (!f)
879 return errno == ENOENT ? -ESRCH : -errno;
880
881 for (;;) {
882 _cleanup_free_ char *line = NULL;
883 char *e, *l;
884
885 r = read_line(f, LONG_LINE_MAX, &line);
886 if (r < 0)
887 return r;
888 if (r == 0)
889 break;
890
891 l = strchr(line, ':');
892 if (!l)
893 continue;
894
895 l++;
896 e = strchr(l, ':');
897 if (!e)
898 continue;
899
900 *e = 0;
901
902 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
903 continue;
904
905 r = set_put_strdup(controllers, l);
906 if (r < 0)
907 return r;
908 }
909
910 *ret = TAKE_PTR(controllers);
911
912 return 0;
913 }
914
915 static int mount_legacy_cgroup_hierarchy(
916 const char *dest,
917 const char *controller,
918 const char *hierarchy,
919 bool read_only) {
920
921 const char *to, *fstype, *opts;
922 int r;
923
924 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
925
926 r = path_is_mount_point(to, dest, 0);
927 if (r < 0 && r != -ENOENT)
928 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
929 if (r > 0)
930 return 0;
931
932 mkdir_p(to, 0755);
933
934 /* The superblock mount options of the mount point need to be
935 * identical to the hosts', and hence writable... */
936 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
937 fstype = "cgroup2";
938 opts = NULL;
939 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
940 fstype = "cgroup";
941 opts = "none,name=systemd,xattr";
942 } else {
943 fstype = "cgroup";
944 opts = controller;
945 }
946
947 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
948 if (r < 0)
949 return r;
950
951 /* ... hence let's only make the bind mount read-only, not the superblock. */
952 if (read_only) {
953 r = mount_verbose(LOG_ERR, NULL, to, NULL,
954 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
955 if (r < 0)
956 return r;
957 }
958
959 return 1;
960 }
961
962 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
963 static int mount_legacy_cgns_supported(
964 const char *dest,
965 CGroupUnified unified_requested,
966 bool userns,
967 uid_t uid_shift,
968 uid_t uid_range,
969 const char *selinux_apifs_context) {
970
971 _cleanup_set_free_free_ Set *controllers = NULL;
972 const char *cgroup_root = "/sys/fs/cgroup", *c;
973 int r;
974
975 (void) mkdir_p(cgroup_root, 0755);
976
977 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
978 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
979 if (r < 0)
980 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
981 if (r == 0) {
982 _cleanup_free_ char *options = NULL;
983
984 /* When cgroup namespaces are enabled and user namespaces are
985 * used then the mount of the cgroupfs is done *inside* the new
986 * user namespace. We're root in the new user namespace and the
987 * kernel will happily translate our uid/gid to the correct
988 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
989 * pass uid 0 and not uid_shift to tmpfs_patch_options().
990 */
991 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
992 if (r < 0)
993 return log_oom();
994
995 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
996 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
997 if (r < 0)
998 return r;
999 }
1000
1001 r = cg_all_unified();
1002 if (r < 0)
1003 return r;
1004 if (r > 0)
1005 goto skip_controllers;
1006
1007 r = get_process_controllers(&controllers);
1008 if (r < 0)
1009 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1010
1011 for (;;) {
1012 _cleanup_free_ const char *controller = NULL;
1013
1014 controller = set_steal_first(controllers);
1015 if (!controller)
1016 break;
1017
1018 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
1019 if (r < 0)
1020 return r;
1021
1022 /* When multiple hierarchies are co-mounted, make their
1023 * constituting individual hierarchies a symlink to the
1024 * co-mount.
1025 */
1026 c = controller;
1027 for (;;) {
1028 _cleanup_free_ char *target = NULL, *tok = NULL;
1029
1030 r = extract_first_word(&c, &tok, ",", 0);
1031 if (r < 0)
1032 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
1033 if (r == 0)
1034 break;
1035
1036 if (streq(controller, tok))
1037 break;
1038
1039 target = prefix_root("/sys/fs/cgroup/", tok);
1040 if (!target)
1041 return log_oom();
1042
1043 r = symlink_idempotent(controller, target);
1044 if (r == -EINVAL)
1045 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1048 }
1049 }
1050
1051 skip_controllers:
1052 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1053 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1054 if (r < 0)
1055 return r;
1056 }
1057
1058 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1059 if (r < 0)
1060 return r;
1061
1062 if (!userns)
1063 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1064 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1065
1066 return 0;
1067 }
1068
1069 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1070 static int mount_legacy_cgns_unsupported(
1071 const char *dest,
1072 CGroupUnified unified_requested,
1073 bool userns,
1074 uid_t uid_shift,
1075 uid_t uid_range,
1076 const char *selinux_apifs_context) {
1077
1078 _cleanup_set_free_free_ Set *controllers = NULL;
1079 const char *cgroup_root;
1080 int r;
1081
1082 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1083
1084 (void) mkdir_p(cgroup_root, 0755);
1085
1086 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1087 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
1088 if (r < 0)
1089 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1090 if (r == 0) {
1091 _cleanup_free_ char *options = NULL;
1092
1093 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
1094 if (r < 0)
1095 return log_oom();
1096
1097 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1098 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1099 if (r < 0)
1100 return r;
1101 }
1102
1103 r = cg_all_unified();
1104 if (r < 0)
1105 return r;
1106 if (r > 0)
1107 goto skip_controllers;
1108
1109 r = cg_kernel_controllers(&controllers);
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1112
1113 for (;;) {
1114 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1115
1116 controller = set_steal_first(controllers);
1117 if (!controller)
1118 break;
1119
1120 origin = prefix_root("/sys/fs/cgroup/", controller);
1121 if (!origin)
1122 return log_oom();
1123
1124 r = readlink_malloc(origin, &combined);
1125 if (r == -EINVAL) {
1126 /* Not a symbolic link, but directly a single cgroup hierarchy */
1127
1128 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
1129 if (r < 0)
1130 return r;
1131
1132 } else if (r < 0)
1133 return log_error_errno(r, "Failed to read link %s: %m", origin);
1134 else {
1135 _cleanup_free_ char *target = NULL;
1136
1137 target = prefix_root(dest, origin);
1138 if (!target)
1139 return log_oom();
1140
1141 /* A symbolic link, a combination of controllers in one hierarchy */
1142
1143 if (!filename_is_valid(combined)) {
1144 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1145 continue;
1146 }
1147
1148 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
1149 if (r < 0)
1150 return r;
1151
1152 r = symlink_idempotent(combined, target);
1153 if (r == -EINVAL)
1154 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1155 if (r < 0)
1156 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1157 }
1158 }
1159
1160 skip_controllers:
1161 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1162 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1163 if (r < 0)
1164 return r;
1165 }
1166
1167 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1168 if (r < 0)
1169 return r;
1170
1171 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1172 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1173 }
1174
1175 static int mount_unified_cgroups(const char *dest) {
1176 const char *p;
1177 int r;
1178
1179 assert(dest);
1180
1181 p = prefix_roota(dest, "/sys/fs/cgroup");
1182
1183 (void) mkdir_p(p, 0755);
1184
1185 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
1186 if (r < 0)
1187 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1188 if (r > 0) {
1189 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
1190 if (access(p, F_OK) >= 0)
1191 return 0;
1192 if (errno != ENOENT)
1193 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1194
1195 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1196 return -EINVAL;
1197 }
1198
1199 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1200 }
1201
1202 int mount_cgroups(
1203 const char *dest,
1204 CGroupUnified unified_requested,
1205 bool userns,
1206 uid_t uid_shift,
1207 uid_t uid_range,
1208 const char *selinux_apifs_context,
1209 bool use_cgns) {
1210
1211 if (unified_requested >= CGROUP_UNIFIED_ALL)
1212 return mount_unified_cgroups(dest);
1213 if (use_cgns)
1214 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1215
1216 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1217 }
1218
1219 static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
1220 int r;
1221
1222 assert(root);
1223 assert(own);
1224
1225 /* Make our own cgroup a (writable) bind mount */
1226 r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
1227 if (r < 0)
1228 return r;
1229
1230 /* And then remount the systemd cgroup root read-only */
1231 return mount_verbose(LOG_ERR, NULL, root, NULL,
1232 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
1233 }
1234
1235 int mount_systemd_cgroup_writable(
1236 const char *dest,
1237 CGroupUnified unified_requested) {
1238
1239 _cleanup_free_ char *own_cgroup_path = NULL;
1240 const char *root, *own;
1241 int r;
1242
1243 assert(dest);
1244
1245 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1248
1249 /* If we are living in the top-level, then there's nothing to do... */
1250 if (path_equal(own_cgroup_path, "/"))
1251 return 0;
1252
1253 if (unified_requested >= CGROUP_UNIFIED_ALL) {
1254
1255 root = prefix_roota(dest, "/sys/fs/cgroup");
1256 own = strjoina(root, own_cgroup_path);
1257
1258 } else {
1259
1260 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1261 root = prefix_roota(dest, "/sys/fs/cgroup/unified");
1262 own = strjoina(root, own_cgroup_path);
1263
1264 r = mount_systemd_cgroup_writable_one(root, own);
1265 if (r < 0)
1266 return r;
1267 }
1268
1269 root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1270 own = strjoina(root, own_cgroup_path);
1271 }
1272
1273 return mount_systemd_cgroup_writable_one(root, own);
1274 }
1275
1276 int setup_volatile_state(
1277 const char *directory,
1278 VolatileMode mode,
1279 bool userns, uid_t uid_shift, uid_t uid_range,
1280 const char *selinux_apifs_context) {
1281
1282 _cleanup_free_ char *buf = NULL;
1283 const char *p, *options;
1284 int r;
1285
1286 assert(directory);
1287
1288 if (mode != VOLATILE_STATE)
1289 return 0;
1290
1291 /* --volatile=state means we simply overmount /var
1292 with a tmpfs, and the rest read-only. */
1293
1294 r = bind_remount_recursive(directory, true, NULL);
1295 if (r < 0)
1296 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1297
1298 p = prefix_roota(directory, "/var");
1299 r = mkdir(p, 0755);
1300 if (r < 0 && errno != EEXIST)
1301 return log_error_errno(errno, "Failed to create %s: %m", directory);
1302
1303 options = "mode=755";
1304 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1305 if (r < 0)
1306 return log_oom();
1307 if (r > 0)
1308 options = buf;
1309
1310 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
1311 }
1312
1313 int setup_volatile(
1314 const char *directory,
1315 VolatileMode mode,
1316 bool userns, uid_t uid_shift, uid_t uid_range,
1317 const char *selinux_apifs_context) {
1318
1319 bool tmpfs_mounted = false, bind_mounted = false;
1320 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1321 _cleanup_free_ char *buf = NULL;
1322 const char *f, *t, *options;
1323 int r;
1324
1325 assert(directory);
1326
1327 if (mode != VOLATILE_YES)
1328 return 0;
1329
1330 /* --volatile=yes means we mount a tmpfs to the root dir, and
1331 the original /usr to use inside it, and that read-only. */
1332
1333 if (!mkdtemp(template))
1334 return log_error_errno(errno, "Failed to create temporary directory: %m");
1335
1336 options = "mode=755";
1337 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1338 if (r < 0)
1339 return log_oom();
1340 if (r > 0)
1341 options = buf;
1342
1343 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1344 if (r < 0)
1345 goto fail;
1346
1347 tmpfs_mounted = true;
1348
1349 f = prefix_roota(directory, "/usr");
1350 t = prefix_roota(template, "/usr");
1351
1352 r = mkdir(t, 0755);
1353 if (r < 0 && errno != EEXIST) {
1354 r = log_error_errno(errno, "Failed to create %s: %m", t);
1355 goto fail;
1356 }
1357
1358 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1359 if (r < 0)
1360 goto fail;
1361
1362 bind_mounted = true;
1363
1364 r = bind_remount_recursive(t, true, NULL);
1365 if (r < 0) {
1366 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1367 goto fail;
1368 }
1369
1370 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1371 if (r < 0)
1372 goto fail;
1373
1374 (void) rmdir(template);
1375
1376 return 0;
1377
1378 fail:
1379 if (bind_mounted)
1380 (void) umount_verbose(t);
1381
1382 if (tmpfs_mounted)
1383 (void) umount_verbose(template);
1384 (void) rmdir(template);
1385 return r;
1386 }
1387
1388 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1389 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1390 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1391 const char *p = s;
1392 int r;
1393
1394 assert(pivot_root_new);
1395 assert(pivot_root_old);
1396
1397 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1398 if (r < 0)
1399 return r;
1400 if (r == 0)
1401 return -EINVAL;
1402
1403 if (isempty(p))
1404 root_old = NULL;
1405 else {
1406 root_old = strdup(p);
1407 if (!root_old)
1408 return -ENOMEM;
1409 }
1410
1411 if (!path_is_absolute(root_new))
1412 return -EINVAL;
1413 if (root_old && !path_is_absolute(root_old))
1414 return -EINVAL;
1415
1416 free_and_replace(*pivot_root_new, root_new);
1417 free_and_replace(*pivot_root_old, root_old);
1418
1419 return 0;
1420 }
1421
1422 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1423 _cleanup_free_ char *directory_pivot_root_new = NULL;
1424 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1425 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1426 bool remove_pivot_tmp = false;
1427 int r;
1428
1429 assert(directory);
1430
1431 if (!pivot_root_new)
1432 return 0;
1433
1434 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1435 * If pivot_root_old is NULL, the existing / disappears.
1436 * This requires a temporary directory, pivot_tmp, which is
1437 * not a child of either.
1438 *
1439 * This is typically used for OSTree-style containers, where
1440 * the root partition contains several sysroots which could be
1441 * run. Normally, one would be chosen by the bootloader and
1442 * pivoted to / by initramfs.
1443 *
1444 * For example, for an OSTree deployment, pivot_root_new
1445 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1446 * code doesn’t do the /var mount which OSTree expects: use
1447 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1448 *
1449 * So in the OSTree case, we’ll end up with something like:
1450 * - directory = /tmp/nspawn-root-123456
1451 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1452 * - pivot_root_old = /sysroot
1453 * - directory_pivot_root_new =
1454 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1455 * - pivot_tmp = /tmp/nspawn-pivot-123456
1456 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1457 *
1458 * Requires all file systems at directory and below to be mounted
1459 * MS_PRIVATE or MS_SLAVE so they can be moved.
1460 */
1461 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1462
1463 /* Remount directory_pivot_root_new to make it movable. */
1464 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1465 if (r < 0)
1466 goto done;
1467
1468 if (pivot_root_old) {
1469 if (!mkdtemp(pivot_tmp)) {
1470 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1471 goto done;
1472 }
1473
1474 remove_pivot_tmp = true;
1475 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1476
1477 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1478 if (r < 0)
1479 goto done;
1480
1481 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1482 if (r < 0)
1483 goto done;
1484
1485 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1486 if (r < 0)
1487 goto done;
1488 } else {
1489 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1490 if (r < 0)
1491 goto done;
1492 }
1493
1494 done:
1495 if (remove_pivot_tmp)
1496 (void) rmdir(pivot_tmp);
1497
1498 return r;
1499 }