]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
macro: introduce TAKE_PTR() macro
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2015 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <sys/mount.h>
22 #include <linux/magic.h>
23
24 #include "alloc-util.h"
25 #include "escape.h"
26 #include "fd-util.h"
27 #include "fileio.h"
28 #include "fs-util.h"
29 #include "label.h"
30 #include "mkdir.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "rm-rf.h"
36 #include "set.h"
37 #include "stat-util.h"
38 #include "string-util.h"
39 #include "strv.h"
40 #include "user-util.h"
41 #include "util.h"
42
43 CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
51 c = reallocarray(*l, *n + 1, sizeof(CustomMount));
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62 }
63
64 void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
79 if (m->rm_rf_tmpdir) {
80 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
81 free(m->rm_rf_tmpdir);
82 }
83
84 strv_free(m->lower);
85 }
86
87 free(l);
88 }
89
90 static int custom_mount_compare(const void *a, const void *b) {
91 const CustomMount *x = a, *y = b;
92 int r;
93
94 r = path_compare(x->destination, y->destination);
95 if (r != 0)
96 return r;
97
98 if (x->type < y->type)
99 return -1;
100 if (x->type > y->type)
101 return 1;
102
103 return 0;
104 }
105
/* Checks whether 'p' is an acceptable source path: an absolute path, optionally preceded by a single
 * '+' character. */
static bool source_path_is_valid(const char *p) {
        const char *path;

        assert(p);

        /* Skip the optional "+" marker before checking for absoluteness */
        path = *p == '+' ? p + 1 : p;

        return path_is_absolute(path);
}
114
/* Returns a newly allocated copy of 'source'. If 'source' starts with '+', the remainder is prefixed
 * with 'dest' (via prefix_root()) instead of being copied verbatim. Returns NULL if 'source' is NULL or
 * if the allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {

        if (!source)
                return NULL;

        return source[0] == '+' ? prefix_root(dest, source + 1) : strdup(source);
}
125
126 int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
127 unsigned i;
128 int r;
129
130 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
132 * children. */
133
134 assert(l || n == 0);
135
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
138
139 for (i = 0; i < n; i++) {
140 CustomMount *m = l + i;
141
142 if (m->source) {
143 char *s;
144
145 s = resolve_source_path(dest, m->source);
146 if (!s)
147 return log_oom();
148
149 free(m->source);
150 m->source = s;
151 } else {
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
153
154 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m->rm_rf_tmpdir)
156 return log_oom();
157
158 if (!mkdtemp(m->rm_rf_tmpdir)) {
159 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
160 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
161 }
162
163 m->source = strjoin(m->rm_rf_tmpdir, "/src");
164 if (!m->source)
165 return log_oom();
166
167 if (mkdir(m->source, 0755) < 0)
168 return log_error_errno(errno, "Failed to create %s: %m", m->source);
169 }
170
171 if (m->type == CUSTOM_MOUNT_OVERLAY) {
172 char **j;
173
174 STRV_FOREACH(j, m->lower) {
175 char *s;
176
177 s = resolve_source_path(dest, *j);
178 if (!s)
179 return log_oom();
180
181 free(*j);
182 *j = s;
183 }
184
185 if (m->work_dir) {
186 char *s;
187
188 s = resolve_source_path(dest, m->work_dir);
189 if (!s)
190 return log_oom();
191
192 free(m->work_dir);
193 m->work_dir = s;
194 } else {
195 assert(m->source);
196
197 r = tempfn_random(m->source, NULL, &m->work_dir);
198 if (r < 0)
199 return log_error_errno(r, "Failed to acquire working directory: %m");
200 }
201
202 (void) mkdir_label(m->work_dir, 0700);
203 }
204 }
205
206 return 0;
207 }
208
209 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
210 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
211 const char *p = s;
212 CustomMount *m;
213 int r;
214
215 assert(l);
216 assert(n);
217
218 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
219 if (r < 0)
220 return r;
221 if (r == 0)
222 return -EINVAL;
223 if (r == 1) {
224 destination = strdup(source[0] == '+' ? source+1 : source);
225 if (!destination)
226 return -ENOMEM;
227 }
228 if (r == 2 && !isempty(p)) {
229 opts = strdup(p);
230 if (!opts)
231 return -ENOMEM;
232 }
233
234 if (isempty(source))
235 source = NULL;
236 else if (!source_path_is_valid(source))
237 return -EINVAL;
238
239 if (!path_is_absolute(destination))
240 return -EINVAL;
241
242 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
243 if (!m)
244 return -ENOMEM;
245
246 m->source = source;
247 m->destination = destination;
248 m->read_only = read_only;
249 m->options = opts;
250
251 source = destination = opts = NULL;
252 return 0;
253 }
254
255 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
256 _cleanup_free_ char *path = NULL, *opts = NULL;
257 const char *p = s;
258 CustomMount *m;
259 int r;
260
261 assert(l);
262 assert(n);
263 assert(s);
264
265 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
266 if (r < 0)
267 return r;
268 if (r == 0)
269 return -EINVAL;
270
271 if (isempty(p))
272 opts = strdup("mode=0755");
273 else
274 opts = strdup(p);
275 if (!opts)
276 return -ENOMEM;
277
278 if (!path_is_absolute(path))
279 return -EINVAL;
280
281 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
282 if (!m)
283 return -ENOMEM;
284
285 m->destination = path;
286 m->options = opts;
287
288 path = opts = NULL;
289 return 0;
290 }
291
292 int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
293 _cleanup_free_ char *upper = NULL, *destination = NULL;
294 _cleanup_strv_free_ char **lower = NULL;
295 CustomMount *m;
296 int k;
297
298 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
299 if (k < 0)
300 return k;
301 if (k < 2)
302 return -EADDRNOTAVAIL;
303 if (k == 2) {
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
306
307 if (!source_path_is_valid(lower[0]) ||
308 !source_path_is_valid(lower[1]))
309 return -EINVAL;
310
311 upper = TAKE_PTR(lower[1]);
312
313 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
314 if (!destination)
315 return -ENOMEM;
316 } else {
317 char **i;
318
319 /* If more than two parameters are specified, the last one is the destination, the second to last one
320 * the "upper", and all before that the "lower" directories. */
321
322 destination = lower[k - 1];
323 upper = TAKE_PTR(lower[k - 2]);
324
325 STRV_FOREACH(i, lower)
326 if (!source_path_is_valid(*i))
327 return -EINVAL;
328
329 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
330 * in /var/tmp */
331 if (isempty(upper))
332 upper = NULL;
333 else if (!source_path_is_valid(upper))
334 return -EINVAL;
335
336 if (!path_is_absolute(destination))
337 return -EINVAL;
338 }
339
340 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
341 if (!m)
342 return -ENOMEM;
343
344 m->destination = destination;
345 m->source = upper;
346 m->lower = lower;
347 m->read_only = read_only;
348
349 upper = destination = NULL;
350 lower = NULL;
351
352 return 0;
353 }
354
355 static int tmpfs_patch_options(
356 const char *options,
357 bool userns,
358 uid_t uid_shift, uid_t uid_range,
359 bool patch_ids,
360 const char *selinux_apifs_context,
361 char **ret) {
362
363 char *buf = NULL;
364
365 if ((userns && uid_shift != 0) || patch_ids) {
366 assert(uid_shift != UID_INVALID);
367
368 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
369 strempty(options), options ? "," : "",
370 uid_shift, uid_shift) < 0)
371 return -ENOMEM;
372
373 options = buf;
374 }
375
376 #if HAVE_SELINUX
377 if (selinux_apifs_context) {
378 char *t;
379
380 t = strjoin(strempty(options), options ? "," : "",
381 "context=\"", selinux_apifs_context, "\"");
382 free(buf);
383 if (!t)
384 return -ENOMEM;
385
386 buf = t;
387 }
388 #endif
389
390 if (!buf && options) {
391 buf = strdup(options);
392 if (!buf)
393 return -ENOMEM;
394 }
395 *ret = buf;
396
397 return !!buf;
398 }
399
400 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
401 const char *full, *top, *x;
402 int r;
403 unsigned long extra_flags = 0;
404
405 top = prefix_roota(dest, "/sys");
406 r = path_is_fs_type(top, SYSFS_MAGIC);
407 if (r < 0)
408 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
409 /* /sys might already be mounted as sysfs by the outer child in the
410 * !netns case. In this case, it's all good. Don't touch it because we
411 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
412 */
413 if (r > 0)
414 return 0;
415
416 full = prefix_roota(top, "/full");
417
418 (void) mkdir(full, 0755);
419
420 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
421 extra_flags |= MS_RDONLY;
422
423 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
424 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
425 if (r < 0)
426 return r;
427
428 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
429 _cleanup_free_ char *from = NULL, *to = NULL;
430
431 from = prefix_root(full, x);
432 if (!from)
433 return log_oom();
434
435 to = prefix_root(top, x);
436 if (!to)
437 return log_oom();
438
439 (void) mkdir(to, 0755);
440
441 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
442 if (r < 0)
443 return r;
444
445 r = mount_verbose(LOG_ERR, NULL, to, NULL,
446 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
447 if (r < 0)
448 return r;
449 }
450
451 r = umount_verbose(full);
452 if (r < 0)
453 return r;
454
455 if (rmdir(full) < 0)
456 return log_error_errno(errno, "Failed to remove %s: %m", full);
457
458 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
459 * remount /sys read-only.
460 */
461 if (cg_ns_supported()) {
462 x = prefix_roota(top, "/fs/cgroup");
463 (void) mkdir_p(x, 0755);
464 }
465
466 return mount_verbose(LOG_ERR, NULL, top, NULL,
467 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
468 }
469
470 static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
471 int r;
472
473 assert(path);
474
475 r = mkdir_errno_wrapper(path, mode);
476 if (r < 0 && r != -EEXIST)
477 return r;
478
479 if ((mask & MOUNT_USE_USERNS) == 0)
480 return 0;
481
482 if (mask & MOUNT_IN_USERNS)
483 return 0;
484
485 if (lchown(path, uid_shift, uid_shift) < 0)
486 return -errno;
487
488 return 0;
489 }
490
491 static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
492 const char *p, *e;
493 int r;
494
495 assert(path);
496
497 if (prefix && !path_startswith(path, prefix))
498 return -ENOTDIR;
499
500 /* create every parent directory in the path, except the last component */
501 p = path + strspn(path, "/");
502 for (;;) {
503 char t[strlen(path) + 1];
504
505 e = p + strcspn(p, "/");
506 p = e + strspn(e, "/");
507
508 /* Is this the last component? If so, then we're done */
509 if (*p == 0)
510 break;
511
512 memcpy(t, path, e - path);
513 t[e-path] = 0;
514
515 if (prefix && path_startswith(prefix, t))
516 continue;
517
518 r = mkdir_userns(t, mode, mask, uid_shift);
519 if (r < 0)
520 return r;
521 }
522
523 return mkdir_userns(path, mode, mask, uid_shift);
524 }
525
526 int mount_all(const char *dest,
527 MountSettingsMask mount_settings,
528 uid_t uid_shift, uid_t uid_range,
529 const char *selinux_apifs_context) {
530
531 typedef struct MountPoint {
532 const char *what;
533 const char *where;
534 const char *type;
535 const char *options;
536 unsigned long flags;
537 MountSettingsMask mount_settings;
538 } MountPoint;
539
540 static const MountPoint mount_table[] = {
541 /* inner child mounts */
542 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
543 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
544 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
545 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
546 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
547 { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
548
549 /* outer child mounts */
550 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
551 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
552 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
553 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
554
555 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
556 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
557 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
558 #if HAVE_SELINUX
559 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
560 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
561 #endif
562 };
563
564 unsigned k;
565 int r;
566 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
567 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
568 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
569 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
570
571 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
572 _cleanup_free_ char *where = NULL, *options = NULL;
573 const char *o;
574 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
575
576 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
577 continue;
578
579 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
580 continue;
581
582 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
583 continue;
584
585 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
586 if (r < 0)
587 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
588
589 r = path_is_mount_point(where, NULL, 0);
590 if (r < 0 && r != -ENOENT)
591 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
592
593 /* Skip this entry if it is not a remount. */
594 if (mount_table[k].what && r > 0)
595 continue;
596
597 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
598 if (r < 0 && r != -EEXIST) {
599 if (fatal && r != -EROFS)
600 return log_error_errno(r, "Failed to create directory %s: %m", where);
601
602 log_debug_errno(r, "Failed to create directory %s: %m", where);
603 /* If we failed mkdir() or chown() due to the root
604 * directory being read only, attempt to mount this fs
605 * anyway and let mount_verbose log any errors */
606 if (r != -EROFS)
607 continue;
608 }
609
610 o = mount_table[k].options;
611 if (streq_ptr(mount_table[k].type, "tmpfs")) {
612 if (in_userns)
613 r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
614 else
615 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
616 if (r < 0)
617 return log_oom();
618 if (r > 0)
619 o = options;
620 }
621
622 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
623 mount_table[k].what,
624 where,
625 mount_table[k].type,
626 mount_table[k].flags,
627 o);
628 if (r < 0 && fatal)
629 return r;
630 }
631
632 return 0;
633 }
634
635 static int mount_bind(const char *dest, CustomMount *m) {
636
637 _cleanup_free_ char *where = NULL;
638 struct stat source_st, dest_st;
639 int r;
640
641 assert(dest);
642 assert(m);
643
644 if (stat(m->source, &source_st) < 0)
645 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
646
647 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
648 if (r < 0)
649 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
650 if (r > 0) { /* Path exists already? */
651
652 if (stat(where, &dest_st) < 0)
653 return log_error_errno(errno, "Failed to stat %s: %m", where);
654
655 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
656 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
657 return -EINVAL;
658 }
659
660 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
661 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
662 return -EINVAL;
663 }
664
665 } else { /* Path doesn't exist yet? */
666 r = mkdir_parents_label(where, 0755);
667 if (r < 0)
668 return log_error_errno(r, "Failed to make parents of %s: %m", where);
669
670 /* Create the mount point. Any non-directory file can be
671 * mounted on any non-directory file (regular, fifo, socket,
672 * char, block).
673 */
674 if (S_ISDIR(source_st.st_mode))
675 r = mkdir_label(where, 0755);
676 else
677 r = touch(where);
678 if (r < 0)
679 return log_error_errno(r, "Failed to create mount point %s: %m", where);
680
681 }
682
683 r = mount_verbose(LOG_ERR, m->source, where, NULL, MS_BIND | MS_REC, m->options);
684 if (r < 0)
685 return r;
686
687 if (m->read_only) {
688 r = bind_remount_recursive(where, true, NULL);
689 if (r < 0)
690 return log_error_errno(r, "Read-only bind mount failed: %m");
691 }
692
693 return 0;
694 }
695
696 static int mount_tmpfs(
697 const char *dest,
698 CustomMount *m,
699 bool userns, uid_t uid_shift, uid_t uid_range,
700 const char *selinux_apifs_context) {
701
702 const char *options;
703 _cleanup_free_ char *buf = NULL, *where = NULL;
704 int r;
705
706 assert(dest);
707 assert(m);
708
709 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
710 if (r < 0)
711 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
712 if (r == 0) { /* Doesn't exist yet? */
713 r = mkdir_p_label(where, 0755);
714 if (r < 0)
715 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
716 }
717
718 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
719 if (r < 0)
720 return log_oom();
721 options = r > 0 ? buf : m->options;
722
723 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
724 }
725
726 static char *joined_and_escaped_lower_dirs(char **lower) {
727 _cleanup_strv_free_ char **sv = NULL;
728
729 sv = strv_copy(lower);
730 if (!sv)
731 return NULL;
732
733 strv_reverse(sv);
734
735 if (!strv_shell_escape(sv, ",:"))
736 return NULL;
737
738 return strv_join(sv, ":");
739 }
740
741 static int mount_overlay(const char *dest, CustomMount *m) {
742
743 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
744 const char *options;
745 int r;
746
747 assert(dest);
748 assert(m);
749
750 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
751 if (r < 0)
752 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
753 if (r == 0) { /* Doesn't exist yet? */
754 r = mkdir_label(where, 0755);
755 if (r < 0)
756 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
757 }
758
759 (void) mkdir_p_label(m->source, 0755);
760
761 lower = joined_and_escaped_lower_dirs(m->lower);
762 if (!lower)
763 return log_oom();
764
765 escaped_source = shell_escape(m->source, ",:");
766 if (!escaped_source)
767 return log_oom();
768
769 if (m->read_only)
770 options = strjoina("lowerdir=", escaped_source, ":", lower);
771 else {
772 _cleanup_free_ char *escaped_work_dir = NULL;
773
774 escaped_work_dir = shell_escape(m->work_dir, ",:");
775 if (!escaped_work_dir)
776 return log_oom();
777
778 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
779 }
780
781 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
782 }
783
784 int mount_custom(
785 const char *dest,
786 CustomMount *mounts, unsigned n,
787 bool userns, uid_t uid_shift, uid_t uid_range,
788 const char *selinux_apifs_context) {
789
790 unsigned i;
791 int r;
792
793 assert(dest);
794
795 for (i = 0; i < n; i++) {
796 CustomMount *m = mounts + i;
797
798 switch (m->type) {
799
800 case CUSTOM_MOUNT_BIND:
801 r = mount_bind(dest, m);
802 break;
803
804 case CUSTOM_MOUNT_TMPFS:
805 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
806 break;
807
808 case CUSTOM_MOUNT_OVERLAY:
809 r = mount_overlay(dest, m);
810 break;
811
812 default:
813 assert_not_reached("Unknown custom mount type");
814 }
815
816 if (r < 0)
817 return r;
818 }
819
820 return 0;
821 }
822
823 /* Retrieve existing subsystems. This function is called in a new cgroup
824 * namespace.
825 */
826 static int get_process_controllers(Set **ret) {
827 _cleanup_set_free_free_ Set *controllers = NULL;
828 _cleanup_fclose_ FILE *f = NULL;
829 int r;
830
831 assert(ret);
832
833 controllers = set_new(&string_hash_ops);
834 if (!controllers)
835 return -ENOMEM;
836
837 f = fopen("/proc/self/cgroup", "re");
838 if (!f)
839 return errno == ENOENT ? -ESRCH : -errno;
840
841 for (;;) {
842 _cleanup_free_ char *line = NULL;
843 char *e, *l;
844
845 r = read_line(f, LONG_LINE_MAX, &line);
846 if (r < 0)
847 return r;
848 if (r == 0)
849 break;
850
851 l = strchr(line, ':');
852 if (!l)
853 continue;
854
855 l++;
856 e = strchr(l, ':');
857 if (!e)
858 continue;
859
860 *e = 0;
861
862 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
863 continue;
864
865 r = set_put_strdup(controllers, l);
866 if (r < 0)
867 return r;
868 }
869
870 *ret = controllers;
871 controllers = NULL;
872
873 return 0;
874 }
875
876 static int mount_legacy_cgroup_hierarchy(
877 const char *dest,
878 const char *controller,
879 const char *hierarchy,
880 bool read_only) {
881
882 const char *to, *fstype, *opts;
883 int r;
884
885 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
886
887 r = path_is_mount_point(to, dest, 0);
888 if (r < 0 && r != -ENOENT)
889 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
890 if (r > 0)
891 return 0;
892
893 mkdir_p(to, 0755);
894
895 /* The superblock mount options of the mount point need to be
896 * identical to the hosts', and hence writable... */
897 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
898 fstype = "cgroup2";
899 opts = NULL;
900 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
901 fstype = "cgroup";
902 opts = "none,name=systemd,xattr";
903 } else {
904 fstype = "cgroup";
905 opts = controller;
906 }
907
908 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
909 if (r < 0)
910 return r;
911
912 /* ... hence let's only make the bind mount read-only, not the superblock. */
913 if (read_only) {
914 r = mount_verbose(LOG_ERR, NULL, to, NULL,
915 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
916 if (r < 0)
917 return r;
918 }
919
920 return 1;
921 }
922
923 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
924 static int mount_legacy_cgns_supported(
925 const char *dest,
926 CGroupUnified unified_requested,
927 bool userns,
928 uid_t uid_shift,
929 uid_t uid_range,
930 const char *selinux_apifs_context) {
931
932 _cleanup_set_free_free_ Set *controllers = NULL;
933 const char *cgroup_root = "/sys/fs/cgroup", *c;
934 int r;
935
936 (void) mkdir_p(cgroup_root, 0755);
937
938 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
939 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
940 if (r < 0)
941 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
942 if (r == 0) {
943 _cleanup_free_ char *options = NULL;
944
945 /* When cgroup namespaces are enabled and user namespaces are
946 * used then the mount of the cgroupfs is done *inside* the new
947 * user namespace. We're root in the new user namespace and the
948 * kernel will happily translate our uid/gid to the correct
949 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
950 * pass uid 0 and not uid_shift to tmpfs_patch_options().
951 */
952 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
953 if (r < 0)
954 return log_oom();
955
956 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
957 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
958 if (r < 0)
959 return r;
960 }
961
962 r = cg_all_unified();
963 if (r < 0)
964 return r;
965 if (r > 0)
966 goto skip_controllers;
967
968 r = get_process_controllers(&controllers);
969 if (r < 0)
970 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
971
972 for (;;) {
973 _cleanup_free_ const char *controller = NULL;
974
975 controller = set_steal_first(controllers);
976 if (!controller)
977 break;
978
979 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
980 if (r < 0)
981 return r;
982
983 /* When multiple hierarchies are co-mounted, make their
984 * constituting individual hierarchies a symlink to the
985 * co-mount.
986 */
987 c = controller;
988 for (;;) {
989 _cleanup_free_ char *target = NULL, *tok = NULL;
990
991 r = extract_first_word(&c, &tok, ",", 0);
992 if (r < 0)
993 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
994 if (r == 0)
995 break;
996
997 if (streq(controller, tok))
998 break;
999
1000 target = prefix_root("/sys/fs/cgroup/", tok);
1001 if (!target)
1002 return log_oom();
1003
1004 r = symlink_idempotent(controller, target);
1005 if (r == -EINVAL)
1006 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1007 if (r < 0)
1008 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1009 }
1010 }
1011
1012 skip_controllers:
1013 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1014 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1015 if (r < 0)
1016 return r;
1017 }
1018
1019 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1020 if (r < 0)
1021 return r;
1022
1023 if (!userns)
1024 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1025 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1026
1027 return 0;
1028 }
1029
1030 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1031 static int mount_legacy_cgns_unsupported(
1032 const char *dest,
1033 CGroupUnified unified_requested,
1034 bool userns,
1035 uid_t uid_shift,
1036 uid_t uid_range,
1037 const char *selinux_apifs_context) {
1038
1039 _cleanup_set_free_free_ Set *controllers = NULL;
1040 const char *cgroup_root;
1041 int r;
1042
1043 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1044
1045 (void) mkdir_p(cgroup_root, 0755);
1046
1047 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1048 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
1049 if (r < 0)
1050 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1051 if (r == 0) {
1052 _cleanup_free_ char *options = NULL;
1053
1054 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
1055 if (r < 0)
1056 return log_oom();
1057
1058 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1059 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1060 if (r < 0)
1061 return r;
1062 }
1063
1064 r = cg_all_unified();
1065 if (r < 0)
1066 return r;
1067 if (r > 0)
1068 goto skip_controllers;
1069
1070 r = cg_kernel_controllers(&controllers);
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1073
1074 for (;;) {
1075 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1076
1077 controller = set_steal_first(controllers);
1078 if (!controller)
1079 break;
1080
1081 origin = prefix_root("/sys/fs/cgroup/", controller);
1082 if (!origin)
1083 return log_oom();
1084
1085 r = readlink_malloc(origin, &combined);
1086 if (r == -EINVAL) {
1087 /* Not a symbolic link, but directly a single cgroup hierarchy */
1088
1089 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
1090 if (r < 0)
1091 return r;
1092
1093 } else if (r < 0)
1094 return log_error_errno(r, "Failed to read link %s: %m", origin);
1095 else {
1096 _cleanup_free_ char *target = NULL;
1097
1098 target = prefix_root(dest, origin);
1099 if (!target)
1100 return log_oom();
1101
1102 /* A symbolic link, a combination of controllers in one hierarchy */
1103
1104 if (!filename_is_valid(combined)) {
1105 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1106 continue;
1107 }
1108
1109 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
1110 if (r < 0)
1111 return r;
1112
1113 r = symlink_idempotent(combined, target);
1114 if (r == -EINVAL)
1115 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1116 if (r < 0)
1117 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1118 }
1119 }
1120
1121 skip_controllers:
1122 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1123 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1124 if (r < 0)
1125 return r;
1126 }
1127
1128 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1129 if (r < 0)
1130 return r;
1131
1132 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1133 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1134 }
1135
1136 static int mount_unified_cgroups(const char *dest) {
1137 const char *p;
1138 int r;
1139
1140 assert(dest);
1141
1142 p = prefix_roota(dest, "/sys/fs/cgroup");
1143
1144 (void) mkdir_p(p, 0755);
1145
1146 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
1147 if (r < 0)
1148 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1149 if (r > 0) {
1150 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
1151 if (access(p, F_OK) >= 0)
1152 return 0;
1153 if (errno != ENOENT)
1154 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1155
1156 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1157 return -EINVAL;
1158 }
1159
1160 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1161 }
1162
1163 int mount_cgroups(
1164 const char *dest,
1165 CGroupUnified unified_requested,
1166 bool userns,
1167 uid_t uid_shift,
1168 uid_t uid_range,
1169 const char *selinux_apifs_context,
1170 bool use_cgns) {
1171
1172 if (unified_requested >= CGROUP_UNIFIED_ALL)
1173 return mount_unified_cgroups(dest);
1174 if (use_cgns)
1175 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1176
1177 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1178 }
1179
1180 static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
1181 int r;
1182
1183 assert(root);
1184 assert(own);
1185
1186 /* Make our own cgroup a (writable) bind mount */
1187 r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
1188 if (r < 0)
1189 return r;
1190
1191 /* And then remount the systemd cgroup root read-only */
1192 return mount_verbose(LOG_ERR, NULL, root, NULL,
1193 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
1194 }
1195
1196 int mount_systemd_cgroup_writable(
1197 const char *dest,
1198 CGroupUnified unified_requested) {
1199
1200 _cleanup_free_ char *own_cgroup_path = NULL;
1201 const char *root, *own;
1202 int r;
1203
1204 assert(dest);
1205
1206 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1207 if (r < 0)
1208 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1209
1210 /* If we are living in the top-level, then there's nothing to do... */
1211 if (path_equal(own_cgroup_path, "/"))
1212 return 0;
1213
1214 if (unified_requested >= CGROUP_UNIFIED_ALL) {
1215
1216 root = prefix_roota(dest, "/sys/fs/cgroup");
1217 own = strjoina(root, own_cgroup_path);
1218
1219 } else {
1220
1221 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1222 root = prefix_roota(dest, "/sys/fs/cgroup/unified");
1223 own = strjoina(root, own_cgroup_path);
1224
1225 r = mount_systemd_cgroup_writable_one(root, own);
1226 if (r < 0)
1227 return r;
1228 }
1229
1230 root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1231 own = strjoina(root, own_cgroup_path);
1232 }
1233
1234 return mount_systemd_cgroup_writable_one(root, own);
1235 }
1236
1237 int setup_volatile_state(
1238 const char *directory,
1239 VolatileMode mode,
1240 bool userns, uid_t uid_shift, uid_t uid_range,
1241 const char *selinux_apifs_context) {
1242
1243 _cleanup_free_ char *buf = NULL;
1244 const char *p, *options;
1245 int r;
1246
1247 assert(directory);
1248
1249 if (mode != VOLATILE_STATE)
1250 return 0;
1251
1252 /* --volatile=state means we simply overmount /var
1253 with a tmpfs, and the rest read-only. */
1254
1255 r = bind_remount_recursive(directory, true, NULL);
1256 if (r < 0)
1257 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1258
1259 p = prefix_roota(directory, "/var");
1260 r = mkdir(p, 0755);
1261 if (r < 0 && errno != EEXIST)
1262 return log_error_errno(errno, "Failed to create %s: %m", directory);
1263
1264 options = "mode=755";
1265 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1266 if (r < 0)
1267 return log_oom();
1268 if (r > 0)
1269 options = buf;
1270
1271 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
1272 }
1273
1274 int setup_volatile(
1275 const char *directory,
1276 VolatileMode mode,
1277 bool userns, uid_t uid_shift, uid_t uid_range,
1278 const char *selinux_apifs_context) {
1279
1280 bool tmpfs_mounted = false, bind_mounted = false;
1281 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1282 _cleanup_free_ char *buf = NULL;
1283 const char *f, *t, *options;
1284 int r;
1285
1286 assert(directory);
1287
1288 if (mode != VOLATILE_YES)
1289 return 0;
1290
1291 /* --volatile=yes means we mount a tmpfs to the root dir, and
1292 the original /usr to use inside it, and that read-only. */
1293
1294 if (!mkdtemp(template))
1295 return log_error_errno(errno, "Failed to create temporary directory: %m");
1296
1297 options = "mode=755";
1298 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1299 if (r < 0)
1300 return log_oom();
1301 if (r > 0)
1302 options = buf;
1303
1304 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1305 if (r < 0)
1306 goto fail;
1307
1308 tmpfs_mounted = true;
1309
1310 f = prefix_roota(directory, "/usr");
1311 t = prefix_roota(template, "/usr");
1312
1313 r = mkdir(t, 0755);
1314 if (r < 0 && errno != EEXIST) {
1315 r = log_error_errno(errno, "Failed to create %s: %m", t);
1316 goto fail;
1317 }
1318
1319 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1320 if (r < 0)
1321 goto fail;
1322
1323 bind_mounted = true;
1324
1325 r = bind_remount_recursive(t, true, NULL);
1326 if (r < 0) {
1327 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1328 goto fail;
1329 }
1330
1331 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1332 if (r < 0)
1333 goto fail;
1334
1335 (void) rmdir(template);
1336
1337 return 0;
1338
1339 fail:
1340 if (bind_mounted)
1341 (void) umount_verbose(t);
1342
1343 if (tmpfs_mounted)
1344 (void) umount_verbose(template);
1345 (void) rmdir(template);
1346 return r;
1347 }
1348
1349 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1350 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1351 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1352 const char *p = s;
1353 int r;
1354
1355 assert(pivot_root_new);
1356 assert(pivot_root_old);
1357
1358 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1359 if (r < 0)
1360 return r;
1361 if (r == 0)
1362 return -EINVAL;
1363
1364 if (isempty(p))
1365 root_old = NULL;
1366 else {
1367 root_old = strdup(p);
1368 if (!root_old)
1369 return -ENOMEM;
1370 }
1371
1372 if (!path_is_absolute(root_new))
1373 return -EINVAL;
1374 if (root_old && !path_is_absolute(root_old))
1375 return -EINVAL;
1376
1377 free_and_replace(*pivot_root_new, root_new);
1378 free_and_replace(*pivot_root_old, root_old);
1379
1380 return 0;
1381 }
1382
1383 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1384 _cleanup_free_ char *directory_pivot_root_new = NULL;
1385 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1386 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1387 bool remove_pivot_tmp = false;
1388 int r;
1389
1390 assert(directory);
1391
1392 if (!pivot_root_new)
1393 return 0;
1394
1395 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1396 * If pivot_root_old is NULL, the existing / disappears.
1397 * This requires a temporary directory, pivot_tmp, which is
1398 * not a child of either.
1399 *
1400 * This is typically used for OSTree-style containers, where
1401 * the root partition contains several sysroots which could be
1402 * run. Normally, one would be chosen by the bootloader and
1403 * pivoted to / by initramfs.
1404 *
1405 * For example, for an OSTree deployment, pivot_root_new
1406 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1407 * code doesn’t do the /var mount which OSTree expects: use
1408 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1409 *
1410 * So in the OSTree case, we’ll end up with something like:
1411 * - directory = /tmp/nspawn-root-123456
1412 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1413 * - pivot_root_old = /sysroot
1414 * - directory_pivot_root_new =
1415 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1416 * - pivot_tmp = /tmp/nspawn-pivot-123456
1417 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1418 *
1419 * Requires all file systems at directory and below to be mounted
1420 * MS_PRIVATE or MS_SLAVE so they can be moved.
1421 */
1422 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1423
1424 /* Remount directory_pivot_root_new to make it movable. */
1425 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1426 if (r < 0)
1427 goto done;
1428
1429 if (pivot_root_old) {
1430 if (!mkdtemp(pivot_tmp)) {
1431 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1432 goto done;
1433 }
1434
1435 remove_pivot_tmp = true;
1436 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1437
1438 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1439 if (r < 0)
1440 goto done;
1441
1442 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1443 if (r < 0)
1444 goto done;
1445
1446 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1447 if (r < 0)
1448 goto done;
1449 } else {
1450 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1451 if (r < 0)
1452 goto done;
1453 }
1454
1455 done:
1456 if (remove_pivot_tmp)
1457 (void) rmdir(pivot_tmp);
1458
1459 return r;
1460 }