]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2015 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <sys/mount.h>
22 #include <linux/magic.h>
23
24 #include "alloc-util.h"
25 #include "escape.h"
26 #include "fd-util.h"
27 #include "fileio.h"
28 #include "fs-util.h"
29 #include "label.h"
30 #include "mkdir.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "rm-rf.h"
36 #include "set.h"
37 #include "stat-util.h"
38 #include "string-util.h"
39 #include "strv.h"
40 #include "user-util.h"
41 #include "util.h"
42
43 CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
51 c = realloc_multiply(*l, (*n + 1), sizeof(CustomMount));
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62 }
63
64 void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
79 if (m->rm_rf_tmpdir) {
80 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
81 free(m->rm_rf_tmpdir);
82 }
83
84 strv_free(m->lower);
85 }
86
87 free(l);
88 }
89
90 static int custom_mount_compare(const void *a, const void *b) {
91 const CustomMount *x = a, *y = b;
92 int r;
93
94 r = path_compare(x->destination, y->destination);
95 if (r != 0)
96 return r;
97
98 if (x->type < y->type)
99 return -1;
100 if (x->type > y->type)
101 return 1;
102
103 return 0;
104 }
105
/* Checks whether 'p' is acceptable as a source path: a single leading '+' is skipped, and whatever follows
 * has to be an absolute path. */
static bool source_path_is_valid(const char *p) {
        assert(p);

        return path_is_absolute(*p == '+' ? p + 1 : p);
}
114
/* Returns a newly allocated version of 'source': if it starts with '+' the remainder is prefixed with 'dest'
 * via prefix_root(), otherwise it is duplicated as is. Returns NULL if 'source' is NULL or if the allocation
 * fails. */
static char *resolve_source_path(const char *dest, const char *source) {

        if (!source)
                return NULL;

        return source[0] == '+' ? prefix_root(dest, source + 1) : strdup(source);
}
125
126 int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
127 unsigned i;
128 int r;
129
130 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
132 * children. */
133
134 assert(l || n == 0);
135
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
138
139 for (i = 0; i < n; i++) {
140 CustomMount *m = l + i;
141
142 if (m->source) {
143 char *s;
144
145 s = resolve_source_path(dest, m->source);
146 if (!s)
147 return log_oom();
148
149 free(m->source);
150 m->source = s;
151 } else {
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
153
154 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m->rm_rf_tmpdir)
156 return log_oom();
157
158 if (!mkdtemp(m->rm_rf_tmpdir)) {
159 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
160 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
161 }
162
163 m->source = strjoin(m->rm_rf_tmpdir, "/src");
164 if (!m->source)
165 return log_oom();
166
167 if (mkdir(m->source, 0755) < 0)
168 return log_error_errno(errno, "Failed to create %s: %m", m->source);
169 }
170
171 if (m->type == CUSTOM_MOUNT_OVERLAY) {
172 char **j;
173
174 STRV_FOREACH(j, m->lower) {
175 char *s;
176
177 s = resolve_source_path(dest, *j);
178 if (!s)
179 return log_oom();
180
181 free(*j);
182 *j = s;
183 }
184
185 if (m->work_dir) {
186 char *s;
187
188 s = resolve_source_path(dest, m->work_dir);
189 if (!s)
190 return log_oom();
191
192 free(m->work_dir);
193 m->work_dir = s;
194 } else {
195 assert(m->source);
196
197 r = tempfn_random(m->source, NULL, &m->work_dir);
198 if (r < 0)
199 return log_error_errno(r, "Failed to acquire working directory: %m");
200 }
201
202 (void) mkdir_label(m->work_dir, 0700);
203 }
204 }
205
206 return 0;
207 }
208
209 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
210 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
211 const char *p = s;
212 CustomMount *m;
213 int r;
214
215 assert(l);
216 assert(n);
217
218 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
219 if (r < 0)
220 return r;
221 if (r == 0)
222 return -EINVAL;
223 if (r == 1) {
224 destination = strdup(source[0] == '+' ? source+1 : source);
225 if (!destination)
226 return -ENOMEM;
227 }
228 if (r == 2 && !isempty(p)) {
229 opts = strdup(p);
230 if (!opts)
231 return -ENOMEM;
232 }
233
234 if (isempty(source))
235 source = NULL;
236 else if (!source_path_is_valid(source))
237 return -EINVAL;
238
239 if (!path_is_absolute(destination))
240 return -EINVAL;
241
242 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
243 if (!m)
244 return -ENOMEM;
245
246 m->source = source;
247 m->destination = destination;
248 m->read_only = read_only;
249 m->options = opts;
250
251 source = destination = opts = NULL;
252 return 0;
253 }
254
255 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
256 _cleanup_free_ char *path = NULL, *opts = NULL;
257 const char *p = s;
258 CustomMount *m;
259 int r;
260
261 assert(l);
262 assert(n);
263 assert(s);
264
265 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
266 if (r < 0)
267 return r;
268 if (r == 0)
269 return -EINVAL;
270
271 if (isempty(p))
272 opts = strdup("mode=0755");
273 else
274 opts = strdup(p);
275 if (!opts)
276 return -ENOMEM;
277
278 if (!path_is_absolute(path))
279 return -EINVAL;
280
281 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
282 if (!m)
283 return -ENOMEM;
284
285 m->destination = path;
286 m->options = opts;
287
288 path = opts = NULL;
289 return 0;
290 }
291
292 int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
293 _cleanup_free_ char *upper = NULL, *destination = NULL;
294 _cleanup_strv_free_ char **lower = NULL;
295 CustomMount *m;
296 int k;
297
298 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
299 if (k < 0)
300 return k;
301 if (k < 2)
302 return -EADDRNOTAVAIL;
303 if (k == 2) {
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
306
307 if (!source_path_is_valid(lower[0]) ||
308 !source_path_is_valid(lower[1]))
309 return -EINVAL;
310
311 upper = lower[1];
312 lower[1] = NULL;
313
314 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
315 if (!destination)
316 return -ENOMEM;
317 } else {
318 char **i;
319
320 /* If more than two parameters are specified, the last one is the destination, the second to last one
321 * the "upper", and all before that the "lower" directories. */
322
323 destination = lower[k - 1];
324 upper = lower[k - 2];
325 lower[k - 2] = NULL;
326
327 STRV_FOREACH(i, lower)
328 if (!source_path_is_valid(*i))
329 return -EINVAL;
330
331 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
332 * in /var/tmp */
333 if (isempty(upper))
334 upper = NULL;
335 else if (!source_path_is_valid(upper))
336 return -EINVAL;
337
338 if (!path_is_absolute(destination))
339 return -EINVAL;
340 }
341
342 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
343 if (!m)
344 return -ENOMEM;
345
346 m->destination = destination;
347 m->source = upper;
348 m->lower = lower;
349 m->read_only = read_only;
350
351 upper = destination = NULL;
352 lower = NULL;
353
354 return 0;
355 }
356
357 static int tmpfs_patch_options(
358 const char *options,
359 bool userns,
360 uid_t uid_shift, uid_t uid_range,
361 bool patch_ids,
362 const char *selinux_apifs_context,
363 char **ret) {
364
365 char *buf = NULL;
366
367 if ((userns && uid_shift != 0) || patch_ids) {
368 assert(uid_shift != UID_INVALID);
369
370 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
371 options ?: "", options ? "," : "",
372 uid_shift, uid_shift) < 0)
373 return -ENOMEM;
374
375 options = buf;
376 }
377
378 #if HAVE_SELINUX
379 if (selinux_apifs_context) {
380 char *t;
381
382 t = strjoin(options ?: "", options ? "," : "",
383 "context=\"", selinux_apifs_context, "\"");
384 free(buf);
385 if (!t)
386 return -ENOMEM;
387
388 buf = t;
389 }
390 #endif
391
392 if (!buf && options) {
393 buf = strdup(options);
394 if (!buf)
395 return -ENOMEM;
396 }
397 *ret = buf;
398
399 return !!buf;
400 }
401
402 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
403 const char *full, *top, *x;
404 int r;
405 unsigned long extra_flags = 0;
406
407 top = prefix_roota(dest, "/sys");
408 r = path_check_fstype(top, SYSFS_MAGIC);
409 if (r < 0)
410 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
411 /* /sys might already be mounted as sysfs by the outer child in the
412 * !netns case. In this case, it's all good. Don't touch it because we
413 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
414 */
415 if (r > 0)
416 return 0;
417
418 full = prefix_roota(top, "/full");
419
420 (void) mkdir(full, 0755);
421
422 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
423 extra_flags |= MS_RDONLY;
424
425 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
426 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
427 if (r < 0)
428 return r;
429
430 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
431 _cleanup_free_ char *from = NULL, *to = NULL;
432
433 from = prefix_root(full, x);
434 if (!from)
435 return log_oom();
436
437 to = prefix_root(top, x);
438 if (!to)
439 return log_oom();
440
441 (void) mkdir(to, 0755);
442
443 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
444 if (r < 0)
445 return r;
446
447 r = mount_verbose(LOG_ERR, NULL, to, NULL,
448 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
449 if (r < 0)
450 return r;
451 }
452
453 r = umount_verbose(full);
454 if (r < 0)
455 return r;
456
457 if (rmdir(full) < 0)
458 return log_error_errno(errno, "Failed to remove %s: %m", full);
459
460 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
461 * remount /sys read-only.
462 */
463 if (cg_ns_supported()) {
464 x = prefix_roota(top, "/fs/cgroup");
465 (void) mkdir_p(x, 0755);
466 }
467
468 return mount_verbose(LOG_ERR, NULL, top, NULL,
469 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
470 }
471
472 static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
473 int r;
474
475 assert(path);
476
477 r = mkdir(path, mode);
478 if (r < 0 && errno != EEXIST)
479 return -errno;
480
481 if ((mask & MOUNT_USE_USERNS) == 0)
482 return 0;
483
484 if (mask & MOUNT_IN_USERNS)
485 return 0;
486
487 r = lchown(path, uid_shift, uid_shift);
488 if (r < 0)
489 return -errno;
490
491 return 0;
492 }
493
494 static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
495 const char *p, *e;
496 int r;
497
498 assert(path);
499
500 if (prefix && !path_startswith(path, prefix))
501 return -ENOTDIR;
502
503 /* create every parent directory in the path, except the last component */
504 p = path + strspn(path, "/");
505 for (;;) {
506 char t[strlen(path) + 1];
507
508 e = p + strcspn(p, "/");
509 p = e + strspn(e, "/");
510
511 /* Is this the last component? If so, then we're done */
512 if (*p == 0)
513 break;
514
515 memcpy(t, path, e - path);
516 t[e-path] = 0;
517
518 if (prefix && path_startswith(prefix, t))
519 continue;
520
521 r = mkdir_userns(t, mode, mask, uid_shift);
522 if (r < 0)
523 return r;
524 }
525
526 return mkdir_userns(path, mode, mask, uid_shift);
527 }
528
529 int mount_all(const char *dest,
530 MountSettingsMask mount_settings,
531 uid_t uid_shift, uid_t uid_range,
532 const char *selinux_apifs_context) {
533
534 typedef struct MountPoint {
535 const char *what;
536 const char *where;
537 const char *type;
538 const char *options;
539 unsigned long flags;
540 MountSettingsMask mount_settings;
541 } MountPoint;
542
543 static const MountPoint mount_table[] = {
544 /* inner child mounts */
545 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
546 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
547 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
548 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
549 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
550 { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
551
552 /* outer child mounts */
553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
554 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
555 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
556 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
557
558 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
559 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
560 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
561 #if HAVE_SELINUX
562 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
563 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
564 #endif
565 };
566
567 unsigned k;
568 int r;
569 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
570 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
571 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
572 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
573
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL, *options = NULL;
576 const char *o;
577 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
578
579 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
580 continue;
581
582 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
583 continue;
584
585 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
586 continue;
587
588 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
589 if (r < 0)
590 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
591
592 r = path_is_mount_point(where, NULL, 0);
593 if (r < 0 && r != -ENOENT)
594 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
595
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && r > 0)
598 continue;
599
600 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
601 if (r < 0 && r != -EEXIST) {
602 if (fatal && r != -EROFS)
603 return log_error_errno(r, "Failed to create directory %s: %m", where);
604
605 log_debug_errno(r, "Failed to create directory %s: %m", where);
606 /* If we failed mkdir() or chown() due to the root
607 * directory being read only, attempt to mount this fs
608 * anyway and let mount_verbose log any errors */
609 if (r != -EROFS)
610 continue;
611 }
612
613 o = mount_table[k].options;
614 if (streq_ptr(mount_table[k].type, "tmpfs")) {
615 if (in_userns)
616 r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
617 else
618 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
619 if (r < 0)
620 return log_oom();
621 if (r > 0)
622 o = options;
623 }
624
625 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
626 mount_table[k].what,
627 where,
628 mount_table[k].type,
629 mount_table[k].flags,
630 o);
631 if (r < 0 && fatal)
632 return r;
633 }
634
635 return 0;
636 }
637
638 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
639 const char *p = options;
640 unsigned long flags = *mount_flags;
641 char *opts = NULL;
642 int r;
643
644 assert(options);
645
646 for (;;) {
647 _cleanup_free_ char *word = NULL;
648
649 r = extract_first_word(&p, &word, ",", 0);
650 if (r < 0)
651 return log_error_errno(r, "Failed to extract mount option: %m");
652 if (r == 0)
653 break;
654
655 if (streq(word, "rbind"))
656 flags |= MS_REC;
657 else if (streq(word, "norbind"))
658 flags &= ~MS_REC;
659 else {
660 log_error("Invalid bind mount option: %s", word);
661 return -EINVAL;
662 }
663 }
664
665 *mount_flags = flags;
666 /* in the future mount_opts will hold string options for mount(2) */
667 *mount_opts = opts;
668
669 return 0;
670 }
671
672 static int mount_bind(const char *dest, CustomMount *m) {
673
674 _cleanup_free_ char *mount_opts = NULL, *where = NULL;
675 unsigned long mount_flags = MS_BIND | MS_REC;
676 struct stat source_st, dest_st;
677 int r;
678
679 assert(dest);
680 assert(m);
681
682 if (m->options) {
683 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
684 if (r < 0)
685 return r;
686 }
687
688 if (stat(m->source, &source_st) < 0)
689 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
690
691 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
692 if (r < 0)
693 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
694 if (r > 0) { /* Path exists already? */
695
696 if (stat(where, &dest_st) < 0)
697 return log_error_errno(errno, "Failed to stat %s: %m", where);
698
699 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
700 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
701 return -EINVAL;
702 }
703
704 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
705 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
706 return -EINVAL;
707 }
708
709 } else { /* Path doesn't exist yet? */
710 r = mkdir_parents_label(where, 0755);
711 if (r < 0)
712 return log_error_errno(r, "Failed to make parents of %s: %m", where);
713
714 /* Create the mount point. Any non-directory file can be
715 * mounted on any non-directory file (regular, fifo, socket,
716 * char, block).
717 */
718 if (S_ISDIR(source_st.st_mode))
719 r = mkdir_label(where, 0755);
720 else
721 r = touch(where);
722 if (r < 0)
723 return log_error_errno(r, "Failed to create mount point %s: %m", where);
724
725 }
726
727 r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
728 if (r < 0)
729 return r;
730
731 if (m->read_only) {
732 r = bind_remount_recursive(where, true, NULL);
733 if (r < 0)
734 return log_error_errno(r, "Read-only bind mount failed: %m");
735 }
736
737 return 0;
738 }
739
740 static int mount_tmpfs(
741 const char *dest,
742 CustomMount *m,
743 bool userns, uid_t uid_shift, uid_t uid_range,
744 const char *selinux_apifs_context) {
745
746 const char *options;
747 _cleanup_free_ char *buf = NULL, *where = NULL;
748 int r;
749
750 assert(dest);
751 assert(m);
752
753 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
754 if (r < 0)
755 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
756 if (r == 0) { /* Doesn't exist yet? */
757 r = mkdir_p_label(where, 0755);
758 if (r < 0)
759 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
760 }
761
762 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
763 if (r < 0)
764 return log_oom();
765 options = r > 0 ? buf : m->options;
766
767 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
768 }
769
770 static char *joined_and_escaped_lower_dirs(char **lower) {
771 _cleanup_strv_free_ char **sv = NULL;
772
773 sv = strv_copy(lower);
774 if (!sv)
775 return NULL;
776
777 strv_reverse(sv);
778
779 if (!strv_shell_escape(sv, ",:"))
780 return NULL;
781
782 return strv_join(sv, ":");
783 }
784
785 static int mount_overlay(const char *dest, CustomMount *m) {
786
787 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
788 const char *options;
789 int r;
790
791 assert(dest);
792 assert(m);
793
794 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
795 if (r < 0)
796 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
797 if (r == 0) { /* Doesn't exist yet? */
798 r = mkdir_label(where, 0755);
799 if (r < 0)
800 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
801 }
802
803 (void) mkdir_p_label(m->source, 0755);
804
805 lower = joined_and_escaped_lower_dirs(m->lower);
806 if (!lower)
807 return log_oom();
808
809 escaped_source = shell_escape(m->source, ",:");
810 if (!escaped_source)
811 return log_oom();
812
813 if (m->read_only)
814 options = strjoina("lowerdir=", escaped_source, ":", lower);
815 else {
816 _cleanup_free_ char *escaped_work_dir = NULL;
817
818 escaped_work_dir = shell_escape(m->work_dir, ",:");
819 if (!escaped_work_dir)
820 return log_oom();
821
822 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
823 }
824
825 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
826 }
827
828 int mount_custom(
829 const char *dest,
830 CustomMount *mounts, unsigned n,
831 bool userns, uid_t uid_shift, uid_t uid_range,
832 const char *selinux_apifs_context) {
833
834 unsigned i;
835 int r;
836
837 assert(dest);
838
839 for (i = 0; i < n; i++) {
840 CustomMount *m = mounts + i;
841
842 switch (m->type) {
843
844 case CUSTOM_MOUNT_BIND:
845 r = mount_bind(dest, m);
846 break;
847
848 case CUSTOM_MOUNT_TMPFS:
849 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
850 break;
851
852 case CUSTOM_MOUNT_OVERLAY:
853 r = mount_overlay(dest, m);
854 break;
855
856 default:
857 assert_not_reached("Unknown custom mount type");
858 }
859
860 if (r < 0)
861 return r;
862 }
863
864 return 0;
865 }
866
867 /* Retrieve existing subsystems. This function is called in a new cgroup
868 * namespace.
869 */
870 static int get_controllers(Set *subsystems) {
871 _cleanup_fclose_ FILE *f = NULL;
872 char line[LINE_MAX];
873
874 assert(subsystems);
875
876 f = fopen("/proc/self/cgroup", "re");
877 if (!f)
878 return errno == ENOENT ? -ESRCH : -errno;
879
880 FOREACH_LINE(line, f, return -errno) {
881 int r;
882 char *e, *l, *p;
883
884 l = strchr(line, ':');
885 if (!l)
886 continue;
887
888 l++;
889 e = strchr(l, ':');
890 if (!e)
891 continue;
892
893 *e = 0;
894
895 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
896 continue;
897
898 p = strdup(l);
899 if (!p)
900 return -ENOMEM;
901
902 r = set_consume(subsystems, p);
903 if (r < 0)
904 return r;
905 }
906
907 return 0;
908 }
909
910 static int mount_legacy_cgroup_hierarchy(
911 const char *dest,
912 const char *controller,
913 const char *hierarchy,
914 bool read_only) {
915
916 const char *to, *fstype, *opts;
917 int r;
918
919 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
920
921 r = path_is_mount_point(to, dest, 0);
922 if (r < 0 && r != -ENOENT)
923 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
924 if (r > 0)
925 return 0;
926
927 mkdir_p(to, 0755);
928
929 /* The superblock mount options of the mount point need to be
930 * identical to the hosts', and hence writable... */
931 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
932 fstype = "cgroup2";
933 opts = NULL;
934 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
935 fstype = "cgroup";
936 opts = "none,name=systemd,xattr";
937 } else {
938 fstype = "cgroup";
939 opts = controller;
940 }
941
942 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
943 if (r < 0)
944 return r;
945
946 /* ... hence let's only make the bind mount read-only, not the superblock. */
947 if (read_only) {
948 r = mount_verbose(LOG_ERR, NULL, to, NULL,
949 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
950 if (r < 0)
951 return r;
952 }
953
954 return 1;
955 }
956
957 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
958 static int mount_legacy_cgns_supported(
959 const char *dest,
960 CGroupUnified unified_requested,
961 bool userns,
962 uid_t uid_shift,
963 uid_t uid_range,
964 const char *selinux_apifs_context) {
965
966 _cleanup_set_free_free_ Set *controllers = NULL;
967 const char *cgroup_root = "/sys/fs/cgroup", *c;
968 int r;
969
970 (void) mkdir_p(cgroup_root, 0755);
971
972 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
973 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
974 if (r < 0)
975 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
976 if (r == 0) {
977 _cleanup_free_ char *options = NULL;
978
979 /* When cgroup namespaces are enabled and user namespaces are
980 * used then the mount of the cgroupfs is done *inside* the new
981 * user namespace. We're root in the new user namespace and the
982 * kernel will happily translate our uid/gid to the correct
983 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
984 * pass uid 0 and not uid_shift to tmpfs_patch_options().
985 */
986 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
987 if (r < 0)
988 return log_oom();
989
990 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
991 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
992 if (r < 0)
993 return r;
994 }
995
996 r = cg_all_unified();
997 if (r < 0)
998 return r;
999 if (r > 0)
1000 goto skip_controllers;
1001
1002 controllers = set_new(&string_hash_ops);
1003 if (!controllers)
1004 return log_oom();
1005
1006 r = get_controllers(controllers);
1007 if (r < 0)
1008 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1009
1010 for (;;) {
1011 _cleanup_free_ const char *controller = NULL;
1012
1013 controller = set_steal_first(controllers);
1014 if (!controller)
1015 break;
1016
1017 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
1018 if (r < 0)
1019 return r;
1020
1021 /* When multiple hierarchies are co-mounted, make their
1022 * constituting individual hierarchies a symlink to the
1023 * co-mount.
1024 */
1025 c = controller;
1026 for (;;) {
1027 _cleanup_free_ char *target = NULL, *tok = NULL;
1028
1029 r = extract_first_word(&c, &tok, ",", 0);
1030 if (r < 0)
1031 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
1032 if (r == 0)
1033 break;
1034
1035 target = prefix_root("/sys/fs/cgroup", tok);
1036 if (!target)
1037 return log_oom();
1038
1039 if (streq(controller, tok))
1040 break;
1041
1042 r = symlink_idempotent(controller, target);
1043 if (r == -EINVAL)
1044 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1045 if (r < 0)
1046 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1047 }
1048 }
1049
1050 skip_controllers:
1051 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1052 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1053 if (r < 0)
1054 return r;
1055 }
1056
1057 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1058 if (r < 0)
1059 return r;
1060
1061 if (!userns)
1062 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1063 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1064
1065 return 0;
1066 }
1067
1068 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1069 static int mount_legacy_cgns_unsupported(
1070 const char *dest,
1071 CGroupUnified unified_requested,
1072 bool userns,
1073 uid_t uid_shift,
1074 uid_t uid_range,
1075 const char *selinux_apifs_context) {
1076
1077 _cleanup_set_free_free_ Set *controllers = NULL;
1078 const char *cgroup_root;
1079 int r;
1080
1081 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1082
1083 (void) mkdir_p(cgroup_root, 0755);
1084
1085 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1086 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
1087 if (r < 0)
1088 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1089 if (r == 0) {
1090 _cleanup_free_ char *options = NULL;
1091
1092 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
1093 if (r < 0)
1094 return log_oom();
1095
1096 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1097 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1098 if (r < 0)
1099 return r;
1100 }
1101
1102 r = cg_all_unified();
1103 if (r < 0)
1104 return r;
1105 if (r > 0)
1106 goto skip_controllers;
1107
1108 controllers = set_new(&string_hash_ops);
1109 if (!controllers)
1110 return log_oom();
1111
1112 r = cg_kernel_controllers(controllers);
1113 if (r < 0)
1114 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1115
1116 for (;;) {
1117 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1118
1119 controller = set_steal_first(controllers);
1120 if (!controller)
1121 break;
1122
1123 origin = prefix_root("/sys/fs/cgroup/", controller);
1124 if (!origin)
1125 return log_oom();
1126
1127 r = readlink_malloc(origin, &combined);
1128 if (r == -EINVAL) {
1129 /* Not a symbolic link, but directly a single cgroup hierarchy */
1130
1131 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
1132 if (r < 0)
1133 return r;
1134
1135 } else if (r < 0)
1136 return log_error_errno(r, "Failed to read link %s: %m", origin);
1137 else {
1138 _cleanup_free_ char *target = NULL;
1139
1140 target = prefix_root(dest, origin);
1141 if (!target)
1142 return log_oom();
1143
1144 /* A symbolic link, a combination of controllers in one hierarchy */
1145
1146 if (!filename_is_valid(combined)) {
1147 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1148 continue;
1149 }
1150
1151 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
1152 if (r < 0)
1153 return r;
1154
1155 r = symlink_idempotent(combined, target);
1156 if (r == -EINVAL)
1157 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1158 if (r < 0)
1159 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1160 }
1161 }
1162
1163 skip_controllers:
1164 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1165 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1166 if (r < 0)
1167 return r;
1168 }
1169
1170 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1171 if (r < 0)
1172 return r;
1173
1174 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1175 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1176 }
1177
1178 static int mount_unified_cgroups(const char *dest) {
1179 const char *p;
1180 int r;
1181
1182 assert(dest);
1183
1184 p = prefix_roota(dest, "/sys/fs/cgroup");
1185
1186 (void) mkdir_p(p, 0755);
1187
1188 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
1189 if (r < 0)
1190 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1191 if (r > 0) {
1192 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
1193 if (access(p, F_OK) >= 0)
1194 return 0;
1195 if (errno != ENOENT)
1196 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1197
1198 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1199 return -EINVAL;
1200 }
1201
1202 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1203 }
1204
1205 int mount_cgroups(
1206 const char *dest,
1207 CGroupUnified unified_requested,
1208 bool userns,
1209 uid_t uid_shift,
1210 uid_t uid_range,
1211 const char *selinux_apifs_context,
1212 bool use_cgns) {
1213
1214 if (unified_requested >= CGROUP_UNIFIED_ALL)
1215 return mount_unified_cgroups(dest);
1216 else if (use_cgns)
1217 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1218
1219 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1220 }
1221
1222 static int mount_systemd_cgroup_writable_one(const char *systemd_own, const char *systemd_root)
1223 {
1224 int r;
1225
1226 /* Make our own cgroup a (writable) bind mount */
1227 r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
1228 if (r < 0)
1229 return r;
1230
1231 /* And then remount the systemd cgroup root read-only */
1232 return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
1233 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
1234 }
1235
1236 int mount_systemd_cgroup_writable(
1237 const char *dest,
1238 CGroupUnified unified_requested) {
1239
1240 _cleanup_free_ char *own_cgroup_path = NULL;
1241 int r;
1242
1243 assert(dest);
1244
1245 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1248
1249 /* If we are living in the top-level, then there's nothing to do... */
1250 if (path_equal(own_cgroup_path, "/"))
1251 return 0;
1252
1253 if (unified_requested >= CGROUP_UNIFIED_ALL)
1254 return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup", own_cgroup_path),
1255 prefix_roota(dest, "/sys/fs/cgroup"));
1256
1257 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1258 r = mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/unified", own_cgroup_path),
1259 prefix_roota(dest, "/sys/fs/cgroup/unified"));
1260 if (r < 0)
1261 return r;
1262 }
1263
1264 return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path),
1265 prefix_roota(dest, "/sys/fs/cgroup/systemd"));
1266 }
1267
1268 int setup_volatile_state(
1269 const char *directory,
1270 VolatileMode mode,
1271 bool userns, uid_t uid_shift, uid_t uid_range,
1272 const char *selinux_apifs_context) {
1273
1274 _cleanup_free_ char *buf = NULL;
1275 const char *p, *options;
1276 int r;
1277
1278 assert(directory);
1279
1280 if (mode != VOLATILE_STATE)
1281 return 0;
1282
1283 /* --volatile=state means we simply overmount /var
1284 with a tmpfs, and the rest read-only. */
1285
1286 r = bind_remount_recursive(directory, true, NULL);
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1289
1290 p = prefix_roota(directory, "/var");
1291 r = mkdir(p, 0755);
1292 if (r < 0 && errno != EEXIST)
1293 return log_error_errno(errno, "Failed to create %s: %m", directory);
1294
1295 options = "mode=755";
1296 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1297 if (r < 0)
1298 return log_oom();
1299 if (r > 0)
1300 options = buf;
1301
1302 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
1303 }
1304
1305 int setup_volatile(
1306 const char *directory,
1307 VolatileMode mode,
1308 bool userns, uid_t uid_shift, uid_t uid_range,
1309 const char *selinux_apifs_context) {
1310
1311 bool tmpfs_mounted = false, bind_mounted = false;
1312 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1313 _cleanup_free_ char *buf = NULL;
1314 const char *f, *t, *options;
1315 int r;
1316
1317 assert(directory);
1318
1319 if (mode != VOLATILE_YES)
1320 return 0;
1321
1322 /* --volatile=yes means we mount a tmpfs to the root dir, and
1323 the original /usr to use inside it, and that read-only. */
1324
1325 if (!mkdtemp(template))
1326 return log_error_errno(errno, "Failed to create temporary directory: %m");
1327
1328 options = "mode=755";
1329 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1330 if (r < 0)
1331 return log_oom();
1332 if (r > 0)
1333 options = buf;
1334
1335 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1336 if (r < 0)
1337 goto fail;
1338
1339 tmpfs_mounted = true;
1340
1341 f = prefix_roota(directory, "/usr");
1342 t = prefix_roota(template, "/usr");
1343
1344 r = mkdir(t, 0755);
1345 if (r < 0 && errno != EEXIST) {
1346 r = log_error_errno(errno, "Failed to create %s: %m", t);
1347 goto fail;
1348 }
1349
1350 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1351 if (r < 0)
1352 goto fail;
1353
1354 bind_mounted = true;
1355
1356 r = bind_remount_recursive(t, true, NULL);
1357 if (r < 0) {
1358 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1359 goto fail;
1360 }
1361
1362 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1363 if (r < 0)
1364 goto fail;
1365
1366 (void) rmdir(template);
1367
1368 return 0;
1369
1370 fail:
1371 if (bind_mounted)
1372 (void) umount_verbose(t);
1373
1374 if (tmpfs_mounted)
1375 (void) umount_verbose(template);
1376 (void) rmdir(template);
1377 return r;
1378 }
1379
1380 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1381 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1382 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1383 const char *p = s;
1384 int r;
1385
1386 assert(pivot_root_new);
1387 assert(pivot_root_old);
1388
1389 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1390 if (r < 0)
1391 return r;
1392 if (r == 0)
1393 return -EINVAL;
1394
1395 if (isempty(p))
1396 root_old = NULL;
1397 else {
1398 root_old = strdup(p);
1399 if (!root_old)
1400 return -ENOMEM;
1401 }
1402
1403 if (!path_is_absolute(root_new))
1404 return -EINVAL;
1405 if (root_old && !path_is_absolute(root_old))
1406 return -EINVAL;
1407
1408 free_and_replace(*pivot_root_new, root_new);
1409 free_and_replace(*pivot_root_old, root_old);
1410
1411 return 0;
1412 }
1413
1414 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1415 _cleanup_free_ char *directory_pivot_root_new = NULL;
1416 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1417 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1418 bool remove_pivot_tmp = false;
1419 int r;
1420
1421 assert(directory);
1422
1423 if (!pivot_root_new)
1424 return 0;
1425
1426 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1427 * If pivot_root_old is NULL, the existing / disappears.
1428 * This requires a temporary directory, pivot_tmp, which is
1429 * not a child of either.
1430 *
1431 * This is typically used for OSTree-style containers, where
1432 * the root partition contains several sysroots which could be
1433 * run. Normally, one would be chosen by the bootloader and
1434 * pivoted to / by initramfs.
1435 *
1436 * For example, for an OSTree deployment, pivot_root_new
1437 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1438 * code doesn’t do the /var mount which OSTree expects: use
1439 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1440 *
1441 * So in the OSTree case, we’ll end up with something like:
1442 * - directory = /tmp/nspawn-root-123456
1443 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1444 * - pivot_root_old = /sysroot
1445 * - directory_pivot_root_new =
1446 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1447 * - pivot_tmp = /tmp/nspawn-pivot-123456
1448 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1449 *
1450 * Requires all file systems at directory and below to be mounted
1451 * MS_PRIVATE or MS_SLAVE so they can be moved.
1452 */
1453 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1454
1455 /* Remount directory_pivot_root_new to make it movable. */
1456 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1457 if (r < 0)
1458 goto done;
1459
1460 if (pivot_root_old) {
1461 if (!mkdtemp(pivot_tmp)) {
1462 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1463 goto done;
1464 }
1465
1466 remove_pivot_tmp = true;
1467 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1468
1469 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1470 if (r < 0)
1471 goto done;
1472
1473 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1474 if (r < 0)
1475 goto done;
1476
1477 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1478 if (r < 0)
1479 goto done;
1480 } else {
1481 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1482 if (r < 0)
1483 goto done;
1484 }
1485
1486 done:
1487 if (remove_pivot_tmp)
1488 (void) rmdir(pivot_tmp);
1489
1490 return r;
1491 }