]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
Replace empty ternary with helper method
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2015 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <sys/mount.h>
22 #include <linux/magic.h>
23
24 #include "alloc-util.h"
25 #include "escape.h"
26 #include "fd-util.h"
27 #include "fileio.h"
28 #include "fs-util.h"
29 #include "label.h"
30 #include "mkdir.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "rm-rf.h"
36 #include "set.h"
37 #include "stat-util.h"
38 #include "string-util.h"
39 #include "strv.h"
40 #include "user-util.h"
41 #include "util.h"
42
43 CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
51 c = realloc_multiply(*l, (*n + 1), sizeof(CustomMount));
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62 }
63
64 void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
79 if (m->rm_rf_tmpdir) {
80 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
81 free(m->rm_rf_tmpdir);
82 }
83
84 strv_free(m->lower);
85 }
86
87 free(l);
88 }
89
90 static int custom_mount_compare(const void *a, const void *b) {
91 const CustomMount *x = a, *y = b;
92 int r;
93
94 r = path_compare(x->destination, y->destination);
95 if (r != 0)
96 return r;
97
98 if (x->type < y->type)
99 return -1;
100 if (x->type > y->type)
101 return 1;
102
103 return 0;
104 }
105
/* Checks whether 'p' is acceptable as a mount source: an absolute path, optionally prefixed with a single
 * "+" character. */
static bool source_path_is_valid(const char *p) {
        const char *path;

        assert(p);

        /* Skip over the optional "+" prefix before checking the path proper */
        path = *p == '+' ? p + 1 : p;

        return path_is_absolute(path);
}
114
/* Turns a source specification into a newly allocated path. A source starting with "+" has the prefix
 * stripped and is then prefixed with 'dest' via prefix_root(); any other source is simply duplicated.
 * Returns NULL if 'source' is NULL or if the allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {
        if (!source)
                return NULL;

        return source[0] == '+' ? prefix_root(dest, source + 1) : strdup(source);
}
125
126 int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
127 unsigned i;
128 int r;
129
130 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
131 * parent process, so that we know the temporary directories to remove on exit before we fork off the
132 * children. */
133
134 assert(l || n == 0);
135
136 /* Order the custom mounts, and make sure we have a working directory */
137 qsort_safe(l, n, sizeof(CustomMount), custom_mount_compare);
138
139 for (i = 0; i < n; i++) {
140 CustomMount *m = l + i;
141
142 if (m->source) {
143 char *s;
144
145 s = resolve_source_path(dest, m->source);
146 if (!s)
147 return log_oom();
148
149 free(m->source);
150 m->source = s;
151 } else {
152 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
153
154 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
155 if (!m->rm_rf_tmpdir)
156 return log_oom();
157
158 if (!mkdtemp(m->rm_rf_tmpdir)) {
159 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
160 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
161 }
162
163 m->source = strjoin(m->rm_rf_tmpdir, "/src");
164 if (!m->source)
165 return log_oom();
166
167 if (mkdir(m->source, 0755) < 0)
168 return log_error_errno(errno, "Failed to create %s: %m", m->source);
169 }
170
171 if (m->type == CUSTOM_MOUNT_OVERLAY) {
172 char **j;
173
174 STRV_FOREACH(j, m->lower) {
175 char *s;
176
177 s = resolve_source_path(dest, *j);
178 if (!s)
179 return log_oom();
180
181 free(*j);
182 *j = s;
183 }
184
185 if (m->work_dir) {
186 char *s;
187
188 s = resolve_source_path(dest, m->work_dir);
189 if (!s)
190 return log_oom();
191
192 free(m->work_dir);
193 m->work_dir = s;
194 } else {
195 assert(m->source);
196
197 r = tempfn_random(m->source, NULL, &m->work_dir);
198 if (r < 0)
199 return log_error_errno(r, "Failed to acquire working directory: %m");
200 }
201
202 (void) mkdir_label(m->work_dir, 0700);
203 }
204 }
205
206 return 0;
207 }
208
209 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
210 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
211 const char *p = s;
212 CustomMount *m;
213 int r;
214
215 assert(l);
216 assert(n);
217
218 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
219 if (r < 0)
220 return r;
221 if (r == 0)
222 return -EINVAL;
223 if (r == 1) {
224 destination = strdup(source[0] == '+' ? source+1 : source);
225 if (!destination)
226 return -ENOMEM;
227 }
228 if (r == 2 && !isempty(p)) {
229 opts = strdup(p);
230 if (!opts)
231 return -ENOMEM;
232 }
233
234 if (isempty(source))
235 source = NULL;
236 else if (!source_path_is_valid(source))
237 return -EINVAL;
238
239 if (!path_is_absolute(destination))
240 return -EINVAL;
241
242 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
243 if (!m)
244 return -ENOMEM;
245
246 m->source = source;
247 m->destination = destination;
248 m->read_only = read_only;
249 m->options = opts;
250
251 source = destination = opts = NULL;
252 return 0;
253 }
254
255 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
256 _cleanup_free_ char *path = NULL, *opts = NULL;
257 const char *p = s;
258 CustomMount *m;
259 int r;
260
261 assert(l);
262 assert(n);
263 assert(s);
264
265 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
266 if (r < 0)
267 return r;
268 if (r == 0)
269 return -EINVAL;
270
271 if (isempty(p))
272 opts = strdup("mode=0755");
273 else
274 opts = strdup(p);
275 if (!opts)
276 return -ENOMEM;
277
278 if (!path_is_absolute(path))
279 return -EINVAL;
280
281 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
282 if (!m)
283 return -ENOMEM;
284
285 m->destination = path;
286 m->options = opts;
287
288 path = opts = NULL;
289 return 0;
290 }
291
292 int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
293 _cleanup_free_ char *upper = NULL, *destination = NULL;
294 _cleanup_strv_free_ char **lower = NULL;
295 CustomMount *m;
296 int k;
297
298 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
299 if (k < 0)
300 return k;
301 if (k < 2)
302 return -EADDRNOTAVAIL;
303 if (k == 2) {
304 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
305 * we'll also define the destination mount point the same as the upper. */
306
307 if (!source_path_is_valid(lower[0]) ||
308 !source_path_is_valid(lower[1]))
309 return -EINVAL;
310
311 upper = lower[1];
312 lower[1] = NULL;
313
314 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
315 if (!destination)
316 return -ENOMEM;
317 } else {
318 char **i;
319
320 /* If more than two parameters are specified, the last one is the destination, the second to last one
321 * the "upper", and all before that the "lower" directories. */
322
323 destination = lower[k - 1];
324 upper = lower[k - 2];
325 lower[k - 2] = NULL;
326
327 STRV_FOREACH(i, lower)
328 if (!source_path_is_valid(*i))
329 return -EINVAL;
330
331 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
332 * in /var/tmp */
333 if (isempty(upper))
334 upper = NULL;
335 else if (!source_path_is_valid(upper))
336 return -EINVAL;
337
338 if (!path_is_absolute(destination))
339 return -EINVAL;
340 }
341
342 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
343 if (!m)
344 return -ENOMEM;
345
346 m->destination = destination;
347 m->source = upper;
348 m->lower = lower;
349 m->read_only = read_only;
350
351 upper = destination = NULL;
352 lower = NULL;
353
354 return 0;
355 }
356
357 static int tmpfs_patch_options(
358 const char *options,
359 bool userns,
360 uid_t uid_shift, uid_t uid_range,
361 bool patch_ids,
362 const char *selinux_apifs_context,
363 char **ret) {
364
365 char *buf = NULL;
366
367 if ((userns && uid_shift != 0) || patch_ids) {
368 assert(uid_shift != UID_INVALID);
369
370 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
371 strempty(options), options ? "," : "",
372 uid_shift, uid_shift) < 0)
373 return -ENOMEM;
374
375 options = buf;
376 }
377
378 #if HAVE_SELINUX
379 if (selinux_apifs_context) {
380 char *t;
381
382 t = strjoin(strempty(options), options ? "," : "",
383 "context=\"", selinux_apifs_context, "\"");
384 free(buf);
385 if (!t)
386 return -ENOMEM;
387
388 buf = t;
389 }
390 #endif
391
392 if (!buf && options) {
393 buf = strdup(options);
394 if (!buf)
395 return -ENOMEM;
396 }
397 *ret = buf;
398
399 return !!buf;
400 }
401
402 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
403 const char *full, *top, *x;
404 int r;
405 unsigned long extra_flags = 0;
406
407 top = prefix_roota(dest, "/sys");
408 r = path_check_fstype(top, SYSFS_MAGIC);
409 if (r < 0)
410 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
411 /* /sys might already be mounted as sysfs by the outer child in the
412 * !netns case. In this case, it's all good. Don't touch it because we
413 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
414 */
415 if (r > 0)
416 return 0;
417
418 full = prefix_roota(top, "/full");
419
420 (void) mkdir(full, 0755);
421
422 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
423 extra_flags |= MS_RDONLY;
424
425 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
426 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
427 if (r < 0)
428 return r;
429
430 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
431 _cleanup_free_ char *from = NULL, *to = NULL;
432
433 from = prefix_root(full, x);
434 if (!from)
435 return log_oom();
436
437 to = prefix_root(top, x);
438 if (!to)
439 return log_oom();
440
441 (void) mkdir(to, 0755);
442
443 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
444 if (r < 0)
445 return r;
446
447 r = mount_verbose(LOG_ERR, NULL, to, NULL,
448 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
449 if (r < 0)
450 return r;
451 }
452
453 r = umount_verbose(full);
454 if (r < 0)
455 return r;
456
457 if (rmdir(full) < 0)
458 return log_error_errno(errno, "Failed to remove %s: %m", full);
459
460 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
461 * remount /sys read-only.
462 */
463 if (cg_ns_supported()) {
464 x = prefix_roota(top, "/fs/cgroup");
465 (void) mkdir_p(x, 0755);
466 }
467
468 return mount_verbose(LOG_ERR, NULL, top, NULL,
469 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
470 }
471
472 static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
473 int r;
474
475 assert(path);
476
477 r = mkdir(path, mode);
478 if (r < 0 && errno != EEXIST)
479 return -errno;
480
481 if ((mask & MOUNT_USE_USERNS) == 0)
482 return 0;
483
484 if (mask & MOUNT_IN_USERNS)
485 return 0;
486
487 r = lchown(path, uid_shift, uid_shift);
488 if (r < 0)
489 return -errno;
490
491 return 0;
492 }
493
494 static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) {
495 const char *p, *e;
496 int r;
497
498 assert(path);
499
500 if (prefix && !path_startswith(path, prefix))
501 return -ENOTDIR;
502
503 /* create every parent directory in the path, except the last component */
504 p = path + strspn(path, "/");
505 for (;;) {
506 char t[strlen(path) + 1];
507
508 e = p + strcspn(p, "/");
509 p = e + strspn(e, "/");
510
511 /* Is this the last component? If so, then we're done */
512 if (*p == 0)
513 break;
514
515 memcpy(t, path, e - path);
516 t[e-path] = 0;
517
518 if (prefix && path_startswith(prefix, t))
519 continue;
520
521 r = mkdir_userns(t, mode, mask, uid_shift);
522 if (r < 0)
523 return r;
524 }
525
526 return mkdir_userns(path, mode, mask, uid_shift);
527 }
528
529 int mount_all(const char *dest,
530 MountSettingsMask mount_settings,
531 uid_t uid_shift, uid_t uid_range,
532 const char *selinux_apifs_context) {
533
534 typedef struct MountPoint {
535 const char *what;
536 const char *where;
537 const char *type;
538 const char *options;
539 unsigned long flags;
540 MountSettingsMask mount_settings;
541 } MountPoint;
542
543 static const MountPoint mount_table[] = {
544 /* inner child mounts */
545 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
546 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
547 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
548 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
549 { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
550 { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
551
552 /* outer child mounts */
553 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
554 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
555 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
556 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
557
558 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
559 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
560 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
561 #if HAVE_SELINUX
562 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
563 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
564 #endif
565 };
566
567 unsigned k;
568 int r;
569 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
570 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
571 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
572 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
573
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL, *options = NULL;
576 const char *o;
577 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
578
579 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
580 continue;
581
582 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
583 continue;
584
585 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
586 continue;
587
588 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
589 if (r < 0)
590 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
591
592 r = path_is_mount_point(where, NULL, 0);
593 if (r < 0 && r != -ENOENT)
594 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
595
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && r > 0)
598 continue;
599
600 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
601 if (r < 0 && r != -EEXIST) {
602 if (fatal && r != -EROFS)
603 return log_error_errno(r, "Failed to create directory %s: %m", where);
604
605 log_debug_errno(r, "Failed to create directory %s: %m", where);
606 /* If we failed mkdir() or chown() due to the root
607 * directory being read only, attempt to mount this fs
608 * anyway and let mount_verbose log any errors */
609 if (r != -EROFS)
610 continue;
611 }
612
613 o = mount_table[k].options;
614 if (streq_ptr(mount_table[k].type, "tmpfs")) {
615 if (in_userns)
616 r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
617 else
618 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
619 if (r < 0)
620 return log_oom();
621 if (r > 0)
622 o = options;
623 }
624
625 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
626 mount_table[k].what,
627 where,
628 mount_table[k].type,
629 mount_table[k].flags,
630 o);
631 if (r < 0 && fatal)
632 return r;
633 }
634
635 return 0;
636 }
637
638 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
639 const char *p = options;
640 unsigned long flags = *mount_flags;
641 char *opts = NULL;
642 int r;
643
644 assert(options);
645
646 for (;;) {
647 _cleanup_free_ char *word = NULL;
648
649 r = extract_first_word(&p, &word, ",", 0);
650 if (r < 0)
651 return log_error_errno(r, "Failed to extract mount option: %m");
652 if (r == 0)
653 break;
654
655 if (streq(word, "rbind"))
656 flags |= MS_REC;
657 else if (streq(word, "norbind"))
658 flags &= ~MS_REC;
659 else {
660 log_error("Invalid bind mount option: %s", word);
661 return -EINVAL;
662 }
663 }
664
665 *mount_flags = flags;
666 /* in the future mount_opts will hold string options for mount(2) */
667 *mount_opts = opts;
668
669 return 0;
670 }
671
672 static int mount_bind(const char *dest, CustomMount *m) {
673
674 _cleanup_free_ char *mount_opts = NULL, *where = NULL;
675 unsigned long mount_flags = MS_BIND | MS_REC;
676 struct stat source_st, dest_st;
677 int r;
678
679 assert(dest);
680 assert(m);
681
682 if (m->options) {
683 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
684 if (r < 0)
685 return r;
686 }
687
688 if (stat(m->source, &source_st) < 0)
689 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
690
691 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
692 if (r < 0)
693 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
694 if (r > 0) { /* Path exists already? */
695
696 if (stat(where, &dest_st) < 0)
697 return log_error_errno(errno, "Failed to stat %s: %m", where);
698
699 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
700 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
701 return -EINVAL;
702 }
703
704 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
705 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
706 return -EINVAL;
707 }
708
709 } else { /* Path doesn't exist yet? */
710 r = mkdir_parents_label(where, 0755);
711 if (r < 0)
712 return log_error_errno(r, "Failed to make parents of %s: %m", where);
713
714 /* Create the mount point. Any non-directory file can be
715 * mounted on any non-directory file (regular, fifo, socket,
716 * char, block).
717 */
718 if (S_ISDIR(source_st.st_mode))
719 r = mkdir_label(where, 0755);
720 else
721 r = touch(where);
722 if (r < 0)
723 return log_error_errno(r, "Failed to create mount point %s: %m", where);
724
725 }
726
727 r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
728 if (r < 0)
729 return r;
730
731 if (m->read_only) {
732 r = bind_remount_recursive(where, true, NULL);
733 if (r < 0)
734 return log_error_errno(r, "Read-only bind mount failed: %m");
735 }
736
737 return 0;
738 }
739
740 static int mount_tmpfs(
741 const char *dest,
742 CustomMount *m,
743 bool userns, uid_t uid_shift, uid_t uid_range,
744 const char *selinux_apifs_context) {
745
746 const char *options;
747 _cleanup_free_ char *buf = NULL, *where = NULL;
748 int r;
749
750 assert(dest);
751 assert(m);
752
753 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
754 if (r < 0)
755 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
756 if (r == 0) { /* Doesn't exist yet? */
757 r = mkdir_p_label(where, 0755);
758 if (r < 0)
759 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
760 }
761
762 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
763 if (r < 0)
764 return log_oom();
765 options = r > 0 ? buf : m->options;
766
767 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
768 }
769
770 static char *joined_and_escaped_lower_dirs(char **lower) {
771 _cleanup_strv_free_ char **sv = NULL;
772
773 sv = strv_copy(lower);
774 if (!sv)
775 return NULL;
776
777 strv_reverse(sv);
778
779 if (!strv_shell_escape(sv, ",:"))
780 return NULL;
781
782 return strv_join(sv, ":");
783 }
784
785 static int mount_overlay(const char *dest, CustomMount *m) {
786
787 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
788 const char *options;
789 int r;
790
791 assert(dest);
792 assert(m);
793
794 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
795 if (r < 0)
796 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
797 if (r == 0) { /* Doesn't exist yet? */
798 r = mkdir_label(where, 0755);
799 if (r < 0)
800 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
801 }
802
803 (void) mkdir_p_label(m->source, 0755);
804
805 lower = joined_and_escaped_lower_dirs(m->lower);
806 if (!lower)
807 return log_oom();
808
809 escaped_source = shell_escape(m->source, ",:");
810 if (!escaped_source)
811 return log_oom();
812
813 if (m->read_only)
814 options = strjoina("lowerdir=", escaped_source, ":", lower);
815 else {
816 _cleanup_free_ char *escaped_work_dir = NULL;
817
818 escaped_work_dir = shell_escape(m->work_dir, ",:");
819 if (!escaped_work_dir)
820 return log_oom();
821
822 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
823 }
824
825 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
826 }
827
828 int mount_custom(
829 const char *dest,
830 CustomMount *mounts, unsigned n,
831 bool userns, uid_t uid_shift, uid_t uid_range,
832 const char *selinux_apifs_context) {
833
834 unsigned i;
835 int r;
836
837 assert(dest);
838
839 for (i = 0; i < n; i++) {
840 CustomMount *m = mounts + i;
841
842 switch (m->type) {
843
844 case CUSTOM_MOUNT_BIND:
845 r = mount_bind(dest, m);
846 break;
847
848 case CUSTOM_MOUNT_TMPFS:
849 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
850 break;
851
852 case CUSTOM_MOUNT_OVERLAY:
853 r = mount_overlay(dest, m);
854 break;
855
856 default:
857 assert_not_reached("Unknown custom mount type");
858 }
859
860 if (r < 0)
861 return r;
862 }
863
864 return 0;
865 }
866
867 /* Retrieve existing subsystems. This function is called in a new cgroup
868 * namespace.
869 */
870 static int get_process_controllers(Set **ret) {
871 _cleanup_set_free_free_ Set *controllers = NULL;
872 _cleanup_fclose_ FILE *f = NULL;
873 int r;
874
875 assert(ret);
876
877 controllers = set_new(&string_hash_ops);
878 if (!controllers)
879 return -ENOMEM;
880
881 f = fopen("/proc/self/cgroup", "re");
882 if (!f)
883 return errno == ENOENT ? -ESRCH : -errno;
884
885 for (;;) {
886 _cleanup_free_ char *line = NULL;
887 char *e, *l;
888
889 r = read_line(f, LONG_LINE_MAX, &line);
890 if (r < 0)
891 return r;
892 if (r == 0)
893 break;
894
895 l = strchr(line, ':');
896 if (!l)
897 continue;
898
899 l++;
900 e = strchr(l, ':');
901 if (!e)
902 continue;
903
904 *e = 0;
905
906 if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
907 continue;
908
909 r = set_put_strdup(controllers, l);
910 if (r < 0)
911 return r;
912 }
913
914 *ret = controllers;
915 controllers = NULL;
916
917 return 0;
918 }
919
920 static int mount_legacy_cgroup_hierarchy(
921 const char *dest,
922 const char *controller,
923 const char *hierarchy,
924 bool read_only) {
925
926 const char *to, *fstype, *opts;
927 int r;
928
929 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
930
931 r = path_is_mount_point(to, dest, 0);
932 if (r < 0 && r != -ENOENT)
933 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
934 if (r > 0)
935 return 0;
936
937 mkdir_p(to, 0755);
938
939 /* The superblock mount options of the mount point need to be
940 * identical to the hosts', and hence writable... */
941 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
942 fstype = "cgroup2";
943 opts = NULL;
944 } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
945 fstype = "cgroup";
946 opts = "none,name=systemd,xattr";
947 } else {
948 fstype = "cgroup";
949 opts = controller;
950 }
951
952 r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
953 if (r < 0)
954 return r;
955
956 /* ... hence let's only make the bind mount read-only, not the superblock. */
957 if (read_only) {
958 r = mount_verbose(LOG_ERR, NULL, to, NULL,
959 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
960 if (r < 0)
961 return r;
962 }
963
964 return 1;
965 }
966
967 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
968 static int mount_legacy_cgns_supported(
969 const char *dest,
970 CGroupUnified unified_requested,
971 bool userns,
972 uid_t uid_shift,
973 uid_t uid_range,
974 const char *selinux_apifs_context) {
975
976 _cleanup_set_free_free_ Set *controllers = NULL;
977 const char *cgroup_root = "/sys/fs/cgroup", *c;
978 int r;
979
980 (void) mkdir_p(cgroup_root, 0755);
981
982 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
983 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
984 if (r < 0)
985 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
986 if (r == 0) {
987 _cleanup_free_ char *options = NULL;
988
989 /* When cgroup namespaces are enabled and user namespaces are
990 * used then the mount of the cgroupfs is done *inside* the new
991 * user namespace. We're root in the new user namespace and the
992 * kernel will happily translate our uid/gid to the correct
993 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
994 * pass uid 0 and not uid_shift to tmpfs_patch_options().
995 */
996 r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
997 if (r < 0)
998 return log_oom();
999
1000 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1001 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1002 if (r < 0)
1003 return r;
1004 }
1005
1006 r = cg_all_unified();
1007 if (r < 0)
1008 return r;
1009 if (r > 0)
1010 goto skip_controllers;
1011
1012 r = get_process_controllers(&controllers);
1013 if (r < 0)
1014 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1015
1016 for (;;) {
1017 _cleanup_free_ const char *controller = NULL;
1018
1019 controller = set_steal_first(controllers);
1020 if (!controller)
1021 break;
1022
1023 r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
1024 if (r < 0)
1025 return r;
1026
1027 /* When multiple hierarchies are co-mounted, make their
1028 * constituting individual hierarchies a symlink to the
1029 * co-mount.
1030 */
1031 c = controller;
1032 for (;;) {
1033 _cleanup_free_ char *target = NULL, *tok = NULL;
1034
1035 r = extract_first_word(&c, &tok, ",", 0);
1036 if (r < 0)
1037 return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
1038 if (r == 0)
1039 break;
1040
1041 if (streq(controller, tok))
1042 break;
1043
1044 target = prefix_root("/sys/fs/cgroup/", tok);
1045 if (!target)
1046 return log_oom();
1047
1048 r = symlink_idempotent(controller, target);
1049 if (r == -EINVAL)
1050 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1051 if (r < 0)
1052 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1053 }
1054 }
1055
1056 skip_controllers:
1057 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1058 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1059 if (r < 0)
1060 return r;
1061 }
1062
1063 r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1064 if (r < 0)
1065 return r;
1066
1067 if (!userns)
1068 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1069 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1070
1071 return 0;
1072 }
1073
1074 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
1075 static int mount_legacy_cgns_unsupported(
1076 const char *dest,
1077 CGroupUnified unified_requested,
1078 bool userns,
1079 uid_t uid_shift,
1080 uid_t uid_range,
1081 const char *selinux_apifs_context) {
1082
1083 _cleanup_set_free_free_ Set *controllers = NULL;
1084 const char *cgroup_root;
1085 int r;
1086
1087 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1088
1089 (void) mkdir_p(cgroup_root, 0755);
1090
1091 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1092 r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1095 if (r == 0) {
1096 _cleanup_free_ char *options = NULL;
1097
1098 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
1099 if (r < 0)
1100 return log_oom();
1101
1102 r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
1103 MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
1104 if (r < 0)
1105 return r;
1106 }
1107
1108 r = cg_all_unified();
1109 if (r < 0)
1110 return r;
1111 if (r > 0)
1112 goto skip_controllers;
1113
1114 r = cg_kernel_controllers(&controllers);
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1117
1118 for (;;) {
1119 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1120
1121 controller = set_steal_first(controllers);
1122 if (!controller)
1123 break;
1124
1125 origin = prefix_root("/sys/fs/cgroup/", controller);
1126 if (!origin)
1127 return log_oom();
1128
1129 r = readlink_malloc(origin, &combined);
1130 if (r == -EINVAL) {
1131 /* Not a symbolic link, but directly a single cgroup hierarchy */
1132
1133 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
1134 if (r < 0)
1135 return r;
1136
1137 } else if (r < 0)
1138 return log_error_errno(r, "Failed to read link %s: %m", origin);
1139 else {
1140 _cleanup_free_ char *target = NULL;
1141
1142 target = prefix_root(dest, origin);
1143 if (!target)
1144 return log_oom();
1145
1146 /* A symbolic link, a combination of controllers in one hierarchy */
1147
1148 if (!filename_is_valid(combined)) {
1149 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1150 continue;
1151 }
1152
1153 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
1154 if (r < 0)
1155 return r;
1156
1157 r = symlink_idempotent(combined, target);
1158 if (r == -EINVAL)
1159 return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
1160 if (r < 0)
1161 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1162 }
1163 }
1164
1165 skip_controllers:
1166 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1167 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
1168 if (r < 0)
1169 return r;
1170 }
1171
1172 r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
1173 if (r < 0)
1174 return r;
1175
1176 return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
1177 MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
1178 }
1179
1180 static int mount_unified_cgroups(const char *dest) {
1181 const char *p;
1182 int r;
1183
1184 assert(dest);
1185
1186 p = prefix_roota(dest, "/sys/fs/cgroup");
1187
1188 (void) mkdir_p(p, 0755);
1189
1190 r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
1191 if (r < 0)
1192 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1193 if (r > 0) {
1194 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
1195 if (access(p, F_OK) >= 0)
1196 return 0;
1197 if (errno != ENOENT)
1198 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1199
1200 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1201 return -EINVAL;
1202 }
1203
1204 return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1205 }
1206
1207 int mount_cgroups(
1208 const char *dest,
1209 CGroupUnified unified_requested,
1210 bool userns,
1211 uid_t uid_shift,
1212 uid_t uid_range,
1213 const char *selinux_apifs_context,
1214 bool use_cgns) {
1215
1216 if (unified_requested >= CGROUP_UNIFIED_ALL)
1217 return mount_unified_cgroups(dest);
1218 if (use_cgns)
1219 return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1220
1221 return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
1222 }
1223
1224 static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
1225 int r;
1226
1227 assert(root);
1228 assert(own);
1229
1230 /* Make our own cgroup a (writable) bind mount */
1231 r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
1232 if (r < 0)
1233 return r;
1234
1235 /* And then remount the systemd cgroup root read-only */
1236 return mount_verbose(LOG_ERR, NULL, root, NULL,
1237 MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
1238 }
1239
1240 int mount_systemd_cgroup_writable(
1241 const char *dest,
1242 CGroupUnified unified_requested) {
1243
1244 _cleanup_free_ char *own_cgroup_path = NULL;
1245 const char *root, *own;
1246 int r;
1247
1248 assert(dest);
1249
1250 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1251 if (r < 0)
1252 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1253
1254 /* If we are living in the top-level, then there's nothing to do... */
1255 if (path_equal(own_cgroup_path, "/"))
1256 return 0;
1257
1258 if (unified_requested >= CGROUP_UNIFIED_ALL) {
1259
1260 root = prefix_roota(dest, "/sys/fs/cgroup");
1261 own = strjoina(root, own_cgroup_path);
1262
1263 } else {
1264
1265 if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
1266 root = prefix_roota(dest, "/sys/fs/cgroup/unified");
1267 own = strjoina(root, own_cgroup_path);
1268
1269 r = mount_systemd_cgroup_writable_one(root, own);
1270 if (r < 0)
1271 return r;
1272 }
1273
1274 root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1275 own = strjoina(root, own_cgroup_path);
1276 }
1277
1278 return mount_systemd_cgroup_writable_one(root, own);
1279 }
1280
1281 int setup_volatile_state(
1282 const char *directory,
1283 VolatileMode mode,
1284 bool userns, uid_t uid_shift, uid_t uid_range,
1285 const char *selinux_apifs_context) {
1286
1287 _cleanup_free_ char *buf = NULL;
1288 const char *p, *options;
1289 int r;
1290
1291 assert(directory);
1292
1293 if (mode != VOLATILE_STATE)
1294 return 0;
1295
1296 /* --volatile=state means we simply overmount /var
1297 with a tmpfs, and the rest read-only. */
1298
1299 r = bind_remount_recursive(directory, true, NULL);
1300 if (r < 0)
1301 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1302
1303 p = prefix_roota(directory, "/var");
1304 r = mkdir(p, 0755);
1305 if (r < 0 && errno != EEXIST)
1306 return log_error_errno(errno, "Failed to create %s: %m", directory);
1307
1308 options = "mode=755";
1309 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1310 if (r < 0)
1311 return log_oom();
1312 if (r > 0)
1313 options = buf;
1314
1315 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
1316 }
1317
1318 int setup_volatile(
1319 const char *directory,
1320 VolatileMode mode,
1321 bool userns, uid_t uid_shift, uid_t uid_range,
1322 const char *selinux_apifs_context) {
1323
1324 bool tmpfs_mounted = false, bind_mounted = false;
1325 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1326 _cleanup_free_ char *buf = NULL;
1327 const char *f, *t, *options;
1328 int r;
1329
1330 assert(directory);
1331
1332 if (mode != VOLATILE_YES)
1333 return 0;
1334
1335 /* --volatile=yes means we mount a tmpfs to the root dir, and
1336 the original /usr to use inside it, and that read-only. */
1337
1338 if (!mkdtemp(template))
1339 return log_error_errno(errno, "Failed to create temporary directory: %m");
1340
1341 options = "mode=755";
1342 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
1343 if (r < 0)
1344 return log_oom();
1345 if (r > 0)
1346 options = buf;
1347
1348 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1349 if (r < 0)
1350 goto fail;
1351
1352 tmpfs_mounted = true;
1353
1354 f = prefix_roota(directory, "/usr");
1355 t = prefix_roota(template, "/usr");
1356
1357 r = mkdir(t, 0755);
1358 if (r < 0 && errno != EEXIST) {
1359 r = log_error_errno(errno, "Failed to create %s: %m", t);
1360 goto fail;
1361 }
1362
1363 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1364 if (r < 0)
1365 goto fail;
1366
1367 bind_mounted = true;
1368
1369 r = bind_remount_recursive(t, true, NULL);
1370 if (r < 0) {
1371 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1372 goto fail;
1373 }
1374
1375 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1376 if (r < 0)
1377 goto fail;
1378
1379 (void) rmdir(template);
1380
1381 return 0;
1382
1383 fail:
1384 if (bind_mounted)
1385 (void) umount_verbose(t);
1386
1387 if (tmpfs_mounted)
1388 (void) umount_verbose(template);
1389 (void) rmdir(template);
1390 return r;
1391 }
1392
1393 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1394 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1395 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1396 const char *p = s;
1397 int r;
1398
1399 assert(pivot_root_new);
1400 assert(pivot_root_old);
1401
1402 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1403 if (r < 0)
1404 return r;
1405 if (r == 0)
1406 return -EINVAL;
1407
1408 if (isempty(p))
1409 root_old = NULL;
1410 else {
1411 root_old = strdup(p);
1412 if (!root_old)
1413 return -ENOMEM;
1414 }
1415
1416 if (!path_is_absolute(root_new))
1417 return -EINVAL;
1418 if (root_old && !path_is_absolute(root_old))
1419 return -EINVAL;
1420
1421 free_and_replace(*pivot_root_new, root_new);
1422 free_and_replace(*pivot_root_old, root_old);
1423
1424 return 0;
1425 }
1426
1427 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1428 _cleanup_free_ char *directory_pivot_root_new = NULL;
1429 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1430 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1431 bool remove_pivot_tmp = false;
1432 int r;
1433
1434 assert(directory);
1435
1436 if (!pivot_root_new)
1437 return 0;
1438
1439 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1440 * If pivot_root_old is NULL, the existing / disappears.
1441 * This requires a temporary directory, pivot_tmp, which is
1442 * not a child of either.
1443 *
1444 * This is typically used for OSTree-style containers, where
1445 * the root partition contains several sysroots which could be
1446 * run. Normally, one would be chosen by the bootloader and
1447 * pivoted to / by initramfs.
1448 *
1449 * For example, for an OSTree deployment, pivot_root_new
1450 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1451 * code doesn’t do the /var mount which OSTree expects: use
1452 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1453 *
1454 * So in the OSTree case, we’ll end up with something like:
1455 * - directory = /tmp/nspawn-root-123456
1456 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1457 * - pivot_root_old = /sysroot
1458 * - directory_pivot_root_new =
1459 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1460 * - pivot_tmp = /tmp/nspawn-pivot-123456
1461 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1462 *
1463 * Requires all file systems at directory and below to be mounted
1464 * MS_PRIVATE or MS_SLAVE so they can be moved.
1465 */
1466 directory_pivot_root_new = prefix_root(directory, pivot_root_new);
1467
1468 /* Remount directory_pivot_root_new to make it movable. */
1469 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1470 if (r < 0)
1471 goto done;
1472
1473 if (pivot_root_old) {
1474 if (!mkdtemp(pivot_tmp)) {
1475 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1476 goto done;
1477 }
1478
1479 remove_pivot_tmp = true;
1480 pivot_tmp_pivot_root_old = prefix_root(pivot_tmp, pivot_root_old);
1481
1482 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1483 if (r < 0)
1484 goto done;
1485
1486 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1487 if (r < 0)
1488 goto done;
1489
1490 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1491 if (r < 0)
1492 goto done;
1493 } else {
1494 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1495 if (r < 0)
1496 goto done;
1497 }
1498
1499 done:
1500 if (remove_pivot_tmp)
1501 (void) rmdir(pivot_tmp);
1502
1503 return r;
1504 }