]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-mount.c
521991f412ab911e8acd09993f5ae90ac9c549da
[thirdparty/systemd.git] / src / nspawn / nspawn-mount.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <sys/mount.h>
4 #include <linux/magic.h>
5
6 #include "alloc-util.h"
7 #include "escape.h"
8 #include "fd-util.h"
9 #include "format-util.h"
10 #include "fs-util.h"
11 #include "label.h"
12 #include "mkdir.h"
13 #include "mount-util.h"
14 #include "mountpoint-util.h"
15 #include "nspawn-mount.h"
16 #include "parse-util.h"
17 #include "path-util.h"
18 #include "rm-rf.h"
19 #include "set.h"
20 #include "sort-util.h"
21 #include "stat-util.h"
22 #include "string-util.h"
23 #include "strv.h"
24 #include "tmpfile-util.h"
25 #include "user-util.h"
26
27 CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
28 CustomMount *c, *ret;
29
30 assert(l);
31 assert(n);
32 assert(t >= 0);
33 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
34
35 c = reallocarray(*l, *n + 1, sizeof(CustomMount));
36 if (!c)
37 return NULL;
38
39 *l = c;
40 ret = *l + *n;
41 (*n)++;
42
43 *ret = (CustomMount) { .type = t };
44
45 return ret;
46 }
47
48 void custom_mount_free_all(CustomMount *l, size_t n) {
49 size_t i;
50
51 for (i = 0; i < n; i++) {
52 CustomMount *m = l + i;
53
54 free(m->source);
55 free(m->destination);
56 free(m->options);
57
58 if (m->work_dir) {
59 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
60 free(m->work_dir);
61 }
62
63 if (m->rm_rf_tmpdir) {
64 (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
65 free(m->rm_rf_tmpdir);
66 }
67
68 strv_free(m->lower);
69 free(m->type_argument);
70 }
71
72 free(l);
73 }
74
75 static int custom_mount_compare(const CustomMount *a, const CustomMount *b) {
76 int r;
77
78 r = path_compare(a->destination, b->destination);
79 if (r != 0)
80 return r;
81
82 return CMP(a->type, b->type);
83 }
84
/* Checks whether p is acceptable as a mount source: one leading '+' is skipped, and whatever follows has to
 * be an absolute path. */
static bool source_path_is_valid(const char *p) {
        const char *path;

        assert(p);

        path = *p == '+' ? p + 1 : p;

        return path_is_absolute(path);
}
93
/* Returns a newly allocated version of the source path: if it starts with '+' the remainder is joined onto
 * dest, otherwise it is duplicated as is. Returns NULL if source is NULL; the callers in this file treat a
 * NULL result for a non-NULL source as out-of-memory. */
static char *resolve_source_path(const char *dest, const char *source) {

        if (!source)
                return NULL;

        return source[0] == '+' ? path_join(dest, source + 1) : strdup(source);
}
104
105 int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
106 size_t i;
107 int r;
108
109 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
110 * parent process, so that we know the temporary directories to remove on exit before we fork off the
111 * children. */
112
113 assert(l || n == 0);
114
115 /* Order the custom mounts, and make sure we have a working directory */
116 typesafe_qsort(l, n, custom_mount_compare);
117
118 for (i = 0; i < n; i++) {
119 CustomMount *m = l + i;
120
121 /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount
122 * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in
123 * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in
124 * the inner child, not the outer one. Determine this here. */
125 m->in_userns = path_startswith(m->destination, "/proc");
126
127 if (m->type == CUSTOM_MOUNT_BIND) {
128 if (m->source) {
129 char *s;
130
131 s = resolve_source_path(dest, m->source);
132 if (!s)
133 return log_oom();
134
135 free_and_replace(m->source, s);
136 } else {
137 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
138
139 m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
140 if (!m->rm_rf_tmpdir)
141 return log_oom();
142
143 if (!mkdtemp(m->rm_rf_tmpdir)) {
144 m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
145 return log_error_errno(errno, "Failed to acquire temporary directory: %m");
146 }
147
148 m->source = path_join(m->rm_rf_tmpdir, "src");
149 if (!m->source)
150 return log_oom();
151
152 if (mkdir(m->source, 0755) < 0)
153 return log_error_errno(errno, "Failed to create %s: %m", m->source);
154 }
155 }
156
157 if (m->type == CUSTOM_MOUNT_OVERLAY) {
158 char **j;
159
160 STRV_FOREACH(j, m->lower) {
161 char *s;
162
163 s = resolve_source_path(dest, *j);
164 if (!s)
165 return log_oom();
166
167 free_and_replace(*j, s);
168 }
169
170 if (m->work_dir) {
171 char *s;
172
173 s = resolve_source_path(dest, m->work_dir);
174 if (!s)
175 return log_oom();
176
177 free_and_replace(m->work_dir, s);
178 } else {
179 assert(m->source);
180
181 r = tempfn_random(m->source, NULL, &m->work_dir);
182 if (r < 0)
183 return log_error_errno(r, "Failed to acquire working directory: %m");
184 }
185
186 (void) mkdir_label(m->work_dir, 0700);
187 }
188 }
189
190 return 0;
191 }
192
193 int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
194 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
195 const char *p = s;
196 CustomMount *m;
197 int r;
198
199 assert(l);
200 assert(n);
201
202 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
203 if (r < 0)
204 return r;
205 if (r == 0)
206 return -EINVAL;
207 if (r == 1) {
208 destination = strdup(source[0] == '+' ? source+1 : source);
209 if (!destination)
210 return -ENOMEM;
211 }
212 if (r == 2 && !isempty(p)) {
213 opts = strdup(p);
214 if (!opts)
215 return -ENOMEM;
216 }
217
218 if (isempty(source))
219 source = mfree(source);
220 else if (!source_path_is_valid(source))
221 return -EINVAL;
222
223 if (!path_is_absolute(destination))
224 return -EINVAL;
225
226 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
227 if (!m)
228 return -ENOMEM;
229
230 m->source = TAKE_PTR(source);
231 m->destination = TAKE_PTR(destination);
232 m->read_only = read_only;
233 m->options = TAKE_PTR(opts);
234
235 return 0;
236 }
237
238 int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
239 _cleanup_free_ char *path = NULL, *opts = NULL;
240 const char *p = s;
241 CustomMount *m;
242 int r;
243
244 assert(l);
245 assert(n);
246 assert(s);
247
248 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
249 if (r < 0)
250 return r;
251 if (r == 0)
252 return -EINVAL;
253
254 if (isempty(p))
255 opts = strdup("mode=0755");
256 else
257 opts = strdup(p);
258 if (!opts)
259 return -ENOMEM;
260
261 if (!path_is_absolute(path))
262 return -EINVAL;
263
264 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
265 if (!m)
266 return -ENOMEM;
267
268 m->destination = TAKE_PTR(path);
269 m->options = TAKE_PTR(opts);
270
271 return 0;
272 }
273
274 int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
275 _cleanup_free_ char *upper = NULL, *destination = NULL;
276 _cleanup_strv_free_ char **lower = NULL;
277 CustomMount *m;
278 int k;
279
280 k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
281 if (k < 0)
282 return k;
283 if (k < 2)
284 return -EADDRNOTAVAIL;
285 if (k == 2) {
286 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
287 * we'll also define the destination mount point the same as the upper. */
288
289 if (!source_path_is_valid(lower[0]) ||
290 !source_path_is_valid(lower[1]))
291 return -EINVAL;
292
293 upper = TAKE_PTR(lower[1]);
294
295 destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
296 if (!destination)
297 return -ENOMEM;
298 } else {
299 char **i;
300
301 /* If more than two parameters are specified, the last one is the destination, the second to last one
302 * the "upper", and all before that the "lower" directories. */
303
304 destination = lower[k - 1];
305 upper = TAKE_PTR(lower[k - 2]);
306
307 STRV_FOREACH(i, lower)
308 if (!source_path_is_valid(*i))
309 return -EINVAL;
310
311 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
312 * in /var/tmp */
313 if (isempty(upper))
314 upper = mfree(upper);
315 else if (!source_path_is_valid(upper))
316 return -EINVAL;
317
318 if (!path_is_absolute(destination))
319 return -EINVAL;
320 }
321
322 m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
323 if (!m)
324 return -ENOMEM;
325
326 m->destination = TAKE_PTR(destination);
327 m->source = TAKE_PTR(upper);
328 m->lower = TAKE_PTR(lower);
329 m->read_only = read_only;
330
331 return 0;
332 }
333
334 int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) {
335 _cleanup_free_ char *path = NULL;
336 CustomMount *m;
337
338 assert(l);
339 assert(n);
340 assert(s);
341
342 if (!path_is_absolute(s))
343 return -EINVAL;
344
345 path = strdup(s);
346 if (!path)
347 return -ENOMEM;
348
349 m = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE);
350 if (!m)
351 return -ENOMEM;
352
353 m->destination = TAKE_PTR(path);
354 return 0;
355 }
356
357 int tmpfs_patch_options(
358 const char *options,
359 uid_t uid_shift,
360 const char *selinux_apifs_context,
361 char **ret) {
362
363 char *buf = NULL;
364
365 if (uid_shift != UID_INVALID) {
366 if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
367 strempty(options), options ? "," : "",
368 uid_shift, uid_shift) < 0)
369 return -ENOMEM;
370
371 options = buf;
372 }
373
374 #if HAVE_SELINUX
375 if (selinux_apifs_context) {
376 char *t;
377
378 t = strjoin(strempty(options), options ? "," : "",
379 "context=\"", selinux_apifs_context, "\"");
380 free(buf);
381 if (!t)
382 return -ENOMEM;
383
384 buf = t;
385 }
386 #endif
387
388 if (!buf && options) {
389 buf = strdup(options);
390 if (!buf)
391 return -ENOMEM;
392 }
393 *ret = buf;
394
395 return !!buf;
396 }
397
398 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
399 const char *full, *top, *x;
400 int r;
401 unsigned long extra_flags = 0;
402
403 top = prefix_roota(dest, "/sys");
404 r = path_is_fs_type(top, SYSFS_MAGIC);
405 if (r < 0)
406 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
407 /* /sys might already be mounted as sysfs by the outer child in the
408 * !netns case. In this case, it's all good. Don't touch it because we
409 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
410 */
411 if (r > 0)
412 return 0;
413
414 full = prefix_roota(top, "/full");
415
416 (void) mkdir(full, 0755);
417
418 if (mount_settings & MOUNT_APPLY_APIVFS_RO)
419 extra_flags |= MS_RDONLY;
420
421 r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
422 MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
423 if (r < 0)
424 return r;
425
426 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
427 _cleanup_free_ char *from = NULL, *to = NULL;
428
429 from = path_join(full, x);
430 if (!from)
431 return log_oom();
432
433 to = path_join(top, x);
434 if (!to)
435 return log_oom();
436
437 (void) mkdir(to, 0755);
438
439 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
440 if (r < 0)
441 return r;
442
443 r = mount_verbose(LOG_ERR, NULL, to, NULL,
444 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
445 if (r < 0)
446 return r;
447 }
448
449 r = umount_verbose(full);
450 if (r < 0)
451 return r;
452
453 if (rmdir(full) < 0)
454 return log_error_errno(errno, "Failed to remove %s: %m", full);
455
456 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
457 * remount /sys read-only.
458 */
459 x = prefix_roota(top, "/fs/cgroup");
460 (void) mkdir_p(x, 0755);
461
462 return mount_verbose(LOG_ERR, NULL, top, NULL,
463 MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
464 }
465
466 static int mkdir_userns(const char *path, mode_t mode, uid_t uid_shift) {
467 int r;
468
469 assert(path);
470
471 r = mkdir_errno_wrapper(path, mode);
472 if (r < 0 && r != -EEXIST)
473 return r;
474
475 if (uid_shift == UID_INVALID)
476 return 0;
477
478 if (lchown(path, uid_shift, uid_shift) < 0)
479 return -errno;
480
481 return 0;
482 }
483
/* Like mkdir_userns(), but creates the parent directories of path too. Parents that are part of prefix are
 * left alone. Returns -ENOTDIR if a prefix is specified and path does not lie below it, otherwise 0 on success
 * or the error of the first directory creation that failed. */
static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, uid_t uid_shift) {
        const char *cursor;
        int r;

        assert(path);

        if (prefix && !path_startswith(path, prefix))
                return -ENOTDIR;

        /* Scratch space for the leading part of the path processed in each iteration */
        char partial[strlen(path) + 1];

        /* Create every parent directory in the path, except the last component */
        cursor = path + strspn(path, "/");
        for (;;) {
                const char *component_end;
                size_t len;

                /* Find the end of the current component, and the beginning of the next one */
                component_end = cursor + strcspn(cursor, "/");
                cursor = component_end + strspn(component_end, "/");

                /* Nothing follows? Then this was the last component, which is handled below */
                if (*cursor == '\0')
                        break;

                len = (size_t) (component_end - path);
                memcpy(partial, path, len);
                partial[len] = '\0';

                if (prefix && path_startswith(prefix, partial))
                        continue;

                r = mkdir_userns(partial, mode, uid_shift);
                if (r < 0)
                        return r;
        }

        return mkdir_userns(path, mode, uid_shift);
}
518
519 int mount_all(const char *dest,
520 MountSettingsMask mount_settings,
521 uid_t uid_shift,
522 const char *selinux_apifs_context) {
523
524 #define PROC_INACCESSIBLE_REG(path) \
525 { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \
526 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
527 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
528 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
529
530 #define PROC_READ_ONLY(path) \
531 { (path), (path), NULL, NULL, MS_BIND, \
532 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
533 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
534 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
535
536 typedef struct MountPoint {
537 const char *what;
538 const char *where;
539 const char *type;
540 const char *options;
541 unsigned long flags;
542 MountSettingsMask mount_settings;
543 } MountPoint;
544
545 static const MountPoint mount_table[] = {
546 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
547 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
548 MOUNT_FATAL|MOUNT_IN_USERNS },
549
550 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
551 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
552
553 { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
554 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
555
556 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
557 MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
558
559 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
560 * internals or the host's execution environment to the container */
561 PROC_INACCESSIBLE_REG("/proc/kallsyms"),
562 PROC_INACCESSIBLE_REG("/proc/kcore"),
563 PROC_INACCESSIBLE_REG("/proc/keys"),
564 PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"),
565 PROC_INACCESSIBLE_REG("/proc/timer_list"),
566
567 /* Make these directories read-only to container payloads: they show hardware information, and in some
568 * cases contain tunables the container really shouldn't have access to. */
569 PROC_READ_ONLY("/proc/acpi"),
570 PROC_READ_ONLY("/proc/apm"),
571 PROC_READ_ONLY("/proc/asound"),
572 PROC_READ_ONLY("/proc/bus"),
573 PROC_READ_ONLY("/proc/fs"),
574 PROC_READ_ONLY("/proc/irq"),
575 PROC_READ_ONLY("/proc/scsi"),
576
577 { "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
578 MOUNT_IN_USERNS },
579
580 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
581 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
582 MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP },
583 { "tmpfs", "/sys", "tmpfs", "mode=555", MS_NOSUID|MS_NOEXEC|MS_NODEV,
584 MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
585 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
586 MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
587 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
588 MOUNT_FATAL }, /* skipped if above was mounted */
589 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
590 MOUNT_FATAL },
591 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
592 MOUNT_FATAL },
593 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
594 MOUNT_FATAL },
595
596 #if HAVE_SELINUX
597 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
598 0 }, /* Bind mount first */
599 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
600 0 }, /* Then, make it r/o */
601 #endif
602 };
603
604 bool use_userns = (mount_settings & MOUNT_USE_USERNS);
605 bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
606 bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
607 bool in_userns = (mount_settings & MOUNT_IN_USERNS);
608 bool tmpfs_tmp = (mount_settings & MOUNT_APPLY_TMPFS_TMP);
609 size_t k;
610 int r;
611
612 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
613 _cleanup_free_ char *where = NULL, *options = NULL;
614 const char *o;
615 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
616
617 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
618 continue;
619
620 if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
621 continue;
622
623 if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
624 continue;
625
626 if (!tmpfs_tmp && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_TMPFS_TMP))
627 continue;
628
629 r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where, NULL);
630 if (r < 0)
631 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
632
633 /* Skip this entry if it is not a remount. */
634 if (mount_table[k].what) {
635 r = path_is_mount_point(where, NULL, 0);
636 if (r < 0 && r != -ENOENT)
637 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
638 if (r > 0)
639 continue;
640 }
641
642 r = mkdir_userns_p(dest, where, 0755, (use_userns && !in_userns) ? uid_shift : UID_INVALID);
643 if (r < 0 && r != -EEXIST) {
644 if (fatal && r != -EROFS)
645 return log_error_errno(r, "Failed to create directory %s: %m", where);
646
647 log_debug_errno(r, "Failed to create directory %s: %m", where);
648 /* If we failed mkdir() or chown() due to the root
649 * directory being read only, attempt to mount this fs
650 * anyway and let mount_verbose log any errors */
651 if (r != -EROFS)
652 continue;
653 }
654
655 o = mount_table[k].options;
656 if (streq_ptr(mount_table[k].type, "tmpfs")) {
657 r = tmpfs_patch_options(o, in_userns ? 0 : uid_shift, selinux_apifs_context, &options);
658 if (r < 0)
659 return log_oom();
660 if (r > 0)
661 o = options;
662 }
663
664 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
665 mount_table[k].what,
666 where,
667 mount_table[k].type,
668 mount_table[k].flags,
669 o);
670 if (r < 0 && fatal)
671 return r;
672 }
673
674 return 0;
675 }
676
677 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
678 const char *p = options;
679 unsigned long flags = *mount_flags;
680 char *opts = NULL;
681 int r;
682
683 assert(options);
684
685 for (;;) {
686 _cleanup_free_ char *word = NULL;
687
688 r = extract_first_word(&p, &word, ",", 0);
689 if (r < 0)
690 return log_error_errno(r, "Failed to extract mount option: %m");
691 if (r == 0)
692 break;
693
694 if (streq(word, "rbind"))
695 flags |= MS_REC;
696 else if (streq(word, "norbind"))
697 flags &= ~MS_REC;
698 else {
699 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
700 "Invalid bind mount option: %s",
701 word);
702 }
703 }
704
705 *mount_flags = flags;
706 /* in the future mount_opts will hold string options for mount(2) */
707 *mount_opts = opts;
708
709 return 0;
710 }
711
712 static int mount_bind(const char *dest, CustomMount *m) {
713 _cleanup_free_ char *mount_opts = NULL, *where = NULL;
714 unsigned long mount_flags = MS_BIND | MS_REC;
715 struct stat source_st, dest_st;
716 int r;
717
718 assert(dest);
719 assert(m);
720
721 if (m->options) {
722 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
723 if (r < 0)
724 return r;
725 }
726
727 if (stat(m->source, &source_st) < 0)
728 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
729
730 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
731 if (r < 0)
732 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
733 if (r > 0) { /* Path exists already? */
734
735 if (stat(where, &dest_st) < 0)
736 return log_error_errno(errno, "Failed to stat %s: %m", where);
737
738 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
740 "Cannot bind mount directory %s on file %s.",
741 m->source, where);
742
743 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode))
744 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
745 "Cannot bind mount file %s on directory %s.",
746 m->source, where);
747
748 } else { /* Path doesn't exist yet? */
749 r = mkdir_parents_label(where, 0755);
750 if (r < 0)
751 return log_error_errno(r, "Failed to make parents of %s: %m", where);
752
753 /* Create the mount point. Any non-directory file can be
754 * mounted on any non-directory file (regular, fifo, socket,
755 * char, block).
756 */
757 if (S_ISDIR(source_st.st_mode))
758 r = mkdir_label(where, 0755);
759 else
760 r = touch(where);
761 if (r < 0)
762 return log_error_errno(r, "Failed to create mount point %s: %m", where);
763 }
764
765 r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
766 if (r < 0)
767 return r;
768
769 if (m->read_only) {
770 r = bind_remount_recursive(where, MS_RDONLY, MS_RDONLY, NULL);
771 if (r < 0)
772 return log_error_errno(r, "Read-only bind mount failed: %m");
773 }
774
775 return 0;
776 }
777
778 static int mount_tmpfs(const char *dest, CustomMount *m, uid_t uid_shift, const char *selinux_apifs_context) {
779
780 const char *options;
781 _cleanup_free_ char *buf = NULL, *where = NULL;
782 int r;
783
784 assert(dest);
785 assert(m);
786
787 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
788 if (r < 0)
789 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
790 if (r == 0) { /* Doesn't exist yet? */
791 r = mkdir_p_label(where, 0755);
792 if (r < 0)
793 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
794 }
795
796 r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
797 if (r < 0)
798 return log_oom();
799 options = r > 0 ? buf : m->options;
800
801 return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
802 }
803
804 static char *joined_and_escaped_lower_dirs(char **lower) {
805 _cleanup_strv_free_ char **sv = NULL;
806
807 sv = strv_copy(lower);
808 if (!sv)
809 return NULL;
810
811 strv_reverse(sv);
812
813 if (!strv_shell_escape(sv, ",:"))
814 return NULL;
815
816 return strv_join(sv, ":");
817 }
818
819 static int mount_overlay(const char *dest, CustomMount *m) {
820 _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
821 const char *options;
822 int r;
823
824 assert(dest);
825 assert(m);
826
827 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
828 if (r < 0)
829 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
830 if (r == 0) { /* Doesn't exist yet? */
831 r = mkdir_label(where, 0755);
832 if (r < 0)
833 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
834 }
835
836 (void) mkdir_p_label(m->source, 0755);
837
838 lower = joined_and_escaped_lower_dirs(m->lower);
839 if (!lower)
840 return log_oom();
841
842 escaped_source = shell_escape(m->source, ",:");
843 if (!escaped_source)
844 return log_oom();
845
846 if (m->read_only)
847 options = strjoina("lowerdir=", escaped_source, ":", lower);
848 else {
849 _cleanup_free_ char *escaped_work_dir = NULL;
850
851 escaped_work_dir = shell_escape(m->work_dir, ",:");
852 if (!escaped_work_dir)
853 return log_oom();
854
855 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
856 }
857
858 return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
859 }
860
861 static int mount_inaccessible(const char *dest, CustomMount *m) {
862 _cleanup_free_ char *where = NULL;
863 const char *source;
864 struct stat st;
865 int r;
866
867 assert(dest);
868 assert(m);
869
870 r = chase_symlinks_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &where, &st, NULL);
871 if (r < 0) {
872 log_full_errno(m->graceful ? LOG_DEBUG : LOG_ERR, r, "Failed to resolve %s/%s: %m", dest, m->destination);
873 return m->graceful ? 0 : r;
874 }
875
876 assert_se(source = mode_to_inaccessible_node(st.st_mode));
877
878 r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, source, where, NULL, MS_BIND, NULL);
879 if (r < 0)
880 return m->graceful ? 0 : r;
881
882 r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, NULL, where, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL);
883 if (r < 0) {
884 umount_verbose(where);
885 return m->graceful ? 0 : r;
886 }
887
888 return 0;
889 }
890
891 static int mount_arbitrary(const char *dest, CustomMount *m) {
892 _cleanup_free_ char *where = NULL;
893 int r;
894
895 assert(dest);
896 assert(m);
897
898 r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
899 if (r < 0)
900 return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
901 if (r == 0) { /* Doesn't exist yet? */
902 r = mkdir_p_label(where, 0755);
903 if (r < 0)
904 return log_error_errno(r, "Creating mount point for mount %s failed: %m", where);
905 }
906
907 return mount_verbose(LOG_ERR, m->source, where, m->type_argument, 0, m->options);
908 }
909
910 int mount_custom(
911 const char *dest,
912 CustomMount *mounts, size_t n,
913 uid_t uid_shift,
914 const char *selinux_apifs_context,
915 MountSettingsMask mount_settings) {
916
917 size_t i;
918 int r;
919
920 assert(dest);
921
922 for (i = 0; i < n; i++) {
923 CustomMount *m = mounts + i;
924
925 if ((mount_settings & MOUNT_IN_USERNS) != m->in_userns)
926 continue;
927
928 if (mount_settings & MOUNT_ROOT_ONLY && !path_equal(m->destination, "/"))
929 continue;
930
931 if (mount_settings & MOUNT_NON_ROOT_ONLY && path_equal(m->destination, "/"))
932 continue;
933
934 switch (m->type) {
935
936 case CUSTOM_MOUNT_BIND:
937 r = mount_bind(dest, m);
938 break;
939
940 case CUSTOM_MOUNT_TMPFS:
941 r = mount_tmpfs(dest, m, uid_shift, selinux_apifs_context);
942 break;
943
944 case CUSTOM_MOUNT_OVERLAY:
945 r = mount_overlay(dest, m);
946 break;
947
948 case CUSTOM_MOUNT_INACCESSIBLE:
949 r = mount_inaccessible(dest, m);
950 break;
951
952 case CUSTOM_MOUNT_ARBITRARY:
953 r = mount_arbitrary(dest, m);
954 break;
955
956 default:
957 assert_not_reached("Unknown custom mount type");
958 }
959
960 if (r < 0)
961 return r;
962 }
963
964 return 0;
965 }
966
967 static int setup_volatile_state(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
968
969 _cleanup_free_ char *buf = NULL;
970 const char *p, *options;
971 int r;
972
973 assert(directory);
974
975 /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
976
977 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
978 if (r < 0)
979 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
980
981 p = prefix_roota(directory, "/var");
982 r = mkdir(p, 0755);
983 if (r < 0 && errno != EEXIST)
984 return log_error_errno(errno, "Failed to create %s: %m", directory);
985
986 options = "mode=755";
987 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
988 if (r < 0)
989 return log_oom();
990 if (r > 0)
991 options = buf;
992
993 return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
994 }
995
996 static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
997
998 bool tmpfs_mounted = false, bind_mounted = false;
999 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1000 _cleanup_free_ char *buf = NULL, *bindir = NULL;
1001 const char *f, *t, *options;
1002 struct stat st;
1003 int r;
1004
1005 assert(directory);
1006
1007 /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and
1008 * that read-only. Before we start setting this up let's validate if the image has the /usr merge
1009 * implemented, and let's output a friendly log message if it hasn't. */
1010
1011 bindir = path_join(directory, "/bin");
1012 if (!bindir)
1013 return log_oom();
1014 if (lstat(bindir, &st) < 0) {
1015 if (errno != ENOENT)
1016 return log_error_errno(errno, "Failed to stat /bin directory below image: %m");
1017
1018 /* ENOENT is fine, just means the image is probably just a naked /usr and we can create the
1019 * rest. */
1020 } else if (S_ISDIR(st.st_mode))
1021 return log_error_errno(SYNTHETIC_ERRNO(EISDIR),
1022 "Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. "
1023 "Please work with your distribution and help them adopt the merged /usr scheme.");
1024 else if (!S_ISLNK(st.st_mode))
1025 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1026 "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically).");
1027
1028 if (!mkdtemp(template))
1029 return log_error_errno(errno, "Failed to create temporary directory: %m");
1030
1031 options = "mode=755";
1032 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1033 if (r < 0)
1034 goto fail;
1035 if (r > 0)
1036 options = buf;
1037
1038 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1039 if (r < 0)
1040 goto fail;
1041
1042 tmpfs_mounted = true;
1043
1044 f = prefix_roota(directory, "/usr");
1045 t = prefix_roota(template, "/usr");
1046
1047 r = mkdir(t, 0755);
1048 if (r < 0 && errno != EEXIST) {
1049 r = log_error_errno(errno, "Failed to create %s: %m", t);
1050 goto fail;
1051 }
1052
1053 r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1054 if (r < 0)
1055 goto fail;
1056
1057 bind_mounted = true;
1058
1059 r = bind_remount_recursive(t, MS_RDONLY, MS_RDONLY, NULL);
1060 if (r < 0) {
1061 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1062 goto fail;
1063 }
1064
1065 r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1066 if (r < 0)
1067 goto fail;
1068
1069 (void) rmdir(template);
1070
1071 return 0;
1072
1073 fail:
1074 if (bind_mounted)
1075 (void) umount_verbose(t);
1076
1077 if (tmpfs_mounted)
1078 (void) umount_verbose(template);
1079 (void) rmdir(template);
1080 return r;
1081 }
1082
1083 static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
1084
1085 _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
1086 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1087 const char *upper, *work, *options;
1088 bool tmpfs_mounted = false;
1089 int r;
1090
1091 assert(directory);
1092
1093 /* --volatile=overlay means we mount an overlayfs to the root dir. */
1094
1095 if (!mkdtemp(template))
1096 return log_error_errno(errno, "Failed to create temporary directory: %m");
1097
1098 options = "mode=755";
1099 r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1100 if (r < 0)
1101 goto finish;
1102 if (r > 0)
1103 options = buf;
1104
1105 r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1106 if (r < 0)
1107 goto finish;
1108
1109 tmpfs_mounted = true;
1110
1111 upper = strjoina(template, "/upper");
1112 work = strjoina(template, "/work");
1113
1114 if (mkdir(upper, 0755) < 0) {
1115 r = log_error_errno(errno, "Failed to create %s: %m", upper);
1116 goto finish;
1117 }
1118 if (mkdir(work, 0755) < 0) {
1119 r = log_error_errno(errno, "Failed to create %s: %m", work);
1120 goto finish;
1121 }
1122
1123 /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
1124 * that the kernel allows us to do that without going through some mount point rearrangements. */
1125
1126 escaped_directory = shell_escape(directory, ",:");
1127 escaped_upper = shell_escape(upper, ",:");
1128 escaped_work = shell_escape(work, ",:");
1129 if (!escaped_directory || !escaped_upper || !escaped_work) {
1130 r = -ENOMEM;
1131 goto finish;
1132 }
1133
1134 options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work);
1135 r = mount_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options);
1136
1137 finish:
1138 if (tmpfs_mounted)
1139 (void) umount_verbose(template);
1140
1141 (void) rmdir(template);
1142 return r;
1143 }
1144
1145 int setup_volatile_mode(
1146 const char *directory,
1147 VolatileMode mode,
1148 uid_t uid_shift,
1149 const char *selinux_apifs_context) {
1150
1151 switch (mode) {
1152
1153 case VOLATILE_YES:
1154 return setup_volatile_yes(directory, uid_shift, selinux_apifs_context);
1155
1156 case VOLATILE_STATE:
1157 return setup_volatile_state(directory, uid_shift, selinux_apifs_context);
1158
1159 case VOLATILE_OVERLAY:
1160 return setup_volatile_overlay(directory, uid_shift, selinux_apifs_context);
1161
1162 default:
1163 return 0;
1164 }
1165 }
1166
1167 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1168 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1169 _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1170 const char *p = s;
1171 int r;
1172
1173 assert(pivot_root_new);
1174 assert(pivot_root_old);
1175
1176 r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1177 if (r < 0)
1178 return r;
1179 if (r == 0)
1180 return -EINVAL;
1181
1182 if (isempty(p))
1183 root_old = NULL;
1184 else {
1185 root_old = strdup(p);
1186 if (!root_old)
1187 return -ENOMEM;
1188 }
1189
1190 if (!path_is_absolute(root_new))
1191 return -EINVAL;
1192 if (root_old && !path_is_absolute(root_old))
1193 return -EINVAL;
1194
1195 free_and_replace(*pivot_root_new, root_new);
1196 free_and_replace(*pivot_root_old, root_old);
1197
1198 return 0;
1199 }
1200
1201 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1202 _cleanup_free_ char *directory_pivot_root_new = NULL;
1203 _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1204 char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1205 bool remove_pivot_tmp = false;
1206 int r;
1207
1208 assert(directory);
1209
1210 if (!pivot_root_new)
1211 return 0;
1212
1213 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1214 * If pivot_root_old is NULL, the existing / disappears.
1215 * This requires a temporary directory, pivot_tmp, which is
1216 * not a child of either.
1217 *
1218 * This is typically used for OSTree-style containers, where
1219 * the root partition contains several sysroots which could be
1220 * run. Normally, one would be chosen by the bootloader and
1221 * pivoted to / by initramfs.
1222 *
1223 * For example, for an OSTree deployment, pivot_root_new
1224 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1225 * code doesn’t do the /var mount which OSTree expects: use
1226 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1227 *
1228 * So in the OSTree case, we’ll end up with something like:
1229 * - directory = /tmp/nspawn-root-123456
1230 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1231 * - pivot_root_old = /sysroot
1232 * - directory_pivot_root_new =
1233 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1234 * - pivot_tmp = /tmp/nspawn-pivot-123456
1235 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1236 *
1237 * Requires all file systems at directory and below to be mounted
1238 * MS_PRIVATE or MS_SLAVE so they can be moved.
1239 */
1240 directory_pivot_root_new = path_join(directory, pivot_root_new);
1241 if (!directory_pivot_root_new)
1242 return log_oom();
1243
1244 /* Remount directory_pivot_root_new to make it movable. */
1245 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1246 if (r < 0)
1247 goto done;
1248
1249 if (pivot_root_old) {
1250 if (!mkdtemp(pivot_tmp)) {
1251 r = log_error_errno(errno, "Failed to create temporary directory: %m");
1252 goto done;
1253 }
1254
1255 remove_pivot_tmp = true;
1256 pivot_tmp_pivot_root_old = path_join(pivot_tmp, pivot_root_old);
1257 if (!pivot_tmp_pivot_root_old) {
1258 r = log_oom();
1259 goto done;
1260 }
1261
1262 r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1263 if (r < 0)
1264 goto done;
1265
1266 r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1267 if (r < 0)
1268 goto done;
1269
1270 r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1271 if (r < 0)
1272 goto done;
1273 } else {
1274 r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1275 if (r < 0)
1276 goto done;
1277 }
1278
1279 done:
1280 if (remove_pivot_tmp)
1281 (void) rmdir(pivot_tmp);
1282
1283 return r;
1284 }