1 /* SPDX-License-Identifier: LGPL-2.1+ */
4 #include <linux/magic.h>
6 #include "alloc-util.h"
9 #include "format-util.h"
13 #include "mount-util.h"
14 #include "mountpoint-util.h"
15 #include "nspawn-mount.h"
16 #include "parse-util.h"
17 #include "path-util.h"
20 #include "sort-util.h"
21 #include "stat-util.h"
22 #include "string-util.h"
24 #include "tmpfile-util.h"
25 #include "user-util.h"
27 CustomMount
* custom_mount_add(CustomMount
**l
, size_t *n
, CustomMountType t
) {
33 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
35 c
= reallocarray(*l
, *n
+ 1, sizeof(CustomMount
));
43 *ret
= (CustomMount
) { .type
= t
};
48 void custom_mount_free_all(CustomMount
*l
, size_t n
) {
51 for (i
= 0; i
< n
; i
++) {
52 CustomMount
*m
= l
+ i
;
59 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
63 if (m
->rm_rf_tmpdir
) {
64 (void) rm_rf(m
->rm_rf_tmpdir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
65 free(m
->rm_rf_tmpdir
);
69 free(m
->type_argument
);
75 static int custom_mount_compare(const CustomMount
*a
, const CustomMount
*b
) {
78 r
= path_compare(a
->destination
, b
->destination
);
82 return CMP(a
->type
, b
->type
);
85 static bool source_path_is_valid(const char *p
) {
91 return path_is_absolute(p
);
94 static char *resolve_source_path(const char *dest
, const char *source
) {
100 return path_join(dest
, source
+ 1);
102 return strdup(source
);
105 static int allocate_temporary_source(CustomMount
*m
) {
108 assert(!m
->rm_rf_tmpdir
);
110 m
->rm_rf_tmpdir
= strdup("/var/tmp/nspawn-temp-XXXXXX");
111 if (!m
->rm_rf_tmpdir
)
114 if (!mkdtemp(m
->rm_rf_tmpdir
)) {
115 m
->rm_rf_tmpdir
= mfree(m
->rm_rf_tmpdir
);
116 return log_error_errno(errno
, "Failed to acquire temporary directory: %m");
119 m
->source
= path_join(m
->rm_rf_tmpdir
, "src");
123 if (mkdir(m
->source
, 0755) < 0)
124 return log_error_errno(errno
, "Failed to create %s: %m", m
->source
);
129 int custom_mount_prepare_all(const char *dest
, CustomMount
*l
, size_t n
) {
133 /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
134 * parent process, so that we know the temporary directories to remove on exit before we fork off the
139 /* Order the custom mounts, and make sure we have a working directory */
140 typesafe_qsort(l
, n
, custom_mount_compare
);
142 for (i
= 0; i
< n
; i
++) {
143 CustomMount
*m
= l
+ i
;
145 /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount
146 * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in
147 * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in
148 * the inner child, not the outer one. Determine this here. */
149 m
->in_userns
= path_startswith(m
->destination
, "/proc");
151 if (m
->type
== CUSTOM_MOUNT_BIND
) {
155 s
= resolve_source_path(dest
, m
->source
);
159 free_and_replace(m
->source
, s
);
161 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
163 r
= allocate_temporary_source(m
);
169 if (m
->type
== CUSTOM_MOUNT_OVERLAY
) {
172 STRV_FOREACH(j
, m
->lower
) {
175 s
= resolve_source_path(dest
, *j
);
179 free_and_replace(*j
, s
);
185 s
= resolve_source_path(dest
, m
->source
);
189 free_and_replace(m
->source
, s
);
191 r
= allocate_temporary_source(m
);
199 s
= resolve_source_path(dest
, m
->work_dir
);
203 free_and_replace(m
->work_dir
, s
);
205 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
207 return log_error_errno(r
, "Failed to acquire working directory: %m");
210 (void) mkdir_label(m
->work_dir
, 0700);
217 int bind_mount_parse(CustomMount
**l
, size_t *n
, const char *s
, bool read_only
) {
218 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
226 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
232 destination
= strdup(source
[0] == '+' ? source
+1 : source
);
236 if (r
== 2 && !isempty(p
)) {
243 source
= mfree(source
);
244 else if (!source_path_is_valid(source
))
247 if (!path_is_absolute(destination
))
250 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
254 m
->source
= TAKE_PTR(source
);
255 m
->destination
= TAKE_PTR(destination
);
256 m
->read_only
= read_only
;
257 m
->options
= TAKE_PTR(opts
);
262 int tmpfs_mount_parse(CustomMount
**l
, size_t *n
, const char *s
) {
263 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
272 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
279 opts
= strdup("mode=0755");
285 if (!path_is_absolute(path
))
288 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
292 m
->destination
= TAKE_PTR(path
);
293 m
->options
= TAKE_PTR(opts
);
298 int overlay_mount_parse(CustomMount
**l
, size_t *n
, const char *s
, bool read_only
) {
299 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
300 _cleanup_strv_free_
char **lower
= NULL
;
304 k
= strv_split_extract(&lower
, s
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
308 return -EADDRNOTAVAIL
;
310 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
311 * we'll also define the destination mount point the same as the upper. */
313 if (!source_path_is_valid(lower
[0]) ||
314 !source_path_is_valid(lower
[1]))
317 upper
= TAKE_PTR(lower
[1]);
319 destination
= strdup(upper
[0] == '+' ? upper
+1 : upper
); /* take the destination without "+" prefix */
325 /* If more than two parameters are specified, the last one is the destination, the second to last one
326 * the "upper", and all before that the "lower" directories. */
328 destination
= lower
[k
- 1];
329 upper
= TAKE_PTR(lower
[k
- 2]);
331 STRV_FOREACH(i
, lower
)
332 if (!source_path_is_valid(*i
))
335 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
338 upper
= mfree(upper
);
339 else if (!source_path_is_valid(upper
))
342 if (!path_is_absolute(destination
))
346 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_OVERLAY
);
350 m
->destination
= TAKE_PTR(destination
);
351 m
->source
= TAKE_PTR(upper
);
352 m
->lower
= TAKE_PTR(lower
);
353 m
->read_only
= read_only
;
358 int inaccessible_mount_parse(CustomMount
**l
, size_t *n
, const char *s
) {
359 _cleanup_free_
char *path
= NULL
;
366 if (!path_is_absolute(s
))
373 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_INACCESSIBLE
);
377 m
->destination
= TAKE_PTR(path
);
381 int tmpfs_patch_options(
384 const char *selinux_apifs_context
,
389 if (uid_shift
!= UID_INVALID
) {
390 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
391 strempty(options
), options
? "," : "",
392 uid_shift
, uid_shift
) < 0)
399 if (selinux_apifs_context
) {
402 t
= strjoin(strempty(options
), options
? "," : "",
403 "context=\"", selinux_apifs_context
, "\"");
412 if (!buf
&& options
) {
413 buf
= strdup(options
);
422 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
423 const char *full
, *top
, *x
;
425 unsigned long extra_flags
= 0;
427 top
= prefix_roota(dest
, "/sys");
428 r
= path_is_fs_type(top
, SYSFS_MAGIC
);
430 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
431 /* /sys might already be mounted as sysfs by the outer child in the
432 * !netns case. In this case, it's all good. Don't touch it because we
433 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
438 full
= prefix_roota(top
, "/full");
440 (void) mkdir(full
, 0755);
442 if (FLAGS_SET(mount_settings
, MOUNT_APPLY_APIVFS_RO
))
443 extra_flags
|= MS_RDONLY
;
445 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
446 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
450 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
451 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
453 from
= path_join(full
, x
);
457 to
= path_join(top
, x
);
461 (void) mkdir(to
, 0755);
463 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
467 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
468 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
473 r
= umount_verbose(full
);
478 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
480 /* Create mountpoint for cgroups. Otherwise we are not allowed since we
481 * remount /sys read-only.
483 x
= prefix_roota(top
, "/fs/cgroup");
484 (void) mkdir_p(x
, 0755);
486 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
487 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
490 int mount_all(const char *dest
,
491 MountSettingsMask mount_settings
,
493 const char *selinux_apifs_context
) {
495 #define PROC_INACCESSIBLE_REG(path) \
496 { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \
497 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
498 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
499 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
501 #define PROC_READ_ONLY(path) \
502 { (path), (path), NULL, NULL, MS_BIND, \
503 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
504 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
505 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
507 typedef struct MountPoint
{
513 MountSettingsMask mount_settings
;
516 static const MountPoint mount_table
[] = {
517 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
518 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
519 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_MKDIR
},
521 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
,
522 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
524 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
,
525 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
527 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
,
528 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
530 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
531 * internals or the host's execution environment to the container */
532 PROC_INACCESSIBLE_REG("/proc/kallsyms"),
533 PROC_INACCESSIBLE_REG("/proc/kcore"),
534 PROC_INACCESSIBLE_REG("/proc/keys"),
535 PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"),
536 PROC_INACCESSIBLE_REG("/proc/timer_list"),
538 /* Make these directories read-only to container payloads: they show hardware information, and in some
539 * cases contain tunables the container really shouldn't have access to. */
540 PROC_READ_ONLY("/proc/acpi"),
541 PROC_READ_ONLY("/proc/apm"),
542 PROC_READ_ONLY("/proc/asound"),
543 PROC_READ_ONLY("/proc/bus"),
544 PROC_READ_ONLY("/proc/fs"),
545 PROC_READ_ONLY("/proc/irq"),
546 PROC_READ_ONLY("/proc/scsi"),
548 { "mqueue", "/dev/mqueue", "mqueue", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
549 MOUNT_IN_USERNS
|MOUNT_MKDIR
},
551 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
552 { "tmpfs", "/tmp", "tmpfs", "mode=1777" TMPFS_LIMITS_TMP
, MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
553 MOUNT_FATAL
|MOUNT_APPLY_TMPFS_TMP
|MOUNT_MKDIR
},
554 { "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
555 MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
|MOUNT_MKDIR
},
556 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
557 MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
|MOUNT_MKDIR
}, /* skipped if above was mounted */
558 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
559 MOUNT_FATAL
|MOUNT_MKDIR
}, /* skipped if above was mounted */
560 { "tmpfs", "/dev", "tmpfs", "mode=755" TMPFS_LIMITS_DEV
, MS_NOSUID
|MS_STRICTATIME
,
561 MOUNT_FATAL
|MOUNT_MKDIR
},
562 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777" TMPFS_LIMITS_DEV_SHM
, MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
563 MOUNT_FATAL
|MOUNT_MKDIR
},
564 { "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN
, MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
565 MOUNT_FATAL
|MOUNT_MKDIR
},
566 { "/usr/lib/os-release", "/run/host/usr/lib/os-release", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
567 MOUNT_FATAL
|MOUNT_MKDIR
|MOUNT_TOUCH
},
568 { "/etc/os-release", "/run/host/etc/os-release", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
569 MOUNT_MKDIR
|MOUNT_TOUCH
},
572 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
,
573 MOUNT_MKDIR
}, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
574 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
,
575 0 }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
579 bool use_userns
= FLAGS_SET(mount_settings
, MOUNT_USE_USERNS
);
580 bool netns
= FLAGS_SET(mount_settings
, MOUNT_APPLY_APIVFS_NETNS
);
581 bool ro
= FLAGS_SET(mount_settings
, MOUNT_APPLY_APIVFS_RO
);
582 bool in_userns
= FLAGS_SET(mount_settings
, MOUNT_IN_USERNS
);
583 bool tmpfs_tmp
= FLAGS_SET(mount_settings
, MOUNT_APPLY_TMPFS_TMP
);
587 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
588 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
590 struct stat source_st
;
591 bool fatal
= FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_FATAL
);
593 if (in_userns
!= FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_IN_USERNS
))
596 if (!netns
&& FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_APPLY_APIVFS_NETNS
))
599 if (!ro
&& FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_APPLY_APIVFS_RO
))
602 if (!tmpfs_tmp
&& FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_APPLY_TMPFS_TMP
))
605 r
= chase_symlinks(mount_table
[k
].where
, dest
, CHASE_NONEXISTENT
|CHASE_PREFIX_ROOT
, &where
, NULL
);
607 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, mount_table
[k
].where
);
609 /* Skip this entry if it is not a remount. */
610 if (mount_table
[k
].what
) {
611 r
= path_is_mount_point(where
, NULL
, 0);
612 if (r
< 0 && r
!= -ENOENT
)
613 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
617 /* Shortcut for optional bind mounts: if the source can't be found skip ahead to avoid creating
618 * empty and unused directories. */
619 if (!fatal
&& FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_MKDIR
) && FLAGS_SET(mount_table
[k
].flags
, MS_BIND
)) {
620 r
= stat(mount_table
[k
].what
, &source_st
);
624 return log_error_errno(errno
, "Failed to stat %s: %m", mount_table
[k
].what
);
629 if (FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_MKDIR
)) {
630 uid_t u
= (use_userns
&& !in_userns
) ? uid_shift
: UID_INVALID
;
632 if (FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_TOUCH
))
633 r
= mkdir_parents_safe(dest
, where
, 0755, u
, u
, 0);
635 r
= mkdir_p_safe(dest
, where
, 0755, u
, u
, 0);
636 if (r
< 0 && r
!= -EEXIST
) {
637 if (fatal
&& r
!= -EROFS
)
638 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
640 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
642 /* If we failed mkdir() or chown() due to the root directory being read only,
643 * attempt to mount this fs anyway and let mount_verbose log any errors */
647 if (FLAGS_SET(mount_table
[k
].mount_settings
, MOUNT_TOUCH
)) {
649 if (r
< 0 && r
!= -EEXIST
) {
651 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
652 log_debug_errno(r
, "Failed to create mount point %s: %m", where
);
657 o
= mount_table
[k
].options
;
658 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
659 r
= tmpfs_patch_options(o
, in_userns
? 0 : uid_shift
, selinux_apifs_context
, &options
);
666 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
670 mount_table
[k
].flags
,
679 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
680 const char *p
= options
;
681 unsigned long flags
= *mount_flags
;
688 _cleanup_free_
char *word
= NULL
;
690 r
= extract_first_word(&p
, &word
, ",", 0);
692 return log_error_errno(r
, "Failed to extract mount option: %m");
696 if (streq(word
, "rbind"))
698 else if (streq(word
, "norbind"))
701 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
702 "Invalid bind mount option: %s",
707 *mount_flags
= flags
;
708 /* in the future mount_opts will hold string options for mount(2) */
714 static int mount_bind(const char *dest
, CustomMount
*m
) {
715 _cleanup_free_
char *mount_opts
= NULL
, *where
= NULL
;
716 unsigned long mount_flags
= MS_BIND
| MS_REC
;
717 struct stat source_st
, dest_st
;
724 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
729 if (stat(m
->source
, &source_st
) < 0)
730 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
732 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
, NULL
);
734 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
735 if (r
> 0) { /* Path exists already? */
737 if (stat(where
, &dest_st
) < 0)
738 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
740 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
))
741 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
742 "Cannot bind mount directory %s on file %s.",
745 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
))
746 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
747 "Cannot bind mount file %s on directory %s.",
750 } else { /* Path doesn't exist yet? */
751 r
= mkdir_parents_label(where
, 0755);
753 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
755 /* Create the mount point. Any non-directory file can be
756 * mounted on any non-directory file (regular, fifo, socket,
759 if (S_ISDIR(source_st
.st_mode
))
760 r
= mkdir_label(where
, 0755);
764 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
767 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, mount_flags
, mount_opts
);
772 r
= bind_remount_recursive(where
, MS_RDONLY
, MS_RDONLY
, NULL
);
774 return log_error_errno(r
, "Read-only bind mount failed: %m");
780 static int mount_tmpfs(const char *dest
, CustomMount
*m
, uid_t uid_shift
, const char *selinux_apifs_context
) {
783 _cleanup_free_
char *buf
= NULL
, *where
= NULL
;
789 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
, NULL
);
791 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
792 if (r
== 0) { /* Doesn't exist yet? */
793 r
= mkdir_p_label(where
, 0755);
795 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
798 r
= tmpfs_patch_options(m
->options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
801 options
= r
> 0 ? buf
: m
->options
;
803 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
806 static char *joined_and_escaped_lower_dirs(char **lower
) {
807 _cleanup_strv_free_
char **sv
= NULL
;
809 sv
= strv_copy(lower
);
815 if (!strv_shell_escape(sv
, ",:"))
818 return strv_join(sv
, ":");
821 static int mount_overlay(const char *dest
, CustomMount
*m
) {
822 _cleanup_free_
char *lower
= NULL
, *where
= NULL
, *escaped_source
= NULL
;
829 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
, NULL
);
831 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
832 if (r
== 0) { /* Doesn't exist yet? */
833 r
= mkdir_label(where
, 0755);
835 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
838 (void) mkdir_p_label(m
->source
, 0755);
840 lower
= joined_and_escaped_lower_dirs(m
->lower
);
844 escaped_source
= shell_escape(m
->source
, ",:");
849 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
851 _cleanup_free_
char *escaped_work_dir
= NULL
;
853 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
854 if (!escaped_work_dir
)
857 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
860 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
863 static int mount_inaccessible(const char *dest
, CustomMount
*m
) {
864 _cleanup_free_
char *where
= NULL
, *source
= NULL
;
871 r
= chase_symlinks_and_stat(m
->destination
, dest
, CHASE_PREFIX_ROOT
, &where
, &st
, NULL
);
873 log_full_errno(m
->graceful
? LOG_DEBUG
: LOG_ERR
, r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
874 return m
->graceful
? 0 : r
;
877 r
= mode_to_inaccessible_node(NULL
, st
.st_mode
, &source
);
879 return m
->graceful
? 0 : r
;
881 r
= mount_verbose(m
->graceful
? LOG_DEBUG
: LOG_ERR
, source
, where
, NULL
, MS_BIND
, NULL
);
883 return m
->graceful
? 0 : r
;
885 r
= mount_verbose(m
->graceful
? LOG_DEBUG
: LOG_ERR
, NULL
, where
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, NULL
);
887 (void) umount_verbose(where
);
888 return m
->graceful
? 0 : r
;
894 static int mount_arbitrary(const char *dest
, CustomMount
*m
) {
895 _cleanup_free_
char *where
= NULL
;
901 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
, NULL
);
903 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
904 if (r
== 0) { /* Doesn't exist yet? */
905 r
= mkdir_p_label(where
, 0755);
907 return log_error_errno(r
, "Creating mount point for mount %s failed: %m", where
);
910 return mount_verbose(LOG_ERR
, m
->source
, where
, m
->type_argument
, 0, m
->options
);
915 CustomMount
*mounts
, size_t n
,
917 const char *selinux_apifs_context
,
918 MountSettingsMask mount_settings
) {
925 for (i
= 0; i
< n
; i
++) {
926 CustomMount
*m
= mounts
+ i
;
928 if (FLAGS_SET(mount_settings
, MOUNT_IN_USERNS
) != m
->in_userns
)
931 if (FLAGS_SET(mount_settings
, MOUNT_ROOT_ONLY
) && !path_equal(m
->destination
, "/"))
934 if (FLAGS_SET(mount_settings
, MOUNT_NON_ROOT_ONLY
) && path_equal(m
->destination
, "/"))
939 case CUSTOM_MOUNT_BIND
:
940 r
= mount_bind(dest
, m
);
943 case CUSTOM_MOUNT_TMPFS
:
944 r
= mount_tmpfs(dest
, m
, uid_shift
, selinux_apifs_context
);
947 case CUSTOM_MOUNT_OVERLAY
:
948 r
= mount_overlay(dest
, m
);
951 case CUSTOM_MOUNT_INACCESSIBLE
:
952 r
= mount_inaccessible(dest
, m
);
955 case CUSTOM_MOUNT_ARBITRARY
:
956 r
= mount_arbitrary(dest
, m
);
960 assert_not_reached("Unknown custom mount type");
970 bool has_custom_root_mount(const CustomMount
*mounts
, size_t n
) {
973 for (i
= 0; i
< n
; i
++) {
974 const CustomMount
*m
= mounts
+ i
;
976 if (path_equal(m
->destination
, "/"))
983 static int setup_volatile_state(const char *directory
, uid_t uid_shift
, const char *selinux_apifs_context
) {
985 _cleanup_free_
char *buf
= NULL
;
986 const char *p
, *options
;
991 /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
993 r
= bind_remount_recursive(directory
, MS_RDONLY
, MS_RDONLY
, NULL
);
995 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
997 p
= prefix_roota(directory
, "/var");
999 if (r
< 0 && errno
!= EEXIST
)
1000 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1002 options
= "mode=755" TMPFS_LIMITS_VOLATILE_STATE
;
1003 r
= tmpfs_patch_options(options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
1009 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
1012 static int setup_volatile_yes(const char *directory
, uid_t uid_shift
, const char *selinux_apifs_context
) {
1014 bool tmpfs_mounted
= false, bind_mounted
= false;
1015 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1016 _cleanup_free_
char *buf
= NULL
, *bindir
= NULL
;
1017 const char *f
, *t
, *options
;
1023 /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and
1024 * that read-only. Before we start setting this up let's validate if the image has the /usr merge
1025 * implemented, and let's output a friendly log message if it hasn't. */
1027 bindir
= path_join(directory
, "/bin");
1030 if (lstat(bindir
, &st
) < 0) {
1031 if (errno
!= ENOENT
)
1032 return log_error_errno(errno
, "Failed to stat /bin directory below image: %m");
1034 /* ENOENT is fine, just means the image is probably just a naked /usr and we can create the
1036 } else if (S_ISDIR(st
.st_mode
))
1037 return log_error_errno(SYNTHETIC_ERRNO(EISDIR
),
1038 "Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. "
1039 "Please work with your distribution and help them adopt the merged /usr scheme.");
1040 else if (!S_ISLNK(st
.st_mode
))
1041 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1042 "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically).");
1044 if (!mkdtemp(template))
1045 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1047 options
= "mode=755" TMPFS_LIMITS_ROOTFS
;
1048 r
= tmpfs_patch_options(options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
1054 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1058 tmpfs_mounted
= true;
1060 f
= prefix_roota(directory
, "/usr");
1061 t
= prefix_roota(template, "/usr");
1064 if (r
< 0 && errno
!= EEXIST
) {
1065 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1069 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
1073 bind_mounted
= true;
1075 r
= bind_remount_recursive(t
, MS_RDONLY
, MS_RDONLY
, NULL
);
1077 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1081 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
1085 (void) rmdir(template);
1091 (void) umount_verbose(t
);
1094 (void) umount_verbose(template);
1095 (void) rmdir(template);
1099 static int setup_volatile_overlay(const char *directory
, uid_t uid_shift
, const char *selinux_apifs_context
) {
1101 _cleanup_free_
char *buf
= NULL
, *escaped_directory
= NULL
, *escaped_upper
= NULL
, *escaped_work
= NULL
;
1102 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1103 const char *upper
, *work
, *options
;
1104 bool tmpfs_mounted
= false;
1109 /* --volatile=overlay means we mount an overlayfs to the root dir. */
1111 if (!mkdtemp(template))
1112 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1114 options
= "mode=755" TMPFS_LIMITS_ROOTFS
;
1115 r
= tmpfs_patch_options(options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
1121 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
1125 tmpfs_mounted
= true;
1127 upper
= strjoina(template, "/upper");
1128 work
= strjoina(template, "/work");
1130 if (mkdir(upper
, 0755) < 0) {
1131 r
= log_error_errno(errno
, "Failed to create %s: %m", upper
);
1134 if (mkdir(work
, 0755) < 0) {
1135 r
= log_error_errno(errno
, "Failed to create %s: %m", work
);
1139 /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
1140 * that the kernel allows us to do that without going through some mount point rearrangements. */
1142 escaped_directory
= shell_escape(directory
, ",:");
1143 escaped_upper
= shell_escape(upper
, ",:");
1144 escaped_work
= shell_escape(work
, ",:");
1145 if (!escaped_directory
|| !escaped_upper
|| !escaped_work
) {
1150 options
= strjoina("lowerdir=", escaped_directory
, ",upperdir=", escaped_upper
, ",workdir=", escaped_work
);
1151 r
= mount_verbose(LOG_ERR
, "overlay", directory
, "overlay", 0, options
);
1155 (void) umount_verbose(template);
1157 (void) rmdir(template);
1161 int setup_volatile_mode(
1162 const char *directory
,
1165 const char *selinux_apifs_context
) {
1170 return setup_volatile_yes(directory
, uid_shift
, selinux_apifs_context
);
1172 case VOLATILE_STATE
:
1173 return setup_volatile_state(directory
, uid_shift
, selinux_apifs_context
);
1175 case VOLATILE_OVERLAY
:
1176 return setup_volatile_overlay(directory
, uid_shift
, selinux_apifs_context
);
1183 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1184 int pivot_root_parse(char **pivot_root_new
, char **pivot_root_old
, const char *s
) {
1185 _cleanup_free_
char *root_new
= NULL
, *root_old
= NULL
;
1189 assert(pivot_root_new
);
1190 assert(pivot_root_old
);
1192 r
= extract_first_word(&p
, &root_new
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1201 root_old
= strdup(p
);
1206 if (!path_is_absolute(root_new
))
1208 if (root_old
&& !path_is_absolute(root_old
))
1211 free_and_replace(*pivot_root_new
, root_new
);
1212 free_and_replace(*pivot_root_old
, root_old
);
1217 int setup_pivot_root(const char *directory
, const char *pivot_root_new
, const char *pivot_root_old
) {
1218 _cleanup_free_
char *directory_pivot_root_new
= NULL
;
1219 _cleanup_free_
char *pivot_tmp_pivot_root_old
= NULL
;
1220 char pivot_tmp
[] = "/tmp/nspawn-pivot-XXXXXX";
1221 bool remove_pivot_tmp
= false;
1226 if (!pivot_root_new
)
1229 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1230 * If pivot_root_old is NULL, the existing / disappears.
1231 * This requires a temporary directory, pivot_tmp, which is
1232 * not a child of either.
1234 * This is typically used for OSTree-style containers, where
1235 * the root partition contains several sysroots which could be
1236 * run. Normally, one would be chosen by the bootloader and
1237 * pivoted to / by initramfs.
1239 * For example, for an OSTree deployment, pivot_root_new
1240 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1241 * code doesn’t do the /var mount which OSTree expects: use
1242 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1244 * So in the OSTree case, we’ll end up with something like:
1245 * - directory = /tmp/nspawn-root-123456
1246 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1247 * - pivot_root_old = /sysroot
1248 * - directory_pivot_root_new =
1249 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1250 * - pivot_tmp = /tmp/nspawn-pivot-123456
1251 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1253 * Requires all file systems at directory and below to be mounted
1254 * MS_PRIVATE or MS_SLAVE so they can be moved.
1256 directory_pivot_root_new
= path_join(directory
, pivot_root_new
);
1257 if (!directory_pivot_root_new
)
1260 /* Remount directory_pivot_root_new to make it movable. */
1261 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory_pivot_root_new
, NULL
, MS_BIND
, NULL
);
1265 if (pivot_root_old
) {
1266 if (!mkdtemp(pivot_tmp
)) {
1267 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
1271 remove_pivot_tmp
= true;
1272 pivot_tmp_pivot_root_old
= path_join(pivot_tmp
, pivot_root_old
);
1273 if (!pivot_tmp_pivot_root_old
) {
1278 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, pivot_tmp
, NULL
, MS_MOVE
, NULL
);
1282 r
= mount_verbose(LOG_ERR
, directory
, pivot_tmp_pivot_root_old
, NULL
, MS_MOVE
, NULL
);
1286 r
= mount_verbose(LOG_ERR
, pivot_tmp
, directory
, NULL
, MS_MOVE
, NULL
);
1290 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory
, NULL
, MS_MOVE
, NULL
);
1296 if (remove_pivot_tmp
)
1297 (void) rmdir(pivot_tmp
);