1 /* SPDX-License-Identifier: LGPL-2.1+ */
4 #include <linux/magic.h>
6 #include "alloc-util.h"
12 #include "mount-util.h"
13 #include "mountpoint-util.h"
14 #include "nspawn-mount.h"
15 #include "parse-util.h"
16 #include "path-util.h"
19 #include "stat-util.h"
20 #include "string-util.h"
22 #include "tmpfile-util.h"
23 #include "user-util.h"
26 CustomMount
* custom_mount_add(CustomMount
**l
, size_t *n
, CustomMountType t
) {
32 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
34 c
= reallocarray(*l
, *n
+ 1, sizeof(CustomMount
));
42 *ret
= (CustomMount
) { .type
= t
};
47 void custom_mount_free_all(CustomMount
*l
, size_t n
) {
50 for (i
= 0; i
< n
; i
++) {
51 CustomMount
*m
= l
+ i
;
58 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
62 if (m
->rm_rf_tmpdir
) {
63 (void) rm_rf(m
->rm_rf_tmpdir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
64 free(m
->rm_rf_tmpdir
);
73 static int custom_mount_compare(const CustomMount
*a
, const CustomMount
*b
) {
76 r
= path_compare(a
->destination
, b
->destination
);
80 return CMP(a
->type
, b
->type
);
83 static bool source_path_is_valid(const char *p
) {
89 return path_is_absolute(p
);
92 static char *resolve_source_path(const char *dest
, const char *source
) {
98 return prefix_root(dest
, source
+ 1);
100 return strdup(source
);
103 int custom_mount_prepare_all(const char *dest
, CustomMount
*l
, size_t n
) {
107 /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
108 * parent process, so that we know the temporary directories to remove on exit before we fork off the
113 /* Order the custom mounts, and make sure we have a working directory */
114 typesafe_qsort(l
, n
, custom_mount_compare
);
116 for (i
= 0; i
< n
; i
++) {
117 CustomMount
*m
= l
+ i
;
122 s
= resolve_source_path(dest
, m
->source
);
126 free_and_replace(m
->source
, s
);
128 /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
130 m
->rm_rf_tmpdir
= strdup("/var/tmp/nspawn-temp-XXXXXX");
131 if (!m
->rm_rf_tmpdir
)
134 if (!mkdtemp(m
->rm_rf_tmpdir
)) {
135 m
->rm_rf_tmpdir
= mfree(m
->rm_rf_tmpdir
);
136 return log_error_errno(errno
, "Failed to acquire temporary directory: %m");
139 m
->source
= strjoin(m
->rm_rf_tmpdir
, "/src");
143 if (mkdir(m
->source
, 0755) < 0)
144 return log_error_errno(errno
, "Failed to create %s: %m", m
->source
);
147 if (m
->type
== CUSTOM_MOUNT_OVERLAY
) {
150 STRV_FOREACH(j
, m
->lower
) {
153 s
= resolve_source_path(dest
, *j
);
157 free_and_replace(*j
, s
);
163 s
= resolve_source_path(dest
, m
->work_dir
);
167 free_and_replace(m
->work_dir
, s
);
171 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
173 return log_error_errno(r
, "Failed to acquire working directory: %m");
176 (void) mkdir_label(m
->work_dir
, 0700);
183 int bind_mount_parse(CustomMount
**l
, size_t *n
, const char *s
, bool read_only
) {
184 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
192 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
198 destination
= strdup(source
[0] == '+' ? source
+1 : source
);
202 if (r
== 2 && !isempty(p
)) {
210 else if (!source_path_is_valid(source
))
213 if (!path_is_absolute(destination
))
216 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
221 m
->destination
= destination
;
222 m
->read_only
= read_only
;
225 source
= destination
= opts
= NULL
;
229 int tmpfs_mount_parse(CustomMount
**l
, size_t *n
, const char *s
) {
230 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
239 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
246 opts
= strdup("mode=0755");
252 if (!path_is_absolute(path
))
255 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
259 m
->destination
= TAKE_PTR(path
);
260 m
->options
= TAKE_PTR(opts
);
265 int overlay_mount_parse(CustomMount
**l
, size_t *n
, const char *s
, bool read_only
) {
266 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
267 _cleanup_strv_free_
char **lower
= NULL
;
271 k
= strv_split_extract(&lower
, s
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
275 return -EADDRNOTAVAIL
;
277 /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
278 * we'll also define the destination mount point the same as the upper. */
280 if (!source_path_is_valid(lower
[0]) ||
281 !source_path_is_valid(lower
[1]))
284 upper
= TAKE_PTR(lower
[1]);
286 destination
= strdup(upper
[0] == '+' ? upper
+1 : upper
); /* take the destination without "+" prefix */
292 /* If more than two parameters are specified, the last one is the destination, the second to last one
293 * the "upper", and all before that the "lower" directories. */
295 destination
= lower
[k
- 1];
296 upper
= TAKE_PTR(lower
[k
- 2]);
298 STRV_FOREACH(i
, lower
)
299 if (!source_path_is_valid(*i
))
302 /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
306 else if (!source_path_is_valid(upper
))
309 if (!path_is_absolute(destination
))
313 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_OVERLAY
);
317 m
->destination
= TAKE_PTR(destination
);
318 m
->source
= TAKE_PTR(upper
);
319 m
->lower
= TAKE_PTR(lower
);
320 m
->read_only
= read_only
;
325 int tmpfs_patch_options(
328 const char *selinux_apifs_context
,
333 if (uid_shift
!= UID_INVALID
) {
334 if (asprintf(&buf
, "%s%suid=" UID_FMT
",gid=" UID_FMT
,
335 strempty(options
), options
? "," : "",
336 uid_shift
, uid_shift
) < 0)
343 if (selinux_apifs_context
) {
346 t
= strjoin(strempty(options
), options
? "," : "",
347 "context=\"", selinux_apifs_context
, "\"");
356 if (!buf
&& options
) {
357 buf
= strdup(options
);
366 int mount_sysfs(const char *dest
, MountSettingsMask mount_settings
) {
367 const char *full
, *top
, *x
;
369 unsigned long extra_flags
= 0;
371 top
= prefix_roota(dest
, "/sys");
372 r
= path_is_fs_type(top
, SYSFS_MAGIC
);
374 return log_error_errno(r
, "Failed to determine filesystem type of %s: %m", top
);
375 /* /sys might already be mounted as sysfs by the outer child in the
376 * !netns case. In this case, it's all good. Don't touch it because we
377 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
382 full
= prefix_roota(top
, "/full");
384 (void) mkdir(full
, 0755);
386 if (mount_settings
& MOUNT_APPLY_APIVFS_RO
)
387 extra_flags
|= MS_RDONLY
;
389 r
= mount_verbose(LOG_ERR
, "sysfs", full
, "sysfs",
390 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|extra_flags
, NULL
);
394 FOREACH_STRING(x
, "block", "bus", "class", "dev", "devices", "kernel") {
395 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
397 from
= prefix_root(full
, x
);
401 to
= prefix_root(top
, x
);
405 (void) mkdir(to
, 0755);
407 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
411 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
412 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
417 r
= umount_verbose(full
);
422 return log_error_errno(errno
, "Failed to remove %s: %m", full
);
424 /* Create the mountpoint for cgroups now. Otherwise we would not be allowed to create it later, since we
425 * remount /sys read-only.
427 x
= prefix_roota(top
, "/fs/cgroup");
428 (void) mkdir_p(x
, 0755);
430 return mount_verbose(LOG_ERR
, NULL
, top
, NULL
,
431 MS_BIND
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
|extra_flags
, NULL
);
434 static int mkdir_userns(const char *path
, mode_t mode
, uid_t uid_shift
) {
439 r
= mkdir_errno_wrapper(path
, mode
);
440 if (r
< 0 && r
!= -EEXIST
)
443 if (uid_shift
== UID_INVALID
)
446 if (lchown(path
, uid_shift
, uid_shift
) < 0)
452 static int mkdir_userns_p(const char *prefix
, const char *path
, mode_t mode
, uid_t uid_shift
) {
458 if (prefix
&& !path_startswith(path
, prefix
))
461 /* create every parent directory in the path, except the last component */
462 p
= path
+ strspn(path
, "/");
464 char t
[strlen(path
) + 1];
466 e
= p
+ strcspn(p
, "/");
467 p
= e
+ strspn(e
, "/");
469 /* Is this the last component? If so, then we're done */
473 memcpy(t
, path
, e
- path
);
476 if (prefix
&& path_startswith(prefix
, t
))
479 r
= mkdir_userns(t
, mode
, uid_shift
);
484 return mkdir_userns(path
, mode
, uid_shift
);
487 int mount_all(const char *dest
,
488 MountSettingsMask mount_settings
,
490 const char *selinux_apifs_context
) {
492 #define PROC_INACCESSIBLE(path) \
493 { NULL, (path), NULL, NULL, MS_BIND, \
494 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
495 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
496 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
498 #define PROC_READ_ONLY(path) \
499 { (path), (path), NULL, NULL, MS_BIND, \
500 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
501 { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
502 MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
504 typedef struct MountPoint
{
510 MountSettingsMask mount_settings
;
513 static const MountPoint mount_table
[] = {
514 /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
515 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
516 MOUNT_FATAL
|MOUNT_IN_USERNS
},
518 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
,
519 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* Bind mount first ... */
521 { "/proc/sys/net", "/proc/sys/net", NULL
, NULL
, MS_BIND
,
522 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
}, /* (except for this) */
524 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
,
525 MOUNT_FATAL
|MOUNT_IN_USERNS
|MOUNT_APPLY_APIVFS_RO
}, /* ... then, make it r/o */
527 /* Make these files inaccessible to container payloads: they potentially leak information about kernel
528 * internals or the host's execution environment to the container */
529 PROC_INACCESSIBLE("/proc/kallsyms"),
530 PROC_INACCESSIBLE("/proc/kcore"),
531 PROC_INACCESSIBLE("/proc/keys"),
532 PROC_INACCESSIBLE("/proc/sysrq-trigger"),
533 PROC_INACCESSIBLE("/proc/timer_list"),
535 /* Make these directories read-only to container payloads: they show hardware information, and in some
536 * cases contain tunables the container really shouldn't have access to. */
537 PROC_READ_ONLY("/proc/acpi"),
538 PROC_READ_ONLY("/proc/apm"),
539 PROC_READ_ONLY("/proc/asound"),
540 PROC_READ_ONLY("/proc/bus"),
541 PROC_READ_ONLY("/proc/fs"),
542 PROC_READ_ONLY("/proc/irq"),
543 PROC_READ_ONLY("/proc/scsi"),
545 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
546 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
547 MOUNT_FATAL
|MOUNT_APPLY_TMPFS_TMP
},
548 { "tmpfs", "/sys", "tmpfs", "mode=555", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
549 MOUNT_FATAL
|MOUNT_APPLY_APIVFS_NETNS
},
550 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
551 MOUNT_FATAL
|MOUNT_APPLY_APIVFS_RO
}, /* skipped if above was mounted */
552 { "sysfs", "/sys", "sysfs", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
,
553 MOUNT_FATAL
}, /* skipped if above was mounted */
554 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
,
556 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
558 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
,
560 { "mqueue", "/dev/mqueue", "mqueue", NULL
, 0,
564 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
,
565 0 }, /* Bind mount first */
566 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
,
567 0 }, /* Then, make it r/o */
571 _cleanup_(unlink_and_freep
) char *inaccessible
= NULL
;
572 bool use_userns
= (mount_settings
& MOUNT_USE_USERNS
);
573 bool netns
= (mount_settings
& MOUNT_APPLY_APIVFS_NETNS
);
574 bool ro
= (mount_settings
& MOUNT_APPLY_APIVFS_RO
);
575 bool in_userns
= (mount_settings
& MOUNT_IN_USERNS
);
576 bool tmpfs_tmp
= (mount_settings
& MOUNT_APPLY_TMPFS_TMP
);
580 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
581 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
582 const char *o
, *what
;
583 bool fatal
= (mount_table
[k
].mount_settings
& MOUNT_FATAL
);
585 if (in_userns
!= (bool)(mount_table
[k
].mount_settings
& MOUNT_IN_USERNS
))
588 if (!netns
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_NETNS
))
591 if (!ro
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_APIVFS_RO
))
594 if (!tmpfs_tmp
&& (bool)(mount_table
[k
].mount_settings
& MOUNT_APPLY_TMPFS_TMP
))
597 r
= chase_symlinks(mount_table
[k
].where
, dest
, CHASE_NONEXISTENT
|CHASE_PREFIX_ROOT
, &where
);
599 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, mount_table
[k
].where
);
601 if (mount_table
[k
].mount_settings
& MOUNT_INACCESSIBLE_REG
) {
604 _cleanup_free_
char *np
= NULL
;
606 r
= tempfn_random_child(NULL
, "inaccessible", &np
);
608 return log_error_errno(r
, "Failed to generate inaccessible file node path: %m");
610 r
= touch_file(np
, false, USEC_INFINITY
, UID_INVALID
, GID_INVALID
, 0000);
612 return log_error_errno(r
, "Failed to create inaccessible file node '%s': %m", np
);
614 inaccessible
= TAKE_PTR(np
);
619 what
= mount_table
[k
].what
;
621 r
= path_is_mount_point(where
, NULL
, 0);
622 if (r
< 0 && r
!= -ENOENT
)
623 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
625 /* Skip this entry if it is not a remount. */
629 r
= mkdir_userns_p(dest
, where
, 0755, (use_userns
&& !in_userns
) ? uid_shift
: UID_INVALID
);
630 if (r
< 0 && r
!= -EEXIST
) {
631 if (fatal
&& r
!= -EROFS
)
632 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
634 log_debug_errno(r
, "Failed to create directory %s: %m", where
);
635 /* If we failed mkdir() or chown() due to the root
636 * directory being read only, attempt to mount this fs
637 * anyway and let mount_verbose log any errors */
642 o
= mount_table
[k
].options
;
643 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
644 r
= tmpfs_patch_options(o
, in_userns
? 0 : uid_shift
, selinux_apifs_context
, &options
);
651 r
= mount_verbose(fatal
? LOG_ERR
: LOG_DEBUG
,
655 mount_table
[k
].flags
,
664 static int mount_bind(const char *dest
, CustomMount
*m
) {
666 _cleanup_free_
char *where
= NULL
;
667 struct stat source_st
, dest_st
;
673 if (stat(m
->source
, &source_st
) < 0)
674 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
676 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
678 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
679 if (r
> 0) { /* Path exists already? */
681 if (stat(where
, &dest_st
) < 0)
682 return log_error_errno(errno
, "Failed to stat %s: %m", where
);
684 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
))
685 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
686 "Cannot bind mount directory %s on file %s.",
689 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
))
690 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
691 "Cannot bind mount file %s on directory %s.",
694 } else { /* Path doesn't exist yet? */
695 r
= mkdir_parents_label(where
, 0755);
697 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
699 /* Create the mount point. Any non-directory file can be
700 * mounted on any non-directory file (regular, fifo, socket,
703 if (S_ISDIR(source_st
.st_mode
))
704 r
= mkdir_label(where
, 0755);
708 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
712 r
= mount_verbose(LOG_ERR
, m
->source
, where
, NULL
, MS_BIND
| MS_REC
, m
->options
);
717 r
= bind_remount_recursive(where
, true, NULL
);
719 return log_error_errno(r
, "Read-only bind mount failed: %m");
725 static int mount_tmpfs(
728 bool userns
, uid_t uid_shift
, uid_t uid_range
,
729 const char *selinux_apifs_context
) {
732 _cleanup_free_
char *buf
= NULL
, *where
= NULL
;
738 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
740 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
741 if (r
== 0) { /* Doesn't exist yet? */
742 r
= mkdir_p_label(where
, 0755);
744 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
747 r
= tmpfs_patch_options(m
->options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
750 options
= r
> 0 ? buf
: m
->options
;
752 return mount_verbose(LOG_ERR
, "tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
);
755 static char *joined_and_escaped_lower_dirs(char **lower
) {
756 _cleanup_strv_free_
char **sv
= NULL
;
758 sv
= strv_copy(lower
);
764 if (!strv_shell_escape(sv
, ",:"))
767 return strv_join(sv
, ":");
770 static int mount_overlay(const char *dest
, CustomMount
*m
) {
772 _cleanup_free_
char *lower
= NULL
, *where
= NULL
, *escaped_source
= NULL
;
779 r
= chase_symlinks(m
->destination
, dest
, CHASE_PREFIX_ROOT
|CHASE_NONEXISTENT
, &where
);
781 return log_error_errno(r
, "Failed to resolve %s/%s: %m", dest
, m
->destination
);
782 if (r
== 0) { /* Doesn't exist yet? */
783 r
= mkdir_label(where
, 0755);
785 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
788 (void) mkdir_p_label(m
->source
, 0755);
790 lower
= joined_and_escaped_lower_dirs(m
->lower
);
794 escaped_source
= shell_escape(m
->source
, ",:");
799 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
801 _cleanup_free_
char *escaped_work_dir
= NULL
;
803 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
804 if (!escaped_work_dir
)
807 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
810 return mount_verbose(LOG_ERR
, "overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
);
815 CustomMount
*mounts
, size_t n
,
816 bool userns
, uid_t uid_shift
, uid_t uid_range
,
817 const char *selinux_apifs_context
) {
824 for (i
= 0; i
< n
; i
++) {
825 CustomMount
*m
= mounts
+ i
;
829 case CUSTOM_MOUNT_BIND
:
830 r
= mount_bind(dest
, m
);
833 case CUSTOM_MOUNT_TMPFS
:
834 r
= mount_tmpfs(dest
, m
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
837 case CUSTOM_MOUNT_OVERLAY
:
838 r
= mount_overlay(dest
, m
);
842 assert_not_reached("Unknown custom mount type");
852 int setup_volatile_state(
853 const char *directory
,
855 bool userns
, uid_t uid_shift
, uid_t uid_range
,
856 const char *selinux_apifs_context
) {
858 _cleanup_free_
char *buf
= NULL
;
859 const char *p
, *options
;
864 if (mode
!= VOLATILE_STATE
)
867 /* --volatile=state means we simply overmount /var
868 with a tmpfs, and the rest read-only. */
870 r
= bind_remount_recursive(directory
, true, NULL
);
872 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
874 p
= prefix_roota(directory
, "/var");
876 if (r
< 0 && errno
!= EEXIST
)
877 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
879 options
= "mode=755";
880 r
= tmpfs_patch_options(options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
886 return mount_verbose(LOG_ERR
, "tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
);
890 const char *directory
,
892 bool userns
, uid_t uid_shift
, uid_t uid_range
,
893 const char *selinux_apifs_context
) {
895 bool tmpfs_mounted
= false, bind_mounted
= false;
896 char template[] = "/tmp/nspawn-volatile-XXXXXX";
897 _cleanup_free_
char *buf
= NULL
;
898 const char *f
, *t
, *options
;
903 if (mode
!= VOLATILE_YES
)
906 /* --volatile=yes means we mount a tmpfs as the root dir, and
907 bind mount the original /usr inside it, read-only. */
909 if (!mkdtemp(template))
910 return log_error_errno(errno
, "Failed to create temporary directory: %m");
912 options
= "mode=755";
913 r
= tmpfs_patch_options(options
, uid_shift
== 0 ? UID_INVALID
: uid_shift
, selinux_apifs_context
, &buf
);
919 r
= mount_verbose(LOG_ERR
, "tmpfs", template, "tmpfs", MS_STRICTATIME
, options
);
923 tmpfs_mounted
= true;
925 f
= prefix_roota(directory
, "/usr");
926 t
= prefix_roota(template, "/usr");
929 if (r
< 0 && errno
!= EEXIST
) {
930 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
934 r
= mount_verbose(LOG_ERR
, f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
);
940 r
= bind_remount_recursive(t
, true, NULL
);
942 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
946 r
= mount_verbose(LOG_ERR
, template, directory
, NULL
, MS_MOVE
, NULL
);
950 (void) rmdir(template);
956 (void) umount_verbose(t
);
959 (void) umount_verbose(template);
960 (void) rmdir(template);
964 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
965 int pivot_root_parse(char **pivot_root_new
, char **pivot_root_old
, const char *s
) {
966 _cleanup_free_
char *root_new
= NULL
, *root_old
= NULL
;
970 assert(pivot_root_new
);
971 assert(pivot_root_old
);
973 r
= extract_first_word(&p
, &root_new
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
982 root_old
= strdup(p
);
987 if (!path_is_absolute(root_new
))
989 if (root_old
&& !path_is_absolute(root_old
))
992 free_and_replace(*pivot_root_new
, root_new
);
993 free_and_replace(*pivot_root_old
, root_old
);
998 int setup_pivot_root(const char *directory
, const char *pivot_root_new
, const char *pivot_root_old
) {
999 _cleanup_free_
char *directory_pivot_root_new
= NULL
;
1000 _cleanup_free_
char *pivot_tmp_pivot_root_old
= NULL
;
1001 char pivot_tmp
[] = "/tmp/nspawn-pivot-XXXXXX";
1002 bool remove_pivot_tmp
= false;
1007 if (!pivot_root_new
)
1010 /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1011 * If pivot_root_old is NULL, the existing / disappears.
1012 * This requires a temporary directory, pivot_tmp, which is
1013 * not a child of either.
1015 * This is typically used for OSTree-style containers, where
1016 * the root partition contains several sysroots which could be
1017 * run. Normally, one would be chosen by the bootloader and
1018 * pivoted to / by initramfs.
1020 * For example, for an OSTree deployment, pivot_root_new
1021 * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1022 * code doesn’t do the /var mount which OSTree expects: use
1023 * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1025 * So in the OSTree case, we’ll end up with something like:
1026 * - directory = /tmp/nspawn-root-123456
1027 * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1028 * - pivot_root_old = /sysroot
1029 * - directory_pivot_root_new =
1030 * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1031 * - pivot_tmp = /tmp/nspawn-pivot-123456
1032 * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1034 * Requires all file systems at directory and below to be mounted
1035 * MS_PRIVATE or MS_SLAVE so they can be moved.
1037 directory_pivot_root_new
= prefix_root(directory
, pivot_root_new
);
1039 /* Remount directory_pivot_root_new to make it movable. */
1040 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory_pivot_root_new
, NULL
, MS_BIND
, NULL
);
1044 if (pivot_root_old
) {
1045 if (!mkdtemp(pivot_tmp
)) {
1046 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
1050 remove_pivot_tmp
= true;
1051 pivot_tmp_pivot_root_old
= prefix_root(pivot_tmp
, pivot_root_old
);
1053 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, pivot_tmp
, NULL
, MS_MOVE
, NULL
);
1057 r
= mount_verbose(LOG_ERR
, directory
, pivot_tmp_pivot_root_old
, NULL
, MS_MOVE
, NULL
);
1061 r
= mount_verbose(LOG_ERR
, pivot_tmp
, directory
, NULL
, MS_MOVE
, NULL
);
1065 r
= mount_verbose(LOG_ERR
, directory_pivot_root_new
, directory
, NULL
, MS_MOVE
, NULL
);
1071 if (remove_pivot_tmp
)
1072 (void) rmdir(pivot_tmp
);