From 1bc237dd7a1bb3762a2cfc60ce959d4dbf520b85 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 12 Nov 2025 16:46:59 +0100 Subject: [PATCH] nspawn: add support for running mstack container images --- src/nspawn/nspawn.c | 177 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 152 insertions(+), 25 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 296b12bb90b..623de6d9328 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -69,6 +69,7 @@ #include "mkdir.h" #include "mount-util.h" #include "mountpoint-util.h" +#include "mstack.h" #include "namespace-util.h" #include "netlink-internal.h" #include "notify-recv.h" @@ -204,6 +205,7 @@ struct ether_addr arg_network_provided_mac = {}; static PagerFlags arg_pager_flags = 0; static unsigned long arg_personality = PERSONALITY_INVALID; static char *arg_image = NULL; +static char *arg_mstack = NULL; static char *arg_oci_bundle = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; @@ -272,6 +274,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep); STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep); STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep); STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_mstack, freep); STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep); STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp); @@ -738,6 +741,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_BACKGROUND, ARG_CLEANUP, ARG_NO_ASK_PASSWORD, + ARG_MSTACK, }; static const struct option options[] = { @@ -817,6 +821,7 @@ static int parse_argv(int argc, char *argv[]) { { "background", required_argument, NULL, ARG_BACKGROUND }, { "cleanup", no_argument, NULL, ARG_CLEANUP }, { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "mstack", required_argument, NULL, ARG_MSTACK }, {} }; @@ -863,6 +868,14 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_DIRECTORY; break; + case ARG_MSTACK: + r = parse_path_argument(optarg, false, &arg_mstack); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + case ARG_OCI_BUNDLE: r = parse_path_argument(optarg, false, &arg_oci_bundle); if (r < 0) @@ -1654,11 +1667,11 @@ static int verify_arguments(void) { if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) arg_read_only = true; - if (arg_directory && arg_image) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined."); + if (!!arg_directory + !!arg_image + !!arg_mstack > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory=, --image= --mstack= may not be combined."); - if (arg_template && arg_image) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined."); + if (arg_template && (arg_image || arg_mstack)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image=/--mstack= may not be combined."); if (arg_template && !(arg_directory || arg_machine)) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=."); @@ -1666,6 +1679,9 @@ static int verify_arguments(void) { if (arg_ephemeral && arg_template) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined."); + if (arg_ephemeral && arg_mstack) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --mstack= may not be combined."); + /* Permit --ephemeral with --link-journal=try-* to satisfy principle of the least astonishment * (by common sense, "try" means "do not fail if not possible") */ if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO) && !arg_link_journal_try) @@ -3054,6 +3070,24 @@ static int pick_paths(void) { arg_architecture = result.architecture; } + if (arg_mstack) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + PickFilter filter = *pick_filter_image_mstack; + + filter.architecture = arg_architecture; + + r = path_pick_update_warn( + &arg_mstack, + &filter, + /* n_filters= */ 1, + PICK_ARCHITECTURE|PICK_TRIES, + &result); + if (r < 0) + return r; + + arg_architecture = result.architecture; + } + if (arg_template) { _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; PickFilter filter = *pick_filter_image_dir; @@ -3088,7 +3122,7 @@ static int determine_names(void) { return log_oom(); } - if (!arg_image && !arg_directory) { + if (!arg_image && !arg_directory && !arg_mstack) { if (arg_machine) { _cleanup_(image_unrefp) Image *i = NULL; @@ -3099,10 +3133,24 @@ static int determine_names(void) { if (r < 0) return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine); - if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK)) + switch (i->type) { + case IMAGE_RAW: + case IMAGE_BLOCK: r = free_and_strdup(&arg_image, i->path); - else + break; + + case IMAGE_DIRECTORY: + case IMAGE_SUBVOLUME: r = free_and_strdup(&arg_directory, i->path); + break; + + case IMAGE_MSTACK: + r = free_and_strdup(&arg_mstack, i->path); + break; + + default: + assert_not_reached(); + } if (r < 0) return log_oom(); @@ -3114,31 +3162,40 @@ static int determine_names(void) { return log_error_errno(r, "Failed to determine current directory: %m"); } - if (!arg_directory && !arg_image) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i."); + if (!arg_directory && !arg_image && !arg_mstack) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use --directory=, --image= or --mstack=."); } if (!arg_machine) { - if (arg_directory && path_equal(arg_directory, "/")) { - arg_machine = gethostname_malloc(); - if (!arg_machine) - return log_oom(); + if (arg_directory) { + if (path_equal(arg_directory, "/")) { + arg_machine = gethostname_malloc(); + if (!arg_machine) + return log_oom(); + } else { + r = path_extract_filename(arg_directory, &arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory); + } } else if (arg_image) { - char *e; - r = path_extract_filename(arg_image, &arg_machine); if (r < 0) return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image); /* Truncate suffix if there is one */ - e = endswith(arg_machine, ".raw"); + char *e = endswith(arg_machine, ".raw"); if (e) *e = 0; - } else { - r = path_extract_filename(arg_directory, &arg_machine); + } else if (arg_mstack) { + r = path_extract_filename(arg_mstack, &arg_machine); if (r < 0) - return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory); - } + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_mstack); + + char *e = endswith(arg_machine, ".mstack"); + if (e) + *e = 0; + } else + assert_not_reached(); hostname_cleanup(arg_machine); if (!hostname_is_valid(arg_machine, 0)) @@ -3873,6 +3930,7 @@ static int outer_child( const char *directory, int mount_fd, DissectedImage *dissected_image, + MStack *mstack, int fd_outer_socket, int fd_inner_socket, FDSet *fds, @@ -3928,6 +3986,7 @@ static int outer_child( if (mount_fd >= 0) { assert(arg_directory); assert(!arg_image); + assert(!arg_mstack); if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0) return log_error_errno(errno, "Failed to attach root directory: %m"); @@ -3938,6 +3997,7 @@ static int outer_child( } else if (dissected_image) { assert(!arg_directory); assert(arg_image); + assert(!arg_mstack); /* If we are operating on a disk image, then mount its root directory now, but leave out the * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest, @@ -3955,9 +4015,38 @@ static int outer_child( (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0)); if (r < 0) return r; + + } else if (arg_mstack) { + assert(!arg_directory); + assert(!arg_image); + assert(arg_mstack); + + MStackFlags mstack_flags = arg_read_only ? MSTACK_RDONLY : 0; + + /* This creates the needed overlayfs or tmpfs, owned by our target userns. Note that we pass + * the target mount dir as temporary mount dir here. We after all just need some dir here + * that definitely exists, and the temporary mounts on it are not going to be visible + * outside. */ + r = mstack_make_mounts( + mstack, + /* temp_mount_dir= */ directory, /* !! */ + mstack_flags); + if (r < 0) + return log_error_errno(r, "Failed to make .mstack/ mounts: %m"); + + /* And then attaches all mounts to the directory */ + r = mstack_bind_mounts( + mstack, + directory, + /* where_fd= */ -EBADF, + mstack_flags, + /* ret_root_fd= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed bind mount .mstack/ mounts: %m"); } else { assert(arg_directory); assert(!arg_image); + assert(!arg_mstack); r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL); if (r < 0) @@ -5023,6 +5112,10 @@ static int load_settings(void) { r = file_in_same_dir(arg_directory, arg_settings_filename, &p); if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */ return log_error_errno(r, "Failed to generate settings path from directory path: %m"); + } else if (arg_mstack) { + r = file_in_same_dir(arg_mstack, arg_settings_filename, &p); + if (r < 0) + return log_error_errno(r, "Failed to generate settings path from mstack path: %m"); } if (p) { @@ -5073,6 +5166,7 @@ static int run_container( const char *directory, int mount_fd, DissectedImage *dissected_image, + MStack *mstack, int userns_fd, FDSet *fds, char veth_name[IFNAMSIZ], @@ -5223,6 +5317,7 @@ static int run_container( directory, mount_fd, dissected_image, + mstack, fd_outer_socket_pair[1], fd_inner_socket_pair[1], fds, @@ -5932,6 +6027,7 @@ static int run(int argc, char *argv[]) { _cleanup_(rm_rf_subvolume_and_freep) char *snapshot_dir = NULL; _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_(mstack_freep) MStack *mstack = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL; _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; @@ -6195,7 +6291,7 @@ static int run(int argc, char *argv[]) { } } - if (arg_userns_mode == USER_NAMESPACE_MANAGED) { + if (userns_fd >= 0) { r = mountfsd_mount_directory( arg_directory, userns_fd, @@ -6204,11 +6300,11 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; } - } else { + + } else if (arg_image) { DissectImageFlags dissect_image_flags = determine_dissect_image_flags(); - assert(arg_image); assert(!arg_template); r = chase_and_update(&arg_image, 0); @@ -6362,7 +6458,37 @@ static int run(int argc, char *argv[]) { if (arg_architecture < 0) arg_architecture = dissected_image_architecture(dissected_image); - } + + } else if (arg_mstack) { + assert(!arg_template); + assert(!arg_ephemeral); + + r = chase_and_update(&arg_mstack, CHASE_MUST_BE_DIRECTORY); + if (r < 0) + goto finish; + + if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--mstack= requires managed user namespacing, or user namespace turned off."); + + MStackFlags mstack_flags = arg_read_only ? MSTACK_RDONLY : 0; + r = mstack_load(arg_mstack, + /* dir_fd= */ -EBADF, + &mstack); + if (r < 0) + goto finish; + + r = mstack_open_images( + mstack, + userns_fd, + arg_image_policy, + /* image_filter= */ NULL, + mstack_flags); + if (r < 0) { + log_error_errno(r, "Failed to open .mstack/ layer images '%s': %m", arg_mstack); + goto finish; + } + } else + assert_not_reached(); /* Create a temporary place to mount stuff. */ r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir); @@ -6376,7 +6502,7 @@ static int run(int argc, char *argv[]) { goto finish; if (!arg_quiet) { - const char *t = arg_image ?: arg_directory; + const char *t = arg_mstack ?: arg_image ?: arg_directory; _cleanup_free_ char *u = NULL; (void) terminal_urlify_path(t, t, &u); @@ -6412,6 +6538,7 @@ static int run(int argc, char *argv[]) { rootdir, mount_fd, dissected_image, + mstack, userns_fd, fds, veth_name, &veth_created, -- 2.47.3