]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: add support for running mstack container images
authorLennart Poettering <lennart@amutable.com>
Wed, 12 Nov 2025 15:46:59 +0000 (16:46 +0100)
committerLennart Poettering <lennart@amutable.com>
Thu, 19 Feb 2026 14:05:15 +0000 (15:05 +0100)
src/nspawn/nspawn.c

index 296b12bb90b54c9a16cc53b9fc42465163533b78..623de6d9328f2d3ee8ee3bcd4a18a252ad5d0284 100644 (file)
@@ -69,6 +69,7 @@
 #include "mkdir.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
+#include "mstack.h"
 #include "namespace-util.h"
 #include "netlink-internal.h"
 #include "notify-recv.h"
@@ -204,6 +205,7 @@ struct ether_addr arg_network_provided_mac = {};
 static PagerFlags arg_pager_flags = 0;
 static unsigned long arg_personality = PERSONALITY_INVALID;
 static char *arg_image = NULL;
+static char *arg_mstack = NULL;
 static char *arg_oci_bundle = NULL;
 static VolatileMode arg_volatile_mode = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
@@ -272,6 +274,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_mstack, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
@@ -738,6 +741,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_BACKGROUND,
                 ARG_CLEANUP,
                 ARG_NO_ASK_PASSWORD,
+                ARG_MSTACK,
         };
 
         static const struct option options[] = {
@@ -817,6 +821,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "background",             required_argument, NULL, ARG_BACKGROUND             },
                 { "cleanup",                no_argument,       NULL, ARG_CLEANUP                },
                 { "no-ask-password",        no_argument,       NULL, ARG_NO_ASK_PASSWORD        },
+                { "mstack",                 required_argument, NULL, ARG_MSTACK                 },
                 {}
         };
 
@@ -863,6 +868,14 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_DIRECTORY;
                         break;
 
+                case ARG_MSTACK:
+                        r = parse_path_argument(optarg, false, &arg_mstack);
+                        if (r < 0)
+                                return r;
+
+                        arg_settings_mask |= SETTING_DIRECTORY;
+                        break;
+
                 case ARG_OCI_BUNDLE:
                         r = parse_path_argument(optarg, false, &arg_oci_bundle);
                         if (r < 0)
@@ -1654,11 +1667,11 @@ static int verify_arguments(void) {
         if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
                 arg_read_only = true;
 
-        if (arg_directory && arg_image)
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
+        if (!!arg_directory + !!arg_image + !!arg_mstack > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory=, --image= --mstack= may not be combined.");
 
-        if (arg_template && arg_image)
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
+        if (arg_template && (arg_image || arg_mstack))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image=/--mstack= may not be combined.");
 
         if (arg_template && !(arg_directory || arg_machine))
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
@@ -1666,6 +1679,9 @@ static int verify_arguments(void) {
         if (arg_ephemeral && arg_template)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
 
+        if (arg_ephemeral && arg_mstack)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --mstack= may not be combined.");
+
         /* Permit --ephemeral with --link-journal=try-* to satisfy principle of the least astonishment
          * (by common sense, "try" means "do not fail if not possible") */
         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO) && !arg_link_journal_try)
@@ -3054,6 +3070,24 @@ static int pick_paths(void) {
                 arg_architecture = result.architecture;
         }
 
+        if (arg_mstack) {
+                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+                PickFilter filter = *pick_filter_image_mstack;
+
+                filter.architecture = arg_architecture;
+
+                r = path_pick_update_warn(
+                                &arg_mstack,
+                                &filter,
+                                /* n_filters= */ 1,
+                                PICK_ARCHITECTURE|PICK_TRIES,
+                                &result);
+                if (r < 0)
+                        return r;
+
+                arg_architecture = result.architecture;
+        }
+
         if (arg_template) {
                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
                 PickFilter filter = *pick_filter_image_dir;
@@ -3088,7 +3122,7 @@ static int determine_names(void) {
                         return log_oom();
         }
 
-        if (!arg_image && !arg_directory) {
+        if (!arg_image && !arg_directory && !arg_mstack) {
                 if (arg_machine) {
                         _cleanup_(image_unrefp) Image *i = NULL;
 
@@ -3099,10 +3133,24 @@ static int determine_names(void) {
                         if (r < 0)
                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
 
-                        if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
+                        switch (i->type) {
+                        case IMAGE_RAW:
+                        case IMAGE_BLOCK:
                                 r = free_and_strdup(&arg_image, i->path);
-                        else
+                                break;
+
+                        case IMAGE_DIRECTORY:
+                        case IMAGE_SUBVOLUME:
                                 r = free_and_strdup(&arg_directory, i->path);
+                                break;
+
+                        case IMAGE_MSTACK:
+                                r = free_and_strdup(&arg_mstack, i->path);
+                                break;
+
+                        default:
+                                assert_not_reached();
+                        }
                         if (r < 0)
                                 return log_oom();
 
@@ -3114,31 +3162,40 @@ static int determine_names(void) {
                                 return log_error_errno(r, "Failed to determine current directory: %m");
                 }
 
-                if (!arg_directory && !arg_image)
-                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
+                if (!arg_directory && !arg_image && !arg_mstack)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use --directory=, --image= or --mstack=.");
         }
 
         if (!arg_machine) {
-                if (arg_directory && path_equal(arg_directory, "/")) {
-                        arg_machine = gethostname_malloc();
-                        if (!arg_machine)
-                                return log_oom();
+                if (arg_directory) {
+                        if (path_equal(arg_directory, "/")) {
+                                arg_machine = gethostname_malloc();
+                                if (!arg_machine)
+                                        return log_oom();
+                        } else {
+                                r = path_extract_filename(arg_directory, &arg_machine);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
+                        }
                 } else if (arg_image) {
-                        char *e;
-
                         r = path_extract_filename(arg_image, &arg_machine);
                         if (r < 0)
                                 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
 
                         /* Truncate suffix if there is one */
-                        e = endswith(arg_machine, ".raw");
+                        char *e = endswith(arg_machine, ".raw");
                         if (e)
                                 *e = 0;
-                } else {
-                        r = path_extract_filename(arg_directory, &arg_machine);
+                } else if (arg_mstack)  {
+                        r = path_extract_filename(arg_mstack, &arg_machine);
                         if (r < 0)
-                                return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
-                }
+                                return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_mstack);
+
+                        char *e = endswith(arg_machine, ".mstack");
+                        if (e)
+                                *e = 0;
+                } else
+                        assert_not_reached();
 
                 hostname_cleanup(arg_machine);
                 if (!hostname_is_valid(arg_machine, 0))
@@ -3873,6 +3930,7 @@ static int outer_child(
                 const char *directory,
                 int mount_fd,
                 DissectedImage *dissected_image,
+                MStack *mstack,
                 int fd_outer_socket,
                 int fd_inner_socket,
                 FDSet *fds,
@@ -3928,6 +3986,7 @@ static int outer_child(
         if (mount_fd >= 0) {
                 assert(arg_directory);
                 assert(!arg_image);
+                assert(!arg_mstack);
 
                 if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0)
                         return log_error_errno(errno, "Failed to attach root directory: %m");
@@ -3938,6 +3997,7 @@ static int outer_child(
         } else if (dissected_image) {
                 assert(!arg_directory);
                 assert(arg_image);
+                assert(!arg_mstack);
 
                 /* If we are operating on a disk image, then mount its root directory now, but leave out the
                  * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
@@ -3955,9 +4015,38 @@ static int outer_child(
                                 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
                 if (r < 0)
                         return r;
+
+        } else if (arg_mstack) {
+                assert(!arg_directory);
+                assert(!arg_image);
+                assert(arg_mstack);
+
+                MStackFlags mstack_flags = arg_read_only ? MSTACK_RDONLY : 0;
+
+                /* This creates the needed overlayfs or tmpfs, owned by our target userns. Note that we pass
+                 * the target mount dir as temporary mount dir here. We after all just need some dir here
+                 * that definitely exists, and the temporary mounts on it are not going to be visible
+                 * outside. */
+                r = mstack_make_mounts(
+                                mstack,
+                                /* temp_mount_dir= */ directory, /* !! */
+                                mstack_flags);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to make .mstack/ mounts: %m");
+
+                /* And then attaches all mounts to the directory */
+                r = mstack_bind_mounts(
+                                mstack,
+                                directory,
+                                /* where_fd= */ -EBADF,
+                                mstack_flags,
+                                /* ret_root_fd= */ NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed bind mount .mstack/ mounts: %m");
         } else {
                 assert(arg_directory);
                 assert(!arg_image);
+                assert(!arg_mstack);
 
                 r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL);
                 if (r < 0)
@@ -5023,6 +5112,10 @@ static int load_settings(void) {
                         r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
                         if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
                                 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
+                } else if (arg_mstack) {
+                        r = file_in_same_dir(arg_mstack, arg_settings_filename, &p);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to generate settings path from mstack path: %m");
                 }
 
                 if (p) {
@@ -5073,6 +5166,7 @@ static int run_container(
                 const char *directory,
                 int mount_fd,
                 DissectedImage *dissected_image,
+                MStack *mstack,
                 int userns_fd,
                 FDSet *fds,
                 char veth_name[IFNAMSIZ],
@@ -5223,6 +5317,7 @@ static int run_container(
                                 directory,
                                 mount_fd,
                                 dissected_image,
+                                mstack,
                                 fd_outer_socket_pair[1],
                                 fd_inner_socket_pair[1],
                                 fds,
@@ -5932,6 +6027,7 @@ static int run(int argc, char *argv[]) {
         _cleanup_(rm_rf_subvolume_and_freep) char *snapshot_dir = NULL;
         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+        _cleanup_(mstack_freep) MStack *mstack = NULL;
         _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL;
         _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
 
@@ -6195,7 +6291,7 @@ static int run(int argc, char *argv[]) {
                         }
                 }
 
-                if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                if (userns_fd >= 0) {
                         r = mountfsd_mount_directory(
                                         arg_directory,
                                         userns_fd,
@@ -6204,11 +6300,11 @@ static int run(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
                 }
-        } else {
+
+        } else if (arg_image) {
                 DissectImageFlags dissect_image_flags =
                         determine_dissect_image_flags();
 
-                assert(arg_image);
                 assert(!arg_template);
 
                 r = chase_and_update(&arg_image, 0);
@@ -6362,7 +6458,37 @@ static int run(int argc, char *argv[]) {
 
                 if (arg_architecture < 0)
                         arg_architecture = dissected_image_architecture(dissected_image);
-        }
+
+        } else if (arg_mstack) {
+                assert(!arg_template);
+                assert(!arg_ephemeral);
+
+                r = chase_and_update(&arg_mstack, CHASE_MUST_BE_DIRECTORY);
+                if (r < 0)
+                        goto finish;
+
+                if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED))
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--mstack= requires managed user namespacing, or user namespace turned off.");
+
+                MStackFlags mstack_flags = arg_read_only ? MSTACK_RDONLY : 0;
+                r = mstack_load(arg_mstack,
+                                /* dir_fd= */ -EBADF,
+                                &mstack);
+                if (r < 0)
+                        goto finish;
+
+                r = mstack_open_images(
+                                mstack,
+                                userns_fd,
+                                arg_image_policy,
+                                /* image_filter= */ NULL,
+                                mstack_flags);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to open .mstack/ layer images '%s': %m", arg_mstack);
+                        goto finish;
+                }
+        } else
+                assert_not_reached();
 
         /* Create a temporary place to mount stuff. */
         r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir);
@@ -6376,7 +6502,7 @@ static int run(int argc, char *argv[]) {
                 goto finish;
 
         if (!arg_quiet) {
-                const char *t = arg_image ?: arg_directory;
+                const char *t = arg_mstack ?: arg_image ?: arg_directory;
                 _cleanup_free_ char *u = NULL;
                 (void) terminal_urlify_path(t, t, &u);
 
@@ -6412,6 +6538,7 @@ static int run(int argc, char *argv[]) {
                                 rootdir,
                                 mount_fd,
                                 dissected_image,
+                                mstack,
                                 userns_fd,
                                 fds,
                                 veth_name, &veth_created,