]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
vmspawn: substantially beef up cgroup logic, to match more closely what nspawn does
authorLennart Poettering <lennart@poettering.net>
Fri, 13 Jun 2025 08:29:01 +0000 (10:29 +0200)
committerLennart Poettering <lennart@poettering.net>
Fri, 11 Jul 2025 16:17:04 +0000 (18:17 +0200)
This beefs up the cgroup logic, adding --slice=, --property= to vmspawn
the same way it already exists in nspawn.

There are a bunch of differences though: we don't delegate the cgroup
access in the allocated unit (since qemu wouldn't need that), and we do
registration via varlink not dbus. Hence, while this follows a similar
logic now, it differs in a lot of details.

This makes in particular one change: when invoked on the command line
we'll only add the qemu instance to the allocated scope, not the vmspawn
process itself (this follows more closely how nspawn does this where
only the container payload has its scope, not nspawn itself). This is
quite tricky to implement: unlike in nspawn we have auxiliary services
to start, with dependencies to the scope. This means we need to start the
scope early, so that we know the scope's name. But the command line to
invoke is only assembled from the data we learn about the auxiliary
services, hence much later. To address this we'll now fork off the child
that eventually will become qemu early, then move it to a scope, prepare the
cmdline and then very late send the cmdline (and the fds we want to
pass) to the prepared child, which then execs it.

man/systemd-vmspawn.xml
src/vmspawn/vmspawn-scope.c
src/vmspawn/vmspawn-scope.h
src/vmspawn/vmspawn.c

index cbf7d20e64fa2d49d895b5790cd04cdd21eb1c43..304bbb44a387be43fdda7a5d5715c5899fc4c10d 100644 (file)
       <title>Property Options</title>
 
       <variablelist>
+        <varlistentry>
+          <term><option>-S</option></term>
+          <term><option>--slice=</option></term>
+
+          <listitem><para>Make the VM part of the specified slice, instead of the default
+          <filename>machine.slice</filename>. This applies only if the machine is run in its own scope unit,
+          i.e. if <option>--keep-unit</option> is not used.</para>
+
+          <xi:include href="version-info.xml" xpointer="v258"/>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term><option>--property=</option></term>
+
+          <listitem><para>Set a unit property on the scope unit to register for the machine. This applies
+          only if the machine is run in its own scope unit, i.e. if <option>--keep-unit</option> is not
+          used. Takes unit property assignments in the same format as <command>systemctl
+          set-property</command>. This is useful to set memory limits and similar for the VM.</para>
+
+          <xi:include href="version-info.xml" xpointer="v258"/>
+          </listitem>
+        </varlistentry>
+
         <varlistentry>
           <term><option>--register=</option></term>
 
         <xi:include href="standard-options.xml" xpointer="no-pager" />
         <xi:include href="standard-options.xml" xpointer="help" />
         <xi:include href="standard-options.xml" xpointer="version" />
+        <xi:include href="standard-options.xml" xpointer="no-ask-password" />
       </variablelist>
     </refsect2>
   </refsect1>
index 624bd7730d5b57f7444edb3903d7ce55e4da36fa..2aa2aaa23283cd59bb3275ebf61de4d7dc8760fa 100644 (file)
 #include "pidref.h"
 #include "random-util.h"
 #include "socket-util.h"
+#include "special.h"
 #include "string-util.h"
 #include "strv.h"
+#include "unit-def.h"
+#include "unit-name.h"
 #include "vmspawn-scope.h"
 
-int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope) {
+/* Appends a "Controller" property assignment, as a "(sv)" struct, to the message under construction, using
+ * the unique name of our own connection on the specified bus as value. Returns 0 on success, or a negative
+ * errno-style error (already logged) on failure. */
+static int append_controller_property(sd_bus *bus, sd_bus_message *m) {
+        const char *own_name;
+        int r;
+
+        assert(bus);
+        assert(m);
+
+        /* The controller is identified by the unique name of our bus connection */
+        r = sd_bus_get_unique_name(bus, &own_name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get unique name: %m");
+
+        r = sd_bus_message_append(m, "(sv)", "Controller", "s", own_name);
+        return r < 0 ? bus_log_create_error(r) : 0;
+}
+
+int allocate_scope(
+                sd_bus *bus,
+                const char *machine_name,
+                const PidRef *pid,
+                const char *slice,
+                char **properties,
+                bool allow_pidfd,
+                char **ret_scope) {
+
         _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL;
         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
         _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL;
@@ -33,8 +62,9 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
         if (r < 0)
                 return log_error_errno(r, "Could not watch job: %m");
 
-        if (asprintf(&scope, "machine-%"PRIu64"-%s.scope", random_u64(), machine_name) < 0)
-                return log_oom();
+        r = unit_name_mangle_with_suffix(machine_name, "as machine name", /* flags= */ 0, ".scope", &scope);
+        if (r < 0)
+                return log_error_errno(r, "Failed to mangle scope name: %m");
 
         description = strjoin("Virtual Machine ", machine_name);
         if (!description)
@@ -53,21 +83,25 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
         if (r < 0)
                 return bus_log_create_error(r);
 
-        r = sd_bus_message_append(m, "(sv)(sv)(sv)",
-                                  "Description", "s",  description,
-                                  "AddRef",      "b",  1,
-                                  "CollectMode", "s",  "inactive-or-failed");
+        r = bus_append_scope_pidref(m, pid, allow_pidfd);
         if (r < 0)
                 return bus_log_create_error(r);
 
-        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
-        r = pidref_set_self(&pidref);
+        r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)",
+                                  "Description", "s", description,
+                                  "CollectMode", "s", "inactive-or-failed",
+                                  "AddRef",      "b", 1,
+                                  "Slice",       "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
         if (r < 0)
-                return log_error_errno(r, "Failed to allocate PID reference: %m");
+                return bus_log_create_error(r);
 
-        r = bus_append_scope_pidref(m, &pidref, allow_pidfd);
+        r = append_controller_property(bus, m);
         if (r < 0)
-                return bus_log_create_error(r);
+                return r;
+
+        r = bus_append_unit_property_assignment_many(m, UNIT_SCOPE, properties);
+        if (r < 0)
+                return r;
 
         r = sd_bus_message_close_container(m);
         if (r < 0)
@@ -87,7 +121,14 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
                  * doesn't support PIDFDs yet, let's try without. */
                 if (allow_pidfd &&
                     sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY))
-                        return start_transient_scope(bus, machine_name, false, ret_scope);
+                        return allocate_scope(
+                                        bus,
+                                        machine_name,
+                                        pid,
+                                        slice,
+                                        properties,
+                                        /* allow_pidfd= */ false,
+                                        ret_scope);
 
                 return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r));
         }
@@ -96,7 +137,11 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
         if (r < 0)
                 return bus_log_parse_error(r);
 
-        r = bus_wait_for_jobs_one(w, object, /* quiet */ false, NULL);
+        r = bus_wait_for_jobs_one(
+                        w,
+                        object,
+                        BUS_WAIT_JOBS_LOG_ERROR,
+                        /* extra_args= */ NULL);
         if (r < 0)
                 return r;
 
@@ -106,6 +151,46 @@ int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidf
         return 0;
 }
 
+/* Tears down the scope unit whose name is derived from the machine name. In order: abandons the scope,
+ * sends SIGKILL to all of its processes, and drops our reference to the unit. Failures of the individual
+ * bus calls are logged at debug level and otherwise ignored; only a failure to derive the unit name is
+ * propagated to the caller. */
+int terminate_scope(
+                sd_bus *bus,
+                const char *machine_name) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *unit_name = NULL;
+        int r;
+
+        r = unit_name_mangle_with_suffix(machine_name, "to terminate", /* flags= */ 0, ".scope", &unit_name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to mangle scope name: %m");
+
+        /* Step 1: abandon the scope */
+        r = bus_call_method(
+                        bus,
+                        bus_systemd_mgr,
+                        "AbandonScope",
+                        &bus_error,
+                        /* ret_reply= */ NULL,
+                        "s",
+                        unit_name);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to abandon scope '%s', ignoring: %s", unit_name, bus_error_message(&bus_error, r));
+                /* Reset the error object, since it is reused for the next call */
+                sd_bus_error_free(&bus_error);
+        }
+
+        /* Step 2: kill everything that is in the scope */
+        r = bus_call_method(
+                        bus,
+                        bus_systemd_mgr,
+                        "KillUnit",
+                        &bus_error,
+                        NULL,
+                        "ssi",
+                        unit_name,
+                        "all",
+                        (int32_t) SIGKILL);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to SIGKILL scope '%s', ignoring: %s", unit_name, bus_error_message(&bus_error, r));
+                sd_bus_error_free(&bus_error);
+        }
+
+        /* Step 3: release our reference to the unit */
+        r = bus_call_method(
+                        bus,
+                        bus_systemd_mgr,
+                        "UnrefUnit",
+                        &bus_error,
+                        /* ret_reply= */ NULL,
+                        "s",
+                        unit_name);
+        if (r < 0)
+                log_debug_errno(r, "Failed to drop reference to scope '%s', ignoring: %s", unit_name, bus_error_message(&bus_error, r));
+
+        return 0;
+}
+
 static int message_add_commands(sd_bus_message *m, const char *exec_type, char ***commands, size_t n_commands) {
         int r;
 
index c29325a31459079a6ba477882ca6229899a230fc..e456498f2e5bdcc3fc4db2d5b418bead57e71693 100644 (file)
@@ -14,5 +14,8 @@ typedef struct SocketServicePair {
 
 void socket_service_pair_done(SocketServicePair *p);
 
-int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope);
+int allocate_scope(sd_bus *bus, const char *machine_name, const PidRef *pid, const char *slice, char **properties, bool allow_pidfd, char **ret_scope);
+
+int terminate_scope(sd_bus *bus, const char *machine_name);
+
 int start_socket_service_pair(sd_bus *bus, const char *scope, SocketServicePair *p);
index 94453356b6616647509885eeeff0d714946c722c..ce623cfd99bd1da92df76140a1be0c70c2d95bc3 100644 (file)
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/poll.h>
 #include <sys/stat.h>
 #include <unistd.h>
 
@@ -38,6 +39,8 @@
 #include "hostname-setup.h"
 #include "hostname-util.h"
 #include "id128-util.h"
+#include "io-util.h"
+#include "iovec-util.h"
 #include "log.h"
 #include "machine-credential.h"
 #include "main-func.h"
@@ -45,6 +48,7 @@
 #include "namespace-util.h"
 #include "netif-util.h"
 #include "nsresource.h"
+#include "nulstr-util.h"
 #include "osc-context.h"
 #include "pager.h"
 #include "parse-argument.h"
@@ -100,6 +104,8 @@ static PagerFlags arg_pager_flags = 0;
 static char *arg_directory = NULL;
 static char *arg_image = NULL;
 static char *arg_machine = NULL;
+static char *arg_slice = NULL;
+static char **arg_property = NULL;
 static char *arg_cpus = NULL;
 static uint64_t arg_ram = UINT64_C(2) * U64_GB;
 static int arg_kvm = -1;
@@ -118,7 +124,7 @@ static SettingsMask arg_settings_mask = 0;
 static char *arg_firmware = NULL;
 static char *arg_forward_journal = NULL;
 static bool arg_privileged = false;
-static bool arg_register = false;
+static bool arg_register = true;
 static bool arg_keep_unit = false;
 static sd_id128_t arg_uuid = {};
 static char **arg_kernel_cmdline_extra = NULL;
@@ -132,10 +138,12 @@ static char **arg_smbios11 = NULL;
 static uint64_t arg_grow_image = 0;
 static char *arg_tpm_state_path = NULL;
 static TpmStateMode arg_tpm_state_mode = TPM_STATE_AUTO;
+static bool arg_ask_password = true;
 
 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_cpus, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
 STATIC_DESTRUCTOR_REGISTER(arg_firmware, freep);
@@ -149,6 +157,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_background, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_ssh_key_type, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_smbios11, strv_freep);
 STATIC_DESTRUCTOR_REGISTER(arg_tpm_state_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
 
 static int help(void) {
         _cleanup_free_ char *link = NULL;
@@ -166,6 +175,7 @@ static int help(void) {
                "     --version             Print version string\n"
                "  -q --quiet               Do not show status information\n"
                "     --no-pager            Do not pipe output into a pager\n"
+               "     --no-ask-password     Do not prompt for password\n"
                "\n%3$sImage:%4$s\n"
                "  -D --directory=PATH      Root directory for the VM\n"
                "  -i --image=FILE|DEVICE   Root file system disk image or device for the VM\n"
@@ -191,8 +201,11 @@ static int help(void) {
                "  -M --machine=NAME        Set the machine name for the VM\n"
                "     --uuid=UUID           Set a specific machine UUID for the VM\n"
                "\n%3$sProperties:%4$s\n"
-               "     --register=BOOLEAN    Register VM with systemd-machined\n"
-               "     --keep-unit           Don't let systemd-machined allocate scope unit for us\n"
+               "  -S --slice=SLICE         Place the VM in the specified slice\n"
+               "     --property=NAME=VALUE Set scope unit property\n"
+               "     --register=BOOLEAN    Register VM as machine\n"
+               "     --keep-unit           Do not register a scope for the machine, reuse\n"
+               "                           the service unit vmspawn is running in\n"
                "\n%3$sUser Namespacing:%4$s\n"
                "     --private-users=UIDBASE[:NUIDS]\n"
                "                           Configure the UID/GID range to map into the\n"
@@ -274,6 +287,8 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_CONSOLE,
                 ARG_BACKGROUND,
                 ARG_TPM_STATE,
+                ARG_NO_ASK_PASSWORD,
+                ARG_PROPERTY,
         };
 
         static const struct option options[] = {
@@ -284,6 +299,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "image",             required_argument, NULL, 'i'                   },
                 { "directory",         required_argument, NULL, 'D'                   },
                 { "machine",           required_argument, NULL, 'M'                   },
+                { "slice",             required_argument, NULL, 'S'                   },
                 { "cpus",              required_argument, NULL, ARG_CPUS              },
                 { "qemu-smp",          required_argument, NULL, ARG_CPUS              }, /* Compat alias */
                 { "ram",               required_argument, NULL, ARG_RAM               },
@@ -319,6 +335,8 @@ static int parse_argv(int argc, char *argv[]) {
                 { "smbios11",          required_argument, NULL, 's'                   },
                 { "grow-image",        required_argument, NULL, 'G'                   },
                 { "tpm-state",         required_argument, NULL, ARG_TPM_STATE         },
+                { "no-ask-password",   no_argument,       NULL, ARG_NO_ASK_PASSWORD   },
+                { "property",          required_argument, NULL, ARG_PROPERTY          },
                 {}
         };
 
@@ -328,7 +346,7 @@ static int parse_argv(int argc, char *argv[]) {
         assert(argv);
 
         optind = 0;
-        while ((c = getopt_long(argc, argv, "+hD:i:M:nqs:G:", options, NULL)) >= 0)
+        while ((c = getopt_long(argc, argv, "+hD:i:M:nqs:G:S:", options, NULL)) >= 0)
                 switch (c) {
                 case 'h':
                         return help();
@@ -634,6 +652,27 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_tpm_state_path = mfree(arg_tpm_state_path);
                         break;
 
+                case ARG_NO_ASK_PASSWORD:
+                        arg_ask_password = false;
+                        break;
+
+                case 'S': {
+                        _cleanup_free_ char *mangled = NULL;
+
+                        r = unit_name_mangle_with_suffix(optarg, /* operation= */ NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to turn '%s' into unit name: %m", optarg);
+
+                        free_and_replace(arg_slice, mangled);
+                        break;
+                }
+
+                case ARG_PROPERTY:
+                        if (strv_extend(&arg_property, optarg) < 0)
+                                return log_oom();
+
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -1535,30 +1574,261 @@ static int grow_image(const char *path, uint64_t size) {
         return 1;
 }
 
+/* Bus signal callback: logs that termination of the VM was requested, then asks the event loop attached to
+ * the bus the message was received on to exit with exit code 0. Always returns 0. */
+static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        sd_bus *bus;
+        sd_event *event;
+
+        assert(m);
+
+        log_info("VM termination requested. Exiting.");
+
+        bus = sd_bus_message_get_bus(m);
+        event = sd_bus_get_event(bus);
+        sd_event_exit(event, 0);
+
+        return 0;
+}
+
+/* Receives the command line and the file descriptors to pass over the datagram socket, rearranges the
+ * received fds to the fd numbers requested by the sender, and then executes the command line.
+ *
+ * Protocol:
+ *   1. The first datagram contains the command line, as a NUL-separated string list.
+ *   2. The second datagram contains an integer array with the intended fd numbers, accompanied by an
+ *      SCM_RIGHTS fd list with the same number of entries, in the same order.
+ *
+ * Always takes possession of the passed fd, even on error. Only returns on failure, in which case a
+ * negative errno-style error is returned that has already been logged. */
+static int datagram_read_cmdline_and_exec(int _fd /* always taking possession, even on error */) {
+        _cleanup_close_ int fd = TAKE_FD(_fd);
+        int r;
+
+        assert(fd >= 0);
+
+        /* The first datagram contains the cmdline */
+        r = fd_wait_for_event(fd, POLLIN, USEC_INFINITY);
+        if (r < 0)
+                return log_error_errno(r, "Failed to wait for command line: %m");
+
+        ssize_t n = next_datagram_size_fd(fd);
+        if (n < 0)
+                return log_error_errno(n, "Failed to determine datagram size: %m");
+        n += 1; /* extra byte to validate that the size we determined here was correct */
+
+        _cleanup_free_ char *p = malloc(n);
+        if (!p)
+                return log_oom();
+
+        ssize_t m = recv(fd, p, n, /* flags= */ 0);
+        if (m < 0)
+                return log_error_errno(errno, "Failed to read datagram: %m");
+        if (m >= n) /* The extra byte got filled in? Then the datagram was larger than determined above. */
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected message size.");
+
+        _cleanup_strv_free_ char **a = strv_parse_nulstr(p, m);
+        if (!a)
+                return log_oom();
+        if (strv_isempty(a))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid command line.");
+
+        /* The second datagram contains an integer array of the intended fd numbers, and an SCM_RIGHTS fd
+         * list along with it, matching that. */
+        r = fd_wait_for_event(fd, POLLIN, USEC_INFINITY);
+        if (r < 0)
+                return log_error_errno(r, "Failed to wait for file descriptor list: %m");
+
+        n = next_datagram_size_fd(fd);
+        if (n < 0)
+                return log_error_errno(n, "Failed to determine datagram size: %m");
+        n += 1; /* extra byte to validate that the size we determined here was correct */
+
+        _cleanup_free_ int *f = malloc(n);
+        if (!f) /* Check the buffer we just allocated, not the command line buffer from above */
+                return log_oom();
+
+        struct iovec iov = {
+                .iov_base = f,
+                .iov_len = n,
+        };
+
+        int *fds = NULL;
+        size_t n_fds = 0;
+        CLEANUP_ARRAY(fds, n_fds, close_many_and_free);
+
+        m = receive_many_fds_iov(
+                        fd,
+                        &iov, /* iovlen= */ 1,
+                        &fds,
+                        &n_fds,
+                        /* flags= */ MSG_TRUNC);
+        if (m < 0)
+                return log_error_errno(m, "Failed to read datagram: %m");
+        /* The payload must consist of exactly one integer per received fd */
+        if (m >= n || (size_t) m != n_fds * sizeof(int))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected message size.");
+
+        fd = safe_close(fd);
+
+        /* At this point the fds[] contains the file descriptors we got, and f[] contains the numbers we want
+         * for them. Let's rearrange things. */
+
+        /* 1. Determine largest number we want */
+        int max_fd = 2;
+        for (size_t k = 0; k < n_fds; k++)
+                max_fd = MAX(max_fd, f[k]);
+
+        /* 2. Move all fds we got above that, so that no received fd occupies a number that is the target
+         *    of another one when we move things into place below. */
+        for (size_t k = 0; k < n_fds; k++) {
+                if (fds[k] > max_fd)
+                        continue;
+
+                _cleanup_close_ int copy = fcntl(fds[k], F_DUPFD_CLOEXEC, max_fd+1);
+                if (copy < 0)
+                        return log_error_errno(errno, "Failed to duplicate file descriptor: %m");
+
+                safe_close(fds[k]);
+                fds[k] = TAKE_FD(copy);
+
+                assert(fds[k] > max_fd);
+        }
+
+        log_close();
+
+        r = close_all_fds(fds, n_fds);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close remaining file descriptors: %m");
+
+        /* 3. Move into place (this also disables O_CLOEXEC) */
+        for (size_t k = 0; k < n_fds; k++) {
+                if (dup2(fds[k], f[k]) < 0)
+                        return log_error_errno(errno, "Failed to move file descriptor: %m");
+
+                safe_close(fds[k]);
+                fds[k] = f[k];
+        }
+
+        execv(a[0], a);
+        return log_error_errno(errno, "Failed to execve %s: %m", a[0]);
+}
+
+/* Body of the forked-off child process: provides default values for $TERM and $LANG, then waits for the
+ * command line on the specified socket and executes it. Never returns; if anything fails the process exits
+ * with EXIT_FAILURE. */
+_noreturn_ static void child(int cmdline_fd) {
+        assert(cmdline_fd >= 0);
+
+        /* Set TERM and LANG only if they are missing (overwrite=0). Stop at the first failure. */
+        if (setenv("TERM", "vt220", 0) < 0 ||
+            setenv("LANG", "C.UTF-8", 0) < 0) {
+                log_oom();
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Now wait for the command line from the parent, and then execute it. This only comes back on
+         * failure, which was already logged by the callee. */
+        (void) datagram_read_cmdline_and_exec(TAKE_FD(cmdline_fd));
+
+        _exit(EXIT_FAILURE);
+}
+
 static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
-        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
-        _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL, *kernel = NULL;
+        _cleanup_free_ char *qemu_binary = NULL, *mem = NULL, *kernel = NULL;
         _cleanup_(rm_rf_physical_and_freep) char *ssh_private_key_path = NULL, *ssh_public_key_path = NULL;
         _cleanup_close_ int notify_sock_fd = -EBADF;
         _cleanup_strv_free_ char **cmdline = NULL;
         _cleanup_free_ int *pass_fds = NULL;
         size_t n_pass_fds = 0;
-        const char *accel, *shm;
+        const char *accel;
         int r;
 
         polkit_agent_open();
 
+        /* Registration always happens on the system bus */
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL;
+        if (arg_register || arg_privileged) {
+                r = sd_bus_default_system(&system_bus);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open system bus: %m");
+
+                r = sd_bus_set_close_on_exit(system_bus, false);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
+
+                (void) sd_bus_set_allow_interactive_authorization(system_bus, arg_ask_password);
+        }
+
+        /* Scope allocation happens on the user bus if we are unpriv, otherwise system bus. */
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *user_bus = NULL;
+        _cleanup_(sd_bus_unrefp) sd_bus *runtime_bus = NULL;
         if (arg_privileged)
-                r = sd_bus_default_system(&bus);
-        else
-                r = sd_bus_default_user(&bus);
-        if (r < 0)
-                return log_error_errno(r, "Failed to connect to systemd bus: %m");
+                runtime_bus = sd_bus_ref(system_bus);
+        else {
+                r = sd_bus_default_user(&user_bus);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open system bus: %m");
+
+                r = sd_bus_set_close_on_exit(user_bus, false);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
+
+                runtime_bus = sd_bus_ref(user_bus);
+        }
 
-        r = start_transient_scope(bus, arg_machine, /* allow_pidfd= */ true, &trans_scope);
+        assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0);
+
+        _cleanup_close_pair_ int cmdline_socket[2] = EBADF_PAIR;
+        if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, cmdline_socket) < 0)
+                return log_error_errno(errno, "Failed to allocate command line socket pair: %m");
+
+        /* Fork off child early on, as we need to assign it to a scope unit, which we can generate
+         * dependencies towards for swtpm, virtiofsd and so on. It's just going to hang until we fully
+         * prepared a command line */
+        _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL;
+        r = pidref_safe_fork_full(
+                        "(qemu)",
+                        /* stdio_fds= */ NULL,
+                        cmdline_socket + 0, 1,
+                        FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE,
+                        &child_pidref);
         if (r < 0)
                 return r;
+        if (r == 0) {
+                cmdline_socket[1] = -EBADF; /* closed due to FORK_CLOEXEC_ALL_FDS */
+
+                child(cmdline_socket[0]);
+                assert_not_reached();
+        }
+
+        cmdline_socket[0] = safe_close(cmdline_socket[0]);
+
+        if (!arg_keep_unit) {
+                /* When a new scope is created for this container, then we'll be registered as its controller, in which
+                 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
+                 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
+
+                r = sd_bus_match_signal_async(
+                                runtime_bus,
+                                /* ret= */ NULL,
+                                "org.freedesktop.systemd1",
+                                /* path= */ NULL,
+                                "org.freedesktop.systemd1.Scope",
+                                "RequestStop",
+                                on_request_stop,
+                                /* install_callback= */ NULL,
+                                /* userdata= */ NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to request RequestStop match: %m");
+        }
+
+        _cleanup_free_ char *unit = NULL;
+        bool scope_allocated = false;
+        if (!arg_keep_unit && (!arg_register || !arg_privileged)) {
+                r = allocate_scope(
+                                runtime_bus,
+                                arg_machine,
+                                &child_pidref,
+                                arg_slice,
+                                arg_property,
+                                /* allow_pidfd= */ true,
+                                &unit);
+                if (r < 0)
+                        return r;
+
+                scope_allocated = true;
+        } else {
+                if (arg_privileged)
+                        r = cg_pid_get_unit(0, &unit);
+                else
+                        r = cg_pid_get_user_unit(0, &unit);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get our own unit: %m");
+        }
 
         bool use_kvm = arg_kvm > 0;
         if (arg_kvm < 0) {
@@ -1580,7 +1850,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
                             "falling back to OVMF firmware blobs without Secure Boot support.");
 
-        shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : "";
+        _cleanup_free_ char *machine = NULL;
+        const char *shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : "";
         const char *hpet = ARCHITECTURE_SUPPORTS_HPET ? ",hpet=off" : "";
         if (ARCHITECTURE_SUPPORTS_SMM)
                 machine = strjoin("type=" QEMU_MACHINE_TYPE ",smm=", on_off(ovmf_config->supports_sb), shm, hpet);
@@ -1993,7 +2264,13 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
 
         if (arg_directory) {
                 _cleanup_free_ char *listen_address = NULL;
-                r = start_virtiofsd(bus, trans_scope, arg_directory, /* uidmap= */ true, runtime_dir, &listen_address);
+                r = start_virtiofsd(
+                                runtime_bus,
+                                unit,
+                                arg_directory,
+                                /* uidmap= */ true,
+                                runtime_dir,
+                                &listen_address);
                 if (r < 0)
                         return r;
 
@@ -2060,7 +2337,13 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
 
         FOREACH_ARRAY(mount, arg_runtime_mounts.mounts, arg_runtime_mounts.n_mounts) {
                 _cleanup_free_ char *listen_address = NULL;
-                r = start_virtiofsd(bus, trans_scope, mount->source, /* uidmap= */ false, runtime_dir, &listen_address);
+                r = start_virtiofsd(
+                                runtime_bus,
+                                unit,
+                                mount->source,
+                                /* uidmap= */ false,
+                                runtime_dir,
+                                &listen_address);
                 if (r < 0)
                         return r;
 
@@ -2151,7 +2434,11 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
 
         _cleanup_free_ char *tpm_socket_address = NULL;
         if (swtpm) {
-                r = start_tpm(bus, trans_scope, swtpm, runtime_dir, &tpm_socket_address);
+                r = start_tpm(runtime_bus,
+                              unit,
+                              swtpm,
+                              runtime_dir,
+                              &tpm_socket_address);
                 if (r < 0) {
                         /* only bail if the user asked for a tpm */
                         if (arg_tpm > 0)
@@ -2217,7 +2504,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to find systemd-journal-remote binary: %m");
 
-                r = start_systemd_journal_remote(bus, trans_scope, child_cid, sd_journal_remote, &listen_address);
+                r = start_systemd_journal_remote(
+                                runtime_bus,
+                                unit,
+                                child_cid,
+                                sd_journal_remote,
+                                &listen_address);
                 if (r < 0)
                         return r;
 
@@ -2234,7 +2526,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 _cleanup_free_ char *scope_prefix = NULL, *privkey_path = NULL, *pubkey_path = NULL;
                 const char *key_type = arg_ssh_key_type ?: "ed25519";
 
-                r = unit_name_to_prefix(trans_scope, &scope_prefix);
+                r = unit_name_to_prefix(unit, &scope_prefix);
                 if (r < 0)
                         return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
 
@@ -2265,7 +2557,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to load credential %s: %m", cred_path);
 
-                r = unit_name_to_prefix(trans_scope, &scope_prefix);
+                r = unit_name_to_prefix(unit, &scope_prefix);
                 if (r < 0)
                         return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
 
@@ -2329,45 +2621,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 log_debug("Executing: %s", joined);
         }
 
-        assert_se(sigprocmask_many(SIG_BLOCK, /* ret_old_mask=*/ NULL, SIGCHLD) >= 0);
-
-        _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL;
-
-        r = pidref_safe_fork_full(
-                        qemu_binary,
-                        /* stdio_fds= */ NULL,
-                        pass_fds, n_pass_fds,
-                        FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE,
-                        &child_pidref);
-        if (r < 0)
-                return r;
-        if (r == 0) {
-                /* set TERM and LANG if they are missing */
-                if (setenv("TERM", "vt220", 0) < 0) {
-                        log_oom();
-                        goto fail;
-                }
-
-                if (setenv("LANG", "C.UTF-8", 0) < 0) {
-                        log_oom();
-                        goto fail;
-                }
-
-                execv(qemu_binary, cmdline);
-                log_error_errno(errno, "Failed to execve %s: %m", qemu_binary);
-        fail:
-                _exit(EXIT_FAILURE);
-        }
-
-        /* Close relevant fds we passed to qemu in the parent. We don't need them anymore. */
-        child_vsock_fd = safe_close(child_vsock_fd);
-        tap_fd = safe_close(tap_fd);
-
+        bool registered = false;
         if (arg_register) {
                 char vm_address[STRLEN("vsock/") + DECIMAL_STR_MAX(unsigned)];
                 xsprintf(vm_address, "vsock/%u", child_cid);
                 r = register_machine(
-                                bus,
+                                system_bus,
                                 arg_machine,
                                 arg_uuid,
                                 "systemd-vmspawn",
@@ -2376,11 +2635,40 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                                 child_cid,
                                 child_cid != VMADDR_CID_ANY ? vm_address : NULL,
                                 ssh_private_key_path,
-                                arg_keep_unit);
+                                arg_keep_unit || !arg_privileged);
                 if (r < 0)
                         return r;
+
+                registered = true;
         }
 
+        _cleanup_free_ char *nulstr = NULL;
+        size_t nulstr_size = 0;
+        if (strv_make_nulstr(cmdline, &nulstr, &nulstr_size) < 0)
+                return log_oom();
+
+        /* First datagram: the command line to execute */
+        ssize_t n = send(cmdline_socket[1], nulstr, nulstr_size, /* flags= */ 0);
+        if (n < 0)
+                return log_error_errno(errno, "Failed to send command line: %m");
+
+        /* Second datagram: the file descriptor array and the fds inside it */
+        n = send_many_fds_iov(
+                        cmdline_socket[1],
+                        pass_fds, n_pass_fds, /* both as payload … */
+                        &IOVEC_MAKE(pass_fds, n_pass_fds * sizeof(int)), /* … and as auxiliary fds */
+                        /* iovlen= */ 1,
+                        /* flags= */ 0);
+        if (n < 0)
+                return log_error_errno(n, "Failed to send file descriptors to child: %m");
+
+        /* The command line has been submitted, hence qemu is running from here on */
+        cmdline_socket[1] = safe_close(cmdline_socket[1]);
+
+        /* Close relevant fds we passed to qemu in the parent. We don't need them anymore. */
+        child_vsock_fd = safe_close(child_vsock_fd);
+        tap_fd = safe_close(tap_fd);
+
         /* All operations that might need Polkit authorizations (i.e. machine registration, netif
          * acquisition, …) are complete now, get rid of the agent again, so that we retain exclusive control
          * of the TTY from now on. */
@@ -2394,6 +2682,18 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
 
         (void) sd_event_set_watchdog(event, true);
 
+        if (system_bus) {
+                r = sd_bus_attach_event(system_bus, event, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to attach system bus to event loop: %m");
+        }
+
+        if (user_bus) {
+                r = sd_bus_attach_event(user_bus, event, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to attach user bus to event loop: %m");
+        }
+
         int exit_status = INT_MAX;
         if (use_vsock) {
                 r = setup_notify_parent(event, notify_sock_fd, &exit_status, &notify_event_source);
@@ -2459,8 +2759,12 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         if (r < 0)
                 return log_error_errno(r, "Failed to run event loop: %m");
 
-        if (arg_register)
-                (void) unregister_machine(bus, arg_machine);
+        /* Terminate the scope, in case it is not dead yet */
+        if (scope_allocated)
+                terminate_scope(runtime_bus, arg_machine);
+
+        if (registered)
+                (void) unregister_machine(system_bus, arg_machine);
 
         if (use_vsock) {
                 if (exit_status == INT_MAX) {
@@ -2538,11 +2842,6 @@ static int verify_arguments(void) {
         if (!strv_isempty(arg_initrds) && !arg_linux)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --initrd= cannot be used without --linux=.");
 
-        if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
-                /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
-                 * The latter is not technically a user session, but we don't need to labour the point. */
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
-
         return 0;
 }
 
@@ -2554,9 +2853,6 @@ static int run(int argc, char *argv[]) {
 
         arg_privileged = getuid() == 0;
 
-        /* don't attempt to register as a machine when running as a user */
-        arg_register = arg_privileged;
-
         r = parse_environment();
         if (r < 0)
                 return r;