]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
vmspawn: accept kvm/vhost-vsock device fds through sd_listen 29960/head
authorSam Leonard <sam.leonard@codethink.co.uk>
Thu, 18 Jan 2024 12:32:10 +0000 (12:32 +0000)
committerSam Leonard <sam.leonard@codethink.co.uk>
Fri, 9 Feb 2024 12:46:44 +0000 (12:46 +0000)
man/systemd-vmspawn.xml
src/vmspawn/vmspawn-util.c
src/vmspawn/vmspawn-util.h
src/vmspawn/vmspawn.c

index 38f5e3c93e73e5f81247f63eeaf0371e239a3757..b84cc9c062921af2154b0f075f600bf08badd7eb 100644 (file)
     project='man-pages'><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>, but it
     launches a full virtual machine instead of using namespaces.</para>
 
+    <para>File descriptors for <filename>/dev/kvm</filename> and <filename>/dev/vhost-vsock</filename> can be
+    passed to <command>systemd-vmspawn</command> via systemd's native socket passing interface (see
+    <citerefentry><refentrytitle>sd_listen_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry> for
+    details about the precise protocol used and the order in which the file descriptors are passed). These
+    file descriptors must be passed with the names <literal>kvm</literal> and <literal>vhost-vsock</literal>, respectively.</para>
+
     <para>Note: on Ubuntu/Debian derivatives systemd-vmspawn requires the user to be in the <literal>kvm</literal> group to use the VSock options.</para>
   </refsect1>
 
index 822b10291217e44fbb7b40d7e5da5b54c777db66..42c7bbfac62f91e02dc573bf2a4de54c156c8030 100644 (file)
@@ -367,7 +367,7 @@ int find_qemu_binary(char **ret_qemu_binary) {
         return find_executable(qemu_arch_specific, ret_qemu_binary);
 }
 
-int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock) {
+int vsock_fix_child_cid(int vhost_device_fd, unsigned *machine_cid, const char *machine) {
         /* this is an arbitrary value picked from /dev/urandom */
         static const uint8_t sip_key[HASH_KEY_SIZE] = {
                 0x03, 0xad, 0xf0, 0xa4,
@@ -376,14 +376,13 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi
                 0xf5, 0x4c, 0x80, 0x52
         };
         struct siphash machine_hash_state, state;
-        _cleanup_close_ int vfd = -EBADF;
         int r;
 
         /* uint64_t is required here for the ioctl call, but valid CIDs are only 32 bits */
         uint64_t cid = *ASSERT_PTR(machine_cid);
 
         assert(machine);
-        assert(ret_child_sock);
+        assert(vhost_device_fd >= 0);
 
         /* Fix the CID of the AF_VSOCK socket passed to qemu
          *
@@ -396,16 +395,10 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi
          * If after another 64 attempts this hasn't worked then give up and return EADDRNOTAVAIL.
          */
 
-        /* remove O_CLOEXEC before this fd is passed to QEMU */
-        vfd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
-        if (vfd < 0)
-                return log_debug_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m");
-
         if (cid != VMADDR_CID_ANY) {
-                r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid);
+                r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid);
                 if (r < 0)
                         return log_debug_errno(errno, "Failed to set CID for child vsock with user provided CID %" PRIu64 ": %m", cid);
-                *ret_child_sock = TAKE_FD(vfd);
                 return 0;
         }
 
@@ -417,10 +410,9 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi
                 uint64_t hash = siphash24_finalize(&state);
 
                 cid = 3 + (hash % (UINT_MAX - 4));
-                r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid);
+                r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid);
                 if (r >= 0) {
                         *machine_cid = cid;
-                        *ret_child_sock = TAKE_FD(vfd);
                         return 0;
                 }
                 if (errno != EADDRINUSE)
@@ -429,10 +421,9 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi
 
         for (unsigned i = 0; i < 64; i++) {
                 cid = 3 + random_u64_range(UINT_MAX - 4);
-                r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid);
+                r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid);
                 if (r >= 0) {
                         *machine_cid = cid;
-                        *ret_child_sock = TAKE_FD(vfd);
                         return 0;
                 }
 
index 9c9b1867d38c82144f5dd63622c13c2a7fa53f57..e1ce78067116479439c8f5901370528f0ef60555 100644 (file)
@@ -68,4 +68,4 @@ int list_ovmf_config(char ***ret);
 int load_ovmf_config(const char *path, OvmfConfig **ret);
 int find_ovmf_config(int search_sb, OvmfConfig **ret);
 int find_qemu_binary(char **ret_qemu_binary);
-int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock);
+int vsock_fix_child_cid(int vsock_fd, unsigned *machine_cid, const char *machine);
index 37c354f0b077a76de6b8b59434d80de5caa7114d..d5ab2a1a2708a65dad732b9411b6d765a42b1a56 100644 (file)
@@ -6,6 +6,7 @@
 #include <string.h>
 #include <unistd.h>
 
+#include "sd-daemon.h"
 #include "sd-event.h"
 #include "sd-id128.h"
 
@@ -654,12 +655,15 @@ static int kernel_cmdline_maybe_append_root(void) {
         return 0;
 }
 
-static int run_virtual_machine(void) {
+static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
         _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
-        _cleanup_close_ int vsock_fd = -EBADF;
-        _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL;
+        _cleanup_free_ char *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL;
+        _cleanup_close_ int notify_sock_fd = -EBADF;
         _cleanup_strv_free_ char **cmdline = NULL;
+        _cleanup_free_ int *pass_fds = NULL;
+        size_t n_pass_fds = 0;
+        const char *machine, *accel;
         int r;
 
         if (arg_privileged)
@@ -693,20 +697,17 @@ static int run_virtual_machine(void) {
                 log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
                             "falling back to OVMF firmware blobs without Secure Boot support.");
 
-        const char *accel = use_kvm ? "kvm" : "tcg";
-        if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE))
-                machine = strjoin("type=virt,accel=", accel);
-        else
-                machine = strjoin("type=q35,accel=", accel, ",smm=", on_off(ovmf_config->supports_sb));
-        if (!machine)
-                return log_oom();
-
         r = find_qemu_binary(&qemu_binary);
         if (r == -EOPNOTSUPP)
                 return log_error_errno(r, "Native architecture is not supported by qemu.");
         if (r < 0)
                 return log_error_errno(r, "Failed to find QEMU binary: %m");
 
+        if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE))
+                machine = "type=virt";
+        else
+                machine = ovmf_config->supports_sb ? "type=q35,smm=on" : "type=q35,smm=off";
+
         if (asprintf(&mem, "%" PRIu64, DIV_ROUND_UP(arg_qemu_mem, U64_MB)) < 0)
                 return log_oom();
 
@@ -744,6 +745,37 @@ static int run_virtual_machine(void) {
         if (r < 0)
                 return log_oom();
 
+        if (!use_kvm && kvm_device_fd >= 0) {
+                log_warning("KVM is disabled but fd for /dev/kvm was passed, closing fd and ignoring");
+                kvm_device_fd = safe_close(kvm_device_fd);
+        }
+
+        if (use_kvm && kvm_device_fd >= 0) {
+                /* /dev/fdset/1 is a magic path that tells qemu to use the fd registered in fd set 1 (see
+                 * --add-fd below) for /dev/kvm; this lets us take an fd to /dev/kvm and hand it to qemu */
+                accel = "kvm,device=/dev/fdset/1";
+
+                r = strv_extend(&cmdline, "--add-fd");
+                if (r < 0)
+                        return log_oom();
+
+                r = strv_extendf(&cmdline, "fd=%d,set=1,opaque=/dev/kvm", kvm_device_fd);
+                if (r < 0)
+                        return log_oom();
+
+                if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1))
+                        return log_oom();
+
+                pass_fds[n_pass_fds++] = kvm_device_fd;
+        } else if (use_kvm)
+                accel = "kvm";
+        else
+                accel = "tcg";
+
+        r = strv_extend_many(&cmdline, "-accel", accel);
+        if (r < 0)
+                return log_oom();
+
         bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS;
         if (arg_qemu_vsock < 0) {
                 r = qemu_check_vsock_support();
@@ -753,12 +785,20 @@ static int run_virtual_machine(void) {
                 use_vsock = r;
         }
 
-        unsigned child_cid = VMADDR_CID_ANY;
         _cleanup_close_ int child_vsock_fd = -EBADF;
         if (use_vsock) {
-                child_cid = arg_vsock_cid;
+                int device_fd = vhost_device_fd;
+                unsigned child_cid = arg_vsock_cid;
+
+                if (device_fd < 0) {
+                        child_vsock_fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
+                        if (child_vsock_fd < 0)
+                                return log_error_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m");
+
+                        device_fd = child_vsock_fd;
+                }
 
-                r = vsock_fix_child_cid(&child_cid, arg_machine, &child_vsock_fd);
+                r = vsock_fix_child_cid(device_fd, &child_cid, arg_machine);
                 if (r < 0)
                         return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m");
 
@@ -766,9 +806,14 @@ static int run_virtual_machine(void) {
                 if (r < 0)
                         return log_oom();
 
-                r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd);
+                r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, device_fd);
                 if (r < 0)
                         return log_oom();
+
+                if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1))
+                        return log_oom();
+
+                pass_fds[n_pass_fds++] = device_fd;
         }
 
         r = strv_extend_many(&cmdline, "-cpu", "max");
@@ -972,11 +1017,11 @@ static int run_virtual_machine(void) {
         }
 
         if (use_vsock) {
-                vsock_fd = open_vsock();
-                if (vsock_fd < 0)
-                        return log_error_errno(vsock_fd, "Failed to open vsock: %m");
+                notify_sock_fd = open_vsock();
+                if (notify_sock_fd < 0)
+                        return log_error_errno(notify_sock_fd, "Failed to open vsock: %m");
 
-                r = cmdline_add_vsock(&cmdline, vsock_fd);
+                r = cmdline_add_vsock(&cmdline, notify_sock_fd);
                 if (r == -ENOMEM)
                         return log_oom();
                 if (r < 0)
@@ -1027,7 +1072,7 @@ static int run_virtual_machine(void) {
 
         int exit_status = INT_MAX;
         if (use_vsock) {
-                r = setup_notify_parent(event, vsock_fd, &exit_status, &notify_event_source);
+                r = setup_notify_parent(event, notify_sock_fd, &exit_status, &notify_event_source);
                 if (r < 0)
                         return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m");
         }
@@ -1092,7 +1137,8 @@ static int verify_arguments(void) {
 }
 
 static int run(int argc, char *argv[]) {
-        int r;
+        int r, kvm_device_fd = -EBADF, vhost_device_fd = -EBADF;
+        _cleanup_strv_free_ char **names = NULL;
 
         log_setup();
 
@@ -1120,9 +1166,25 @@ static int run(int argc, char *argv[]) {
                          special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal());
         }
 
+        r = sd_listen_fds_with_names(true, &names);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get passed file descriptors: %m");
+
+        for (int i = 0; i < r; i++) {
+                int fd = SD_LISTEN_FDS_START + i;
+                if (streq(names[i], "kvm"))
+                        kvm_device_fd = fd;
+                else if (streq(names[i], "vhost-vsock"))
+                        vhost_device_fd = fd;
+                else {
+                        log_notice("Couldn't recognize passed fd %d (%s), closing fd and ignoring...", fd, names[i]);
+                        safe_close(fd);
+                }
+        }
+
         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
 
-        return run_virtual_machine();
+        return run_virtual_machine(kvm_device_fd, vhost_device_fd);
 }
 
 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);