From: Sam Leonard Date: Thu, 18 Jan 2024 12:32:10 +0000 (+0000) Subject: vmspawn: accept kvm/vhost-vsock device fds through sd_listen X-Git-Tag: v256-rc1~913^2 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F29960%2Fhead;p=thirdparty%2Fsystemd.git vmspawn: accept kvm/vhost-vsock device fds through sd_listen --- diff --git a/man/systemd-vmspawn.xml b/man/systemd-vmspawn.xml index 38f5e3c93e7..b84cc9c0629 100644 --- a/man/systemd-vmspawn.xml +++ b/man/systemd-vmspawn.xml @@ -35,6 +35,12 @@ project='man-pages'>systemd-nspawn(1), but it launches a full virtual machine instead of using namespaces. + File descriptors for /dev/kvm and /dev/vhost-vsock can be + passed to systemd-vmspawn via systemd's native socket passing interface (see + sd_listen_fds(3) for + details about the precise protocol used and the order in which the file descriptors are passed). These + fds must be passed with the names kvm and vhost-vsock, respectively. + Note: on Ubuntu/Debian derivatives systemd-vmspawn requires the user to be in the kvm group to use the VSock options. 
diff --git a/src/vmspawn/vmspawn-util.c b/src/vmspawn/vmspawn-util.c index 822b1029121..42c7bbfac62 100644 --- a/src/vmspawn/vmspawn-util.c +++ b/src/vmspawn/vmspawn-util.c @@ -367,7 +367,7 @@ int find_qemu_binary(char **ret_qemu_binary) { return find_executable(qemu_arch_specific, ret_qemu_binary); } -int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock) { +int vsock_fix_child_cid(int vhost_device_fd, unsigned *machine_cid, const char *machine) { /* this is an arbitrary value picked from /dev/urandom */ static const uint8_t sip_key[HASH_KEY_SIZE] = { 0x03, 0xad, 0xf0, 0xa4, @@ -376,14 +376,13 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi 0xf5, 0x4c, 0x80, 0x52 }; struct siphash machine_hash_state, state; - _cleanup_close_ int vfd = -EBADF; int r; /* uint64_t is required here for the ioctl call, but valid CIDs are only 32 bits */ uint64_t cid = *ASSERT_PTR(machine_cid); assert(machine); - assert(ret_child_sock); + assert(vhost_device_fd >= 0); /* Fix the CID of the AF_VSOCK socket passed to qemu * @@ -396,16 +395,10 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi * If after another 64 attempts this hasn't worked then give up and return EADDRNOTAVAIL. 
*/ - /* remove O_CLOEXEC before this fd is passed to QEMU */ - vfd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); - if (vfd < 0) - return log_debug_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m"); - if (cid != VMADDR_CID_ANY) { - r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid); if (r < 0) return log_debug_errno(errno, "Failed to set CID for child vsock with user provided CID %" PRIu64 ": %m", cid); - *ret_child_sock = TAKE_FD(vfd); return 0; } @@ -417,10 +410,9 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi uint64_t hash = siphash24_finalize(&state); cid = 3 + (hash % (UINT_MAX - 4)); - r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid); if (r >= 0) { *machine_cid = cid; - *ret_child_sock = TAKE_FD(vfd); return 0; } if (errno != EADDRINUSE) @@ -429,10 +421,9 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi for (unsigned i = 0; i < 64; i++) { cid = 3 + random_u64_range(UINT_MAX - 4); - r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid); if (r >= 0) { *machine_cid = cid; - *ret_child_sock = TAKE_FD(vfd); return 0; } diff --git a/src/vmspawn/vmspawn-util.h b/src/vmspawn/vmspawn-util.h index 9c9b1867d38..e1ce7806711 100644 --- a/src/vmspawn/vmspawn-util.h +++ b/src/vmspawn/vmspawn-util.h @@ -68,4 +68,4 @@ int list_ovmf_config(char ***ret); int load_ovmf_config(const char *path, OvmfConfig **ret); int find_ovmf_config(int search_sb, OvmfConfig **ret); int find_qemu_binary(char **ret_qemu_binary); -int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock); +int vsock_fix_child_cid(int vsock_fd, unsigned *machine_cid, const char *machine); diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index 37c354f0b07..d5ab2a1a270 100644 --- a/src/vmspawn/vmspawn.c +++ 
b/src/vmspawn/vmspawn.c @@ -6,6 +6,7 @@ #include #include +#include "sd-daemon.h" #include "sd-event.h" #include "sd-id128.h" @@ -654,12 +655,15 @@ static int kernel_cmdline_maybe_append_root(void) { return 0; } -static int run_virtual_machine(void) { +static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL; _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; - _cleanup_close_ int vsock_fd = -EBADF; - _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL; + _cleanup_free_ char *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL; + _cleanup_close_ int notify_sock_fd = -EBADF; _cleanup_strv_free_ char **cmdline = NULL; + _cleanup_free_ int *pass_fds = NULL; + size_t n_pass_fds = 0; + const char *machine, *accel; int r; if (arg_privileged) @@ -693,20 +697,17 @@ static int run_virtual_machine(void) { log_warning("Couldn't find OVMF firmware blob with Secure Boot support, " "falling back to OVMF firmware blobs without Secure Boot support."); - const char *accel = use_kvm ? "kvm" : "tcg"; - if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE)) - machine = strjoin("type=virt,accel=", accel); - else - machine = strjoin("type=q35,accel=", accel, ",smm=", on_off(ovmf_config->supports_sb)); - if (!machine) - return log_oom(); - r = find_qemu_binary(&qemu_binary); if (r == -EOPNOTSUPP) return log_error_errno(r, "Native architecture is not supported by qemu."); if (r < 0) return log_error_errno(r, "Failed to find QEMU binary: %m"); + if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE)) + machine = "type=virt"; + else + machine = ovmf_config->supports_sb ? 
"type=q35,smm=on" : "type=q35,smm=off"; + if (asprintf(&mem, "%" PRIu64, DIV_ROUND_UP(arg_qemu_mem, U64_MB)) < 0) return log_oom(); @@ -744,6 +745,37 @@ static int run_virtual_machine(void) { if (r < 0) return log_oom(); + if (!use_kvm && kvm_device_fd >= 0) { + log_warning("KVM is disabled but fd for /dev/kvm was passed, closing fd and ignoring"); + kvm_device_fd = safe_close(kvm_device_fd); + } + + if (use_kvm && kvm_device_fd >= 0) { + /* /dev/fdset/1 is magic string to tell qemu where to find the fd for /dev/kvm + * we use this so that we can take a fd to /dev/kvm and then give qemu that fd */ + accel = "kvm,device=/dev/fdset/1"; + + r = strv_extend(&cmdline, "--add-fd"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "fd=%d,set=1,opaque=/dev/kvm", kvm_device_fd); + if (r < 0) + return log_oom(); + + if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1)) + return log_oom(); + + pass_fds[n_pass_fds++] = kvm_device_fd; + } else if (use_kvm) + accel = "kvm"; + else + accel = "tcg"; + + r = strv_extend_many(&cmdline, "-accel", accel); + if (r < 0) + return log_oom(); + bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS; if (arg_qemu_vsock < 0) { r = qemu_check_vsock_support(); @@ -753,12 +785,20 @@ static int run_virtual_machine(void) { use_vsock = r; } - unsigned child_cid = VMADDR_CID_ANY; _cleanup_close_ int child_vsock_fd = -EBADF; if (use_vsock) { - child_cid = arg_vsock_cid; + int device_fd = vhost_device_fd; + unsigned child_cid = arg_vsock_cid; + + if (device_fd < 0) { + child_vsock_fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); + if (child_vsock_fd < 0) + return log_error_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m"); + + device_fd = child_vsock_fd; + } - r = vsock_fix_child_cid(&child_cid, arg_machine, &child_vsock_fd); + r = vsock_fix_child_cid(device_fd, &child_cid, arg_machine); if (r < 0) return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m"); @@ -766,9 +806,14 @@ static int 
run_virtual_machine(void) { if (r < 0) return log_oom(); - r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd); + r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, device_fd); if (r < 0) return log_oom(); + + if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1)) + return log_oom(); + + pass_fds[n_pass_fds++] = device_fd; } r = strv_extend_many(&cmdline, "-cpu", "max"); @@ -972,11 +1017,11 @@ static int run_virtual_machine(void) { } if (use_vsock) { - vsock_fd = open_vsock(); - if (vsock_fd < 0) - return log_error_errno(vsock_fd, "Failed to open vsock: %m"); + notify_sock_fd = open_vsock(); + if (notify_sock_fd < 0) + return log_error_errno(notify_sock_fd, "Failed to open vsock: %m"); - r = cmdline_add_vsock(&cmdline, vsock_fd); + r = cmdline_add_vsock(&cmdline, notify_sock_fd); if (r == -ENOMEM) return log_oom(); if (r < 0) @@ -1027,7 +1072,7 @@ static int run_virtual_machine(void) { int exit_status = INT_MAX; if (use_vsock) { - r = setup_notify_parent(event, vsock_fd, &exit_status, &notify_event_source); + r = setup_notify_parent(event, notify_sock_fd, &exit_status, &notify_event_source); if (r < 0) return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m"); } @@ -1092,7 +1137,8 @@ static int verify_arguments(void) { } static int run(int argc, char *argv[]) { - int r; + int r, kvm_device_fd = -EBADF, vhost_device_fd = -EBADF; + _cleanup_strv_free_ char **names = NULL; log_setup(); @@ -1120,9 +1166,25 @@ static int run(int argc, char *argv[]) { special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal()); } + r = sd_listen_fds_with_names(true, &names); + if (r < 0) + return log_error_errno(r, "Failed to get passed file descriptors: %m"); + + for (int i = 0; i < r; i++) { + int fd = SD_LISTEN_FDS_START + i; + if (streq(names[i], "kvm")) + kvm_device_fd = fd; + else if (streq(names[i], "vhost-vsock")) + vhost_device_fd = fd; + else { + 
log_notice("Couldn't recognize passed fd %d (%s), closing fd and ignoring...", fd, names[i]); + safe_close(fd); + } + } + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); - return run_virtual_machine(); + return run_virtual_machine(kvm_device_fd, vhost_device_fd); } DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);