From: Christian Brauner Date: Wed, 8 Apr 2026 06:54:23 +0000 (+0200) Subject: vmspawn: convert drive device setup to bridge X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e7d84aa4de9b293c2f48b2f76100befd88d5ab3c;p=thirdparty%2Fsystemd.git vmspawn: convert drive device setup to bridge Remove the static blockdev/device/snapshot INI config sections and the SCSI controller setup for both the root image drive and extra drives. Replace with DriveInfos that are constructed in the parent after fork: vmspawn opens all image files and passes fds to QEMU via the add-fd path. For ephemeral mode, anonymous overlay files are created via O_TMPFILE or memfd. The resolve_disk_driver() helper maps DiskType to the appropriate QEMU driver name and serial format. The post-fork device-info preparation is split into helpers: prepare_primary_drive() and prepare_extra_drives() for per-drive construction, assign_pcie_ports() for naming the pre-allocated pcie-root-port bridges once every device type is known, and prepare_device_info() that stitches them together against the MachineConfig aggregate. Signed-off-by: Christian Brauner (Amutable) --- diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index 8fdb64786cd..548a083fac5 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -100,8 +100,9 @@ #define VM_TAP_HASH_KEY SD_ID128_MAKE(01,d0,c6,4c,2b,df,24,fb,c0,f8,b2,09,7d,59,b2,93) -#define DISK_SERIAL_MAX_LEN_SCSI 30 -#define DISK_SERIAL_MAX_LEN_NVME 20 +#define DISK_SERIAL_MAX_LEN_SCSI 30 +#define DISK_SERIAL_MAX_LEN_NVME 20 +#define DISK_SERIAL_MAX_LEN_VIRTIO_BLK 20 /* Spare pcie-root-ports reserved for future runtime hotplug beyond the pre-wired devices. */ #define VMSPAWN_PCIE_HOTPLUG_SPARES 10u @@ -2326,6 +2327,232 @@ static int qemu_config_add_qmp_monitor(FILE *config_file, int bridge_fds[2], int "mode", "control"); } +static int resolve_disk_driver(DiskType dt, const char *filename, DriveInfo *info) { + int r; + + assert(filename); + assert(info); + + switch (dt) { + case DISK_TYPE_VIRTIO_BLK: + info->disk_driver = "virtio-blk-pci"; + r = disk_serial(filename, DISK_SERIAL_MAX_LEN_VIRTIO_BLK, &info->serial); + if (r < 0) + return r; + break; + case DISK_TYPE_VIRTIO_SCSI: + info->disk_driver = "scsi-hd"; + r = disk_serial(filename, DISK_SERIAL_MAX_LEN_SCSI, &info->serial); + if (r < 0) + return r; + break; + case DISK_TYPE_NVME: + info->disk_driver = "nvme"; + r = disk_serial(filename, DISK_SERIAL_MAX_LEN_NVME, &info->serial); + if (r < 0) + return r; + break; + case DISK_TYPE_VIRTIO_SCSI_CDROM: + info->disk_driver = "scsi-cd"; + info->flags |= QMP_DRIVE_READ_ONLY; + r = disk_serial(filename, DISK_SERIAL_MAX_LEN_SCSI, &info->serial); + if (r < 0) + return r; + break; + default: + assert_not_reached(); + } + + return 0; +} + +static int prepare_primary_drive(const char *runtime_dir, DriveInfos *drives) { + int r; + + assert(runtime_dir); + assert(drives); + + if (!arg_image) + return 0; + + _cleanup_free_ char *image_fn = NULL; + r = path_extract_filename(arg_image, &image_fn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", arg_image); + + DriveInfo *d = &drives->drives[drives->n_drives++]; + *d = (DriveInfo) { .fd = -EBADF, .overlay_fd = -EBADF }; + + r = resolve_disk_driver(arg_image_disk_type, image_fn, d); + if (r < 0) + return log_error_errno(r, "Failed to resolve disk driver for '%s': %m", image_fn); + + int open_flags = ((arg_ephemeral || FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY)) ? O_RDONLY : O_RDWR) | O_CLOEXEC | O_NOCTTY; + + _cleanup_close_ int image_fd = open(arg_image, open_flags); + if (image_fd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", arg_image); + + struct stat st; + if (fstat(image_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", arg_image); + + r = stat_verify_regular_or_block(&st); + if (r < 0) + return log_error_errno(r, "Expected regular file or block device for image: %s", arg_image); + + d->path = arg_image; + d->format = image_format_to_string(arg_image_format); + d->node_name = strdup("vmspawn"); + if (!d->node_name) + return log_oom(); + d->fd = TAKE_FD(image_fd); + if (S_ISBLK(st.st_mode)) + d->flags |= QMP_DRIVE_BLOCK_DEVICE; + if (arg_discard_disk && !FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY)) + d->flags |= QMP_DRIVE_DISCARD; + d->flags |= QMP_DRIVE_BOOT; + + /* For ephemeral mode, create an anonymous overlay file. QEMU will format it + * as qcow2 via blockdev-create, so no filesystem path is needed. + * Skip for read-only drives (e.g. CDROM) where overlays are not meaningful. */ + if (arg_ephemeral && !FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY)) { + _cleanup_close_ int overlay_fd = open(runtime_dir, O_TMPFILE | O_RDWR | O_CLOEXEC, 0600); + if (overlay_fd < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return log_error_errno(errno, "Failed to create ephemeral overlay in '%s': %m", runtime_dir); + + /* Fallback to memfd if O_TMPFILE is not supported */ + overlay_fd = memfd_new("vmspawn-overlay"); + if (overlay_fd < 0) + return log_error_errno(overlay_fd, "Failed to create ephemeral overlay via memfd: %m"); + } + d->overlay_fd = TAKE_FD(overlay_fd); + d->flags |= QMP_DRIVE_NO_FLUSH; + } + + return 0; +} + +static int prepare_extra_drives(DriveInfos *drives) { + int r; + + assert(drives); + + size_t extra_idx = 0; + FOREACH_ARRAY(drive, arg_extra_drives.drives, arg_extra_drives.n_drives) { + _cleanup_free_ char *drive_fn = NULL; + r = path_extract_filename(drive->path, &drive_fn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", drive->path); + + DiskType dt = drive->disk_type >= 0 ? drive->disk_type : arg_image_disk_type; + + DriveInfo *d = &drives->drives[drives->n_drives++]; + *d = (DriveInfo) { .fd = -EBADF, .overlay_fd = -EBADF }; + + r = resolve_disk_driver(dt, drive_fn, d); + if (r < 0) + return log_error_errno(r, "Failed to resolve disk driver for '%s': %m", drive_fn); + + _cleanup_close_ int drive_fd = open(drive->path, (FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY) ? O_RDONLY : O_RDWR) | O_CLOEXEC | O_NOCTTY); + if (drive_fd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", drive->path); + + struct stat drive_st; + if (fstat(drive_fd, &drive_st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", drive->path); + r = stat_verify_regular_or_block(&drive_st); + if (r < 0) + return log_error_errno(r, "Expected regular file or block device, not '%s'.", drive->path); + if (S_ISBLK(drive_st.st_mode) && drive->format == IMAGE_FORMAT_QCOW2) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Block device '%s' cannot be used with 'qcow2' format, only 'raw' is supported.", + drive->path); + + d->path = drive->path; + d->format = image_format_to_string(drive->format); + d->fd = TAKE_FD(drive_fd); + if (S_ISBLK(drive_st.st_mode)) + d->flags |= QMP_DRIVE_BLOCK_DEVICE; + d->flags |= QMP_DRIVE_NO_FLUSH; + + if (asprintf(&d->node_name, "vmspawn_extra_%zu", extra_idx++) < 0) + return log_oom(); + } + + return 0; +} + +/* Assign PCIe root port names to devices. The ports were pre-allocated in the config + * file. Each PCI device that will be hotplugged via QMP device_add gets a port. */ +static int assign_pcie_ports(MachineConfig *c) { + assert(c); + + if (!ARCHITECTURE_NEEDS_PCIE_ROOT_PORTS) + return 0; + + DriveInfos *drives = &c->drives; + NetworkInfo *network = &c->network; + VirtiofsInfos *virtiofs = &c->virtiofs; + VsockInfo *vsock = &c->vsock; + + size_t port = 0; + + /* Drives: non-SCSI drives get individual ports, SCSI controller gets one port */ + bool need_scsi = false; + FOREACH_ARRAY(d, drives->drives, drives->n_drives) { + if (STR_IN_SET(d->disk_driver, "scsi-hd", "scsi-cd")) { + need_scsi = true; + continue; + } + if (asprintf(&d->pcie_port, "vmspawn-pcieport-%zu", port++) < 0) + return log_oom(); + } + if (need_scsi) + if (asprintf(&drives->scsi_pcie_port, "vmspawn-pcieport-%zu", port++) < 0) + return log_oom(); + + if (network->type) + if (asprintf(&network->pcie_port, "vmspawn-pcieport-%zu", port++) < 0) + return log_oom(); + + FOREACH_ARRAY(v, virtiofs->entries, virtiofs->n_entries) + if (asprintf(&v->pcie_port, "vmspawn-pcieport-%zu", port++) < 0) + return log_oom(); + + if (vsock->fd >= 0) + if (asprintf(&vsock->pcie_port, "vmspawn-pcieport-%zu", port++) < 0) + return log_oom(); + + return 0; +} + +static int prepare_device_info(const char *runtime_dir, MachineConfig *c) { + int r; + + assert(runtime_dir); + assert(c); + + DriveInfos *drives = &c->drives; + + /* Build drive info for QMP-based setup. vmspawn opens all image files and + * passes fds to QEMU via add-fd — QEMU never needs filesystem access. */ + drives->drives = new0(DriveInfo, 1 + arg_extra_drives.n_drives); + if (!drives->drives) + return log_oom(); + + r = prepare_primary_drive(runtime_dir, drives); + if (r < 0) + return r; + + r = prepare_extra_drives(drives); + if (r < 0) + return r; + + return assign_pcie_ports(c); +} + static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL; _cleanup_free_ char *qemu_binary = NULL, *mem = NULL; @@ -2851,24 +3078,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { } } - bool need_scsi_controller = - IN_SET(arg_image_disk_type, DISK_TYPE_VIRTIO_SCSI, DISK_TYPE_VIRTIO_SCSI_CDROM) && arg_image; - if (!need_scsi_controller) - FOREACH_ARRAY(drive, arg_extra_drives.drives, arg_extra_drives.n_drives) { - DiskType dt = drive->disk_type >= 0 ? drive->disk_type : arg_image_disk_type; - if (IN_SET(dt, DISK_TYPE_VIRTIO_SCSI, DISK_TYPE_VIRTIO_SCSI_CDROM)) { - need_scsi_controller = true; - break; - } - } - - if (need_scsi_controller) { - r = qemu_config_section(config_file, "device", "vmspawn_scsi", - "driver", "virtio-scsi-pci"); - if (r < 0) - return r; - } - if (arg_image) { assert(!arg_directory); @@ -2880,74 +3089,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { arg_image); } - if (arg_image_disk_type == DISK_TYPE_VIRTIO_SCSI_CDROM) - r = qemu_config_section(config_file, "drive", "vmspawn", - "if", "none", - "file", arg_image, - "format", image_format_to_string(arg_image_format), - "media", "cdrom", - "readonly", "on"); - else - r = qemu_config_section(config_file, "drive", "vmspawn", - "if", "none", - "file", arg_image, - "format", image_format_to_string(arg_image_format), - "discard", on_off(arg_discard_disk), - "snapshot", on_off(arg_ephemeral)); - if (r < 0) - return r; - - _cleanup_free_ char *image_fn = NULL; - r = path_extract_filename(arg_image, &image_fn); - if (r < 0) - return log_error_errno(r, "Failed to extract filename from path '%s': %m", image_fn); - - const char *disk_driver; - _cleanup_free_ char *serial = NULL; - - switch (arg_image_disk_type) { - case DISK_TYPE_VIRTIO_BLK: - disk_driver = "virtio-blk-pci"; - serial = strdup(image_fn); - if (!serial) - return log_oom(); - break; - case DISK_TYPE_VIRTIO_SCSI: - disk_driver = "scsi-hd"; - r = disk_serial(image_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial); - if (r < 0) - return log_oom(); - break; - case DISK_TYPE_NVME: - disk_driver = "nvme"; - r = disk_serial(image_fn, DISK_SERIAL_MAX_LEN_NVME, &serial); - if (r < 0) - return log_oom(); - break; - case DISK_TYPE_VIRTIO_SCSI_CDROM: - disk_driver = "scsi-cd"; - r = disk_serial(image_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial); - if (r < 0) - return log_oom(); - break; - default: - assert_not_reached(); - } - - r = qemu_config_section(config_file, "device", "vmspawn-disk", - "driver", disk_driver, - "drive", "vmspawn", - "bootindex", "1", - "serial", serial); - if (r < 0) - return r; - - if (IN_SET(arg_image_disk_type, DISK_TYPE_VIRTIO_SCSI, DISK_TYPE_VIRTIO_SCSI_CDROM)) { - r = qemu_config_key(config_file, "bus", "vmspawn_scsi.0"); - if (r < 0) - return r; - } - if (arg_image_disk_type != DISK_TYPE_VIRTIO_SCSI_CDROM) { r = grow_image(arg_image, arg_grow_image); if (r < 0) @@ -3034,86 +3175,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { return log_oom(); } - size_t i = 0; - FOREACH_ARRAY(drive, arg_extra_drives.drives, arg_extra_drives.n_drives) { - if (strv_extend(&cmdline, "-blockdev") < 0) - return log_oom(); - - _cleanup_free_ char *escaped_drive = escape_qemu_value(drive->path); - if (!escaped_drive) - return log_oom(); - - struct stat st; - if (stat(drive->path, &st) < 0) - return log_error_errno(errno, "Failed to stat '%s': %m", drive->path); - - const char *driver = NULL; - if (S_ISREG(st.st_mode)) - driver = "file"; - else if (S_ISBLK(st.st_mode)) { - if (drive->format == IMAGE_FORMAT_QCOW2) - return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), - "Block device '%s' cannot be used with 'qcow2' format, only 'raw' is supported.", - drive->path); - driver = "host_device"; - } else - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected regular file or block device, not '%s'.", drive->path); - - DiskType dt = drive->disk_type >= 0 ? drive->disk_type : arg_image_disk_type; - - if (strv_extendf(&cmdline, "driver=%s,cache.direct=off,cache.no-flush=on,file.driver=%s,file.filename=%s,node-name=vmspawn_extra_%zu%s", - image_format_to_string(drive->format), driver, escaped_drive, i, - dt == DISK_TYPE_VIRTIO_SCSI_CDROM ? ",read-only=on" : "") < 0) - return log_oom(); - - _cleanup_free_ char *drive_fn = NULL; - r = path_extract_filename(drive->path, &drive_fn); - if (r < 0) - return log_error_errno(r, "Failed to extract filename from path '%s': %m", drive->path); - - _cleanup_free_ char *escaped_drive_fn = escape_qemu_value(drive_fn); - if (!escaped_drive_fn) - return log_oom(); - - if (strv_extend(&cmdline, "-device") < 0) - return log_oom(); - - switch (dt) { - case DISK_TYPE_VIRTIO_BLK: - if (strv_extendf(&cmdline, "virtio-blk-pci,drive=vmspawn_extra_%zu,serial=%s", i++, escaped_drive_fn) < 0) - return log_oom(); - break; - case DISK_TYPE_VIRTIO_SCSI: { - _cleanup_free_ char *serial = NULL; - r = disk_serial(escaped_drive_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial); - if (r < 0) - return log_oom(); - if (strv_extendf(&cmdline, "scsi-hd,bus=vmspawn_scsi.0,drive=vmspawn_extra_%zu,serial=%s", i++, serial) < 0) - return log_oom(); - break; - } - case DISK_TYPE_NVME: { - _cleanup_free_ char *serial = NULL; - r = disk_serial(escaped_drive_fn, DISK_SERIAL_MAX_LEN_NVME, &serial); - if (r < 0) - return log_oom(); - if (strv_extendf(&cmdline, "nvme,drive=vmspawn_extra_%zu,serial=%s", i++, serial) < 0) - return log_oom(); - break; - } - case DISK_TYPE_VIRTIO_SCSI_CDROM: { - _cleanup_free_ char *serial = NULL; - r = disk_serial(escaped_drive_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial); - if (r < 0) - return log_oom(); - if (strv_extendf(&cmdline, "scsi-cd,bus=vmspawn_scsi.0,drive=vmspawn_extra_%zu,serial=%s", i++, serial) < 0) - return log_oom(); - break; - } - default: - assert_not_reached(); - } - } + /* Extra drive validation is done in the post-fork drive info construction loop + * to avoid stat()'ing each drive twice. */ if (!IN_SET(arg_console_mode, CONSOLE_GUI, CONSOLE_HEADLESS)) { r = strv_prepend(&arg_kernel_cmdline_extra, @@ -3480,7 +3543,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { n_pcie_ports); size_t n_builtin_ports = n_pcie_ports - VMSPAWN_PCIE_HOTPLUG_SPARES; - for (i = 0; i < n_pcie_ports; i++) { + for (size_t i = 0; i < n_pcie_ports; i++) { char id[STRLEN("vmspawn-hotplug-pci-root-port-") + DECIMAL_STR_MAX(size_t)]; if (i < n_builtin_ports) xsprintf(id, "vmspawn-pcieport-%zu", i); @@ -3501,6 +3564,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { return r; } } + /* Finalize the config file and add -readconfig to the cmdline */ r = fflush_and_check(config_file); if (r < 0) @@ -3570,6 +3634,10 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { child_pty = safe_close(child_pty); bridge_fds[1] = safe_close(bridge_fds[1]); + r = prepare_device_info(runtime_dir, &config); + if (r < 0) + return r; + /* Connect to VMM backend */ _cleanup_(vmspawn_qmp_bridge_freep) VmspawnQmpBridge *bridge = NULL; r = vmspawn_qmp_init(&bridge, bridge_fds[0], event); @@ -3584,6 +3652,10 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { return r; /* Device setup — all before resuming vCPUs */ + r = vmspawn_qmp_setup_drives(bridge, &config.drives); + if (r < 0) + return r; + if (config.network.type) { r = vmspawn_qmp_setup_network(bridge, &config.network); if (r < 0)