]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
vmspawn: convert drive device setup to bridge
authorChristian Brauner <brauner@kernel.org>
Wed, 8 Apr 2026 06:54:23 +0000 (08:54 +0200)
committerChristian Brauner <brauner@kernel.org>
Wed, 15 Apr 2026 08:14:48 +0000 (10:14 +0200)
Remove the static blockdev/device/snapshot INI config sections and
the SCSI controller setup for both the root image drive and extra
drives. Replace with DriveInfos that are constructed in the parent
after fork: vmspawn opens all image files and passes fds to QEMU via
the add-fd path. For ephemeral mode, anonymous overlay files are
created via O_TMPFILE or memfd.

The resolve_disk_driver() helper maps DiskType to the appropriate
QEMU driver name and serial format.

The post-fork device-info preparation is split into helpers:
prepare_primary_drive() and prepare_extra_drives() for per-drive
construction, assign_pcie_ports() for naming the pre-allocated
pcie-root-port bridges once every device type is known, and
prepare_device_info() that stitches them together against the
MachineConfig aggregate.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
src/vmspawn/vmspawn.c

index 8fdb64786cd9f53cbd0b93bee09b1b986116e3e1..548a083fac5c6897059c2fc14599b5c80eff0019 100644 (file)
 
 #define VM_TAP_HASH_KEY SD_ID128_MAKE(01,d0,c6,4c,2b,df,24,fb,c0,f8,b2,09,7d,59,b2,93)
 
-#define DISK_SERIAL_MAX_LEN_SCSI 30
-#define DISK_SERIAL_MAX_LEN_NVME 20
+#define DISK_SERIAL_MAX_LEN_SCSI        30
+#define DISK_SERIAL_MAX_LEN_NVME        20
+#define DISK_SERIAL_MAX_LEN_VIRTIO_BLK  20
 
 /* Spare pcie-root-ports reserved for future runtime hotplug beyond the pre-wired devices. */
 #define VMSPAWN_PCIE_HOTPLUG_SPARES 10u
@@ -2326,6 +2327,232 @@ static int qemu_config_add_qmp_monitor(FILE *config_file, int bridge_fds[2], int
                                    "mode", "control");
 }
 
+static int resolve_disk_driver(DiskType dt, const char *filename, DriveInfo *info) {
+        int r;
+
+        assert(filename);
+        assert(info);
+
+        switch (dt) {
+        case DISK_TYPE_VIRTIO_BLK:
+                info->disk_driver = "virtio-blk-pci";
+                r = disk_serial(filename, DISK_SERIAL_MAX_LEN_VIRTIO_BLK, &info->serial);
+                if (r < 0)
+                        return r;
+                break;
+        case DISK_TYPE_VIRTIO_SCSI:
+                info->disk_driver = "scsi-hd";
+                r = disk_serial(filename, DISK_SERIAL_MAX_LEN_SCSI, &info->serial);
+                if (r < 0)
+                        return r;
+                break;
+        case DISK_TYPE_NVME:
+                info->disk_driver = "nvme";
+                r = disk_serial(filename, DISK_SERIAL_MAX_LEN_NVME, &info->serial);
+                if (r < 0)
+                        return r;
+                break;
+        case DISK_TYPE_VIRTIO_SCSI_CDROM:
+                info->disk_driver = "scsi-cd";
+                info->flags |= QMP_DRIVE_READ_ONLY;
+                r = disk_serial(filename, DISK_SERIAL_MAX_LEN_SCSI, &info->serial);
+                if (r < 0)
+                        return r;
+                break;
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+static int prepare_primary_drive(const char *runtime_dir, DriveInfos *drives) {
+        int r;
+
+        assert(runtime_dir);
+        assert(drives);
+
+        if (!arg_image)
+                return 0;
+
+        _cleanup_free_ char *image_fn = NULL;
+        r = path_extract_filename(arg_image, &image_fn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from path '%s': %m", arg_image);
+
+        DriveInfo *d = &drives->drives[drives->n_drives++];
+        *d = (DriveInfo) { .fd = -EBADF, .overlay_fd = -EBADF };
+
+        r = resolve_disk_driver(arg_image_disk_type, image_fn, d);
+        if (r < 0)
+                return log_error_errno(r, "Failed to resolve disk driver for '%s': %m", image_fn);
+
+        int open_flags = ((arg_ephemeral || FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY)) ? O_RDONLY : O_RDWR) | O_CLOEXEC | O_NOCTTY;
+
+        _cleanup_close_ int image_fd = open(arg_image, open_flags);
+        if (image_fd < 0)
+                return log_error_errno(errno, "Failed to open '%s': %m", arg_image);
+
+        struct stat st;
+        if (fstat(image_fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat '%s': %m", arg_image);
+
+        r = stat_verify_regular_or_block(&st);
+        if (r < 0)
+                return log_error_errno(r, "Expected regular file or block device for image: %s", arg_image);
+
+        d->path = arg_image;
+        d->format = image_format_to_string(arg_image_format);
+        d->node_name = strdup("vmspawn");
+        if (!d->node_name)
+                return log_oom();
+        d->fd = TAKE_FD(image_fd);
+        if (S_ISBLK(st.st_mode))
+                d->flags |= QMP_DRIVE_BLOCK_DEVICE;
+        if (arg_discard_disk && !FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY))
+                d->flags |= QMP_DRIVE_DISCARD;
+        d->flags |= QMP_DRIVE_BOOT;
+
+        /* For ephemeral mode, create an anonymous overlay file. QEMU will format it
+         * as qcow2 via blockdev-create, so no filesystem path is needed.
+         * Skip for read-only drives (e.g. CDROM) where overlays are not meaningful. */
+        if (arg_ephemeral && !FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY)) {
+                _cleanup_close_ int overlay_fd = open(runtime_dir, O_TMPFILE | O_RDWR | O_CLOEXEC, 0600);
+                if (overlay_fd < 0) {
+                        if (!ERRNO_IS_NOT_SUPPORTED(errno))
+                                return log_error_errno(errno, "Failed to create ephemeral overlay in '%s': %m", runtime_dir);
+
+                        /* Fallback to memfd if O_TMPFILE is not supported */
+                        overlay_fd = memfd_new("vmspawn-overlay");
+                        if (overlay_fd < 0)
+                                return log_error_errno(overlay_fd, "Failed to create ephemeral overlay via memfd: %m");
+                }
+                d->overlay_fd = TAKE_FD(overlay_fd);
+                d->flags |= QMP_DRIVE_NO_FLUSH;
+        }
+
+        return 0;
+}
+
+static int prepare_extra_drives(DriveInfos *drives) {
+        int r;
+
+        assert(drives);
+
+        size_t extra_idx = 0;
+        FOREACH_ARRAY(drive, arg_extra_drives.drives, arg_extra_drives.n_drives) {
+                _cleanup_free_ char *drive_fn = NULL;
+                r = path_extract_filename(drive->path, &drive_fn);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract filename from path '%s': %m", drive->path);
+
+                DiskType dt = drive->disk_type >= 0 ? drive->disk_type : arg_image_disk_type;
+
+                DriveInfo *d = &drives->drives[drives->n_drives++];
+                *d = (DriveInfo) { .fd = -EBADF, .overlay_fd = -EBADF };
+
+                r = resolve_disk_driver(dt, drive_fn, d);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to resolve disk driver for '%s': %m", drive_fn);
+
+                _cleanup_close_ int drive_fd = open(drive->path, (FLAGS_SET(d->flags, QMP_DRIVE_READ_ONLY) ? O_RDONLY : O_RDWR) | O_CLOEXEC | O_NOCTTY);
+                if (drive_fd < 0)
+                        return log_error_errno(errno, "Failed to open '%s': %m", drive->path);
+
+                struct stat drive_st;
+                if (fstat(drive_fd, &drive_st) < 0)
+                        return log_error_errno(errno, "Failed to stat '%s': %m", drive->path);
+                r = stat_verify_regular_or_block(&drive_st);
+                if (r < 0)
+                        return log_error_errno(r, "Expected regular file or block device, not '%s'.", drive->path);
+                if (S_ISBLK(drive_st.st_mode) && drive->format == IMAGE_FORMAT_QCOW2)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Block device '%s' cannot be used with 'qcow2' format, only 'raw' is supported.",
+                                               drive->path);
+
+                d->path = drive->path;
+                d->format = image_format_to_string(drive->format);
+                d->fd = TAKE_FD(drive_fd);
+                if (S_ISBLK(drive_st.st_mode))
+                        d->flags |= QMP_DRIVE_BLOCK_DEVICE;
+                d->flags |= QMP_DRIVE_NO_FLUSH;
+
+                if (asprintf(&d->node_name, "vmspawn_extra_%zu", extra_idx++) < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Assign PCIe root port names to devices. The ports were pre-allocated in the config
+ * file. Each PCI device that will be hotplugged via QMP device_add gets a port. */
+static int assign_pcie_ports(MachineConfig *c) {
+        assert(c);
+
+        if (!ARCHITECTURE_NEEDS_PCIE_ROOT_PORTS)
+                return 0;
+
+        DriveInfos *drives = &c->drives;
+        NetworkInfo *network = &c->network;
+        VirtiofsInfos *virtiofs = &c->virtiofs;
+        VsockInfo *vsock = &c->vsock;
+
+        size_t port = 0;
+
+        /* Drives: non-SCSI drives get individual ports, SCSI controller gets one port */
+        bool need_scsi = false;
+        FOREACH_ARRAY(d, drives->drives, drives->n_drives) {
+                if (STR_IN_SET(d->disk_driver, "scsi-hd", "scsi-cd")) {
+                        need_scsi = true;
+                        continue;
+                }
+                if (asprintf(&d->pcie_port, "vmspawn-pcieport-%zu", port++) < 0)
+                        return log_oom();
+        }
+        if (need_scsi)
+                if (asprintf(&drives->scsi_pcie_port, "vmspawn-pcieport-%zu", port++) < 0)
+                        return log_oom();
+
+        if (network->type)
+                if (asprintf(&network->pcie_port, "vmspawn-pcieport-%zu", port++) < 0)
+                        return log_oom();
+
+        FOREACH_ARRAY(v, virtiofs->entries, virtiofs->n_entries)
+                if (asprintf(&v->pcie_port, "vmspawn-pcieport-%zu", port++) < 0)
+                        return log_oom();
+
+        if (vsock->fd >= 0)
+                if (asprintf(&vsock->pcie_port, "vmspawn-pcieport-%zu", port++) < 0)
+                        return log_oom();
+
+        return 0;
+}
+
+static int prepare_device_info(const char *runtime_dir, MachineConfig *c) {
+        int r;
+
+        assert(runtime_dir);
+        assert(c);
+
+        DriveInfos *drives = &c->drives;
+
+        /* Build drive info for QMP-based setup. vmspawn opens all image files and
+         * passes fds to QEMU via add-fd — QEMU never needs filesystem access. */
+        drives->drives = new0(DriveInfo, 1 + arg_extra_drives.n_drives);
+        if (!drives->drives)
+                return log_oom();
+
+        r = prepare_primary_drive(runtime_dir, drives);
+        if (r < 0)
+                return r;
+
+        r = prepare_extra_drives(drives);
+        if (r < 0)
+                return r;
+
+        return assign_pcie_ports(c);
+}
+
 static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
         _cleanup_free_ char *qemu_binary = NULL, *mem = NULL;
@@ -2851,24 +3078,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 }
         }
 
-        bool need_scsi_controller =
-                IN_SET(arg_image_disk_type, DISK_TYPE_VIRTIO_SCSI, DISK_TYPE_VIRTIO_SCSI_CDROM) && arg_image;
-        if (!need_scsi_controller)
-                FOREACH_ARRAY(drive, arg_extra_drives.drives, arg_extra_drives.n_drives) {
-                        DiskType dt = drive->disk_type >= 0 ? drive->disk_type : arg_image_disk_type;
-                        if (IN_SET(dt, DISK_TYPE_VIRTIO_SCSI, DISK_TYPE_VIRTIO_SCSI_CDROM)) {
-                                need_scsi_controller = true;
-                                break;
-                        }
-                }
-
-        if (need_scsi_controller) {
-                r = qemu_config_section(config_file, "device", "vmspawn_scsi",
-                                        "driver", "virtio-scsi-pci");
-                if (r < 0)
-                        return r;
-        }
-
         if (arg_image) {
                 assert(!arg_directory);
 
@@ -2880,74 +3089,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                                                        arg_image);
                 }
 
-                if (arg_image_disk_type == DISK_TYPE_VIRTIO_SCSI_CDROM)
-                        r = qemu_config_section(config_file, "drive", "vmspawn",
-                                                "if", "none",
-                                                "file", arg_image,
-                                                "format", image_format_to_string(arg_image_format),
-                                                "media", "cdrom",
-                                                "readonly", "on");
-                else
-                        r = qemu_config_section(config_file, "drive", "vmspawn",
-                                                "if", "none",
-                                                "file", arg_image,
-                                                "format", image_format_to_string(arg_image_format),
-                                                "discard", on_off(arg_discard_disk),
-                                                "snapshot", on_off(arg_ephemeral));
-                if (r < 0)
-                        return r;
-
-                _cleanup_free_ char *image_fn = NULL;
-                r = path_extract_filename(arg_image, &image_fn);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to extract filename from path '%s': %m", image_fn);
-
-                const char *disk_driver;
-                _cleanup_free_ char *serial = NULL;
-
-                switch (arg_image_disk_type) {
-                case DISK_TYPE_VIRTIO_BLK:
-                        disk_driver = "virtio-blk-pci";
-                        serial = strdup(image_fn);
-                        if (!serial)
-                                return log_oom();
-                        break;
-                case DISK_TYPE_VIRTIO_SCSI:
-                        disk_driver = "scsi-hd";
-                        r = disk_serial(image_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial);
-                        if (r < 0)
-                                return log_oom();
-                        break;
-                case DISK_TYPE_NVME:
-                        disk_driver = "nvme";
-                        r = disk_serial(image_fn, DISK_SERIAL_MAX_LEN_NVME, &serial);
-                        if (r < 0)
-                                return log_oom();
-                        break;
-                case DISK_TYPE_VIRTIO_SCSI_CDROM:
-                        disk_driver = "scsi-cd";
-                        r = disk_serial(image_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial);
-                        if (r < 0)
-                                return log_oom();
-                        break;
-                default:
-                        assert_not_reached();
-                }
-
-                r = qemu_config_section(config_file, "device", "vmspawn-disk",
-                                        "driver", disk_driver,
-                                        "drive", "vmspawn",
-                                        "bootindex", "1",
-                                        "serial", serial);
-                if (r < 0)
-                        return r;
-
-                if (IN_SET(arg_image_disk_type, DISK_TYPE_VIRTIO_SCSI, DISK_TYPE_VIRTIO_SCSI_CDROM)) {
-                        r = qemu_config_key(config_file, "bus", "vmspawn_scsi.0");
-                        if (r < 0)
-                                return r;
-                }
-
                 if (arg_image_disk_type != DISK_TYPE_VIRTIO_SCSI_CDROM) {
                         r = grow_image(arg_image, arg_grow_image);
                         if (r < 0)
@@ -3034,86 +3175,8 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                         return log_oom();
         }
 
-        size_t i = 0;
-        FOREACH_ARRAY(drive, arg_extra_drives.drives, arg_extra_drives.n_drives) {
-                if (strv_extend(&cmdline, "-blockdev") < 0)
-                        return log_oom();
-
-                _cleanup_free_ char *escaped_drive = escape_qemu_value(drive->path);
-                if (!escaped_drive)
-                        return log_oom();
-
-                struct stat st;
-                if (stat(drive->path, &st) < 0)
-                        return log_error_errno(errno, "Failed to stat '%s': %m", drive->path);
-
-                const char *driver = NULL;
-                if (S_ISREG(st.st_mode))
-                        driver = "file";
-                else if (S_ISBLK(st.st_mode)) {
-                        if (drive->format == IMAGE_FORMAT_QCOW2)
-                                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
-                                                       "Block device '%s' cannot be used with 'qcow2' format, only 'raw' is supported.",
-                                                       drive->path);
-                        driver = "host_device";
-                } else
-                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected regular file or block device, not '%s'.", drive->path);
-
-                DiskType dt = drive->disk_type >= 0 ? drive->disk_type : arg_image_disk_type;
-
-                if (strv_extendf(&cmdline, "driver=%s,cache.direct=off,cache.no-flush=on,file.driver=%s,file.filename=%s,node-name=vmspawn_extra_%zu%s",
-                                 image_format_to_string(drive->format), driver, escaped_drive, i,
-                                 dt == DISK_TYPE_VIRTIO_SCSI_CDROM ? ",read-only=on" : "") < 0)
-                        return log_oom();
-
-                _cleanup_free_ char *drive_fn = NULL;
-                r = path_extract_filename(drive->path, &drive_fn);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to extract filename from path '%s': %m", drive->path);
-
-                _cleanup_free_ char *escaped_drive_fn = escape_qemu_value(drive_fn);
-                if (!escaped_drive_fn)
-                        return log_oom();
-
-                if (strv_extend(&cmdline, "-device") < 0)
-                        return log_oom();
-
-                switch (dt) {
-                case DISK_TYPE_VIRTIO_BLK:
-                        if (strv_extendf(&cmdline, "virtio-blk-pci,drive=vmspawn_extra_%zu,serial=%s", i++, escaped_drive_fn) < 0)
-                                return log_oom();
-                        break;
-                case DISK_TYPE_VIRTIO_SCSI: {
-                        _cleanup_free_ char *serial = NULL;
-                        r = disk_serial(escaped_drive_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial);
-                        if (r < 0)
-                                return log_oom();
-                        if (strv_extendf(&cmdline, "scsi-hd,bus=vmspawn_scsi.0,drive=vmspawn_extra_%zu,serial=%s", i++, serial) < 0)
-                                return log_oom();
-                        break;
-                }
-                case DISK_TYPE_NVME: {
-                        _cleanup_free_ char *serial = NULL;
-                        r = disk_serial(escaped_drive_fn, DISK_SERIAL_MAX_LEN_NVME, &serial);
-                        if (r < 0)
-                                return log_oom();
-                        if (strv_extendf(&cmdline, "nvme,drive=vmspawn_extra_%zu,serial=%s", i++, serial) < 0)
-                                return log_oom();
-                        break;
-                }
-                case DISK_TYPE_VIRTIO_SCSI_CDROM: {
-                        _cleanup_free_ char *serial = NULL;
-                        r = disk_serial(escaped_drive_fn, DISK_SERIAL_MAX_LEN_SCSI, &serial);
-                        if (r < 0)
-                                return log_oom();
-                        if (strv_extendf(&cmdline, "scsi-cd,bus=vmspawn_scsi.0,drive=vmspawn_extra_%zu,serial=%s", i++, serial) < 0)
-                                return log_oom();
-                        break;
-                }
-                default:
-                        assert_not_reached();
-                }
-        }
+        /* Extra drive validation is done in the post-fork drive info construction loop
+         * to avoid stat()'ing each drive twice. */
 
         if (!IN_SET(arg_console_mode, CONSOLE_GUI, CONSOLE_HEADLESS)) {
                 r = strv_prepend(&arg_kernel_cmdline_extra,
@@ -3480,7 +3543,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                                                n_pcie_ports);
 
                 size_t n_builtin_ports = n_pcie_ports - VMSPAWN_PCIE_HOTPLUG_SPARES;
-                for (i = 0; i < n_pcie_ports; i++) {
+                for (size_t i = 0; i < n_pcie_ports; i++) {
                         char id[STRLEN("vmspawn-hotplug-pci-root-port-") + DECIMAL_STR_MAX(size_t)];
                         if (i < n_builtin_ports)
                                 xsprintf(id, "vmspawn-pcieport-%zu", i);
@@ -3501,6 +3564,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                                 return r;
                 }
         }
+
         /* Finalize the config file and add -readconfig to the cmdline */
         r = fflush_and_check(config_file);
         if (r < 0)
@@ -3570,6 +3634,10 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         child_pty = safe_close(child_pty);
         bridge_fds[1] = safe_close(bridge_fds[1]);
 
+        r = prepare_device_info(runtime_dir, &config);
+        if (r < 0)
+                return r;
+
         /* Connect to VMM backend */
         _cleanup_(vmspawn_qmp_bridge_freep) VmspawnQmpBridge *bridge = NULL;
         r = vmspawn_qmp_init(&bridge, bridge_fds[0], event);
@@ -3584,6 +3652,10 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 return r;
 
         /* Device setup — all before resuming vCPUs */
+        r = vmspawn_qmp_setup_drives(bridge, &config.drives);
+        if (r < 0)
+                return r;
+
         if (config.network.type) {
                 r = vmspawn_qmp_setup_network(bridge, &config.network);
                 if (r < 0)