From: Christian Brauner Date: Mon, 30 Mar 2026 12:08:40 +0000 (+0200) Subject: vmspawn: set up varlink bridge infrastructure X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1a7b6fd9dd44d53dd64d895b33ee070e7c9af0b8;p=thirdparty%2Fsystemd.git vmspawn: set up varlink bridge infrastructure Create the QMP-to-varlink bridge layer (vmspawn-qmp.{c,h}) and the varlink server layer (vmspawn-varlink.{c,h}). The QMP bridge (VmspawnQmpBridge) owns the QmpClient connection and manages pending background jobs (e.g. blockdev-create continuations). vmspawn_qmp_init() creates the client and attaches it to the event loop. vmspawn_qmp_probe_features() drives io_uring and qcow2 discard-no-unref probes synchronously via a qmp_client_process() + qmp_client_wait() loop — the QMP handshake completes transparently on the first invoke. vmspawn_qmp_start() resumes vCPUs via an async "cont" command. The varlink server (VmspawnVarlinkContext) exposes three interfaces: - io.systemd.MachineInstance: generic machine control (Terminate, PowerOff, Reboot, Pause, Resume, Describe, SubscribeEvents). Method handlers forward to QMP commands asynchronously — the varlink reply is deferred until the QMP response arrives. - io.systemd.VirtualMachineInstance: VM-specific (placeholder for future snapshot/migration methods). - io.systemd.QemuMachineInstance: QEMU-specific (AcquireQMP stub). The server listens on /control with mode 0600. Event streaming follows the importd Pull pattern: SubscribeEvents sends an initial {ready:true} notification, then fans out QMP events to all subscribers. The disconnect handler only unrefs subscriber links (matching resolved's vl_on_notification_disconnect pattern). Introduce the MachineConfig aggregate in vmspawn-qmp.h grouping the per-device info structures (DriveInfos, NetworkInfo, VirtiofsInfos, VsockInfo) together with machine_config_done() that chains the individual done helpers. 
Callers populate it field-by-field and rely on the _cleanup_ attribute for orderly teardown regardless of which device types the invocation ends up using. Signed-off-by: Christian Brauner (Amutable) --- diff --git a/src/vmspawn/meson.build b/src/vmspawn/meson.build index 99bad2d6189..6d08755fedf 100644 --- a/src/vmspawn/meson.build +++ b/src/vmspawn/meson.build @@ -7,6 +7,8 @@ endif vmspawn_sources = files( 'vmspawn.c', 'vmspawn-qemu-config.c', + 'vmspawn-qmp.c', + 'vmspawn-varlink.c', 'vmspawn-settings.c', 'vmspawn-scope.c', 'vmspawn-mount.c', diff --git a/src/vmspawn/vmspawn-qmp.c b/src/vmspawn/vmspawn-qmp.c new file mode 100644 index 00000000000..4171772ee7c --- /dev/null +++ b/src/vmspawn/vmspawn-qmp.c @@ -0,0 +1,1059 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-event.h" +#include "sd-json.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "ether-addr-util.h" +#include "fd-util.h" +#include "hashmap.h" +#include "json-util.h" +#include "log.h" +#include "qmp-client.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "vmspawn-qmp.h" + +DEFINE_PRIVATE_HASH_OPS_FULL( + pending_job_hash_ops, + char, string_hash_func, string_compare_func, free, + PendingJob, pending_job_free); + +void drive_info_done(DriveInfo *info) { + assert(info); + info->serial = mfree(info->serial); + info->node_name = mfree(info->node_name); + info->pcie_port = mfree(info->pcie_port); + info->fd = safe_close(info->fd); + info->overlay_fd = safe_close(info->overlay_fd); +} + +void drive_infos_done(DriveInfos *infos) { + assert(infos); + FOREACH_ARRAY(d, infos->drives, infos->n_drives) + drive_info_done(d); + infos->drives = mfree(infos->drives); + infos->n_drives = 0; + infos->scsi_pcie_port = mfree(infos->scsi_pcie_port); +} + +void network_info_done(NetworkInfo *info) { + assert(info); + info->ifname = mfree(info->ifname); + info->pcie_port = mfree(info->pcie_port); + info->fd = 
safe_close(info->fd); +} + +void virtiofs_info_done(VirtiofsInfo *info) { + assert(info); + info->id = mfree(info->id); + info->socket_path = mfree(info->socket_path); + info->tag = mfree(info->tag); + info->pcie_port = mfree(info->pcie_port); +} + +void virtiofs_infos_done(VirtiofsInfos *infos) { + assert(infos); + FOREACH_ARRAY(e, infos->entries, infos->n_entries) + virtiofs_info_done(e); + infos->entries = mfree(infos->entries); + infos->n_entries = 0; +} + +void vsock_info_done(VsockInfo *info) { + assert(info); + info->pcie_port = mfree(info->pcie_port); + info->fd = safe_close(info->fd); +} + +void machine_config_done(MachineConfig *c) { + if (!c) + return; + + drive_infos_done(&c->drives); + network_info_done(&c->network); + virtiofs_infos_done(&c->virtiofs); + vsock_info_done(&c->vsock); +} + +/* Generic async QMP setup-completion callback. The userdata argument carries the + * command name (as a string literal) for logging. On failure, request a clean + * event loop exit so vmspawn shuts down instead of running a VM with missing devices. */ +static int on_qmp_setup_complete( + QmpClient *client, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + const char *label = ASSERT_PTR(userdata); + + assert(client); + + if (error < 0) { + log_error_errno(error, "%s failed: %s", label, strna(error_desc)); + return sd_event_exit(qmp_client_get_event(client), error); + } + + return 0; +} + +/* Send add-fd via SCM_RIGHTS; return /dev/fdset/N. Allocations run before invoke so a late + * OOM cannot orphan an fdset on QEMU's side; *ret_path is only written on full success. 
*/ +static int qmp_fdset_add(QmpClient *qmp, int fd_consume, char **ret_path) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *args = NULL; + _cleanup_close_ int fd = fd_consume; + _cleanup_free_ char *path = NULL; + unsigned id; + int r; + + assert(qmp); + assert(fd_consume >= 0); + assert(ret_path); + + id = qmp_client_next_fdset_id(qmp); + + r = sd_json_buildo(&args, SD_JSON_BUILD_PAIR_UNSIGNED("fdset-id", id)); + if (r < 0) + return r; + + if (asprintf(&path, "/dev/fdset/%u", id) < 0) + return -ENOMEM; + + r = qmp_client_invoke(qmp, "add-fd", QMP_CLIENT_ARGS_FD(args, TAKE_FD(fd)), + on_qmp_setup_complete, (void*) "add-fd"); + if (r < 0) + return r; + + *ret_path = TAKE_PTR(path); + return 0; +} + +typedef struct QmpFileNodeParams { + const char *node_name; + const char *filename; + const char *driver; /* "file" or "host_device" */ + QmpDriveFlags flags; +} QmpFileNodeParams; + +/* Build blockdev-add JSON for the protocol-level (file) node */ +static int qmp_build_blockdev_add_file(const QmpFileNodeParams *p, sd_json_variant **ret) { + assert(p); + assert(p->node_name); + assert(p->filename); + assert(p->driver); + assert(ret); + + /* cache.direct=false uses the page cache (QEMU default). cache.no-flush suppresses host + * flush on guest fsync — only safe for ephemeral/extra drives where data loss is acceptable. 
*/ + return sd_json_buildo( + ret, + SD_JSON_BUILD_PAIR_STRING("node-name", p->node_name), + SD_JSON_BUILD_PAIR_STRING("driver", p->driver), + SD_JSON_BUILD_PAIR_STRING("filename", p->filename), + SD_JSON_BUILD_PAIR_CONDITION(FLAGS_SET(p->flags, QMP_DRIVE_READ_ONLY), "read-only", SD_JSON_BUILD_BOOLEAN(true)), + SD_JSON_BUILD_PAIR_CONDITION(FLAGS_SET(p->flags, QMP_DRIVE_IO_URING), "aio", JSON_BUILD_CONST_STRING("io_uring")), + SD_JSON_BUILD_PAIR("cache", SD_JSON_BUILD_OBJECT( + SD_JSON_BUILD_PAIR_BOOLEAN("direct", false), + SD_JSON_BUILD_PAIR_BOOLEAN("no-flush", FLAGS_SET(p->flags, QMP_DRIVE_NO_FLUSH))))); +} + +typedef struct QmpFormatNodeParams { + const char *node_name; + const char *format; /* "raw", "qcow2", etc. */ + const char *file_node_name; /* reference to the underlying file node */ + const char *backing; /* reference to a backing format node (NULL if none) */ + QmpDriveFlags flags; +} QmpFormatNodeParams; + +/* Build blockdev-add JSON for the format-level node */ +static int qmp_build_blockdev_add_format(const QmpFormatNodeParams *p, sd_json_variant **ret) { + assert(p); + assert(p->node_name); + assert(p->format); + assert(p->file_node_name); + assert(ret); + + /* When "file" is a string (not an object), QEMU interprets it as a reference to an + * existing node-name. The "backing" field likewise references a format-level node. 
*/ + return sd_json_buildo( + ret, + SD_JSON_BUILD_PAIR_STRING("node-name", p->node_name), + SD_JSON_BUILD_PAIR_STRING("driver", p->format), + SD_JSON_BUILD_PAIR_STRING("file", p->file_node_name), + SD_JSON_BUILD_PAIR_CONDITION(FLAGS_SET(p->flags, QMP_DRIVE_READ_ONLY), "read-only", SD_JSON_BUILD_BOOLEAN(true)), + SD_JSON_BUILD_PAIR_CONDITION(FLAGS_SET(p->flags, QMP_DRIVE_DISCARD), "discard", JSON_BUILD_CONST_STRING("unmap")), + SD_JSON_BUILD_PAIR_CONDITION(FLAGS_SET(p->flags, QMP_DRIVE_DISCARD_NO_UNREF), "discard-no-unref", SD_JSON_BUILD_BOOLEAN(true)), + SD_JSON_BUILD_PAIR_CONDITION(!!p->backing, "backing", SD_JSON_BUILD_STRING(p->backing))); +} + +/* Build device_add JSON arguments for a drive */ +static int qmp_build_device_add(const DriveInfo *drive, sd_json_variant **ret) { + assert(drive); + assert(ret); + + return sd_json_buildo( + ret, + SD_JSON_BUILD_PAIR_STRING("driver", drive->disk_driver), + SD_JSON_BUILD_PAIR_STRING("drive", drive->node_name), + SD_JSON_BUILD_PAIR_STRING("id", drive->node_name), + SD_JSON_BUILD_PAIR_CONDITION(FLAGS_SET(drive->flags, QMP_DRIVE_BOOT), "bootindex", SD_JSON_BUILD_INTEGER(1)), + SD_JSON_BUILD_PAIR_CONDITION(!!drive->serial, "serial", SD_JSON_BUILD_STRING(drive->serial)), + SD_JSON_BUILD_PAIR_CONDITION(STR_IN_SET(drive->disk_driver, "scsi-hd", "scsi-cd"), + "bus", JSON_BUILD_CONST_STRING("vmspawn_scsi.0")), + SD_JSON_BUILD_PAIR_CONDITION( + !STR_IN_SET(drive->disk_driver, "scsi-hd", "scsi-cd") && !!drive->pcie_port, + "bus", SD_JSON_BUILD_STRING(drive->pcie_port))); +} + +/* Issue blockdev-add for a file node. */ +static int qmp_add_file_node(QmpClient *qmp, const QmpFileNodeParams *p) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *args = NULL; + int r; + + r = qmp_build_blockdev_add_file(p, &args); + if (r < 0) + return r; + + return qmp_client_invoke(qmp, "blockdev-add", QMP_CLIENT_ARGS(args), on_qmp_setup_complete, (void*) "blockdev-add"); +} + +/* Get the virtual size of an image from the fd directly. 
For raw images the virtual size + * equals the file/device size. For qcow2 the virtual size is a big-endian uint64 at header + * offset 24 (the "size" field in the qcow2 header). */ +static int get_image_virtual_size(int fd, const char *format, bool is_block_device, uint64_t *ret) { + int r; + + assert(fd >= 0); + assert(format); + assert(ret); + + if (streq(format, "raw")) { + if (is_block_device) + return blockdev_get_device_size(fd, ret); + + struct stat st; + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat image: %m"); + + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Raw device is neither a regular file nor a block device"); + + *ret = st.st_size; + return 0; + } + + if (streq(format, "qcow2")) { + uint32_t magic = 0; + ssize_t n = pread(fd, &magic, sizeof(magic), 0); + if (n < 0) + return log_error_errno(errno, "Failed to read qcow2 magic: %m"); + if (n != sizeof(magic) || be32toh(magic) != UINT32_C(0x514649fb)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Not a valid qcow2 image (bad magic)"); + + uint64_t size_be = 0; + n = pread(fd, &size_be, sizeof(size_be), 24); + if (n < 0) + return log_error_errno(errno, "Failed to read qcow2 header: %m"); + if (n != sizeof(size_be)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read on qcow2 header"); + + *ret = be64toh(size_be); + return 0; + } + + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unsupported image format '%s'", format); +} + +/* Ephemeral drive continuation — fired when the blockdev-create job concludes. + * Completes the drive setup by adding the overlay format node and the device. 
*/ +typedef struct EphemeralDriveCtx { + char *node_name; /* overlay format node name (= drive node name) */ + char *overlay_file_node; + char *base_fmt_node; + /* Fields for device_add */ + char *disk_driver; + char *serial; /* NULL if unset */ + char *pcie_port; /* pcie-root-port bus for device_add (NULL on non-PCIe) */ + QmpDriveFlags flags; /* subset: QMP_DRIVE_DISCARD, QMP_DRIVE_DISCARD_NO_UNREF, QMP_DRIVE_BOOT */ +} EphemeralDriveCtx; + +static EphemeralDriveCtx* ephemeral_drive_ctx_free(EphemeralDriveCtx *ctx) { + if (!ctx) + return NULL; + free(ctx->node_name); + free(ctx->overlay_file_node); + free(ctx->base_fmt_node); + free(ctx->disk_driver); + free(ctx->serial); + free(ctx->pcie_port); + return mfree(ctx); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EphemeralDriveCtx *, ephemeral_drive_ctx_free); + +static void ephemeral_drive_ctx_free_void(void *p) { + ephemeral_drive_ctx_free(p); +} + +static int on_ephemeral_create_concluded(QmpClient *qmp, void *userdata) { + _cleanup_(ephemeral_drive_ctx_freep) EphemeralDriveCtx *ctx = ASSERT_PTR(userdata); + _cleanup_(sd_json_variant_unrefp) sd_json_variant *fmt_args = NULL, *device_args = NULL; + int r; + + /* Open formatted overlay as qcow2 with backing reference */ + QmpFormatNodeParams overlay_fmt_params = { + .node_name = ctx->node_name, + .format = "qcow2", + .file_node_name = ctx->overlay_file_node, + .backing = ctx->base_fmt_node, + .flags = ctx->flags & (QMP_DRIVE_DISCARD|QMP_DRIVE_DISCARD_NO_UNREF), + }; + r = qmp_build_blockdev_add_format(&overlay_fmt_params, &fmt_args); + if (r < 0) + return log_error_errno(r, "Failed to build overlay format JSON for '%s': %m", ctx->node_name); + + r = qmp_client_invoke(qmp, "blockdev-add", QMP_CLIENT_ARGS(fmt_args), on_qmp_setup_complete, (void*) "blockdev-add"); + if (r < 0) + return r; + + /* device_add: attach to virtual hardware. Build a temporary DriveInfo as a + * read-only view into the continuation context to reuse qmp_build_device_add(). 
*/ + const DriveInfo tmp = { + .disk_driver = ctx->disk_driver, + .node_name = ctx->node_name, + .serial = ctx->serial, + .pcie_port = ctx->pcie_port, + .flags = ctx->flags & QMP_DRIVE_BOOT, + .fd = -EBADF, + .overlay_fd = -EBADF, + }; + r = qmp_build_device_add(&tmp, &device_args); + if (r < 0) + return log_error_errno(r, "Failed to build device_add JSON for '%s': %m", ctx->node_name); + + r = qmp_client_invoke(qmp, "device_add", QMP_CLIENT_ARGS(device_args), on_qmp_setup_complete, (void*) "device_add"); + if (r < 0) + return r; + + log_debug("Queued ephemeral drive completion for '%s'", ctx->node_name); + return 0; +} + +/* Set up an ephemeral drive: base image (read-only) + anonymous qcow2 overlay (read-write). + * The final steps (overlay format + device_add) are deferred to a job continuation that + * fires when the blockdev-create job concludes. */ +static int qmp_setup_ephemeral_drive(VmspawnQmpBridge *bridge, QmpClient *qmp, DriveInfo *drive) { + int r; + + assert(bridge); + assert(qmp); + assert(drive); + assert(drive->fd >= 0); + assert(drive->overlay_fd >= 0); + + /* Node names: -base-file, -base-fmt, -overlay-file, */ + _cleanup_free_ char *base_file_node = strjoin(drive->node_name, "-base-file"); + _cleanup_free_ char *base_fmt_node = strjoin(drive->node_name, "-base-fmt"); + _cleanup_free_ char *overlay_file_node = strjoin(drive->node_name, "-overlay-file"); + if (!base_file_node || !base_fmt_node || !overlay_file_node) + return log_oom(); + + /* Read virtual size before passing the fd to QEMU (TAKE_FD consumes it) */ + uint64_t virtual_size; + r = get_image_virtual_size(drive->fd, drive->format, FLAGS_SET(drive->flags, QMP_DRIVE_BLOCK_DEVICE), &virtual_size); + if (r < 0) + return r; + + /* Step 1-2: Pass both fds to QEMU */ + _cleanup_free_ char *base_path = NULL; + r = qmp_fdset_add(qmp, TAKE_FD(drive->fd), &base_path); + if (r < 0) + return log_error_errno(r, "Failed to send add-fd for base image '%s': %m", drive->path); + + _cleanup_free_ char 
*overlay_path = NULL; + r = qmp_fdset_add(qmp, TAKE_FD(drive->overlay_fd), &overlay_path); + if (r < 0) + return log_error_errno(r, "Failed to send add-fd for overlay of '%s': %m", drive->path); + + /* Step 3: Base image file node (read-only) */ + QmpFileNodeParams base_file_params = { + .node_name = base_file_node, + .filename = base_path, + .driver = FLAGS_SET(drive->flags, QMP_DRIVE_BLOCK_DEVICE) ? "host_device" : "file", + .flags = QMP_DRIVE_READ_ONLY | (drive->flags & QMP_DRIVE_NO_FLUSH), + }; + if (FLAGS_SET(bridge->features, VMSPAWN_QMP_FEATURE_IO_URING)) + base_file_params.flags |= QMP_DRIVE_IO_URING; + r = qmp_add_file_node(qmp, &base_file_params); + if (r < 0) + return log_error_errno(r, "Failed to send blockdev-add for base file '%s': %m", drive->path); + + /* Step 4: Base image format node (read-only) */ + QmpFormatNodeParams base_fmt_params = { + .node_name = base_fmt_node, + .format = drive->format, + .file_node_name = base_file_node, + .flags = QMP_DRIVE_READ_ONLY, + }; + _cleanup_(sd_json_variant_unrefp) sd_json_variant *base_fmt_args = NULL; + r = qmp_build_blockdev_add_format(&base_fmt_params, &base_fmt_args); + if (r < 0) + return r; + + r = qmp_client_invoke(qmp, "blockdev-add", QMP_CLIENT_ARGS(base_fmt_args), on_qmp_setup_complete, (void*) "blockdev-add"); + if (r < 0) + return log_error_errno(r, "Failed to send blockdev-add for base format '%s': %m", drive->path); + + /* Step 5: Overlay file node (read-write, no io_uring for anon overlay) */ + QmpFileNodeParams overlay_file_params = { + .node_name = overlay_file_node, + .filename = overlay_path, + .driver = "file", + .flags = QMP_DRIVE_NO_FLUSH, + }; + _cleanup_(sd_json_variant_unrefp) sd_json_variant *overlay_file_args = NULL; + r = qmp_build_blockdev_add_file(&overlay_file_params, &overlay_file_args); + if (r < 0) + return r; + + r = qmp_client_invoke(qmp, "blockdev-add", QMP_CLIENT_ARGS(overlay_file_args), on_qmp_setup_complete, (void*) "blockdev-add"); + if (r < 0) + return 
log_error_errno(r, "Failed to send blockdev-add for overlay file '%s': %m", drive->path); + + /* Step 6: Fire blockdev-create to format the overlay */ + _cleanup_(sd_json_variant_unrefp) sd_json_variant *create_options = NULL; + r = sd_json_buildo(&create_options, + SD_JSON_BUILD_PAIR_STRING("driver", "qcow2"), + SD_JSON_BUILD_PAIR_STRING("file", overlay_file_node), + SD_JSON_BUILD_PAIR_UNSIGNED("size", virtual_size), + SD_JSON_BUILD_PAIR_STRING("backing-file", base_fmt_node), + SD_JSON_BUILD_PAIR_STRING("backing-fmt", drive->format)); + if (r < 0) + return log_error_errno(r, "Failed to build blockdev-create options: %m"); + + _cleanup_free_ char *job_id = strjoin("create-", drive->node_name); + if (!job_id) + return log_oom(); + + _cleanup_(sd_json_variant_unrefp) sd_json_variant *cmd_args = NULL; + r = sd_json_buildo(&cmd_args, + SD_JSON_BUILD_PAIR_STRING("job-id", job_id), + SD_JSON_BUILD_PAIR_VARIANT("options", create_options)); + if (r < 0) + return log_error_errno(r, "Failed to build blockdev-create JSON: %m"); + + /* Register continuation: when the job concludes, fire overlay format + device_add */ + _cleanup_(ephemeral_drive_ctx_freep) EphemeralDriveCtx *ectx = new(EphemeralDriveCtx, 1); + if (!ectx) + return log_oom(); + + QmpDriveFlags ectx_flags = drive->flags & (QMP_DRIVE_DISCARD|QMP_DRIVE_BOOT); + if (FLAGS_SET(drive->flags, QMP_DRIVE_DISCARD) && + FLAGS_SET(bridge->features, VMSPAWN_QMP_FEATURE_DISCARD_NO_UNREF)) + ectx_flags |= QMP_DRIVE_DISCARD_NO_UNREF; + + *ectx = (EphemeralDriveCtx) { + .node_name = strdup(drive->node_name), + .overlay_file_node = strdup(overlay_file_node), + .base_fmt_node = strdup(base_fmt_node), + .disk_driver = strdup(drive->disk_driver), + .serial = drive->serial ? strdup(drive->serial) : NULL, + .pcie_port = drive->pcie_port ? 
strdup(drive->pcie_port) : NULL, + .flags = ectx_flags, + }; + if (!ectx->node_name || !ectx->overlay_file_node || !ectx->base_fmt_node || + !ectx->disk_driver || (drive->serial && !ectx->serial) || + (drive->pcie_port && !ectx->pcie_port)) + return log_oom(); + + r = vmspawn_qmp_bridge_register_job(bridge, job_id, + on_ephemeral_create_concluded, ectx, + ephemeral_drive_ctx_free_void); + if (r < 0) + return log_error_errno(r, "Failed to register job continuation: %m"); + + TAKE_PTR(ectx); + + r = qmp_client_invoke(qmp, "blockdev-create", QMP_CLIENT_ARGS(cmd_args), on_qmp_setup_complete, (void*) "blockdev-create"); + if (r < 0) + return log_error_errno(r, "Failed to send blockdev-create for '%s': %m", drive->path); + + log_debug("Queued ephemeral drive setup for '%s' (job %s)", drive->path, job_id); + return 0; +} + +/* Set up a regular (non-ephemeral) drive: single file node + format node + device_add. */ +static int qmp_setup_regular_drive(VmspawnQmpBridge *bridge, QmpClient *qmp, DriveInfo *drive) { + int r; + + assert(bridge); + assert(qmp); + assert(drive); + assert(drive->fd >= 0); + + /* Node names: -file, */ + _cleanup_free_ char *file_node_name = strjoin(drive->node_name, "-file"); + if (!file_node_name) + return log_oom(); + + _cleanup_free_ char *fdset_path = NULL; + r = qmp_fdset_add(qmp, TAKE_FD(drive->fd), &fdset_path); + if (r < 0) + return log_error_errno(r, "Failed to send add-fd for '%s': %m", drive->path); + + QmpFileNodeParams file_params = { + .node_name = file_node_name, + .filename = fdset_path, + .driver = FLAGS_SET(drive->flags, QMP_DRIVE_BLOCK_DEVICE) ? 
"host_device" : "file", + .flags = drive->flags & (QMP_DRIVE_READ_ONLY|QMP_DRIVE_NO_FLUSH), + }; + if (FLAGS_SET(bridge->features, VMSPAWN_QMP_FEATURE_IO_URING)) + file_params.flags |= QMP_DRIVE_IO_URING; + r = qmp_add_file_node(qmp, &file_params); + if (r < 0) + return log_error_errno(r, "Failed to send blockdev-add for '%s': %m", drive->path); + + QmpFormatNodeParams fmt_params = { + .node_name = drive->node_name, + .format = drive->format, + .file_node_name = file_node_name, + .flags = drive->flags & (QMP_DRIVE_READ_ONLY|QMP_DRIVE_DISCARD), + }; + _cleanup_(sd_json_variant_unrefp) sd_json_variant *fmt_args = NULL; + r = qmp_build_blockdev_add_format(&fmt_params, &fmt_args); + if (r < 0) + return r; + + r = qmp_client_invoke(qmp, "blockdev-add", QMP_CLIENT_ARGS(fmt_args), on_qmp_setup_complete, (void*) "blockdev-add"); + if (r < 0) + return log_error_errno(r, "Failed to send blockdev-add format for '%s': %m", drive->path); + + /* device_add: attach to virtual hardware */ + _cleanup_(sd_json_variant_unrefp) sd_json_variant *device_args = NULL; + r = qmp_build_device_add(drive, &device_args); + if (r < 0) + return r; + + r = qmp_client_invoke(qmp, "device_add", QMP_CLIENT_ARGS(device_args), on_qmp_setup_complete, (void*) "device_add"); + if (r < 0) + return log_error_errno(r, "Failed to send device_add for '%s': %m", drive->path); + + log_debug("Queued drive setup for '%s'", drive->path); + return 0; +} + +/* Configure a single drive via QMP. Dispatches to ephemeral or regular setup. 
*/ +static int qmp_setup_drive(VmspawnQmpBridge *bridge, QmpClient *qmp, DriveInfo *drive) { + assert(drive); + + if (drive->overlay_fd >= 0) + return qmp_setup_ephemeral_drive(bridge, qmp, drive); + + return qmp_setup_regular_drive(bridge, qmp, drive); +} + +int vmspawn_qmp_setup_network(VmspawnQmpBridge *bridge, NetworkInfo *network) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *netdev_args = NULL, *device_args = NULL; + bool tap_by_fd; + int r; + + assert(bridge); + + QmpClient *qmp = vmspawn_qmp_bridge_get_qmp(bridge); + assert(network); + assert(network->type); + + tap_by_fd = streq(network->type, "tap") && network->fd >= 0; + + /* For TAP-by-fd: pass the TAP fd to QEMU via getfd + SCM_RIGHTS, then reference it by name + * in netdev_add. QEMU stores the received fd under the given fdname and closes it on removal. */ + if (tap_by_fd) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *getfd_args = NULL; + + r = sd_json_buildo( + &getfd_args, + SD_JSON_BUILD_PAIR_STRING("fdname", "vmspawn_tap")); + if (r < 0) + return log_error_errno(r, "Failed to build getfd JSON: %m"); + + r = qmp_client_invoke(qmp, "getfd", QMP_CLIENT_ARGS_FD(getfd_args, TAKE_FD(network->fd)), + on_qmp_setup_complete, (void*) "getfd"); + if (r < 0) + return log_error_errno(r, "Failed to send getfd for TAP fd: %m"); + } + + /* netdev_add: create the network backend */ + r = sd_json_buildo( + &netdev_args, + SD_JSON_BUILD_PAIR_STRING("type", network->type), + SD_JSON_BUILD_PAIR_STRING("id", "net0"), + SD_JSON_BUILD_PAIR_CONDITION(tap_by_fd, + "fd", JSON_BUILD_CONST_STRING("vmspawn_tap")), + SD_JSON_BUILD_PAIR_CONDITION(!tap_by_fd && !!network->ifname, + "ifname", SD_JSON_BUILD_STRING(network->ifname)), + SD_JSON_BUILD_PAIR_CONDITION(!tap_by_fd && streq(network->type, "tap"), + "script", JSON_BUILD_CONST_STRING("no")), + SD_JSON_BUILD_PAIR_CONDITION(!tap_by_fd && streq(network->type, "tap"), + "downscript", JSON_BUILD_CONST_STRING("no"))); + if (r < 0) + return log_error_errno(r, 
"Failed to build netdev_add JSON: %m"); + + r = qmp_client_invoke(qmp, "netdev_add", QMP_CLIENT_ARGS(netdev_args), on_qmp_setup_complete, (void*) "netdev_add"); + if (r < 0) + return log_error_errno(r, "Failed to send netdev_add: %m"); + + /* device_add: attach NIC frontend */ + r = sd_json_buildo( + &device_args, + SD_JSON_BUILD_PAIR_STRING("driver", "virtio-net-pci"), + SD_JSON_BUILD_PAIR_STRING("netdev", "net0"), + SD_JSON_BUILD_PAIR_STRING("id", "nic0"), + SD_JSON_BUILD_PAIR_CONDITION(network->mac_set, + "mac", SD_JSON_BUILD_STRING(network->mac_set ? ETHER_ADDR_TO_STR(&network->mac) : NULL)), + SD_JSON_BUILD_PAIR_CONDITION(!!network->pcie_port, + "bus", SD_JSON_BUILD_STRING(network->pcie_port))); + if (r < 0) + return log_error_errno(r, "Failed to build NIC device_add JSON: %m"); + + r = qmp_client_invoke(qmp, "device_add", QMP_CLIENT_ARGS(device_args), on_qmp_setup_complete, (void*) "device_add"); + if (r < 0) + return log_error_errno(r, "Failed to send NIC device_add: %m"); + + log_debug("Queued %s network setup%s", network->type, tap_by_fd ? " (fd via getfd)" : ""); + return 0; +} + +static int vmspawn_qmp_setup_one_virtiofs(QmpClient *qmp, const VirtiofsInfo *vfs) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *chardev_args = NULL, *device_args = NULL; + int r; + + assert(qmp); + assert(vfs); + assert(vfs->id); + assert(vfs->socket_path); + assert(vfs->tag); + + /* chardev-add: connect to virtiofsd socket. + * ChardevBackend and SocketAddressLegacy are QAPI legacy unions with explicit "data" + * wrapper objects at each level — the nesting is mandatory on the wire. 
*/ + r = sd_json_buildo( + &chardev_args, + SD_JSON_BUILD_PAIR_STRING("id", vfs->id), + SD_JSON_BUILD_PAIR("backend", SD_JSON_BUILD_OBJECT( + SD_JSON_BUILD_PAIR_STRING("type", "socket"), + SD_JSON_BUILD_PAIR("data", SD_JSON_BUILD_OBJECT( + SD_JSON_BUILD_PAIR("addr", SD_JSON_BUILD_OBJECT( + SD_JSON_BUILD_PAIR_STRING("type", "unix"), + SD_JSON_BUILD_PAIR("data", SD_JSON_BUILD_OBJECT( + SD_JSON_BUILD_PAIR_STRING("path", vfs->socket_path))))), + SD_JSON_BUILD_PAIR_BOOLEAN("server", false)))))); + if (r < 0) + return log_error_errno(r, "Failed to build chardev-add JSON for '%s': %m", vfs->id); + + r = qmp_client_invoke(qmp, "chardev-add", QMP_CLIENT_ARGS(chardev_args), on_qmp_setup_complete, (void*) "chardev-add"); + if (r < 0) + return log_error_errno(r, "Failed to send chardev-add '%s': %m", vfs->id); + + /* device_add: create vhost-user-fs-pci device */ + r = sd_json_buildo( + &device_args, + SD_JSON_BUILD_PAIR_STRING("driver", "vhost-user-fs-pci"), + SD_JSON_BUILD_PAIR_STRING("id", vfs->id), + SD_JSON_BUILD_PAIR_STRING("chardev", vfs->id), + SD_JSON_BUILD_PAIR_STRING("tag", vfs->tag), + SD_JSON_BUILD_PAIR_UNSIGNED("queue-size", 1024), + SD_JSON_BUILD_PAIR_CONDITION(!!vfs->pcie_port, + "bus", SD_JSON_BUILD_STRING(vfs->pcie_port))); + if (r < 0) + return log_error_errno(r, "Failed to build virtiofs device_add JSON for '%s': %m", vfs->id); + + r = qmp_client_invoke(qmp, "device_add", QMP_CLIENT_ARGS(device_args), on_qmp_setup_complete, (void*) "device_add"); + if (r < 0) + return log_error_errno(r, "Failed to send virtiofs device_add '%s': %m", vfs->id); + + log_debug("Queued virtiofs device '%s' (tag=%s)", vfs->id, vfs->tag); + return 0; +} + +int vmspawn_qmp_setup_virtiofs(VmspawnQmpBridge *bridge, const VirtiofsInfos *virtiofs) { + int r; + + assert(bridge); + + QmpClient *qmp = vmspawn_qmp_bridge_get_qmp(bridge); + assert(virtiofs); + + FOREACH_ARRAY(e, virtiofs->entries, virtiofs->n_entries) { + r = vmspawn_qmp_setup_one_virtiofs(qmp, e); + if (r < 0) + return r; 
+ } + + return 0; +} + +int vmspawn_qmp_setup_vsock(VmspawnQmpBridge *bridge, VsockInfo *vsock) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *getfd_args = NULL, *device_args = NULL; + int r; + + assert(bridge); + assert(vsock); + + if (vsock->fd < 0) + return 0; + + QmpClient *qmp = vmspawn_qmp_bridge_get_qmp(bridge); + + /* getfd: pass the vhost-vsock fd to QEMU via SCM_RIGHTS */ + r = sd_json_buildo( + &getfd_args, + SD_JSON_BUILD_PAIR_STRING("fdname", "vmspawn_vsock")); + if (r < 0) + return log_error_errno(r, "Failed to build getfd JSON for VSOCK: %m"); + + r = qmp_client_invoke(qmp, "getfd", QMP_CLIENT_ARGS_FD(getfd_args, TAKE_FD(vsock->fd)), + on_qmp_setup_complete, (void*) "getfd"); + if (r < 0) + return log_error_errno(r, "Failed to send getfd for VSOCK fd: %m"); + + /* device_add: create vhost-vsock-pci device referencing the named fd */ + r = sd_json_buildo( + &device_args, + SD_JSON_BUILD_PAIR_STRING("driver", "vhost-vsock-pci"), + SD_JSON_BUILD_PAIR_STRING("id", "vsock0"), + SD_JSON_BUILD_PAIR_UNSIGNED("guest-cid", vsock->cid), + SD_JSON_BUILD_PAIR_STRING("vhostfd", "vmspawn_vsock"), + SD_JSON_BUILD_PAIR_CONDITION(!!vsock->pcie_port, + "bus", SD_JSON_BUILD_STRING(vsock->pcie_port))); + if (r < 0) + return log_error_errno(r, "Failed to build VSOCK device_add JSON: %m"); + + r = qmp_client_invoke(qmp, "device_add", QMP_CLIENT_ARGS(device_args), on_qmp_setup_complete, (void*) "device_add"); + if (r < 0) + return log_error_errno(r, "Failed to send VSOCK device_add: %m"); + + log_debug("Queued vhost-vsock-pci device setup (cid=%u)", vsock->cid); + return 0; +} + +static bool drives_need_scsi_controller(DriveInfos *drives) { + assert(drives); + + FOREACH_ARRAY(d, drives->drives, drives->n_drives) + if (STR_IN_SET(d->disk_driver, "scsi-hd", "scsi-cd")) + return true; + + return false; +} + +static int qmp_setup_scsi_controller(QmpClient *qmp, const char *pcie_port) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *args = NULL; + int r; + + r = 
sd_json_buildo( + &args, + SD_JSON_BUILD_PAIR_STRING("driver", "virtio-scsi-pci"), + SD_JSON_BUILD_PAIR_STRING("id", "vmspawn_scsi"), + SD_JSON_BUILD_PAIR_CONDITION(!!pcie_port, "bus", SD_JSON_BUILD_STRING(pcie_port))); + if (r < 0) + return log_error_errno(r, "Failed to build SCSI controller JSON: %m"); + + r = qmp_client_invoke(qmp, "device_add", QMP_CLIENT_ARGS(args), on_qmp_setup_complete, (void*) "device_add"); + if (r < 0) + return log_error_errno(r, "Failed to send SCSI controller device_add: %m"); + + log_debug("Queued virtio-scsi-pci controller setup"); + return 0; +} + +int vmspawn_qmp_setup_drives(VmspawnQmpBridge *bridge, DriveInfos *drives) { + int r; + + assert(bridge); + assert(drives); + + QmpClient *qmp = vmspawn_qmp_bridge_get_qmp(bridge); + + /* io_uring support was probed during vmspawn_qmp_init(). The cached result in + * bridge->features is passed to each file node setup call. */ + + if (drives_need_scsi_controller(drives)) { + r = qmp_setup_scsi_controller(qmp, drives->scsi_pcie_port); + if (r < 0) + return r; + } + + FOREACH_ARRAY(d, drives->drives, drives->n_drives) { + r = qmp_setup_drive(bridge, qmp, d); + if (r < 0) + return r; + } + + return 0; +} + +PendingJob* pending_job_free(PendingJob *j) { + if (!j) + return NULL; + if (j->free_userdata) + j->free_userdata(j->userdata); + return mfree(j); +} + +VmspawnQmpBridge* vmspawn_qmp_bridge_free(VmspawnQmpBridge *b) { + if (!b) + return NULL; + + hashmap_free(b->pending_jobs); + + qmp_client_unref(b->qmp); + return mfree(b); +} + +int vmspawn_qmp_bridge_register_job( + VmspawnQmpBridge *b, + const char *job_id, + pending_job_callback_t on_concluded, + void *userdata, + pending_job_free_t free_userdata) { + + _cleanup_free_ PendingJob *job = NULL; + _cleanup_free_ char *id = NULL; + int r; + + assert(b); + assert(job_id); + + id = strdup(job_id); + if (!id) + return -ENOMEM; + + job = new(PendingJob, 1); + if (!job) + return -ENOMEM; + + *job = (PendingJob) { + .on_concluded = on_concluded, 
+ .free_userdata = free_userdata, + .userdata = userdata, + }; + + r = hashmap_ensure_put(&b->pending_jobs, &pending_job_hash_ops, id, job); + if (r < 0) + return r; + + TAKE_PTR(id); + TAKE_PTR(job); + return 0; +} + +QmpClient* vmspawn_qmp_bridge_get_qmp(VmspawnQmpBridge *b) { + assert(b); + return b->qmp; +} + +/* Probe-reply convention: ignore -EIO (QMP rejection = "feature absent", log at debug + * and leave the feature flag clear) and transport errors (caught by the post-loop + * qmp_client_is_disconnected() check in vmspawn_qmp_probe_features()). Cleanup calls + * are best-effort — failing to delete a private probe node leaves a harmless /dev/null + * blockdev in QEMU until it exits. */ + +static int on_io_uring_probe_del_reply( + QmpClient *c, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + assert(c); + + if (error_desc) + log_debug("Failed to remove io_uring probe node: %s", error_desc); + return 0; +} + +static int on_io_uring_probe_add_reply( + QmpClient *c, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + VmspawnQmpBridge *bridge = ASSERT_PTR(userdata); + _cleanup_(sd_json_variant_unrefp) sd_json_variant *del_args = NULL; + int r; + + assert(c); + + if (error < 0 && !error_desc) + return log_debug_errno(error, "io_uring probe did not execute: %m"); + if (error_desc) { + log_debug("QEMU does not support aio=io_uring: %s", error_desc); + return 0; + } + + bridge->features |= VMSPAWN_QMP_FEATURE_IO_URING; + log_debug("QEMU supports aio=io_uring"); + + /* Best-effort cleanup; the chained reply keeps the pump busy via the slots set. 
*/ + r = sd_json_buildo(&del_args, + SD_JSON_BUILD_PAIR_STRING("node-name", "__io_uring_probe")); + if (r < 0) + return r; + + return qmp_client_invoke(c, "blockdev-del", QMP_CLIENT_ARGS(del_args), + on_io_uring_probe_del_reply, bridge); +} + +static int probe_io_uring(QmpClient *c, VmspawnQmpBridge *bridge) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *args = NULL; + int r; + + assert(c); + assert(bridge); + + r = sd_json_buildo( + &args, + SD_JSON_BUILD_PAIR_STRING("node-name", "__io_uring_probe"), + SD_JSON_BUILD_PAIR_STRING("driver", "file"), + SD_JSON_BUILD_PAIR_STRING("filename", "/dev/null"), + SD_JSON_BUILD_PAIR_BOOLEAN("read-only", true), + SD_JSON_BUILD_PAIR_STRING("aio", "io_uring")); + if (r < 0) + return r; + + return qmp_client_invoke(c, "blockdev-add", QMP_CLIENT_ARGS(args), + on_io_uring_probe_add_reply, bridge); +} + +static int on_probe_schema_reply( + QmpClient *c, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + VmspawnQmpBridge *bridge = ASSERT_PTR(userdata); + + assert(c); + + if (error < 0 && !error_desc) + return log_debug_errno(error, "query-qmp-schema probe did not execute: %m"); + if (error_desc) { + log_debug("query-qmp-schema rejected: %s", error_desc); + return 0; + } + + if (qmp_schema_has_member(result, "discard-no-unref")) { + bridge->features |= VMSPAWN_QMP_FEATURE_DISCARD_NO_UNREF; + log_debug("QEMU supports qcow2 discard-no-unref"); + } else + log_debug("QEMU does not support qcow2 discard-no-unref"); + + return 0; +} + +static int probe_schema(QmpClient *c, VmspawnQmpBridge *bridge) { + assert(c); + assert(bridge); + + return qmp_client_invoke(c, "query-qmp-schema", QMP_CLIENT_ARGS(NULL), + on_probe_schema_reply, bridge); +} + +int vmspawn_qmp_init(VmspawnQmpBridge **ret, int fd, sd_event *event) { + _cleanup_(vmspawn_qmp_bridge_freep) VmspawnQmpBridge *bridge = NULL; + int r; + + assert(ret); + assert(fd >= 0); + assert(event); + + bridge = new0(VmspawnQmpBridge, 1); + if 
(!bridge) + return log_oom(); + + r = qmp_client_connect_fd(&bridge->qmp, fd); + if (r < 0) + return log_error_errno(r, "Failed to create QMP client: %m"); + + r = qmp_client_set_description(bridge->qmp, "vmspawn-qmp-client"); + if (r < 0) + return log_error_errno(r, "Failed to set QMP client description: %m"); + + r = qmp_client_attach_event(bridge->qmp, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach QMP client to event loop: %m"); + + *ret = TAKE_PTR(bridge); + return 0; +} + +int vmspawn_qmp_probe_features(VmspawnQmpBridge *bridge) { + int r; + + assert(bridge); + + /* probe_io_uring() and probe_schema() both call qmp_client_invoke(), which internally + * drives the handshake to RUNNING via qmp_client_ensure_running() on its first call. */ + r = probe_io_uring(bridge->qmp, bridge); + if (r < 0) + return log_error_errno(r, "Failed to issue io_uring probe: %m"); + + r = probe_schema(bridge->qmp, bridge); + if (r < 0) + return log_error_errno(r, "Failed to issue schema probe: %m"); + + /* Canonical sync-on-async pump, matching varlink_call_internal(). The QMP client tracks + * outstanding replies in its own slots set; drain until it's idle. */ + while (!qmp_client_is_idle(bridge->qmp)) { + r = qmp_client_process(bridge->qmp); + if (r < 0) + return log_error_errno(r, "QMP probe pump failed: %m"); + if (r > 0) + continue; + + r = qmp_client_wait(bridge->qmp, USEC_INFINITY); + if (r < 0) + return log_error_errno(r, "QMP probe wait failed: %m"); + } + + /* If fail_pending() drained the slots (transport dropped mid-probe), features can't be + * trusted and we have no QMP channel for device setup anyway. 
*/ + if (qmp_client_is_disconnected(bridge->qmp)) + return log_error_errno(SYNTHETIC_ERRNO(ECONNRESET), + "QMP connection dropped during feature probing"); + + return 0; +} + +static int on_cont_complete( + QmpClient *client, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + assert(client); + + if (error < 0) { + log_error_errno(error, "Failed to resume QEMU execution: %s", strna(error_desc)); + return sd_event_exit(qmp_client_get_event(client), error); + } + + return 0; +} + +int vmspawn_qmp_start(VmspawnQmpBridge *bridge) { + assert(bridge); + + return qmp_client_invoke(bridge->qmp, "cont", /* args= */ NULL, on_cont_complete, /* userdata= */ NULL); +} diff --git a/src/vmspawn/vmspawn-qmp.h b/src/vmspawn/vmspawn-qmp.h new file mode 100644 index 00000000000..8f8c26fb03e --- /dev/null +++ b/src/vmspawn/vmspawn-qmp.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "shared-forward.h" + +/* Pending job continuation — called when a QMP background job reaches "concluded" state. + * Used by blockdev-create to chain remaining drive setup after the job completes. 
*/ +typedef int (*pending_job_callback_t)(QmpClient *qmp, void *userdata); +typedef void (*pending_job_free_t)(void *userdata); + +typedef struct PendingJob { + pending_job_callback_t on_concluded; + pending_job_free_t free_userdata; + void *userdata; +} PendingJob; + +PendingJob* pending_job_free(PendingJob *j); +DEFINE_TRIVIAL_CLEANUP_FUNC(PendingJob *, pending_job_free); + +typedef enum VmspawnQmpFeatureFlags { + VMSPAWN_QMP_FEATURE_IO_URING = 1u << 0, + VMSPAWN_QMP_FEATURE_DISCARD_NO_UNREF = 1u << 1, +} VmspawnQmpFeatureFlags; + +typedef struct VmspawnQmpBridge { + QmpClient *qmp; + Hashmap *pending_jobs; /* job_id (string, owned) -> PendingJob* */ + VmspawnQmpFeatureFlags features; +} VmspawnQmpBridge; + +VmspawnQmpBridge* vmspawn_qmp_bridge_free(VmspawnQmpBridge *b); +DEFINE_TRIVIAL_CLEANUP_FUNC(VmspawnQmpBridge *, vmspawn_qmp_bridge_free); + +QmpClient* vmspawn_qmp_bridge_get_qmp(VmspawnQmpBridge *b); + +/* Phase 1: Connect to VMM backend. Returns an opaque bridge ready for device setup. */ +int vmspawn_qmp_init(VmspawnQmpBridge **ret, int fd, sd_event *event); + +/* Phase 1b: Feature probing. Fires one-shot QMP commands and drives the client + * synchronously until every reply has been delivered. Populates bridge->features. + * Must run before the device-setup phase; both io_uring and discard-no-unref flags + * are consumed by vmspawn_qmp_setup_drives(). */ +int vmspawn_qmp_probe_features(VmspawnQmpBridge *bridge); + +/* Phase 3: Resume vCPUs. All commands are async — responses arrive during sd_event_loop(). 
*/ +int vmspawn_qmp_start(VmspawnQmpBridge *bridge); + +int vmspawn_qmp_bridge_register_job( + VmspawnQmpBridge *b, + const char *job_id, + pending_job_callback_t on_concluded, + void *userdata, + pending_job_free_t free_userdata); + +typedef enum QmpDriveFlags { + QMP_DRIVE_BLOCK_DEVICE = 1u << 0, + QMP_DRIVE_READ_ONLY = 1u << 1, + QMP_DRIVE_DISCARD = 1u << 2, + QMP_DRIVE_NO_FLUSH = 1u << 3, + QMP_DRIVE_BOOT = 1u << 4, + QMP_DRIVE_IO_URING = 1u << 5, + QMP_DRIVE_DISCARD_NO_UNREF = 1u << 6, /* qcow2 only */ +} QmpDriveFlags; + +/* Drive info for QMP-based drive setup */ +typedef struct DriveInfo { + const char *path; /* kept for logging only — not passed to QEMU */ + const char *format; /* "raw" or "qcow2" */ + const char *disk_driver; /* "virtio-blk-pci", "scsi-hd", "scsi-cd", "nvme" */ + char *serial; /* owned */ + char *node_name; /* owned */ + char *pcie_port; /* owned: pcie-root-port id for device_add bus (NULL on non-PCIe) */ + int fd; /* pre-opened image fd (owned, -EBADF if unused) */ + int overlay_fd; /* pre-opened anonymous overlay fd for ephemeral (owned, -EBADF if unused) */ + QmpDriveFlags flags; +} DriveInfo; + +void drive_info_done(DriveInfo *info); + +typedef struct DriveInfos { + DriveInfo *drives; + size_t n_drives; + char *scsi_pcie_port; /* owned: pcie-root-port id for SCSI controller (NULL if no SCSI or non-PCIe) */ +} DriveInfos; + +void drive_infos_done(DriveInfos *infos); + +/* Network info for QMP-based network setup. Covers privileged TAP (by name), + * nsresourced TAP (by FD via getfd), and user-mode networking. The no-network + * case (-nic none) stays on the QEMU command line. 
*/ +typedef struct NetworkInfo { + const char *type; /* "tap" or "user" — points to a string literal */ + char *ifname; /* owned: TAP interface name (tap by name only, NULL if unset) */ + struct ether_addr mac; /* VM-side MAC address (tap only, valid iff mac_set) */ + bool mac_set; + char *pcie_port; /* owned: pcie-root-port id for device_add bus (NULL on non-PCIe) */ + int fd; /* TAP fd to pass via getfd (tap by fd only, -EBADF if unused) */ +} NetworkInfo; + +void network_info_done(NetworkInfo *info); + +/* Virtiofs device info for QMP-based chardev + device setup */ +typedef struct VirtiofsInfo { + char *id; /* owned: chardev and device id (e.g. "rootdir", "mnt0") */ + char *socket_path; /* owned: virtiofsd listen socket path */ + char *tag; /* owned: virtiofs mount tag visible to guest */ + char *pcie_port; /* owned: pcie-root-port id for device_add bus (NULL on non-PCIe) */ +} VirtiofsInfo; + +void virtiofs_info_done(VirtiofsInfo *info); + +typedef struct VirtiofsInfos { + VirtiofsInfo *entries; + size_t n_entries; +} VirtiofsInfos; + +void virtiofs_infos_done(VirtiofsInfos *infos); + +/* VSOCK device info for QMP-based setup via getfd + device_add */ +typedef struct VsockInfo { + int fd; /* vhost-vsock fd to pass via getfd (-EBADF if unused) */ + unsigned cid; /* guest CID */ + char *pcie_port; /* owned: pcie-root-port id for device_add bus (NULL on non-PCIe) */ +} VsockInfo; + +void vsock_info_done(VsockInfo *info); + +/* Aggregate of the per-device info structures populated before the bridge-based + * device setup phase. Keeps lifetime and cleanup of all device state in one place. */ +typedef struct MachineConfig { + DriveInfos drives; + NetworkInfo network; + VirtiofsInfos virtiofs; + VsockInfo vsock; +} MachineConfig; + +void machine_config_done(MachineConfig *c); + +/* Phase 2: Device setup — call any subset in any order before vmspawn_qmp_start(). 
*/ +int vmspawn_qmp_setup_drives(VmspawnQmpBridge *bridge, DriveInfos *drives); +int vmspawn_qmp_setup_network(VmspawnQmpBridge *bridge, NetworkInfo *network); +int vmspawn_qmp_setup_virtiofs(VmspawnQmpBridge *bridge, const VirtiofsInfos *virtiofs); +int vmspawn_qmp_setup_vsock(VmspawnQmpBridge *bridge, VsockInfo *vsock); diff --git a/src/vmspawn/vmspawn-varlink.c b/src/vmspawn/vmspawn-varlink.c new file mode 100644 index 00000000000..c73372e7a1a --- /dev/null +++ b/src/vmspawn/vmspawn-varlink.c @@ -0,0 +1,410 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "errno-util.h" +#include "hashmap.h" +#include "log.h" +#include "path-util.h" +#include "qmp-client.h" +#include "string-util.h" +#include "strv.h" +#include "varlink-io.systemd.MachineInstance.h" +#include "varlink-io.systemd.QemuMachineInstance.h" +#include "varlink-io.systemd.VirtualMachineInstance.h" +#include "varlink-util.h" +#include "vmspawn-varlink.h" + +DEFINE_PRIVATE_HASH_OPS_FULL( + varlink_subscriber_hash_ops, + void, trivial_hash_func, trivial_compare_func, sd_varlink_close_unref, + char*, strv_free); + +struct VmspawnVarlinkContext { + sd_varlink_server *varlink_server; + VmspawnQmpBridge *bridge; + /* Key: sd_varlink* (ref'd), Value: strv filter (NULL = all events). + * varlink_subscriber_hash_ops handles cleanup of both on removal. */ + Hashmap *subscribed; +}; + +/* Translate a QMP async completion into a varlink error reply */ +static int qmp_error_to_varlink(sd_varlink *link, const char *error_desc, int error) { + assert(link); + + if (ERRNO_IS_DISCONNECT(error)) + return sd_varlink_error(link, "io.systemd.MachineInstance.NotConnected", NULL); + if (error == -EIO) + log_warning("QMP command failed: %s", strna(error_desc)); + return sd_varlink_error_errno(link, error); +} + +/* Shared async completion for simple QMP commands that return no data. + * Errors are translated to varlink replies, not propagated through sd_event. 
*/ +static int on_qmp_simple_complete( + QmpClient *client, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + sd_varlink *link = ASSERT_PTR(userdata); + + assert(client); + + if (error == 0) + (void) sd_varlink_reply(link, NULL); + else + (void) qmp_error_to_varlink(link, error_desc, error); + + sd_varlink_unref(link); + return 0; +} + +static int qmp_execute_varlink_async( + VmspawnVarlinkContext *ctx, + sd_varlink *link, + const char *command, + sd_json_variant *arguments, + qmp_command_callback_t callback) { + + int r; + + sd_varlink_ref(link); + + r = qmp_client_invoke(ctx->bridge->qmp, command, QMP_CLIENT_ARGS(arguments), callback, link); + if (r < 0) + sd_varlink_unref(link); + + return r; +} + +static int qmp_execute_simple_async(sd_varlink *link, VmspawnVarlinkContext *ctx, const char *qmp_command) { + assert(link); + assert(ctx); + assert(qmp_command); + + return qmp_execute_varlink_async(ctx, link, qmp_command, /* arguments= */ NULL, on_qmp_simple_complete); +} + +static int vl_method_terminate(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + return qmp_execute_simple_async(link, ASSERT_PTR(userdata), "quit"); +} + +static int vl_method_pause(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + return qmp_execute_simple_async(link, ASSERT_PTR(userdata), "stop"); +} + +static int vl_method_resume(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + return qmp_execute_simple_async(link, ASSERT_PTR(userdata), "cont"); +} + +static int vl_method_power_off(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + return qmp_execute_simple_async(link, ASSERT_PTR(userdata), "system_powerdown"); +} + +static int vl_method_reboot(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + return 
qmp_execute_simple_async(link, ASSERT_PTR(userdata), "system_reset"); +} + +/* Async completion for query-status: extract running/status from QMP result */ +static int on_qmp_describe_complete( + QmpClient *client, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + _cleanup_(sd_varlink_unrefp) sd_varlink *link = ASSERT_PTR(userdata); + + assert(client); + + if (error != 0) { + (void) qmp_error_to_varlink(link, error_desc, error); + return 0; + } + + sd_json_variant *running = sd_json_variant_by_key(result, "running"); + sd_json_variant *status = sd_json_variant_by_key(result, "status"); + + (void) sd_varlink_replybo( + link, + SD_JSON_BUILD_PAIR_BOOLEAN("running", running ? sd_json_variant_boolean(running) : false), + SD_JSON_BUILD_PAIR_STRING("status", status && sd_json_variant_is_string(status) ? sd_json_variant_string(status) : "unknown")); + + return 0; +} + +static int vl_method_describe(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + VmspawnVarlinkContext *ctx = ASSERT_PTR(userdata); + + return qmp_execute_varlink_async(ctx, link, "query-status", /* arguments= */ NULL, on_qmp_describe_complete); +} + +static int vl_method_subscribe_events(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + VmspawnVarlinkContext *ctx = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **filter = NULL; + int r; + + /* SD_VARLINK_REQUIRES_MORE in the IDL rejects non-streaming callers before we get here */ + + r = sd_varlink_dispatch(link, parameters, (const sd_json_dispatch_field[]) { + { "filter", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_strv, 0, SD_JSON_NULLABLE }, + {}, + }, &filter); + if (r != 0) + return r; + + sd_varlink_ref(link); + + r = hashmap_ensure_put(&ctx->subscribed, &varlink_subscriber_hash_ops, link, filter); + if (r < 0) { + sd_varlink_unref(link); + return r; + } + + TAKE_PTR(filter); + + r = 
sd_varlink_notifybo(link, SD_JSON_BUILD_PAIR_STRING("event", "READY")); + if (r < 0) { + strv_free(hashmap_remove(ctx->subscribed, link)); + sd_varlink_close_unref(link); + return r; + } + + return 0; +} + +static int vl_method_acquire_qmp(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { + return sd_varlink_error_errno(link, -EOPNOTSUPP); +} + +static void vl_disconnect(sd_varlink_server *server, sd_varlink *link, void *userdata) { + VmspawnVarlinkContext *ctx = ASSERT_PTR(userdata); + + assert(server); + assert(link); + + /* Only subscribers hold an extra ref on the link (taken in vl_method_subscribe_events). + * Non-subscriber connections (one-shot commands like Pause, Describe) must not be unref'd + * here — their extra ref is consumed by the async completion callback. Only unref, never + * close — the server handles close after this callback returns (matching resolved's + * vl_on_notification_disconnect pattern). + * + * Use hashmap_remove2() so the returned key (non-NULL iff the entry was present) + * disambiguates "no filter subscriber" (value=NULL) from "not a subscriber". 
*/ + void *removed_key = NULL; + strv_free(hashmap_remove2(ctx->subscribed, link, &removed_key)); + if (!removed_key) + return; + + sd_varlink_unref(link); +} + +static int on_job_dismiss_complete( + QmpClient *client, + sd_json_variant *result, + const char *error_desc, + int error, + void *userdata) { + + if (error < 0) + log_debug_errno(error, "job-dismiss failed: %s", strna(error_desc)); + + return 0; +} + +static int dispatch_pending_job(VmspawnQmpBridge *bridge, sd_json_variant *data) { + const char *job_id, *status; + int r; + + assert(bridge); + + if (!data) + return 0; + + job_id = sd_json_variant_string(sd_json_variant_by_key(data, "id")); + status = sd_json_variant_string(sd_json_variant_by_key(data, "status")); + + if (!job_id || !streq_ptr(status, "concluded")) + return 0; + + _cleanup_free_ char *key = NULL; + _cleanup_(pending_job_freep) PendingJob *job = hashmap_remove2(bridge->pending_jobs, job_id, (void**) &key); + if (!job) + return 0; + + log_debug("QMP job '%s' concluded, firing continuation", job_id); + + /* Dismiss the concluded job before running the continuation */ + _cleanup_(sd_json_variant_unrefp) sd_json_variant *dismiss_args = NULL; + r = sd_json_buildo(&dismiss_args, SD_JSON_BUILD_PAIR_STRING("id", job_id)); + if (r < 0) + return sd_event_exit(qmp_client_get_event(bridge->qmp), r); + + r = qmp_client_invoke(bridge->qmp, "job-dismiss", QMP_CLIENT_ARGS(dismiss_args), + on_job_dismiss_complete, /* userdata= */ NULL); + if (r < 0) + return sd_event_exit(qmp_client_get_event(bridge->qmp), r); + + if (!job->on_concluded) + return 1; + + r = job->on_concluded(bridge->qmp, TAKE_PTR(job->userdata)); + if (r < 0) { + log_error_errno(r, "Job continuation failed: %m"); + return sd_event_exit(qmp_client_get_event(bridge->qmp), r); + } + + return 1; +} + +static int on_qmp_event( + QmpClient *client, + const char *event, + sd_json_variant *data, + void *userdata) { + + VmspawnVarlinkContext *ctx = ASSERT_PTR(userdata); + 
_cleanup_(sd_json_variant_unrefp) sd_json_variant *notification = NULL; + sd_varlink *link; + char **filter; + int r; + + assert(client); + assert(event); + + /* Dispatch job status changes to pending continuations (e.g. blockdev-create) */ + if (streq(event, "JOB_STATUS_CHANGE")) + return dispatch_pending_job(ctx->bridge, data); + + if (hashmap_isempty(ctx->subscribed)) + return 0; + + r = sd_json_buildo( + &notification, + SD_JSON_BUILD_PAIR_STRING("event", event), + SD_JSON_BUILD_PAIR_CONDITION(!!data, "data", SD_JSON_BUILD_VARIANT(data))); + if (r < 0) { + log_warning_errno(r, "Failed to build event notification, ignoring: %m"); + return 0; + } + + HASHMAP_FOREACH_KEY(filter, link, ctx->subscribed) { + if (filter && !strv_contains(filter, event)) + continue; + + r = sd_varlink_notify(link, notification); + if (r < 0) + log_warning_errno(r, "Failed to notify event subscriber, ignoring: %m"); + } + + return 0; +} + +/* Free all subscriber entries — varlink_subscriber_hash_ops handles + * close + unref for each key and strv_free for each value.
*/ +static void drain_event_subscribers(Hashmap **subscribed) { + assert(subscribed); + *subscribed = hashmap_free(*subscribed); +} + +static void on_qmp_disconnect(QmpClient *client, void *userdata) { + VmspawnVarlinkContext *ctx = ASSERT_PTR(userdata); + + assert(client); + + log_debug("Backend connection lost"); + + /* Propagate connection loss by closing all subscriber connections */ + drain_event_subscribers(&ctx->subscribed); +} + +int vmspawn_varlink_setup( + VmspawnVarlinkContext **ret, + VmspawnQmpBridge *bridge, + const char *runtime_dir, + char **ret_control_address) { + + _cleanup_(vmspawn_varlink_context_freep) VmspawnVarlinkContext *ctx = NULL; + _cleanup_free_ char *listen_address = NULL; + int r; + + assert(ret); + assert(bridge); + assert(runtime_dir); + + sd_event *event = qmp_client_get_event(bridge->qmp); + assert(event); + + ctx = new0(VmspawnVarlinkContext, 1); + if (!ctx) + return log_oom(); + + /* Create varlink server for VM control */ + r = varlink_server_new(&ctx->varlink_server, + SD_VARLINK_SERVER_INHERIT_USERDATA, + ctx); + if (r < 0) + return log_error_errno(r, "Failed to create varlink server: %m"); + + r = sd_varlink_server_add_interface_many( + ctx->varlink_server, + &vl_interface_io_systemd_MachineInstance, + &vl_interface_io_systemd_VirtualMachineInstance, + &vl_interface_io_systemd_QemuMachineInstance); + if (r < 0) + return log_error_errno(r, "Failed to add varlink interfaces: %m"); + + r = sd_varlink_server_bind_method_many( + ctx->varlink_server, + "io.systemd.MachineInstance.Terminate", vl_method_terminate, + "io.systemd.MachineInstance.PowerOff", vl_method_power_off, + "io.systemd.MachineInstance.Pause", vl_method_pause, + "io.systemd.MachineInstance.Resume", vl_method_resume, + "io.systemd.MachineInstance.Reboot", vl_method_reboot, + "io.systemd.MachineInstance.Describe", vl_method_describe, + "io.systemd.MachineInstance.SubscribeEvents", vl_method_subscribe_events, + "io.systemd.QemuMachineInstance.AcquireQMP", 
vl_method_acquire_qmp); + if (r < 0) + return log_error_errno(r, "Failed to bind varlink methods: %m"); + + r = sd_varlink_server_bind_disconnect(ctx->varlink_server, vl_disconnect); + if (r < 0) + return log_error_errno(r, "Failed to bind disconnect handler: %m"); + + listen_address = path_join(runtime_dir, "control"); + if (!listen_address) + return log_oom(); + + r = sd_varlink_server_listen_address(ctx->varlink_server, listen_address, 0600); + if (r < 0) + return log_error_errno(r, "Failed to listen on %s: %m", listen_address); + + r = sd_varlink_server_attach_event(ctx->varlink_server, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink server to event loop: %m"); + + ctx->bridge = bridge; + qmp_client_bind_event(ctx->bridge->qmp, on_qmp_event, ctx); + qmp_client_bind_disconnect(ctx->bridge->qmp, on_qmp_disconnect, ctx); + + log_debug("Varlink control server listening on %s", listen_address); + + if (ret_control_address) + *ret_control_address = TAKE_PTR(listen_address); + + *ret = TAKE_PTR(ctx); + return 0; +} + +VmspawnVarlinkContext* vmspawn_varlink_context_free(VmspawnVarlinkContext *ctx) { + if (!ctx) + return NULL; + + sd_varlink_server_unref(ctx->varlink_server); + vmspawn_qmp_bridge_free(ctx->bridge); + + drain_event_subscribers(&ctx->subscribed); + + return mfree(ctx); +} diff --git a/src/vmspawn/vmspawn-varlink.h b/src/vmspawn/vmspawn-varlink.h new file mode 100644 index 00000000000..1833416a56d --- /dev/null +++ b/src/vmspawn/vmspawn-varlink.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "cleanup-util.h" +#include "shared-forward.h" +#include "vmspawn-qmp.h" + +typedef struct VmspawnVarlinkContext VmspawnVarlinkContext; + +/* Varlink server for VM control on top of an established bridge connection. On success the returned context takes ownership of the bridge (vmspawn_varlink_context_free() frees it); on failure the bridge remains owned by the caller. */ +int vmspawn_varlink_setup( + VmspawnVarlinkContext **ret, + VmspawnQmpBridge *bridge, + const char *runtime_dir, + char **ret_control_address); +
+VmspawnVarlinkContext* vmspawn_varlink_context_free(VmspawnVarlinkContext *ctx); + +DEFINE_TRIVIAL_CLEANUP_FUNC(VmspawnVarlinkContext *, vmspawn_varlink_context_free);