git.ipfire.org Git - thirdparty/libvirt.git/commitdiff
qemu: fix potential hang in qemuMigrationSrcCancelUnattended during reconnect
author: Denis V. Lunev <den@openvz.org>
Wed, 22 Apr 2026 09:34:27 +0000 (11:34 +0200)
committer: Jiri Denemark <jdenemar@redhat.com>
Fri, 24 Apr 2026 10:20:52 +0000 (12:20 +0200)
When libvirtd reconnects to a running QEMU process that had an
in-progress migration, qemuProcessReconnect first connects the
monitor and only later recovers the migration job. During this window
the async job is VIR_ASYNC_JOB_NONE, so any MIGRATION status events
from QEMU are silently dropped by qemuProcessHandleMigrationStatus.

If the migration was already cancelled or completed by QEMU during
this window, no further events will be emitted. When
qemuMigrationSrcCancelUnattended later restores the async job and
calls qemuMigrationSrcCancel with wait=true, the wait loop calls
qemuDomainObjWait (virCondWait with no timeout) and blocks forever
waiting for an event that will never arrive.

qemuProcessRecoverMigration already queries QEMU for the current
migration state via qemuMigrationAnyRefreshStatus and passes the
result to qemuProcessRecoverMigrationOut as migStatus. Plumb that
value one level further into qemuMigrationSrcCancelUnattended and,
when it indicates the migration has already reached a terminal
state (VIR_DOMAIN_JOB_STATUS_CANCELED), skip restoring the async
job and the qemuMigrationSrcCancel/virDomainObjEndAsyncJob pair
entirely.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Suggested-by: Jiri Denemark <jdenemar@redhat.com>
CC: Peter Krempa <pkrempa@redhat.com>
CC: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: Jiri Denemark <jdenemar@redhat.com>
src/qemu/qemu_migration.c
src/qemu/qemu_migration.h
src/qemu/qemu_process.c

index 33cc0f0ffe90913e1ce6d81a14077bef63f5528b..ffffeea75cd0df3e07358b4b24a7e512796d7b57 100644 (file)
@@ -7390,7 +7390,8 @@ qemuMigrationSrcToFile(virDomainObj *vm,
  */
 int
 qemuMigrationSrcCancelUnattended(virDomainObj *vm,
-                                 virDomainJobObj *oldJob)
+                                 virDomainJobObj *oldJob,
+                                 virDomainJobStatus migStatus)
 {
     bool storage = false;
     size_t i;
@@ -7398,25 +7399,35 @@ qemuMigrationSrcCancelUnattended(virDomainObj *vm,
     VIR_DEBUG("Canceling unfinished outgoing migration of domain %s",
               vm->def->name);
 
-    /* Make sure MIGRATION event handler can store the current migration state
-     * in the job.
+    /* If QEMU has already reached a terminal state during the reconnect
+     * gap, skip restoring the async job and issuing migrate_cancel: QEMU
+     * won't emit any further MIGRATION events, so the wait loop in
+     * qemuMigrationSrcCancel would block forever. The migStatus passed in
+     * comes from the query-migrate call in qemuProcessRecoverMigration,
+     * which is authoritative for the state QEMU reached while no libvirtd
+     * was attached.
      */
-    if (!vm->job->current) {
-        qemuDomainObjRestoreAsyncJob(vm, VIR_ASYNC_JOB_MIGRATION_OUT,
-                                     oldJob->phase, oldJob->asyncStarted,
-                                     VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT,
-                                     QEMU_DOMAIN_JOB_STATS_TYPE_MIGRATION,
-                                     VIR_DOMAIN_JOB_STATUS_FAILED,
-                                     VIR_JOB_NONE);
-    }
-
-    /* We're inside a MODIFY job and the restored MIGRATION_OUT async job is
-     * used only for processing migration events from QEMU. Thus we don't want
-     * to start a nested job for talking to QEMU.
-     */
-    qemuMigrationSrcCancel(vm, VIR_ASYNC_JOB_NONE, true);
+    if (migStatus != VIR_DOMAIN_JOB_STATUS_CANCELED) {
+        /* Make sure MIGRATION event handler can store the current migration
+         * state in the job.
+         */
+        if (!vm->job->current) {
+            qemuDomainObjRestoreAsyncJob(vm, VIR_ASYNC_JOB_MIGRATION_OUT,
+                                         oldJob->phase, oldJob->asyncStarted,
+                                         VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT,
+                                         QEMU_DOMAIN_JOB_STATS_TYPE_MIGRATION,
+                                         VIR_DOMAIN_JOB_STATUS_FAILED,
+                                         VIR_JOB_NONE);
+        }
+
+        /* We're inside a MODIFY job and the restored MIGRATION_OUT async job is
+         * used only for processing migration events from QEMU. Thus we don't
+         * want to start a nested job for talking to QEMU.
+         */
+        qemuMigrationSrcCancel(vm, VIR_ASYNC_JOB_NONE, true);
 
-    virDomainObjEndAsyncJob(vm);
+        virDomainObjEndAsyncJob(vm);
+    }
 
     for (i = 0; i < vm->def->ndisks; i++) {
         virDomainDiskDef *disk = vm->def->disks[i];
index ef6a1563a03cee298181264e720e8582b3d7eba0..59f32d2ebf4e1ad9334e768e7de9e4f8b2e0ae99 100644 (file)
@@ -253,7 +253,8 @@ qemuMigrationSrcToFile(virDomainObj *vm,
 
 int
 qemuMigrationSrcCancelUnattended(virDomainObj *vm,
-                                 virDomainJobObj *oldJob);
+                                 virDomainJobObj *oldJob,
+                                 virDomainJobStatus migStatus);
 
 int
 qemuMigrationSrcCancel(virDomainObj *vm,
index 7ebc038e54e50dd4434f200d17f0bd31dc76d20c..a6d33f67466927efc986e033cb74bfbb6a980173 100644 (file)
@@ -3798,7 +3798,7 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver,
          */
         VIR_DEBUG("Cancelling unfinished migration of domain %s",
                   vm->def->name);
-        if (qemuMigrationSrcCancelUnattended(vm, job) < 0) {
+        if (qemuMigrationSrcCancelUnattended(vm, job, migStatus) < 0) {
             VIR_WARN("Could not cancel ongoing migration of domain %s",
                      vm->def->name);
         }