]> git.ipfire.org Git - thirdparty/qemu.git/commitdiff
migration: Refactor all incoming cleanup into migration_incoming_state_destroy()
author: Juraj Marcin <jmarcin@redhat.com>
Mon, 3 Nov 2025 18:32:54 +0000 (19:32 +0100)
committer: Peter Xu <peterx@redhat.com>
Mon, 3 Nov 2025 21:04:10 +0000 (16:04 -0500)
Currently, there are two functions that are responsible for calling the
cleanup of the incoming migration state. With successful precopy, it's
the incoming migration coroutine, and with successful postcopy it's the
postcopy listen thread. However, if postcopy fails during the device
load, both functions will try to do the cleanup.

This patch refactors all cleanup that needs to be done on the incoming
side into a common function and defines a clear boundary for who is
responsible for the cleanup. The incoming migration coroutine is
responsible for calling the cleanup function, unless the listen thread
has been started, in which case the postcopy listen thread runs the
incoming migration cleanup in its BH.

Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
Fixes: 9535435795 ("migration: push Error **errp into qemu_loadvm_state()")
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/20251103183301.3840862-6-jmarcin@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
migration/migration.c
migration/migration.h
migration/postcopy-ram.c
migration/trace-events

index 9a367f717eed0e7644e3619f256f703610386a34..637be71bfe8b1f22be076bba2578bfee08531411 100644 (file)
@@ -438,10 +438,15 @@ void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
 
 void migration_incoming_state_destroy(void)
 {
-    struct MigrationIncomingState *mis = migration_incoming_get_current();
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    PostcopyState ps = postcopy_state_get();
 
     multifd_recv_cleanup();
 
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        postcopy_incoming_cleanup(mis);
+    }
+
     /*
      * RAM state cleanup needs to happen after multifd cleanup, because
      * multifd threads can use some of its states (receivedmap).
@@ -866,7 +871,6 @@ process_incoming_migration_co(void *opaque)
 {
     MigrationState *s = migrate_get_current();
     MigrationIncomingState *mis = migration_incoming_get_current();
-    PostcopyState ps;
     int ret;
     Error *local_err = NULL;
 
@@ -883,25 +887,14 @@ process_incoming_migration_co(void *opaque)
 
     trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
 
-    ps = postcopy_state_get();
-    trace_process_incoming_migration_co_end(ret, ps);
-    if (ps != POSTCOPY_INCOMING_NONE) {
-        if (ps == POSTCOPY_INCOMING_ADVISE) {
-            /*
-             * Where a migration had postcopy enabled (and thus went to advise)
-             * but managed to complete within the precopy period, we can use
-             * the normal exit.
-             */
-            postcopy_incoming_cleanup(mis);
-        } else if (ret >= 0) {
-            /*
-             * Postcopy was started, cleanup should happen at the end of the
-             * postcopy thread.
-             */
-            trace_process_incoming_migration_co_postcopy_end_main();
-            goto out;
-        }
-        /* Else if something went wrong then just fall out of the normal exit */
+    trace_process_incoming_migration_co_end(ret);
+    if (mis->have_listen_thread) {
+        /*
+         * Postcopy was started, cleanup should happen at the end of the
+         * postcopy listen thread.
+         */
+        trace_process_incoming_migration_co_postcopy_end_main();
+        goto out;
     }
 
     if (ret < 0) {
@@ -933,15 +926,6 @@ fail:
         }
 
         exit(EXIT_FAILURE);
-    } else {
-        /*
-         * Report the error here in case that QEMU abruptly exits
-         * when postcopy is enabled.
-         */
-        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
-            error_report_err(s->error);
-            s->error = NULL;
-        }
     }
 out:
     /* Pairs with the refcount taken in qmp_migrate_incoming() */
index 01329bf824893dff9006f836c688f3dfee08e62e..4a37f7202c1d030629d7d9a45de37ba027eea98d 100644 (file)
@@ -254,6 +254,7 @@ struct MigrationIncomingState {
 MigrationIncomingState *migration_incoming_get_current(void);
 void migration_incoming_state_destroy(void);
 void migration_incoming_transport_cleanup(MigrationIncomingState *mis);
+void migration_incoming_qemu_exit(void);
 /*
  * Functions to work with blocktime context
  */
index b47c955763a65bd8ee3c21a7c51f01e3f4b31e1b..48cbb46c278b73cddd93635f4bb19bc0a5073f78 100644 (file)
@@ -2078,6 +2078,24 @@ bool postcopy_is_paused(MigrationStatus status)
         status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP;
 }
 
+static void postcopy_listen_thread_bh(void *opaque)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    migration_incoming_state_destroy();
+
+    if (mis->state == MIGRATION_STATUS_FAILED) {
+        /*
+         * If something went wrong then we have a bad state so exit;
+         * we only could have gotten here if something failed before
+         * POSTCOPY_INCOMING_RUNNING (for example device load), otherwise
+         * postcopy migration would pause inside qemu_loadvm_state_main().
+         * Failing dirty-bitmaps won't fail the whole migration.
+         */
+        exit(1);
+    }
+}
+
 /*
  * Triggered by a postcopy_listen command; this thread takes over reading
  * the input stream, leaving the main thread free to carry on loading the rest
@@ -2131,53 +2149,38 @@ static void *postcopy_listen_thread(void *opaque)
                          "bitmaps are correctly migrated and valid.",
                          __func__, load_res, error_get_pretty(local_err));
             g_clear_pointer(&local_err, error_free);
-            load_res = 0; /* prevent further exit() */
         } else {
+            /*
+             * Something went fatally wrong and we have a bad state, QEMU will
+             * exit depending on if postcopy-exit-on-error is true, but the
+             * migration cannot be recovered.
+             */
             error_prepend(&local_err,
                           "loadvm failed during postcopy: %d: ", load_res);
             migrate_set_error(migr, local_err);
             g_clear_pointer(&local_err, error_report_err);
             migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                            MIGRATION_STATUS_FAILED);
+            goto out;
         }
     }
-    if (load_res >= 0) {
-        /*
-         * This looks good, but it's possible that the device loading in the
-         * main thread hasn't finished yet, and so we might not be in 'RUN'
-         * state yet; wait for the end of the main thread.
-         */
-        qemu_event_wait(&mis->main_thread_load_event);
-    }
-    postcopy_incoming_cleanup(mis);
-
-    if (load_res < 0) {
-        /*
-         * If something went wrong then we have a bad state so exit;
-         * depending how far we got it might be possible at this point
-         * to leave the guest running and fire MCEs for pages that never
-         * arrived as a desperate recovery step.
-         */
-        rcu_unregister_thread();
-        exit(EXIT_FAILURE);
-    }
+    /*
+     * This looks good, but it's possible that the device loading in the
+     * main thread hasn't finished yet, and so we might not be in 'RUN'
+     * state yet; wait for the end of the main thread.
+     */
+    qemu_event_wait(&mis->main_thread_load_event);
 
     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                    MIGRATION_STATUS_COMPLETED);
-    /*
-     * If everything has worked fine, then the main thread has waited
-     * for us to start, and we're the last use of the mis.
-     * (If something broke then qemu will have to exit anyway since it's
-     * got a bad migration state).
-     */
-    bql_lock();
-    migration_incoming_state_destroy();
-    bql_unlock();
 
+out:
     rcu_unregister_thread();
     mis->have_listen_thread = false;
     postcopy_state_set(POSTCOPY_INCOMING_END);
 
+    migration_bh_schedule(postcopy_listen_thread_bh, NULL);
+
     object_unref(OBJECT(migr));
 
     return NULL;
index e8edd1fbbadf4c53f343296cff1b2914fbd5ce4a..772636f3ac72c4b70064df40cc4ed569c0bc3e2d 100644 (file)
@@ -193,7 +193,7 @@ source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
 source_return_path_thread_switchover_acked(void) ""
 migration_thread_low_pending(uint64_t pending) "%" PRIu64
 migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64
-process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
+process_incoming_migration_co_end(int ret) "ret=%d"
 process_incoming_migration_co_postcopy_end_main(void) ""
 postcopy_preempt_enabled(bool value) "%d"
 migration_precopy_complete(void) ""