migration: Refactor all incoming cleanup info migration_incoming_destroy()

author Juraj Marcin <jmarcin@redhat.com>

Mon, 3 Nov 2025 18:32:54 +0000 (19:32 +0100)

committer Peter Xu <peterx@redhat.com>

Mon, 3 Nov 2025 21:04:10 +0000 (16:04 -0500)
author Juraj Marcin <jmarcin@redhat.com>
Mon, 3 Nov 2025 18:32:54 +0000 (19:32 +0100)
committer Peter Xu <peterx@redhat.com>
Mon, 3 Nov 2025 21:04:10 +0000 (16:04 -0500)
diff --git a/migration/migration.c b/migration/migration.c

index 9a367f717eed0e7644e3619f256f703610386a34..637be71bfe8b1f22be076bba2578bfee08531411 100644 (file)
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -438,10 +438,15 @@ void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
  
  void migration_incoming_state_destroy(void)
  {
-    struct MigrationIncomingState *mis = migration_incoming_get_current();
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    PostcopyState ps = postcopy_state_get();
  
      multifd_recv_cleanup();
  
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        postcopy_incoming_cleanup(mis);
+    }
+
      /*
       * RAM state cleanup needs to happen after multifd cleanup, because
       * multifd threads can use some of its states (receivedmap).
@@ -866,7 +871,6 @@ process_incoming_migration_co(void *opaque)
  {
      MigrationState *s = migrate_get_current();
      MigrationIncomingState *mis = migration_incoming_get_current();
-    PostcopyState ps;
      int ret;
      Error *local_err = NULL;
  
@@ -883,25 +887,14 @@ process_incoming_migration_co(void *opaque)
  
      trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
  
-    ps = postcopy_state_get();
-    trace_process_incoming_migration_co_end(ret, ps);
-    if (ps != POSTCOPY_INCOMING_NONE) {
-        if (ps == POSTCOPY_INCOMING_ADVISE) {
-            /*
-             * Where a migration had postcopy enabled (and thus went to advise)
-             * but managed to complete within the precopy period, we can use
-             * the normal exit.
-             */
-            postcopy_incoming_cleanup(mis);
-        } else if (ret >= 0) {
-            /*
-             * Postcopy was started, cleanup should happen at the end of the
-             * postcopy thread.
-             */
-            trace_process_incoming_migration_co_postcopy_end_main();
-            goto out;
-        }
-        /* Else if something went wrong then just fall out of the normal exit */
+    trace_process_incoming_migration_co_end(ret);
+    if (mis->have_listen_thread) {
+        /*
+         * Postcopy was started, cleanup should happen at the end of the
+         * postcopy listen thread.
+         */
+        trace_process_incoming_migration_co_postcopy_end_main();
+        goto out;
      }
  
      if (ret < 0) {
@@ -933,15 +926,6 @@ fail:
          }
  
          exit(EXIT_FAILURE);
-    } else {
-        /*
-         * Report the error here in case that QEMU abruptly exits
-         * when postcopy is enabled.
-         */
-        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
-            error_report_err(s->error);
-            s->error = NULL;
-        }
      }
  out:
      /* Pairs with the refcount taken in qmp_migrate_incoming() */
diff --git a/migration/migration.h b/migration/migration.h

index 01329bf824893dff9006f836c688f3dfee08e62e..4a37f7202c1d030629d7d9a45de37ba027eea98d 100644 (file)
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -254,6 +254,7 @@ struct MigrationIncomingState {
  MigrationIncomingState *migration_incoming_get_current(void);
  void migration_incoming_state_destroy(void);
  void migration_incoming_transport_cleanup(MigrationIncomingState *mis);
+void migration_incoming_qemu_exit(void);
  /*
   * Functions to work with blocktime context
   */
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c

index b47c955763a65bd8ee3c21a7c51f01e3f4b31e1b..48cbb46c278b73cddd93635f4bb19bc0a5073f78 100644 (file)
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -2078,6 +2078,24 @@ bool postcopy_is_paused(MigrationStatus status)
          status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP;
  }
  
+static void postcopy_listen_thread_bh(void *opaque)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    migration_incoming_state_destroy();
+
+    if (mis->state == MIGRATION_STATUS_FAILED) {
+        /*
+         * If something went wrong then we have a bad state so exit;
+         * we only could have gotten here if something failed before
+         * POSTCOPY_INCOMING_RUNNING (for example device load), otherwise
+         * postcopy migration would pause inside qemu_loadvm_state_main().
+         * Failing dirty-bitmaps won't fail the whole migration.
+         */
+        exit(1);
+    }
+}
+
  /*
   * Triggered by a postcopy_listen command; this thread takes over reading
   * the input stream, leaving the main thread free to carry on loading the rest
@@ -2131,53 +2149,38 @@ static void *postcopy_listen_thread(void *opaque)
                           "bitmaps are correctly migrated and valid.",
                           __func__, load_res, error_get_pretty(local_err));
              g_clear_pointer(&local_err, error_free);
-            load_res = 0; /* prevent further exit() */
          } else {
+            /*
+             * Something went fatally wrong and we have a bad state, QEMU will
+             * exit depending on if postcopy-exit-on-error is true, but the
+             * migration cannot be recovered.
+             */
              error_prepend(&local_err,
                            "loadvm failed during postcopy: %d: ", load_res);
              migrate_set_error(migr, local_err);
              g_clear_pointer(&local_err, error_report_err);
              migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                             MIGRATION_STATUS_FAILED);
+            goto out;
          }
      }
-    if (load_res >= 0) {
-        /*
-         * This looks good, but it's possible that the device loading in the
-         * main thread hasn't finished yet, and so we might not be in 'RUN'
-         * state yet; wait for the end of the main thread.
-         */
-        qemu_event_wait(&mis->main_thread_load_event);
-    }
-    postcopy_incoming_cleanup(mis);
-
-    if (load_res < 0) {
-        /*
-         * If something went wrong then we have a bad state so exit;
-         * depending how far we got it might be possible at this point
-         * to leave the guest running and fire MCEs for pages that never
-         * arrived as a desperate recovery step.
-         */
-        rcu_unregister_thread();
-        exit(EXIT_FAILURE);
-    }
+    /*
+     * This looks good, but it's possible that the device loading in the
+     * main thread hasn't finished yet, and so we might not be in 'RUN'
+     * state yet; wait for the end of the main thread.
+     */
+    qemu_event_wait(&mis->main_thread_load_event);
  
      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                     MIGRATION_STATUS_COMPLETED);
-    /*
-     * If everything has worked fine, then the main thread has waited
-     * for us to start, and we're the last use of the mis.
-     * (If something broke then qemu will have to exit anyway since it's
-     * got a bad migration state).
-     */
-    bql_lock();
-    migration_incoming_state_destroy();
-    bql_unlock();
  
+out:
      rcu_unregister_thread();
      mis->have_listen_thread = false;
      postcopy_state_set(POSTCOPY_INCOMING_END);
  
+    migration_bh_schedule(postcopy_listen_thread_bh, NULL);
+
      object_unref(OBJECT(migr));
  
      return NULL;
diff --git a/migration/trace-events b/migration/trace-events

index e8edd1fbbadf4c53f343296cff1b2914fbd5ce4a..772636f3ac72c4b70064df40cc4ed569c0bc3e2d 100644 (file)
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -193,7 +193,7 @@ source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
  source_return_path_thread_switchover_acked(void) ""
  migration_thread_low_pending(uint64_t pending) "%" PRIu64
  migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64
-process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
+process_incoming_migration_co_end(int ret) "ret=%d"
  process_incoming_migration_co_postcopy_end_main(void) ""
  postcopy_preempt_enabled(bool value) "%d"
  migration_precopy_complete(void) ""
author	Juraj Marcin <jmarcin@redhat.com>
	Mon, 3 Nov 2025 18:32:54 +0000 (19:32 +0100)
committer	Peter Xu <peterx@redhat.com>
	Mon, 3 Nov 2025 21:04:10 +0000 (16:04 -0500)
migration/migration.c		patch \| blob \| blame \| history
migration/migration.h		patch \| blob \| blame \| history
migration/postcopy-ram.c		patch \| blob \| blame \| history
migration/trace-events		patch \| blob \| blame \| history