migration/postcopy: allow blocktime to track / report non-vCPU faults

author Peter Xu <peterx@redhat.com>

Fri, 13 Jun 2025 14:12:16 +0000 (10:12 -0400)

committer Fabiano Rosas <farosas@suse.de>

Fri, 11 Jul 2025 13:37:39 +0000 (10:37 -0300)
author Peter Xu <peterx@redhat.com>
Fri, 13 Jun 2025 14:12:16 +0000 (10:12 -0400)
committer Fabiano Rosas <farosas@suse.de>
Fri, 11 Jul 2025 13:37:39 +0000 (10:37 -0300)
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c

index 8b3846dab543f8ffc4754f104a4df016d960de74..e1f953052053bf7939aefa7bb0e12eb5cf4dc288 100644 (file)
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -80,6 +80,11 @@ static void migration_dump_blocktime(Monitor *mon, MigrationInfo *info)
                         info->postcopy_latency);
      }
  
+    if (info->has_postcopy_non_vcpu_latency) {
+        monitor_printf(mon, "Postcopy non-vCPU Latencies (ns): %" PRIu64 "\n",
+                       info->postcopy_non_vcpu_latency);
+    }
+
      if (info->has_postcopy_vcpu_latency) {
          uint64List *item = info->postcopy_vcpu_latency;
          const char *sep = "";
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c

index 91c23b446edaa9898d9bad3ba4643665f23540b0..f4cb23b3e0b22ab9d832fb721329081710c47738 100644 (file)
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -192,6 +192,8 @@ typedef struct PostcopyBlocktimeContext {
      GHashTable *tid_to_vcpu_hash;
      /* Count of non-vCPU faults.  This is only for debugging purpose. */
      uint64_t non_vcpu_faults;
+    /* total blocktime when a non-vCPU thread is stopped */
+    uint64_t non_vcpu_blocktime_total;
  
      /*
       * Handler for exit event, necessary for
@@ -203,7 +205,10 @@ typedef struct PostcopyBlocktimeContext {
  typedef struct {
      /* The time the fault was triggered */
      uint64_t fault_time;
-    /* The vCPU index that was blocked */
+    /*
+     * The vCPU index that was blocked.  When cpu==-1, the fault came
+     * from a non-vCPU thread rather than a vCPU.
+     */
      int cpu;
  } BlocktimeVCPUEntry;
  
@@ -344,6 +349,12 @@ void fill_destination_postcopy_migration_info(MigrationInfo *info)
          QAPI_LIST_PREPEND(list_latency, latency);
      }
  
+    latency_total += bc->non_vcpu_blocktime_total;
+    faults += bc->non_vcpu_faults;
+
+    info->has_postcopy_non_vcpu_latency = true;
+    info->postcopy_non_vcpu_latency = bc->non_vcpu_faults ?
+        (bc->non_vcpu_blocktime_total / bc->non_vcpu_faults) : 0;
      info->has_postcopy_blocktime = true;
      /* Convert ns -> ms */
      info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS);
@@ -983,7 +994,10 @@ static uint64_t get_current_ns(void)
      return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
  }
  
-/* Inject an (cpu, fault_time) entry into the database, using addr as key */
+/*
+ * Inject a (cpu, fault_time) entry into the database, using addr as key.
+ * When cpu==-1, it means it's a non-vCPU fault.
+ */
  static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx,
                                     uintptr_t addr, int cpu, uint64_t time)
  {
@@ -1066,9 +1080,17 @@ void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
          /* Making sure it won't overflow - it really should never! */
          assert(dc->vcpu_faults_current[cpu] <= 255);
      } else {
-        /* We do not support non-vCPU thread tracking yet */
+        /*
+         * For non-vCPU thread faults, we don't care about tid or cpu index
+         * or time the thread is blocked (e.g., a kworker trying to help
+         * KVM when async_pf=on is OK to be blocked and not affect guest
+         * responsiveness), but we care about latency.  Track it with
+         * cpu=-1.
+         *
+         * Note that this will NOT affect blocktime reports on vCPUs being
+         * blocked; it only affects the system-wide latency reports.
+         */
          dc->non_vcpu_faults++;
-        return;
      }
  
      blocktime_fault_inject(dc, addr, cpu, current);
@@ -1078,6 +1100,7 @@ typedef struct {
      PostcopyBlocktimeContext *ctx;
      uint64_t current;
      int affected_cpus;
+    int affected_non_cpus;
  } BlockTimeVCPUIter;
  
  static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
@@ -1085,6 +1108,7 @@ static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
      BlockTimeVCPUIter *iter = user_data;
      PostcopyBlocktimeContext *ctx = iter->ctx;
      BlocktimeVCPUEntry *entry = data;
+    uint64_t time_passed;
      int cpu = entry->cpu;
  
      /*
@@ -1092,17 +1116,27 @@ static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
       * later than when it was faulted.
       */
      assert(iter->current >= entry->fault_time);
+    time_passed = iter->current - entry->fault_time;
  
-    /*
-     * If we resolved all pending faults on one vCPU due to this page
-     * resolution, take a note.
-     */
-    if (--ctx->vcpu_faults_current[cpu] == 0) {
-        ctx->vcpu_blocktime_total[cpu] += iter->current - entry->fault_time;
-        iter->affected_cpus += 1;
+    if (cpu >= 0) {
+        /*
+         * If we resolved all pending faults on one vCPU due to this page
+         * resolution, take a note.
+         */
+        if (--ctx->vcpu_faults_current[cpu] == 0) {
+            ctx->vcpu_blocktime_total[cpu] += time_passed;
+            iter->affected_cpus += 1;
+        }
+        trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
+    } else {
+        iter->affected_non_cpus++;
+        ctx->non_vcpu_blocktime_total += time_passed;
+        /*
+         * We do not track how many non-vCPU faults are pending, because
+         * we do not care about their blocktime, only their latency.
+         */
+        trace_postcopy_blocktime_end_one(-1, 0);
      }
-
-    trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
  }
  
  /*
@@ -1141,6 +1175,7 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
      BlockTimeVCPUIter iter = {
          .current = get_current_ns(),
          .affected_cpus = 0,
+        .affected_non_cpus = 0,
          .ctx = dc,
      };
      gpointer key = (gpointer)addr;
@@ -1174,7 +1209,8 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
      }
      dc->smp_cpus_down -= iter.affected_cpus;
  
-    trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus);
+    trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus,
+                                 iter.affected_non_cpus);
  }
  
  static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
diff --git a/migration/trace-events b/migration/trace-events

index a36a78f01ae2bb1b48f90cc02c4605fd670439a5..706db97def9c524b6d8f030ca9f25574d2ec60d7 100644 (file)
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -310,7 +310,7 @@ postcopy_preempt_thread_entry(void) ""
  postcopy_preempt_thread_exit(void) ""
  postcopy_blocktime_tid_cpu_map(int cpu, uint32_t tid) "cpu: %d, tid: %u"
  postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu, bool exists) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d, exist: %d"
-postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d"
+postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu, int affected_non_cpus) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d, affected_non_cpus: %d"
  postcopy_blocktime_end_one(int cpu, uint8_t left_faults) "cpu: %d, left_faults: %" PRIu8
  
  # exec.c
diff --git a/qapi/migration.json b/qapi/migration.json

index bb41dc0795e436c68dccb209bb06df41e49ef765..66fb8ac74d0ec075484b8c1b0ae80c1d54ee0e65 100644 (file)
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -247,6 +247,12 @@
  #     this is the per-vCPU statistics.  This is only present when the
  #     postcopy-blocktime migration capability is enabled.  (Since 10.1)
  #
+# @postcopy-non-vcpu-latency: average remote page fault latency for all
+#     faults that happened in non-vCPU threads (in ns).  It has the
+#     same definition as @postcopy-latency, but only covers non-vCPU
+#     migration capability is enabled.  (Since 10.1)
+#
  # @socket-address: Only used for tcp, to know what the real port is
  #     (Since 4.0)
  #
@@ -273,8 +279,8 @@
  #
  # Features:
  #
-# @unstable: Members @postcopy-latency, @postcopy-vcpu-latency are
-#     experimental.
+# @unstable: Members @postcopy-latency, @postcopy-vcpu-latency,
+#     @postcopy-non-vcpu-latency are experimental.
  #
  # Since: 0.14
  ##
@@ -295,6 +301,8 @@
                 'type': 'uint64', 'features': [ 'unstable' ] },
             '*postcopy-vcpu-latency': {
                 'type': ['uint64'], 'features': [ 'unstable' ] },
+           '*postcopy-non-vcpu-latency': {
+               'type': 'uint64', 'features': [ 'unstable' ] },
             '*socket-address': ['SocketAddress'],
             '*dirty-limit-throttle-time-per-round': 'uint64',
             '*dirty-limit-ring-full-time': 'uint64'} }
diff --git a/tests/qtest/migration/migration-qmp.c b/tests/qtest/migration/migration-qmp.c

index 1a5ab2d229a0cd56fefcf0abf82da95d7e8455a6..67a67d4bd6cc59e3605119dd042e9b576bd58f42 100644 (file)
--- a/tests/qtest/migration/migration-qmp.c
+++ b/tests/qtest/migration/migration-qmp.c
@@ -361,6 +361,7 @@ void read_blocktime(QTestState *who)
      g_assert(qdict_haskey(rsp_return, "postcopy-vcpu-blocktime"));
      g_assert(qdict_haskey(rsp_return, "postcopy-latency"));
      g_assert(qdict_haskey(rsp_return, "postcopy-vcpu-latency"));
+    g_assert(qdict_haskey(rsp_return, "postcopy-non-vcpu-latency"));
      qobject_unref(rsp_return);
  }
author	Peter Xu <peterx@redhat.com>
	Fri, 13 Jun 2025 14:12:16 +0000 (10:12 -0400)
committer	Fabiano Rosas <farosas@suse.de>
	Fri, 11 Jul 2025 13:37:39 +0000 (10:37 -0300)
migration/migration-hmp-cmds.c		patch \| blob \| blame \| history
migration/postcopy-ram.c		patch \| blob \| blame \| history
migration/trace-events		patch \| blob \| blame \| history
qapi/migration.json		patch \| blob \| blame \| history
tests/qtest/migration/migration-qmp.c		patch \| blob \| blame \| history