/* All the time records are in unit of nanoseconds */
typedef struct PostcopyBlocktimeContext {
- /* time when page fault initiated per vCPU */
- uint64_t *vcpu_blocktime_start;
/* blocktime per vCPU */
uint64_t *vcpu_blocktime_total;
/* count of faults per vCPU */
uint64_t *vcpu_faults_count;
- /* page address per vCPU */
- uintptr_t *vcpu_addr;
+ /*
+ * count of currently blocked faults per vCPU.
+ *
+ * NOTE: Normally there should only be one fault in progress per vCPU
+ * thread, so logically it _seems_ vcpu_faults_current[] for any vCPU
+ * should be either zero or one. However, there are reasons we can see
+ * >1 faults on the same vCPU thread.
+ *
+ * CASE (1): since fault resolution (e.g. ioctl(UFFDIO_COPY)) is done
+ * before taking the mutex that protects the blocktime context, it can
+ * happen that we read more than one faulted address per vCPU.
+ *
+ * One example where we can see more than one faulted address for one
+ * vCPU:
+ *
+ *    vcpu1 thread          fault thread             resolve thread
+ *    ============          ============             ==============
+ *
+ *    faulted on addr1
+ *                          read uffd msg (addr1)
+ *                          MUTEX_LOCK
+ *                          add entry (cpu1, addr1)
+ *                          MUTEX_UNLOCK
+ *                          request remote fault (addr1)
+ *                                                   resolve fault (addr1)
+ *    addr1 resolved, continue..
+ *    faulted on addr2
+ *                          read uffd msg (addr2)
+ *                          MUTEX_LOCK
+ *                          add entry (cpu1, addr2) <--------------- [A]
+ *                          MUTEX_UNLOCK
+ *                                                   MUTEX_LOCK
+ *                                                   remove entry (cpu1, addr1)
+ *                                                   MUTEX_UNLOCK
+ *
+ * In the above case, we may see both the (cpu1, addr1) and (cpu1, addr2)
+ * entries at [A], because the fault thread took the lock before the
+ * resolve thread. Use this counter to track such cases; only when it
+ * reaches zero do we know the vCPU is not blocked anymore.
+ *
+ * CASE (2): theoretically (the author admits to not having verified
+ * this..), one vCPU thread can also generate more than one userfaultfd
+ * message on the same address, e.g. if for whatever reason the fault
+ * got retried before a resolution arrived. In that extremely rare case,
+ * we could also see two (cpu1, addr1) entries.
+ *
+ * In all cases, this array keeps us prepared for such re-entrancy.
+ *
+ * A uint8_t should be more than enough for now. For example, with only
+ * one resolve thread (the postcopy ram listening thread), the maximum
+ * number of concurrent fault entries per vCPU should be two.
+ */
+ uint8_t *vcpu_faults_current;
+ /*
+ * The hash that contains addr1->[(cpu1,ts1),(cpu2,ts2) ...] mappings.
+ * Each entry is a (CPU index, fault timestamp) tuple showing that a
+ * fault was requested on that address by that vCPU.
+ */
+ GHashTable *vcpu_addr_hash;
/* total blocktime when all vCPUs are stopped */
uint64_t total_blocktime;
/* point in time when last page fault was initiated */
uint64_t last_begin;
/* number of vCPUs that are currently blocked */
int smp_cpus_down;
Notifier exit_notifier;
} PostcopyBlocktimeContext;
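
To make the vcpu_faults_current[] invariant concrete, here is a minimal hedged sketch (hypothetical helper, not part of this patch):

#include <stdbool.h>
#include <stdint.h>

/*
 * Hypothetical illustration only: a vCPU counts as blocked iff its
 * concurrent-fault counter is non-zero. Each trapped fault increments
 * the counter and each resolved entry decrements it, so the duplicated
 * entries from CASE (1)/(2) above simply cancel out.
 */
static bool vcpu_is_blocked(const uint8_t *vcpu_faults_current, int cpu)
{
    return vcpu_faults_current[cpu] > 0;
}

In the CASE (1) timeline, cpu1's counter goes to 1 (addr1 trapped), then 2 (addr2 trapped at [A]), then back to 1 when the addr1 entry is removed, so the vCPU is correctly treated as blocked throughout.
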
+typedef struct {
+ /* The time the fault was triggered */
+ uint64_t fault_time;
+ /* The vCPU index that was blocked */
+ int cpu;
+} BlocktimeVCPUEntry;
+
+/* Alloc an entry to record a vCPU fault */
+static BlocktimeVCPUEntry *
+blocktime_vcpu_entry_alloc(int cpu, uint64_t fault_time)
+{
+ BlocktimeVCPUEntry *entry = g_new(BlocktimeVCPUEntry, 1);
+
+ entry->fault_time = fault_time;
+ entry->cpu = cpu;
+
+ return entry;
+}
+
+/* Free a @GList of @BlocktimeVCPUEntry */
+static void blocktime_vcpu_list_free(gpointer data)
+{
+ g_list_free_full(data, g_free);
+}
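
For reference, a minimal standalone GLib sketch (hypothetical names, not from this patch) of the list-valued hash pattern that blocktime_vcpu_list_free() enables: removing a key frees the whole list and every entry on it.

#include <glib.h>

typedef struct {
    guint64 fault_time;
    int cpu;
} FaultEntry;                  /* hypothetical stand-in for BlocktimeVCPUEntry */

static void fault_list_free(gpointer data)
{
    g_list_free_full(data, g_free);
}

int main(void)
{
    /* Direct (pointer-sized) keys; values are GLists of FaultEntry */
    GHashTable *table = g_hash_table_new_full(g_direct_hash, g_direct_equal,
                                              NULL, fault_list_free);
    FaultEntry *e = g_new(FaultEntry, 1);

    e->cpu = 1;
    e->fault_time = 12345;
    g_hash_table_insert(table, GSIZE_TO_POINTER(0x1000),
                        g_list_prepend(NULL, e));

    /* Frees the GList and the FaultEntry via fault_list_free() */
    g_hash_table_remove(table, GSIZE_TO_POINTER(0x1000));
    g_hash_table_destroy(table);
    return 0;
}
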
+
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
g_hash_table_destroy(ctx->tid_to_vcpu_hash);
- g_free(ctx->vcpu_blocktime_start);
+ g_hash_table_destroy(ctx->vcpu_addr_hash);
g_free(ctx->vcpu_blocktime_total);
g_free(ctx->vcpu_faults_count);
- g_free(ctx->vcpu_addr);
+ g_free(ctx->vcpu_faults_current);
g_free(ctx);
}
unsigned int smp_cpus = ms->smp.cpus;
PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
- ctx->vcpu_blocktime_start = g_new0(uint64_t, smp_cpus);
ctx->vcpu_blocktime_total = g_new0(uint64_t, smp_cpus);
ctx->vcpu_faults_count = g_new0(uint64_t, smp_cpus);
- ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
+ ctx->vcpu_faults_current = g_new0(uint8_t, smp_cpus);
ctx->tid_to_vcpu_hash = blocktime_init_tid_to_vcpu_hash();
+ /*
+ * The keys (host virtual addresses) are always pointer-sized, on both
+ * 32-bit and 64-bit systems, so they fit as direct keys.
+ *
+ * The values are lists of BlocktimeVCPUEntry entries.
+ */
+ ctx->vcpu_addr_hash = g_hash_table_new_full(g_direct_hash,
+ g_direct_equal,
+ NULL,
+ blocktime_vcpu_list_free);
+
ctx->exit_notifier.notify = migration_exit_cb;
qemu_add_exit_notifier(&ctx->exit_notifier);
return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
}
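
The direct-key comment above can be illustrated with a short hypothetical demo (not part of this patch): a uintptr_t host address round-trips through gpointer unchanged, so no boxed key allocation is needed.

#include <glib.h>
#include <stdint.h>

/* Hypothetical demo: a host virtual address used directly as a hash key */
static void direct_key_demo(GHashTable *table, uintptr_t addr)
{
    g_hash_table_insert(table, (gpointer)addr, GINT_TO_POINTER(1));
    g_assert(g_hash_table_lookup(table, (gpointer)addr) != NULL);
}
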
+/* Inject a (cpu, fault_time) entry into the database, using addr as the key */
+static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx,
+ uintptr_t addr, int cpu, uint64_t time)
+{
+ BlocktimeVCPUEntry *entry = blocktime_vcpu_entry_alloc(cpu, time);
+ GHashTable *table = ctx->vcpu_addr_hash;
+ gpointer key = (gpointer)addr;
+ GList *head, *list;
+ gboolean result;
+
+ head = g_hash_table_lookup(table, key);
+ if (head) {
+ /*
+ * If it existed, steal @head for the list operation rather than
+ * freeing it, and make sure the steal succeeded.
+ */
+ result = g_hash_table_steal(table, key);
+ assert(result == TRUE);
+ }
+
+ /*
+ * Now the key is guaranteed to be absent. Two cases:
+ *
+ * (1) There was no existing entry; the new list contains only the new
+ * entry. Insert it.
+ * (2) There were existing entries; after stealing, we own the list, so
+ * prepend the new entry and re-insert.
+ */
+ list = g_list_prepend(head, entry);
+ g_hash_table_insert(table, key, list);
+
+ trace_postcopy_blocktime_begin(addr, time, cpu, !!head);
+}
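
The steal-before-reinsert dance matters because the table owns its values; a hedged standalone sketch (hypothetical helper) of the same pattern:

#include <glib.h>

/*
 * Hypothetical helper mirroring the pattern in blocktime_fault_inject().
 * g_hash_table_steal() detaches the old list WITHOUT running the value
 * destroy function; re-inserting over a live key instead would free the
 * old head, leaving the newly prepended list pointing at freed nodes.
 */
static void list_hash_prepend(GHashTable *table, gpointer key, gpointer entry)
{
    GList *old = g_hash_table_lookup(table, key);

    if (old) {
        gboolean stolen = g_hash_table_steal(table, key);
        g_assert(stolen);
    }
    /* @old (possibly NULL) becomes the tail of the new list */
    g_hash_table_insert(table, key, g_list_prepend(old, entry));
}
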
+
/*
* This function is being called when pagefault occurs. It tracks down vCPU
* blocking time. It's protected by @page_request_mutex.
if (!dc || ptid == 0) {
return;
}
+
+ /*
+ * The caller should only inject a blocktime entry when the page is
+ * still missing.
+ */
+ assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
+
+ current = get_current_ns();
cpu = blocktime_get_vcpu(dc, ptid);
- if (cpu < 0) {
+
+ if (cpu >= 0) {
+ /* How many faults on this vCPU in total? */
+ dc->vcpu_faults_count[cpu]++;
+
+ /*
+ * Account for how many concurrent faults we trapped on this vCPU. See
+ * the comment above vcpu_faults_current[] for why it can be more than
+ * one.
+ */
+ if (dc->vcpu_faults_current[cpu]++ == 0) {
+ dc->smp_cpus_down++;
+ /*
+ * We only update last_begin on the 1st pending fault of this vCPU,
+ * so it records when the most recent vCPU got blocked. It's only
+ * used to calculate the system-wide blocktime.
+ */
+ dc->last_begin = current;
+ }
+
+ /* Make sure it won't overflow - it really never should! */
+ assert(dc->vcpu_faults_current[cpu] <= 255);
+ } else {
+ /* We do not support non-vCPU thread tracking yet */
dc->non_vcpu_faults++;
return;
}
- current = get_current_ns();
- if (dc->vcpu_addr[cpu] == 0) {
- dc->smp_cpus_down++;
- }
+ blocktime_fault_inject(dc, addr, cpu, current);
+}
- dc->last_begin = current;
- dc->vcpu_blocktime_start[cpu] = current;
- dc->vcpu_addr[cpu] = addr;
- dc->vcpu_faults_count[cpu]++;
+typedef struct {
+ PostcopyBlocktimeContext *ctx;
+ uint64_t current;
+ int affected_cpus;
+} BlockTimeVCPUIter;
+
+static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
+{
+ BlockTimeVCPUIter *iter = user_data;
+ PostcopyBlocktimeContext *ctx = iter->ctx;
+ BlocktimeVCPUEntry *entry = data;
+ int cpu = entry->cpu;
/*
- * The caller should only inject a blocktime entry when the page is
- * yet missing.
+ * Time should never go backwards, so when a fault is resolved the
+ * resolution time must not precede the time it was triggered.
*/
- assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
+ assert(iter->current >= entry->fault_time);
+
+ /*
+ * If this page resolution cleared all pending faults on one vCPU,
+ * account its blocktime and count it as affected.
+ */
+ if (--ctx->vcpu_faults_current[cpu] == 0) {
+ ctx->vcpu_blocktime_total[cpu] += iter->current - entry->fault_time;
+ iter->affected_cpus += 1;
+ }
- trace_mark_postcopy_blocktime_begin(addr, dc->vcpu_blocktime_start[cpu],
- cpu);
+ trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
}
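
blocktime_cpu_list_iter_fn() follows the standard GFunc shape; as a hedged sketch (hypothetical types), this is how an accumulator is threaded through g_list_foreach() via user_data:

#include <glib.h>

typedef struct {
    guint64 now;
    guint64 total_blocked_ns;
} SumIter;                     /* hypothetical accumulator */

/* GFunc signature: (element data, user_data), called once per node */
static void sum_one(gpointer data, gpointer user_data)
{
    SumIter *iter = user_data;
    guint64 *fault_time = data;

    iter->total_blocked_ns += iter->now - *fault_time;
}

/* Usage: g_list_foreach(list, sum_one, &iter); */
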
/*
PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
MachineState *ms = MACHINE(qdev_get_machine());
unsigned int smp_cpus = ms->smp.cpus;
- int i, affected_cpu = 0;
- uint64_t read_vcpu_time, current;
+ BlockTimeVCPUIter iter = {
+ .current = get_current_ns(),
+ .affected_cpus = 0,
+ .ctx = dc,
+ };
+ gpointer key = (gpointer)addr;
+ GHashTable *table;
+ GList *list;
if (!dc) {
return;
}
- current = get_current_ns();
- /* lookup cpu, to clear it,
- * that algorithm looks straightforward, but it's not
- * optimal, more optimal algorithm is keeping tree or hash
- * where key is address value is a list of */
- for (i = 0; i < smp_cpus; i++) {
- uint64_t vcpu_blocktime = 0;
-
- read_vcpu_time = dc->vcpu_blocktime_start[i];
- if (dc->vcpu_addr[i] != addr || read_vcpu_time == 0) {
- continue;
- }
- dc->vcpu_addr[i] = 0;
- vcpu_blocktime = current - read_vcpu_time;
- affected_cpu += 1;
- /* continue cycle, due to one page could affect several vCPUs */
- dc->vcpu_blocktime_total[i] += vcpu_blocktime;
+ table = dc->vcpu_addr_hash;
+ /* If the address wasn't tracked at all, there is nothing to account */
+ list = g_hash_table_lookup(table, key);
+ if (!list) {
+ return;
}
+ /*
+ * Loop over the set of vCPUs that got blocked on this addr and do the
+ * blocktime accounting. After that, remove the whole list.
+ */
+ g_list_foreach(list, blocktime_cpu_list_iter_fn, &iter);
+ g_hash_table_remove(table, key);
+
/*
* If all vCPUs used to be down, and copying this page would free some
* vCPUs, then the system-level blocktime ends here.
*/
- if (dc->smp_cpus_down == smp_cpus && affected_cpu) {
- dc->total_blocktime += current - dc->last_begin;
+ if (dc->smp_cpus_down == smp_cpus && iter.affected_cpus) {
+ dc->total_blocktime += iter.current - dc->last_begin;
}
- dc->smp_cpus_down -= affected_cpu;
+ dc->smp_cpus_down -= iter.affected_cpus;
- trace_mark_postcopy_blocktime_end(addr, dc->total_blocktime,
- affected_cpu);
+ trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus);
}
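
A worked example of the accounting above, assuming two vCPUs blocked on the same page (hypothetical nanosecond timestamps):

/*
 *   t=100   vcpu0 faults    -> smp_cpus_down = 1, last_begin = 100
 *   t=200   vcpu1 faults    -> smp_cpus_down = 2, last_begin = 200
 *   t=500   page resolved   -> iter.affected_cpus = 2
 *
 * Per-vCPU:     vcpu_blocktime_total[0] += 500 - 100 = 400
 *               vcpu_blocktime_total[1] += 500 - 200 = 300
 * System-wide:  smp_cpus_down == smp_cpus, so
 *               total_blocktime += 500 - last_begin = 300
 *
 * The system-wide counter only accumulates the window during which all
 * vCPUs were blocked simultaneously.
 */
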
static void postcopy_pause_fault_thread(MigrationIncomingState *mis)