Currently, KVM RISC-V triggers a TLB flush for every single stage-2 PTE
modification (unmap or write-protect). Although KVM coalesces the
hardware IPIs, the software overhead of executing the flush work
for every page is large, especially during dirty page tracking.
Following the approach used in x86 and arm64, optimize the MMU logic by
making the PTE manipulation functions return a boolean indicating
whether a leaf PTE was actually changed. The outer MMU functions
propagate this flag to their callers so that the remote TLB flushes
can be batched.
Consequently, the flush operation is executed at most once per batch,
and is skipped entirely when no leaf PTE was changed.
In the paths that take `mmu_lock` themselves, the flush is issued after
the lock is released, which also reduces lock contention.
Tested with tools/testing/selftests/kvm on a 4-vCPU guest (host
environment: QEMU 10.2.1 RISC-V):
1. demand_paging_test (1GB memory)
   time ./demand_paging_test -b 1G -v 4
   - Total execution time reduced from ~2m39s to ~2m31s
2. dirty_log_perf_test (1GB memory)
   ./dirty_log_perf_test -b 1G -v 4
   - "Clear dirty log time" per iteration dropped significantly from
     ~3.40s to ~0.18s
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Jinyu Tang <tjytimi@163.com>
Link: https://lore.kernel.org/r/20260412023822.83341-1-tjytimi@163.com
Signed-off-by: Anup Patel <anup@brainfault.org>
GSTAGE_OP_WP, /* Write-protect */
};
-void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
+bool kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op);
-void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
+bool kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
gpa_t start, gpa_t size, bool may_block);
-void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
+bool kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
void kvm_riscv_gstage_mode_detect(void);
return 0;
}
-void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
+bool kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
{
int i, ret;
pte_t old_pte, *next_ptep;
u32 next_ptep_level;
unsigned long next_page_size, page_size;
+ bool flush = false;
ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
if (ret)
- return;
+ return false;
WARN_ON(addr & (page_size - 1));
if (!pte_val(ptep_get(ptep)))
- return;
+ return false;
if (ptep_level && !gstage_pte_leaf(ptep)) {
next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
next_ptep_level = ptep_level - 1;
ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
if (ret)
- return;
+ return false;
if (op == GSTAGE_OP_CLEAR)
set_pte(ptep, __pte(0));
for (i = 0; i < PTRS_PER_PTE; i++)
- kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
- &next_ptep[i], next_ptep_level, op);
+ flush |= kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
+ &next_ptep[i], next_ptep_level, op);
if (op == GSTAGE_OP_CLEAR)
put_page(virt_to_page(next_ptep));
} else {
else if (op == GSTAGE_OP_WP)
set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
if (pte_val(*ptep) != pte_val(old_pte))
- gstage_tlb_flush(gstage, ptep_level, addr);
+ flush = true;
}
+
+ return flush;
}
-void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
+bool kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
gpa_t start, gpa_t size, bool may_block)
{
int ret;
bool found_leaf;
unsigned long page_size;
gpa_t addr = start, end = start + size;
+ bool flush = false;
while (addr < end) {
found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
goto next;
if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
- kvm_riscv_gstage_op_pte(gstage, addr, ptep,
- ptep_level, GSTAGE_OP_CLEAR);
+ flush |= kvm_riscv_gstage_op_pte(gstage, addr, ptep,
+ ptep_level, GSTAGE_OP_CLEAR);
next:
addr += page_size;
if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
cond_resched_lock(&gstage->kvm->mmu_lock);
}
+
+ return flush;
}
-void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
+bool kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
{
int ret;
pte_t *ptep;
bool found_leaf;
gpa_t addr = start;
unsigned long page_size;
+ bool flush = false;
while (addr < end) {
found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
goto next;
addr = ALIGN_DOWN(addr, page_size);
- kvm_riscv_gstage_op_pte(gstage, addr, ptep,
- ptep_level, GSTAGE_OP_WP);
+ flush |= kvm_riscv_gstage_op_pte(gstage, addr, ptep,
+ ptep_level, GSTAGE_OP_WP);
next:
addr += page_size;
}
+
+ return flush;
}
void __init kvm_riscv_gstage_mode_detect(void)
phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
struct kvm_gstage gstage;
+ bool flush;
kvm_riscv_gstage_init(&gstage, kvm);
spin_lock(&kvm->mmu_lock);
- kvm_riscv_gstage_wp_range(&gstage, start, end);
+ flush = kvm_riscv_gstage_wp_range(&gstage, start, end);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs_memslot(kvm, memslot);
+ if (flush)
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
{
struct kvm_gstage gstage;
+ bool flush;
kvm_riscv_gstage_init(&gstage, kvm);
spin_lock(&kvm->mmu_lock);
- kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
+ flush = kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
+
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, gpa >> PAGE_SHIFT,
+ size >> PAGE_SHIFT);
}
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
struct kvm_gstage gstage;
+ bool flush;
kvm_riscv_gstage_init(&gstage, kvm);
- kvm_riscv_gstage_wp_range(&gstage, start, end);
+ flush = kvm_riscv_gstage_wp_range(&gstage, start, end);
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, start >> PAGE_SHIFT,
+ (end - start) >> PAGE_SHIFT);
}
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
phys_addr_t size = slot->npages << PAGE_SHIFT;
struct kvm_gstage gstage;
+ bool flush;
kvm_riscv_gstage_init(&gstage, kvm);
spin_lock(&kvm->mmu_lock);
- kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
+ flush = kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, gpa >> PAGE_SHIFT,
+ size >> PAGE_SHIFT);
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
{
struct kvm_gstage gstage;
bool mmu_locked;
+ bool flush;
if (!kvm->arch.pgd)
return false;
kvm_riscv_gstage_init(&gstage, kvm);
mmu_locked = spin_trylock(&kvm->mmu_lock);
- kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
- (range->end - range->start) << PAGE_SHIFT,
- range->may_block);
+
+ flush = kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
+
if (mmu_locked)
spin_unlock(&kvm->mmu_lock);
+
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, range->start,
+ range->end - range->start);
return false;
}
{
struct kvm_gstage gstage;
void *pgd = NULL;
+ bool flush = false;
spin_lock(&kvm->mmu_lock);
if (kvm->arch.pgd) {
kvm_riscv_gstage_init(&gstage, kvm);
- kvm_riscv_gstage_unmap_range(&gstage, 0UL,
+ flush = kvm_riscv_gstage_unmap_range(&gstage, 0UL,
kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false);
pgd = READ_ONCE(kvm->arch.pgd);
kvm->arch.pgd = NULL;
}
spin_unlock(&kvm->mmu_lock);
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
if (pgd)
free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size));
}