From: Sean Christopherson
Date: Tue, 4 Feb 2025 00:40:38 +0000 (+0000)
Subject: KVM: x86/mmu: Walk rmaps (shadow MMU) without holding mmu_lock when aging gfns
X-Git-Tag: v6.15-rc1~195^2~11^2~1
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=af3b6a9eba48419732b07a0472db7160282f0f39;p=thirdparty%2Fkernel%2Flinux.git

KVM: x86/mmu: Walk rmaps (shadow MMU) without holding mmu_lock when aging gfns

Convert the shadow MMU to use per-rmap locking instead of the per-VM
mmu_lock to protect rmaps when aging SPTEs.  When A/D bits are enabled,
it is safe to simply clear the Accessed bits, i.e. KVM just needs to
ensure the parent page table isn't freed.

The less obvious case is marking SPTEs for access tracking in the
non-A/D case (for EPT only).  Because aging a gfn means making the SPTE
not-present, KVM needs to play nice with the case where the CPU has TLB
entries for a SPTE that is not-present in memory.  For example, when
doing dirty tracking, if KVM encounters a non-present shadow accessed
SPTE, KVM must know to do a TLB invalidation.

Fortunately, KVM already provides (and relies upon) the necessary
functionality.  E.g. KVM doesn't flush TLBs when aging pages (even in
the clear_flush_young() case), and when harvesting dirty bitmaps, KVM
flushes based on the dirty bitmaps, not on SPTEs.

Co-developed-by: James Houghton
Signed-off-by: James Houghton
Link: https://lore.kernel.org/r/20250204004038.1680123-12-jthoughton@google.com
Signed-off-by: Sean Christopherson
---

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9eb987d0cc23b..198a1e26962a9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -971,7 +971,6 @@ static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
  * actual locking is the same, but the caller is disallowed from modifying the
  * rmap, and so the unlock flow is a nop if the rmap is/was empty.
  */
-__maybe_unused
 static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
 {
 	unsigned long rmap_val;
@@ -985,7 +984,6 @@ static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
 	return rmap_val;
 }
 
-__maybe_unused
 static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
 				     unsigned long old_val)
 {
@@ -1706,37 +1704,48 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
 }
 
 static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
-				   struct kvm_gfn_range *range, bool test_only)
+				   struct kvm_gfn_range *range,
+				   bool test_only)
 {
-	struct slot_rmap_walk_iterator iterator;
+	struct kvm_rmap_head *rmap_head;
 	struct rmap_iterator iter;
+	unsigned long rmap_val;
 	bool young = false;
 	u64 *sptep;
+	gfn_t gfn;
+	int level;
+	u64 spte;
 
-	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
-				 range->start, range->end - 1, &iterator) {
-		for_each_rmap_spte(iterator.rmap, &iter, sptep) {
-			u64 spte = *sptep;
+	for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+		for (gfn = range->start; gfn < range->end;
+		     gfn += KVM_PAGES_PER_HPAGE(level)) {
+			rmap_head = gfn_to_rmap(gfn, level, range->slot);
+			rmap_val = kvm_rmap_lock_readonly(rmap_head);
 
-			if (!is_accessed_spte(spte))
-				continue;
+			for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+				if (!is_accessed_spte(spte))
+					continue;
+
+				if (test_only) {
+					kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+					return true;
+				}
 
-			if (test_only)
-				return true;
-
-			if (spte_ad_enabled(spte)) {
-				clear_bit((ffs(shadow_accessed_mask) - 1),
-					  (unsigned long *)sptep);
-			} else {
-				/*
-				 * WARN if mmu_spte_update() signals the need
-				 * for a TLB flush, as Access tracking a SPTE
-				 * should never trigger an _immediate_ flush.
-				 */
-				spte = mark_spte_for_access_track(spte);
-				WARN_ON_ONCE(mmu_spte_update(sptep, spte));
+				if (spte_ad_enabled(spte))
+					clear_bit((ffs(shadow_accessed_mask) - 1),
+						  (unsigned long *)sptep);
+				else
+					/*
+					 * If the following cmpxchg fails, the
+					 * spte is being concurrently modified
+					 * and should most likely stay young.
+					 */
+					cmpxchg64(sptep, spte,
+						  mark_spte_for_access_track(spte));
+				young = true;
 			}
-			young = true;
+
+			kvm_rmap_unlock_readonly(rmap_head, rmap_val);
 		}
 	}
 	return young;
@@ -1754,11 +1763,8 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (tdp_mmu_enabled)
 		young = kvm_tdp_mmu_age_gfn_range(kvm, range);
 
-	if (kvm_may_have_shadow_mmu_sptes(kvm)) {
-		write_lock(&kvm->mmu_lock);
+	if (kvm_may_have_shadow_mmu_sptes(kvm))
 		young |= kvm_rmap_age_gfn_range(kvm, range, false);
-		write_unlock(&kvm->mmu_lock);
-	}
 
 	return young;
 }
@@ -1770,11 +1776,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (tdp_mmu_enabled)
 		young = kvm_tdp_mmu_test_age_gfn(kvm, range);
 
-	if (!young && kvm_may_have_shadow_mmu_sptes(kvm)) {
-		write_lock(&kvm->mmu_lock);
+	if (young)
+		return young;
+
+	if (kvm_may_have_shadow_mmu_sptes(kvm))
 		young |= kvm_rmap_age_gfn_range(kvm, range, true);
-		write_unlock(&kvm->mmu_lock);
-	}
 
 	return young;
 }
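
Note: the first two hunks only drop the __maybe_unused annotations from
kvm_rmap_lock_readonly() and kvm_rmap_unlock_readonly(); the lock
implementation itself is outside this diff.  As a rough illustration of
the scheme the surrounding comment describes (a lock packed into the
per-rmap word, with the unlock being a nop if the rmap is/was empty),
below is a minimal, standalone user-space C sketch.  The names
(RMAP_LOCKED, rmap_lock_readonly(), struct rmap_head's layout), the bit
choice, and the atomics are illustrative assumptions, not the kernel
code.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative lock bit packed into the low bit of the rmap word. */
#define RMAP_LOCKED	0x1ul

struct rmap_head {
	_Atomic unsigned long val;
};

static unsigned long rmap_lock_readonly(struct rmap_head *rmap)
{
	unsigned long old = atomic_load_explicit(&rmap->val, memory_order_relaxed);

	for (;;) {
		/* An empty rmap has nothing to walk, so don't bother locking it. */
		if (!old)
			return 0;

		/* Lock bit already set: another walker holds it, re-read and retry. */
		if (old & RMAP_LOCKED) {
			old = atomic_load_explicit(&rmap->val, memory_order_relaxed);
			continue;
		}

		/* Try to set the lock bit; on failure the CAS refreshes 'old'. */
		if (atomic_compare_exchange_weak_explicit(&rmap->val, &old,
							  old | RMAP_LOCKED,
							  memory_order_acquire,
							  memory_order_relaxed))
			return old;
	}
}

static void rmap_unlock_readonly(struct rmap_head *rmap, unsigned long old)
{
	/*
	 * A read-only walker never modifies the rmap, so dropping the lock is
	 * just restoring the pre-lock value.  If the rmap was empty, the lock
	 * was never taken and the unlock is a nop.
	 */
	if (old)
		atomic_store_explicit(&rmap->val, old, memory_order_release);
}

int main(void)
{
	/* Pretend the rmap holds a single (fake) SPTE pointer. */
	struct rmap_head rmap = { .val = 0x1000 };
	unsigned long v = rmap_lock_readonly(&rmap);

	printf("locked, pre-lock value = %#lx\n", v);
	rmap_unlock_readonly(&rmap, v);
	return 0;
}

Because readers never touch the rmap contents, the unlock can simply
restore the pre-lock value, which is what makes the empty-rmap case a
nop and lets aging walk rmaps without taking mmu_lock.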