git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
KVM: x86/mmu: Age TDP MMU SPTEs without holding mmu_lock
authorSean Christopherson <seanjc@google.com>
Wed, 12 Feb 2025 20:30:12 +0000 (12:30 -0800)
committerSean Christopherson <seanjc@google.com>
Fri, 14 Feb 2025 15:17:17 +0000 (07:17 -0800)
Walk the TDP MMU in an RCU read-side critical section without holding
mmu_lock when harvesting and potentially updating age information on
TDP MMU SPTEs.  Add a new macro to do RCU-safe walking of TDP MMU roots,
and do all SPTE aging with atomic updates; while clobbering Accessed
information is ok, KVM must not corrupt other bits, e.g. must not drop
a Dirty or Writable bit when making a SPTE young.

If updating a SPTE to mark it for access tracking fails, leave it as is
and treat it as if it were young.  If the spte is being actively modified,
it is most likely young.

Acquire and release mmu_lock for write when harvesting age information
from the shadow MMU, as the shadow MMU doesn't yet support aging outside
of mmu_lock.

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20250204004038.1680123-5-jthoughton@google.com
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/Kconfig
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/tdp_mmu.c

index b15cde0a9b5ca7ae2c90b15fed4a4debd34f825c..5a07bdcc3e4f64036857c3437d894bdc58b4c59c 100644 (file)
@@ -1478,6 +1478,7 @@ struct kvm_arch {
         * tdp_mmu_page set.
         *
         * For reads, this list is protected by:
+        *      RCU alone or
         *      the MMU lock in read mode + RCU or
         *      the MMU lock in write mode
         *
index ea2c4f21c1ca9bb8635b6040a16c52774caefa6a..fe8ea8c097decf3bc50b112e2c615bf7ce53622b 100644 (file)
@@ -22,6 +22,7 @@ config KVM_X86
        select KVM_COMMON
        select KVM_GENERIC_MMU_NOTIFIER
        select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+       select KVM_MMU_LOCKLESS_AGING
        select HAVE_KVM_IRQCHIP
        select HAVE_KVM_PFNCACHE
        select HAVE_KVM_DIRTY_RING_TSO
index 5b9ef3535f50388dd842feaf6acc9c8990072280..b73b3c12f76fdea1fe6ec2e99ded06bea95d48f3 100644 (file)
@@ -1592,8 +1592,11 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        bool young = false;
 
-       if (kvm_memslots_have_rmaps(kvm))
+       if (kvm_memslots_have_rmaps(kvm)) {
+               write_lock(&kvm->mmu_lock);
                young = kvm_rmap_age_gfn_range(kvm, range, false);
+               write_unlock(&kvm->mmu_lock);
+       }
 
        if (tdp_mmu_enabled)
                young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1605,8 +1608,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        bool young = false;
 
-       if (kvm_memslots_have_rmaps(kvm))
+       if (kvm_memslots_have_rmaps(kvm)) {
+               write_lock(&kvm->mmu_lock);
                young = kvm_rmap_age_gfn_range(kvm, range, true);
+               write_unlock(&kvm->mmu_lock);
+       }
 
        if (tdp_mmu_enabled)
                young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
index 046b6ba311976a7e9e79ee07ae416acfd68452fd..c9778c3e6ecdf174d14a0033fe7ad2955181f2d7 100644 (file)
@@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                     !tdp_mmu_root_match((_root), (_types)))) {                 \
                } else
 
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
+ */
+#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)                 \
+       list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)         \
+               if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||     \
+                   !tdp_mmu_root_match((_root), (_types))) {                   \
+               } else
+
 #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)               \
        __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
 
@@ -1332,21 +1345,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
  * from the clear_young() or clear_flush_young() notifier, which uses the
  * return value to determine if the page has been accessed.
  */
-static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
+static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
 {
        u64 new_spte;
 
        if (spte_ad_enabled(iter->old_spte)) {
-               iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
-                                                        iter->old_spte,
-                                                        shadow_accessed_mask,
-                                                        iter->level);
+               iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+                                                               shadow_accessed_mask);
                new_spte = iter->old_spte & ~shadow_accessed_mask;
        } else {
                new_spte = mark_spte_for_access_track(iter->old_spte);
-               iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
-                                                       iter->old_spte, new_spte,
-                                                       iter->level);
+               /*
+                * It is safe for the following cmpxchg to fail. Leave the
+                * Accessed bit set, as the spte is most likely young anyway.
+                */
+               if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+                       return;
        }
 
        trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1371,9 +1385,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
         * valid roots!
         */
        WARN_ON(types & ~KVM_VALID_ROOTS);
-       __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
-               guard(rcu)();
 
+       guard(rcu)();
+       for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
                tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
                        if (!is_accessed_spte(iter.old_spte))
                                continue;
@@ -1382,7 +1396,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
                                return true;
 
                        ret = true;
-                       kvm_tdp_mmu_age_spte(&iter);
+                       kvm_tdp_mmu_age_spte(kvm, &iter);
                }
        }