From: Sasha Levin Date: Sat, 3 Apr 2021 16:14:03 +0000 (-0400) Subject: Fixes for 5.10 X-Git-Tag: v4.4.265~35 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fd5e7c2b5e4b2dd3ec1a9237dabd18b9a8660d51;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.10 Signed-off-by: Sasha Levin --- diff --git a/queue-5.10/kvm-x86-compile-out-tdp-mmu-on-32-bit-systems.patch b/queue-5.10/kvm-x86-compile-out-tdp-mmu-on-32-bit-systems.patch new file mode 100644 index 00000000000..de94fca806f --- /dev/null +++ b/queue-5.10/kvm-x86-compile-out-tdp-mmu-on-32-bit-systems.patch @@ -0,0 +1,351 @@ +From fb50942b0b7deaf416331b5ca2b5ce1c11a78d45 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 6 Feb 2021 09:53:33 -0500 +Subject: KVM: x86: compile out TDP MMU on 32-bit systems + +From: Paolo Bonzini + +[ Upstream commit 897218ff7cf19290ec2d69652ce673d8ed6fedeb ] + +The TDP MMU assumes that it can do atomic accesses to 64-bit PTEs. +Rather than just disabling it, compile it out completely so that it +is possible to use for example 64-bit xchg. + +To limit the number of stubs, wrap all accesses to tdp_mmu_enabled +or tdp_mmu_page with a function. Calls to all other functions in +tdp_mmu.c are eliminated and do not even reach the linker. + +Reviewed-by: Sean Christopherson +Tested-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 2 ++ + arch/x86/kvm/Makefile | 3 ++- + arch/x86/kvm/mmu/mmu.c | 36 ++++++++++++++++----------------- + arch/x86/kvm/mmu/mmu_internal.h | 2 ++ + arch/x86/kvm/mmu/tdp_mmu.c | 29 +------------------------- + arch/x86/kvm/mmu/tdp_mmu.h | 32 +++++++++++++++++++++++++---- + 6 files changed, 53 insertions(+), 51 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 47cd8f9b3fe7..af858f495e75 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1001,6 +1001,7 @@ struct kvm_arch { + struct kvm_pmu_event_filter *pmu_event_filter; + struct task_struct *nx_lpage_recovery_thread; + ++#ifdef CONFIG_X86_64 + /* + * Whether the TDP MMU is enabled for this VM. This contains a + * snapshot of the TDP MMU module parameter from when the VM was +@@ -1027,6 +1028,7 @@ struct kvm_arch { + * the thread holds the MMU lock in write mode. 
+ */ + spinlock_t tdp_mmu_pages_lock; ++#endif /* CONFIG_X86_64 */ + }; + + struct kvm_vm_stat { +diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile +index b804444e16d4..1d1e31917a88 100644 +--- a/arch/x86/kvm/Makefile ++++ b/arch/x86/kvm/Makefile +@@ -16,7 +16,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o + kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ +- mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o ++ mmu/spte.o ++kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o + + kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ + vmx/evmcs.o vmx/nested.o vmx/posted_intr.o +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 0f45ad05f895..94e6bf004576 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -1225,7 +1225,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + { + struct kvm_rmap_head *rmap_head; + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); + while (mask) { +@@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + { + struct kvm_rmap_head *rmap_head; + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); + while (mask) { +@@ -1301,7 +1301,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + write_protected |= __rmap_write_protect(kvm, rmap_head, true); + } + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + +@@ -1513,7 +1513,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; +@@ -1525,7 +1525,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; +@@ -1580,7 +1580,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; +@@ -1591,7 +1591,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; +@@ -3153,7 +3153,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + + if (kvm_mmu_put_root(kvm, sp)) { +- if (sp->tdp_mmu_page) ++ if (is_tdp_mmu_page(sp)) + kvm_tdp_mmu_free_root(kvm, sp); + else if (sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); +@@ -3247,7 +3247,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) + hpa_t root; + unsigned i; + +- if (vcpu->kvm->arch.tdp_mmu_enabled) { ++ if (is_tdp_mmu_enabled(vcpu->kvm)) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + + 
if (!VALID_PAGE(root)) +@@ -5434,7 +5434,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) + + kvm_zap_obsolete_pages(kvm); + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_zap_all(kvm); + + spin_unlock(&kvm->mmu_lock); +@@ -5497,7 +5497,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) + } + } + +- if (kvm->arch.tdp_mmu_enabled) { ++ if (is_tdp_mmu_enabled(kvm)) { + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (flush) + kvm_flush_remote_tlbs(kvm); +@@ -5521,7 +5521,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + spin_lock(&kvm->mmu_lock); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); + spin_unlock(&kvm->mmu_lock); + +@@ -5587,7 +5587,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, + kvm_mmu_zap_collapsible_spte, true); + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); + spin_unlock(&kvm->mmu_lock); + } +@@ -5614,7 +5614,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + + spin_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + spin_unlock(&kvm->mmu_lock); + +@@ -5637,7 +5637,7 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + spin_lock(&kvm->mmu_lock); + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); + spin_unlock(&kvm->mmu_lock); + +@@ -5653,7 +5653,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, + + spin_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); + spin_unlock(&kvm->mmu_lock); + +@@ -5681,7 +5681,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + +- if (kvm->arch.tdp_mmu_enabled) ++ if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_zap_all(kvm); + + spin_unlock(&kvm->mmu_lock); +@@ -5992,7 +5992,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); +- if (sp->tdp_mmu_page) { ++ if (is_tdp_mmu_page(sp)) { + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + } else { +diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h +index 7f599cc64178..cf67fa6fb8fe 100644 +--- a/arch/x86/kvm/mmu/mmu_internal.h ++++ b/arch/x86/kvm/mmu/mmu_internal.h +@@ -56,10 +56,12 @@ struct kvm_mmu_page { + /* Number of writes since the last time traversal visited this page. */ + atomic_t write_flooding_count; + ++#ifdef CONFIG_X86_64 + bool tdp_mmu_page; + + /* Used for freeing the page asyncronously if it is a TDP MMU page. 
*/ + struct rcu_head rcu_head; ++#endif + }; + + extern struct kmem_cache *mmu_page_header_cache; +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index eb38f74af3f2..075b9d63bd57 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -10,24 +10,13 @@ + #include + #include + +-#ifdef CONFIG_X86_64 + static bool __read_mostly tdp_mmu_enabled = false; + module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +-#endif +- +-static bool is_tdp_mmu_enabled(void) +-{ +-#ifdef CONFIG_X86_64 +- return tdp_enabled && READ_ONCE(tdp_mmu_enabled); +-#else +- return false; +-#endif /* CONFIG_X86_64 */ +-} + + /* Initializes the TDP MMU for the VM, if enabled. */ + void kvm_mmu_init_tdp_mmu(struct kvm *kvm) + { +- if (!is_tdp_mmu_enabled()) ++ if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) + return; + + /* This should not be changed for the lifetime of the VM. */ +@@ -96,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + #define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + +-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +-{ +- struct kvm_mmu_page *sp; +- +- if (!kvm->arch.tdp_mmu_enabled) +- return false; +- if (WARN_ON(!VALID_PAGE(hpa))) +- return false; +- +- sp = to_shadow_page(hpa); +- if (WARN_ON(!sp)) +- return false; +- +- return sp->tdp_mmu_page && sp->root_count; +-} +- + static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield, bool flush); + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h +index cbbdbadd1526..b4b65e3699b3 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.h ++++ b/arch/x86/kvm/mmu/tdp_mmu.h +@@ -5,10 +5,6 @@ + + #include + +-void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); +- +-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); + void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); + +@@ -47,4 +43,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level); + ++#ifdef CONFIG_X86_64 ++void kvm_mmu_init_tdp_mmu(struct kvm *kvm); ++void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); ++static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } ++static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } ++#else ++static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} ++static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} ++static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } ++static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } ++#endif ++ ++static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) ++{ ++ struct kvm_mmu_page *sp; ++ ++ if (!is_tdp_mmu_enabled(kvm)) ++ return false; ++ if (WARN_ON(!VALID_PAGE(hpa))) ++ return false; ++ ++ sp = to_shadow_page(hpa); ++ if (WARN_ON(!sp)) ++ return false; ++ ++ return is_tdp_mmu_page(sp) && sp->root_count; ++} ++ + #endif /* __KVM_X86_MMU_TDP_MMU_H */ +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-add-comment-on-__tdp_mmu_set_spte.patch b/queue-5.10/kvm-x86-mmu-add-comment-on-__tdp_mmu_set_spte.patch new file mode 100644 index 00000000000..65ac57a4ce6 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-add-comment-on-__tdp_mmu_set_spte.patch @@ -0,0 +1,56 @@ +From de4f8e1d8284e0976baa40ef79d6e0f6cbf5c6c6 Mon 
Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:08 -0800 +Subject: KVM: x86/mmu: Add comment on __tdp_mmu_set_spte + +From: Ben Gardon + +[ Upstream commit fe43fa2f407b9d513f7bcf18142e14e1bf1508d6 ] + +__tdp_mmu_set_spte is a very important function in the TDP MMU which +already accepts several arguments and will take more in future commits. +To offset this complexity, add a comment to the function describing each +of the arguemnts. + +No functional change intended. + +Reviewed-by: Peter Feiner +Acked-by: Paolo Bonzini +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-3-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 34ef3e1a0f84..f88404033e0c 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -395,6 +395,22 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + new_spte, level); + } + ++/* ++ * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping ++ * @kvm: kvm instance ++ * @iter: a tdp_iter instance currently on the SPTE that should be set ++ * @new_spte: The value the SPTE should be set to ++ * @record_acc_track: Notify the MM subsystem of changes to the accessed state ++ * of the page. Should be set unless handling an MMU ++ * notifier for access tracking. Leaving record_acc_track ++ * unset in that case prevents page accesses from being ++ * double counted. ++ * @record_dirty_log: Record the page as dirty in the dirty bitmap if ++ * appropriate for the change being made. Should be set ++ * unless performing certain dirty logging operations. ++ * Leaving record_dirty_log unset in that case prevents page ++ * writes from being double counted. ++ */ + static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-add-existing-trace-points-to-tdp-mmu.patch b/queue-5.10/kvm-x86-mmu-add-existing-trace-points-to-tdp-mmu.patch new file mode 100644 index 00000000000..4e07a0cd72d --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-add-existing-trace-points-to-tdp-mmu.patch @@ -0,0 +1,86 @@ +From 9feb78a3670411d4bb2ac2800296123d02824f80 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Oct 2020 10:59:43 -0700 +Subject: kvm: x86/mmu: Add existing trace points to TDP MMU + +From: Ben Gardon + +[ Upstream commit 33dd3574f5fef57c2c6caccf98925d63aa2a8d09 ] + +The TDP MMU was initially implemented without some of the usual +tracepoints found in mmu.c. Correct this discrepancy by adding the +missing trace points to the TDP MMU. + +Tested: ran the demand paging selftest on an Intel Skylake machine with + all the trace points used by the TDP MMU enabled and observed + them firing with expected values. 
+ +This patch can be viewed in Gerrit at: +https://linux-review.googlesource.com/c/virt/kvm/kvm/+/3812 + +Signed-off-by: Ben Gardon +Message-Id: <20201027175944.1183301-1-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 0d17457f1c84..61be95c6db20 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -7,6 +7,8 @@ + #include "tdp_mmu.h" + #include "spte.h" + ++#include ++ + #ifdef CONFIG_X86_64 + static bool __read_mostly tdp_mmu_enabled = false; + module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +@@ -149,6 +151,8 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, + sp->gfn = gfn; + sp->tdp_mmu_page = true; + ++ trace_kvm_mmu_get_page(sp, true); ++ + return sp; + } + +@@ -319,6 +323,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + pt = spte_to_child_pt(old_spte, level); + sp = sptep_to_sp(pt); + ++ trace_kvm_mmu_prepare_zap_page(sp); ++ + list_del(&sp->link); + + if (sp->lpage_disallowed) +@@ -530,11 +536,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); +- } else ++ } else { + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); ++ trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); ++ } + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; +@@ -740,6 +748,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + + tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); + young = 1; ++ ++ trace_kvm_age_page(iter.gfn, iter.level, slot, young); + } + + return young; +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-add-lockdep-when-setting-a-tdp-mmu-spte.patch b/queue-5.10/kvm-x86-mmu-add-lockdep-when-setting-a-tdp-mmu-spte.patch new file mode 100644 index 00000000000..bf607836150 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-add-lockdep-when-setting-a-tdp-mmu-spte.patch @@ -0,0 +1,41 @@ +From a57a6d516fabcd0ae433d70baa868ba5615722bb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:09 -0800 +Subject: KVM: x86/mmu: Add lockdep when setting a TDP MMU SPTE + +From: Ben Gardon + +[ Upstream commit 3a9a4aa5657471a02ffb7f9b7f3b7a468b3f257b ] + +Add lockdep to __tdp_mmu_set_spte to ensure that SPTEs are only modified +under the MMU lock. + +No functional change intended. 
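As a minimal sketch of the pattern only (hypothetical helper, not the KVM code): lockdep_assert_held() makes the locking rule explicit and, when lockdep is enabled, warns at runtime if a caller writes the entry without holding mmu_lock; it compiles away otherwise.

	static void set_spte_locked(struct kvm *kvm, u64 *sptep, u64 new_spte)
	{
		/* Complain (under lockdep) if the caller does not hold mmu_lock. */
		lockdep_assert_held(&kvm->mmu_lock);

		/* Publish the new SPTE value with a single store. */
		WRITE_ONCE(*sptep, new_spte);
	}
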
+ +Reviewed-by: Peter Feiner +Reviewed-by: Sean Christopherson +Acked-by: Paolo Bonzini +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-4-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 61be95c6db20..ad9f8f187045 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -363,6 +363,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + ++ lockdep_assert_held(&kvm->mmu_lock); ++ + WRITE_ONCE(*iter->sptep, new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-change-tdp-mmu-yield-function-returns-to.patch b/queue-5.10/kvm-x86-mmu-change-tdp-mmu-yield-function-returns-to.patch new file mode 100644 index 00000000000..c0f5e8b8d7d --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-change-tdp-mmu-yield-function-returns-to.patch @@ -0,0 +1,113 @@ +From 39ee9ada40ec00e88a1e2d200d5a27f5cc8ebe73 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:07 -0800 +Subject: KVM: x86/mmu: change TDP MMU yield function returns to match + cond_resched + +From: Ben Gardon + +[ Upstream commit e28a436ca4f65384cceaf3f4da0e00aa74244e6a ] + +Currently the TDP MMU yield / cond_resched functions either return +nothing or return true if the TLBs were not flushed. These are confusing +semantics, especially when making control flow decisions in calling +functions. + +To clean things up, change both functions to have the same +return value semantics as cond_resched: true if the thread yielded, +false if it did not. If the function yielded in the _flush_ version, +then the TLBs will have been flushed. + +Reviewed-by: Peter Feiner +Acked-by: Paolo Bonzini +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-2-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 39 ++++++++++++++++++++++++++++---------- + 1 file changed, 29 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index ffa0bd0e033f..22efd016f05e 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -405,8 +405,15 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + _mmu->shadow_root_level, _start, _end) + + /* +- * Flush the TLB if the process should drop kvm->mmu_lock. +- * Return whether the caller still needs to flush the tlb. ++ * Flush the TLB and yield if the MMU lock is contended or this thread needs to ++ * return control to the scheduler. ++ * ++ * If this function yields, it will also reset the tdp_iter's walk over the ++ * paging structure and the calling function should allow the iterator to ++ * continue its traversal from the paging structure root. ++ * ++ * Return true if this function yielded, the TLBs were flushed, and the ++ * iterator's traversal was reset. Return false if a yield was not needed. 
+ */ + static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) + { +@@ -414,18 +421,32 @@ static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); +- return false; +- } else { + return true; + } ++ ++ return false; + } + +-static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) ++/* ++ * Yield if the MMU lock is contended or this thread needs to return control ++ * to the scheduler. ++ * ++ * If this function yields, it will also reset the tdp_iter's walk over the ++ * paging structure and the calling function should allow the iterator to ++ * continue its traversal from the paging structure root. ++ * ++ * Return true if this function yielded and the iterator's traversal was reset. ++ * Return false if a yield was not needed. ++ */ ++static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) + { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); ++ return true; + } ++ ++ return false; + } + + /* +@@ -461,10 +482,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + + tdp_mmu_set_spte(kvm, &iter, 0); + +- if (can_yield) +- flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); +- else +- flush_needed = true; ++ flush_needed = !can_yield || ++ !tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + return flush_needed; + } +@@ -1061,7 +1080,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, + + tdp_mmu_set_spte(kvm, &iter, 0); + +- spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); ++ spte_set = !tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + + if (spte_set) +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-don-t-redundantly-clear-tdp-mmu-pt-memor.patch b/queue-5.10/kvm-x86-mmu-don-t-redundantly-clear-tdp-mmu-pt-memor.patch new file mode 100644 index 00000000000..127c20f579f --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-don-t-redundantly-clear-tdp-mmu-pt-memor.patch @@ -0,0 +1,41 @@ +From 7ad0aa324fd19e916eae8be382c98efee1b759c6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:10 -0800 +Subject: KVM: x86/mmu: Don't redundantly clear TDP MMU pt memory + +From: Ben Gardon + +[ Upstream commit 734e45b329d626d2c14e2bcf8be3d069a33c3316 ] + +The KVM MMU caches already guarantee that shadow page table memory will +be zeroed, so there is no reason to re-zero the page in the TDP MMU page +fault handler. + +No functional change intended. 
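As a minimal sketch of why the extra clear is redundant (hypothetical helper, not the KVM cache code), assuming the backing page is allocated with __GFP_ZERO as the MMU caches guarantee:

	static u64 *alloc_pt_prezeroed(void)
	{
		/* __GFP_ZERO makes the allocator return an already-zeroed page. */
		u64 *pt = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);

		if (!pt)
			return NULL;

		/* A clear_page(pt) here would only redo work already done. */
		return pt;
	}
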
+ +Reviewed-by: Peter Feiner +Reviewed-by: Sean Christopherson +Acked-by: Paolo Bonzini +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-5-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index f88404033e0c..136311be5890 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -706,7 +706,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; +- clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-ensure-forward-progress-when-yielding-in.patch b/queue-5.10/kvm-x86-mmu-ensure-forward-progress-when-yielding-in.patch new file mode 100644 index 00000000000..a5c802a06f1 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-ensure-forward-progress-when-yielding-in.patch @@ -0,0 +1,147 @@ +From 102585bd2c96f22c43456229271e8b34177a09db Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:19 -0800 +Subject: KVM: x86/mmu: Ensure forward progress when yielding in TDP MMU iter + +From: Ben Gardon + +[ Upstream commit ed5e484b79e8a9b8be714bd85b6fc70bd6dc99a7 ] + +In some functions the TDP iter risks not making forward progress if two +threads livelock yielding to one another. This is possible if two threads +are trying to execute wrprot_gfn_range. Each could write protect an entry +and then yield. This would reset the tdp_iter's walk over the paging +structure and the loop would end up repeating the same entry over and +over, preventing either thread from making forward progress. + +Fix this issue by only yielding if the loop has made forward progress +since the last yield. + +Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") +Reviewed-by: Peter Feiner +Signed-off-by: Ben Gardon + +Message-Id: <20210202185734.1680553-14-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_iter.c | 18 +----------------- + arch/x86/kvm/mmu/tdp_iter.h | 7 ++++++- + arch/x86/kvm/mmu/tdp_mmu.c | 21 ++++++++++++++++----- + 3 files changed, 23 insertions(+), 23 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c +index 9917c55b7d24..1a09d212186b 100644 +--- a/arch/x86/kvm/mmu/tdp_iter.c ++++ b/arch/x86/kvm/mmu/tdp_iter.c +@@ -31,6 +31,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + + iter->next_last_level_gfn = next_last_level_gfn; ++ iter->yielded_gfn = iter->next_last_level_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; +@@ -158,23 +159,6 @@ void tdp_iter_next(struct tdp_iter *iter) + iter->valid = false; + } + +-/* +- * Restart the walk over the paging structure from the root, starting from the +- * highest gfn the iterator had previously reached. Assumes that the entire +- * paging structure, except the root page, may have been completely torn down +- * and rebuilt. 
+- */ +-void tdp_iter_refresh_walk(struct tdp_iter *iter) +-{ +- gfn_t next_last_level_gfn = iter->next_last_level_gfn; +- +- if (iter->gfn > next_last_level_gfn) +- next_last_level_gfn = iter->gfn; +- +- tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], +- iter->root_level, iter->min_level, next_last_level_gfn); +-} +- + u64 *tdp_iter_root_pt(struct tdp_iter *iter) + { + return iter->pt_path[iter->root_level - 1]; +diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h +index b2dd269c631f..d480c540ee27 100644 +--- a/arch/x86/kvm/mmu/tdp_iter.h ++++ b/arch/x86/kvm/mmu/tdp_iter.h +@@ -16,6 +16,12 @@ struct tdp_iter { + * for this GFN. + */ + gfn_t next_last_level_gfn; ++ /* ++ * The next_last_level_gfn at the time when the thread last ++ * yielded. Only yielding when the next_last_level_gfn != ++ * yielded_gfn helps ensure forward progress. ++ */ ++ gfn_t yielded_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ +@@ -54,7 +60,6 @@ u64 *spte_to_child_pt(u64 pte, int level); + void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t next_last_level_gfn); + void tdp_iter_next(struct tdp_iter *iter); +-void tdp_iter_refresh_walk(struct tdp_iter *iter); + u64 *tdp_iter_root_pt(struct tdp_iter *iter); + + #endif /* __KVM_X86_MMU_TDP_ITER_H */ +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 3b14d0008f92..f0bc5d3ce3d4 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -412,8 +412,9 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + * TLB flush before yielding. + * + * If this function yields, it will also reset the tdp_iter's walk over the +- * paging structure and the calling function should allow the iterator to +- * continue its traversal from the paging structure root. ++ * paging structure and the calling function should skip to the next ++ * iteration to allow the iterator to continue its traversal from the ++ * paging structure root. + * + * Return true if this function yielded and the iterator's traversal was reset. + * Return false if a yield was not needed. +@@ -421,12 +422,22 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, bool flush) + { ++ /* Ensure forward progress has been made before yielding. 
*/ ++ if (iter->next_last_level_gfn == iter->yielded_gfn) ++ return false; ++ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush) + kvm_flush_remote_tlbs(kvm); + + cond_resched_lock(&kvm->mmu_lock); +- tdp_iter_refresh_walk(iter); ++ ++ WARN_ON(iter->gfn > iter->next_last_level_gfn); ++ ++ tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], ++ iter->root_level, iter->min_level, ++ iter->next_last_level_gfn); ++ + return true; + } + +@@ -466,8 +477,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + + tdp_mmu_set_spte(kvm, &iter, 0); + +- flush_needed = !can_yield || +- !tdp_mmu_iter_cond_resched(kvm, &iter, true); ++ flush_needed = !(can_yield && ++ tdp_mmu_iter_cond_resched(kvm, &iter, true)); + } + return flush_needed; + } +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-ensure-tlbs-are-flushed-for-tdp-mmu-duri.patch b/queue-5.10/kvm-x86-mmu-ensure-tlbs-are-flushed-for-tdp-mmu-duri.patch new file mode 100644 index 00000000000..347e98bf1ad --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-ensure-tlbs-are-flushed-for-tdp-mmu-duri.patch @@ -0,0 +1,68 @@ +From 9122666303232b5864b8e7926426658024fc61a0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Mar 2021 13:01:18 -0700 +Subject: KVM: x86/mmu: Ensure TLBs are flushed for TDP MMU during NX zapping + +From: Sean Christopherson + +[ Upstream commit 048f49809c526348775425420fb5b8e84fd9a133 ] + +Honor the "flush needed" return from kvm_tdp_mmu_zap_gfn_range(), which +does the flush itself if and only if it yields (which it will never do in +this particular scenario), and otherwise expects the caller to do the +flush. If pages are zapped from the TDP MMU but not the legacy MMU, then +no flush will occur. + +Fixes: 29cf0f5007a2 ("kvm: x86/mmu: NX largepage recovery for TDP MMU") +Cc: stable@vger.kernel.org +Cc: Ben Gardon +Signed-off-by: Sean Christopherson +Message-Id: <20210325200119.1359384-3-seanjc@google.com> +Reviewed-by: Ben Gardon +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 94e6bf004576..e69248820d01 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -5972,6 +5972,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); ++ bool flush = false; ++ gfn_t gfn_end; + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); +@@ -5993,19 +5995,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + if (is_tdp_mmu_page(sp)) { +- kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, +- sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); ++ gfn_end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); ++ flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, gfn_end); + } else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { +- kvm_mmu_commit_zap_page(kvm, &invalid_list); ++ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + cond_resched_lock(&kvm->mmu_lock); ++ flush = false; + } + } +- kvm_mmu_commit_zap_page(kvm, &invalid_list); ++ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-ensure-tlbs-are-flushed-when-yielding-du.patch 
b/queue-5.10/kvm-x86-mmu-ensure-tlbs-are-flushed-when-yielding-du.patch new file mode 100644 index 00000000000..cd096f54e2e --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-ensure-tlbs-are-flushed-when-yielding-du.patch @@ -0,0 +1,114 @@ +From e3018f2b4b268fb9b87ae49094c79601ad19c556 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Mar 2021 13:01:17 -0700 +Subject: KVM: x86/mmu: Ensure TLBs are flushed when yielding during GFN range + zap + +From: Sean Christopherson + +[ Upstream commit a835429cda91621fca915d80672a157b47738afb ] + +When flushing a range of GFNs across multiple roots, ensure any pending +flush from a previous root is honored before yielding while walking the +tables of the current root. + +Note, kvm_tdp_mmu_zap_gfn_range() now intentionally overwrites its local +"flush" with the result to avoid redundant flushes. zap_gfn_range() +preserves and return the incoming "flush", unless of course the flush was +performed prior to yielding and no new flush was triggered. + +Fixes: 1af4a96025b3 ("KVM: x86/mmu: Yield in TDU MMU iter even if no SPTES changed") +Cc: stable@vger.kernel.org +Reviewed-by: Ben Gardon +Signed-off-by: Sean Christopherson +Message-Id: <20210325200119.1359384-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index a54a9ed979d1..34ef3e1a0f84 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -111,7 +111,7 @@ bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) + } + + static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, +- gfn_t start, gfn_t end, bool can_yield); ++ gfn_t start, gfn_t end, bool can_yield, bool flush); + + void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) + { +@@ -124,7 +124,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) + + list_del(&root->link); + +- zap_gfn_range(kvm, root, 0, max_gfn, false); ++ zap_gfn_range(kvm, root, 0, max_gfn, false, false); + + free_page((unsigned long)root->spt); + kmem_cache_free(mmu_page_header_cache, root); +@@ -504,20 +504,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, + * scheduler needs the CPU or there is contention on the MMU lock. If this + * function cannot yield, it will not release the MMU lock or reschedule and + * the caller must ensure it does not supply too large a GFN range, or the +- * operation can cause a soft lockup. ++ * operation can cause a soft lockup. Note, in some use cases a flush may be ++ * required by prior actions. Ensure the pending flush is performed prior to ++ * yielding. 
+ */ + static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, +- gfn_t start, gfn_t end, bool can_yield) ++ gfn_t start, gfn_t end, bool can_yield, bool flush) + { + struct tdp_iter iter; +- bool flush_needed = false; + + rcu_read_lock(); + + tdp_root_for_each_pte(iter, root, start, end) { + if (can_yield && +- tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { +- flush_needed = false; ++ tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { ++ flush = false; + continue; + } + +@@ -535,11 +536,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); +- flush_needed = true; ++ flush = true; + } + + rcu_read_unlock(); +- return flush_needed; ++ return flush; + } + + /* +@@ -554,7 +555,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) + bool flush = false; + + for_each_tdp_mmu_root_yield_safe(kvm, root) +- flush |= zap_gfn_range(kvm, root, start, end, true); ++ flush = zap_gfn_range(kvm, root, start, end, true, flush); + + return flush; + } +@@ -757,7 +758,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long unused) + { +- return zap_gfn_range(kvm, root, start, end, false); ++ return zap_gfn_range(kvm, root, start, end, false, false); + } + + int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-factor-out-functions-to-add-remove-tdp-m.patch b/queue-5.10/kvm-x86-mmu-factor-out-functions-to-add-remove-tdp-m.patch new file mode 100644 index 00000000000..ec10d4841c6 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-factor-out-functions-to-add-remove-tdp-m.patch @@ -0,0 +1,103 @@ +From 2504553be669a5b2f12575b0d7420c35196cf0eb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:25 -0800 +Subject: KVM: x86/mmu: Factor out functions to add/remove TDP MMU pages + +From: Ben Gardon + +[ Upstream commit a9442f594147f95307f691cfba0c31e25dc79b9d ] + +Move the work of adding and removing TDP MMU pages to/from "secondary" +data structures to helper functions. These functions will be built on in +future commits to enable MMU operations to proceed (mostly) in parallel. + +No functional change expected. + +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-20-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 47 +++++++++++++++++++++++++++++++------- + 1 file changed, 39 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 136311be5890..14d69c01c710 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -262,6 +262,39 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + } + } + ++/** ++ * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU ++ * ++ * @kvm: kvm instance ++ * @sp: the new page ++ * @account_nx: This page replaces a NX large page and should be marked for ++ * eventual reclaim. 
++ */ ++static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, ++ bool account_nx) ++{ ++ lockdep_assert_held_write(&kvm->mmu_lock); ++ ++ list_add(&sp->link, &kvm->arch.tdp_mmu_pages); ++ if (account_nx) ++ account_huge_nx_page(kvm, sp); ++} ++ ++/** ++ * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU ++ * ++ * @kvm: kvm instance ++ * @sp: the page to be removed ++ */ ++static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++{ ++ lockdep_assert_held_write(&kvm->mmu_lock); ++ ++ list_del(&sp->link); ++ if (sp->lpage_disallowed) ++ unaccount_huge_nx_page(kvm, sp); ++} ++ + /** + * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * +@@ -281,10 +314,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) + + trace_kvm_mmu_prepare_zap_page(sp); + +- list_del(&sp->link); +- +- if (sp->lpage_disallowed) +- unaccount_huge_nx_page(kvm, sp); ++ tdp_mmu_unlink_page(kvm, sp); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); +@@ -704,15 +734,16 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + + if (!is_shadow_present_pte(iter.old_spte)) { + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); +- list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; ++ ++ tdp_mmu_link_page(vcpu->kvm, sp, ++ huge_page_disallowed && ++ req_level >= iter.level); ++ + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + trace_kvm_mmu_get_page(sp, true); +- if (huge_page_disallowed && req_level >= iter.level) +- account_huge_nx_page(vcpu->kvm, sp); +- + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-factor-out-handling-of-removed-page-tabl.patch b/queue-5.10/kvm-x86-mmu-factor-out-handling-of-removed-page-tabl.patch new file mode 100644 index 00000000000..113248e0fbd --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-factor-out-handling-of-removed-page-tabl.patch @@ -0,0 +1,125 @@ +From 342da545328248ab8ea49095d4e9009f77505587 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:11 -0800 +Subject: KVM: x86/mmu: Factor out handling of removed page tables + +From: Ben Gardon + +[ Upstream commit a066e61f13cf4b17d043ad8bea0cdde2b1e5ee49 ] + +Factor out the code to handle a disconnected subtree of the TDP paging +structure from the code to handle the change to an individual SPTE. +Future commits will build on this to allow asynchronous page freeing. + +No functional change intended. + +Reviewed-by: Peter Feiner +Acked-by: Paolo Bonzini +Signed-off-by: Ben Gardon + +Message-Id: <20210202185734.1680553-6-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 71 ++++++++++++++++++++++---------------- + 1 file changed, 42 insertions(+), 29 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index ad9f8f187045..f52a22bc0fe8 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -234,6 +234,45 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + } + } + ++/** ++ * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure ++ * ++ * @kvm: kvm instance ++ * @pt: the page removed from the paging structure ++ * ++ * Given a page table that has been removed from the TDP paging structure, ++ * iterates through the page table to clear SPTEs and free child page tables. 
++ */ ++static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) ++{ ++ struct kvm_mmu_page *sp = sptep_to_sp(pt); ++ int level = sp->role.level; ++ gfn_t gfn = sp->gfn; ++ u64 old_child_spte; ++ int i; ++ ++ trace_kvm_mmu_prepare_zap_page(sp); ++ ++ list_del(&sp->link); ++ ++ if (sp->lpage_disallowed) ++ unaccount_huge_nx_page(kvm, sp); ++ ++ for (i = 0; i < PT64_ENT_PER_PAGE; i++) { ++ old_child_spte = READ_ONCE(*(pt + i)); ++ WRITE_ONCE(*(pt + i), 0); ++ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), ++ gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), ++ old_child_spte, 0, level - 1); ++ } ++ ++ kvm_flush_remote_tlbs_with_address(kvm, gfn, ++ KVM_PAGES_PER_HPAGE(level)); ++ ++ free_page((unsigned long)pt); ++ kmem_cache_free(mmu_page_header_cache, sp); ++} ++ + /** + * handle_changed_spte - handle bookkeeping associated with an SPTE change + * @kvm: kvm instance +@@ -254,10 +293,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); +- u64 *pt; +- struct kvm_mmu_page *sp; +- u64 old_child_spte; +- int i; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); +@@ -319,31 +354,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + * Recursively handle child PTs if the change removed a subtree from + * the paging structure. + */ +- if (was_present && !was_leaf && (pfn_changed || !is_present)) { +- pt = spte_to_child_pt(old_spte, level); +- sp = sptep_to_sp(pt); +- +- trace_kvm_mmu_prepare_zap_page(sp); +- +- list_del(&sp->link); +- +- if (sp->lpage_disallowed) +- unaccount_huge_nx_page(kvm, sp); +- +- for (i = 0; i < PT64_ENT_PER_PAGE; i++) { +- old_child_spte = READ_ONCE(*(pt + i)); +- WRITE_ONCE(*(pt + i), 0); +- handle_changed_spte(kvm, as_id, +- gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), +- old_child_spte, 0, level - 1); +- } +- +- kvm_flush_remote_tlbs_with_address(kvm, gfn, +- KVM_PAGES_PER_HPAGE(level)); +- +- free_page((unsigned long)pt); +- kmem_cache_free(mmu_page_header_cache, sp); +- } ++ if (was_present && !was_leaf && (pfn_changed || !is_present)) ++ handle_removed_tdp_mmu_page(kvm, ++ spte_to_child_pt(old_spte, level)); + } + + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-fix-braces-in-kvm_recover_nx_lpages.patch b/queue-5.10/kvm-x86-mmu-fix-braces-in-kvm_recover_nx_lpages.patch new file mode 100644 index 00000000000..f171e897c64 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-fix-braces-in-kvm_recover_nx_lpages.patch @@ -0,0 +1,40 @@ +From 6687b6cd3c04a062f341c8c823051a2679167fa9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:15 -0800 +Subject: KVM: x86/mmu: Fix braces in kvm_recover_nx_lpages + +From: Ben Gardon + +[ Upstream commit 8d1a182ea791f0111b0258c8f3eb8d77af0a8386 ] + +No functional change intended. 
+ +Fixes: 29cf0f5007a2 ("kvm: x86/mmu: NX largepage recovery for TDP MMU") +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-10-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index dacbd13d32c6..0f45ad05f895 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -5992,10 +5992,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); +- if (sp->tdp_mmu_page) ++ if (sp->tdp_mmu_page) { + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); +- else { ++ } else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-merge-flush-and-non-flush-tdp_mmu_iter_c.patch b/queue-5.10/kvm-x86-mmu-merge-flush-and-non-flush-tdp_mmu_iter_c.patch new file mode 100644 index 00000000000..b68772962e9 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-merge-flush-and-non-flush-tdp_mmu_iter_c.patch @@ -0,0 +1,125 @@ +From 2e2b695fb5d990619460c0f8515c25fadd44634e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:17 -0800 +Subject: KVM: x86/mmu: Merge flush and non-flush tdp_mmu_iter_cond_resched + +From: Ben Gardon + +[ Upstream commit e139a34ef9d5627a41e1c02210229082140d1f92 ] + +The flushing and non-flushing variants of tdp_mmu_iter_cond_resched have +almost identical implementations. Merge the two functions and add a +flush parameter. + +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-12-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 42 ++++++++++++-------------------------- + 1 file changed, 13 insertions(+), 29 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 22efd016f05e..3b14d0008f92 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -404,33 +404,13 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + +-/* +- * Flush the TLB and yield if the MMU lock is contended or this thread needs to +- * return control to the scheduler. +- * +- * If this function yields, it will also reset the tdp_iter's walk over the +- * paging structure and the calling function should allow the iterator to +- * continue its traversal from the paging structure root. +- * +- * Return true if this function yielded, the TLBs were flushed, and the +- * iterator's traversal was reset. Return false if a yield was not needed. +- */ +-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +-{ +- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { +- kvm_flush_remote_tlbs(kvm); +- cond_resched_lock(&kvm->mmu_lock); +- tdp_iter_refresh_walk(iter); +- return true; +- } +- +- return false; +-} +- + /* + * Yield if the MMU lock is contended or this thread needs to return control + * to the scheduler. + * ++ * If this function should yield and flush is set, it will perform a remote ++ * TLB flush before yielding. 
++ * + * If this function yields, it will also reset the tdp_iter's walk over the + * paging structure and the calling function should allow the iterator to + * continue its traversal from the paging structure root. +@@ -438,9 +418,13 @@ static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it + * Return true if this function yielded and the iterator's traversal was reset. + * Return false if a yield was not needed. + */ +-static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) ++static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, ++ struct tdp_iter *iter, bool flush) + { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { ++ if (flush) ++ kvm_flush_remote_tlbs(kvm); ++ + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + return true; +@@ -483,7 +467,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte(kvm, &iter, 0); + + flush_needed = !can_yield || +- !tdp_mmu_iter_flush_cond_resched(kvm, &iter); ++ !tdp_mmu_iter_cond_resched(kvm, &iter, true); + } + return flush_needed; + } +@@ -852,7 +836,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + +- tdp_mmu_iter_cond_resched(kvm, &iter); ++ tdp_mmu_iter_cond_resched(kvm, &iter, false); + } + return spte_set; + } +@@ -911,7 +895,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + +- tdp_mmu_iter_cond_resched(kvm, &iter); ++ tdp_mmu_iter_cond_resched(kvm, &iter, false); + } + return spte_set; + } +@@ -1027,7 +1011,7 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + +- tdp_mmu_iter_cond_resched(kvm, &iter); ++ tdp_mmu_iter_cond_resched(kvm, &iter, false); + } + + return spte_set; +@@ -1080,7 +1064,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, + + tdp_mmu_set_spte(kvm, &iter, 0); + +- spte_set = !tdp_mmu_iter_flush_cond_resched(kvm, &iter); ++ spte_set = !tdp_mmu_iter_cond_resched(kvm, &iter, true); + } + + if (spte_set) +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-protect-tdp-mmu-page-table-memory-with-r.patch b/queue-5.10/kvm-x86-mmu-protect-tdp-mmu-page-table-memory-with-r.patch new file mode 100644 index 00000000000..20e98c48c9c --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-protect-tdp-mmu-page-table-memory-with-r.patch @@ -0,0 +1,505 @@ +From f6946783dd38dbc2bd758be27a17a0d47a563f25 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:23 -0800 +Subject: KVM: x86/mmu: Protect TDP MMU page table memory with RCU + +From: Ben Gardon + +[ Upstream commit 7cca2d0b7e7d9f3cd740d41afdc00051c9b508a0 ] + +In order to enable concurrent modifications to the paging structures in +the TDP MMU, threads must be able to safely remove pages of page table +memory while other threads are traversing the same memory. To ensure +threads do not access PT memory after it is freed, protect PT memory +with RCU. + +Protecting concurrent accesses to page table memory from use-after-free +bugs could also have been acomplished using +walk_shadow_page_lockless_begin/end() and READING_SHADOW_PAGE_TABLES, +coupling with the barriers in a TLB flush. The use of RCU for this case +has several distinct advantages over that approach. +1. Disabling interrupts for long running operations is not desirable. 
+ Future commits will allow operations besides page faults to operate + without the exclusive protection of the MMU lock and those operations + are too long to disable iterrupts for their duration. +2. The use of RCU here avoids long blocking / spinning operations in + perfromance critical paths. By freeing memory with an asynchronous + RCU API we avoid the longer wait times TLB flushes experience when + overlapping with a thread in walk_shadow_page_lockless_begin/end(). +3. RCU provides a separation of concerns when removing memory from the + paging structure. Because the RCU callback to free memory can be + scheduled immediately after a TLB flush, there's no need for the + thread to manually free a queue of pages later, as commit_zap_pages + does. + +Fixes: 95fb5b0258b7 ("kvm: x86/mmu: Support MMIO in the TDP MMU") +Reviewed-by: Peter Feiner +Suggested-by: Sean Christopherson +Signed-off-by: Ben Gardon + +Message-Id: <20210202185734.1680553-18-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/mmu_internal.h | 3 ++ + arch/x86/kvm/mmu/tdp_iter.c | 16 +++--- + arch/x86/kvm/mmu/tdp_iter.h | 10 ++-- + arch/x86/kvm/mmu/tdp_mmu.c | 95 +++++++++++++++++++++++++++++---- + 4 files changed, 103 insertions(+), 21 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h +index bfc6389edc28..7f599cc64178 100644 +--- a/arch/x86/kvm/mmu/mmu_internal.h ++++ b/arch/x86/kvm/mmu/mmu_internal.h +@@ -57,6 +57,9 @@ struct kvm_mmu_page { + atomic_t write_flooding_count; + + bool tdp_mmu_page; ++ ++ /* Used for freeing the page asyncronously if it is a TDP MMU page. */ ++ struct rcu_head rcu_head; + }; + + extern struct kmem_cache *mmu_page_header_cache; +diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c +index 1a09d212186b..e5f148106e20 100644 +--- a/arch/x86/kvm/mmu/tdp_iter.c ++++ b/arch/x86/kvm/mmu/tdp_iter.c +@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) + { + iter->sptep = iter->pt_path[iter->level - 1] + + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); +- iter->old_spte = READ_ONCE(*iter->sptep); ++ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); + } + + static gfn_t round_gfn_for_level(gfn_t gfn, int level) +@@ -35,7 +35,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; +- iter->pt_path[iter->level - 1] = root_pt; ++ iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); +@@ -48,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + * address of the child page table referenced by the SPTE. Returns null if + * there is no such entry. 
+ */ +-u64 *spte_to_child_pt(u64 spte, int level) ++tdp_ptep_t spte_to_child_pt(u64 spte, int level) + { + /* + * There's no child entry if this entry isn't present or is a +@@ -57,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level) + if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) + return NULL; + +- return __va(spte_to_pfn(spte) << PAGE_SHIFT); ++ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); + } + + /* +@@ -66,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level) + */ + static bool try_step_down(struct tdp_iter *iter) + { +- u64 *child_pt; ++ tdp_ptep_t child_pt; + + if (iter->level == iter->min_level) + return false; +@@ -75,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter) + * Reread the SPTE before stepping down to avoid traversing into page + * tables that are no longer linked from this entry. + */ +- iter->old_spte = READ_ONCE(*iter->sptep); ++ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); + + child_pt = spte_to_child_pt(iter->old_spte, iter->level); + if (!child_pt) +@@ -109,7 +109,7 @@ static bool try_step_side(struct tdp_iter *iter) + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); + iter->next_last_level_gfn = iter->gfn; + iter->sptep++; +- iter->old_spte = READ_ONCE(*iter->sptep); ++ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); + + return true; + } +@@ -159,7 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter) + iter->valid = false; + } + +-u64 *tdp_iter_root_pt(struct tdp_iter *iter) ++tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) + { + return iter->pt_path[iter->root_level - 1]; + } +diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h +index d480c540ee27..4cc177d75c4a 100644 +--- a/arch/x86/kvm/mmu/tdp_iter.h ++++ b/arch/x86/kvm/mmu/tdp_iter.h +@@ -7,6 +7,8 @@ + + #include "mmu.h" + ++typedef u64 __rcu *tdp_ptep_t; ++ + /* + * A TDP iterator performs a pre-order walk over a TDP paging structure. + */ +@@ -23,9 +25,9 @@ struct tdp_iter { + */ + gfn_t yielded_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ +- u64 *pt_path[PT64_ROOT_MAX_LEVEL]; ++ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ +- u64 *sptep; ++ tdp_ptep_t sptep; + /* The lowest GFN mapped by the current SPTE */ + gfn_t gfn; + /* The level of the root page given to the iterator */ +@@ -55,11 +57,11 @@ struct tdp_iter { + #define for_each_tdp_pte(iter, root, root_level, start, end) \ + for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) + +-u64 *spte_to_child_pt(u64 pte, int level); ++tdp_ptep_t spte_to_child_pt(u64 pte, int level); + + void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t next_last_level_gfn); + void tdp_iter_next(struct tdp_iter *iter); +-u64 *tdp_iter_root_pt(struct tdp_iter *iter); ++tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); + + #endif /* __KVM_X86_MMU_TDP_ITER_H */ +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index f52a22bc0fe8..a54a9ed979d1 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -42,6 +42,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) + return; + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); ++ ++ /* ++ * Ensure that all the outstanding RCU callbacks to free shadow pages ++ * can run before the VM is torn down. 
++ */ ++ rcu_barrier(); + } + + static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +@@ -196,6 +202,28 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) + return __pa(root->spt); + } + ++static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) ++{ ++ free_page((unsigned long)sp->spt); ++ kmem_cache_free(mmu_page_header_cache, sp); ++} ++ ++/* ++ * This is called through call_rcu in order to free TDP page table memory ++ * safely with respect to other kernel threads that may be operating on ++ * the memory. ++ * By only accessing TDP MMU page table memory in an RCU read critical ++ * section, and freeing it after a grace period, lockless access to that ++ * memory won't use it after it is freed. ++ */ ++static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) ++{ ++ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, ++ rcu_head); ++ ++ tdp_mmu_free_sp(sp); ++} ++ + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level); + +@@ -269,8 +297,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + +- free_page((unsigned long)pt); +- kmem_cache_free(mmu_page_header_cache, sp); ++ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); + } + + /** +@@ -372,13 +399,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) + { +- u64 *root_pt = tdp_iter_root_pt(iter); ++ tdp_ptep_t root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + lockdep_assert_held(&kvm->mmu_lock); + +- WRITE_ONCE(*iter->sptep, new_spte); ++ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); +@@ -448,10 +475,13 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, + return false; + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { ++ rcu_read_unlock(); ++ + if (flush) + kvm_flush_remote_tlbs(kvm); + + cond_resched_lock(&kvm->mmu_lock); ++ rcu_read_lock(); + + WARN_ON(iter->gfn > iter->next_last_level_gfn); + +@@ -482,6 +512,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + struct tdp_iter iter; + bool flush_needed = false; + ++ rcu_read_lock(); ++ + tdp_root_for_each_pte(iter, root, start, end) { + if (can_yield && + tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { +@@ -505,6 +537,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte(kvm, &iter, 0); + flush_needed = true; + } ++ ++ rcu_read_unlock(); + return flush_needed; + } + +@@ -550,13 +584,15 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); +- trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); ++ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, ++ new_spte); + } else { + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); +- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); ++ trace_kvm_mmu_set_spte(iter->level, iter->gfn, ++ rcu_dereference(iter->sptep)); + } + + if (new_spte == iter->old_spte) +@@ -579,7 +615,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu 
*vcpu, int write, + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + +- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); ++ trace_kvm_mmu_set_spte(iter->level, iter->gfn, ++ rcu_dereference(iter->sptep)); + if (!prefault) + vcpu->stat.pf_fixed++; + +@@ -617,6 +654,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + huge_page_disallowed, &req_level); + + trace_kvm_mmu_spte_requested(gpa, level, pfn); ++ ++ rcu_read_lock(); ++ + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, +@@ -642,7 +682,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + * because the new value informs the !present + * path below. + */ +- iter.old_spte = READ_ONCE(*iter.sptep); ++ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + } + + if (!is_shadow_present_pte(iter.old_spte)) { +@@ -661,11 +701,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + } + } + +- if (WARN_ON(iter.level != level)) ++ if (WARN_ON(iter.level != level)) { ++ rcu_read_unlock(); + return RET_PF_RETRY; ++ } + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); ++ rcu_read_unlock(); + + return ret; + } +@@ -736,6 +779,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + int young = 0; + u64 new_spte = 0; + ++ rcu_read_lock(); ++ + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * If we have a non-accessed entry we don't need to change the +@@ -767,6 +812,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + trace_kvm_age_page(iter.gfn, iter.level, slot, young); + } + ++ rcu_read_unlock(); ++ + return young; + } + +@@ -812,6 +859,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + u64 new_spte; + int need_flush = 0; + ++ rcu_read_lock(); ++ + WARN_ON(pte_huge(*ptep)); + + new_pfn = pte_pfn(*ptep); +@@ -840,6 +889,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + if (need_flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + ++ rcu_read_unlock(); ++ + return 0; + } + +@@ -863,6 +914,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + u64 new_spte; + bool spte_set = false; + ++ rcu_read_lock(); ++ + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, +@@ -879,6 +932,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + } ++ ++ rcu_read_unlock(); + return spte_set; + } + +@@ -920,6 +975,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + u64 new_spte; + bool spte_set = false; + ++ rcu_read_lock(); ++ + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; +@@ -939,6 +996,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + } ++ ++ rcu_read_unlock(); + return spte_set; + } + +@@ -980,6 +1039,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, + struct tdp_iter iter; + u64 new_spte; + ++ rcu_read_lock(); ++ + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + gfn + BITS_PER_LONG) { + if (!mask) +@@ -1005,6 +1066,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct 
kvm_mmu_page *root, + + mask &= ~(1UL << (iter.gfn - gfn)); + } ++ ++ rcu_read_unlock(); + } + + /* +@@ -1044,6 +1107,8 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + u64 new_spte; + bool spte_set = false; + ++ rcu_read_lock(); ++ + tdp_root_for_each_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; +@@ -1057,6 +1122,7 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + spte_set = true; + } + ++ rcu_read_unlock(); + return spte_set; + } + +@@ -1094,6 +1160,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, + kvm_pfn_t pfn; + bool spte_set = false; + ++ rcu_read_lock(); ++ + tdp_root_for_each_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { + spte_set = false; +@@ -1115,6 +1183,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, + spte_set = true; + } + ++ rcu_read_unlock(); + if (spte_set) + kvm_flush_remote_tlbs(kvm); + } +@@ -1151,6 +1220,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + u64 new_spte; + bool spte_set = false; + ++ rcu_read_lock(); ++ + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + if (!is_writable_pte(iter.old_spte)) + break; +@@ -1162,6 +1233,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + spte_set = true; + } + ++ rcu_read_unlock(); ++ + return spte_set; + } + +@@ -1202,10 +1275,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + + *root_level = vcpu->arch.mmu->shadow_root_level; + ++ rcu_read_lock(); ++ + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + leaf = iter.level; + sptes[leaf - 1] = iter.old_spte; + } + ++ rcu_read_unlock(); ++ + return leaf; + } +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-rename-goal_gfn-to-next_last_level_gfn.patch b/queue-5.10/kvm-x86-mmu-rename-goal_gfn-to-next_last_level_gfn.patch new file mode 100644 index 00000000000..91f3b448d25 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-rename-goal_gfn-to-next_last_level_gfn.patch @@ -0,0 +1,114 @@ +From 53efb1b0b0e39141e285a49dc9eefacc08893d8c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:18 -0800 +Subject: KVM: x86/mmu: Rename goal_gfn to next_last_level_gfn + +From: Ben Gardon + +[ Upstream commit 74953d3530280dc53256054e1906f58d07bfba44 ] + +The goal_gfn field in tdp_iter can be misleading as it implies that it +is the iterator's final goal. It is really a target for the lowest gfn +mapped by the leaf level SPTE the iterator will traverse towards. Change +the field's name to be more precise. + +Signed-off-by: Ben Gardon +Message-Id: <20210202185734.1680553-13-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_iter.c | 20 ++++++++++---------- + arch/x86/kvm/mmu/tdp_iter.h | 4 ++-- + 2 files changed, 12 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c +index 87b7e16911db..9917c55b7d24 100644 +--- a/arch/x86/kvm/mmu/tdp_iter.c ++++ b/arch/x86/kvm/mmu/tdp_iter.c +@@ -22,21 +22,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) + + /* + * Sets a TDP iterator to walk a pre-order traversal of the paging structure +- * rooted at root_pt, starting with the walk to translate goal_gfn. ++ * rooted at root_pt, starting with the walk to translate next_last_level_gfn. 
+ */ + void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +- int min_level, gfn_t goal_gfn) ++ int min_level, gfn_t next_last_level_gfn) + { + WARN_ON(root_level < 1); + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + +- iter->goal_gfn = goal_gfn; ++ iter->next_last_level_gfn = next_last_level_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; + iter->pt_path[iter->level - 1] = root_pt; + +- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); ++ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +@@ -82,7 +82,7 @@ static bool try_step_down(struct tdp_iter *iter) + + iter->level--; + iter->pt_path[iter->level - 1] = child_pt; +- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); ++ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +@@ -106,7 +106,7 @@ static bool try_step_side(struct tdp_iter *iter) + return false; + + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); +- iter->goal_gfn = iter->gfn; ++ iter->next_last_level_gfn = iter->gfn; + iter->sptep++; + iter->old_spte = READ_ONCE(*iter->sptep); + +@@ -166,13 +166,13 @@ void tdp_iter_next(struct tdp_iter *iter) + */ + void tdp_iter_refresh_walk(struct tdp_iter *iter) + { +- gfn_t goal_gfn = iter->goal_gfn; ++ gfn_t next_last_level_gfn = iter->next_last_level_gfn; + +- if (iter->gfn > goal_gfn) +- goal_gfn = iter->gfn; ++ if (iter->gfn > next_last_level_gfn) ++ next_last_level_gfn = iter->gfn; + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], +- iter->root_level, iter->min_level, goal_gfn); ++ iter->root_level, iter->min_level, next_last_level_gfn); + } + + u64 *tdp_iter_root_pt(struct tdp_iter *iter) +diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h +index 47170d0dc98e..b2dd269c631f 100644 +--- a/arch/x86/kvm/mmu/tdp_iter.h ++++ b/arch/x86/kvm/mmu/tdp_iter.h +@@ -15,7 +15,7 @@ struct tdp_iter { + * The iterator will traverse the paging structure towards the mapping + * for this GFN. + */ +- gfn_t goal_gfn; ++ gfn_t next_last_level_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ +@@ -52,7 +52,7 @@ struct tdp_iter { + u64 *spte_to_child_pt(u64 pte, int level); + + void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +- int min_level, gfn_t goal_gfn); ++ int min_level, gfn_t next_last_level_gfn); + void tdp_iter_next(struct tdp_iter *iter); + void tdp_iter_refresh_walk(struct tdp_iter *iter); + u64 *tdp_iter_root_pt(struct tdp_iter *iter); +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-use-atomic-ops-to-set-sptes-in-tdp-mmu-m.patch b/queue-5.10/kvm-x86-mmu-use-atomic-ops-to-set-sptes-in-tdp-mmu-m.patch new file mode 100644 index 00000000000..3576f89d563 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-use-atomic-ops-to-set-sptes-in-tdp-mmu-m.patch @@ -0,0 +1,371 @@ +From c5835f6eef689c5931e6365feb547afc21bcab04 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:26 -0800 +Subject: KVM: x86/mmu: Use atomic ops to set SPTEs in TDP MMU map + +From: Ben Gardon + +[ Upstream commit 9a77daacc87dee9fd63e31243f21894132ed8407 ] + +To prepare for handling page faults in parallel, change the TDP MMU +page fault handler to use atomic operations to set SPTEs so that changes +are not lost if multiple threads attempt to modify the same SPTE. 
+ +Reviewed-by: Peter Feiner +Signed-off-by: Ben Gardon + +Message-Id: <20210202185734.1680553-21-bgardon@google.com> +[Document new locking rules. - Paolo] +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + Documentation/virt/kvm/locking.rst | 9 +- + arch/x86/include/asm/kvm_host.h | 13 +++ + arch/x86/kvm/mmu/tdp_mmu.c | 142 ++++++++++++++++++++++------- + 3 files changed, 130 insertions(+), 34 deletions(-) + +diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst +index b21a34c34a21..0aa4817b466d 100644 +--- a/Documentation/virt/kvm/locking.rst ++++ b/Documentation/virt/kvm/locking.rst +@@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows: + - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring + them together is quite rare. + +-On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. ++On x86: ++ ++- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock ++ ++- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is ++ taken inside kvm->arch.mmu_lock, and cannot be taken without already ++ holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise ++ there's no need to take kvm->arch.tdp_mmu_pages_lock at all). + + Everything else is a leaf: no other lock is taken inside the critical + sections. +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 02d4c74d30e2..47cd8f9b3fe7 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1014,6 +1014,19 @@ struct kvm_arch { + struct list_head tdp_mmu_roots; + /* List of struct tdp_mmu_pages not being used as roots */ + struct list_head tdp_mmu_pages; ++ ++ /* ++ * Protects accesses to the following fields when the MMU lock ++ * is held in read mode: ++ * - tdp_mmu_pages (above) ++ * - the link field of struct kvm_mmu_pages used by the TDP MMU ++ * - lpage_disallowed_mmu_pages ++ * - the lpage_disallowed_link field of struct kvm_mmu_pages used ++ * by the TDP MMU ++ * It is acceptable, but not necessary, to acquire this lock when ++ * the thread holds the MMU lock in write mode. ++ */ ++ spinlock_t tdp_mmu_pages_lock; + }; + + struct kvm_vm_stat { +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index 14d69c01c710..eb38f74af3f2 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -7,6 +7,7 @@ + #include "tdp_mmu.h" + #include "spte.h" + ++#include + #include + + #ifdef CONFIG_X86_64 +@@ -33,6 +34,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) + kvm->arch.tdp_mmu_enabled = true; + + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); ++ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + } + +@@ -225,7 +227,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) + } + + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, +- u64 old_spte, u64 new_spte, int level); ++ u64 old_spte, u64 new_spte, int level, ++ bool shared); + + static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) + { +@@ -267,17 +270,26 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + * + * @kvm: kvm instance + * @sp: the new page ++ * @shared: This operation may not be running under the exclusive use of ++ * the MMU lock and the operation must synchronize with other ++ * threads that might be adding or removing pages. + * @account_nx: This page replaces a NX large page and should be marked for + * eventual reclaim. 
+ */ + static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, +- bool account_nx) ++ bool shared, bool account_nx) + { +- lockdep_assert_held_write(&kvm->mmu_lock); ++ if (shared) ++ spin_lock(&kvm->arch.tdp_mmu_pages_lock); ++ else ++ lockdep_assert_held_write(&kvm->mmu_lock); + + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); ++ ++ if (shared) ++ spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + } + + /** +@@ -285,14 +297,24 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, + * + * @kvm: kvm instance + * @sp: the page to be removed ++ * @shared: This operation may not be running under the exclusive use of ++ * the MMU lock and the operation must synchronize with other ++ * threads that might be adding or removing pages. + */ +-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, ++ bool shared) + { +- lockdep_assert_held_write(&kvm->mmu_lock); ++ if (shared) ++ spin_lock(&kvm->arch.tdp_mmu_pages_lock); ++ else ++ lockdep_assert_held_write(&kvm->mmu_lock); + + list_del(&sp->link); + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); ++ ++ if (shared) ++ spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + } + + /** +@@ -300,28 +322,39 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp) + * + * @kvm: kvm instance + * @pt: the page removed from the paging structure ++ * @shared: This operation may not be running under the exclusive use ++ * of the MMU lock and the operation must synchronize with other ++ * threads that might be modifying SPTEs. + * + * Given a page table that has been removed from the TDP paging structure, + * iterates through the page table to clear SPTEs and free child page tables. + */ +-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) ++static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, ++ bool shared) + { + struct kvm_mmu_page *sp = sptep_to_sp(pt); + int level = sp->role.level; + gfn_t gfn = sp->gfn; + u64 old_child_spte; ++ u64 *sptep; + int i; + + trace_kvm_mmu_prepare_zap_page(sp); + +- tdp_mmu_unlink_page(kvm, sp); ++ tdp_mmu_unlink_page(kvm, sp, shared); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { +- old_child_spte = READ_ONCE(*(pt + i)); +- WRITE_ONCE(*(pt + i), 0); ++ sptep = pt + i; ++ ++ if (shared) { ++ old_child_spte = xchg(sptep, 0); ++ } else { ++ old_child_spte = READ_ONCE(*sptep); ++ WRITE_ONCE(*sptep, 0); ++ } + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), +- old_child_spte, 0, level - 1); ++ old_child_spte, 0, level - 1, shared); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, +@@ -338,12 +371,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) + * @old_spte: The value of the SPTE before the change + * @new_spte: The value of the SPTE after the change + * @level: the level of the PT the SPTE is part of in the paging structure ++ * @shared: This operation may not be running under the exclusive use of ++ * the MMU lock and the operation must synchronize with other ++ * threads that might be modifying SPTEs. + * + * Handle bookkeeping that might result from the modification of a SPTE. + * This function must be called for all TDP SPTE modifications. 
+ */ + static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, +- u64 old_spte, u64 new_spte, int level) ++ u64 old_spte, u64 new_spte, int level, ++ bool shared) + { + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); +@@ -413,18 +450,51 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + */ + if (was_present && !was_leaf && (pfn_changed || !is_present)) + handle_removed_tdp_mmu_page(kvm, +- spte_to_child_pt(old_spte, level)); ++ spte_to_child_pt(old_spte, level), shared); + } + + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, +- u64 old_spte, u64 new_spte, int level) ++ u64 old_spte, u64 new_spte, int level, ++ bool shared) + { +- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); ++ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, ++ shared); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); + } + ++/* ++ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the ++ * associated bookkeeping ++ * ++ * @kvm: kvm instance ++ * @iter: a tdp_iter instance currently on the SPTE that should be set ++ * @new_spte: The value the SPTE should be set to ++ * Returns: true if the SPTE was set, false if it was not. If false is returned, ++ * this function will have no side-effects. ++ */ ++static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, ++ struct tdp_iter *iter, ++ u64 new_spte) ++{ ++ u64 *root_pt = tdp_iter_root_pt(iter); ++ struct kvm_mmu_page *root = sptep_to_sp(root_pt); ++ int as_id = kvm_mmu_page_as_id(root); ++ ++ lockdep_assert_held_read(&kvm->mmu_lock); ++ ++ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, ++ new_spte) != iter->old_spte) ++ return false; ++ ++ handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, ++ iter->level, true); ++ ++ return true; ++} ++ ++ + /* + * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * @kvm: kvm instance +@@ -454,7 +524,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, +- iter->level); ++ iter->level, false); + if (record_acc_track) + handle_changed_spte_acc_track(iter->old_spte, new_spte, + iter->level); +@@ -629,23 +699,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int ret = 0; + int make_spte_ret = 0; + +- if (unlikely(is_noslot_pfn(pfn))) { ++ if (unlikely(is_noslot_pfn(pfn))) + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); +- trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, +- new_spte); +- } else { ++ else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); +- trace_kvm_mmu_set_spte(iter->level, iter->gfn, +- rcu_dereference(iter->sptep)); +- } + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; +- else +- tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); ++ else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) ++ return RET_PF_RETRY; + + /* + * If the page fault was caused by a write but the page is write +@@ -659,8 +724,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. 
*/ +- if (unlikely(is_mmio_spte(new_spte))) ++ if (unlikely(is_mmio_spte(new_spte))) { ++ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, ++ new_spte); + ret = RET_PF_EMULATE; ++ } else ++ trace_kvm_mmu_set_spte(iter->level, iter->gfn, ++ rcu_dereference(iter->sptep)); + + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); +@@ -719,7 +789,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { +- tdp_mmu_set_spte(vcpu->kvm, &iter, 0); ++ if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0)) ++ break; + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); +@@ -736,19 +807,24 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + child_pt = sp->spt; + +- tdp_mmu_link_page(vcpu->kvm, sp, +- huge_page_disallowed && +- req_level >= iter.level); +- + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + +- trace_kvm_mmu_get_page(sp, true); +- tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); ++ if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, ++ new_spte)) { ++ tdp_mmu_link_page(vcpu->kvm, sp, true, ++ huge_page_disallowed && ++ req_level >= iter.level); ++ ++ trace_kvm_mmu_get_page(sp, true); ++ } else { ++ tdp_mmu_free_sp(sp); ++ break; ++ } + } + } + +- if (WARN_ON(iter.level != level)) { ++ if (iter.level != level) { + rcu_read_unlock(); + return RET_PF_RETRY; + } +-- +2.30.1 + diff --git a/queue-5.10/kvm-x86-mmu-yield-in-tdu-mmu-iter-even-if-no-sptes-c.patch b/queue-5.10/kvm-x86-mmu-yield-in-tdu-mmu-iter-even-if-no-sptes-c.patch new file mode 100644 index 00000000000..b230e5a12f5 --- /dev/null +++ b/queue-5.10/kvm-x86-mmu-yield-in-tdu-mmu-iter-even-if-no-sptes-c.patch @@ -0,0 +1,137 @@ +From 9216beb7a73a9159aca1662dcab60e3ba45b428e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Feb 2021 10:57:20 -0800 +Subject: KVM: x86/mmu: Yield in TDU MMU iter even if no SPTES changed + +From: Ben Gardon + +[ Upstream commit 1af4a96025b33587ca953c7ef12a1b20c6e70412 ] + +Given certain conditions, some TDP MMU functions may not yield +reliably / frequently enough. For example, if a paging structure was +very large but had few, if any writable entries, wrprot_gfn_range +could traverse many entries before finding a writable entry and yielding +because the check for yielding only happens after an SPTE is modified. + +Fix this issue by moving the yield to the beginning of the loop. 
+ +Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") +Reviewed-by: Peter Feiner +Signed-off-by: Ben Gardon + +Message-Id: <20210202185734.1680553-15-bgardon@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/mmu/tdp_mmu.c | 32 ++++++++++++++++++++++---------- + 1 file changed, 22 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index f0bc5d3ce3d4..0d17457f1c84 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -462,6 +462,12 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + bool flush_needed = false; + + tdp_root_for_each_pte(iter, root, start, end) { ++ if (can_yield && ++ tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { ++ flush_needed = false; ++ continue; ++ } ++ + if (!is_shadow_present_pte(iter.old_spte)) + continue; + +@@ -476,9 +482,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); +- +- flush_needed = !(can_yield && +- tdp_mmu_iter_cond_resched(kvm, &iter, true)); ++ flush_needed = true; + } + return flush_needed; + } +@@ -838,6 +842,9 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { ++ if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) ++ continue; ++ + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; +@@ -846,8 +853,6 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; +- +- tdp_mmu_iter_cond_resched(kvm, &iter, false); + } + return spte_set; + } +@@ -891,6 +896,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { ++ if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) ++ continue; ++ + if (spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; +@@ -905,8 +913,6 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; +- +- tdp_mmu_iter_cond_resched(kvm, &iter, false); + } + return spte_set; + } +@@ -1014,6 +1020,9 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { ++ if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) ++ continue; ++ + if (!is_shadow_present_pte(iter.old_spte)) + continue; + +@@ -1021,8 +1030,6 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; +- +- tdp_mmu_iter_cond_resched(kvm, &iter, false); + } + + return spte_set; +@@ -1063,6 +1070,11 @@ static void zap_collapsible_spte_range(struct kvm *kvm, + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { ++ if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { ++ spte_set = false; ++ continue; ++ } ++ + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; +@@ -1075,7 +1087,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, + + tdp_mmu_set_spte(kvm, &iter, 0); + +- spte_set = !tdp_mmu_iter_cond_resched(kvm, &iter, true); ++ spte_set = true; + } + + if 
(spte_set) +-- +2.30.1 + diff --git a/queue-5.10/series b/queue-5.10/series index 3137260c52a..6be166bdb31 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -79,3 +79,20 @@ drm-tegra-dc-restore-coupling-of-display-controllers.patch drm-tegra-sor-grab-runtime-pm-reference-across-reset.patch vfio-nvlink-add-missing-spapr_tce_iommu-depends.patch pinctrl-rockchip-fix-restore-error-in-resume.patch +kvm-x86-mmu-change-tdp-mmu-yield-function-returns-to.patch +kvm-x86-mmu-merge-flush-and-non-flush-tdp_mmu_iter_c.patch +kvm-x86-mmu-rename-goal_gfn-to-next_last_level_gfn.patch +kvm-x86-mmu-ensure-forward-progress-when-yielding-in.patch +kvm-x86-mmu-yield-in-tdu-mmu-iter-even-if-no-sptes-c.patch +kvm-x86-mmu-add-existing-trace-points-to-tdp-mmu.patch +kvm-x86-mmu-add-lockdep-when-setting-a-tdp-mmu-spte.patch +kvm-x86-mmu-factor-out-handling-of-removed-page-tabl.patch +kvm-x86-mmu-protect-tdp-mmu-page-table-memory-with-r.patch +kvm-x86-mmu-ensure-tlbs-are-flushed-when-yielding-du.patch +kvm-x86-mmu-add-comment-on-__tdp_mmu_set_spte.patch +kvm-x86-mmu-don-t-redundantly-clear-tdp-mmu-pt-memor.patch +kvm-x86-mmu-fix-braces-in-kvm_recover_nx_lpages.patch +kvm-x86-mmu-factor-out-functions-to-add-remove-tdp-m.patch +kvm-x86-mmu-use-atomic-ops-to-set-sptes-in-tdp-mmu-m.patch +kvm-x86-compile-out-tdp-mmu-on-32-bit-systems.patch +kvm-x86-mmu-ensure-tlbs-are-flushed-for-tdp-mmu-duri.patch