--- /dev/null
+From a217a6593cec8b315d4c2f344bae33660b39b703 Mon Sep 17 00:00:00 2001
+From: Lai Jiangshan <laijs@linux.alibaba.com>
+Date: Tue, 4 May 2021 21:50:14 +0200
+Subject: KVM/VMX: Invoke NMI non-IST entry instead of IST entry
+
+From: Lai Jiangshan <laijs@linux.alibaba.com>
+
+commit a217a6593cec8b315d4c2f344bae33660b39b703 upstream.
+
+In VMX, the host NMI handler needs to be invoked after NMI VM-Exit.
+Before commit 1a5488ef0dcf6 ("KVM: VMX: Invoke NMI handler via indirect
+call instead of INTn"), this was done by INTn ("int $2"). But INTn
+microcode is relatively expensive, so the commit reworked NMI VM-Exit
+handling to invoke the kernel handler by function call.
+
+But this missed a detail. The NMI entry point for direct invocation is
+fetched from the IDT table and called on the kernel stack. On 64-bit,
+however, the NMI entry installed in the IDT expects to be invoked on the
+IST stack: to work correctly it relies on the "NMI executing" variable,
+which sits at a fixed position on the IST stack. When the entry point is
+unexpectedly called on the kernel stack, the RSP-addressed "NMI executing"
+slot is also on the kernel stack and therefore uninitialized, which can
+cause the NMI entry code to misbehave.
+
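+As a rough illustration (a minimal user-space sketch, not the kernel
+entry code; the stack layout, STACK_SIZE and nmi_like_handler() are made
+up for this example), the pattern that breaks is that a reentrancy
+marker addressed at a fixed offset is only meaningful on a dedicated
+stack whose contents are known, not on whatever stack the caller happens
+to be running on:
+
+	#include <stdio.h>
+	#include <string.h>
+
+	#define STACK_SIZE 64
+
+	struct stack {
+		unsigned long slots[STACK_SIZE];
+	};
+
+	/* Expects slots[STACK_SIZE - 1] to hold an "executing" flag. */
+	static void nmi_like_handler(struct stack *stk)
+	{
+		unsigned long *executing = &stk->slots[STACK_SIZE - 1];
+
+		if (*executing) {
+			puts("treated as nested entry (flag already set)");
+			return;
+		}
+		*executing = 1;
+		puts("handler body runs");
+		*executing = 0;
+	}
+
+	int main(void)
+	{
+		struct stack ist = { 0 };	/* dedicated stack, flag known clear */
+		struct stack other;		/* arbitrary stack, flag is garbage */
+
+		memset(&other, 0xa5, sizeof(other));
+		nmi_like_handler(&ist);		/* behaves as intended */
+		nmi_like_handler(&other);	/* misfires on stale data */
+		return 0;
+	}
+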
+Provide a non-IST entry point for VMX which shares the C function with
+the regular NMI entry, and invoke the new asm entry point instead.
+
+On 32-bit this just maps to the regular NMI entry point as 32-bit has no
+ISTs and is not affected.
+
+[ tglx: Made it independent for backporting, massaged changelog ]
+
+Fixes: 1a5488ef0dcf6 ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn")
+Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Lai Jiangshan <laijs@linux.alibaba.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/87r1imi8i1.ffs@nanos.tec.linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/idtentry.h | 15 +++++++++++++++
+ arch/x86/kernel/nmi.c | 10 ++++++++++
+ arch/x86/kvm/vmx/vmx.c | 16 +++++++++-------
+ 3 files changed, 34 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/include/asm/idtentry.h
++++ b/arch/x86/include/asm/idtentry.h
+@@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_ma
+ #endif
+
+ /* NMI */
++
++#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
++/*
++ * Special NOIST entry point for VMX which invokes this on the kernel
++ * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
++ * 'executing' marker.
++ *
++ * On 32-bit this just uses the regular NMI entry point because 32-bit does
++ * not have ISTs.
++ */
++DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
++#else
++#define asm_exc_nmi_noist asm_exc_nmi
++#endif
++
+ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
+ #ifdef CONFIG_XEN_PV
+ DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
+--- a/arch/x86/kernel/nmi.c
++++ b/arch/x86/kernel/nmi.c
+@@ -524,6 +524,16 @@ nmi_restart:
+ mds_user_clear_cpu_buffers();
+ }
+
++#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
++DEFINE_IDTENTRY_RAW(exc_nmi_noist)
++{
++ exc_nmi(regs);
++}
++#endif
++#if IS_MODULE(CONFIG_KVM_INTEL)
++EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
++#endif
++
+ void stop_nmi(void)
+ {
+ ignore_nmis++;
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -36,6 +36,7 @@
+ #include <asm/debugreg.h>
+ #include <asm/desc.h>
+ #include <asm/fpu/internal.h>
++#include <asm/idtentry.h>
+ #include <asm/io.h>
+ #include <asm/irq_remapping.h>
+ #include <asm/kexec.h>
+@@ -6354,18 +6355,17 @@ static void vmx_apicv_post_state_restore
+
+ void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+
+-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
++static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
++ unsigned long entry)
+ {
+- unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+- gate_desc *desc = (gate_desc *)host_idt_base + vector;
+-
+ kvm_before_interrupt(vcpu);
+- vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
++ vmx_do_interrupt_nmi_irqoff(entry);
+ kvm_after_interrupt(vcpu);
+ }
+
+ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+ {
++ const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
+ u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
+
+ /* if exit due to PF check for async PF */
+@@ -6376,18 +6376,20 @@ static void handle_exception_nmi_irqoff(
+ kvm_machine_check();
+ /* We need to handle NMIs before interrupts are enabled */
+ else if (is_nmi(intr_info))
+- handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
++ handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
+ }
+
+ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+ {
+ u32 intr_info = vmx_get_intr_info(vcpu);
++ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
++ gate_desc *desc = (gate_desc *)host_idt_base + vector;
+
+ if (WARN_ONCE(!is_external_intr(intr_info),
+ "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ return;
+
+- handle_interrupt_nmi_irqoff(vcpu, intr_info);
++ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ }
+
+ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
--- /dev/null
+From c5e2184d1544f9e56140791eff1a351bea2e63b9 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 14 Jan 2021 16:40:51 -0800
+Subject: KVM: x86/mmu: Remove the defunct update_pte() paging hook
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit c5e2184d1544f9e56140791eff1a351bea2e63b9 upstream.
+
+Remove the update_pte() shadow paging logic, which was obsoleted by
+commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"), but never
+removed. As pointed out by Yu, KVM never write-protects leaf page
+tables for the purposes of shadow paging, and instead marks their
+associated shadow page as unsync so that the guest can write PTEs at
+will.
+
+The update_pte() path, which predates the unsync logic, optimizes COW
+scenarios by refreshing leaf SPTEs when they are written, as opposed to
+zapping the SPTE, restarting the guest, and installing the new SPTE on
+the subsequent fault. Since KVM no longer write-protects leaf page
+tables, update_pte() is unreachable and can be dropped.
+
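+As a rough sketch of the two strategies (a stand-alone toy model, not
+KVM code; guest_pte[], shadow_spte[] and translate() are invented for
+illustration), the now-dead path refreshed the derived entry at write
+time, while the surviving path simply zaps it and lets the next access
+fault and rebuild it:
+
+	#include <stdio.h>
+
+	#define NR_PTES 4
+
+	static unsigned long guest_pte[NR_PTES];
+	static unsigned long shadow_spte[NR_PTES];	/* 0 == not present */
+
+	static unsigned long translate(unsigned long gpte)
+	{
+		return gpte ? (gpte | 0x1) : 0;	/* stand-in for SPTE construction */
+	}
+
+	/* update_pte()-style: refresh the shadow entry at write time. */
+	static void write_pte_refresh(int i, unsigned long val)
+	{
+		guest_pte[i] = val;
+		shadow_spte[i] = translate(val);
+	}
+
+	/* Current model: zap; the next access "faults" and rebuilds it. */
+	static void write_pte_zap(int i, unsigned long val)
+	{
+		guest_pte[i] = val;
+		shadow_spte[i] = 0;
+	}
+
+	static unsigned long access_pte(int i)
+	{
+		if (!shadow_spte[i])
+			shadow_spte[i] = translate(guest_pte[i]);
+		return shadow_spte[i];
+	}
+
+	int main(void)
+	{
+		write_pte_refresh(0, 0x1000);
+		write_pte_zap(1, 0x2000);
+		printf("refresh: %#lx, zap+refault: %#lx\n",
+		       access_pte(0), access_pte(1));
+		return 0;
+	}
+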
+Reported-by: Yu Zhang <yu.c.zhang@intel.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210115004051.4099250-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/kvm_host.h | 3 --
+ arch/x86/kvm/mmu/mmu.c | 49 +---------------------------------------
+ arch/x86/kvm/x86.c | 1
+ 3 files changed, 2 insertions(+), 51 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -358,8 +358,6 @@ struct kvm_mmu {
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
+- void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+- u64 *spte, const void *pte);
+ hpa_t root_hpa;
+ gpa_t root_pgd;
+ union kvm_mmu_role mmu_role;
+@@ -1019,7 +1017,6 @@ struct kvm_arch {
+ struct kvm_vm_stat {
+ ulong mmu_shadow_zapped;
+ ulong mmu_pte_write;
+- ulong mmu_pte_updated;
+ ulong mmu_pde_zapped;
+ ulong mmu_flooded;
+ ulong mmu_recycled;
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -1715,13 +1715,6 @@ static int nonpaging_sync_page(struct kv
+ return 0;
+ }
+
+-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
+- struct kvm_mmu_page *sp, u64 *spte,
+- const void *pte)
+-{
+- WARN_ON(1);
+-}
+-
+ #define KVM_PAGE_ARRAY_NR 16
+
+ struct kvm_mmu_pages {
+@@ -3820,7 +3813,6 @@ static void nonpaging_init_context(struc
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = NULL;
+- context->update_pte = nonpaging_update_pte;
+ context->root_level = 0;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->direct_map = true;
+@@ -4402,7 +4394,6 @@ static void paging64_init_context_common
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
+- context->update_pte = paging64_update_pte;
+ context->shadow_root_level = level;
+ context->direct_map = false;
+ }
+@@ -4431,7 +4422,6 @@ static void paging32_init_context(struct
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
+- context->update_pte = paging32_update_pte;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->direct_map = false;
+ }
+@@ -4513,7 +4503,6 @@ static void init_kvm_tdp_mmu(struct kvm_
+ context->page_fault = kvm_tdp_page_fault;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = NULL;
+- context->update_pte = nonpaging_update_pte;
+ context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
+ context->direct_map = true;
+ context->get_guest_pgd = get_cr3;
+@@ -4690,7 +4679,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+- context->update_pte = ept_update_pte;
+ context->root_level = level;
+ context->direct_map = false;
+ context->mmu_role.as_u64 = new_role.as_u64;
+@@ -4838,19 +4826,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcp
+ }
+ EXPORT_SYMBOL_GPL(kvm_mmu_unload);
+
+-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+- struct kvm_mmu_page *sp, u64 *spte,
+- const void *new)
+-{
+- if (sp->role.level != PG_LEVEL_4K) {
+- ++vcpu->kvm->stat.mmu_pde_zapped;
+- return;
+- }
+-
+- ++vcpu->kvm->stat.mmu_pte_updated;
+- vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
+-}
+-
+ static bool need_remote_flush(u64 old, u64 new)
+ {
+ if (!is_shadow_present_pte(old))
+@@ -4966,22 +4941,6 @@ static u64 *get_written_sptes(struct kvm
+ return spte;
+ }
+
+-/*
+- * Ignore various flags when determining if a SPTE can be immediately
+- * overwritten for the current MMU.
+- * - level: explicitly checked in mmu_pte_write_new_pte(), and will never
+- * match the current MMU role, as MMU's level tracks the root level.
+- * - access: updated based on the new guest PTE
+- * - quadrant: handled by get_written_sptes()
+- * - invalid: always false (loop only walks valid shadow pages)
+- */
+-static const union kvm_mmu_page_role role_ign = {
+- .level = 0xf,
+- .access = 0x7,
+- .quadrant = 0x3,
+- .invalid = 0x1,
+-};
+-
+ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node)
+@@ -5032,14 +4991,10 @@ static void kvm_mmu_pte_write(struct kvm
+
+ local_flush = true;
+ while (npte--) {
+- u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
+-
+ entry = *spte;
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
+- if (gentry &&
+- !((sp->role.word ^ base_role) & ~role_ign.word) &&
+- rmap_can_add(vcpu))
+- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
++ if (gentry && sp->role.level != PG_LEVEL_4K)
++ ++vcpu->kvm->stat.mmu_pde_zapped;
+ if (need_remote_flush(entry, *spte))
+ remote_flush = true;
+ ++spte;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -233,7 +233,6 @@ struct kvm_stats_debugfs_item debugfs_en
+ VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+ VM_STAT("mmu_pte_write", mmu_pte_write),
+- VM_STAT("mmu_pte_updated", mmu_pte_updated),
+ VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+ VM_STAT("mmu_flooded", mmu_flooded),
+ VM_STAT("mmu_recycled", mmu_recycled),