--- /dev/null
+From 6cd88243c7e03845a450795e134b488fc2afb736 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 7 Jun 2022 10:09:03 -0400
+Subject: KVM: x86: do not report a vCPU as preempted outside instruction boundaries
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 6cd88243c7e03845a450795e134b488fc2afb736 upstream.
+
+If a vCPU is outside guest mode and is scheduled out, it might be in the
+process of making a memory access. A problem occurs if another vCPU uses
+the PV TLB flush feature during the period when the vCPU is scheduled
+out, and a virtual address has already been translated but has not yet
+been accessed, because this is equivalent to using a stale TLB entry.
+
+To avoid this, only report a vCPU as preempted if it is certain that the
+guest is at an instruction boundary. A rescheduling request will be
+delivered to the host physical CPU as an external interrupt, so for
+simplicity treat any vmexit that is *not* due to an external interrupt
+as not being on an instruction boundary.
+
+In principle it would also be okay to report the vCPU as preempted if
+it is sleeping in kvm_vcpu_block(): a TLB flush IPI will incur the
+vmentry/vmexit overhead unnecessarily, and optimistic spinning is
+also unlikely to succeed. However, leave it for later because right
+now kvm_vcpu_check_block() is doing memory accesses. Even
+though the TLB flush issue only applies to virtual memory addresses,
+it's very much preferable to be conservative.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[OP: use VCPU_STAT() for debugfs entries]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/kvm_host.h | 3 +++
+ arch/x86/kvm/svm/svm.c | 2 ++
+ arch/x86/kvm/vmx/vmx.c | 1 +
+ arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++
+ 4 files changed, 28 insertions(+)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -553,6 +553,7 @@ struct kvm_vcpu_arch {
+ u64 ia32_misc_enable_msr;
+ u64 smbase;
+ u64 smi_count;
++ bool at_instruction_boundary;
+ bool tpr_access_reporting;
+ bool xsaves_enabled;
+ u64 ia32_xss;
+@@ -1061,6 +1062,8 @@ struct kvm_vcpu_stat {
+ u64 req_event;
+ u64 halt_poll_success_ns;
+ u64 halt_poll_fail_ns;
++ u64 preemption_reported;
++ u64 preemption_other;
+ };
+
+ struct x86_instruction_info;
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -3983,6 +3983,8 @@ out:
+
+ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+ {
++ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
++ vcpu->arch.at_instruction_boundary = true;
+ }
+
+ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6510,6 +6510,7 @@ static void handle_external_interrupt_ir
+ return;
+
+ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
++ vcpu->arch.at_instruction_boundary = true;
+ }
+
+ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -231,6 +231,8 @@ struct kvm_stats_debugfs_item debugfs_en
+ VCPU_STAT("l1d_flush", l1d_flush),
+ VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+ VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
++ VCPU_STAT("preemption_reported", preemption_reported),
++ VCPU_STAT("preemption_other", preemption_other),
+ VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+ VM_STAT("mmu_pte_write", mmu_pte_write),
+ VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+@@ -4052,6 +4054,19 @@ static void kvm_steal_time_set_preempted
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
++ /*
++ * The vCPU can be marked preempted if and only if the VM-Exit was on
++ * an instruction boundary and will not trigger guest emulation of any
++ * kind (see vcpu_run). Vendor specific code controls (conservatively)
++ * when this is true, for example allowing the vCPU to be marked
++ * preempted if and only if the VM-Exit was due to a host interrupt.
++ */
++ if (!vcpu->arch.at_instruction_boundary) {
++ vcpu->stat.preemption_other++;
++ return;
++ }
++
++ vcpu->stat.preemption_reported++;
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+@@ -9357,6 +9372,13 @@ static int vcpu_run(struct kvm_vcpu *vcp
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ for (;;) {
++ /*
++ * If another guest vCPU requests a PV TLB flush in the middle
++ * of instruction emulation, the rest of the emulation could
++ * use a stale page translation. Assume that any code after
++ * this point can start executing an instruction.
++ */
++ vcpu->arch.at_instruction_boundary = false;
+ if (kvm_vcpu_running(vcpu)) {
+ r = vcpu_enter_guest(vcpu);
+ } else {
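Taken together, the hunks above implement a small handshake around the new
at_instruction_boundary flag: vcpu_run() clears it before each pass through the
loop, only the external-interrupt exit_irqoff handlers set it, and
kvm_steal_time_set_preempted() refuses to mark the vCPU preempted unless it is
set. The standalone C sketch below models that flow with toy stand-in types and
counters (none of the toy_* names exist in KVM); it is an illustration of the
logic, not kernel code.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-in for the fields the patch adds to struct kvm_vcpu_arch
     * and struct kvm_vcpu_stat. */
    struct toy_vcpu {
            bool at_instruction_boundary;
            uint64_t preemption_reported;
            uint64_t preemption_other;
    };

    /* Mirrors the early return added to kvm_steal_time_set_preempted(). */
    static void toy_set_preempted(struct toy_vcpu *vcpu)
    {
            if (!vcpu->at_instruction_boundary) {
                    /* Not safe: the guest may be in the middle of an
                     * emulated instruction and hold a stale translation. */
                    vcpu->preemption_other++;
                    return;
            }
            vcpu->preemption_reported++;
            /* ... here the real code records KVM_VCPU_PREEMPTED ... */
    }

    /* Mirrors the exit_irqoff hooks: only an external-interrupt vmexit
     * (SVM_EXIT_INTR, or the VMX external-interrupt path) is treated as
     * an instruction boundary. */
    static void toy_handle_exit_irqoff(struct toy_vcpu *vcpu, bool external_intr)
    {
            if (external_intr)
                    vcpu->at_instruction_boundary = true;
    }

    int main(void)
    {
            struct toy_vcpu vcpu = { 0 };

            /* One pass of the vcpu_run() loop: clear the flag first, since
             * anything after this point may start emulating an instruction. */
            vcpu.at_instruction_boundary = false;
            toy_handle_exit_irqoff(&vcpu, false);   /* e.g. a page-fault exit */
            toy_set_preempted(&vcpu);               /* -> preemption_other    */

            vcpu.at_instruction_boundary = false;
            toy_handle_exit_irqoff(&vcpu, true);    /* external interrupt     */
            toy_set_preempted(&vcpu);               /* -> preemption_reported */

            printf("reported=%llu other=%llu\n",
                   (unsigned long long)vcpu.preemption_reported,
                   (unsigned long long)vcpu.preemption_other);
            return 0;
    }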
--- /dev/null
+From 6470accc7ba948b0b3aca22b273fe84ec638a116 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Fri, 3 Sep 2021 09:51:36 +0200
+Subject: KVM: x86: hyper-v: Avoid calling kvm_make_vcpus_request_mask() with vcpu_mask==NULL
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 6470accc7ba948b0b3aca22b273fe84ec638a116 upstream.
+
+In preparation for making kvm_make_vcpus_request_mask() use for_each_set_bit(),
+switch kvm_hv_flush_tlb() to calling kvm_make_all_cpus_request() for the
+'all cpus' case.
+
+Note: kvm_make_all_cpus_request() (unlike kvm_make_vcpus_request_mask())
+currently allocates a cpumask dynamically on each call, which is suboptimal.
+Both kvm_make_all_cpus_request() and kvm_make_vcpus_request_mask() are
+going to be switched to using pre-allocated per-cpu masks.
+
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Message-Id: <20210903075141.403071-4-vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Acked-by: Sean Christopherson <seanjc@google.com>
+Fixes: 6100066358ee ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/hyperv.c | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -1562,16 +1562,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_v
+
+ cpumask_clear(&hv_vcpu->tlb_flush);
+
+- vcpu_mask = all_cpus ? NULL :
+- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+- vp_bitmap, vcpu_bitmap);
+-
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
+ * analyze it here, flush TLB regardless of the specified address space.
+ */
+- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
+- NULL, vcpu_mask, &hv_vcpu->tlb_flush);
++ if (all_cpus) {
++ kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
++ } else {
++ vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
++ vp_bitmap, vcpu_bitmap);
++
++ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
++ NULL, vcpu_mask, &hv_vcpu->tlb_flush);
++ }
+
+ ret_success:
+ /* We always do full TLB flush, set rep_done = rep_cnt. */
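After this change the 'all cpus' case never reaches kvm_make_vcpus_request_mask()
with a NULL mask, which is what the follow-up for_each_set_bit() conversion relies
on. The standalone C sketch below models just that branch structure with toy
helpers and a fixed-size array of request words (the toy_* names are illustrative
stand-ins, not KVM APIs).

    #include <stdbool.h>
    #include <stdio.h>

    #define TOY_NR_VCPUS      8
    #define TOY_REQ_TLB_FLUSH 0x1u

    /* Toy stand-in: one request word per vCPU. */
    static unsigned int toy_requests[TOY_NR_VCPUS];

    /* Models kvm_make_all_cpus_request(): hit every vCPU. */
    static void toy_request_all(unsigned int req)
    {
            for (int i = 0; i < TOY_NR_VCPUS; i++)
                    toy_requests[i] |= req;
    }

    /* Models kvm_make_vcpus_request_mask(): the caller now guarantees
     * that mask is never NULL. */
    static void toy_request_mask(unsigned int req, const bool *mask)
    {
            for (int i = 0; i < TOY_NR_VCPUS; i++)
                    if (mask[i])
                            toy_requests[i] |= req;
    }

    /* Mirrors the new shape of kvm_hv_flush_tlb(): branch on all_cpus
     * instead of encoding "all" as a NULL vcpu_mask. */
    static void toy_hv_flush_tlb(bool all_cpus, const bool *vcpu_mask)
    {
            if (all_cpus)
                    toy_request_all(TOY_REQ_TLB_FLUSH);
            else
                    toy_request_mask(TOY_REQ_TLB_FLUSH, vcpu_mask);
    }

    int main(void)
    {
            bool vcpu_mask[TOY_NR_VCPUS] = { [1] = true, [5] = true };

            toy_hv_flush_tlb(false, vcpu_mask);
            for (int i = 0; i < TOY_NR_VCPUS; i++)
                    printf("vcpu%d: %s\n", i,
                           toy_requests[i] & TOY_REQ_TLB_FLUSH ? "flush" : "-");
            return 0;
    }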