git.ipfire.org Git - thirdparty/linux.git/commitdiff
KVM: SVM: Fix IRQ window inhibit handling across multiple vCPUs
author: Sean Christopherson <seanjc@google.com>
Fri, 23 Jan 2026 22:45:12 +0000 (14:45 -0800)
committer: Sean Christopherson <seanjc@google.com>
Mon, 2 Mar 2026 22:51:36 +0000 (14:51 -0800)
IRQ window inhibits can be requested by multiple vCPUs at the same time
for injecting interrupts meant for different vCPUs. However, AVIC
inhibition is VM-wide and hence it is possible for the inhibition to be
cleared prematurely by the first vCPU that obtains the IRQ window even
though a second vCPU is still waiting for its IRQ window. This is likely
not a functional issue since the other vCPU will again see that
interrupts are pending to be injected (due to KVM_REQ_EVENT), and will
again request an IRQ window inhibition. However, this can result in
AVIC being rapidly toggled, resulting in high contention on
apicv_update_lock and degraded guest performance.

Address this by maintaining a VM-wide count of the number of vCPUs that
have requested an IRQ window. Set/clear the inhibit reason when the
count transitions between 0 and 1. This ensures that the inhibit reason
is not cleared as long as there are some vCPUs still waiting for an IRQ
window.

Co-developed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Co-developed-by: Naveen N Rao (AMD) <naveen@kernel.org>
Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>
Tested-by: Naveen N Rao (AMD) <naveen@kernel.org>
Link: https://patch.msgid.link/20260123224514.2509129-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/x86.c

index ff07c45e3c731a2833b472faca4262ac4af19a5b..68db00dc09a02c949c761691a0c04f74083060f5 100644 (file)
@@ -1433,6 +1433,7 @@ struct kvm_arch {
        struct kvm_pit *vpit;
 #endif
        atomic_t vapics_in_nmi_mode;
+
        struct mutex apic_map_lock;
        struct kvm_apic_map __rcu *apic_map;
        atomic_t apic_map_dirty;
@@ -1440,9 +1441,13 @@ struct kvm_arch {
        bool apic_access_memslot_enabled;
        bool apic_access_memslot_inhibited;
 
-       /* Protects apicv_inhibit_reasons */
+       /*
+        * Protects apicv_inhibit_reasons and apicv_nr_irq_window_req (with an
+        * asterisk, see kvm_inc_or_dec_irq_window_inhibit() for details).
+        */
        struct rw_semaphore apicv_update_lock;
        unsigned long apicv_inhibit_reasons;
+       atomic_t apicv_nr_irq_window_req;
 
        gpa_t wall_clock;
 
@@ -2316,6 +2321,18 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
        kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
 }
 
+void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc);
+
+static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm)
+{
+       kvm_inc_or_dec_irq_window_inhibit(kvm, true);
+}
+
+static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm)
+{
+       kvm_inc_or_dec_irq_window_inhibit(kvm, false);
+}
+
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len);
 void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
index 8766fd5f6d2b7a64b21887ea9289497861b9502f..e0da247ee594278857e618107e8a8d9fc91c79bc 100644 (file)
@@ -3729,8 +3729,11 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
         * the case in which the interrupt window was requested while L1 was
         * active (the vCPU was not running nested).
         */
-       if (!kvm_cpu_has_injectable_intr(vcpu) || is_guest_mode(vcpu))
-               kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+       if (svm->avic_irq_window &&
+           (!kvm_cpu_has_injectable_intr(vcpu) || is_guest_mode(vcpu))) {
+               svm->avic_irq_window = false;
+               kvm_dec_apicv_irq_window_req(svm->vcpu.kvm);
+       }
 
        trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
        ++vcpu->stat.irq_injections;
@@ -3932,17 +3935,28 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
         */
        if (vgif || gif_set(svm)) {
                /*
-                * IRQ window is not needed when AVIC is enabled,
-                * unless we have pending ExtINT since it cannot be injected
-                * via AVIC. In such case, KVM needs to temporarily disable AVIC,
-                * and fallback to injecting IRQ via V_IRQ.
+                * KVM only enables IRQ windows when AVIC is enabled if there's
+                * pending ExtINT since it cannot be injected via AVIC (ExtINT
+                * bypasses the local APIC).  V_IRQ is ignored by hardware when
+                * AVIC is enabled, and so KVM needs to temporarily disable
+                * AVIC in order to detect when it's ok to inject the ExtINT.
+                *
+                * If running nested, AVIC is already locally inhibited on this
+                * vCPU (L2 vCPUs use a different MMU that never maps the AVIC
+                * backing page), therefore there is no need to increment the
+                * VM-wide AVIC inhibit.  KVM will re-evaluate events when the
+                * vCPU exits to L1 and enable an IRQ window if the ExtINT is
+                * still pending.
                 *
-                * If running nested, AVIC is already locally inhibited
-                * on this vCPU, therefore there is no need to request
-                * the VM wide AVIC inhibition.
+                * Note, the IRQ window inhibit needs to be updated even if
+                * AVIC is inhibited for a different reason, as KVM needs to
+                * keep AVIC inhibited if the other reason is cleared and there
+                * is still an injectable interrupt pending.
                 */
-               if (!is_guest_mode(vcpu))
-                       kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+               if (enable_apicv && !svm->avic_irq_window && !is_guest_mode(vcpu)) {
+                       svm->avic_irq_window = true;
+                       kvm_inc_apicv_irq_window_req(vcpu->kvm);
+               }
 
                svm_set_vintr(svm);
        }
index ebd7b36b1ceb9b1ba1c6f7ef5622f6fc1ff8a232..68675b25ef8e970dfac6d02baf69bea5e3ae3671 100644 (file)
@@ -333,6 +333,7 @@ struct vcpu_svm {
 
        bool guest_state_loaded;
 
+       bool avic_irq_window;
        bool x2avic_msrs_intercepted;
        bool lbr_msrs_intercepted;
 
index a03530795707797b73c701b0c952d46d27389e8d..db25938b6b50a735b3c506eedbd7482980611847 100644 (file)
@@ -11014,6 +11014,25 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit);
 
+void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc)
+{
+       int add = inc ? 1 : -1;
+
+       if (!enable_apicv)
+               return;
+
+       /*
+        * Strictly speaking, the lock is only needed if going 0->1 or 1->0,
+        * a la atomic_dec_and_mutex_lock.  However, ExtINTs are rare and
+        * only target a single CPU, so that is the common case; do not
+        * bother eliding the down_write()/up_write() pair.
+        */
+       guard(rwsem_write)(&kvm->arch.apicv_update_lock);
+       if (atomic_add_return(add, &kvm->arch.apicv_nr_irq_window_req) == inc)
+               __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_IRQWIN, inc);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inc_or_dec_irq_window_inhibit);
+
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
        if (!kvm_apic_present(vcpu))