KVM: x86: Load guest/host XCR0 and XSS outside of the fastpath run loop
author		Sean Christopherson <seanjc@google.com>
		Tue, 18 Nov 2025 22:23:27 +0000 (14:23 -0800)
committer	Sean Christopherson <seanjc@google.com>
		Wed, 19 Nov 2025 13:41:10 +0000 (05:41 -0800)
Move KVM's swapping of XFEATURE masks, i.e. XCR0 and XSS, out of the
fastpath loop now that the guts of the #MC handler run in task context,
i.e. won't invoke schedule() with preemption disabled and clobber state
(or crash the kernel) due to trying to context switch XSTATE with a mix
of host and guest state.
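
As a rough illustration of where the loads end up, here is a condensed
sketch of the relevant flow in vcpu_enter_guest() after this patch (names
taken from the diff below; everything unrelated to the XFEATURE swap is
elided, so treat it as a reading aid rather than compilable kernel code):

  /* ...guest_fpu.xfd_err has just been written... */

  kvm_load_guest_xfeatures(vcpu);       /* XCR0/XSS <- guest values */

  /* inner fastpath run loop: vendor vcpu_run(), possible re-entries,
   * with no XCR0/XSS writes anywhere in the loop */

  vcpu->mode = OUTSIDE_GUEST_MODE;
  smp_wmb();

  kvm_load_host_xfeatures(vcpu);        /* XCR0/XSS <- host values */

  /* handle_exit_irqoff(), including #MC forwarding, runs after this */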

For all intents and purposes, this reverts commit 1811d979c716 ("x86/kvm:
move kvm_load/put_guest_xcr0 into atomic context"), which papered over an
egregious bug/flaw in the #MC handler where it would call schedule() even
though IRQs were disabled.  E.g. the call stack from the commit:

  kvm_load_guest_xcr0
  ...
  kvm_x86_ops->run(vcpu)
    vmx_vcpu_run
      vmx_complete_atomic_exit
        kvm_machine_check
          do_machine_check
            do_memory_failure
              memory_failure
                lock_page

Commit 1811d979c716 "fixed" the immediate issue of XRSTORS exploding, but
completely ignored that scheduling out a vCPU task while IRQs and
preemption are disabled is wildly broken.  Thankfully, commit 5567d11c21a1 ("x86/mce:
Send #MC singal from task work") (somewhat incidentally?) fixed that flaw
by pushing the meat of the work to the user-return path, i.e. to task
context.

KVM has also hardened itself against #MC goofs by moving #MC forwarding to
kvm_x86_ops.handle_exit_irqoff(), i.e. out of the fastpath.  While that's
by no means a robust fix, restoring as much state as possible before
handling the #MC will hopefully provide some measure of protection in the
event that #MC handling goes off the rails again.

Note, KVM always intercepts XCR0 writes for vCPUs without protected state,
i.e. there's no risk of consuming a stale XCR0 when determining if a PKRU
update is needed; kvm_load_host_xfeatures() only reads, and never writes,
vcpu->arch.xcr0.
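
For reference, the consumer in question looks roughly like this (a
simplified sketch of the PKRU handling in kvm_load_host_xsave_state(),
not the exact upstream code):

  if (cpu_feature_enabled(X86_FEATURE_PKU) &&
      ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
       kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
          /*
           * vcpu->arch.xcr0 is KVM's cached copy of the guest's XCR0 and
           * is always current because XSETBV is intercepted, so restoring
           * hardware XCR0 earlier (in kvm_load_host_xfeatures()) doesn't
           * change the outcome of this check.
           */
          vcpu->arch.pkru = rdpkru();
          if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                  wrpkru(vcpu->arch.host_pkru);
  }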

Deferring the XCR0 and XSS loads shaves ~300 cycles off the fastpath for
Intel, and ~500 cycles for AMD.  E.g. using INVD in KVM-Unit-Tests'
vmexit.c, with an extra hack to enable CR4.OSXSAVE, latency numbers for
AMD Turin go from ~2000 => ~1500 cycles, and for Intel Emerald Rapids
from ~1300 => ~1000 cycles.
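
The "extra hack" matters because KVM only swaps XCR0/XSS when the guest
has CR4.OSXSAVE set.  A guest-side snippet to turn it on could look like
the sketch below (hypothetical code using raw inline asm, not the actual
kvm-unit-tests change; it assumes the vCPU advertises XSAVE support):

  #define X86_CR4_OSXSAVE (1UL << 18)

  static inline unsigned long read_cr4(void)
  {
          unsigned long cr4;

          asm volatile("mov %%cr4, %0" : "=r"(cr4));
          return cr4;
  }

  static inline void write_cr4(unsigned long cr4)
  {
          asm volatile("mov %0, %%cr4" :: "r"(cr4) : "memory");
  }

  static inline void xsetbv(unsigned int index, unsigned long long value)
  {
          unsigned int eax = value, edx = value >> 32;

          asm volatile("xsetbv" :: "a"(eax), "d"(edx), "c"(index));
  }

  static void enable_osxsave(void)
  {
          /* Gate for KVM's XCR0/XSS swapping around VM-Entry/VM-Exit. */
          write_cr4(read_cr4() | X86_CR4_OSXSAVE);
          /* x87 (bit 0) must always be set in XCR0; the minimum is fine. */
          xsetbv(0, 0x1);
  }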

Cc: Jon Kohler <jon@nutanix.com>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Jon Kohler <jon@nutanix.com>
Link: https://patch.msgid.link/20251118222328.2265758-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6af37204bd976119404fa129f2993db9d8b819d6..f6af75bff9adac58336542d0f7c8bd6b90b91af3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1205,13 +1205,12 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.guest_state_protected)
                return;
 
        if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
                if (vcpu->arch.xcr0 != kvm_host.xcr0)
                        xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 
@@ -1219,6 +1218,27 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
                    vcpu->arch.ia32_xss != kvm_host.xss)
                        wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
        }
+}
+
+static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.guest_state_protected)
+               return;
+
+       if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+               if (vcpu->arch.xcr0 != kvm_host.xcr0)
+                       xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+               if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+                   vcpu->arch.ia32_xss != kvm_host.xss)
+                       wrmsrq(MSR_IA32_XSS, kvm_host.xss);
+       }
+}
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.guest_state_protected)
+               return;
 
        if (cpu_feature_enabled(X86_FEATURE_PKU) &&
            vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1240,17 +1260,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                        wrpkru(vcpu->arch.host_pkru);
        }
-
-       if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
-               if (vcpu->arch.xcr0 != kvm_host.xcr0)
-                       xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
-               if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
-                   vcpu->arch.ia32_xss != kvm_host.xss)
-                       wrmsrq(MSR_IA32_XSS, kvm_host.xss);
-       }
-
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
 
@@ -11264,6 +11273,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (vcpu->arch.guest_fpu.xfd_err)
                wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 
+       kvm_load_guest_xfeatures(vcpu);
+
        if (unlikely(vcpu->arch.switch_db_regs &&
                     !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
                set_debugreg(DR7_FIXED_1, 7);
@@ -11350,6 +11361,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
+       kvm_load_host_xfeatures(vcpu);
+
        /*
         * Sync xfd before calling handle_exit_irqoff() which may
         * rely on the fact that guest_fpu::xfd is up-to-date (e.g.