git.ipfire.org Git - thirdparty/linux.git/commitdiff
KVM: x86: Suppress WARNs on nested_run_pending after userspace exit
Author:     Sean Christopherson <seanjc@google.com>
AuthorDate: Thu, 12 Mar 2026 23:48:23 +0000 (16:48 -0700)
Committer:  Sean Christopherson <seanjc@google.com>
CommitDate: Fri, 3 Apr 2026 16:34:01 +0000 (09:34 -0700)
To end an ongoing game of whack-a-mole between KVM and syzkaller, WARN on
illegally cancelling a pending nested VM-Enter if and only if userspace
has NOT gained control of the vCPU since the nested run was initiated.  As
proven time and time again by syzkaller, userspace can clobber vCPU state
so as to force a VM-Exit that violates KVM's architectural modelling of
VMRUN/VMLAUNCH/VMRESUME.

To detect that userspace has gained control, while minimizing the risk of
operating on stale data, convert nested_run_pending from a pure boolean to
a tri-state of sorts, where '0' is still "not pending", '1' is "pending",
and '2' is "pending but untrusted".  Then on KVM_RUN, if the flag is in
the "trusted pending" state, move it to "untrusted pending".

Note, moving the state to "untrusted" even if KVM_RUN is ultimately
rejected is a-ok, because for the "untrusted" state to matter, KVM must
get past kvm_x86_vcpu_pre_run() at some point for the vCPU.

Reviewed-by: Yosry Ahmed <yosry@kernel.org>
Link: https://patch.msgid.link/20260312234823.3120658-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h

index 19b3790e5e99ac8c14baed1ca568d07a8d47ac19..c54c969c88ee80a44c38d06d039fdf6081cd193b 100644 (file)
@@ -1104,8 +1104,14 @@ struct kvm_vcpu_arch {
         * can only occur at instruction boundaries.  The only exception is
         * VMX's "notify" exits, which exist in large part to break the CPU out
         * of infinite ucode loops, but can corrupt vCPU state in the process!
+        *
+        * For all intents and purposes, this is a boolean, but it's tracked as
+        * a u8 so that KVM can detect when userspace may have stuffed vCPU
+        * state and generated an architecturally-impossible VM-Exit.
         */
-       bool nested_run_pending;
+#define KVM_NESTED_RUN_PENDING                 1
+#define KVM_NESTED_RUN_PENDING_UNTRUSTED       2
+       u8 nested_run_pending;
 
 #if IS_ENABLED(CONFIG_HYPERV)
        hpa_t hv_root_tdp;
index e24f5450f12190c997ed700f5f7b81606dd2d85d..88e878160229b6ed2c1349217345be988dcee49c 100644 (file)
@@ -1132,7 +1132,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
        if (!npt_enabled)
                vmcb01->save.cr3 = kvm_read_cr3(vcpu);
 
-       vcpu->arch.nested_run_pending = 1;
+       vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
 
        if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
            !nested_svm_merge_msrpm(vcpu)) {
@@ -1278,7 +1278,8 @@ void nested_svm_vmexit(struct vcpu_svm *svm)
        /* Exit Guest-Mode */
        leave_guest_mode(vcpu);
        svm->nested.vmcb12_gpa = 0;
-       WARN_ON_ONCE(vcpu->arch.nested_run_pending);
+
+       kvm_warn_on_nested_run_pending(vcpu);
 
        kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
@@ -1985,8 +1986,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 
        svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
 
-       vcpu->arch.nested_run_pending =
-               !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+       if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+               vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+       else
+               vcpu->arch.nested_run_pending = 0;
 
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
 
index dbd35340e7b0bb9f0d8a3c8e437b3a2bb1a12e88..f4b0aeba948fb2bfc73735b372207de7527b49bd 100644 (file)
@@ -5013,7 +5013,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
        if (ret)
                goto unmap_save;
 
-       vcpu->arch.nested_run_pending = 1;
+       vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
 
 unmap_save:
        kvm_vcpu_unmap(vcpu, &map_save);
index 031075467a6dcfa2573935aab20eaff104f936cd..48d2991886cb98797efa1ff0cc5defd059d6cc42 100644 (file)
@@ -3830,7 +3830,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * We're finally done with prerequisite checking, and can start with
         * the nested entry.
         */
-       vcpu->arch.nested_run_pending = 1;
+       vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
        vmx->nested.has_preemption_timer_deadline = false;
        status = nested_vmx_enter_non_root_mode(vcpu, true);
        if (unlikely(status != NVMX_VMENTRY_SUCCESS))
@@ -5042,7 +5042,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        vmx->nested.mtf_pending = false;
 
        /* trying to cancel vmlaunch/vmresume is a bug */
-       WARN_ON_ONCE(vcpu->arch.nested_run_pending);
+       kvm_warn_on_nested_run_pending(vcpu);
 
 #ifdef CONFIG_KVM_HYPERV
        if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
@@ -6665,7 +6665,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
        unsigned long exit_qual;
        u32 exit_intr_info;
 
-       WARN_ON_ONCE(vcpu->arch.nested_run_pending);
+       kvm_warn_on_nested_run_pending(vcpu);
 
        /*
         * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
@@ -6973,8 +6973,10 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
        if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
                return 0;
 
-       vcpu->arch.nested_run_pending =
-               !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+       if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+               vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+       else
+               vcpu->arch.nested_run_pending = 0;
 
        vmx->nested.mtf_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
index 9ef3fb04403d2397fdbe484dcd1f436cd63817d5..d75f6b22d74cbf5034b991738d52058becfdd7ae 100644 (file)
@@ -8532,7 +8532,7 @@ int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
                if (ret)
                        return ret;
 
-               vcpu->arch.nested_run_pending = 1;
+               vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
                vmx->nested.smm.guest_mode = false;
        }
        return 0;
index 64da02d1ee0084c1251b2c4f554b33ea65cc80c8..aa29f90c6e963b785e6558526080f0966d9724be 100644 (file)
@@ -11913,6 +11913,13 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 
 static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
 {
+       /*
+        * Userspace may have modified vCPU state, mark nested_run_pending as
+        * "untrusted" to avoid triggering false-positive WARNs.
+        */
+       if (vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING)
+               vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+
        /*
         * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
         * tracks the pending SIPI separately.  SIPI_RECEIVED is still accepted
index 94d4f07aaaa09e9ac6c401ca18ee9cf504f69d07..9fe3a53fd8bedeebc3e18ac9c8853755fa2078ca 100644 (file)
@@ -188,6 +188,16 @@ static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu)
        return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu);
 }
 
+/*
+ * WARN if a nested VM-Enter is pending completion, and userspace hasn't gained
+ * control since the nested VM-Enter was initiated (in which case, userspace
+ * may have modified vCPU state to induce an architecturally invalid VM-Exit).
+ */
+static inline void kvm_warn_on_nested_run_pending(struct kvm_vcpu *vcpu)
+{
+       WARN_ON_ONCE(vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING);
+}
+
 static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
 {
        vcpu->arch.mp_state = mp_state;