To end an ongoing game of whack-a-mole between KVM and syzkaller, WARN on
illegally cancelling a pending nested VM-Enter if and only if userspace
has NOT gained control of the vCPU since the nested run was initiated. As
proven time and time again by syzkaller, userspace can clobber vCPU state
so as to force a VM-Exit that violates KVM's architectural modelling of
VMRUN/VMLAUNCH/VMRESUME.
To detect that userspace has gained control, while minimizing the risk of
operating on stale data, convert nested_run_pending from a pure boolean to
a tri-state of sorts, where '0' is still "not pending", '1' is "pending",
and '2' is "pending but untrusted". Then on KVM_RUN, if the flag is in
the "trusted pending" state, move it to "untrusted pending".
Note, moving the state to "untrusted" even if KVM_RUN is ultimately
rejected is a-ok, because for the "untrusted" state to matter, KVM must
get past kvm_x86_vcpu_pre_run() at some point for the vCPU.
Reviewed-by: Yosry Ahmed <yosry@kernel.org>
Link: https://patch.msgid.link/20260312234823.3120658-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
* can only occur at instruction boundaries. The only exception is
* VMX's "notify" exits, which exist in large part to break the CPU out
* of infinite ucode loops, but can corrupt vCPU state in the process!
+ *
+ * For all intents and purposes, this is a boolean, but it's tracked as
+ * a u8 so that KVM can detect when userspace may have stuffed vCPU
+ * state and generated an architecturally-impossible VM-Exit.
*/
- bool nested_run_pending;
+#define KVM_NESTED_RUN_PENDING 1
+#define KVM_NESTED_RUN_PENDING_UNTRUSTED 2
+ u8 nested_run_pending;
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
if (!npt_enabled)
vmcb01->save.cr3 = kvm_read_cr3(vcpu);
- vcpu->arch.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
!nested_svm_merge_msrpm(vcpu)) {
/* Exit Guest-Mode */
leave_guest_mode(vcpu);
svm->nested.vmcb12_gpa = 0;
- WARN_ON_ONCE(vcpu->arch.nested_run_pending);
+
+ kvm_warn_on_nested_run_pending(vcpu);
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
- vcpu->arch.nested_run_pending =
- !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+ else
+ vcpu->arch.nested_run_pending = 0;
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
if (ret)
goto unmap_save;
- vcpu->arch.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
unmap_save:
kvm_vcpu_unmap(vcpu, &map_save);
* We're finally done with prerequisite checking, and can start with
* the nested entry.
*/
- vcpu->arch.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
vmx->nested.has_preemption_timer_deadline = false;
status = nested_vmx_enter_non_root_mode(vcpu, true);
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
vmx->nested.mtf_pending = false;
/* trying to cancel vmlaunch/vmresume is a bug */
- WARN_ON_ONCE(vcpu->arch.nested_run_pending);
+ kvm_warn_on_nested_run_pending(vcpu);
#ifdef CONFIG_KVM_HYPERV
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
unsigned long exit_qual;
u32 exit_intr_info;
- WARN_ON_ONCE(vcpu->arch.nested_run_pending);
+ kvm_warn_on_nested_run_pending(vcpu);
/*
* Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
return 0;
- vcpu->arch.nested_run_pending =
- !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+ else
+ vcpu->arch.nested_run_pending = 0;
vmx->nested.mtf_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
if (ret)
return ret;
- vcpu->arch.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
vmx->nested.smm.guest_mode = false;
}
return 0;
static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
+ /*
+ * Userspace may have modified vCPU state, mark nested_run_pending as
+ * "untrusted" to avoid triggering false-positive WARNs.
+ */
+ if (vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING)
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+
/*
* SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
* tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu);
}
+/*
+ * WARN if a nested VM-Enter is pending completion, and userspace hasn't gained
+ * control since the nested VM-Enter was initiated (in which case, userspace
+ * may have modified vCPU state to induce an architecturally invalid VM-Exit).
+ *
+ * Deliberately don't WARN on KVM_NESTED_RUN_PENDING_UNTRUSTED: once userspace
+ * has had a chance to stuff vCPU state (the flag is demoted to "untrusted" in
+ * kvm_x86_vcpu_pre_run()), a cancelled nested VM-Enter may be userspace's
+ * doing rather than a KVM bug.
+ */
+static inline void kvm_warn_on_nested_run_pending(struct kvm_vcpu *vcpu)
+{
+ WARN_ON_ONCE(vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING);
+}
+
static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
{
vcpu->arch.mp_state = mp_state;