From: Sean Christopherson Date: Thu, 5 Mar 2026 00:06:56 +0000 (-0800) Subject: KVM: nSVM: Delay setting soft IRQ RIP tracking fields until vCPU run X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c64bc6ed1764c1b7e3c0017019f743196074092f;p=thirdparty%2Fkernel%2Flinux.git KVM: nSVM: Delay setting soft IRQ RIP tracking fields until vCPU run In the save+restore path, when restoring nested state, the values of RIP and CS base passed into nested_vmcb02_prepare_control() are mostly incorrect. They are both pulled from the vmcb02. For CS base, the value is only correct if system regs are restored before nested state. The value of RIP is whatever the vCPU had in vmcb02 before restoring nested state (zero on a freshly created vCPU). Instead, take a similar approach to NextRIP, and delay initializing the RIP tracking fields until shortly before the vCPU is run, to make sure the most up-to-date values of RIP and CS base are used regardless of KVM_SET_SREGS, KVM_SET_REGS, and KVM_SET_NESTED_STATE's relative ordering. Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260225005950.3739782-8-yosry@kernel.org [sean: deal with the svm_cancel_injection() madness] Signed-off-by: Sean Christopherson --- diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 76d959d15e147..3e2841598a36c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -742,9 +742,7 @@ static bool is_evtinj_nmi(u32 evtinj) return type == SVM_EVTINJ_TYPE_NMI; } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, - unsigned long vmcb12_rip, - unsigned long vmcb12_csbase) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -856,15 +854,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.next_rip = svm->nested.ctl.next_rip; svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + + /* + * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1 + * doesn't have NRIPS) are initialized later, before the vCPU is run. + */ if (is_evtinj_soft(vmcb02->control.event_inj)) { svm->soft_int_injected = true; - svm->soft_int_csbase = vmcb12_csbase; - svm->soft_int_old_rip = vmcb12_rip; if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || !svm->nested.nested_run_pending) svm->soft_int_next_rip = svm->nested.ctl.next_rip; - else - svm->soft_int_next_rip = vmcb12_rip; } /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ @@ -962,7 +961,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); + nested_vmcb02_prepare_control(svm); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -1907,7 +1906,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); + nested_vmcb02_prepare_control(svm); /* * Any previously restored state (e.g. KVM_SET_SREGS) would mark fields diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f862bafc381ad..d82e30c40eaa9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3637,6 +3637,16 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } +static void svm_set_nested_run_soft_int_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = kvm_rip_read(vcpu); + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) + svm->soft_int_next_rip = kvm_rip_read(vcpu); +} + static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); @@ -3759,6 +3769,13 @@ static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_NRIPS) && !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->vmcb->control.next_rip = kvm_rip_read(vcpu); + + /* + * Simiarly, initialize the soft int metadata here to use the most + * up-to-date values of RIP and CS base, regardless of restore order. + */ + if (svm->soft_int_injected) + svm_set_nested_run_soft_int_state(vcpu); } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, @@ -4128,6 +4145,18 @@ static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); struct vcpu_svm *svm = to_svm(vcpu); + /* + * Initialize the soft int fields *before* reading them below if KVM + * aborted entry to the guest with a nested VMRUN pending. To ensure + * KVM uses up-to-date values for RIP and CS base across save/restore, + * regardless of restore order, KVM waits to set the soft int fields + * until VMRUN is imminent. But when canceling injection, KVM requeues + * the soft int and will reinject it via the standard injection flow, + * and so KVM needs to grab the state from the pending nested VMRUN. + */ + if (is_guest_mode(vcpu) && svm->nested.nested_run_pending) + svm_set_nested_run_soft_int_state(vcpu); + /* * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's * associated with the original soft exception/interrupt. next_rip is