git.ipfire.org Git - thirdparty/linux.git/commitdiff
KVM: nSVM: Delay setting soft IRQ RIP tracking fields until vCPU run
Author: Sean Christopherson <seanjc@google.com>
Thu, 5 Mar 2026 00:06:56 +0000 (16:06 -0800)
Committer: Sean Christopherson <seanjc@google.com>
Thu, 5 Mar 2026 00:06:56 +0000 (16:06 -0800)
In the save+restore path, when restoring nested state, the values of RIP
and CS base passed into nested_vmcb02_prepare_control() are mostly
incorrect.  They are both pulled from the vmcb02. For CS base, the value
is only correct if system regs are restored before nested state. The
value of RIP is whatever the vCPU had in vmcb02 before restoring nested
state (zero on a freshly created vCPU).

Instead, take a similar approach to NextRIP, and delay initializing the
RIP tracking fields until shortly before the vCPU is run, to make sure
the most up-to-date values of RIP and CS base are used regardless of
KVM_SET_SREGS, KVM_SET_REGS, and KVM_SET_NESTED_STATE's relative
ordering.

Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE")
CC: stable@vger.kernel.org
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yosry Ahmed <yosry@kernel.org>
Link: https://patch.msgid.link/20260225005950.3739782-8-yosry@kernel.org
[sean: deal with the svm_cancel_injection() madness]
Signed-off-by: Sean Christopherson <seanjc@google.com>
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c

index 76d959d15e1476d4df3ac8d6f1e411400cd806eb..3e2841598a36c4a6faf0883e0b7c342626d29852 100644 (file)
@@ -742,9 +742,7 @@ static bool is_evtinj_nmi(u32 evtinj)
        return type == SVM_EVTINJ_TYPE_NMI;
 }
 
-static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
-                                         unsigned long vmcb12_rip,
-                                         unsigned long vmcb12_csbase)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
        u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
        u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
@@ -856,15 +854,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
                vmcb02->control.next_rip = svm->nested.ctl.next_rip;
 
        svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+
+       /*
+        * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1
+        * doesn't have NRIPS) are initialized later, before the vCPU is run.
+        */
        if (is_evtinj_soft(vmcb02->control.event_inj)) {
                svm->soft_int_injected = true;
-               svm->soft_int_csbase = vmcb12_csbase;
-               svm->soft_int_old_rip = vmcb12_rip;
                if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
                    !svm->nested.nested_run_pending)
                        svm->soft_int_next_rip = svm->nested.ctl.next_rip;
-               else
-                       svm->soft_int_next_rip = vmcb12_rip;
        }
 
        /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
@@ -962,7 +961,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
        nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
 
        svm_switch_vmcb(svm, &svm->nested.vmcb02);
-       nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
+       nested_vmcb02_prepare_control(svm);
        nested_vmcb02_prepare_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
@@ -1907,7 +1906,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        nested_copy_vmcb_control_to_cache(svm, ctl);
 
        svm_switch_vmcb(svm, &svm->nested.vmcb02);
-       nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
+       nested_vmcb02_prepare_control(svm);
 
        /*
         * Any previously restored state (e.g. KVM_SET_SREGS) would mark fields
index f862bafc381ad7d5d7a8823dfdbdba6971d9b372..d82e30c40eaa9c501573064031bccfa279791d98 100644 (file)
@@ -3637,6 +3637,16 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code);
 }
 
+static void svm_set_nested_run_soft_int_state(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->soft_int_csbase = svm->vmcb->save.cs.base;
+       svm->soft_int_old_rip = kvm_rip_read(vcpu);
+       if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+               svm->soft_int_next_rip = kvm_rip_read(vcpu);
+}
+
 static int pre_svm_run(struct kvm_vcpu *vcpu)
 {
        struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
@@ -3759,6 +3769,13 @@ static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu)
        if (boot_cpu_has(X86_FEATURE_NRIPS) &&
            !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
                svm->vmcb->control.next_rip = kvm_rip_read(vcpu);
+
+       /*
+        * Similarly, initialize the soft int metadata here to use the most
+        * up-to-date values of RIP and CS base, regardless of restore order.
+        */
+       if (svm->soft_int_injected)
+               svm_set_nested_run_soft_int_state(vcpu);
 }
 
 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
@@ -4128,6 +4145,18 @@ static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
        bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
        struct vcpu_svm *svm = to_svm(vcpu);
 
+       /*
+        * Initialize the soft int fields *before* reading them below if KVM
+        * aborted entry to the guest with a nested VMRUN pending.  To ensure
+        * KVM uses up-to-date values for RIP and CS base across save/restore,
+        * regardless of restore order, KVM waits to set the soft int fields
+        * until VMRUN is imminent.  But when canceling injection, KVM requeues
+        * the soft int and will reinject it via the standard injection flow,
+        * and so KVM needs to grab the state from the pending nested VMRUN.
+        */
+       if (is_guest_mode(vcpu) && svm->nested.nested_run_pending)
+               svm_set_nested_run_soft_int_state(vcpu);
+
        /*
         * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
         * associated with the original soft exception/interrupt.  next_rip is