KVM: x86/pmu: Load/put mediated PMU context when entering/exiting guest
author     Dapeng Mi <dapeng1.mi@linux.intel.com>
           Sat, 6 Dec 2025 00:17:03 +0000 (16:17 -0800)
committer  Sean Christopherson <seanjc@google.com>
           Thu, 8 Jan 2026 19:52:11 +0000 (11:52 -0800)
Implement the PMU "world switch" between host perf and guest mediated PMU.
When loading guest state, call into perf to switch from host to guest, and
then load guest state into hardware, and then reverse those actions when
putting guest state.

On the KVM side, when loading guest state, zero PERF_GLOBAL_CTRL to ensure
all counters are disabled, then load selectors and counters, and finally
call into vendor code to load control/status information.  While VMX and
SVM use different mechanisms to avoid counting host activity while guest
controls are loaded, both implementations require PERF_GLOBAL_CTRL to be
zeroed when the event selectors are in flux.
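
In rough pseudo-C, the load side ends up ordered like this (the helpers are
the ones added to pmu.c further down in this diff; the point is the ordering,
not the exact code):

  /* Sketch of kvm_mediated_pmu_load(), IRQs disabled. */
  perf_load_guest_context();                    /* host perf steps aside */
  wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0);      /* stop all counting */
  perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC));
  kvm_pmu_load_guest_pmcs(vcpu);                /* selectors + counters */
  kvm_pmu_call(mediated_load)(vcpu);            /* vendor: status/controls */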

When putting guest state, reverse the order, and save and zero controls
and status prior to saving+zeroing selectors and counters.  Defer clearing
PERF_GLOBAL_CTRL to vendor code, as only SVM needs to manually clear the
MSR; VMX configures PERF_GLOBAL_CTRL to be atomically cleared by the CPU
on VM-Exit.
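
The put side is the mirror image (again just a sketch of the ordering):

  /* Sketch of kvm_mediated_pmu_put(), IRQs disabled. */
  kvm_pmu_call(mediated_put)(vcpu);     /* vendor: save+clear controls/status */
  kvm_pmu_put_guest_pmcs(vcpu);         /* save counters, zero selectors/PMCs */
  perf_put_guest_lvtpc();
  perf_put_guest_context();             /* host perf reclaims the PMU */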

Handle the difference in MSR layouts between Intel and AMD by communicating
the bases and stride via kvm_pmu_ops.  Because KVM requires Intel v4 (and
full-width writes) and AMD v2, the MSRs to load/save are constant for a
given vendor, i.e. do not vary based on the guest PMU, and do not vary
based on the host PMU (because KVM will simply disable mediated PMU support if
the necessary MSRs are unsupported).
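
E.g. a counter's MSR is simply base + index * stride, as computed by the new
pmu.c helpers below:

  static __always_inline u32 gp_counter_msr(u32 idx)
  {
          return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
  }

With Intel's MSR_IA32_PMC0 base (0x4c1) and a stride of 1, GP counter 2 is MSR
0x4c3; with AMD's interleaved MSR_F15H_PERF_CTR0 base (0xc0010201) and a
stride of 2, it is 0xc0010205.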

Except for retrieving the guest's PERF_GLOBAL_CTRL, which needs to be read
before invoking any fastpath handler (spoiler alert), perform the context
switch around KVM's inner run loop.  State only needs to be synchronized
from hardware before KVM can access the software "caches".

Note, VMX already grabs the guest's PERF_GLOBAL_CTRL immediately after
VM-Exit, as hardware saves the value into the VMCS.
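
On VMX that capture is conceptually just a VMCS read right after the exit,
roughly (a sketch, assuming the exit controls are configured to save the guest
value; the SVM equivalent is the rdmsrq() added to svm_vcpu_run() below):

  /* VMX sketch: the CPU stored the guest value in the VMCS on VM-Exit. */
  vcpu_to_pmu(vcpu)->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);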

Co-developed-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
Co-developed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Tested-by: Xudong Hao <xudong.hao@intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Tested-by: Manali Shukla <manali.shukla@amd.com>
Link: https://patch.msgid.link/20251206001720.468579-28-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
arch/x86/include/asm/kvm-x86-pmu-ops.h
arch/x86/include/asm/msr-index.h
arch/x86/kvm/pmu.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm/pmu.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/x86.c

diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index ad2cc82abf794493f1cbda5bac1eaf1ce71fafd8..f0aa6996811f9e1d3c6e9e121d0e29fb1f4ab717 100644 (file)
@@ -24,6 +24,8 @@ KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
 KVM_X86_PMU_OP_OPTIONAL(cleanup)
 
 KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl)
+KVM_X86_PMU_OP(mediated_load)
+KVM_X86_PMU_OP(mediated_put)
 
 #undef KVM_X86_PMU_OP
 #undef KVM_X86_PMU_OP_OPTIONAL
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3d0a0950d20a1609e02a8e854ed14d5d45a45dfa..4d3566bb1a9385cb84e366b9c1ad407a653dac62 100644 (file)
 #define MSR_CORE_PERF_GLOBAL_STATUS    0x0000038e
 #define MSR_CORE_PERF_GLOBAL_CTRL      0x0000038f
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL  0x00000390
+#define MSR_CORE_PERF_GLOBAL_STATUS_SET        0x00000391
 
 #define MSR_PERF_METRICS               0x00000329
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 24f5c14715efb572500a92dddb17d8acbf1166e7..f6387c67b25c4a165185e00127f5540aa3747be2 100644 (file)
@@ -880,10 +880,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        diff = pmu->global_ctrl ^ data;
                        pmu->global_ctrl = data;
                        reprogram_counters(pmu, diff);
-
-                       if (kvm_vcpu_has_mediated_pmu(vcpu))
-                               kvm_pmu_call(write_global_ctrl)(data);
                }
+               /*
+                * Unconditionally forward writes to vendor code, i.e. to the
+                * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}.
+                */
+               if (kvm_vcpu_has_mediated_pmu(vcpu))
+                       kvm_pmu_call(write_global_ctrl)(data);
                break;
        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
                /*
@@ -1244,3 +1247,124 @@ cleanup:
        kfree(filter);
        return r;
 }
+
+static __always_inline u32 fixed_counter_msr(u32 idx)
+{
+       return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static __always_inline u32 gp_counter_msr(u32 idx)
+{
+       return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static __always_inline u32 gp_eventsel_msr(u32 idx)
+{
+       return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct kvm_pmc *pmc;
+       u32 i;
+
+       /*
+        * No need to zero out unexposed GP/fixed counters/selectors since RDPMC
+        * is intercepted if hardware has counters that aren't visible to the
+        * guest (KVM will inject #GP as appropriate).
+        */
+       for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+               pmc = &pmu->gp_counters[i];
+
+               wrmsrq(gp_counter_msr(i), pmc->counter);
+               wrmsrq(gp_eventsel_msr(i), pmc->eventsel_hw);
+       }
+       for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+               pmc = &pmu->fixed_counters[i];
+
+               wrmsrq(fixed_counter_msr(i), pmc->counter);
+       }
+}
+
+void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu)
+{
+       if (!kvm_vcpu_has_mediated_pmu(vcpu) ||
+           KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+               return;
+
+       lockdep_assert_irqs_disabled();
+
+       perf_load_guest_context();
+
+       /*
+        * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context
+        * disables all individual counters (if any were enabled), but doesn't
+        * globally disable the entire PMU.  Loading event selectors and PMCs
+        * with guest values while PERF_GLOBAL_CTRL is non-zero will generate
+        * unexpected events and PMIs.
+        *
+        * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically
+        * loading PERF_GLOBAL_CONTROL.  SVM effectively performs the switch by
+        * configuring all events to be GUEST_ONLY.  Clear PERF_GLOBAL_CONTROL
+        * even for SVM to minimize the damage if a perf event is left enabled,
+        * and to ensure a consistent starting state.
+        */
+       wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0);
+
+       perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC));
+
+       kvm_pmu_load_guest_pmcs(vcpu);
+
+       kvm_pmu_call(mediated_load)(vcpu);
+}
+
+static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct kvm_pmc *pmc;
+       u32 i;
+
+       /*
+        * Clear selectors and counters to ensure hardware doesn't count using
+        * guest controls when the host (perf) restores its state.
+        */
+       for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+               pmc = &pmu->gp_counters[i];
+
+               pmc->counter = rdpmc(i);
+               if (pmc->counter)
+                       wrmsrq(gp_counter_msr(i), 0);
+               if (pmc->eventsel_hw)
+                       wrmsrq(gp_eventsel_msr(i), 0);
+       }
+
+       for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+               pmc = &pmu->fixed_counters[i];
+
+               pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i);
+               if (pmc->counter)
+                       wrmsrq(fixed_counter_msr(i), 0);
+       }
+}
+
+void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu)
+{
+       if (!kvm_vcpu_has_mediated_pmu(vcpu) ||
+           KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+               return;
+
+       lockdep_assert_irqs_disabled();
+
+       /*
+        * Defer handling of PERF_GLOBAL_CTRL to vendor code.  On Intel, it's
+        * atomically cleared on VM-Exit, i.e. doesn't need to be cleared here.
+        */
+       kvm_pmu_call(mediated_put)(vcpu);
+
+       kvm_pmu_put_guest_pmcs(vcpu);
+
+       perf_put_guest_lvtpc();
+
+       perf_put_guest_context();
+}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 9a199109d6729760564d012fdd737299ef029e3e..25b583da9ee29f81cf2ead31dd5c4f64beeac856 100644 (file)
@@ -38,11 +38,19 @@ struct kvm_pmu_ops {
        void (*cleanup)(struct kvm_vcpu *vcpu);
 
        bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu);
+       void (*mediated_load)(struct kvm_vcpu *vcpu);
+       void (*mediated_put)(struct kvm_vcpu *vcpu);
        void (*write_global_ctrl)(u64 global_ctrl);
 
        const u64 EVENTSEL_EVENT;
        const int MAX_NR_GP_COUNTERS;
        const int MIN_NR_GP_COUNTERS;
+
+       const u32 PERF_GLOBAL_CTRL;
+       const u32 GP_EVENTSEL_BASE;
+       const u32 GP_COUNTER_BASE;
+       const u32 FIXED_COUNTER_BASE;
+       const u32 MSR_STRIDE;
 };
 
 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
@@ -240,6 +248,8 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
 void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu);
 void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu);
+void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu);
+void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu);
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx);
 bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 6d5f791126b13d851b287a9add4e04bd8e45e989..7aa298eeb0721b7bb349c84ce0d8b6bed454fbd7 100644 (file)
@@ -234,6 +234,32 @@ static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pm
        return host_pmu->version >= 2;
 }
 
+static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       u64 global_status;
+
+       rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status);
+       /* Clear host global_status MSR if non-zero. */
+       if (global_status)
+               wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status);
+
+       wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status);
+       wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl);
+}
+
+static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+       wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
+       rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status);
+
+       /* Clear global status bits if non-zero */
+       if (pmu->global_status)
+               wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status);
+}
+
 struct kvm_pmu_ops amd_pmu_ops __initdata = {
        .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
        .msr_idx_to_pmc = amd_msr_idx_to_pmc,
@@ -245,8 +271,16 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
        .init = amd_pmu_init,
 
        .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported,
+       .mediated_load = amd_mediated_pmu_load,
+       .mediated_put = amd_mediated_pmu_put,
 
        .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
        .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
        .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
+
+       .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+       .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0,
+       .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0,
+       .FIXED_COUNTER_BASE = 0,
+       .MSR_STRIDE = 2,
 };
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index dca45f5151f9eb50fc2306d4d5160910dd5d99fb..1a616eb3ff1c8c3f392e310493e864996e2d287d 100644 (file)
@@ -4367,6 +4367,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
        vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
 
+       if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
+               rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);
+
        trace_kvm_exit(vcpu, KVM_ISA_SVM);
 
        svm_complete_interrupts(vcpu);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 855240678300e26fbd15d9246c9be2229c42ca26..55249fa4db95a0d823e88c909f5b36dd6fc2b8fb 100644 (file)
@@ -792,6 +792,42 @@ static void intel_pmu_write_global_ctrl(u64 global_ctrl)
        vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl);
 }
 
+
+static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       u64 global_status, toggle;
+
+       rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
+       toggle = pmu->global_status ^ global_status;
+       if (global_status & toggle)
+               wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
+       if (pmu->global_status & toggle)
+               wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
+
+       wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw);
+}
+
+static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+       /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-exit. */
+       rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
+
+       /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
+       if (pmu->global_status)
+               wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
+
+       /*
+        * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
+        * also to avoid accidentally enabling fixed counters (based on guest
+        * state) while running in the host, e.g. when setting global ctrl.
+        */
+       if (pmu->fixed_ctr_ctrl_hw)
+               wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+}
+
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
        .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
        .msr_idx_to_pmc = intel_msr_idx_to_pmc,
@@ -805,9 +841,17 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
        .cleanup = intel_pmu_cleanup,
 
        .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported,
+       .mediated_load = intel_mediated_pmu_load,
+       .mediated_put = intel_mediated_pmu_put,
        .write_global_ctrl = intel_pmu_write_global_ctrl,
 
        .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
        .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
        .MIN_NR_GP_COUNTERS = 1,
+
+       .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL,
+       .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0,
+       .GP_COUNTER_BASE = MSR_IA32_PMC0,
+       .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0,
+       .MSR_STRIDE = 1,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 76e86eb358dff0b32234940b4e2a6051e0288ed3..589a309259f42ed75f9dfa4f89d8b6a3e0df1b72 100644 (file)
@@ -11334,6 +11334,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                run_flags |= KVM_RUN_LOAD_DEBUGCTL;
        vcpu->arch.host_debugctl = debug_ctl;
 
+       kvm_mediated_pmu_load(vcpu);
+
        guest_timing_enter_irqoff();
 
        /*
@@ -11372,6 +11374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        kvm_load_host_pkru(vcpu);
 
+       kvm_mediated_pmu_put(vcpu);
+
        /*
         * Do this here before restoring debug registers on the host.  And
         * since we do this before handling the vmexit, a DR access vmexit