git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
perf/x86: Fix NULL event access and potential PEBS record loss
author Dapeng Mi <dapeng1.mi@linux.intel.com>
Wed, 29 Oct 2025 10:21:26 +0000 (18:21 +0800)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 7 Nov 2025 14:08:19 +0000 (15:08 +0100)
When intel_pmu_drain_pebs_icl() is called to drain PEBS records,
perf_event_overflow() may be called to process the last PEBS record.

However, perf_event_overflow() can trigger interrupt throttling and
stop all events of the group, as the call chain below shows.

perf_event_overflow()
  -> __perf_event_overflow()
    -> __perf_event_account_interrupt()
      -> perf_event_throttle_group()
        -> perf_event_throttle()
          -> event->pmu->stop()
            -> x86_pmu_stop()

The side effect of stopping the events is that all corresponding event
pointers in the cpuc->events[] array are cleared to NULL.

Assume there are two PEBS events (event a and event b) in a group. When
intel_pmu_drain_pebs_icl() calls perf_event_overflow() to process the
last PEBS record of PEBS event a, interrupt throttling is triggered and
the pointers of both event a and event b are cleared to NULL. Then
intel_pmu_drain_pebs_icl() tries to process the last PEBS record of
event b and dereferences a NULL pointer.

To avoid this issue, move cpuc->events[] clearing from x86_pmu_stop()
to x86_pmu_del(). It's safe since cpuc->active_mask or
cpuc->pebs_enabled is always checked before accessing the event pointer
from cpuc->events[].

Closes: https://lore.kernel.org/oe-lkp/202507042103.a15d2923-lkp@intel.com
Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group")
Reported-by: kernel test robot <oliver.sang@intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-3-dapeng1.mi@linux.intel.com
arch/x86/events/core.c

index 0cf68ad9dcd0b411984287801c4a957d59320be6..b2868fee765b36b7d68b15ffc49d471c27431a13 100644 (file)
@@ -1344,6 +1344,7 @@ static void x86_pmu_enable(struct pmu *pmu)
                                hwc->state |= PERF_HES_ARCH;
 
                        x86_pmu_stop(event, PERF_EF_UPDATE);
+                       cpuc->events[hwc->idx] = NULL;
                }
 
                /*
@@ -1365,6 +1366,7 @@ static void x86_pmu_enable(struct pmu *pmu)
                         * if cpuc->enabled = 0, then no wrmsr as
                         * per x86_pmu_enable_event()
                         */
+                       cpuc->events[hwc->idx] = event;
                        x86_pmu_start(event, PERF_EF_RELOAD);
                }
                cpuc->n_added = 0;
@@ -1531,7 +1533,6 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 
        event->hw.state = 0;
 
-       cpuc->events[idx] = event;
        __set_bit(idx, cpuc->active_mask);
        static_call(x86_pmu_enable)(event);
        perf_event_update_userpage(event);
@@ -1610,7 +1611,6 @@ void x86_pmu_stop(struct perf_event *event, int flags)
        if (test_bit(hwc->idx, cpuc->active_mask)) {
                static_call(x86_pmu_disable)(event);
                __clear_bit(hwc->idx, cpuc->active_mask);
-               cpuc->events[hwc->idx] = NULL;
                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
                hwc->state |= PERF_HES_STOPPED;
        }
@@ -1648,6 +1648,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
         * Not a TXN, therefore cleanup properly.
         */
        x86_pmu_stop(event, PERF_EF_UPDATE);
+       cpuc->events[event->hw.idx] = NULL;
 
        for (i = 0; i < cpuc->n_events; i++) {
                if (event == cpuc->event_list[i])