]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
perf: attach/detach PMU specific data
authorKan Liang <kan.liang@linux.intel.com>
Fri, 14 Mar 2025 17:26:56 +0000 (10:26 -0700)
committerPeter Zijlstra <peterz@infradead.org>
Mon, 17 Mar 2025 10:23:37 +0000 (11:23 +0100)
The LBR call stack data has to be saved/restored during context switch
to fix the shorter LBR call stack issue in system-wide mode.
Allocate PMU specific data and attach them to the corresponding
task_struct during LBR call stack monitoring.

When a LBR call stack event is accounted, the perf_ctx_data for the
related tasks will be allocated/attached by attach_perf_ctx_data().
When a LBR call stack event is unaccounted, the perf_ctx_data for the
related tasks will be detached/freed by detach_perf_ctx_data().

The LBR call stack event could be a per-task event or a system-wide
event.
- For a per-task event, perf only allocates the perf_ctx_data for the
  current task. If the allocation fails, perf will error out.
- For a system-wide event, perf has to allocate the perf_ctx_data for
  both the existing tasks and the upcoming tasks.
  The allocation for the existing tasks is done in perf_event_alloc().
  If any allocation fails, perf will error out.
  The allocation for the new tasks will be done in perf_event_fork().
  A global reader/writer semaphore, global_ctx_data_rwsem, is added to
  address the global race.
- The perf_ctx_data can only be freed by the last LBR call stack event.
  The number of the per-task events is tracked by refcount of each task.
  Since the system-wide events impact all tasks, it's not practical to
  go through the whole task list to update the refcount for each
  system-wide event. The number of system-wide events is tracked by a
  global variable global_ctx_data_ref.

Suggested-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250314172700.438923-3-kan.liang@linux.intel.com
include/linux/perf_event.h
kernel/events/core.c

index 75d9b1e93f39bc1baae3f94ba3edaad43688d822..2551170c0d18915b6678c9c7bb1ade1f89742a49 100644 (file)
@@ -676,11 +676,12 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP      0x0002
 #define PERF_ATTACH_TASK       0x0004
 #define PERF_ATTACH_TASK_DATA  0x0008
-#define PERF_ATTACH_ITRACE     0x0010
+#define PERF_ATTACH_GLOBAL_DATA        0x0010
 #define PERF_ATTACH_SCHED_CB   0x0020
 #define PERF_ATTACH_CHILD      0x0040
 #define PERF_ATTACH_EXCLUSIVE  0x0080
 #define PERF_ATTACH_CALLCHAIN  0x0100
+#define PERF_ATTACH_ITRACE     0x0200
 
 struct bpf_prog;
 struct perf_cgroup;
index 20d28b7e30fbbd76f935cc4f0efa5ebcd94acbaa..e86d35e4f27119c68a99d7c63fddb494ec2a0bcd 100644 (file)
@@ -55,6 +55,7 @@
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
 #include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
 
 #include "internal.h"
 
@@ -5217,6 +5218,225 @@ static void unaccount_freq_event(void)
                atomic_dec(&nr_freq_events);
 }
 
+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+       struct perf_ctx_data *cd;
+
+       cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+       if (!cd)
+               return NULL;
+
+       cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+       if (!cd->data) {
+               kfree(cd);
+               return NULL;
+       }
+
+       cd->global = global;
+       cd->ctx_cache = ctx_cache;
+       refcount_set(&cd->refcount, 1);
+
+       return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+       kmem_cache_free(cd->ctx_cache, cd->data);
+       kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+       struct perf_ctx_data *cd;
+
+       cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+       free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+       call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+                    bool global)
+{
+       struct perf_ctx_data *cd, *old = NULL;
+
+       cd = alloc_perf_ctx_data(ctx_cache, global);
+       if (!cd)
+               return -ENOMEM;
+
+       for (;;) {
+               if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+                       if (old)
+                               perf_free_ctx_data_rcu(old);
+                       return 0;
+               }
+
+               if (!old) {
+                       /*
+                        * After seeing a dead @old, we raced with
+                        * removal and lost, try again to install @cd.
+                        */
+                       continue;
+               }
+
+               if (refcount_inc_not_zero(&old->refcount)) {
+                       free_perf_ctx_data(cd); /* unused */
+                       return 0;
+               }
+
+               /*
+                * @old is a dead object, refcount==0 is stable, try and
+                * replace it with @cd.
+                */
+       }
+       return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+       struct task_struct *g, *p;
+       struct perf_ctx_data *cd;
+       int ret;
+
+       if (refcount_inc_not_zero(&global_ctx_data_ref))
+               return 0;
+
+       guard(percpu_write)(&global_ctx_data_rwsem);
+       if (refcount_inc_not_zero(&global_ctx_data_ref))
+               return 0;
+again:
+       /* Allocate everything */
+       scoped_guard (rcu) {
+               for_each_process_thread(g, p) {
+                       cd = rcu_dereference(p->perf_ctx_data);
+                       if (cd && !cd->global) {
+                               cd->global = 1;
+                               if (!refcount_inc_not_zero(&cd->refcount))
+                                       cd = NULL;
+                       }
+                       if (!cd) {
+                               get_task_struct(p);
+                               goto alloc;
+                       }
+               }
+       }
+
+       refcount_set(&global_ctx_data_ref, 1);
+
+       return 0;
+alloc:
+       ret = attach_task_ctx_data(p, ctx_cache, true);
+       put_task_struct(p);
+       if (ret) {
+               __detach_global_ctx_data();
+               return ret;
+       }
+       goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+       struct task_struct *task = event->hw.target;
+       struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+       int ret;
+
+       if (!ctx_cache)
+               return -ENOMEM;
+
+       if (task)
+               return attach_task_ctx_data(task, ctx_cache, false);
+
+       ret = attach_global_ctx_data(ctx_cache);
+       if (ret)
+               return ret;
+
+       event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+       return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+       struct perf_ctx_data *cd;
+
+       scoped_guard (rcu) {
+               cd = rcu_dereference(p->perf_ctx_data);
+               if (!cd || !refcount_dec_and_test(&cd->refcount))
+                       return;
+       }
+
+       /*
+        * The old ctx_data may be lost because of the race.
+        * Nothing is required to do for the case.
+        * See attach_task_ctx_data().
+        */
+       if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+               perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+       struct task_struct *g, *p;
+       struct perf_ctx_data *cd;
+
+again:
+       scoped_guard (rcu) {
+               for_each_process_thread(g, p) {
+                       cd = rcu_dereference(p->perf_ctx_data);
+                       if (!cd || !cd->global)
+                               continue;
+                       cd->global = 0;
+                       get_task_struct(p);
+                       goto detach;
+               }
+       }
+       return;
+detach:
+       detach_task_ctx_data(p);
+       put_task_struct(p);
+       goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+       if (refcount_dec_not_one(&global_ctx_data_ref))
+               return;
+
+       guard(percpu_write)(&global_ctx_data_rwsem);
+       if (!refcount_dec_and_test(&global_ctx_data_ref))
+               return;
+
+       /* remove everything */
+       __detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+       struct task_struct *task = event->hw.target;
+
+       event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+       if (task)
+               return detach_task_ctx_data(task);
+
+       if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+               detach_global_ctx_data();
+               event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+       }
+}
+
 static void unaccount_event(struct perf_event *event)
 {
        bool dec = false;
@@ -5398,6 +5618,9 @@ static void __free_event(struct perf_event *event)
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
 
+       if (event->attach_state & PERF_ATTACH_TASK_DATA)
+               detach_perf_ctx_data(event);
+
        if (event->destroy)
                event->destroy(event);
 
@@ -8607,10 +8830,58 @@ static void perf_event_task(struct task_struct *task,
                       task_ctx);
 }
 
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+                          struct task_struct *parent)
+{
+       struct kmem_cache *ctx_cache = NULL;
+       struct perf_ctx_data *cd;
+
+       if (!refcount_read(&global_ctx_data_ref))
+               return;
+
+       scoped_guard (rcu) {
+               cd = rcu_dereference(parent->perf_ctx_data);
+               if (cd)
+                       ctx_cache = cd->ctx_cache;
+       }
+
+       if (!ctx_cache)
+               return;
+
+       guard(percpu_read)(&global_ctx_data_rwsem);
+       scoped_guard (rcu) {
+               cd = rcu_dereference(child->perf_ctx_data);
+               if (!cd) {
+                       /*
+                        * A system-wide event may be unaccount,
+                        * when attaching the perf_ctx_data.
+                        */
+                       if (!refcount_read(&global_ctx_data_ref))
+                               return;
+                       goto attach;
+               }
+
+               if (!cd->global) {
+                       cd->global = 1;
+                       refcount_inc(&cd->refcount);
+               }
+       }
+
+       return;
+attach:
+       attach_task_ctx_data(child, ctx_cache, true);
+}
+
 void perf_event_fork(struct task_struct *task)
 {
        perf_event_task(task, NULL, 1);
        perf_event_namespaces(task);
+       perf_event_alloc_task_data(task, current);
 }
 
 /*
@@ -12490,6 +12761,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (IS_ERR(pmu))
                return (void*)pmu;
 
+       /*
+        * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config().
+        * The attach should be right after the perf_init_event().
+        * Otherwise, the __free_event() would mistakenly detach the non-exist
+        * perf_ctx_data because of the other errors between them.
+        */
+       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+               err = attach_perf_ctx_data(event);
+               if (err)
+                       return ERR_PTR(err);
+       }
+
        /*
         * Disallow uncore-task events. Similarly, disallow uncore-cgroup
         * events (they don't make sense as the cgroup will be different
@@ -13637,6 +13920,12 @@ void perf_event_exit_task(struct task_struct *child)
         * At this point we need to send EXIT events to cpu contexts.
         */
        perf_event_task(child, NULL, 0);
+
+       /*
+        * Detach the perf_ctx_data for the system-wide event.
+        */
+       guard(percpu_read)(&global_ctx_data_rwsem);
+       detach_task_ctx_data(child);
 }
 
 static void perf_free_event(struct perf_event *event,