perf: Support PERF_SAMPLE_READ with inherit

author Ben Gainey <ben.gainey@arm.com>

Tue, 30 Jul 2024 08:44:15 +0000 (09:44 +0100)

committer Peter Zijlstra <peterz@infradead.org>

Fri, 2 Aug 2024 09:30:30 +0000 (11:30 +0200)
author Ben Gainey <ben.gainey@arm.com>
Tue, 30 Jul 2024 08:44:15 +0000 (09:44 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 2 Aug 2024 09:30:30 +0000 (11:30 +0200)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 655f66b18418a8aa58bc5b1754d613dd96e3f76c..701549967c1854e3f2778fe9b13c956eac6422a2 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -969,6 +969,9 @@ struct perf_event_context {
          * The count of events for which using the switch-out fast path
          * should be avoided.
          *
+        * Sum of (event->pending_work + the number of events with
+        *    (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
+        *
          * The SIGTRAP is targeted at ctx->task, as such it won't do changing
          * that until the signal is delivered.
          */
diff --git a/kernel/events/core.c b/kernel/events/core.c

index e6cc354a3ceefb5902979c79fe244ab2282c162d..c01a32687dad171a6517dfc4bc70bfdc5a32e8ba 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1767,6 +1767,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
                 event = rb_entry_safe(rb_next(&event->group_node),      \
                                 typeof(*event), group_node))
  
+/*
+ * Return true if @attr sets inherit and its sample_type has PERF_SAMPLE_READ.
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+       return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
  /*
   * Add an event from the lists for its context.
   * Must be called with ctx->mutex and ctx->lock held.
@@ -1797,6 +1805,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                 ctx->nr_user++;
         if (event->attr.inherit_stat)
                 ctx->nr_stat++;
+       if (has_inherit_and_sample_read(&event->attr))
+               local_inc(&ctx->nr_no_switch_fast);
  
         if (event->state > PERF_EVENT_STATE_OFF)
                 perf_cgroup_event_enable(event, ctx);
@@ -2021,6 +2031,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
                 ctx->nr_user--;
         if (event->attr.inherit_stat)
                 ctx->nr_stat--;
+       if (has_inherit_and_sample_read(&event->attr))
+               local_dec(&ctx->nr_no_switch_fast);
  
         list_del_rcu(&event->event_entry);
  
@@ -3522,6 +3534,11 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
                                 /*
                                  * Must not swap out ctx when there's pending
                                  * events that rely on the ctx->task relation.
+                                *
+                                * Likewise, when a context contains inherit +
+                                * SAMPLE_READ events they should be switched
+                                * out using the slow path so that they are
+                                * treated as if they were distinct contexts.
                                  */
                                 raw_spin_unlock(&next_ctx->lock);
                                 rcu_read_unlock();
@@ -4538,8 +4555,11 @@ unlock:
         raw_spin_unlock(&ctx->lock);
  }
  
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
  {
+       if (self)       /* self: report only this event's own count, without child_count */
+               return local64_read(&event->count);
+
         return local64_read(&event->count) + atomic64_read(&event->child_count);
  }
  
@@ -5498,7 +5518,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
         mutex_lock(&event->child_mutex);
  
         (void)perf_event_read(event, false);
-       total += perf_event_count(event);
+       total += perf_event_count(event, false);
  
         *enabled += event->total_time_enabled +
                         atomic64_read(&event->child_total_time_enabled);
@@ -5507,7 +5527,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
  
         list_for_each_entry(child, &event->child_list, child_list) {
                 (void)perf_event_read(child, false);
-               total += perf_event_count(child);
+               total += perf_event_count(child, false);
                 *enabled += child->total_time_enabled;
                 *running += child->total_time_running;
         }
@@ -5589,14 +5609,14 @@ static int __perf_read_group_add(struct perf_event *leader,
         /*
          * Write {count,id} tuples for every sibling.
          */
-       values[n++] += perf_event_count(leader);
+       values[n++] += perf_event_count(leader, false);
         if (read_format & PERF_FORMAT_ID)
                 values[n++] = primary_event_id(leader);
         if (read_format & PERF_FORMAT_LOST)
                 values[n++] = atomic64_read(&leader->lost_samples);
  
         for_each_sibling_event(sub, leader) {
-               values[n++] += perf_event_count(sub);
+               values[n++] += perf_event_count(sub, false);
                 if (read_format & PERF_FORMAT_ID)
                         values[n++] = primary_event_id(sub);
                 if (read_format & PERF_FORMAT_LOST)
@@ -6176,7 +6196,7 @@ void perf_event_update_userpage(struct perf_event *event)
         ++userpg->lock;
         barrier();
         userpg->index = perf_event_index(event);
-       userpg->offset = perf_event_count(event);
+       userpg->offset = perf_event_count(event, false);
         if (userpg->index)
                 userpg->offset -= local64_read(&event->hw.prev_count);
  
@@ -7250,7 +7270,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
         u64 values[5];
         int n = 0;
  
-       values[n++] = perf_event_count(event);
+       values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
                 values[n++] = enabled +
                         atomic64_read(&event->child_total_time_enabled);
@@ -7268,14 +7288,15 @@ static void perf_output_read_one(struct perf_output_handle *handle,
  }
  
  static void perf_output_read_group(struct perf_output_handle *handle,
-                           struct perf_event *event,
-                           u64 enabled, u64 running)
+                                  struct perf_event *event,
+                                  u64 enabled, u64 running)
  {
         struct perf_event *leader = event->group_leader, *sub;
         u64 read_format = event->attr.read_format;
         unsigned long flags;
         u64 values[6];
         int n = 0;
+       bool self = has_inherit_and_sample_read(&event->attr);
  
         /*
          * Disabling interrupts avoids all counter scheduling
@@ -7295,7 +7316,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
             (leader->state == PERF_EVENT_STATE_ACTIVE))
                 leader->pmu->read(leader);
  
-       values[n++] = perf_event_count(leader);
+       values[n++] = perf_event_count(leader, self);
         if (read_format & PERF_FORMAT_ID)
                 values[n++] = primary_event_id(leader);
         if (read_format & PERF_FORMAT_LOST)
@@ -7310,7 +7331,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
                     (sub->state == PERF_EVENT_STATE_ACTIVE))
                         sub->pmu->read(sub);
  
-               values[n++] = perf_event_count(sub);
+               values[n++] = perf_event_count(sub, self);
                 if (read_format & PERF_FORMAT_ID)
                         values[n++] = primary_event_id(sub);
                 if (read_format & PERF_FORMAT_LOST)
@@ -7331,6 +7352,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
   * The problem is that its both hard and excessively expensive to iterate the
   * child list, not to mention that its impossible to IPI the children running
   * on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
   */
  static void perf_output_read(struct perf_output_handle *handle,
                              struct perf_event *event)
@@ -12057,10 +12082,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         local64_set(&hwc->period_left, hwc->sample_period);
  
         /*
-        * We currently do not support PERF_SAMPLE_READ on inherited events.
+        * We do not support PERF_SAMPLE_READ on inherited events unless
+        * PERF_SAMPLE_TID is also selected, which allows inherited events to
+        * collect per-thread samples.
          * See perf_output_read().
          */
-       if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+       if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
                 goto err_ns;
  
         if (!has_branch_stack(event))
@@ -13084,7 +13111,7 @@ static void sync_child_event(struct perf_event *child_event)
                         perf_event_read_event(child_event, task);
         }
  
-       child_val = perf_event_count(child_event);
+       child_val = perf_event_count(child_event, false);
  
         /*
          * Add back the child's count to the parent's count:
author	Ben Gainey <ben.gainey@arm.com>
	Tue, 30 Jul 2024 08:44:15 +0000 (09:44 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Fri, 2 Aug 2024 09:30:30 +0000 (11:30 +0200)
include/linux/perf_event.h		patch \| blob \| blame \| history
kernel/events/core.c		patch \| blob \| blame \| history