event = rb_entry_safe(rb_next(&event->group_node), \
typeof(*event), group_node))
+/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
{
+ if (self)
+ return local64_read(&event->count);
+
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
unsigned long flags;
u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
/*
* Disabling interrupts avoids all counter scheduling
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
* The problem is that its both hard and excessively expensive to iterate the
* child list, not to mention that its impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count: