+++ /dev/null
-From c5de60cd622a2607c043ba65e25a6e9998a369f9 Mon Sep 17 00:00:00 2001
-From: Namhyung Kim <namhyung@kernel.org>
-Date: Mon, 24 Jan 2022 11:58:08 -0800
-Subject: perf/core: Fix cgroup event list management
-
-From: Namhyung Kim <namhyung@kernel.org>
-
-commit c5de60cd622a2607c043ba65e25a6e9998a369f9 upstream.
-
-The active cgroup events are managed in the per-cpu cgrp_cpuctx_list.
-This list is only accessed from the current CPU and is not protected
-by any locks. But since commit ef54c1a476ae ("perf: Rework
-perf_event_exit_event()"), it's possible to access (actually modify)
-the list from another CPU.
-
-In perf_remove_from_context(), an event can be removed from the
-context without an IPI when the context is not active. This is not
-safe for cgroup events, which can still have active events in the
-context even while ctx->is_active is 0. The target CPU might be in
-the middle of iterating the list at the same time.
-
-If the event is enabled when it's about to be closed, it might call
-perf_cgroup_event_disable() and list_del() on the cgrp_cpuctx_list
-from a different CPU.
-
-This resulted in a crash due to an invalid list pointer access during
-the cgroup list traversal on the CPU that the event belongs to.
-
-Let's fall back to an IPI to access the cgrp_cpuctx_list from that
-CPU. Similarly, perf_install_in_context() should use an IPI for
-cgroup events too.
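-
-As an illustration only (a hypothetical userspace sketch with made-up
-names, not kernel code), the ownership rule the fix enforces can be
-pictured like this: one thread stands in for the CPU that owns the
-lockless list, a mailbox stands in for the IPI, and a remote thread
-never modifies the list itself but asks the owner to do it:
-
-  #include <pthread.h>
-  #include <stdatomic.h>
-  #include <stdbool.h>
-  #include <stdio.h>
-
-  struct node { struct node *next, *prev; int id; };
-
-  static struct node head = { &head, &head, -1 }; /* owner-only, no locks */
-  static _Atomic(struct node *) remove_req;       /* plays the IPI */
-  static atomic_bool done;
-
-  static void list_add(struct node *n)            /* owner-only list op */
-  {
-      n->next = head.next;
-      n->prev = &head;
-      head.next->prev = n;
-      head.next = n;
-  }
-
-  static void list_del(struct node *n)            /* owner-only list op */
-  {
-      n->prev->next = n->next;
-      n->next->prev = n->prev;
-  }
-
-  static void *owner(void *arg)                   /* plays the owning CPU */
-  {
-      (void)arg;
-      while (!atomic_load(&done)) {
-          struct node *req = atomic_exchange(&remove_req, NULL);
-
-          if (req)                                /* service the "IPI" */
-              list_del(req);
-          for (struct node *n = head.next; n != &head; n = n->next)
-              ;                                   /* lockless traversal */
-      }
-      return NULL;
-  }
-
-  int main(void)
-  {
-      struct node a = { .id = 0 }, b = { .id = 1 };
-      pthread_t t;
-
-      list_add(&a);                               /* built before the owner starts */
-      list_add(&b);
-      pthread_create(&t, NULL, owner, NULL);
-
-      /* remote thread: never touch the list, ask the owner instead */
-      atomic_store(&remove_req, &a);
-      while (atomic_load(&remove_req))            /* wait for completion */
-          ;
-
-      atomic_store(&done, true);
-      pthread_join(t, NULL);
-      printf("node %d removed on the owner's side\n", a.id);
-      return 0;
-  }
-
-The bug corresponds to the remote thread calling list_del() directly
-while the owner is in the middle of the traversal loop.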
-
-Fixes: ef54c1a476ae ("perf: Rework perf_event_exit_event()")
-Signed-off-by: Namhyung Kim <namhyung@kernel.org>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Link: https://lkml.kernel.org/r/20220124195808.2252071-1-namhyung@kernel.org
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- kernel/events/core.c | 11 +++++++++--
- 1 file changed, 9 insertions(+), 2 deletions(-)
-
---- a/kernel/events/core.c
-+++ b/kernel/events/core.c
-@@ -2466,7 +2466,11 @@ static void perf_remove_from_context(str
- * event_function_call() user.
- */
- raw_spin_lock_irq(&ctx->lock);
-- if (!ctx->is_active) {
-+ /*
-+ * Cgroup events are per-cpu events, and must IPI because of
-+ * cgrp_cpuctx_list.
-+ */
-+ if (!ctx->is_active && !is_cgroup_event(event)) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
- ctx, (void *)flags);
- raw_spin_unlock_irq(&ctx->lock);
-@@ -2899,11 +2903,14 @@ perf_install_in_context(struct perf_even
- * perf_event_attr::disabled events will not run and can be initialized
- * without IPI. Except when this is the first event for the context, in
- * that case we need the magic of the IPI to set ctx->is_active.
-+ * Similarly, cgroup events for the context also need the IPI to
-+ * manipulate the cgrp_cpuctx_list.
- *
- * The IOC_ENABLE that is sure to follow the creation of a disabled
- * event will issue the IPI and reprogram the hardware.
- */
-- if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
-+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
-+ ctx->nr_events && !is_cgroup_event(event)) {
- raw_spin_lock_irq(&ctx->lock);
- if (ctx->task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
+++ /dev/null
-From ef54c1a476aef7eef26fe13ea10dc090952c00f8 Mon Sep 17 00:00:00 2001
-From: Peter Zijlstra <peterz@infradead.org>
-Date: Thu, 8 Apr 2021 12:35:56 +0200
-Subject: perf: Rework perf_event_exit_event()
-
-From: Peter Zijlstra <peterz@infradead.org>
-
-commit ef54c1a476aef7eef26fe13ea10dc090952c00f8 upstream.
-
-Make perf_event_exit_event() more robust, such that we can use it from
-other contexts. Specifically, the upcoming remove_on_exec.
-
-For this to work we need to address a few issues: remove_on_exec will
-not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
-disable event_function_call(), and we thus have to use
-perf_remove_from_context().
-
-When using perf_remove_from_context(), there are two races to consider.
-The first is against close(), where we can have concurrent tear-down
-of the event. The second is against child_list iteration, which should
-not find a half-baked event.
-
-To address this, teach perf_remove_from_context() to special-case
-!ctx->is_active and to handle the new DETACH_CHILD flag.
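-
-As a rough, hypothetical userspace analogy (made-up names, not the
-kernel code itself), the PERF_ATTACH_CHILD / DETACH_CHILD pair amounts
-to an idempotent detach: whichever path reaches the child first, exit
-or close(), tests and clears an "attached" flag under the parent's
-mutex, syncs the counts, and unlinks the child, so neither a racing
-caller nor a child_list iterator can see a half-detached event:
-
-  #include <pthread.h>
-  #include <stdbool.h>
-  #include <stdio.h>
-
-  struct child {
-      struct child *next, *prev;
-      bool attached;                    /* plays PERF_ATTACH_CHILD */
-      long count;
-  };
-
-  struct parent {
-      pthread_mutex_t child_mutex;
-      struct child children;            /* list head */
-      long count;
-  };
-
-  /* May be called from both the exit path and close(); safe to race. */
-  static void child_detach(struct parent *p, struct child *c)
-  {
-      pthread_mutex_lock(&p->child_mutex);
-      if (c->attached) {                /* only the first caller acts */
-          c->attached = false;
-          p->count += c->count;         /* plays sync_child_event() */
-          c->prev->next = c->next;      /* plays list_del_init() */
-          c->next->prev = c->prev;
-          c->next = c->prev = c;
-      }
-      pthread_mutex_unlock(&p->child_mutex);
-  }
-
-  int main(void)
-  {
-      struct parent p = { .child_mutex = PTHREAD_MUTEX_INITIALIZER };
-      struct child c = { .attached = true, .count = 7 };
-
-      /* link c onto p's list */
-      p.children.next = p.children.prev = &p.children;
-      c.next = p.children.next;
-      c.prev = &p.children;
-      p.children.next->prev = &c;
-      p.children.next = &c;
-
-      child_detach(&p, &c);             /* e.g. the exit path */
-      child_detach(&p, &c);             /* e.g. a racing close(): no-op */
-      printf("parent count: %ld\n", p.count);  /* 7, counted exactly once */
-      return 0;
-  }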
-
-[ elver@google.com: fix racing parent/child exit in sync_child_event(). ]
-Signed-off-by: Marco Elver <elver@google.com>
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-Link: https://lkml.kernel.org/r/20210408103605.1676875-2-elver@google.com
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- include/linux/perf_event.h | 1
- kernel/events/core.c | 144 +++++++++++++++++++++++++--------------------
- 2 files changed, 81 insertions(+), 64 deletions(-)
-
---- a/include/linux/perf_event.h
-+++ b/include/linux/perf_event.h
-@@ -607,6 +607,7 @@ struct swevent_hlist {
- #define PERF_ATTACH_TASK_DATA 0x08
- #define PERF_ATTACH_ITRACE 0x10
- #define PERF_ATTACH_SCHED_CB 0x20
-+#define PERF_ATTACH_CHILD 0x40
-
- struct perf_cgroup;
- struct perf_buffer;
---- a/kernel/events/core.c
-+++ b/kernel/events/core.c
-@@ -2276,6 +2276,26 @@ out:
- perf_event__header_size(leader);
- }
-
-+static void sync_child_event(struct perf_event *child_event);
-+
-+static void perf_child_detach(struct perf_event *event)
-+{
-+ struct perf_event *parent_event = event->parent;
-+
-+ if (!(event->attach_state & PERF_ATTACH_CHILD))
-+ return;
-+
-+ event->attach_state &= ~PERF_ATTACH_CHILD;
-+
-+ if (WARN_ON_ONCE(!parent_event))
-+ return;
-+
-+ lockdep_assert_held(&parent_event->child_mutex);
-+
-+ sync_child_event(event);
-+ list_del_init(&event->child_list);
-+}
-+
- static bool is_orphaned_event(struct perf_event *event)
- {
- return event->state == PERF_EVENT_STATE_DEAD;
-@@ -2383,6 +2403,7 @@ group_sched_out(struct perf_event *group
- }
-
- #define DETACH_GROUP 0x01UL
-+#define DETACH_CHILD 0x02UL
-
- /*
- * Cross CPU call to remove a performance event
-@@ -2406,6 +2427,8 @@ __perf_remove_from_context(struct perf_e
- event_sched_out(event, cpuctx, ctx);
- if (flags & DETACH_GROUP)
- perf_group_detach(event);
-+ if (flags & DETACH_CHILD)
-+ perf_child_detach(event);
- list_del_event(event, ctx);
-
- if (!ctx->nr_events && ctx->is_active) {
-@@ -2437,25 +2460,21 @@ static void perf_remove_from_context(str
-
- lockdep_assert_held(&ctx->mutex);
-
-- event_function_call(event, __perf_remove_from_context, (void *)flags);
--
- /*
-- * The above event_function_call() can NO-OP when it hits
-- * TASK_TOMBSTONE. In that case we must already have been detached
-- * from the context (by perf_event_exit_event()) but the grouping
-- * might still be in-tact.
-- */
-- WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-- if ((flags & DETACH_GROUP) &&
-- (event->attach_state & PERF_ATTACH_GROUP)) {
-- /*
-- * Since in that case we cannot possibly be scheduled, simply
-- * detach now.
-- */
-- raw_spin_lock_irq(&ctx->lock);
-- perf_group_detach(event);
-+ * Because of perf_event_exit_task(), perf_remove_from_context() ought
-+ * to work in the face of TASK_TOMBSTONE, unlike every other
-+ * event_function_call() user.
-+ */
-+ raw_spin_lock_irq(&ctx->lock);
-+ if (!ctx->is_active) {
-+ __perf_remove_from_context(event, __get_cpu_context(ctx),
-+ ctx, (void *)flags);
- raw_spin_unlock_irq(&ctx->lock);
-+ return;
- }
-+ raw_spin_unlock_irq(&ctx->lock);
-+
-+ event_function_call(event, __perf_remove_from_context, (void *)flags);
- }
-
- /*
-@@ -12330,14 +12349,17 @@ void perf_pmu_migrate_context(struct pmu
- }
- EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
-
--static void sync_child_event(struct perf_event *child_event,
-- struct task_struct *child)
-+static void sync_child_event(struct perf_event *child_event)
- {
- struct perf_event *parent_event = child_event->parent;
- u64 child_val;
-
-- if (child_event->attr.inherit_stat)
-- perf_event_read_event(child_event, child);
-+ if (child_event->attr.inherit_stat) {
-+ struct task_struct *task = child_event->ctx->task;
-+
-+ if (task && task != TASK_TOMBSTONE)
-+ perf_event_read_event(child_event, task);
-+ }
-
- child_val = perf_event_count(child_event);
-
-@@ -12352,60 +12374,53 @@ static void sync_child_event(struct perf
- }
-
- static void
--perf_event_exit_event(struct perf_event *child_event,
-- struct perf_event_context *child_ctx,
-- struct task_struct *child)
-+perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
- {
-- struct perf_event *parent_event = child_event->parent;
-+ struct perf_event *parent_event = event->parent;
-+ unsigned long detach_flags = 0;
-
-- /*
-- * Do not destroy the 'original' grouping; because of the context
-- * switch optimization the original events could've ended up in a
-- * random child task.
-- *
-- * If we were to destroy the original group, all group related
-- * operations would cease to function properly after this random
-- * child dies.
-- *
-- * Do destroy all inherited groups, we don't care about those
-- * and being thorough is better.
-- */
-- raw_spin_lock_irq(&child_ctx->lock);
-- WARN_ON_ONCE(child_ctx->is_active);
-+ if (parent_event) {
-+ /*
-+ * Do not destroy the 'original' grouping; because of the
-+ * context switch optimization the original events could've
-+ * ended up in a random child task.
-+ *
-+ * If we were to destroy the original group, all group related
-+ * operations would cease to function properly after this
-+ * random child dies.
-+ *
-+ * Do destroy all inherited groups, we don't care about those
-+ * and being thorough is better.
-+ */
-+ detach_flags = DETACH_GROUP | DETACH_CHILD;
-+ mutex_lock(&parent_event->child_mutex);
-+ }
-
-- if (parent_event)
-- perf_group_detach(child_event);
-- list_del_event(child_event, child_ctx);
-- perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
-- raw_spin_unlock_irq(&child_ctx->lock);
-+ perf_remove_from_context(event, detach_flags);
-+
-+ raw_spin_lock_irq(&ctx->lock);
-+ if (event->state > PERF_EVENT_STATE_EXIT)
-+ perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
-+ raw_spin_unlock_irq(&ctx->lock);
-
- /*
-- * Parent events are governed by their filedesc, retain them.
-+ * Child events can be freed.
- */
-- if (!parent_event) {
-- perf_event_wakeup(child_event);
-+ if (parent_event) {
-+ mutex_unlock(&parent_event->child_mutex);
-+ /*
-+ * Kick perf_poll() for is_event_hup();
-+ */
-+ perf_event_wakeup(parent_event);
-+ free_event(event);
-+ put_event(parent_event);
- return;
- }
-- /*
-- * Child events can be cleaned up.
-- */
--
-- sync_child_event(child_event, child);
-
- /*
-- * Remove this event from the parent's list
-- */
-- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-- mutex_lock(&parent_event->child_mutex);
-- list_del_init(&child_event->child_list);
-- mutex_unlock(&parent_event->child_mutex);
--
-- /*
-- * Kick perf_poll() for is_event_hup().
-+ * Parent events are governed by their filedesc, retain them.
- */
-- perf_event_wakeup(parent_event);
-- free_event(child_event);
-- put_event(parent_event);
-+ perf_event_wakeup(event);
- }
-
- static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
-@@ -12462,7 +12477,7 @@ static void perf_event_exit_task_context
- perf_event_task(child, child_ctx, 0);
-
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-- perf_event_exit_event(child_event, child_ctx, child);
-+ perf_event_exit_event(child_event, child_ctx);
-
- mutex_unlock(&child_ctx->mutex);
-
-@@ -12722,6 +12737,7 @@ inherit_event(struct perf_event *parent_
- */
- raw_spin_lock_irqsave(&child_ctx->lock, flags);
- add_event_to_ctx(child_event, child_ctx);
-+ child_event->attach_state |= PERF_ATTACH_CHILD;
- raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
-
- /*