]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
perf/core: Fix missing wakeup when waiting for context reference
authorHaifeng Xu <haifeng.xu@shopee.com>
Mon, 13 May 2024 10:39:48 +0000 (10:39 +0000)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jul 2024 07:08:25 +0000 (09:08 +0200)
[ Upstream commit 74751ef5c1912ebd3e65c3b65f45587e05ce5d36 ]

In our production environment, we found many hung tasks which are
blocked for more than 18 hours. Their call traces are like this:

[346278.191038] __schedule+0x2d8/0x890
[346278.191046] schedule+0x4e/0xb0
[346278.191049] perf_event_free_task+0x220/0x270
[346278.191056] ? init_wait_var_entry+0x50/0x50
[346278.191060] copy_process+0x663/0x18d0
[346278.191068] kernel_clone+0x9d/0x3d0
[346278.191072] __do_sys_clone+0x5d/0x80
[346278.191076] __x64_sys_clone+0x25/0x30
[346278.191079] do_syscall_64+0x5c/0xc0
[346278.191083] ? syscall_exit_to_user_mode+0x27/0x50
[346278.191086] ? do_syscall_64+0x69/0xc0
[346278.191088] ? irqentry_exit_to_user_mode+0x9/0x20
[346278.191092] ? irqentry_exit+0x19/0x30
[346278.191095] ? exc_page_fault+0x89/0x160
[346278.191097] ? asm_exc_page_fault+0x8/0x30
[346278.191102] entry_SYSCALL_64_after_hwframe+0x44/0xae

The task was waiting for the refcount become to 1, but from the vmcore,
we found the refcount has already been 1. It seems that the task didn't
get woken up by perf_event_release_kernel() and got stuck forever. The
below scenario may cause the problem.

Thread A Thread B
... ...
perf_event_free_task perf_event_release_kernel
   ...
   acquire event->child_mutex
   ...
   get_ctx
   ...    release event->child_mutex
   acquire ctx->mutex
   ...
   perf_free_event (acquire/release event->child_mutex)
   ...
   release ctx->mutex
   wait_var_event
   acquire ctx->mutex
   acquire event->child_mutex
   # move existing events to free_list
   release event->child_mutex
   release ctx->mutex
   put_ctx
... ...

In this case, all events of the ctx have been freed, so we couldn't
find the ctx in free_list and Thread A will miss the wakeup. It's thus
necessary to add a wakeup after dropping the reference.

Fixes: 1cf8dfe8a661 ("perf/core: Fix race between close() and fork()")
Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20240513103948.33570-1-haifeng.xu@shopee.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
kernel/events/core.c

index 576af248a539a94cb9d49eee34baa7b574ceb818..2347dda682abd1f2a0e744dd65c1a7387c9d549b 100644 (file)
@@ -4760,6 +4760,7 @@ int perf_event_release_kernel(struct perf_event *event)
 again:
        mutex_lock(&event->child_mutex);
        list_for_each_entry(child, &event->child_list, child_list) {
+               void *var = NULL;
 
                /*
                 * Cannot change, child events are not migrated, see the
@@ -4800,11 +4801,23 @@ again:
                         * this can't be the last reference.
                         */
                        put_event(event);
+               } else {
+                       var = &ctx->refcount;
                }
 
                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
+
+               if (var) {
+                       /*
+                        * If perf_event_free_task() has deleted all events from the
+                        * ctx while the child_mutex got released above, make sure to
+                        * notify about the preceding put_ctx().
+                        */
+                       smp_mb(); /* pairs with wait_var_event() */
+                       wake_up_var(var);
+               }
                goto again;
        }
        mutex_unlock(&event->child_mutex);