]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
perf: Support deferred user unwind
author: Peter Zijlstra <peterz@infradead.org>
Thu, 23 Oct 2025 13:17:05 +0000 (15:17 +0200)
committer: Peter Zijlstra <peterz@infradead.org>
Wed, 29 Oct 2025 09:29:58 +0000 (10:29 +0100)
Add support for deferred userspace unwind to perf.

Perf currently relies on in-place stack unwinding, from NMI context
and all that. This change moves the userspace part of the unwind to
right before the return to userspace.

This has two distinct benefits. The biggest is that it moves the
unwind to a faultable context, so it becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. Secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

This marker is followed by a single entry, denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above-mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
include/linux/perf_event.h
include/linux/unwind_deferred.h
include/linux/unwind_deferred_types.h
include/uapi/linux/perf_event.h
kernel/bpf/stackmap.c
kernel/events/callchain.c
kernel/events/core.c
tools/include/uapi/linux/perf_event.h

index fd1d91017b99b35ec8d7b56f65e8194f6b9ed1e3..9870d768db4cc5e697b7879e8e9d90bd3762ed42 100644 (file)
@@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                  u32 max_stack, bool crosstask, bool add_mark);
+                  u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
index f4743c8cff4c2f350663b620c4a4688e6817a464..bc7ae7d2190069a27a6567e362396ffcaae01e0e 100644 (file)
@@ -6,18 +6,6 @@
 #include <linux/unwind_user.h>
 #include <linux/unwind_deferred_types.h>
 
-struct unwind_work;
-
-typedef void (*unwind_callback_t)(struct unwind_work *work,
-                                 struct unwind_stacktrace *trace,
-                                 u64 cookie);
-
-struct unwind_work {
-       struct list_head                list;
-       unwind_callback_t               func;
-       int                             bit;
-};
-
 #ifdef CONFIG_UNWIND_USER
 
 enum {
index 0a4c8ddbbc57d4e7a5700f14250193be5c92c7f8..18fa3932f61ca56b054338c10a86a888f684a092 100644 (file)
@@ -39,4 +39,17 @@ struct unwind_task_info {
        union unwind_task_id    id;
 };
 
+struct unwind_work;
+struct unwind_stacktrace;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work,
+                                 struct unwind_stacktrace *trace,
+                                 u64 cookie);
+
+struct unwind_work {
+       struct list_head                list;
+       unwind_callback_t               func;
+       int                             bit;
+};
+
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
index 78a362b8002776e5ce83a0d7816601638c61ecc6..d292f96bc06f86bc16d08538a8febc72138531d2 100644 (file)
@@ -463,7 +463,9 @@ struct perf_event_attr {
                                inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                remove_on_exec :  1, /* event is removed from task on exec */
                                sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                               __reserved_1   : 26;
+                               defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               __reserved_1   : 24;
 
        union {
                __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
         */
        PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+       /*
+        * This user callchain capture was deferred until shortly before
+        * returning to user space.  Previous samples would have kernel
+        * callchains only and they need to be stitched with this to make full
+        * callchains.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             cookie;
+        *      u64                             nr;
+        *      u64                             ips[nr];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
        PERF_CONTEXT_HV                         = (__u64)-32,
        PERF_CONTEXT_KERNEL                     = (__u64)-128,
        PERF_CONTEXT_USER                       = (__u64)-512,
+       PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
        PERF_CONTEXT_GUEST                      = (__u64)-2048,
        PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,
index 4d53cdd1374cf713afd6f2b57c9193819f9180a8..8f1dacaf01fe286e909bccc1f7c1c76a0f18415a 100644 (file)
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
                max_depth = sysctl_perf_event_max_stack;
 
        trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                  false, false);
+                                  false, false, 0);
 
        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                trace = get_callchain_entry_for_task(task, max_depth);
        else
                trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                          crosstask, false);
+                                          crosstask, false, 0);
 
        if (unlikely(!trace) || trace->nr < skip) {
                if (may_fault)
index 808c0d7a31faf097550d9418e3e3f6e7afe62b6f..b9c7e00725d6b30a08d027195769d5e275f9057a 100644 (file)
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                  u32 max_stack, bool crosstask, bool add_mark)
+                  u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
 {
        struct perf_callchain_entry *entry;
        struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
                        regs = task_pt_regs(current);
                }
 
+               if (defer_cookie) {
+                       /*
+                        * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+                        * which can be stitched to this one, and add
+                        * the cookie after it (it will be cut off when the
+                        * user stack is copied to the callchain).
+                        */
+                       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+                       perf_callchain_store_context(&ctx, defer_cookie);
+                       goto exit_put;
+               }
+
                if (add_mark)
                        perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
index 7541f6f85fcb035bc79fef3327598efa0d19aa8d..f6a08c73f7831edbe0996257495bf1693b94ce25 100644 (file)
@@ -56,6 +56,7 @@
 #include <linux/buildid.h>
 #include <linux/task_work.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
 
 #include "internal.h"
 
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+static struct unwind_work perf_unwind_work;
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
                !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
+       bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+                         event->attr.defer_callchain;
        const u32 max_stack = event->attr.sample_max_stack;
        struct perf_callchain_entry *callchain;
+       u64 defer_cookie;
 
        if (!current->mm)
                user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
        if (!kernel && !user)
                return &__empty_callchain;
 
-       callchain = get_perf_callchain(regs, kernel, user,
-                                      max_stack, crosstask, true);
+       if (!(user && defer_user && !crosstask &&
+             unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+               defer_cookie = 0;
+
+       callchain = get_perf_callchain(regs, kernel, user, max_stack,
+                                      crosstask, true, defer_cookie);
+
        return callchain ?: &__empty_callchain;
 }
 
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_callchain_deferred_event {
+       struct unwind_stacktrace *trace;
+       struct {
+               struct perf_event_header        header;
+               u64                             cookie;
+               u64                             nr;
+               u64                             ips[];
+       } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+       struct perf_callchain_deferred_event *deferred_event = data;
+       struct perf_output_handle handle;
+       struct perf_sample_data sample;
+       int ret, size = deferred_event->event.header.size;
+
+       if (!event->attr.defer_output)
+               return;
+
+       /* XXX do we really need sample_id_all for this ??? */
+       perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+       ret = perf_output_begin(&handle, &sample, event,
+                               deferred_event->event.header.size);
+       if (ret)
+               goto out;
+
+       perf_output_put(&handle, deferred_event->event);
+       for (int i = 0; i < deferred_event->trace->nr; i++) {
+               u64 entry = deferred_event->trace->entries[i];
+               perf_output_put(&handle, entry);
+       }
+       perf_event__output_id_sample(event, &handle, &sample);
+
+       perf_output_end(&handle);
+out:
+       deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+                                        struct unwind_stacktrace *trace, u64 cookie)
+{
+       struct perf_callchain_deferred_event deferred_event = {
+               .trace = trace,
+               .event = {
+                       .header = {
+                               .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+                               .misc = PERF_RECORD_MISC_USER,
+                               .size = sizeof(deferred_event.event) +
+                                       (trace->nr * sizeof(u64)),
+                       },
+                       .cookie = cookie,
+                       .nr = trace->nr,
+               },
+       };
+
+       perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
 struct perf_text_poke_event {
        const void              *old_bytes;
        const void              *new_bytes;
@@ -14799,6 +14870,9 @@ void __init perf_event_init(void)
 
        idr_init(&pmu_idr);
 
+       unwind_deferred_init(&perf_unwind_work,
+                            perf_unwind_deferred_callback);
+
        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
index 78a362b8002776e5ce83a0d7816601638c61ecc6..d292f96bc06f86bc16d08538a8febc72138531d2 100644 (file)
@@ -463,7 +463,9 @@ struct perf_event_attr {
                                inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                remove_on_exec :  1, /* event is removed from task on exec */
                                sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                               __reserved_1   : 26;
+                               defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               __reserved_1   : 24;
 
        union {
                __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
         */
        PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+       /*
+        * This user callchain capture was deferred until shortly before
+        * returning to user space.  Previous samples would have kernel
+        * callchains only and they need to be stitched with this to make full
+        * callchains.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             cookie;
+        *      u64                             nr;
+        *      u64                             ips[nr];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
        PERF_CONTEXT_HV                         = (__u64)-32,
        PERF_CONTEXT_KERNEL                     = (__u64)-128,
        PERF_CONTEXT_USER                       = (__u64)-512,
+       PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
        PERF_CONTEXT_GUEST                      = (__u64)-2048,
        PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,