#ifndef _LINUX_UNWIND_USER_DEFERRED_H
#define _LINUX_UNWIND_USER_DEFERRED_H
+#include <linux/task_work.h>
#include <linux/unwind_user.h>
#include <linux/unwind_deferred_types.h>
+struct unwind_work;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
+
+struct unwind_work {
+ struct list_head list;
+ unwind_callback_t func;
+};
+
#ifdef CONFIG_UNWIND_USER
void unwind_task_init(struct task_struct *task);
int unwind_user_faultable(struct unwind_stacktrace *trace);
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
+void unwind_deferred_cancel(struct unwind_work *work);
+
static __always_inline void unwind_reset_info(void)
{
+ if (unlikely(current->unwind_info.id.id))
+ current->unwind_info.id.id = 0;
+ /*
+ * As unwind_user_faultable() can be called directly and
+ * depends on nr_entries being cleared on exit to user,
+ * this needs to be a separate conditional.
+ */
if (unlikely(current->unwind_info.cache))
current->unwind_info.cache->nr_entries = 0;
}
#else /* !CONFIG_UNWIND_USER */
static inline void unwind_task_free(struct task_struct *task) {}
static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
+static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { return -ENOSYS; }
+static inline void unwind_deferred_cancel(struct unwind_work *work) {}
static inline void unwind_reset_info(void) {}
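To illustrate how the API declared above is meant to be consumed, here is a minimal, hedged registration sketch. The my_-prefixed names are hypothetical and exist only for this example; a real consumer would embed struct unwind_work in its own state and add proper error handling.

/* Hypothetical consumer of the deferred unwinder, for illustration only. */
#include <linux/unwind_deferred.h>

static void my_unwind_cb(struct unwind_work *work,
			 struct unwind_stacktrace *trace, u64 cookie)
{
	/* Runs in task context shortly before the task returns to user space. */
}

static struct unwind_work my_unwind_work;

static int my_register(void)
{
	/* Register the callback once, e.g. when the tracer is initialized. */
	return unwind_deferred_init(&my_unwind_work, my_unwind_cb);
}

static void my_unregister(void)
{
	/* Remove the callback again before my_unwind_work goes away. */
	unwind_deferred_cancel(&my_unwind_work);
}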
/*
* Deferred user space unwinding
*/
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
-#include <linux/unwind_deferred.h>
+#include <linux/mm.h>
/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES \
((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
+/* Guards adding to and reading the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+/*
+ * This is a unique percpu identifier for a given task entry context.
+ * Conceptually, it's incremented every time the CPU enters the kernel from
+ * user space, so that each "entry context" on the CPU gets a unique ID. In
+ * reality, as an optimization, it's only incremented on demand for the first
+ * deferred unwind request after a given entry-from-user.
+ *
+ * It's combined with the CPU id to make a systemwide-unique "context cookie".
+ */
+static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier that is assigned to a user
+ * space stacktrace. As the user space stacktrace remains the same while
+ * the task is in the kernel, the cookie is an identifier for the stacktrace.
+ * Note that the stacktrace can still get another cookie if a new request is
+ * made after the cookie was cleared and before reentering user space.
+ */
+static u64 get_cookie(struct unwind_task_info *info)
+{
+ u32 cnt = 1;
+ u32 old = 0;
+
+ if (info->id.cpu)
+ return info->id.id;
+
+ /* LSB is always set to ensure 0 is an invalid value */
+ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+ if (try_cmpxchg(&info->id.cnt, &old, cnt)) {
+ /* Update the per cpu counter */
+ __this_cpu_write(unwind_ctx_ctr, cnt);
+ }
+ /* Interrupts are disabled, the CPU will always be the same */
+ info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
+
+ return info->id.id;
+}
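get_cookie() above reaches into info->id through its cpu, cnt and id members, which only works if unwind_task_id overlays the two 32-bit halves with the full 64-bit cookie. The real definition lives in unwind_deferred_types.h and is not part of this excerpt; the sketch below is an assumption of what that union looks like, shown only to make the code above easier to follow.

/* Assumed shape of the task id (actual definition in unwind_deferred_types.h). */
union unwind_task_id {
	struct {
		u32	cpu;	/* CPU number + 1; 0 means no cookie assigned yet */
		u32	cnt;	/* per-CPU entry counter; LSB always set */
	};
	u64	id;		/* the combined context cookie */
};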
+
/**
* unwind_user_faultable - Produce a user stacktrace in faultable context
* @trace: The descriptor that will store the user stacktrace
return 0;
}
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+ struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+ struct unwind_stacktrace trace;
+ struct unwind_work *work;
+ u64 cookie;
+
+ if (WARN_ON_ONCE(!info->pending))
+ return;
+
+ /* Allow work to come in again */
+ WRITE_ONCE(info->pending, 0);
+
+ /*
+ * From here on out, the callback must always be called, even if it's
+ * just an empty trace.
+ */
+ trace.nr = 0;
+ trace.entries = NULL;
+
+ unwind_user_faultable(&trace);
+
+ cookie = info->id.id;
+
+ guard(mutex)(&callback_mutex);
+ list_for_each_entry(work, &callbacks, list) {
+ work->func(work, &trace, cookie);
+ }
+}
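Since the callbacks run even when unwind_user_faultable() produced nothing, a callback must tolerate an empty trace. Below is a hedged, fuller version of the callback stubbed out in the earlier registration sketch; the names remain hypothetical.

/* Illustrative callback: handle the empty-trace case, then consume the frames. */
static void my_unwind_cb(struct unwind_work *work,
			 struct unwind_stacktrace *trace, u64 cookie)
{
	unsigned int i;

	if (!trace->nr)
		return;	/* the unwind failed; only the cookie is meaningful */

	for (i = 0; i < trace->nr; i++)
		pr_debug("cookie %llx frame %u: 0x%lx\n",
			 (unsigned long long)cookie, i, trace->entries[i]);
}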
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @cookie: The cookie of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @cookie output is the generated cookie of the very first
+ * request for a user space stacktrace for this task since it entered the
+ * kernel. It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function. It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context. Each call will return the same cookie
+ * while the task hasn't left the kernel. If the callback is not pending
+ * because it has already been previously called for the same entry context,
+ * it will be called again with the same stack trace and cookie.
+ *
+ * Return: 1 if the callback was already queued.
+ * 0 if the callback successfully was queued.
+ * Negative if there's an error.
+ * @cookie holds the cookie of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ int ret;
+
+ *cookie = 0;
+
+ if (WARN_ON_ONCE(in_nmi()))
+ return -EINVAL;
+
+ if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+ !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ guard(irqsave)();
+
+ *cookie = get_cookie(info);
+
+ /* callback already pending? */
+ if (info->pending)
+ return 1;
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, &info->work, TWA_RESUME);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ info->pending = 1;
+ return 0;
+}
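On the request side, a caller typically fires from interrupt (but not NMI) context when an event is recorded, and uses the returned cookie to tag the kernel-side sample so it can later be stitched to the user stacktrace delivered by the callback. A hypothetical sketch, reusing my_unwind_work from the registration example above:

/* Illustrative request path: called from an IRQ-level event handler. */
static void my_record_event(void)
{
	u64 cookie;
	int ret;

	ret = unwind_deferred_request(&my_unwind_work, &cookie);
	if (ret < 0)
		return;	/* kthread, exiting task, NMI, or task_work_add() failure */

	/*
	 * ret == 0: the callback was newly queued for this entry context.
	 * ret == 1: the callback was already pending; the same cookie is returned.
	 * Either way, tag the kernel sample with the cookie.
	 */
	pr_debug("event tagged with unwind cookie %llx (ret=%d)\n",
		 (unsigned long long)cookie, ret);
}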
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+ if (!work)
+ return;
+
+ guard(mutex)(&callback_mutex);
+ list_del(&work->list);
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+ memset(work, 0, sizeof(*work));
+
+ guard(mutex)(&callback_mutex);
+ list_add(&work->list, &callbacks);
+ work->func = func;
+ return 0;
+}
+
void unwind_task_init(struct task_struct *task)
{
struct unwind_task_info *info = &task->unwind_info;
memset(info, 0, sizeof(*info));
+ init_task_work(&info->work, unwind_deferred_task_work);
}
void unwind_task_free(struct task_struct *task)
{
struct unwind_task_info *info = &task->unwind_info;
kfree(info->cache);
+ task_work_cancel(task, &info->work);
}