rseq: Rework the TIF_NOTIFY handler

author Thomas Gleixner <tglx@linutronix.de>

Mon, 27 Oct 2025 08:45:12 +0000 (09:45 +0100)

committer Ingo Molnar <mingo@kernel.org>

Tue, 4 Nov 2025 07:33:54 +0000 (08:33 +0100)
author Thomas Gleixner <tglx@linutronix.de>
Mon, 27 Oct 2025 08:45:12 +0000 (09:45 +0100)
committer Ingo Molnar <mingo@kernel.org>
Tue, 4 Nov 2025 07:33:54 +0000 (08:33 +0100)
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h

index 37444e80fd4587136967ac2481069ff44f713fa0..aa1c0464a16ca923994e76915e63a8b1cddaa346 100644 (file)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -368,6 +368,35 @@ efault:
         return false;
  }
  
+/*
+ * Update user space with new IDs and, if needed, handle a critical section.
+ * Returns false if the ID update fails, else true or rseq_update_user_cs().
+ */
+static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
+                                       struct rseq_ids *ids, u32 node_id)
+{
+       u64 csaddr;
+
+       if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
+               return false;
+
+       /*
+        * On architectures which utilize the generic entry code, the
+        * critical section check is skipped when the entry was not from
+        * a user space interrupt, unless debug mode is enabled.
+        */
+       if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+               if (!static_branch_unlikely(&rseq_debug_enabled)) {
+                       if (likely(!t->rseq.event.user_irq))
+                               return true;
+               }
+       }
+       if (likely(!csaddr))
+               return true;
+       /* A critical section address is set, so it has to be handled */
+       return rseq_update_user_cs(t, regs, csaddr);
+}
+
  static __always_inline void rseq_exit_to_user_mode(void)
  {
         struct rseq_event *ev = &current->rseq.event;
diff --git a/kernel/rseq.c b/kernel/rseq.c

index 13faadc737ad63370b07fa2e6fcedc9dbab31535..148fb21030234192f240e9f3a45ce7ec7f4fa626 100644 (file)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -82,12 +82,6 @@
  #define CREATE_TRACE_POINTS
  #include <trace/events/rseq.h>
  
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD      irq
-#else
-# define RSEQ_EVENT_GUARD      preempt
-#endif
-
  DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
  
  static inline void rseq_control_debug(bool on)
@@ -239,38 +233,15 @@ efault:
         return false;
  }
  
-/*
- * This resume handler must always be executed between any of:
- * - preemption,
- * - signal delivery,
- * and return to user-space.
- *
- * This is how we can ensure that the entire rseq critical section
- * will issue the commit instruction only if executed atomically with
- * respect to other threads scheduled on the same CPU, and with respect
- * to signal handlers.
- */
-void __rseq_handle_notify_resume(struct pt_regs *regs)
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
  {
+       /* Preserve rseq state and user_irq state for exit to user */
+       const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
         struct task_struct *t = current;
         struct rseq_ids ids;
         u32 node_id;
         bool event;
  
-       /*
-        * If invoked from hypervisors before entering the guest via
-        * resume_user_mode_work(), then @regs is a NULL pointer.
-        *
-        * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
-        * it before returning from the ioctl() to user space when
-        * rseq_event.sched_switch is set.
-        *
-        * So it's safe to ignore here instead of pointlessly updating it
-        * in the vcpu_run() loop.
-        */
-       if (!regs)
-               return;
-
         if (unlikely(t->flags & PF_EXITING))
                 return;
  
@@ -294,26 +265,45 @@ void __rseq_handle_notify_resume(struct pt_regs *regs)
          * with the result handed in to allow the detection of
          * inconsistencies.
          */
-       scoped_guard(RSEQ_EVENT_GUARD) {
+       scoped_guard(irq) {
                 event = t->rseq.event.sched_switch;
-               t->rseq.event.sched_switch = false;
+               t->rseq.event.all &= evt_mask.all;
                 ids.cpu_id = task_cpu(t);
                 ids.mm_cid = task_mm_cid(t);
         }
  
-       if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
+       if (!event)
                 return;
  
-       if (!rseq_handle_cs(t, regs))
-               goto error;
-
         node_id = cpu_to_node(ids.cpu_id);
-       if (!rseq_set_ids(t, &ids, node_id))
-               goto error;
-       return;
  
-error:
-       force_sig(SIGSEGV);
+       if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+               /*
+                * Clear the errors just in case this might survive magically, but
+                * leave the rest intact.
+                */
+               t->rseq.event.error = 0;
+               force_sig(SIGSEGV);
+       }
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+       /*
+        * If invoked from hypervisors before entering the guest via
+        * resume_user_mode_work(), then @regs is a NULL pointer.
+        *
+        * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+        * it before returning from the ioctl() to user space when
+        * rseq_event.sched_switch is set.
+        *
+        * So it's safe to return early here instead of pointlessly doing
+        * the update inside the vcpu_run() loop.
+        */
+       if (!regs)
+               return;
+
+       rseq_slowpath_update_usr(regs);
  }
  
  void __rseq_signal_deliver(int sig, struct pt_regs *regs)
author	Thomas Gleixner <tglx@linutronix.de>
	Mon, 27 Oct 2025 08:45:12 +0000 (09:45 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Tue, 4 Nov 2025 07:33:54 +0000 (08:33 +0100)
include/linux/rseq_entry.h		patch \| blob \| blame \| history
kernel/rseq.c		patch \| blob \| blame \| history