#endif
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+bool rseq_debug_validate_ids(struct task_struct *t);
static __always_inline void rseq_note_user_irq_entry(void)
{
return false;
}
+/*
+ * On debug kernels, validate that user space did not modify the
+ * read-only ID fields when the debug static branch is enabled.
+ */
+bool rseq_debug_validate_ids(struct task_struct *t)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+ u32 cpu_id, uval, node_id;
+
+ /*
+ * On the first exit after registering the rseq region, the CPU ID is
+ * still RSEQ_CPU_ID_UNINITIALIZED and the node_id in user space is 0!
+ */
+ node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
+ cpu_to_node(t->rseq.ids.cpu_id) : 0;
+
+ scoped_user_read_access(rseq, efault) {
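+ /*
+ * All four ID fields are read-only for user space. Each one must
+ * still hold the value the kernel wrote last; any mismatch means
+ * user space modified them.
+ */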
+ unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+ if (cpu_id != t->rseq.ids.cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->cpu_id, efault);
+ if (uval != cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->node_id, efault);
+ if (uval != node_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->mm_cid, efault);
+ if (uval != t->rseq.ids.mm_cid)
+ goto die;
+ }
+ return true;
+die:
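+ /* User space modified the read-only fields: flag the violation as fatal */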
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
#endif /* RSEQ_BUILD_SLOW_PATH */
/*
return false;
}
+/*
+ * Updates CPU ID, Node ID and MM CID and reads the critical section
+ * address when @csaddr != NULL. This allows putting the ID update and the
+ * read under the same uaccess region to spare a separate begin/end.
+ *
+ * As this is either invoked from a C wrapper with @csaddr = NULL or from
+ * the fast path code with a valid pointer, a clever compiler should be
+ * able to optimize out the read. This spares a duplicate implementation.
+ *
+ * Returns true if the operation was successful, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when invalid data
+ * was found on debug kernels. It stays false when the failure was an
+ * unresolved page fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context that would be counterproductive as the page
+ * faults could then not be resolved. As a consequence, unresolved page
+ * faults in task context are fatal too.
+ */
+static rseq_inline
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
+ u32 node_id, u64 *csaddr)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+
+ if (static_branch_unlikely(&rseq_debug_enabled)) {
+ if (!rseq_debug_validate_ids(t))
+ return false;
+ }
+
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
+ unsafe_put_user(node_id, &rseq->node_id, efault);
+ unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
+ if (csaddr)
+ unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+ }
+
+ /* Cache the new values */
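+ /* ids::cpu_cid aliases cpu_id and mm_cid, so one store caches both */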
+ t->rseq.ids.cpu_cid = ids->cpu_cid;
+ rseq_stat_inc(rseq_stats.ids);
+ rseq_trace_update(t, ids);
+ return true;
+efault:
+ return false;
+}
+
static __always_inline void rseq_exit_to_user_mode(void)
{
struct rseq_event *ev = &current->rseq.event;
# define RSEQ_EVENT_GUARD preempt
#endif
-/* The original rseq structure size (including padding) is 32 bytes. */
-#define ORIG_RSEQ_SIZE 32
-
-#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
-
DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
static inline void rseq_control_debug(bool on)
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
-#ifdef CONFIG_DEBUG_RSEQ
-static struct rseq *rseq_kernel_fields(struct task_struct *t)
-{
- return (struct rseq *) t->rseq_fields;
-}
-
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
- u32 cpu_id_start, cpu_id, node_id, mm_cid;
- struct rseq __user *rseq = t->rseq.usrptr;
-
- /*
- * Validate fields which are required to be read-only by
- * user-space.
- */
- if (!user_read_access_begin(rseq, t->rseq.len))
- goto efault;
- unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
- unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
- unsafe_get_user(node_id, &rseq->node_id, efault_end);
- unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
- user_read_access_end();
-
- if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
- cpu_id != rseq_kernel_fields(t)->cpu_id ||
- node_id != rseq_kernel_fields(t)->node_id ||
- mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
-
- pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
- "\tcpu_id_start: %u ?= %u\n"
- "\tcpu_id: %u ?= %u\n"
- "\tnode_id: %u ?= %u\n"
- "\tmm_cid: %u ?= %u\n",
- t->pid, t->comm,
- cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
- cpu_id, rseq_kernel_fields(t)->cpu_id,
- node_id, rseq_kernel_fields(t)->node_id,
- mm_cid, rseq_kernel_fields(t)->mm_cid);
- }
-
- /* For now, only print a console warning on mismatch. */
- return 0;
-
-efault_end:
- user_read_access_end();
-efault:
- return -EFAULT;
-}
-
-/*
- * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
- * state.
- */
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- do { \
- unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \
- rseq_kernel_fields(t)->field = value; \
- } while (0)
-
-#else
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- return 0;
-}
-
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- unsafe_put_user(value, &t->rseq.usrptr->field, error_label)
-#endif
-
-static int rseq_update_cpu_node_id(struct task_struct *t)
-{
- struct rseq __user *rseq = t->rseq.usrptr;
- u32 cpu_id = raw_smp_processor_id();
- u32 node_id = cpu_to_node(cpu_id);
- u32 mm_cid = task_mm_cid(t);
-
- rseq_stat_inc(rseq_stats.ids);
-
- /* Validate read-only rseq fields on debug kernels */
- if (rseq_validate_ro_fields(t))
- goto efault;
- WARN_ON_ONCE((int) mm_cid < 0);
-
- if (!user_write_access_begin(rseq, t->rseq.len))
- goto efault;
-
- rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
- /* Cache the user space values */
- t->rseq.ids.cpu_id = cpu_id;
- t->rseq.ids.mm_cid = mm_cid;
-
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally updated only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- trace_rseq_update(t);
- return 0;
-
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
-}
-
-static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
- struct rseq __user *rseq = t->rseq.usrptr;
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
- mm_cid = 0;
-
- /*
- * Validate read-only rseq fields.
- */
- if (rseq_validate_ro_fields(t))
- goto efault;
-
- if (!user_write_access_begin(rseq, t->rseq.len))
- goto efault;
-
- /*
- * Reset all fields to their initial state.
- *
- * All fields have an initial state of 0 except cpu_id which is set to
- * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
- * unregistration can figure out that rseq needs to be registered
- * again.
- */
- rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally reset only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- return 0;
-
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
+ return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}
static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
struct task_struct *t = current;
+ struct rseq_ids ids;
+ u32 node_id;
bool event;
int sig;
scoped_guard(RSEQ_EVENT_GUARD) {
event = t->rseq.event.sched_switch;
t->rseq.event.sched_switch = false;
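+ /*
+ * Snapshot the CPU and MM CID under the event guard so both values
+ * stem from the same point in time and cannot be torn apart by a
+ * migration in between.
+ */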
+ ids.cpu_id = task_cpu(t);
+ ids.mm_cid = task_mm_cid(t);
}
if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
if (!rseq_handle_cs(t, regs))
goto error;
- if (unlikely(rseq_update_cpu_node_id(t)))
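+ /*
+ * Derive the node ID from the snapshotted CPU ID so it matches the
+ * CPU ID which is written to user space below.
+ */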
+ node_id = cpu_to_node(ids.cpu_id);
+ if (!rseq_set_ids(t, &ids, node_id))
goto error;
return;
}
#endif
+static bool rseq_reset_ids(void)
+{
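+ /*
+ * cpu_id is reset to RSEQ_CPU_ID_UNINITIALIZED so that a user coming
+ * in after unregistration can figure out that rseq needs to be
+ * registered again.
+ */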
+ struct rseq_ids ids = {
+ .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+ .mm_cid = 0,
+ };
+
+ /*
+ * If this fails, terminate the task because otherwise the kernel is
+ * left in an inconsistent state and exit to user space would try to
+ * fix up the IDs again.
+ */
+ if (rseq_set_ids(current, &ids, 0))
+ return true;
+
+ force_sig(SIGSEGV);
+ return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
/*
* sys_rseq - setup restartable sequences for caller thread.
*/
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
- int ret;
-
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
return -EINVAL;
if (current->rseq.sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_node_id(current);
- if (ret)
- return ret;
+ if (!rseq_reset_ids())
+ return -EFAULT;
rseq_reset(current);
return 0;
}
if (!access_ok(rseq, rseq_len))
return -EFAULT;
- /*
- * If the rseq_cs pointer is non-NULL on registration, clear it to
- * avoid a potential segfault on return to user-space. The proper thing
- * to do would have been to fail the registration but this would break
- * older libcs that reuse the rseq area for new threads without
- * clearing the fields. Don't bother reading it, just reset it.
- */
- if (put_user(0UL, &rseq->rseq_cs))
- return -EFAULT;
+ scoped_user_write_access(rseq, efault) {
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields. Don't bother reading it, just reset it.
+ */
+ unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ /* Initialize IDs in user space */
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+ unsafe_put_user(0U, &rseq->node_id, efault);
+ unsafe_put_user(0U, &rseq->mm_cid, efault);
+ }
-#ifdef CONFIG_DEBUG_RSEQ
- /*
- * Initialize the in-kernel rseq fields copy for validation of
- * read-only fields.
- */
- if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
- get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
- get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
- get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
- return -EFAULT;
-#endif
/*
* Activate the registration by setting the rseq area address, length
* and signature in the task struct.
*/
current->rseq.event.has_rseq = true;
rseq_sched_switch_event(current);
-
return 0;
+
+efault:
+ return -EFAULT;
}