rseq: Implement syscall entry work for time slice extensions
author     Thomas Gleixner <tglx@linutronix.de>
           Mon, 15 Dec 2025 16:52:19 +0000 (17:52 +0100)
committer  Peter Zijlstra <peterz@infradead.org>
           Thu, 22 Jan 2026 10:11:18 +0000 (11:11 +0100)
The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice
extension. This allows the syscall entry code to handle the
rseq_slice_yield() syscall, which user space uses to relinquish the CPU
after finishing the critical section for which it requested the extension.
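
For illustration only, here is a minimal user space sketch of the intended
flow. The slice_ctrl word and its granted bit mirror the kernel side
accessors visible in the diff below; the request bit, the exact struct
layout and the syscall number plumbing are assumptions, not the
authoritative ABI:

/*
 * Hypothetical user space sketch. Field layout and syscall number
 * plumbing are assumptions for illustration only.
 */
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_rseq_slice_yield
# define __NR_rseq_slice_yield	-1	/* placeholder, provided by the uapi headers */
#endif

/* Assumed shape of the slice control word in the registered rseq area */
struct rseq_slice_ctrl {
	unsigned int request : 1;	/* set by user space around the critical section */
	unsigned int granted : 1;	/* set by the kernel when it grants an extension */
};

static volatile struct rseq_slice_ctrl *slice_ctrl;	/* points into the rseq area */

static void critical_section(void)
{
	slice_ctrl->request = 1;	/* ask for an extension if preemption hits */

	/* ... short critical section, e.g. release a user space lock ... */

	slice_ctrl->request = 0;

	/*
	 * If the kernel granted an extension in the meantime, hand the CPU
	 * back. Any syscall terminates the grant, but rseq_slice_yield()
	 * does so without side effects.
	 */
	if (slice_ctrl->granted)
		syscall(__NR_rseq_slice_yield);
}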

If the kernel state is still GRANTED, the kernel resets both the kernel
and the user space state, applying a set of sanity checks. If the kernel
state is already cleared, then this raced against the timer or some other
interrupt and the work handler just clears the work bit.

Doing this in syscall entry work makes it possible to catch user space
which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from
within the critical section. Contrary to the initial strict requirement
to use rseq_slice_yield(), arbitrary syscalls are no longer considered a
violation of the ABI contract. This allows onion architecture
applications, which cannot control the code inside a critical section, to
utilize the mechanism as well.

If the code detects inconsistent user space state, this results in a
SIGSEGV for the application.

If the grant is still active and the task has not been preempted yet, the
work code reschedules immediately before continuing with the syscall.
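
To make the control flow easier to follow before reading the full diff,
here is a small self-contained model of the decision logic in
rseq_syscall_enter_work(). The types, names and return values are
illustrative stand-ins; the real code operates on task state, sets
NEED_RESCHED and updates the rseq statistics:

/*
 * Condensed model of the decision flow implemented by
 * rseq_syscall_enter_work() in the diff below. Everything here is an
 * illustrative stand-in, not a kernel primitive.
 */
#include <stdbool.h>

struct slice_state {
	bool granted;		/* kernel granted an extension */
	bool rescheduled;	/* task was already scheduled out meanwhile */
};

enum slice_outcome { SLICE_NONE, SLICE_YIELDED, SLICE_ABORTED };

static enum slice_outcome slice_entry_work(struct slice_state *s, bool is_yield_syscall)
{
	/* The SLICE work bit is one-shot and cleared unconditionally here */

	if (!s->granted)	/* timer or interrupt already revoked the grant */
		return SLICE_NONE;

	s->granted = false;	/* the grant ends; the user copy is cleared as well */

	if (s->rescheduled)	/* preemption already happened, nothing to account */
		return SLICE_NONE;

	/* Otherwise NEED_RESCHED is set so the CPU is handed back promptly */
	return is_yield_syscall ? SLICE_YIELDED : SLICE_ABORTED;
}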

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de
include/linux/entry-common.h
include/linux/rseq.h
include/linux/thread_info.h
kernel/entry/syscall-common.c
kernel/rseq.c

index 87efb38b70817fd9371942e5ffd7a55d225edca7..026201a44aa2db0315f7c7fc6d67833615dbb27d 100644 (file)
@@ -36,8 +36,8 @@
                                 SYSCALL_WORK_SYSCALL_EMU |             \
                                 SYSCALL_WORK_SYSCALL_AUDIT |           \
                                 SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
+                                SYSCALL_WORK_SYSCALL_RSEQ_SLICE |      \
                                 ARCH_SYSCALL_WORK_ENTER)
-
 #define SYSCALL_WORK_EXIT      (SYSCALL_WORK_SYSCALL_TRACEPOINT |      \
                                 SYSCALL_WORK_SYSCALL_TRACE |           \
                                 SYSCALL_WORK_SYSCALL_AUDIT |           \
index 3c194a02ad0a82c07359d80934221e90d8898043..7a01a07604053d3db6055863b3c4016e47ba72e2 100644 (file)
@@ -164,8 +164,10 @@ static inline void rseq_syscall(struct pt_regs *regs) { }
 #endif /* !CONFIG_DEBUG_RSEQ */
 
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
+void rseq_syscall_enter_work(long syscall);
 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline void rseq_syscall_enter_work(long syscall) { }
 static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
 {
        return -ENOTSUPP;
index b40de9bab4b7961a6a268b2b78f9457b32560e8f..051e42902690491236b9fd911b4a027bd75978c2 100644 (file)
@@ -46,15 +46,17 @@ enum syscall_work_bit {
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
        SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
        SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+       SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
 };
 
-#define SYSCALL_WORK_SECCOMP           BIT(SYSCALL_WORK_BIT_SECCOMP)
-#define SYSCALL_WORK_SYSCALL_TRACEPOINT        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
-#define SYSCALL_WORK_SYSCALL_TRACE     BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
-#define SYSCALL_WORK_SYSCALL_EMU       BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
-#define SYSCALL_WORK_SYSCALL_AUDIT     BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
-#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
-#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SECCOMP                   BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SYSCALL_TRACEPOINT                BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
+#define SYSCALL_WORK_SYSCALL_TRACE             BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
+#define SYSCALL_WORK_SYSCALL_EMU               BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
+#define SYSCALL_WORK_SYSCALL_AUDIT             BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH     BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP         BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE                BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
 #endif
 
 #include <asm/thread_info.h>
index 940a597ded40fbdb708c3116504c5c663df4b37f..f7ee25b9cf2786ad95dcde8d81f6feed960554b2 100644 (file)
@@ -17,8 +17,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
        }
 }
 
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
-                               unsigned long work)
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work)
 {
        long ret = 0;
 
@@ -32,6 +31,14 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
                        return -1L;
        }
 
+       /*
+        * User space got a time slice extension granted and relinquishes
+        * the CPU. The work stops the slice timer to avoid an extra round
+        * through hrtimer_interrupt().
+        */
+       if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
+               rseq_syscall_enter_work(syscall);
+
        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = ptrace_report_syscall_entry(regs);
index d8e1992edffa7831d817a44e991f4371b5f959a4..8aa4821e3979b85489b06843ac809b35df6e8edc 100644 (file)
@@ -502,6 +502,97 @@ efault:
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
 
+static inline void rseq_slice_set_need_resched(struct task_struct *curr)
+{
+       /*
+        * The interrupt guard is required to prevent inconsistent state in
+        * this case:
+        *
+        * set_tsk_need_resched()
+        * --> Interrupt
+        *       wakeup()
+        *        set_tsk_need_resched()
+        *        set_preempt_need_resched()
+        *     schedule_on_return()
+        *        clear_tsk_need_resched()
+        *        clear_preempt_need_resched()
+        * set_preempt_need_resched()           <- Inconsistent state
+        *
+        * This is safe vs. a remote set of TIF_NEED_RESCHED because that
+        * only sets the already set bit and does not create inconsistent
+        * state.
+        */
+       scoped_guard(irq)
+               set_need_resched_current();
+}
+
+static void rseq_slice_validate_ctrl(u32 expected)
+{
+       u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
+       u32 uval;
+
+       if (get_user(uval, sctrl) || uval != expected)
+               force_sig(SIGSEGV);
+}
+
+/*
+ * Invoked from syscall entry if a time slice extension was granted and the
+ * kernel did not clear it before user space left the critical section.
+ *
+ * While the recommended way to relinquish the CPU side effect free is
+ * rseq_slice_yield(2), any syscall within a granted slice terminates the
+ * grant and immediately reschedules if required. This supports onion layer
+ * applications, where the code requesting the grant cannot control the
+ * code within the critical section.
+ */
+void rseq_syscall_enter_work(long syscall)
+{
+       struct task_struct *curr = current;
+       struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
+
+       clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+
+       if (static_branch_unlikely(&rseq_debug_enabled))
+               rseq_slice_validate_ctrl(ctrl.all);
+
+       /*
+        * The kernel might have raced, revoked the grant and updated
+        * userspace, but kept the SLICE work set.
+        */
+       if (!ctrl.granted)
+               return;
+
+       /*
+        * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
+        * kernels. Leaving the scope will reschedule on preemption models
+        * FULL, LAZY and RT if necessary.
+        */
+       scoped_guard(preempt) {
+               /*
+                * Now that preemption is disabled, quickly check whether
+                * the task was already rescheduled before arriving here.
+                */
+               if (!curr->rseq.event.sched_switch) {
+                       rseq_slice_set_need_resched(curr);
+
+                       if (syscall == __NR_rseq_slice_yield) {
+                               rseq_stat_inc(rseq_stats.s_yielded);
+                               /* Update the yielded state for syscall return */
+                               curr->rseq.slice.yielded = 1;
+                       } else {
+                               rseq_stat_inc(rseq_stats.s_aborted);
+                       }
+               }
+       }
+       /* Reschedule on NONE/VOLUNTARY preemption models */
+       cond_resched();
+
+       /* Clear the grant in kernel state and user space */
+       curr->rseq.slice.state.granted = false;
+       if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
+               force_sig(SIGSEGV);
+}
+
 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
 {
        switch (arg2) {