rseq: Implement time slice extension enforcement timer

author Thomas Gleixner <tglx@linutronix.de>

Mon, 15 Dec 2025 16:52:22 +0000 (17:52 +0100)

committer Peter Zijlstra <peterz@infradead.org>

Thu, 22 Jan 2026 10:11:18 +0000 (11:11 +0100)
author Thomas Gleixner <tglx@linutronix.de>
Mon, 15 Dec 2025 16:52:22 +0000 (17:52 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 22 Jan 2026 10:11:18 +0000 (11:11 +0100)
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst

index 239da22c4e28f192d654667776e5c7f5dfee134f..b09d18e0f75b5b693f492217d91c74781ba6d4f0 100644 (file)
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1248,6 +1248,17 @@ reboot-cmd (SPARC only)
  ROM/Flash boot loader. Maybe to tell it what to do after
  rebooting. ???
  
+rseq_slice_extension_nsec
+=========================
+
+A task can request to delay its scheduling if it is in a critical section
+via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum
+allowed extension in nanoseconds before scheduling of the task is enforced.
+Default value is 10000ns (10us). The possible range is 10000ns (10us) to
+50000ns (50us).
+
+This value has a direct correlation to the worst case scheduling latency;
+increment at your own risk.
  
  sched_energy_aware
  ==================
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h

index 54d8e338b26e95778f9f329a6b15d56d6c7d3c43..8d04611056aad8275e9e45f0aa5935c91e406633 100644 (file)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -87,8 +87,24 @@ static __always_inline bool rseq_slice_extension_enabled(void)
  {
         return static_branch_likely(&rseq_slice_extension_key);
  }
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+       if (!rseq_slice_extension_enabled())
+               return false;
+
+       if (likely(!current->rseq.slice.state.granted))
+               return false;
+
+       return __rseq_arm_slice_extension_timer();
+}
+
  #else /* CONFIG_RSEQ_SLICE_EXTENSION */
  static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
  #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
  
  bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -543,17 +559,19 @@ static __always_inline void clear_tif_rseq(void) { }
  static __always_inline bool
  rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
  {
-       if (likely(!test_tif_rseq(ti_work)))
-               return false;
-
-       if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
-               current->rseq.event.slowpath = true;
-               set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
-               return true;
+       if (unlikely(test_tif_rseq(ti_work))) {
+               if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+                       current->rseq.event.slowpath = true;
+                       set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+                       return true;
+               }
+               clear_tif_rseq();
         }
-
-       clear_tif_rseq();
-       return false;
+       /*
+        * Arm the slice extension timer if nothing to do anymore and the
+        * task really goes out to user space.
+        */
+       return rseq_arm_slice_extension_timer();
  }
  
  #else /* CONFIG_GENERIC_ENTRY */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h

index 8c540e775161660c811803256c82a445f6dfe420..8a2e76c5d2a8bd86283326a5f0d5f8f3410ad316 100644 (file)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -89,10 +89,12 @@ union rseq_slice_state {
  /**
   * struct rseq_slice - Status information for rseq time slice extension
   * @state:     Time slice extension state
+ * @expires:   The time when a grant expires
   * @yielded:   Indicator for rseq_slice_yield()
   */
  struct rseq_slice {
         union rseq_slice_state  state;
+       u64                     expires;
         u8                      yielded;
  };
  
diff --git a/kernel/rseq.c b/kernel/rseq.c

index 8aa4821e3979b85489b06843ac809b35df6e8edc..275d701141079fcf0a2938da93f22dda6ad777a6 100644 (file)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,8 @@
  #define RSEQ_BUILD_SLOW_PATH
  
  #include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
  #include <linux/prctl.h>
  #include <linux/ratelimit.h>
  #include <linux/rseq_entry.h>
@@ -500,8 +502,91 @@ efault:
  }
  
  #ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+       struct hrtimer  timer;
+       void            *cookie;
+};
+
+unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
  DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
  
+/*
+ * When the timer expires and the task is still in user space, the return
+ * from interrupt will revoke the grant and schedule. If the task already
+ * entered the kernel via a syscall and the timer fires before the syscall
+ * work was able to cancel it, then depending on the preemption model this
+ * will either reschedule on return from interrupt or in the syscall work
+ * below.
+ */
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+       struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+       /*
+        * Validate that the task which armed the timer is still on the
+        * CPU. It could have been scheduled out without canceling the
+        * timer.
+        */
+       if (st->cookie == current && current->rseq.slice.state.granted) {
+               rseq_stat_inc(rseq_stats.s_expired);
+               set_need_resched_current();
+       }
+       return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+       struct slice_timer *st = this_cpu_ptr(&slice_timer);
+       struct task_struct *curr = current;
+
+       lockdep_assert_irqs_disabled();
+
+       /*
+        * This check prevents a task, which got a time slice extension
+        * granted, from exceeding the maximum scheduling latency when the
+        * grant expired before going out to user space. Don't bother to
+        * clear the grant here, it will be cleaned up automatically before
+        * going out to user space after being scheduled back in.
+        */
+       if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
+               set_need_resched_current();
+               return true;
+       }
+
+       /*
+        * Store the task pointer as a cookie for comparison in the timer
+        * function. This is safe as the timer is CPU local and cannot be
+        * in the expiry function at this point.
+        */
+       st->cookie = curr;
+       hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+       /* Arm the syscall entry work */
+       set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+       return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+       struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+       /*
+        * st->cookie can be safely read as preemption is disabled and the
+        * timer is CPU local.
+        *
+        * As this is most probably the first expiring timer, the cancel is
+        * expensive as it has to reprogram the hardware, but that's less
+        * expensive than going through a full hrtimer_interrupt() cycle
+        * for nothing.
+        *
+        * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
+        * local and once the hrtimer code disabled interrupts the timer
+        * callback cannot be running.
+        */
+       if (st->cookie == current)
+               hrtimer_try_to_cancel(&st->timer);
+}
+
  static inline void rseq_slice_set_need_resched(struct task_struct *curr)
  {
         /*
@@ -563,11 +648,14 @@ void rseq_syscall_enter_work(long syscall)
                 return;
  
         /*
-        * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
-        * kernels. Leaving the scope will reschedule on preemption models
-        * FULL, LAZY and RT if necessary.
+        * Required to stabilize the per CPU timer pointer and to make
+        * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
+        *
+        * Leaving the scope will reschedule on preemption models FULL,
+        * LAZY and RT if necessary.
          */
         scoped_guard(preempt) {
+               rseq_cancel_slice_extension_timer();
                 /*
                  * Now that preemption is disabled, quickly check whether
                  * the task was already rescheduled before arriving here.
@@ -665,6 +753,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
         return yielded;
  }
  
+#ifdef CONFIG_SYSCTL
+static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+
+static const struct ctl_table rseq_slice_ext_sysctl[] = {
+       {
+               .procname       = "rseq_slice_extension_nsec",
+               .data           = &rseq_slice_ext_nsecs,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_douintvec_minmax,
+               .extra1         = (unsigned int *)&rseq_slice_ext_nsecs_min,
+               .extra2         = (unsigned int *)&rseq_slice_ext_nsecs_max,
+       },
+};
+
+static void rseq_slice_sysctl_init(void)
+{
+       if (rseq_slice_extension_enabled())
+               register_sysctl_init("kernel", rseq_slice_ext_sysctl);
+}
+#else /* CONFIG_SYSCTL */
+static inline void rseq_slice_sysctl_init(void) { }
+#endif  /* !CONFIG_SYSCTL */
+
  static int __init rseq_slice_cmdline(char *str)
  {
         bool on;
@@ -677,4 +790,17 @@ static int __init rseq_slice_cmdline(char *str)
         return 1;
  }
  __setup("rseq_slice_ext=", rseq_slice_cmdline);
+
+static int __init rseq_slice_init(void)
+{
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu) {
+               hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+                             CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+       }
+       rseq_slice_sysctl_init();
+       return 0;
+}
+device_initcall(rseq_slice_init);
  #endif /* CONFIG_RSEQ_SLICE_EXTENSION */
author	Thomas Gleixner <tglx@linutronix.de>
	Mon, 15 Dec 2025 16:52:22 +0000 (17:52 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Thu, 22 Jan 2026 10:11:18 +0000 (11:11 +0100)
Documentation/admin-guide/sysctl/kernel.rst		patch \| blob \| blame \| history
include/linux/rseq_entry.h		patch \| blob \| blame \| history
include/linux/rseq_types.h		patch \| blob \| blame \| history
kernel/rseq.c		patch \| blob \| blame \| history