SYSCALL_WORK_BIT_SYSCALL_AUDIT,
SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+ SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
};
-#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
-#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
-#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
-#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
-#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
-#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
-#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
+#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
+#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
+#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
#endif
#include <asm/thread_info.h>
}
}
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work)
{
long ret = 0;
return -1L;
}
+ /*
+ * User space got a time slice extension granted and relinquishes
+ * the CPU. The work stops the slice timer to avoid an extra round
+ * through hrtimer_interrupt().
+ */
+ if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
+ rseq_syscall_enter_work(syscall);
+
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
ret = ptrace_report_syscall_entry(regs);
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+static inline void rseq_slice_set_need_resched(struct task_struct *curr)
+{
+ /*
+ * The interrupt guard is required to prevent inconsistent state in
+ * this case:
+ *
+ * set_tsk_need_resched()
+ * --> Interrupt
+ * wakeup()
+ * set_tsk_need_resched()
+ * set_preempt_need_resched()
+ * schedule_on_return()
+ * clear_tsk_need_resched()
+ * clear_preempt_need_resched()
+ * set_preempt_need_resched() <- Inconsistent state
+ *
+ * This is safe vs. a remote set of TIF_NEED_RESCHED because that
+ * only sets the already set bit and does not create inconsistent
+ * state.
+ */
+ scoped_guard(irq)
+ set_need_resched_current();
+}
+
+static void rseq_slice_validate_ctrl(u32 expected)
+{
+ u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all;
+ u32 uval;
+
+ if (get_user(uval, sctrl) || uval != expected)
+ force_sig(SIGSEGV);
+}
+
+/*
+ * Invoked from syscall entry if a time slice extension was granted and the
+ * kernel did not clear it before user space left the critical section.
+ *
+ * While the recommended way to relinquish the CPU side effect free is
+ * rseq_slice_yield(2), any syscall within a granted slice terminates the
+ * grant and immediately reschedules if required. This supports onion layer
+ * applications, where the code requesting the grant cannot control the
+ * code within the critical section.
+ */
+void rseq_syscall_enter_work(long syscall)
+{
+ struct task_struct *curr = current;
+ struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
+
+ clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ rseq_slice_validate_ctrl(ctrl.all);
+
+ /*
+ * The kernel might have raced, revoked the grant and updated
+ * userspace, but kept the SLICE work set.
+ */
+ if (!ctrl.granted)
+ return;
+
+ /*
+ * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
+ * kernels. Leaving the scope will reschedule on preemption models
+ * FULL, LAZY and RT if necessary.
+ */
+ scoped_guard(preempt) {
+ /*
+ * Now that preemption is disabled, quickly check whether
+ * the task was already rescheduled before arriving here.
+ */
+ if (!curr->rseq.event.sched_switch) {
+ rseq_slice_set_need_resched(curr);
+
+ if (syscall == __NR_rseq_slice_yield) {
+ rseq_stat_inc(rseq_stats.s_yielded);
+ /* Update the yielded state for syscall return */
+ curr->rseq.slice.yielded = 1;
+ } else {
+ rseq_stat_inc(rseq_stats.s_aborted);
+ }
+ }
+ }
+ /* Reschedule on NONE/VOLUNTARY preemption models */
+ cond_resched();
+
+ /* Clear the grant in kernel state and user space */
+ curr->rseq.slice.state.granted = false;
+ if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
+ force_sig(SIGSEGV);
+}
+
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
switch (arg2) {