git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP
author Tejun Heo <tj@kernel.org>
Sat, 25 Apr 2026 00:31:36 +0000 (14:31 -1000)
committer Tejun Heo <tj@kernel.org>
Sat, 25 Apr 2026 00:31:36 +0000 (14:31 -1000)
SCX_CALL_OP{,_RET}() unconditionally clears scx_locked_rq_state to NULL on
exit. Correct at the top level, but ops can recurse via
scx_bpf_sub_dispatch(): a parent's ops.dispatch calls the helper, which
invokes the child's ops.dispatch under another SCX_CALL_OP. When the inner
call returns, the NULL clobbers the outer's state. The parent's BPF then
calls kfuncs like scx_bpf_cpuperf_set() which read scx_locked_rq()==NULL and
re-acquire the already-held rq.

Snapshot scx_locked_rq_state on entry and restore on exit. Rename the rq
parameter to locked_rq across all SCX_CALL_OP* macros so the snapshot local
can be typed as 'struct rq *' without colliding with the parameter token in
the expansion. SCX_CALL_OP_TASK{,_RET}() and SCX_CALL_OP_2TASKS_RET() funnel
through the two base macros and inherit the fix.

Fixes: 4f8b122848db ("sched_ext: Add basic building blocks for nested sub-scheduler dispatching")
Reported-by: Chris Mason <clm@meta.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
kernel/sched/ext.c

index 8a2a90659c65d1f80eb6c667bee09a2530c5d8d5..26968d0a67528b47c9a7084e234c8068c02c9e92 100644 (file)
@@ -470,24 +470,35 @@ static inline void update_locked_rq(struct rq *rq)
        __this_cpu_write(scx_locked_rq_state, rq);
 }
 
-#define SCX_CALL_OP(sch, op, rq, args...)                                      \
+/*
+ * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
+ * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
+ */
+#define SCX_CALL_OP(sch, op, locked_rq, args...)                               \
 do {                                                                           \
-       if (rq)                                                                 \
-               update_locked_rq(rq);                                           \
+       struct rq *__prev_locked_rq;                                            \
+                                                                               \
+       if (locked_rq) {                                                        \
+               __prev_locked_rq = scx_locked_rq();                             \
+               update_locked_rq(locked_rq);                                    \
+       }                                                                       \
        (sch)->ops.op(args);                                                    \
-       if (rq)                                                                 \
-               update_locked_rq(NULL);                                         \
+       if (locked_rq)                                                          \
+               update_locked_rq(__prev_locked_rq);                             \
 } while (0)
 
-#define SCX_CALL_OP_RET(sch, op, rq, args...)                                  \
+#define SCX_CALL_OP_RET(sch, op, locked_rq, args...)                           \
 ({                                                                             \
+       struct rq *__prev_locked_rq;                                            \
        __typeof__((sch)->ops.op(args)) __ret;                                  \
                                                                                \
-       if (rq)                                                                 \
-               update_locked_rq(rq);                                           \
+       if (locked_rq) {                                                        \
+               __prev_locked_rq = scx_locked_rq();                             \
+               update_locked_rq(locked_rq);                                    \
+       }                                                                       \
        __ret = (sch)->ops.op(args);                                            \
-       if (rq)                                                                 \
-               update_locked_rq(NULL);                                         \
+       if (locked_rq)                                                          \
+               update_locked_rq(__prev_locked_rq);                             \
        __ret;                                                                  \
 })
 
@@ -499,39 +510,39 @@ do {                                                                              \
  * those subject tasks.
  *
  * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
- * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
- * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
- * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
+ * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
+ * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
  *
  * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
  * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
  * while a previous one is still in progress.
  */
-#define SCX_CALL_OP_TASK(sch, op, rq, task, args...)                           \
+#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...)                    \
 do {                                                                           \
        WARN_ON_ONCE(current->scx.kf_tasks[0]);                                 \
        current->scx.kf_tasks[0] = task;                                        \
-       SCX_CALL_OP((sch), op, rq, task, ##args);                               \
+       SCX_CALL_OP((sch), op, locked_rq, task, ##args);                        \
        current->scx.kf_tasks[0] = NULL;                                        \
 } while (0)
 
-#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...)                       \
+#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...)                        \
 ({                                                                             \
        __typeof__((sch)->ops.op(task, ##args)) __ret;                          \
        WARN_ON_ONCE(current->scx.kf_tasks[0]);                                 \
        current->scx.kf_tasks[0] = task;                                        \
-       __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args);                   \
+       __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args);            \
        current->scx.kf_tasks[0] = NULL;                                        \
        __ret;                                                                  \
 })
 
-#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...)             \
+#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...)      \
 ({                                                                             \
        __typeof__((sch)->ops.op(task0, task1, ##args)) __ret;                  \
        WARN_ON_ONCE(current->scx.kf_tasks[0]);                                 \
        current->scx.kf_tasks[0] = task0;                                       \
        current->scx.kf_tasks[1] = task1;                                       \
-       __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args);           \
+       __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args);    \
        current->scx.kf_tasks[0] = NULL;                                        \
        current->scx.kf_tasks[1] = NULL;                                        \
        __ret;                                                                  \