sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN
author    Tejun Heo <tj@kernel.org>
          Sun, 10 May 2026 20:08:16 +0000 (10:08 -1000)
committer Tejun Heo <tj@kernel.org>
          Sun, 10 May 2026 20:08:16 +0000 (10:08 -1000)
scx_root_enable_workfn() drops the iter rq lock around ops.init_task(), and
a TASK_DEAD @p can run sched_ext_dead() in that window. The race has two
flavors: sched_ext_dead() observes SCX_TASK_INIT (the intermediate state
before @p->scx.sched is published) and dereferences NULL via
SCX_HAS_OP(NULL, exit_task), or it observes SCX_TASK_NONE during the
unlocked init window and skips cleanup entirely, so ops.exit_task() never
runs.
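
For concreteness, one losing interleaving of the first flavor. This is a
condensed, hypothetical timeline reconstructed from the hunks below, not a
verbatim trace:

  scx_root_enable_workfn()                sched_ext_dead()
  ------------------------                ----------------
  scx_task_iter_unlock()
  __scx_init_task(sch, p, false)
  scx_set_task_state(p, SCX_TASK_INIT)
                                          task_rq_lock(p, &rf)
                                          state != SCX_TASK_NONE ->
                                          scx_disable_and_exit_task(NULL, p)
                                          /* SCX_HAS_OP(NULL, exit_task) */
  scx_set_task_sched(p, sch)              /* published too late */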

Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under
the iter rq lock, then takes the rq lock again after init to walk
INIT_BEGIN -> INIT -> READY. A sched_ext_dead() that wins the rq-lock race
observes INIT_BEGIN and sets DEAD without calling into ops; the post-init
recheck then unwinds via scx_sub_init_cancel_task().
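
The resulting state walk, including the one new edge sched_ext_dead() may
take (a sketch of the intended transitions):

  NONE -> INIT_BEGIN -> INIT -> READY -> ENABLED
             |
             +-> DEAD    (sched_ext_dead() won the rq-lock race; the
                          post-init recheck in the enable path calls
                          scx_sub_init_cancel_task())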

scx_fork() runs single-threaded against sched_ext_dead() (the task is not
on scx_tasks until scx_post_fork() adds it), so its INIT_BEGIN -> INIT walk
needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure.
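
Both fork-path outcomes therefore stay local to the task (again a sketch):

  NONE -> INIT_BEGIN -> INIT    ops.init_task() succeeded
  NONE -> INIT_BEGIN -> NONE    ops.init_task() failed, rolled back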

The validation matrix gains the INIT_BEGIN row and the INIT_BEGIN -> DEAD
edge; INIT now requires INIT_BEGIN as its predecessor. scx_sub_disable()'s
migration writes INIT_BEGIN as a synthetic predecessor to satisfy the
tightened verification.
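
For reference, the tightened matrix modeled as a standalone userspace
check. This is a sketch only, not kernel code: the NONE, INIT_BEGIN, INIT
and DEAD rows mirror the scx_set_task_state() hunk below, while the READY
and ENABLED rows are assumed from the pre-existing chain and are untouched
by this patch.

  #include <assert.h>

  enum scx_task_state {
          SCX_TASK_NONE,
          SCX_TASK_INIT_BEGIN,
          SCX_TASK_INIT,
          SCX_TASK_READY,
          SCX_TASK_ENABLED,
          SCX_TASK_DEAD,
  };

  /* inverted warn conditions from scx_set_task_state() */
  static int valid_transition(enum scx_task_state prev, enum scx_task_state next)
  {
          switch (next) {
          case SCX_TASK_NONE:
                  return prev != SCX_TASK_DEAD;
          case SCX_TASK_INIT_BEGIN:
                  return prev == SCX_TASK_NONE;
          case SCX_TASK_INIT:
                  return prev == SCX_TASK_INIT_BEGIN;
          case SCX_TASK_READY:
                  return prev == SCX_TASK_INIT;    /* assumed, unchanged here */
          case SCX_TASK_ENABLED:
                  return prev == SCX_TASK_READY;
          case SCX_TASK_DEAD:
                  return prev == SCX_TASK_NONE || prev == SCX_TASK_INIT_BEGIN;
          }
          return 0;
  }

  int main(void)
  {
          /* edges this patch adds */
          assert(valid_transition(SCX_TASK_NONE, SCX_TASK_INIT_BEGIN));
          assert(valid_transition(SCX_TASK_INIT_BEGIN, SCX_TASK_DEAD));
          /* edge it removes: NONE may no longer jump straight to INIT */
          assert(!valid_transition(SCX_TASK_NONE, SCX_TASK_INIT));
          return 0;
  }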

The sub-sched paths still race with sched_ext_dead() during the unlocked
init window. This will be fixed by the next patch.

Reported-by: zhidao su <suzhidao@xiaomi.com>
Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
include/linux/sched/ext.h
kernel/sched/ext.c

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 9f1a326ad03ec74c3130feeb4be03fd1feca4abc..2129e18ada58ba1aaa7269392a90c3060cf9ed13 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -106,6 +106,7 @@ enum scx_ent_flags {
         * Bits 8 to 10 are used to carry task state:
         *
         * NONE         ops.init_task() not called yet
+        * INIT_BEGIN   ops.init_task() in flight; see sched_ext_dead()
         * INIT         ops.init_task() succeeded, but task can be cancelled
         * READY        fully initialized, but not in sched_ext
         * ENABLED      fully initialized and in sched_ext
@@ -116,10 +117,11 @@ enum scx_ent_flags {
        SCX_TASK_STATE_MASK     = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
 
        SCX_TASK_NONE           = 0 << SCX_TASK_STATE_SHIFT,
-       SCX_TASK_INIT           = 1 << SCX_TASK_STATE_SHIFT,
-       SCX_TASK_READY          = 2 << SCX_TASK_STATE_SHIFT,
-       SCX_TASK_ENABLED        = 3 << SCX_TASK_STATE_SHIFT,
-       SCX_TASK_DEAD           = 4 << SCX_TASK_STATE_SHIFT,
+       SCX_TASK_INIT_BEGIN     = 1 << SCX_TASK_STATE_SHIFT,
+       SCX_TASK_INIT           = 2 << SCX_TASK_STATE_SHIFT,
+       SCX_TASK_READY          = 3 << SCX_TASK_STATE_SHIFT,
+       SCX_TASK_ENABLED        = 4 << SCX_TASK_STATE_SHIFT,
+       SCX_TASK_DEAD           = 5 << SCX_TASK_STATE_SHIFT,
 
        /*
         * Bits 12 and 13 are used to carry reenqueue reason. In addition to
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2fc4a12711f9d556a4a0bf3065fdc60c1833c7c4..29fa9ffe7c7ba3e86d28b912abe4f3b1504a3561 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -725,8 +725,11 @@ static void scx_set_task_state(struct task_struct *p, u32 state)
        case SCX_TASK_NONE:
                warn = prev_state == SCX_TASK_DEAD;
                break;
-       case SCX_TASK_INIT:
+       case SCX_TASK_INIT_BEGIN:
                warn = prev_state != SCX_TASK_NONE;
+               break;
+       case SCX_TASK_INIT:
+               warn = prev_state != SCX_TASK_INIT_BEGIN;
                p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
                break;
        case SCX_TASK_READY:
@@ -737,7 +740,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state)
                warn = prev_state != SCX_TASK_READY;
                break;
        case SCX_TASK_DEAD:
-               warn = prev_state != SCX_TASK_NONE;
+               warn = !(prev_state == SCX_TASK_NONE ||
+                        prev_state == SCX_TASK_INIT_BEGIN);
                break;
        default:
                WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
@@ -3753,9 +3757,12 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 #else
                struct scx_sched *sch = scx_root;
 #endif
+               scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
                ret = __scx_init_task(sch, p, true);
-               if (unlikely(ret))
+               if (unlikely(ret)) {
+                       scx_set_task_state(p, SCX_TASK_NONE);
                        return ret;
+               }
                scx_set_task_state(p, SCX_TASK_INIT);
                scx_set_task_sched(p, sch);
        }
@@ -3856,13 +3863,18 @@ void sched_ext_dead(struct task_struct *p)
         * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
         * iteration is only used from sub-sched paths, which require root
         * enabled. Root enable transitions every live task to at least READY.
+        *
+        * %INIT_BEGIN means ops.init_task() is running for @p. Don't call
+        * into ops; transition to %DEAD so the post-init recheck unwinds
+        * via scx_sub_init_cancel_task().
         */
        if (scx_get_task_state(p) != SCX_TASK_NONE) {
                struct rq_flags rf;
                struct rq *rq;
 
                rq = task_rq_lock(p, &rf);
-               scx_disable_and_exit_task(scx_task_sched(p), p);
+               if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN)
+                       scx_disable_and_exit_task(scx_task_sched(p), p);
                scx_set_task_state(p, SCX_TASK_DEAD);
                task_rq_unlock(rq, p, &rf);
        }
@@ -5773,6 +5785,7 @@ static void scx_sub_disable(struct scx_sched *sch)
                         * @p having already been initialized, and then enable.
                         */
                        scx_disable_and_exit_task(sch, p);
+                       scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
                        scx_set_task_state(p, SCX_TASK_INIT);
                        scx_set_task_sched(p, parent);
                        scx_set_task_state(p, SCX_TASK_READY);
@@ -6878,6 +6891,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 
        scx_task_iter_start(&sti, NULL);
        while ((p = scx_task_iter_next_locked(&sti))) {
+               struct rq_flags rf;
+               struct rq *rq;
+
                /*
                 * @p may already be dead, have lost all its usages counts and
                 * be waiting for RCU grace period before being freed. @p can't
@@ -6886,10 +6902,26 @@ static void scx_root_enable_workfn(struct kthread_work *work)
                if (!tryget_task_struct(p))
                        continue;
 
+               /*
+                * Set %INIT_BEGIN under the iter's rq lock so that a concurrent
+                * sched_ext_dead() does not call ops.exit_task() on @p while
+                * ops.init_task() is running. If sched_ext_dead() runs before
+                * this store, it has already removed @p from scx_tasks and the
+                * iter won't visit @p; if it runs after, it observes
+                * %INIT_BEGIN and transitions to %DEAD without calling ops,
+                * leaving the post-init recheck below to unwind.
+                */
+               scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
                scx_task_iter_unlock(&sti);
 
                ret = __scx_init_task(sch, p, false);
+
+               rq = task_rq_lock(p, &rf);
+
                if (unlikely(ret)) {
+                       if (scx_get_task_state(p) != SCX_TASK_DEAD)
+                               scx_set_task_state(p, SCX_TASK_NONE);
+                       task_rq_unlock(rq, p, &rf);
                        put_task_struct(p);
                        scx_task_iter_stop(&sti);
                        scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
@@ -6897,10 +6929,20 @@ static void scx_root_enable_workfn(struct kthread_work *work)
                        goto err_disable_unlock_all;
                }
 
-               scx_set_task_state(p, SCX_TASK_INIT);
-               scx_set_task_sched(p, sch);
-               scx_set_task_state(p, SCX_TASK_READY);
+               if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+                       /*
+                        * sched_ext_dead() observed %INIT_BEGIN and set %DEAD.
+                        * ops.exit_task() is owed to the sched __scx_init_task()
+                        * ran against; call it now.
+                        */
+                       scx_sub_init_cancel_task(sch, p);
+               } else {
+                       scx_set_task_state(p, SCX_TASK_INIT);
+                       scx_set_task_sched(p, sch);
+                       scx_set_task_state(p, SCX_TASK_READY);
+               }
 
+               task_rq_unlock(rq, p, &rf);
                put_task_struct(p);
        }
        scx_task_iter_stop(&sti);