sched_ext: Short-circuit sched_class operations on dead tasks

author Tejun Heo <tj@kernel.org>

Wed, 4 Feb 2026 20:07:55 +0000 (10:07 -1000)

committer Tejun Heo <tj@kernel.org>

Wed, 4 Feb 2026 22:22:11 +0000 (12:22 -1000)
author Tejun Heo <tj@kernel.org>
Wed, 4 Feb 2026 20:07:55 +0000 (10:07 -1000)
committer Tejun Heo <tj@kernel.org>
Wed, 4 Feb 2026 22:22:11 +0000 (12:22 -1000)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c

index 8f6d8d7f895ccc494414e971d370229da7c7ffe1..1a5ead4a476e6e183bcd4766f9ef4ebed9dd7401 100644 (file)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
  #include <trace/events/sched_ext.h>
  
  static void process_ddsp_deferred_locals(struct rq *rq);
+static bool task_dead_and_done(struct task_struct *p);
  static u32 reenq_local(struct rq *rq);
  static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
  static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -2618,6 +2619,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
  
         set_cpus_allowed_common(p, ac);
  
+       if (task_dead_and_done(p))
+               return;
+
         /*
          * The effective cpumask is stored in @p->cpus_ptr which may temporarily
          * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3033,10 +3037,45 @@ void scx_cancel_fork(struct task_struct *p)
         percpu_up_read(&scx_fork_rwsem);
  }
  
+/**
+ * task_dead_and_done - Is a task dead and done running?
+ * @p: target task
+ *
+ * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
+ * task no longer exists from SCX's POV. However, certain sched_class ops may be
+ * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
+ * may try to switch a task which finished sched_ext_dead() back into SCX
+ * triggering invalid SCX task state transitions and worse.
+ *
+ * Once a task has finished the final switch, sched_ext_dead() is the only thing
+ * that needs to happen on the task. Use this test to short-circuit sched_class
+ * operations which may be called on dead tasks.
+ */
+static bool task_dead_and_done(struct task_struct *p)
+{
+       struct rq *rq = task_rq(p);
+
+       lockdep_assert_rq_held(rq);
+
+       /*
+        * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
+        * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
+        * won't ever run again.
+        */
+       return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
+               !task_on_cpu(rq, p);
+}
+
  void sched_ext_dead(struct task_struct *p)
  {
         unsigned long flags;
  
+       /*
+        * By the time control reaches here, @p has %TASK_DEAD set, switched out
+        * for the last time and then dropped the rq lock - task_dead_and_done()
+        * should be returning %true nullifying the straggling sched_class ops.
+        * Remove from scx_tasks and exit @p.
+        */
         raw_spin_lock_irqsave(&scx_tasks_lock, flags);
         list_del_init(&p->scx.tasks_node);
         raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3062,6 +3101,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
  
         lockdep_assert_rq_held(task_rq(p));
  
+       if (task_dead_and_done(p))
+               return;
+
         p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
         if (SCX_HAS_OP(sch, set_weight))
                 SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3076,6 +3118,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
  {
         struct scx_sched *sch = scx_root;
  
+       if (task_dead_and_done(p))
+               return;
+
         scx_enable_task(p);
  
         /*
@@ -3089,6 +3134,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
  
  static void switched_from_scx(struct rq *rq, struct task_struct *p)
  {
+       if (task_dead_and_done(p))
+               return;
+
         scx_disable_task(p);
  }
author	Tejun Heo <tj@kernel.org>
	Wed, 4 Feb 2026 20:07:55 +0000 (10:07 -1000)
committer	Tejun Heo <tj@kernel.org>
	Wed, 4 Feb 2026 22:22:11 +0000 (12:22 -1000)