sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
author    Tejun Heo <tj@kernel.org>
          Sun, 29 Mar 2026 00:18:55 +0000 (14:18 -1000)
committer Tejun Heo <tj@kernel.org>
          Mon, 30 Mar 2026 18:37:27 +0000 (08:37 -1000)
SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
smp_cond_load_acquire() until the target CPU's kick_sync advances. Because
the irq_work runs in hardirq context, the waiting CPU cannot reschedule and
its own kick_sync never advances. If multiple CPUs form a wait cycle, all
CPUs in the cycle deadlock.
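
For illustration (a sketch of the failure mode, not taken from the patch),
two CPUs kicking each other with SCX_KICK_WAIT are enough:

  CPU0: kick_cpus_irq_workfn()          CPU1: kick_cpus_irq_workfn()
    [hardirq, cannot reschedule]          [hardirq, cannot reschedule]
    spins on CPU1's scx.kick_sync         spins on CPU0's scx.kick_sync

Neither CPU can reach the scheduling path that advances its own kick_sync,
so neither spin ever terminates.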

Replace the busy-wait in kick_cpus_irq_workfn() with resched_curr() to
force the CPU through do_pick_task_scx(), which queues a balance callback
to perform the wait. The balance callback drops the rq lock and enables
IRQs following the sched_core_balance() pattern, so the CPU can process
IPIs while waiting. The local CPU's kick_sync is advanced on entry to
do_pick_task_scx() and continuously during the wait, ensuring that any CPU
which starts waiting for us sees the advancement and no cyclic dependency
can form.
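
As a userspace analogy of the cycle-breaking invariant (a sketch, with
threads standing in for CPUs and plain atomic counters for kick_sync; not
part of the patch), each waiter advances its own counter while spinning on
the other's, plus once more after the wait as the analogue of the
advancement on entry to do_pick_task_scx(). The mutual wait therefore
resolves instead of deadlocking:

  /* build: gcc -pthread -o kick_sync_sketch kick_sync_sketch.c */
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_ulong sync_ctr[2];        /* stand-in for per-rq kick_sync */

  static void *waiter(void *arg)
  {
          int self = (int)(long)arg, peer = !self;
          unsigned long snap = atomic_load(&sync_ctr[peer]);

          /* Spin until the peer's counter moves past the snapshot... */
          while (atomic_load(&sync_ctr[peer]) == snap)
                  /* ...while advancing our own so the peer's wait can end. */
                  atomic_fetch_add(&sync_ctr[self], 1);

          /*
           * One final advance so a peer that snapshots our counter after we
           * stop spinning is still released -- the analogue of kick_sync
           * advancing on entry to do_pick_task_scx().
           */
          atomic_fetch_add(&sync_ctr[self], 1);
          printf("thread %d: peer advanced, wait complete\n", self);
          return NULL;
  }

  int main(void)
  {
          pthread_t t[2];

          for (int i = 0; i < 2; i++)
                  pthread_create(&t[i], NULL, waiter, (void *)(long)i);
          for (int i = 0; i < 2; i++)
                  pthread_join(t[i], NULL);
          return 0;
  }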

Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
Cc: stable@vger.kernel.org # v6.12+
Reported-by: Christian Loehle <christian.loehle@arm.com>
Link: https://lore.kernel.org/r/20260316100249.1651641-1-christian.loehle@arm.com
Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Christian Loehle <christian.loehle@arm.com>
kernel/sched/ext.c
kernel/sched/sched.h

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 26a6ac2f88267797b317fae562cbad1ff50bb642..d5bdcdb3f70041fc77e0c0a2be98767a237485dc 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 {
        struct scx_sched *sch = scx_root;
 
-       /* see kick_cpus_irq_workfn() */
+       /* see kick_sync_wait_bal_cb() */
        smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
        update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ switch_class:
                switch_class(rq, next);
 }
 
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+       struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+       unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+       bool waited;
+       s32 cpu;
+
+       /*
+        * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
+        * because a target CPU may be waiting for us to process an IPI
+        * (e.g. a TLB flush) while we wait for its kick_sync to advance.
+        *
+        * Also, keep advancing our own kick_sync so that new kick_sync waits
+        * targeting us, which can start after we drop the lock, cannot form
+        * cyclic dependencies.
+        */
+retry:
+       waited = false;
+       for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+               /*
+                * smp_load_acquire() pairs with smp_store_release() on
+                * kick_sync updates on the target CPUs.
+                */
+               if (cpu == cpu_of(rq) ||
+                   smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+                       cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+                       continue;
+               }
+
+               raw_spin_rq_unlock_irq(rq);
+               while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+                       smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+                       cpu_relax();
+               }
+               raw_spin_rq_lock_irq(rq);
+               waited = true;
+       }
+
+       if (waited)
+               goto retry;
+}
+
 static struct task_struct *first_local_task(struct rq *rq)
 {
        return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
        bool keep_prev;
        struct task_struct *p;
 
-       /* see kick_cpus_irq_workfn() */
+       /* see kick_sync_wait_bal_cb() */
        smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
        rq_modified_begin(rq, &ext_sched_class);
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
        rq_repin_lock(rq, rf);
        maybe_queue_balance_callback(rq);
 
+       /*
+        * Defer to a balance callback which can drop rq lock and enable
+        * IRQs. Waiting directly in the pick path would deadlock against
+        * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+        */
+       if (unlikely(rq->scx.kick_sync_pending)) {
+               rq->scx.kick_sync_pending = false;
+               queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+                                      kick_sync_wait_bal_cb);
+       }
+
        /*
         * If any higher-priority sched class enqueued a runnable task on
         * this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
                if (!cpumask_empty(rq->scx.cpus_to_wait))
                        dump_line(&ns, "  cpus_to_wait   : %*pb",
                                  cpumask_pr_args(rq->scx.cpus_to_wait));
+               if (!cpumask_empty(rq->scx.cpus_to_sync))
+                       dump_line(&ns, "  cpus_to_sync   : %*pb",
+                                 cpumask_pr_args(rq->scx.cpus_to_sync));
 
                used = seq_buf_used(&ns);
                if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
 
                if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
                        if (cur_class == &ext_sched_class) {
+                               cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
                                ksyncs[cpu] = rq->scx.kick_sync;
                                should_wait = true;
-                       } else {
-                               cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
                        }
+                       cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
                }
 
                resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
                cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
        }
 
-       if (!should_wait)
-               return;
-
-       for_each_cpu(cpu, this_scx->cpus_to_wait) {
-               unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
-               /*
-                * Busy-wait until the task running at the time of kicking is no
-                * longer running. This can be used to implement e.g. core
-                * scheduling.
-                *
-                * smp_cond_load_acquire() pairs with store_releases in
-                * pick_task_scx() and put_prev_task_scx(). The former breaks
-                * the wait if SCX's scheduling path is entered even if the same
-                * task is picked subsequently. The latter is necessary to break
-                * the wait when $cpu is taken by a higher sched class.
-                */
-               if (cpu != cpu_of(this_rq))
-                       smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
-               cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+       /*
+        * Can't wait in hardirq: kick_sync can't advance there and CPUs
+        * waiting for each other deadlock. Defer to kick_sync_wait_bal_cb().
+        */
+       if (should_wait) {
+               raw_spin_rq_lock(this_rq);
+               this_scx->kick_sync_pending = true;
+               resched_curr(this_rq);
+               raw_spin_rq_unlock(this_rq);
        }
 }
 
@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+               BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
                rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
                rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca4079e6f71029126a33fad55cb0f5..1ef9ba480f51d1224c21da7a6f93b16f809b2069 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,9 +805,12 @@ struct scx_rq {
        cpumask_var_t           cpus_to_kick_if_idle;
        cpumask_var_t           cpus_to_preempt;
        cpumask_var_t           cpus_to_wait;
+       cpumask_var_t           cpus_to_sync;
+       bool                    kick_sync_pending;
        unsigned long           kick_sync;
        local_t                 reenq_local_deferred;
        struct balance_callback deferred_bal_cb;
+       struct balance_callback kick_sync_bal_cb;
        struct irq_work         deferred_irq_work;
        struct irq_work         kick_cpus_irq_work;
        struct scx_dispatch_q   bypass_dsq;