From 61debc251c1c9150c7bdfd5c028bc2d078e17d22 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 11 Nov 2025 09:18:06 -1000
Subject: [PATCH] sched_ext: Use per-CPU DSQs instead of per-node global DSQs
 in bypass mode

Bypass mode routes tasks through fallback dispatch queues. The fallback
was originally a single global DSQ; b7b3b2dbae73 ("sched_ext: Split the
global DSQ per NUMA node") made it per-node to resolve NUMA-related
livelocks.

Dan Schatzberg found that per-node DSQs can still livelock when many
threads are pinned to different small subsets of CPUs: each CPU must
scan past many tasks it cannot run before finding one it can, causing
severe contention as the CPU count grows.

Switch to per-CPU bypass DSQs. Each task is queued on its current CPU.
Default idle CPU selection and direct dispatch handle most cases well.

This introduces a new failure mode: in an over-saturated system, tasks
can concentrate on a single CPU. If the BPF scheduler severely skews
placement before bypass is triggered, that CPU's queue may be too long
to drain, causing RCU stalls. A load balancer in a future patch will
address this.

The bypass DSQ is kept separate from the local DSQ to enable load
balancing: local DSQs are protected by rq locks, which prevents
efficient scanning and transfer of tasks across CPUs and is especially
problematic when the system is already contended.

v2: Clarified why the bypass DSQ is separate from the local DSQ
    (Andrea Righi).

Reported-by: Dan Schatzberg
Reviewed-by: Dan Schatzberg
Reviewed-by: Andrea Righi
Reviewed-by: Emil Tsalapatis
Signed-off-by: Tejun Heo
---
 include/linux/sched/ext.h |  1 +
 kernel/sched/ext.c        | 16 +++++++++++++---
 kernel/sched/sched.h      |  1 +
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 60285c3d07cf6..3d3216ff91887 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -57,6 +57,7 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
 	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
 	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
+	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
 	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
 	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
 };
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 43083602c15e3..747391a3f6e39 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 
 	if (scx_rq_bypassing(rq)) {
 		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
-		goto global;
+		goto bypass;
 	}
 
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1356,6 +1356,9 @@ local:
 global:
 	dsq = find_global_dsq(sch, p);
 	goto enqueue;
+bypass:
+	dsq = &task_rq(p)->scx.bypass_dsq;
+	goto enqueue;
 
 enqueue:
 	/*
@@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		if (consume_global_dsq(sch, rq))
 			goto has_tasks;
 
-	if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
-	    scx_rq_bypassing(rq) || !scx_rq_online(rq))
+	if (scx_rq_bypassing(rq)) {
+		if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+			goto has_tasks;
+		else
+			goto no_tasks;
+	}
+
+	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
 		goto no_tasks;
 
 	dspc->rq = rq;
@@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void)
 		int n = cpu_to_node(cpu);
 
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 27aae2a298f8b..5991133a48498 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,6 +808,7 @@ struct scx_rq {
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
+	struct scx_dispatch_q	bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
 
-- 
2.47.3
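[Editor's note: to illustrate the locking argument in the description, below
is a hypothetical sketch; it is not part of this patch and not the future
load balancer, and bypass_pick_task() is an invented name. Because the
bypass DSQ has its own raw spinlock, a balancer could scan a remote CPU's
queue under just that lock, whereas tasks on a local DSQ are reachable only
while holding the owning CPU's rq lock.]

	/*
	 * Hypothetical sketch only. Scan @cpu's bypass DSQ for a task to
	 * migrate while holding just the DSQ lock, not @cpu's rq lock.
	 */
	static struct task_struct *bypass_pick_task(int cpu)
	{
		struct scx_dispatch_q *dsq = &cpu_rq(cpu)->scx.bypass_dsq;
		struct task_struct *p, *picked = NULL;
		unsigned long flags;

		raw_spin_lock_irqsave(&dsq->lock, flags);
		list_for_each_entry(p, &dsq->list, scx.dsq_list.node) {
			/* a real balancer would filter by affinity etc. */
			picked = p;
			break;
		}
		raw_spin_unlock_irqrestore(&dsq->lock, flags);
		return picked;
	}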