From: Tejun Heo
Date: Sat, 7 Mar 2026 15:29:49 +0000 (-1000)
Subject: sched_ext: Convert deferred_reenq_locals from llist to regular list
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8c1b9453fde6;p=thirdparty%2Flinux.git

sched_ext: Convert deferred_reenq_locals from llist to regular list

The deferred reenqueue local mechanism uses an llist (lockless list) for
collecting schedulers that need their local DSQs re-enqueued. Convert to a
regular list protected by a raw_spinlock.

The llist was used for its lockless properties, but the upcoming changes to
support remote reenqueue require more complex list operations that are
difficult to implement correctly with lockless data structures. A spinlock-
protected regular list provides the necessary flexibility.

Signed-off-by: Tejun Heo
Reviewed-by: Andrea Righi
---

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 18f8fd0d249d8..9c3129a45103b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3639,23 +3639,37 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 	return nr_enqueued;
 }
 
-static void run_deferred(struct rq *rq)
+static void process_deferred_reenq_locals(struct rq *rq)
 {
-	process_ddsp_deferred_locals(rq);
-
-	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
-		struct llist_node *llist =
-			llist_del_all(&rq->scx.deferred_reenq_locals);
-		struct scx_sched_pcpu *pos, *next;
+	lockdep_assert_rq_held(rq);
 
-		llist_for_each_entry_safe(pos, next, llist,
-					  deferred_reenq_locals_node) {
-			init_llist_node(&pos->deferred_reenq_locals_node);
-			reenq_local(pos->sch, rq);
+	while (true) {
+		struct scx_sched *sch;
+
+		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+			struct scx_sched_pcpu *sch_pcpu =
+				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+							 struct scx_sched_pcpu,
+							 deferred_reenq_local_node);
+			if (!sch_pcpu)
+				return;
+
+			sch = sch_pcpu->sch;
+			list_del_init(&sch_pcpu->deferred_reenq_local_node);
 		}
+
+		reenq_local(sch, rq);
 	}
 }
 
+static void run_deferred(struct rq *rq)
+{
+	process_ddsp_deferred_locals(rq);
+
+	if (!list_empty(&rq->scx.deferred_reenq_locals))
+		process_deferred_reenq_locals(rq);
+}
+
 #ifdef CONFIG_NO_HZ_FULL
 bool scx_can_stop_tick(struct rq *rq)
 {
@@ -4179,13 +4193,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	/*
 	 * $sch would have entered bypass mode before the RCU grace period. As
-	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
+	 * that blocks new deferrals, all deferred_reenq_local_node's must be
	 * off-list by now.
	 */
	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

-		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
	}

	free_percpu(sch->pcpu);
@@ -5798,7 +5812,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		pcpu->sch = sch;
-		init_llist_node(&pcpu->deferred_reenq_locals_node);
+		INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
	}

	sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -7125,7 +7139,8 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
-		init_llist_head(&rq->scx.deferred_reenq_locals);
+		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
+		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);

@@ -8357,7 +8372,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
	unsigned long flags;
	struct scx_sched *sch;
	struct rq *rq;
-	struct llist_node *lnode;

	raw_local_irq_save(flags);

@@ -8373,9 +8387,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
		goto out_irq_restore;

	rq = this_rq();
-	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
-	if (!llist_on_list(lnode))
-		llist_add(lnode, &rq->scx.deferred_reenq_locals);
+	scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+		struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
+
+		if (list_empty(&pcpu->deferred_reenq_local_node))
+			list_move_tail(&pcpu->deferred_reenq_local_node,
+				       &rq->scx.deferred_reenq_locals);
+	}
+
	schedule_deferred(rq);
 out_irq_restore:
	raw_local_irq_restore(flags);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 9e5ebd00ea0cc..80d40a9c5ad93 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,7 +965,7 @@ struct scx_sched_pcpu {
	 */
	struct scx_event_stats	event_stats;

-	struct llist_node	deferred_reenq_locals_node;
+	struct list_head	deferred_reenq_local_node;
	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
	u32			bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ebe971d12cb8e..0794852524e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,7 +808,8 @@ struct scx_rq {
	struct task_struct	*sub_dispatch_prev;

-	struct llist_head	deferred_reenq_locals;
+	raw_spinlock_t		deferred_reenq_lock;
+	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;