From: Tejun Heo
Date: Sat, 7 Mar 2026 15:29:49 +0000 (-1000)
Subject: sched_ext: Convert deferred_reenq_locals from llist to regular list
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8c1b9453fde6;p=thirdparty%2Flinux.git

sched_ext: Convert deferred_reenq_locals from llist to regular list

The deferred reenqueue local mechanism uses an llist (lockless list) for
collecting schedulers that need their local DSQs re-enqueued. Convert to a
regular list protected by a raw_spinlock.

The llist was used for its lockless properties, but the upcoming changes to
support remote reenqueue require more complex list operations that are
difficult to implement correctly with lockless data structures. A spinlock-
protected regular list provides the necessary flexibility.

Signed-off-by: Tejun Heo
Reviewed-by: Andrea Righi
---

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 18f8fd0d249d8..9c3129a45103b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3639,23 +3639,37 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 	return nr_enqueued;
 }
 
-static void run_deferred(struct rq *rq)
+static void process_deferred_reenq_locals(struct rq *rq)
 {
-	process_ddsp_deferred_locals(rq);
-
-	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
-		struct llist_node *llist =
-			llist_del_all(&rq->scx.deferred_reenq_locals);
-		struct scx_sched_pcpu *pos, *next;
+	lockdep_assert_rq_held(rq);
 
-		llist_for_each_entry_safe(pos, next, llist,
-					  deferred_reenq_locals_node) {
-			init_llist_node(&pos->deferred_reenq_locals_node);
-			reenq_local(pos->sch, rq);
+	while (true) {
+		struct scx_sched *sch;
+
+		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+			struct scx_sched_pcpu *sch_pcpu =
+				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+							 struct scx_sched_pcpu,
+							 deferred_reenq_local_node);
+			if (!sch_pcpu)
+				return;
+
+			sch = sch_pcpu->sch;
+			list_del_init(&sch_pcpu->deferred_reenq_local_node);
 		}
+
+		reenq_local(sch, rq);
 	}
 }
 
+static void run_deferred(struct rq *rq)
+{
+	process_ddsp_deferred_locals(rq);
+
+	if (!list_empty(&rq->scx.deferred_reenq_locals))
+		process_deferred_reenq_locals(rq);
+}
+
 #ifdef CONFIG_NO_HZ_FULL
 bool scx_can_stop_tick(struct rq *rq)
 {
@@ -4179,13 +4193,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	/*
 	 * $sch would have entered bypass mode before the RCU grace period. As
-	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
+	 * that blocks new deferrals, all deferred_reenq_local_node's must be
	 * off-list by now.
	 */
	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

-		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
	}

	free_percpu(sch->pcpu);
@@ -5798,7 +5812,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		pcpu->sch = sch;
-		init_llist_node(&pcpu->deferred_reenq_locals_node);
+		INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
	}

	sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -7125,7 +7139,8 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
-		init_llist_head(&rq->scx.deferred_reenq_locals);
+		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
+		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);

@@ -8357,7 +8372,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
	unsigned long flags;
	struct scx_sched *sch;
	struct rq *rq;
-	struct llist_node *lnode;

	raw_local_irq_save(flags);

@@ -8373,9 +8387,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
		goto out_irq_restore;

	rq = this_rq();
-	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
-	if (!llist_on_list(lnode))
-		llist_add(lnode, &rq->scx.deferred_reenq_locals);
+	scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+		struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
+
+		if (list_empty(&pcpu->deferred_reenq_local_node))
+			list_move_tail(&pcpu->deferred_reenq_local_node,
+				       &rq->scx.deferred_reenq_locals);
+	}
+
	schedule_deferred(rq);
 out_irq_restore:
	raw_local_irq_restore(flags);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 9e5ebd00ea0cc..80d40a9c5ad93 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,7 +965,7 @@ struct scx_sched_pcpu {
	 */
	struct scx_event_stats	event_stats;

-	struct llist_node	deferred_reenq_locals_node;
+	struct list_head	deferred_reenq_local_node;
	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
	u32			bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ebe971d12cb8e..0794852524e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,7 +808,8 @@ struct scx_rq {
	struct task_struct	*sub_dispatch_prev;

-	struct llist_head	deferred_reenq_locals;
+	raw_spinlock_t		deferred_reenq_lock;
+	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;