git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched_ext: Convert deferred_reenq_locals from llist to regular list
author: Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2026 15:29:49 +0000 (05:29 -1000)
committer: Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2026 15:29:49 +0000 (05:29 -1000)
The deferred reenqueue local mechanism uses an llist (lockless list) for
collecting schedulers that need their local DSQs re-enqueued. Convert to a
regular list protected by a raw_spinlock.

The llist was used for its lockless properties, but the upcoming changes to
support remote reenqueue require more complex list operations that are
difficult to implement correctly with lockless data structures. A spinlock-
protected regular list provides the necessary flexibility.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
kernel/sched/ext.c
kernel/sched/ext_internal.h
kernel/sched/sched.h

index 18f8fd0d249d8ed6c3746f00321e168683d986fd..9c3129a45103b8f4716e0c66003e7e15461de183 100644 (file)
@@ -3639,23 +3639,37 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
        return nr_enqueued;
 }
 
-static void run_deferred(struct rq *rq)
+static void process_deferred_reenq_locals(struct rq *rq)
 {
-       process_ddsp_deferred_locals(rq);
-
-       if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
-               struct llist_node *llist =
-                       llist_del_all(&rq->scx.deferred_reenq_locals);
-               struct scx_sched_pcpu *pos, *next;
+       lockdep_assert_rq_held(rq);
 
-               llist_for_each_entry_safe(pos, next, llist,
-                                         deferred_reenq_locals_node) {
-                       init_llist_node(&pos->deferred_reenq_locals_node);
-                       reenq_local(pos->sch, rq);
+       while (true) {
+               struct scx_sched *sch;
+
+               scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+                       struct scx_sched_pcpu *sch_pcpu =
+                               list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+                                                        struct scx_sched_pcpu,
+                                                        deferred_reenq_local_node);
+                       if (!sch_pcpu)
+                               return;
+
+                       sch = sch_pcpu->sch;
+                       list_del_init(&sch_pcpu->deferred_reenq_local_node);
                }
+
+               reenq_local(sch, rq);
        }
 }
 
+static void run_deferred(struct rq *rq)
+{
+       process_ddsp_deferred_locals(rq);
+
+       if (!list_empty(&rq->scx.deferred_reenq_locals))
+               process_deferred_reenq_locals(rq);
+}
+
 #ifdef CONFIG_NO_HZ_FULL
 bool scx_can_stop_tick(struct rq *rq)
 {
@@ -4179,13 +4193,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
        /*
         * $sch would have entered bypass mode before the RCU grace period. As
-        * that blocks new deferrals, all deferred_reenq_locals_node's must be
+        * that blocks new deferrals, all deferred_reenq_local_node's must be
         * off-list by now.
         */
        for_each_possible_cpu(cpu) {
                struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
-               WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+               WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
        }
 
        free_percpu(sch->pcpu);
@@ -5798,7 +5812,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
                struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
                pcpu->sch = sch;
-               init_llist_node(&pcpu->deferred_reenq_locals_node);
+               INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
        }
 
        sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -7125,7 +7139,8 @@ void __init init_sched_ext_class(void)
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
-               init_llist_head(&rq->scx.deferred_reenq_locals);
+               raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
+               INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
                rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
                rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
@@ -8357,7 +8372,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
        unsigned long flags;
        struct scx_sched *sch;
        struct rq *rq;
-       struct llist_node *lnode;
 
        raw_local_irq_save(flags);
 
@@ -8373,9 +8387,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
                goto out_irq_restore;
 
        rq = this_rq();
-       lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
-       if (!llist_on_list(lnode))
-               llist_add(lnode, &rq->scx.deferred_reenq_locals);
+       scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+               struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
+
+               if (list_empty(&pcpu->deferred_reenq_local_node))
+                       list_move_tail(&pcpu->deferred_reenq_local_node,
+                                      &rq->scx.deferred_reenq_locals);
+       }
+
        schedule_deferred(rq);
 out_irq_restore:
        raw_local_irq_restore(flags);
index 9e5ebd00ea0ccc3514240ce13636f4b0be54847e..80d40a9c5ad9327c72fde3c819b4954fc17e5529 100644 (file)
@@ -965,7 +965,7 @@ struct scx_sched_pcpu {
         */
        struct scx_event_stats  event_stats;
 
-       struct llist_node       deferred_reenq_locals_node;
+       struct list_head        deferred_reenq_local_node;
        struct scx_dispatch_q   bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
        u32                     bypass_host_seq;
index ebe971d12cb8eb6a02d854de96d8f35c216aa988..0794852524e77f056e2f3e0f834ac6a542aaf893 100644 (file)
@@ -808,7 +808,8 @@ struct scx_rq {
 
        struct task_struct      *sub_dispatch_prev;
 
-       struct llist_head       deferred_reenq_locals;
+       raw_spinlock_t          deferred_reenq_lock;
+       struct list_head        deferred_reenq_locals;  /* scheds requesting reenq of local DSQ */
        struct balance_callback deferred_bal_cb;
        struct irq_work         deferred_irq_work;
        struct irq_work         kick_cpus_irq_work;