From: Tejun Heo
Date: Fri, 6 Mar 2026 17:58:04 +0000 (-1000)
Subject: sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0d8c551dd5de1c157600da05a01e3147115dfbb4;p=thirdparty%2Flinux.git

sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware

scx_bpf_reenqueue_local() currently re-enqueues all tasks on the local DSQ
regardless of which sub-scheduler owns them. With multiple sub-schedulers,
each should only re-enqueue the tasks that it or its descendants own.

Replace the per-rq boolean flag with a lock-free linked list to track
per-scheduler reenqueue requests. Filter tasks in reenq_local() using
hierarchical ownership checks and block deferrals during bypass to prevent
use on dead schedulers.

Signed-off-by: Tejun Heo
Reviewed-by: Andrea Righi
---

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e8378d59ddae8..f10a9667b491c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -185,7 +185,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 static void process_ddsp_deferred_locals(struct rq *rq);
 static bool task_dead_and_done(struct task_struct *p);
-static u32 reenq_local(struct rq *rq);
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -991,9 +991,16 @@ static void run_deferred(struct rq *rq)
 {
 	process_ddsp_deferred_locals(rq);
 
-	if (local_read(&rq->scx.reenq_local_deferred)) {
-		local_set(&rq->scx.reenq_local_deferred, 0);
-		reenq_local(rq);
+	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
+		struct llist_node *llist =
+			llist_del_all(&rq->scx.deferred_reenq_locals);
+		struct scx_sched_pcpu *pos, *next;
+
+		llist_for_each_entry_safe(pos, next, llist,
+					  deferred_reenq_locals_node) {
+			init_llist_node(&pos->deferred_reenq_locals_node);
+			reenq_local(pos->sch, rq);
+		}
 	}
 }
 
@@ -4082,7 +4089,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
-	int node;
+	int cpu, node;
 
 	irq_work_sync(&sch->error_irq_work);
 	kthread_destroy_worker(sch->helper);
@@ -4094,6 +4101,17 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	cgroup_put(sch_cgroup(sch));
 #endif /* CONFIG_EXT_SUB_SCHED */
 
+	/*
+	 * $sch would have entered bypass mode before the RCU grace period. As
+	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
+	 * off-list by now.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+	}
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -5655,8 +5673,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	for_each_possible_cpu(cpu)
 		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(sch->pcpu, cpu)->sch = sch;
+	for_each_possible_cpu(cpu) {
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		pcpu->sch = sch;
+		init_llist_node(&pcpu->deferred_reenq_locals_node);
+	}
 
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
 	if (IS_ERR(sch->helper)) {
@@ -6957,6 +6979,7 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+		init_llist_head(&rq->scx.deferred_reenq_locals);
 
 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
@@ -7528,7 +7551,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
 	.set			= &scx_kfunc_ids_dispatch,
 };
 
-static u32 reenq_local(struct rq *rq)
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 {
 	LIST_HEAD(tasks);
 	u32 nr_enqueued = 0;
@@ -7543,6 +7566,8 @@ static u32 reenq_local(struct rq *rq)
 	 */
 	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
 				 scx.dsq_list.node) {
+		struct scx_sched *task_sch = scx_task_sched(p);
+
 		/*
 		 * If @p is being migrated, @p's current CPU may not agree with
 		 * its allowed CPUs and the migration_cpu_stop is about to
@@ -7557,6 +7582,9 @@ static u32 reenq_local(struct rq *rq)
 		if (p->migration_pending)
 			continue;
 
+		if (!scx_is_descendant(task_sch, sch))
+			continue;
+
 		dispatch_dequeue(rq, p);
 		list_add_tail(&p->scx.dsq_list.node, &tasks);
 	}
@@ -7599,7 +7627,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
-	return reenq_local(rq);
+	return reenq_local(sch, rq);
 }
 
 __bpf_kfunc_end_defs();
@@ -8170,20 +8198,39 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 
 /**
  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Iterate over all of the tasks currently enqueued on the local DSQ of the
  * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
  * anywhere.
  */
-__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
 {
+	unsigned long flags;
+	struct scx_sched *sch;
 	struct rq *rq;
+	struct llist_node *lnode;
 
-	guard(preempt)();
+	raw_local_irq_save(flags);
+
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		goto out_irq_restore;
+
+	/*
+	 * Allowing reenqueue-locals doesn't make sense while bypassing. This
+	 * also prevents new reenqueues from being scheduled on dead scheds.
+	 */
+	if (unlikely(sch->bypass_depth))
+		goto out_irq_restore;
 
 	rq = this_rq();
-	local_set(&rq->scx.reenq_local_deferred, 1);
+	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
+	if (!llist_on_list(lnode))
+		llist_add(lnode, &rq->scx.deferred_reenq_locals);
 	schedule_deferred(rq);
+out_irq_restore:
+	raw_local_irq_restore(flags);
 }
 
 /**
@@ -8608,7 +8655,7 @@ BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 69d6e9b4d78eb..aac051e27f7f5 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,6 +965,7 @@ struct scx_sched_pcpu {
 	 */
 	struct scx_event_stats	event_stats;
 
+	struct llist_node	deferred_reenq_locals_node;
 	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
 	u32			bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 596f6713cf7e9..7f3b07872e155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,7 +805,7 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
 	unsigned long		kick_sync;
-	local_t			reenq_local_deferred;
+	struct llist_head	deferred_reenq_locals;
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
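
For reference, a minimal BPF-side usage sketch (hypothetical scheduler and
callback names; assumes the usual scx common headers, under which the
KF_IMPLICIT_ARGS registration means the kernel supplies @aux and BPF
programs keep calling the kfunc with no arguments). The typical call site
is ops.cpu_release(), as in the scx example schedulers:

	#include <scx/common.bpf.h>

	void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu,
			    struct scx_cpu_release_args *args)
	{
		/*
		 * A higher-priority sched class is taking over this CPU.
		 * Tasks already sitting on the local DSQ would otherwise
		 * wait until the CPU is returned; punt them back to the BPF
		 * scheduler for re-placement instead. With this patch, only
		 * tasks owned by the calling scheduler or its descendants
		 * are re-enqueued; tasks of unrelated sub-schedulers on the
		 * same local DSQ are left alone.
		 */
		scx_bpf_reenqueue_local();
	}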