From: Tejun Heo Date: Sat, 7 Mar 2026 15:29:50 +0000 (-1000) Subject: sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=84b1a0ea0b7c23dec240783a592e480780efe459;p=thirdparty%2Flinux.git sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support user-defined DSQs by adding a deferred re-enqueue mechanism similar to the local DSQ handling. Add a per-cpu deferred_reenq_user (list node plus flags) to scx_dsq_pcpu and a deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list. process_deferred_reenq_users() then iterates the DSQ using the cursor helpers and re-enqueues each task. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 303f57dfb947d..e77504faa0bcb 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,8 +62,14 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_deferred_reenq_user { + struct list_head node; + u64 flags; +}; + struct scx_dsq_pcpu { struct scx_dispatch_q *dsq; + struct scx_deferred_reenq_user deferred_reenq_user; }; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f51e4c20cd95e..805c6689c99a1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1180,6 +1180,18 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq drl->flags |= reenq_flags; } + schedule_deferred(rq); + } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { + struct rq *rq = this_rq(); + struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; + + scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) { + if (list_empty(&dru->node)) + list_move_tail(&dru->node, 
&rq->scx.deferred_reenq_users); + dru->flags |= reenq_flags; + } + schedule_deferred(rq); } else { scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); @@ -3784,12 +3796,108 @@ static void process_deferred_reenq_locals(struct rq *rq) } } +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) +{ + struct rq *locked_rq = rq; + struct scx_sched *sch = dsq->sched; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); + struct task_struct *p; + s32 nr_enqueued = 0; + + lockdep_assert_rq_held(rq); + + raw_spin_lock(&dsq->lock); + + while (likely(!READ_ONCE(sch->bypass_depth))) { + struct rq *task_rq; + + p = nldsq_cursor_next_task(&cursor, dsq); + if (!p) + break; + + if (!task_should_reenq(p, reenq_flags)) + continue; + + task_rq = task_rq(p); + + if (locked_rq != task_rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + if (unlikely(!raw_spin_rq_trylock(task_rq))) { + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(task_rq); + raw_spin_lock(&dsq->lock); + } + locked_rq = task_rq; + + /* did we lose @p while switching locks? 
*/ + if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) + continue; + } + + /* @p is on @dsq, its rq and @dsq are locked */ + dispatch_dequeue_locked(p, dsq); + raw_spin_unlock(&dsq->lock); + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { + raw_spin_rq_unlock(locked_rq); + locked_rq = NULL; + cpu_relax(); + } + + raw_spin_lock(&dsq->lock); + } + + list_del_init(&cursor.node); + raw_spin_unlock(&dsq->lock); + + if (locked_rq != rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +} + +static void process_deferred_reenq_users(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_dispatch_q *dsq; + u64 reenq_flags = 0; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_user *dru = + list_first_entry_or_null(&rq->scx.deferred_reenq_users, + struct scx_deferred_reenq_user, + node); + struct scx_dsq_pcpu *dsq_pcpu; + + if (!dru) + return; + + dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, + deferred_reenq_user); + dsq = dsq_pcpu->dsq; + swap(dru->flags, reenq_flags); + list_del_init(&dru->node); + } + + BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); + reenq_user(rq, dsq, reenq_flags); + } +} + static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); if (!list_empty(&rq->scx.deferred_reenq_locals)) process_deferred_reenq_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_users)) + process_deferred_reenq_users(rq); } #ifdef CONFIG_NO_HZ_FULL @@ -4119,6 +4227,7 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); pcpu->dsq = dsq; + INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); } return 0; @@ -4126,6 +4235,23 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, static void exit_dsq(struct scx_dispatch_q *dsq) { + s32 cpu; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + struct 
scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; + struct rq *rq = cpu_rq(cpu); + + /* + * There must have been a RCU grace period since the last + * insertion and @dsq should be off the deferred list by now. + */ + if (WARN_ON_ONCE(!list_empty(&dru->node))) { + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + list_del_init(&dru->node); + } + } + free_percpu(dsq->pcpu); } @@ -7308,6 +7434,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); raw_spin_lock_init(&rq->scx.deferred_reenq_lock); INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); @@ -8354,6 +8481,7 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, * supported: * * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) + * - User DSQs * * Re-enqueues are performed asynchronously. Can be called from anywhere. 
*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0794852524e77..893f89ce2a775 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -810,6 +810,7 @@ struct scx_rq { raw_spinlock_t deferred_reenq_lock; struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ + struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 83e8289e8c0cb..a4a1b84fe3591 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -26,8 +26,11 @@ enum consts { ONE_SEC_IN_NS = 1000000000, + ONE_MSEC_IN_NS = 1000000, + LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS, SHARED_DSQ = 0, HIGHPRI_DSQ = 1, + LOWPRI_DSQ = 2, HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ }; @@ -172,6 +175,9 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, if (!(tctx = lookup_task_ctx(p))) return -ESRCH; + if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD)) + return prev_cpu; + cpu = pick_direct_dispatch_cpu(p, prev_cpu); if (cpu >= 0) { @@ -242,6 +248,13 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* see lowpri_timerfn() */ + if (__COMPAT_has_generic_reenq() && + p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) { + scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags); + return; + } + /* if select_cpu() wasn't called, try direct dispatch */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { @@ -873,6 +886,28 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) return 0; } +struct lowpri_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct lowpri_timer); +} lowpri_timer 
SEC(".maps"); + +/* + * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and + * the tasks are transferred to SHARED_DSQ. + */ +static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + scx_bpf_dsq_reenq(LOWPRI_DSQ, 0); + bpf_timer_start(timer, LOWPRI_INTV_NS, 0); + return 0; +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { u32 key = 0; @@ -894,14 +929,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) return ret; } + ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1); + if (ret) + return ret; + timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) return -ESRCH; - bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); bpf_timer_set_callback(timer, monitor_timerfn); + ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + if (ret) + return ret; - return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + if (__COMPAT_has_generic_reenq()) { + /* see lowpri_timerfn() */ + timer = bpf_map_lookup_elem(&lowpri_timer, &key); + if (!timer) + return -ESRCH; + bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, lowpri_timerfn); + ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0); + if (ret) + return ret; + } + + return 0; } void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)