git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs
author: Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2026 15:29:50 +0000 (05:29 -1000)
committer: Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2026 15:29:50 +0000 (05:29 -1000)
scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support
user-defined DSQs by adding a deferred re-enqueue mechanism similar to the
local DSQ handling.

Add a per-cpu deferred_reenq_user (node/flags) to scx_dsq_pcpu and a
deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a
user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list.
process_deferred_reenq_users() then iterates the DSQ using the cursor helpers
and re-enqueues each task.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
include/linux/sched/ext.h
kernel/sched/ext.c
kernel/sched/sched.h
tools/sched_ext/scx_qmap.bpf.c

index 303f57dfb947d840e0c966365b97325e265cbae2..e77504faa0bcbbe90239285bbaecaf0e7d0afbec 100644 (file)
@@ -62,8 +62,14 @@ enum scx_dsq_id_flags {
        SCX_DSQ_LOCAL_CPU_MASK  = 0xffffffffLLU,
 };
 
+struct scx_deferred_reenq_user {
+       struct list_head        node;
+       u64                     flags;
+};
+
 struct scx_dsq_pcpu {
        struct scx_dispatch_q   *dsq;
+       struct scx_deferred_reenq_user deferred_reenq_user;
 };
 
 /*
index f51e4c20cd95e838292a7c331327abfe935b624b..805c6689c99a1871e61dae2b18007220ec9337d8 100644 (file)
@@ -1180,6 +1180,18 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
                        drl->flags |= reenq_flags;
                }
 
+               schedule_deferred(rq);
+       } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
+               struct rq *rq = this_rq();
+               struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
+               struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
+
+               scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
+                       if (list_empty(&dru->node))
+                               list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
+                       dru->flags |= reenq_flags;
+               }
+
                schedule_deferred(rq);
        } else {
                scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
@@ -3784,12 +3796,108 @@ static void process_deferred_reenq_locals(struct rq *rq)
        }
 }
 
+static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
+{
+       struct rq *locked_rq = rq;
+       struct scx_sched *sch = dsq->sched;
+       struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
+       struct task_struct *p;
+       s32 nr_enqueued = 0;
+
+       lockdep_assert_rq_held(rq);
+
+       raw_spin_lock(&dsq->lock);
+
+       while (likely(!READ_ONCE(sch->bypass_depth))) {
+               struct rq *task_rq;
+
+               p = nldsq_cursor_next_task(&cursor, dsq);
+               if (!p)
+                       break;
+
+               if (!task_should_reenq(p, reenq_flags))
+                       continue;
+
+               task_rq = task_rq(p);
+
+               if (locked_rq != task_rq) {
+                       if (locked_rq)
+                               raw_spin_rq_unlock(locked_rq);
+                       if (unlikely(!raw_spin_rq_trylock(task_rq))) {
+                               raw_spin_unlock(&dsq->lock);
+                               raw_spin_rq_lock(task_rq);
+                               raw_spin_lock(&dsq->lock);
+                       }
+                       locked_rq = task_rq;
+
+                       /* did we lose @p while switching locks? */
+                       if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
+                               continue;
+               }
+
+               /* @p is on @dsq, its rq and @dsq are locked */
+               dispatch_dequeue_locked(p, dsq);
+               raw_spin_unlock(&dsq->lock);
+               do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
+
+               if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
+                       raw_spin_rq_unlock(locked_rq);
+                       locked_rq = NULL;
+                       cpu_relax();
+               }
+
+               raw_spin_lock(&dsq->lock);
+       }
+
+       list_del_init(&cursor.node);
+       raw_spin_unlock(&dsq->lock);
+
+       if (locked_rq != rq) {
+               if (locked_rq)
+                       raw_spin_rq_unlock(locked_rq);
+               raw_spin_rq_lock(rq);
+       }
+}
+
+static void process_deferred_reenq_users(struct rq *rq)
+{
+       lockdep_assert_rq_held(rq);
+
+       while (true) {
+               struct scx_dispatch_q *dsq;
+               u64 reenq_flags = 0;
+
+               scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+                       struct scx_deferred_reenq_user *dru =
+                               list_first_entry_or_null(&rq->scx.deferred_reenq_users,
+                                                        struct scx_deferred_reenq_user,
+                                                        node);
+                       struct scx_dsq_pcpu *dsq_pcpu;
+
+                       if (!dru)
+                               return;
+
+                       dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
+                                               deferred_reenq_user);
+                       dsq = dsq_pcpu->dsq;
+                       swap(dru->flags, reenq_flags);
+                       list_del_init(&dru->node);
+               }
+
+               BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
+               reenq_user(rq, dsq, reenq_flags);
+       }
+}
+
 static void run_deferred(struct rq *rq)
 {
        process_ddsp_deferred_locals(rq);
 
        if (!list_empty(&rq->scx.deferred_reenq_locals))
                process_deferred_reenq_locals(rq);
+
+       if (!list_empty(&rq->scx.deferred_reenq_users))
+               process_deferred_reenq_users(rq);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -4119,6 +4227,7 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
                struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
 
                pcpu->dsq = dsq;
+               INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
        }
 
        return 0;
@@ -4126,6 +4235,23 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
 
 static void exit_dsq(struct scx_dispatch_q *dsq)
 {
+       s32 cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
+               struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
+               struct rq *rq = cpu_rq(cpu);
+
+               /*
+                * There must have been a RCU grace period since the last
+                * insertion and @dsq should be off the deferred list by now.
+                */
+               if (WARN_ON_ONCE(!list_empty(&dru->node))) {
+                       guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+                       list_del_init(&dru->node);
+               }
+       }
+
        free_percpu(dsq->pcpu);
 }
 
@@ -7308,6 +7434,7 @@ void __init init_sched_ext_class(void)
                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
                raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
                INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
+               INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
                rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
                rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
@@ -8354,6 +8481,7 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
  * supported:
  *
  * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
+ * - User DSQs
  *
  * Re-enqueues are performed asynchronously. Can be called from anywhere.
  */
index 0794852524e77f056e2f3e0f834ac6a542aaf893..893f89ce2a775fea4565c5a3d11f1e2ef58a88ee 100644 (file)
@@ -810,6 +810,7 @@ struct scx_rq {
 
        raw_spinlock_t          deferred_reenq_lock;
        struct list_head        deferred_reenq_locals;  /* scheds requesting reenq of local DSQ */
+       struct list_head        deferred_reenq_users;   /* user DSQs requesting reenq */
        struct balance_callback deferred_bal_cb;
        struct irq_work         deferred_irq_work;
        struct irq_work         kick_cpus_irq_work;
index 83e8289e8c0cbce260aa9302ad879d3ccee44f9a..a4a1b84fe3591b6e812b067d27dd3e7187fc65c4 100644 (file)
 
 enum consts {
        ONE_SEC_IN_NS           = 1000000000,
+       ONE_MSEC_IN_NS          = 1000000,
+       LOWPRI_INTV_NS          = 10 * ONE_MSEC_IN_NS,
        SHARED_DSQ              = 0,
        HIGHPRI_DSQ             = 1,
+       LOWPRI_DSQ              = 2,
        HIGHPRI_WEIGHT          = 8668,         /* this is what -20 maps to */
 };
 
@@ -172,6 +175,9 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
        if (!(tctx = lookup_task_ctx(p)))
                return -ESRCH;
 
+       if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
+               return prev_cpu;
+
        cpu = pick_direct_dispatch_cpu(p, prev_cpu);
 
        if (cpu >= 0) {
@@ -242,6 +248,13 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
                return;
        }
 
+       /* see lowpri_timerfn() */
+       if (__COMPAT_has_generic_reenq() &&
+           p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
+               scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
+               return;
+       }
+
        /* if select_cpu() wasn't called, try direct dispatch */
        if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
            (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
@@ -873,6 +886,28 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
        return 0;
 }
 
+struct lowpri_timer {
+       struct bpf_timer timer;
+};
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, u32);
+       __type(value, struct lowpri_timer);
+} lowpri_timer SEC(".maps");
+
+/*
+ * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
+ * the tasks are transferred to SHARED_DSQ.
+ */
+static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+       scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
+       bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
+       return 0;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 {
        u32 key = 0;
@@ -894,14 +929,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
                return ret;
        }
 
+       ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
+       if (ret)
+               return ret;
+
        timer = bpf_map_lookup_elem(&monitor_timer, &key);
        if (!timer)
                return -ESRCH;
-
        bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
        bpf_timer_set_callback(timer, monitor_timerfn);
+       ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+       if (ret)
+               return ret;
 
-       return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+       if (__COMPAT_has_generic_reenq()) {
+               /* see lowpri_timerfn() */
+               timer = bpf_map_lookup_elem(&lowpri_timer, &key);
+               if (!timer)
+                       return -ESRCH;
+               bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
+               bpf_timer_set_callback(timer, lowpri_timerfn);
+               ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
 }
 
 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)