git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched_ext: Introduce scx_bpf_dsq_reenq() for remote local DSQ reenqueue
author: Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2026 15:29:49 +0000 (05:29 -1000)
committer: Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2026 15:29:49 +0000 (05:29 -1000)
scx_bpf_reenqueue_local() can only trigger re-enqueue of the current CPU's
local DSQ. Introduce scx_bpf_dsq_reenq() which takes a DSQ ID and can target
any local DSQ including remote CPUs via SCX_DSQ_LOCAL_ON | cpu. This will be
expanded to support user DSQs by future changes.

scx_bpf_reenqueue_local() is reimplemented as a simple wrapper around
scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0) and may be deprecated in the future.

Update compat.bpf.h with a compatibility shim and scx_qmap to test the new
functionality.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
kernel/sched/ext.c
tools/sched_ext/include/scx/compat.bpf.h
tools/sched_ext/scx_qmap.bpf.c
tools/sched_ext/scx_qmap.c

index 3548cf61477ac384cf107ec2af888f1afdeeb720..efcf7ef72a3e6568b20b211ef618978e203c8056 100644 (file)
@@ -1080,6 +1080,31 @@ static void schedule_deferred_locked(struct rq *rq)
        schedule_deferred(rq);
 }
 
+static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq)
+{
+       /*
+        * Allowing reenqueues doesn't make sense while bypassing. This also
+        * blocks new reenqueues from being scheduled on dead scheds.
+        */
+       if (unlikely(READ_ONCE(sch->bypass_depth)))
+               return;
+
+       if (dsq->id == SCX_DSQ_LOCAL) {
+               struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+               struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
+               struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
+
+               scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
+                       if (list_empty(&drl->node))
+                               list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
+               }
+
+               schedule_deferred(rq);
+       } else {
+               scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
+       }
+}
+
 /**
  * touch_core_sched - Update timestamp used for core-sched task ordering
  * @rq: rq to read clock from, must be locked
@@ -7774,9 +7799,6 @@ __bpf_kfunc_start_defs();
  * Iterate over all of the tasks currently enqueued on the local DSQ of the
  * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
  * processed tasks. Can only be called from ops.cpu_release().
- *
- * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
- * returning variant that can be called from anywhere.
  */
 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 {
@@ -8206,6 +8228,52 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
        return rcu_dereference(dsq->first_task);
 }
 
+/**
+ * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ
+ * @dsq_id: DSQ to re-enqueue
+ * @reenq_flags: %SCX_REENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Iterate over all of the tasks currently enqueued on the DSQ identified by
+ * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are
+ * supported:
+ *
+ * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
+ *
+ * Re-enqueues are performed asynchronously. Can be called from anywhere.
+ */
+__bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags,
+                                  const struct bpf_prog_aux *aux)
+{
+       struct scx_sched *sch;
+       struct scx_dispatch_q *dsq;
+
+       guard(preempt)();
+
+       sch = scx_prog_sched(aux);
+       if (unlikely(!sch))
+               return;
+
+       dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id());
+       schedule_dsq_reenq(sch, dsq);
+}
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
+ * anywhere.
+ *
+ * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the
+ * future.
+ */
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
+{
+       scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux);
+}
+
 __bpf_kfunc_end_defs();
 
 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -8363,47 +8431,6 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
                ops_dump_flush();
 }
 
-/**
- * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
- * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
- *
- * Iterate over all of the tasks currently enqueued on the local DSQ of the
- * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
- * anywhere.
- */
-__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
-{
-       unsigned long flags;
-       struct scx_sched *sch;
-       struct rq *rq;
-
-       raw_local_irq_save(flags);
-
-       sch = scx_prog_sched(aux);
-       if (unlikely(!sch))
-               goto out_irq_restore;
-
-       /*
-        * Allowing reenqueue-locals doesn't make sense while bypassing. This
-        * also blocks from new reenqueues to be scheduled on dead scheds.
-        */
-       if (unlikely(sch->bypass_depth))
-               goto out_irq_restore;
-
-       rq = this_rq();
-       scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
-               struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
-
-               if (list_empty(&pcpu->deferred_reenq_local.node))
-                       list_move_tail(&pcpu->deferred_reenq_local.node,
-                                      &rq->scx.deferred_reenq_locals);
-       }
-
-       schedule_deferred(rq);
-out_irq_restore:
-       raw_local_irq_restore(flags);
-}
-
 /**
  * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
  * @cpu: CPU of interest
@@ -8820,13 +8847,14 @@ BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
index f2969c3061a758d0abdaac4fe6ddccde6121d0a0..2d3985be7e2c1fa30d42d93046dadb14aea7f459 100644 (file)
@@ -375,6 +375,27 @@ static inline void scx_bpf_reenqueue_local(void)
                scx_bpf_reenqueue_local___v1();
 }
 
+/*
+ * v6.20: New scx_bpf_dsq_reenq() that allows re-enqueues on more DSQs. This
+ * will eventually deprecate scx_bpf_reenqueue_local().
+ */
+void scx_bpf_dsq_reenq___compat(u64 dsq_id, u64 reenq_flags, const struct bpf_prog_aux *aux__prog) __ksym __weak;
+
+static inline bool __COMPAT_has_generic_reenq(void)
+{
+       return bpf_ksym_exists(scx_bpf_dsq_reenq___compat);
+}
+
+static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
+{
+       if (bpf_ksym_exists(scx_bpf_dsq_reenq___compat))
+               scx_bpf_dsq_reenq___compat(dsq_id, reenq_flags, NULL);
+       else if (dsq_id == SCX_DSQ_LOCAL && reenq_flags == 0)
+               scx_bpf_reenqueue_local();
+       else
+               scx_bpf_error("kernel too old to reenqueue foreign local or user DSQs");
+}
+
 /*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
index 91b8eac83f527b86a10a8ec38d71eaa01c2fd46f..83e8289e8c0cbce260aa9302ad879d3ccee44f9a 100644 (file)
@@ -131,7 +131,7 @@ struct {
 } cpu_ctx_stor SEC(".maps");
 
 /* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
 u64 nr_core_sched_execed;
 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
@@ -206,8 +206,11 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
        void *ring;
        s32 cpu;
 
-       if (enq_flags & SCX_ENQ_REENQ)
+       if (enq_flags & SCX_ENQ_REENQ) {
                __sync_fetch_and_add(&nr_reenqueued, 1);
+               if (scx_bpf_task_cpu(p) == 0)
+                       __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+       }
 
        if (p->flags & PF_KTHREAD) {
                if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
@@ -561,6 +564,10 @@ int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
        case 2: /* SCHED_RR */
        case 6: /* SCHED_DEADLINE */
                scx_bpf_reenqueue_local();
+
+               /* trigger re-enqueue on CPU0 just to exercise LOCAL_ON */
+               if (__COMPAT_has_generic_reenq())
+                       scx_bpf_dsq_reenq(SCX_DSQ_LOCAL_ON | 0, 0);
        }
 
        return 0;
index 5d762d10f4dbdc31821e01b913aed659a6375a74..9252037284d3079b59d33cde528d4380bba5e945 100644 (file)
@@ -137,9 +137,10 @@ int main(int argc, char **argv)
                long nr_enqueued = skel->bss->nr_enqueued;
                long nr_dispatched = skel->bss->nr_dispatched;
 
-               printf("stats  : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+               printf("stats  : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
                       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
-                      skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
+                      skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
+                      skel->bss->nr_dequeued,
                       skel->bss->nr_core_sched_execed,
                       skel->bss->nr_ddsp_from_enq);
                printf("         exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",