sched_ext: Add basic building blocks for nested sub-scheduler dispatching
Author:     Tejun Heo <tj@kernel.org>
AuthorDate: Fri, 6 Mar 2026 17:58:04 +0000 (07:58 -1000)
Commit:     Tejun Heo <tj@kernel.org>
CommitDate: Fri, 6 Mar 2026 17:58:04 +0000 (07:58 -1000)
This is an early-stage partial implementation that demonstrates the core
building blocks for nested sub-scheduler dispatching. While significant
work remains in the enqueue path and other areas, this patch establishes
the fundamental mechanisms needed for hierarchical scheduler operation.

The key building blocks introduced are:

- Private stack support for ops.dispatch() to prevent stack overflow when
  walking down nested schedulers during dispatch operations

- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger
  dispatch operations on their direct child schedulers

- Proper parent-child relationship validation to ensure dispatch requests
  are only made to legitimate child schedulers

- Updated scx_dispatch_sched() to handle both nested and non-nested
  invocations with appropriate kf_mask handling

The qmap scheduler is updated to demonstrate the functionality by calling
scx_bpf_sub_dispatch() on registered child schedulers when it has no
tasks in its own queues.
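
As a rough usage sketch (not part of this patch): a parent's ops.dispatch()
serves its own queues and, finding nothing, delegates to a registered child.
SHARED_DSQ, child_cgrp_id and parent_dispatch below are hypothetical names;
the kfunc declaration comes from the common.bpf.h change further down.

  #include <scx/common.bpf.h>

  #define SHARED_DSQ 0	/* hypothetical DSQ the parent feeds from */

  u64 child_cgrp_id;	/* recorded by ops.sub_attach(), 0 if unset */

  void BPF_STRUCT_OPS(parent_dispatch, s32 cpu, struct task_struct *prev)
  {
          /* serve the parent's own queue first */
          if (scx_bpf_dsq_move_to_local(SHARED_DSQ))
                  return;

          /* nothing queued locally; let a direct child dispatch instead */
          if (child_cgrp_id && scx_bpf_sub_dispatch(child_cgrp_id))
                  return;

          /* no work anywhere; @prev keeps running if still runnable */
  }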

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
kernel/sched/ext.c
kernel/sched/sched.h
tools/sched_ext/include/scx/common.bpf.h
tools/sched_ext/scx_qmap.bpf.c

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9db5002a2f4bec16cd817db61685c8a43736cbc9..e25b3593dd307065e57ebdbf2b66e76e052c9afd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2444,8 +2444,14 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
        rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
 }
 
-static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
-                              struct task_struct *prev)
+/*
+ * One user of this function is scx_bpf_sub_dispatch(), which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack
+ * usage from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+                  struct task_struct *prev, bool nested)
 {
        struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
        int nr_loops = SCX_DSP_MAX_LOOPS;
@@ -2499,8 +2505,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
        do {
                dspc->nr_tasks = 0;
 
-               SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
-                           prev_on_sch ? prev : NULL);
+               if (nested) {
+                       /*
+                        * If nested, don't update kf_mask as the originating
+                        * invocation would already have set it up.
+                        */
+                       SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
+                                   prev_on_sch ? prev : NULL);
+               } else {
+                       /*
+                        * If not nested, stash @prev so that nested invocations
+                        * can access it.
+                        */
+                       rq->scx.sub_dispatch_prev = prev;
+                       SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
+                                   prev_on_sch ? prev : NULL);
+                       rq->scx.sub_dispatch_prev = NULL;
+               }
 
                flush_dispatch_buf(sch, rq);
 
@@ -2541,7 +2562,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
-       struct scx_sched *sch = scx_root, *pos;
+       struct scx_sched *sch = scx_root;
        s32 cpu = cpu_of(rq);
 
        lockdep_assert_rq_held(rq);
@@ -2585,13 +2606,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
        if (rq->scx.local_dsq.nr)
                goto has_tasks;
 
-       /*
-        * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
-        * hierarchical operation.
-        */
-       list_for_each_entry_rcu(pos, &scx_sched_all, all)
-               if (scx_dispatch_sched(pos, rq, prev))
-                       goto has_tasks;
+       if (scx_dispatch_sched(sch, rq, prev, false))
+               goto has_tasks;
 
        /*
         * Didn't find another task to run. Keep running @prev unless
@@ -4942,9 +4958,8 @@ static void scx_sub_disable(struct scx_sched *sch)
 
        /*
         * Guarantee forward progress and wait for descendants to be disabled.
-        * To limit
-        * disruptions, $parent is not bypassed. Tasks are fully prepped and
-        * then inserted back into $parent.
+        * To limit disruptions, $parent is not bypassed. Tasks are fully
+        * prepped and then inserted back into $parent.
         */
        scx_bypass(sch, true);
        drain_descendants(sch);
@@ -6580,6 +6595,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
        return 0;
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+       struct scx_sched *sch;
+
+       guard(rcu)();
+       sch = scx_prog_sched(prog->aux);
+       if (unlikely(!sch))
+               return;
+
+       scx_error(sch, "dispatch recursion detected");
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
 static int bpf_scx_check_member(const struct btf_type *t,
                                const struct btf_member *member,
                                const struct bpf_prog *prog)
@@ -6605,6 +6634,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
                        return -EINVAL;
        }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+       /*
+        * Enable private stack for operations that can nest along the
+        * hierarchy.
+        *
+        * XXX - Ideally, we should only do this for scheds that allow
+        * sub-scheds and sub-scheds themselves but I don't know how to access
+        * struct_ops from here.
+        */
+       switch (moff) {
+       case offsetof(struct sched_ext_ops, dispatch):
+               prog->aux->priv_stack_requested = true;
+               prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
+       }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
        return 0;
 }
 
@@ -7583,6 +7628,48 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
                            p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux: implicit argument, hidden from BPF progs, identifying the caller
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from its dispatch queues to the local DSQ.
+ *
+ * Returns: true on success, false if @cgroup_id is invalid, does not name a
+ * direct child, or the caller is not in a dispatch operation.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
+{
+       struct rq *this_rq = this_rq();
+       struct scx_sched *parent, *child;
+
+       guard(rcu)();
+       parent = scx_prog_sched(aux);
+       if (unlikely(!parent))
+               return false;
+
+       if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
+               return false;
+
+       child = scx_find_sub_sched(cgroup_id);
+
+       if (unlikely(!child))
+               return false;
+
+       if (unlikely(scx_parent(child) != parent)) {
+               scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+                         cgroup_id);
+               return false;
+       }
+
+       return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+                                 true);
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
@@ -7593,6 +7680,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+#ifdef CONFIG_EXT_SUB_SCHED
+BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
+#endif
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
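
Note on KF_IMPLICIT_ARGS above: the trailing bpf_prog_aux argument of
scx_bpf_sub_dispatch() is injected by the verifier, so BPF programs pass only
the cgroup ID. The two views of the same kfunc:

  /* kernel side: @aux supplied implicitly */
  __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux);

  /* BPF program side, as declared in common.bpf.h below */
  bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;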
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f3b07872e15553d3d983d0ad9b7cf5c5cfb4e7f..ebe971d12cb8eb6a02d854de96d8f35c216aa988 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,6 +805,9 @@ struct scx_rq {
        cpumask_var_t           cpus_to_preempt;
        cpumask_var_t           cpus_to_wait;
        unsigned long           kick_sync;
+
+       struct task_struct      *sub_dispatch_prev;
+
        struct llist_head       deferred_reenq_locals;
        struct balance_callback deferred_bal_cb;
        struct irq_work         deferred_irq_work;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 821d5791bd4229cb91cb141e85f75602128e042a..eba4d87345e06acaca2ba3f60212a38dbf501654 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -101,6 +101,7 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
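
Since the declaration is __weak, a scheduler that should also load on kernels
lacking this kfunc can gate the call; a minimal sketch using the standard
bpf_ksym_exists() helper from bpf_helpers.h (cgrp_id is hypothetical):

  if (bpf_ksym_exists(scx_bpf_sub_dispatch) && scx_bpf_sub_dispatch(cgrp_id))
          return;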
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index ff6ff34177ab340a74efd67474034383250a09b4..91b8eac83f527b86a10a8ec38d71eaa01c2fd46f 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
 u64 nr_highpri_queued;
 u32 test_error_cnt;
 
+#define MAX_SUB_SCHEDS         8
+u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+
 UEI_DEFINE(uei);
 
 struct qmap {
@@ -451,6 +454,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
                cpuc->dsp_cnt = 0;
        }
 
+       for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+               if (sub_sched_cgroup_ids[i] &&
+                   scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+                       return;
+       }
+
        /*
         * No other tasks. @prev will keep running. Update its core_sched_seq as
         * if the task were enqueued and dispatched immediately.
@@ -895,7 +904,32 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 
 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
 {
-       return 0;
+       s32 i;
+
+       for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+               if (!sub_sched_cgroup_ids[i]) {
+                       sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+                       bpf_printk("attaching sub-sched[%d] on %s",
+                                  i, args->cgroup_path);
+                       return 0;
+               }
+       }
+
+       return -ENOSPC;
+}
+
+void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
+{
+       s32 i;
+
+       for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+               if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+                       sub_sched_cgroup_ids[i] = 0;
+                       bpf_printk("detaching sub-sched[%d] on %s",
+                                  i, args->cgroup_path);
+                       break;
+               }
+       }
 }
 
 SCX_OPS_DEFINE(qmap_ops,
@@ -914,6 +948,7 @@ SCX_OPS_DEFINE(qmap_ops,
               .cgroup_set_weight       = (void *)qmap_cgroup_set_weight,
               .cgroup_set_bandwidth    = (void *)qmap_cgroup_set_bandwidth,
               .sub_attach              = (void *)qmap_sub_attach,
+              .sub_detach              = (void *)qmap_sub_detach,
               .cpu_online              = (void *)qmap_cpu_online,
               .cpu_offline             = (void *)qmap_cpu_offline,
               .init                    = (void *)qmap_init,