#include <linux/btf_ids.h>
#include "ext_idle.h"
+static DEFINE_RAW_SPINLOCK(scx_sched_lock);
+
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
*/
static struct scx_sched __rcu *scx_root;
+/*
+ * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
+ * Readers can hold either or rcu_read_lock().
+ */
+static LIST_HEAD(scx_sched_all);
+
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
static bool task_dead_and_done(struct task_struct *p);
static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
+static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
return (s32)(a - b) < 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_parent - Return the parent scheduler of @sch
+ * @sch: sched whose parent to look up
+ *
+ * The root sched sits at level 0 and has no parent; every other sched
+ * records its full ancestry in ->ancestors[], indexed by level.
+ *
+ * Returns the parent scheduler, or %NULL if @sch is the root.
+ */
+static struct scx_sched *scx_parent(struct scx_sched *sch)
+{
+	return sch->level ? sch->ancestors[sch->level - 1] : NULL;
+}
+
+/**
+ * scx_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sched whose descendants to walk
+ *
+ * To be used by scx_for_each_descendant_pre(). Find the next descendant to
+ * visit for pre-order traversal of @root's descendants. @root is included in
+ * the iteration and the first node to be visited.
+ *
+ * Returns the next sched to visit, or %NULL when the walk is complete.
+ */
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
+						 struct scx_sched *root)
+{
+	struct scx_sched *next;
+
+	/* the sched tree topology is only stable while either lock is held */
+	lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
+		       lockdep_is_held(&scx_sched_lock));
+
+	/* if first iteration, visit @root */
+	if (!pos)
+		return root;
+
+	/* visit the first child if exists */
+	next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
+	if (next)
+		return next;
+
+	/* no child, visit my or the closest ancestor's next sibling */
+	while (pos != root) {
+		if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
+			return list_next_entry(pos, sibling);
+		pos = scx_parent(pos);
+	}
+
+	/* climbed back to @root without finding a sibling: walk is done */
+	return NULL;
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static struct scx_sched *scx_parent(struct scx_sched *sch)
+{
+	/* no sub-sched support: every sched is a root without a parent */
+	return NULL;
+}
+
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
+						 struct scx_sched *root)
+{
+	/* single-node tree: visit @root once, then terminate */
+	return pos ? NULL : root;
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+/**
+ * scx_is_descendant - Is @sch inside @ancestor's subtree?
+ * @sch: sched to test
+ * @ancestor: candidate ancestor sched
+ *
+ * A sched counts as its own descendant. No list walking is performed;
+ * only @sch's precomputed ->ancestors[] table is consulted.
+ */
+static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
+{
+	/* short-circuit keeps the ancestors[] access in bounds */
+	return sch->level >= ancestor->level &&
+		sch->ancestors[ancestor->level] == ancestor;
+}
+
+/**
+ * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
+ * @pos: iteration cursor
+ * @root: sched to walk the descendants of
+ *
+ * Walk @root's descendants. @root is included in the iteration and the first
+ * node to be visited. Must be called with either scx_enable_mutex or
+ * scx_sched_lock held. The walk is not removal-safe: @pos must remain
+ * linked until the next iteration step dereferences it.
+ */
+#define scx_for_each_descendant_pre(pos, root)				\
+	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);	\
+	     (pos) = scx_next_descendant_pre((pos), (root)))
+
static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
struct task_struct *p)
{
struct rq_flags rf;
u32 cnt;
bool list_locked;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
struct cgroup *cgrp;
struct cgroup_subsys_state *css_pos;
struct css_task_iter css_iter;
{
memset(iter, 0, sizeof(*iter));
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
if (cgrp) {
lockdep_assert_held(&cgroup_mutex);
iter->cgrp = cgrp;
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
if (iter->cgrp) {
if (iter->css_pos)
css_task_iter_end(&iter->css_iter);
cond_resched();
}
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
if (iter->cgrp) {
while (iter->css_pos) {
struct task_struct *p;
scx_set_task_state(p, SCX_TASK_INIT);
if (p->scx.disallow) {
- if (unlikely(fork)) {
+ if (unlikely(scx_parent(sch))) {
+ scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
+ p->comm, p->pid);
+ } else if (unlikely(fork)) {
scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
p->comm, p->pid);
} else {
percpu_up_read(&scx_cgroup_ops_rwsem);
}
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+/* the cgroup the root sched covers: the default hierarchy's root */
+static struct cgroup *root_cgroup(void)
+{
+	return &cgrp_dfl_root.cgrp;
+}
+
+/*
+ * cgroup @sch is attached to. NOTE(review): ->cgrp is only visibly
+ * assigned under CONFIG_EXT_SUB_SCHED — confirm it is also initialized
+ * when only CONFIG_EXT_GROUP_SCHED is enabled.
+ */
+static struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+	return sch->cgrp;
+}
+
+/*
+ * For each descendant of @cgrp including self, set ->scx_sched to @sch.
+ * All visible callers run inside scx_cgroup_lock(), i.e. with
+ * cgroup_lock() held, which keeps the descendant walk stable.
+ */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+	struct cgroup *pos;
+	struct cgroup_subsys_state *css;
+
+	cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+		rcu_assign_pointer(pos->scx_sched, sch);
+}
+/*
+ * Lock order: scx_cgroup_ops_rwsem (write, when group sched is built in)
+ * nests outside cgroup_mutex; scx_cgroup_unlock() releases in reverse.
+ */
static void scx_cgroup_lock(void)
{
+#ifdef CONFIG_EXT_GROUP_SCHED
	percpu_down_write(&scx_cgroup_ops_rwsem);
+#endif
	cgroup_lock();
}
+/* counterpart of scx_cgroup_lock(); drops cgroup_mutex before the rwsem */
static void scx_cgroup_unlock(void)
{
	cgroup_unlock();
+#ifdef CONFIG_EXT_GROUP_SCHED
	percpu_up_write(&scx_cgroup_ops_rwsem);
+#endif
}
-
-#else /* CONFIG_EXT_GROUP_SCHED */
-
+#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+/* cgroup integration compiled out — scheds have no associated cgroups */
+static struct cgroup *root_cgroup(void) { return NULL; }
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
static void scx_cgroup_lock(void) {}
static void scx_cgroup_unlock(void) {}
+#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
/*
* Omitted operations:
#endif
};
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+/*
+ * Initialize @dsq with @dsq_id and associate it with @sch. @sch may be
+ * NULL for the per-rq local/bypass DSQs, whose owning sched is filled in
+ * later when the root sched is enabled.
+ */
+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
+		     struct scx_sched *sch)
{
	memset(dsq, 0, sizeof(*dsq));
	raw_spin_lock_init(&dsq->lock);
	INIT_LIST_HEAD(&dsq->list);
	dsq->id = dsq_id;
+	dsq->sched = sch;
}
static void free_dsq_irq_workfn(struct irq_work *irq_work)
irq_work_sync(&sch->error_irq_work);
kthread_destroy_worker(sch->helper);
+#ifdef CONFIG_EXT_SUB_SCHED
+ kfree(sch->cgrp_path);
+ if (sch_cgroup(sch))
+ cgroup_put(sch_cgroup(sch));
+#endif /* CONFIG_EXT_SUB_SCHED */
+
free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
return "unregistered from the main kernel";
case SCX_EXIT_SYSRQ:
return "disabled by sysrq-S";
+ case SCX_EXIT_PARENT:
+ return "parent exiting";
case SCX_EXIT_ERROR:
return "runtime error";
case SCX_EXIT_ERROR_BPF:
}
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/* woken by scx_sub_disable() whenever a child unlinks from its parent */
+static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+static void drain_descendants(struct scx_sched *sch)
+{
+	/*
+	 * Child scheds that finished the critical part of disabling will take
+	 * themselves off @sch->children. Wait for it to drain. As propagation
+	 * is recursive, empty @sch->children means that all proper descendant
+	 * scheds reached unlinking stage.
+	 *
+	 * Sleeps; must be called from process context with no spinlocks held.
+	 */
+	wait_event(scx_unlink_waitq, list_empty(&sch->children));
+}
+
+/*
+ * Disable and tear down sub-scheduler @sch. Sequence: wait for our own
+ * descendants to unlink, hand @sch's cgroup subtree back to @parent under
+ * the enable mutex / fork rwsem / cgroup locks, unlink @sch from the
+ * parent's children list and the global list, and only after all locks
+ * are dropped notify waiters and call ops.sub_detach()/exit().
+ */
+static void scx_sub_disable(struct scx_sched *sch)
+{
+	struct scx_sched *parent = scx_parent(sch);
+
+	drain_descendants(sch);
+
+	mutex_lock(&scx_enable_mutex);
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
+
+	/* descendants of our cgroup fall back to being covered by @parent */
+	set_cgroup_sched(sch_cgroup(sch), parent);
+
+	/* TODO - perform actual disabling here */
+
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+
+	raw_spin_lock_irq(&scx_sched_lock);
+	list_del_init(&sch->sibling);
+	list_del_rcu(&sch->all);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
+	mutex_unlock(&scx_enable_mutex);
+
+	/*
+	 * @sch is now unlinked from the parent's children list. Notify and call
+	 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+	 * after unlinking and releasing all locks. See scx_claim_exit().
+	 */
+	wake_up_all(&scx_unlink_waitq);
+
+	/*
+	 * NOTE(review): relies on @parent outliving this call — presumably
+	 * guaranteed because parents drain their children before exiting
+	 * (drain_descendants()); confirm.
+	 */
+	if (sch->ops.sub_detach && sch->sub_attached) {
+		struct scx_sub_detach_args sub_detach_args = {
+			.ops = &sch->ops,
+			.cgroup_path = sch->cgrp_path,
+		};
+		SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+			    &sub_detach_args);
+	}
+
+	if (sch->ops.exit)
+		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+	kobject_del(&sch->kobj);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+/* without sub-sched support there is nothing below the root to drain */
+static void drain_descendants(struct scx_sched *sch) { }
+static void scx_sub_disable(struct scx_sched *sch) { }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static void scx_root_disable(struct scx_sched *sch)
{
struct scx_exit_info *ei = sch->exit_info;
struct task_struct *p;
int cpu;
- /* guarantee forward progress by bypassing scx_ops */
+ /* guarantee forward progress and wait for descendants to be disabled */
scx_bypass(true);
WRITE_ONCE(scx_aborting, false);
+ drain_descendants(sch);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING:
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
+
+ scx_cgroup_lock();
+ set_cgroup_sched(sch_cgroup(sch), NULL);
+ scx_cgroup_unlock();
+
percpu_up_write(&scx_fork_rwsem);
/*
cancel_delayed_work_sync(&scx_watchdog_work);
+ raw_spin_lock_irq(&scx_sched_lock);
+ list_del_rcu(&sch->all);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
/*
* scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug().
* successfully reach scx_bypass().
*/
WRITE_ONCE(scx_aborting, true);
+
+ /*
+ * Propagate exits to descendants immediately. Each has a dedicated
+ * helper kthread and can run in parallel. While most of disabling is
+ * serialized, running them in separate threads allows parallelizing
+ * ops.exit(), which can take arbitrarily long prolonging bypass mode.
+ *
+ * This doesn't cause recursions as propagation only takes place for
+ * non-propagation exits.
+ */
+ if (kind != SCX_EXIT_PARENT) {
+ scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) {
+ struct scx_sched *pos;
+ scx_for_each_descendant_pre(pos, sch)
+ scx_disable(pos, SCX_EXIT_PARENT);
+ }
+ }
+
return true;
}
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- scx_root_disable(sch);
+ if (scx_parent(sch))
+ scx_sub_disable(sch);
+ else
+ scx_root_disable(sch);
}
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
return 0;
}
-static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
+ struct cgroup *cgrp,
+ struct scx_sched *parent)
{
struct scx_sched *sch;
+ s32 level = parent ? parent->level + 1 : 0;
int node, ret;
- sch = kzalloc_obj(*sch);
+ sch = kzalloc_flex(*sch, ancestors, level);
if (!sch)
return ERR_PTR(-ENOMEM);
goto err_free_gdsqs;
}
- init_dsq(dsq, SCX_DSQ_GLOBAL);
+ init_dsq(dsq, SCX_DSQ_GLOBAL, sch);
sch->global_dsqs[node] = dsq;
}
sched_set_fifo(sch->helper->task);
+ if (parent)
+ memcpy(sch->ancestors, parent->ancestors,
+ level * sizeof(parent->ancestors[0]));
+ sch->ancestors[level] = sch;
+ sch->level = level;
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
ops->priv = sch;
sch->kobj.kset = scx_kset;
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto err_stop_helper;
+ cgroup_path(cgrp, buf, PATH_MAX);
+ sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
+ kfree(buf);
+ if (!sch->cgrp_path)
+ goto err_stop_helper;
+
+ sch->cgrp = cgrp;
+ INIT_LIST_HEAD(&sch->children);
+ INIT_LIST_HEAD(&sch->sibling);
+
+ if (parent)
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
+ &parent->sub_kset->kobj,
+ "sub-%llu", cgroup_id(cgrp));
+ else
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+
+ if (ret < 0) {
+ kfree(sch->cgrp_path);
+ goto err_stop_helper;
+ }
+
+ if (ops->sub_attach) {
+ sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
+ if (!sch->sub_kset) {
+ kobject_put(&sch->kobj);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+#else /* CONFIG_EXT_SUB_SCHED */
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
goto err_stop_helper;
-
+#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
err_stop_helper:
if (ret)
goto err_unlock;
- sch = scx_alloc_and_add_sched(ops);
+ sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
goto err_free_ksyncs;
atomic_long_set(&scx_nr_rejected, 0);
- for_each_possible_cpu(cpu)
- cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->scx.local_dsq.sched = sch;
+ rq->scx.bypass_dsq.sched = sch;
+ rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ }
/*
* Keep CPUs stable during enable so that the BPF scheduler can track
*/
rcu_assign_pointer(scx_root, sch);
+ raw_spin_lock_irq(&scx_sched_lock);
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
scx_idle_enable(ops);
if (sch->ops.init) {
* never sees uninitialized tasks.
*/
scx_cgroup_lock();
+ set_cgroup_sched(sch_cgroup(sch), sch);
ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
cmd->ret = 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/*
+ * Verify that a scheduler can be attached to @cgrp and return the would-be
+ * parent sched. Called with scx_sched_lock held; the caller must take its
+ * own reference on the returned sched before dropping the lock.
+ *
+ * Returns the parent on success, ERR_PTR(-EBUSY) if @cgrp already has a
+ * sched attached or attaching would split an exiting child from its
+ * parent, and ERR_PTR(-EOPNOTSUPP) if the parent rejects sub-scheds.
+ */
+static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
+{
+	struct scx_sched *parent = cgrp->scx_sched;
+	struct scx_sched *pos;
+
+	lockdep_assert_held(&scx_sched_lock);
+
+	/* can't attach twice to the same cgroup */
+	if (parent->cgrp == cgrp)
+		return ERR_PTR(-EBUSY);
+
+	/* does $parent allow sub-scheds? */
+	if (!parent->ops.sub_attach)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/* can't insert between $parent and its exiting children */
+	list_for_each_entry(pos, &parent->children, sibling)
+		if (cgroup_is_descendant(pos->cgrp, cgrp))
+			return ERR_PTR(-EBUSY);
+
+	return parent;
+}
+
+/*
+ * Enable a sub-scheduler attached to the cgroup identified by
+ * ops->sub_cgroup_id. Runs from the shared enable helper kthread; the
+ * outcome is reported back through cmd->ret.
+ */
+static void scx_sub_enable_workfn(struct kthread_work *work)
+{
+	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
+	struct sched_ext_ops *ops = cmd->ops;
+	struct cgroup *cgrp;
+	struct scx_sched *parent, *sch;
+	s32 ret;
+
+	mutex_lock(&scx_enable_mutex);
+
+	/* sub-scheds require an enabled root sched */
+	if (!scx_enabled()) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	/* pin the target cgroup for the duration of the attach attempt */
+	cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
+	if (IS_ERR(cgrp)) {
+		ret = PTR_ERR(cgrp);
+		goto out_unlock;
+	}
+
+	raw_spin_lock_irq(&scx_sched_lock);
+	parent = find_parent_sched(cgrp);
+	if (IS_ERR(parent)) {
+		raw_spin_unlock_irq(&scx_sched_lock);
+		ret = PTR_ERR(parent);
+		goto out_put_cgrp;
+	}
+	/* pin @parent across the unlocked allocation below */
+	kobject_get(&parent->kobj);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
+	sch = scx_alloc_and_add_sched(ops, cgrp, parent);
+	kobject_put(&parent->kobj);
+	if (IS_ERR(sch)) {
+		ret = PTR_ERR(sch);
+		goto out_put_cgrp;
+	}
+
+	raw_spin_lock_irq(&scx_sched_lock);
+	list_add_tail(&sch->sibling, &parent->children);
+	list_add_tail_rcu(&sch->all, &scx_sched_all);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
+	/* linked from here on; failures below unwind via the disable path */
+	if (sch->level >= SCX_SUB_MAX_DEPTH) {
+		scx_error(sch, "max nesting depth %d violated",
+			  SCX_SUB_MAX_DEPTH);
+		goto err_disable;
+	}
+
+	if (sch->ops.init) {
+		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+		if (ret) {
+			ret = ops_sanitize_err(sch, "init", ret);
+			scx_error(sch, "ops.init() failed (%d)", ret);
+			goto err_disable;
+		}
+		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+	}
+
+	if (validate_ops(sch, ops))
+		goto err_disable;
+
+	/* give the parent a chance to accept or reject the new child */
+	struct scx_sub_attach_args sub_attach_args = {
+		.ops = &sch->ops,
+		.cgroup_path = sch->cgrp_path,
+	};
+
+	ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL,
+			      &sub_attach_args);
+	if (ret) {
+		ret = ops_sanitize_err(sch, "sub_attach", ret);
+		scx_error(sch, "parent rejected (%d)", ret);
+		goto err_disable;
+	}
+	sch->sub_attached = true;
+
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
+
+	/*
+	 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
+	 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down.
+	 */
+	set_cgroup_sched(sch_cgroup(sch), sch);
+	if (!(cgrp->self.flags & CSS_ONLINE)) {
+		scx_error(sch, "cgroup is not online");
+		goto err_unlock_and_disable;
+	}
+
+	/* TODO - perform actual enabling here */
+
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+
+	pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
+	kobject_uevent(&sch->kobj, KOBJ_ADD);
+	ret = 0;
+	goto out_unlock;
+
+out_put_cgrp:
+	/*
+	 * NOTE(review): if scx_alloc_and_add_sched() fails after installing
+	 * @cgrp in the sched (e.g. the sub_kset creation failure path does
+	 * kobject_put() whose release drops the cgroup ref), this put would
+	 * be a double put — verify the alloc error paths.
+	 */
+	cgroup_put(cgrp);
+out_unlock:
+	mutex_unlock(&scx_enable_mutex);
+	cmd->ret = ret;
+	return;
+
+err_unlock_and_disable:
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+err_disable:
+	mutex_unlock(&scx_enable_mutex);
+	kthread_flush_work(&sch->disable_work);
+	/*
+	 * NOTE(review): 0 rather than @ret — presumably so the failure is
+	 * delivered with full details through ops.exit() instead of the
+	 * attach return value; confirm this matches the root enable path.
+	 */
+	cmd->ret = 0;
+}
+
+/*
+ * Keep cgroup->scx_sched pointers coherent across cgroup creation and
+ * destruction. New cgroups inherit the covering sched from their parent;
+ * a cgroup going offline takes any sched attached directly to it down.
+ */
+static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb,
+				      unsigned long action, void *data)
+{
+	struct cgroup *cgrp = data;
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	/* sub-scheds only operate on the default hierarchy */
+	if (!cgroup_on_dfl(cgrp))
+		return NOTIFY_OK;
+
+	switch (action) {
+	case CGROUP_LIFETIME_ONLINE:
+		/* inherit ->scx_sched from $parent */
+		if (parent)
+			rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
+		break;
+	case CGROUP_LIFETIME_OFFLINE:
+		/* if there is a sched attached, shoot it down */
+		if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
+			scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
+				 SCX_ECODE_RSN_CGROUP_OFFLINE,
+				 "cgroup %llu going offline", cgroup_id(cgrp));
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block scx_cgroup_lifetime_nb = {
+	.notifier_call = scx_cgroup_lifetime_notify,
+};
+
+/* register for cgroup lifetime events early, before any sched can load */
+static s32 __init scx_cgroup_lifetime_notifier_init(void)
+{
+	return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+						&scx_cgroup_lifetime_nb);
+}
+core_initcall(scx_cgroup_lifetime_notifier_init);
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
static struct kthread_worker *helper;
mutex_unlock(&helper_mutex);
}
- kthread_init_work(&cmd.work, scx_root_enable_workfn);
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (ops->sub_cgroup_id > 1)
+ kthread_init_work(&cmd.work, scx_sub_enable_workfn);
+ else
+#endif /* CONFIG_EXT_SUB_SCHED */
+ kthread_init_work(&cmd.work, scx_root_enable_workfn);
cmd.ops = ops;
kthread_queue_work(READ_ONCE(helper), &cmd.work);
case offsetof(struct sched_ext_ops, hotplug_seq):
ops->hotplug_seq = *(u64 *)(udata + moff);
return 1;
+#ifdef CONFIG_EXT_SUB_SCHED
+ case offsetof(struct sched_ext_ops, sub_cgroup_id):
+ ops->sub_cgroup_id = *(u64 *)(udata + moff);
+ return 1;
+#endif /* CONFIG_EXT_SUB_SCHED */
}
return 0;
case offsetof(struct sched_ext_ops, cpu_offline):
case offsetof(struct sched_ext_ops, init):
case offsetof(struct sched_ext_ops, exit):
+ case offsetof(struct sched_ext_ops, sub_attach):
+ case offsetof(struct sched_ext_ops, sub_detach):
break;
default:
if (prog->sleepable)
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
-#endif
+#endif /* CONFIG_EXT_GROUP_SCHED */
+static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
+static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {}
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
static s32 sched_ext_ops__init(void) { return -EINVAL; }
.cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
.cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
#endif
+ .sub_attach = sched_ext_ops__sub_attach,
+ .sub_detach = sched_ext_ops__sub_detach,
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
.init = sched_ext_ops__init,
struct rq *rq = cpu_rq(cpu);
int n = cpu_to_node(cpu);
- init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
- init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
+ /* local/bypass dsq's sch will be set during scx_root_enable() */
+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
+ init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL);
+
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
if (!dsq)
return -ENOMEM;
- init_dsq(dsq, dsq_id);
-
rcu_read_lock();
sch = rcu_dereference(scx_root);
- if (sch)
+ if (sch) {
+ init_dsq(dsq, dsq_id, sch);
ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
dsq_hash_params);
- else
+ } else {
ret = -ENODEV;
+ }
rcu_read_unlock();
if (ret)