static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+#ifdef CONFIG_EXT_SUB_SCHED
+/*
+ * The sub-sched being enabled. Used by scx_disable_and_exit_task() to exit
+ * tasks for the sub-sched being enabled. Use a global variable instead of a
+ * per-task field as all enables are serialized.
+ */
+static struct scx_sched *scx_enabling_sub_sched;
+#else
+/* fully parenthesized so the cast binds correctly in any expression context */
+#define scx_enabling_sub_sched ((struct scx_sched *)NULL)
+#endif /* CONFIG_EXT_SUB_SCHED */
+
/*
* A monotically increasing sequence number that is incremented every time a
* scheduler is enabled. This can be used by to check if any custom sched_ext
{
__scx_disable_and_exit_task(sch, p);
+ /*
+ * If set, @p exited between __scx_init_task() and scx_enable_task() in
+ * scx_sub_enable() and is initialized for both the associated sched and
+ * its parent. Disable and exit for the child too.
+ */
+ if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
+ !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
+ __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+
scx_set_task_sched(p, NULL);
scx_set_task_state(p, SCX_TASK_NONE);
}
percpu_rwsem_assert_held(&scx_fork_rwsem);
if (scx_init_task_enabled) {
- ret = scx_init_task(scx_root, p, true);
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
+#else
+ struct scx_sched *sch = scx_root;
+#endif
+ ret = scx_init_task(sch, p, true);
if (!ret)
- scx_set_task_sched(p, scx_root);
+ scx_set_task_sched(p, sch);
return ret;
}
struct rq *rq = cpu_rq(cpu);
struct task_struct *p, *n;
+ raw_spin_lock(&scx_sched_lock);
raw_spin_rq_lock(rq);
- raw_spin_lock(&scx_sched_lock);
scx_for_each_descendant_pre(pos, sch) {
struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
else
pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
}
+
raw_spin_unlock(&scx_sched_lock);
/*
wait_event(scx_unlink_waitq, list_empty(&sch->children));
}
+/*
+ * scx_fail_parent - handle ops.init_task() failure on the parent while a
+ * sub-scheduler is being disabled
+ * @sch: the sub-scheduler being disabled
+ * @failed: the task whose ops.init_task() on the parent failed
+ * @fail_code: error code returned by the parent's ops.init_task()
+ *
+ * Reports the failure against $parent via scx_error(), puts $parent into
+ * bypass mode, and then walks every task in @sch's cgroup that is still on
+ * @sch, exiting it from @sch and reassigning it to $parent.
+ */
+static void scx_fail_parent(struct scx_sched *sch,
+			    struct task_struct *failed, s32 fail_code)
+{
+	struct scx_sched *parent = scx_parent(sch);
+	struct scx_task_iter sti;
+	struct task_struct *p;
+
+	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+		  fail_code, failed->comm, failed->pid);
+
+	/*
+	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+	 * it. This may cause downstream failures on the BPF side but $parent is
+	 * dying anyway.
+	 */
+	scx_bypass(parent, true);
+
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		/* already moved over (or never on @sch) - nothing to do */
+		if (scx_task_on_sched(parent, p))
+			continue;
+
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+			scx_disable_and_exit_task(sch, p);
+			rcu_assign_pointer(p->scx.sched, parent);
+		}
+	}
+	scx_task_iter_stop(&sti);
+}
+
static void scx_sub_disable(struct scx_sched *sch)
{
struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+	 * To limit disruptions, $parent is not bypassed. Tasks are fully
+	 * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
drain_descendants(sch);
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
mutex_lock(&scx_enable_mutex);
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
set_cgroup_sched(sch_cgroup(sch), parent);
- /* TODO - perform actual disabling here */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
+ break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
disable_bypass_dsp(sch);
raw_spin_lock_irq(&scx_sched_lock);
return parent;
}
+/*
+ * assert_task_ready_or_enabled - verify @p is in READY or ENABLED state
+ * @p: task whose scx state is checked
+ *
+ * Returns %true if @p's scx task state is %SCX_TASK_READY or
+ * %SCX_TASK_ENABLED. Any other state triggers a one-shot warning and returns
+ * %false, as such states are invalid while a sub-scheduler is being enabled.
+ */
+static bool assert_task_ready_or_enabled(struct task_struct *p)
+{
+	enum scx_task_state state = scx_get_task_state(p);
+
+	switch (state) {
+	case SCX_TASK_READY:
+	case SCX_TASK_ENABLED:
+		return true;
+	default:
+		WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched",
+			  state, p->comm, p->pid);
+		return false;
+	}
+}
+
static void scx_sub_enable_workfn(struct kthread_work *work)
{
struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
struct sched_ext_ops *ops = cmd->ops;
struct cgroup *cgrp;
struct scx_sched *parent, *sch;
- s32 ret;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ s32 i, ret;
mutex_lock(&scx_enable_mutex);
}
sch->sub_attached = true;
+ scx_bypass(sch, true);
+
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
goto err_unlock_and_disable;
}
- /* TODO - perform actual enabling here */
+ /*
+ * Initialize tasks for the new child $sch without exiting them for
+ * $parent so that the tasks can always be reverted back to $parent
+ * sched on child init failure.
+ */
+ WARN_ON_ONCE(scx_enabling_sub_sched);
+ scx_enabling_sub_sched = sch;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /*
+ * Task iteration may visit the same task twice when racing
+ * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
+ * finished __scx_init_task() and skip if set.
+ *
+ * A task may exit and get freed between __scx_init_task()
+ * completion and scx_enable_task(). In such cases,
+ * scx_disable_and_exit_task() must exit the task for both the
+ * parent and child scheds.
+ */
+ if (p->scx.flags & SCX_TASK_SUB_INIT)
+ continue;
+
+ /* see scx_root_enable() */
+ if (!tryget_task_struct(p))
+ continue;
+
+ if (!assert_task_ready_or_enabled(p)) {
+ ret = -EINVAL;
+ goto abort;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * As $p is still on $parent, it can't be transitioned to INIT.
+ * Let's worry about task state later. Use __scx_init_task().
+ */
+ ret = __scx_init_task(sch, p, false);
+ if (ret)
+ goto abort;
+
+ rq = task_rq_lock(p, &rf);
+ p->scx.flags |= SCX_TASK_SUB_INIT;
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ /*
+ * All tasks are prepped. Disable/exit tasks for $parent and enable for
+ * the new @sch.
+ */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
+ * duplicate iterations.
+ */
+ if (!(p->scx.flags & SCX_TASK_SUB_INIT))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p must be either READY or ENABLED. If ENABLED,
+			 * __scx_disable_and_exit_task() first disables and
+ * makes it READY. However, after exiting $p, it will
+ * leave $p as READY.
+ */
+ assert_task_ready_or_enabled(p);
+ __scx_disable_and_exit_task(parent, p);
+
+ /*
+ * $p is now only initialized for @sch and READY, which
+ * is what we want. Assign it to @sch and enable.
+ */
+ rcu_assign_pointer(p->scx.sched, sch);
+ scx_enable_task(sch, p);
+
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_enabling_sub_sched = NULL;
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ scx_bypass(sch, false);
+
pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
kobject_uevent(&sch->kobj, KOBJ_ADD);
ret = 0;
goto out_unlock;
+abort:
+ put_task_struct(p);
+ scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ __scx_disable_and_exit_task(sch, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
out_put_cgrp:
cgroup_put(cgrp);
out_unlock:
return;
err_unlock_and_disable:
+ /* we'll soon enter disable path, keep bypass on */
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
err_disable: