sched_ext: Fix starvation of scx_enable() under fair-class saturation
author    Tejun Heo <tj@kernel.org>
          Tue, 3 Mar 2026 11:01:15 +0000 (01:01 -1000)
committer Tejun Heo <tj@kernel.org>
          Tue, 3 Mar 2026 21:10:40 +0000 (11:10 -1000)
During scx_enable(), the READY -> ENABLED task switching loop changes the
calling thread's sched_class from fair to ext. Since fair has higher
priority than ext, saturating fair-class workloads can indefinitely starve
the enable thread, hanging the system. This was introduced when the enable
path switched from preempt_disable() to scx_bypass(), which doesn't protect
against fair-class starvation. Note that the original preempt_disable()
protection wasn't complete either - in partial switch modes, the calling
thread could still be starved after preempt_enable() as it may have been
switched to ext class.
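
To make the failure mode concrete, here is a minimal userspace sketch - not
kernel code, and every identifier is illustrative. SCHED_OTHER spinners stand
in for the saturating fair-class load and SCHED_IDLE approximates the ext
class (ext actually sits strictly below fair, so the real starvation is
total). The victim demotes itself mid-loop, the way the switching loop
demotes the calling thread:

/* starve.c - build: gcc -O2 -pthread starve.c */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_long progress;

static void *spinner(void *arg)         /* the saturating fair-class load */
{
        for (;;)
                ;
}

static void *victim(void *arg)          /* the thread running the switch loop */
{
        struct sched_param sp = { .sched_priority = 0 };
        int i;

        for (i = 0; i < 100; i++) {
                if (i == 50)
                        /*
                         * The READY -> ENABLED loop reaches the calling
                         * thread and moves it off the fair class. SCHED_IDLE
                         * only approximates ext here; ext sits strictly
                         * below fair, so the real stall is indefinite.
                         */
                        pthread_setschedparam(pthread_self(), SCHED_IDLE, &sp);
                atomic_fetch_add(&progress, 1);
                usleep(1000);
        }
        return NULL;
}

int main(void)
{
        long n = sysconf(_SC_NPROCESSORS_ONLN);
        pthread_t t;
        int i;

        pthread_create(&t, NULL, victim, NULL);
        while (n-- > 0)
                pthread_create(&t, NULL, spinner, NULL);

        for (i = 0; i < 5; i++) {
                sleep(1);
                /* Progress plateaus near 50 once every CPU is saturated. */
                printf("victim progress: %ld\n", atomic_load(&progress));
        }
        return 0;
}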

Fix it by offloading the enable body to a dedicated system-wide RT
(SCHED_FIFO) kthread, which cannot be starved by either fair or ext class
tasks. scx_enable() lazily creates the kthread on first use and passes the
ops pointer through a struct scx_enable_cmd containing the kthread_work,
then synchronously waits for completion.
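
The queue-and-wait shape is easiest to see outside the kernel. Below is a
hedged userspace sketch of the same pattern using POSIX threads - a stand-in
for the kthread_worker API, not the kernel implementation, with all names
invented for the demo. A persistent SCHED_FIFO worker executes a command
struct that carries both the request and the result, and the caller blocks
until it completes, as the reworked scx_enable() does via
kthread_flush_work(). RT sits above both fair and ext, so the worker keeps
running no matter what the workload does:

/* offload.c - build: gcc -O2 -pthread offload.c; run as root so
 * SCHED_FIFO actually applies. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct cmd {                            /* mirrors struct scx_enable_cmd */
        int (*fn)(void *);              /* offloaded body */
        void *arg;
        int ret;                        /* result handed back to the caller */
        int done;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static struct cmd *pending;

static void *worker(void *unused)       /* plays the dedicated RT kthread */
{
        struct sched_param sp = { .sched_priority = 1 };

        /* SCHED_FIFO: saturating SCHED_OTHER load cannot starve us. */
        pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);

        pthread_mutex_lock(&lock);
        for (;;) {
                while (!pending)
                        pthread_cond_wait(&cv, &lock);
                struct cmd *c = pending;

                pending = NULL;
                pthread_mutex_unlock(&lock);
                c->ret = c->fn(c->arg); /* run the offloaded body */
                pthread_mutex_lock(&lock);
                c->done = 1;            /* completion, ~ kthread_flush_work() */
                pthread_cond_broadcast(&cv);
        }
}

/* Queue a command and wait synchronously: the new shape of scx_enable(). */
static int run_on_worker(int (*fn)(void *), void *arg)
{
        struct cmd c = { .fn = fn, .arg = arg };

        pthread_mutex_lock(&lock);
        pending = &c;
        pthread_cond_broadcast(&cv);
        while (!c.done)
                pthread_cond_wait(&cv, &lock);
        pthread_mutex_unlock(&lock);
        return c.ret;
}

static int body(void *arg)
{
        printf("enable body ran on the worker\n");
        return 0;
}

int main(void)
{
        pthread_t t;

        /* The real code creates the worker lazily on first use. */
        pthread_create(&t, NULL, worker, NULL);
        return run_on_worker(body, NULL);
}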

The workfn runs on a different kthread from sch->helper (which runs
disable_work), so it can safely flush disable_work on the error path
without deadlock.
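
The reason the separate kthread matters: a kthread_worker executes its queued
works serially on a single thread, so a work that flushed another work queued
on its own worker would wait on a thread that is busy running itself. A
minimal pthread sketch of the two-worker arrangement - identifiers are
illustrative; only the disable_work/helper roles come from the patch:

/* flushdemo.c - build: gcc -O2 -pthread flushdemo.c */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t c = PTHREAD_COND_INITIALIZER;
static int disable_queued, disable_done;

static void *disable_worker(void *arg)  /* plays sch->helper */
{
        pthread_mutex_lock(&m);
        while (!disable_queued)
                pthread_cond_wait(&c, &m);
        printf("disable_work ran\n");
        disable_done = 1;
        pthread_cond_broadcast(&c);
        pthread_mutex_unlock(&m);
        return NULL;
}

static void *enable_worker(void *arg)   /* plays the new enable helper */
{
        /*
         * Error path: queue disable_work, then flush it. This terminates
         * only because disable_work runs on a different thread. If this
         * thread also had to execute disable_work, the wait below would
         * block the very thread needed to satisfy it -- deadlock.
         */
        pthread_mutex_lock(&m);
        disable_queued = 1;
        pthread_cond_broadcast(&c);
        while (!disable_done)           /* ~ kthread_flush_work() */
                pthread_cond_wait(&c, &m);
        pthread_mutex_unlock(&m);
        printf("enable error path completed\n");
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, disable_worker, NULL);
        pthread_create(&b, NULL, enable_worker, NULL);
        pthread_join(b, NULL);
        pthread_join(a, NULL);
        return 0;
}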

Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode")
Cc: stable@vger.kernel.org # v6.12+
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/sched/ext.c

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index eab6e09b644254a36510d735b566b5f8a6c47928..ba51969718f5eea870254efaf8efe1ee0f9e44a2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4975,20 +4975,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
        return 0;
 }
 
-static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ */
+struct scx_enable_cmd {
+       struct kthread_work     work;
+       struct sched_ext_ops    *ops;
+       int                     ret;
+};
+
+static void scx_enable_workfn(struct kthread_work *work)
 {
+       struct scx_enable_cmd *cmd =
+               container_of(work, struct scx_enable_cmd, work);
+       struct sched_ext_ops *ops = cmd->ops;
        struct scx_sched *sch;
        struct scx_task_iter sti;
        struct task_struct *p;
        unsigned long timeout;
        int i, cpu, ret;
 
-       if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
-                          cpu_possible_mask)) {
-               pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
-               return -EINVAL;
-       }
-
        mutex_lock(&scx_enable_mutex);
 
        if (scx_enable_state() != SCX_DISABLED) {
@@ -5205,13 +5215,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
        atomic_long_inc(&scx_enable_seq);
 
-       return 0;
+       cmd->ret = 0;
+       return;
 
 err_free_ksyncs:
        free_kick_syncs();
 err_unlock:
        mutex_unlock(&scx_enable_mutex);
-       return ret;
+       cmd->ret = ret;
+       return;
 
 err_disable_unlock_all:
        scx_cgroup_unlock();
@@ -5230,7 +5242,41 @@ err_disable:
         */
        scx_error(sch, "scx_enable() failed (%d)", ret);
        kthread_flush_work(&sch->disable_work);
-       return 0;
+       cmd->ret = 0;
+}
+
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+       static struct kthread_worker *helper;
+       static DEFINE_MUTEX(helper_mutex);
+       struct scx_enable_cmd cmd;
+
+       if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+                          cpu_possible_mask)) {
+               pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+               return -EINVAL;
+       }
+
+       if (!READ_ONCE(helper)) {
+               mutex_lock(&helper_mutex);
+               if (!helper) {
+                       helper = kthread_run_worker(0, "scx_enable_helper");
+                       if (IS_ERR_OR_NULL(helper)) {
+                               helper = NULL;
+                               mutex_unlock(&helper_mutex);
+                               return -ENOMEM;
+                       }
+                       sched_set_fifo(helper->task);
+               }
+               mutex_unlock(&helper_mutex);
+       }
+
+       kthread_init_work(&cmd.work, scx_enable_workfn);
+       cmd.ops = ops;
+
+       kthread_queue_work(READ_ONCE(helper), &cmd.work);
+       kthread_flush_work(&cmd.work);
+       return cmd.ret;
 }