git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched_ext: Fix scx_flush_disable_work() UAF race
authorCheng-Yang Chou <yphbchou0911@gmail.com>
Tue, 28 Apr 2026 17:36:12 +0000 (01:36 +0800)
committerTejun Heo <tj@kernel.org>
Tue, 28 Apr 2026 17:40:03 +0000 (07:40 -1000)
scx_flush_disable_work() calls irq_work_sync() followed by
kthread_flush_work() to ensure that the disable kthread work has
fully completed before bpf_scx_unreg() frees the SCX scheduler.

However, a concurrent scx_vexit() (e.g., triggered by a watchdog stall)
creates a race window between scx_claim_exit() and irq_work_queue():

  CPU A (scx_vexit (watchdog))        CPU B (bpf_scx_unreg)
  ----                                ----
  scx_claim_exit()
    atomic_try_cmpxchg(NONE->kind)
  stack_trace_save()
  vscnprintf()
                                      scx_disable()
                                        scx_claim_exit() -> FAIL
                                      scx_flush_disable_work()
                                        irq_work_sync()      // no-op: not queued yet
                                        kthread_flush_work() // no-op: not queued yet
                                      kobject_put(&sch->kobj) -> free %sch
  irq_work_queue() -> UAF on %sch
  scx_disable_irq_workfn()
    kthread_queue_work() -> UAF

The root cause is that CPU B's scx_flush_disable_work() returns after
syncing an irq_work that has not yet been queued, while CPU A is still
executing the code between scx_claim_exit() and irq_work_queue().

Loop until exit_kind reaches SCX_EXIT_DONE or SCX_EXIT_NONE, draining
disable_irq_work and disable_work in each pass. This ensures that any
work queued after the previous check is caught, while also correctly
handling cases where no disable was triggered (e.g., the
scx_sub_enable_workfn() abort path).

Fixes: 510a27055446 ("sched_ext: sync disable_irq_work in bpf_scx_unreg()")
Reported-by: https://sashiko.dev/#/patchset/20260424100221.32407-1-icheng%40nvidia.com
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/sched/ext.c

index cac0b18239fea7b8a4e0283a3abfeeed51a72301..9483be03a4ca058a2c1bf4e08a7a4a60bcab6b62 100644 (file)
@@ -6039,8 +6039,13 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
  */
 static void scx_flush_disable_work(struct scx_sched *sch)
 {
-       irq_work_sync(&sch->disable_irq_work);
-       kthread_flush_work(&sch->disable_work);
+       int kind;
+
+       do {
+               irq_work_sync(&sch->disable_irq_work);
+               kthread_flush_work(&sch->disable_work);
+               kind = atomic_read(&sch->exit_kind);
+       } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE);
 }
 
 static void dump_newline(struct seq_buf *s)