sched_ext: defer queue_balance_callback() until after ops.dispatch
author    Emil Tsalapatis <etsal@meta.com>
          Fri, 10 Oct 2025 19:12:50 +0000 (12:12 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 1 Dec 2025 10:46:01 +0000 (11:46 +0100)
[ Upstream commit a8ad873113d3fe01f9b5d737d4b0570fa36826b0 ]

The sched_ext code calls queue_balance_callback() during enqueue_task()
to defer operations that must drop multiple locks until the rq lock can
be unpinned. The call assumes that the rq lock is held until the
callbacks are invoked, so the pending callbacks are never visible to any
other thread. This is enforced by a WARN_ON_ONCE() in rq_pin_lock().

However, balance_one() may actually drop the rq lock during a BPF
dispatch call. Another thread can then win the race for the rq lock and
observe the pending callback, triggering the warning. To avoid this,
sched_ext must only queue the callback after the dispatch calls have
completed:

CPU 0                     CPU 1                       CPU 2

scx_balance()
  rq_unpin_lock()
  scx_balance_one()
    |= IN_BALANCE         scx_enqueue()
    ops.dispatch()
      rq_unlock()
                          rq_lock()
                          queue_balance_callback()
                          rq_unlock()
                                                      [WARN] rq_pin_lock()
      rq_lock()
    &= ~IN_BALANCE
  rq_repin_lock()
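
The same deferral pattern can be illustrated outside the kernel: while a
lock-dropping window is open, racing paths only record a flag, and the
real callback is published once the lock is stably held again. The
following is a minimal userspace sketch of that idea, not the kernel
code; all names in it (toy_rq, dispatch_hook, ...) are hypothetical:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;
	bool in_balance;			/* cf. SCX_RQ_IN_BALANCE */
	bool cb_pending;			/* cf. SCX_RQ_BAL_CB_PENDING */
	void (*balance_cb)(struct toy_rq *);	/* the "queued" callback */
};

static void deferred_cb(struct toy_rq *rq)
{
	(void)rq;
	printf("deferred balance work runs with the lock held\n");
}

/* Called with rq->lock held; may run while balance() has dropped it. */
static void schedule_deferred(struct toy_rq *rq)
{
	if (rq->in_balance) {
		rq->cb_pending = true;	/* flag only, publish later */
		return;
	}
	rq->balance_cb = deferred_cb;	/* safe: no dispatch window open */
}

/* Stand-in for ops.dispatch(): may drop and retake the lock. */
static void dispatch_hook(struct toy_rq *rq)
{
	pthread_mutex_unlock(&rq->lock);
	/* A racing enqueue on another CPU could run here. */
	pthread_mutex_lock(&rq->lock);
	schedule_deferred(rq);		/* simulate that racing enqueue */
}

static void balance(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	rq->in_balance = true;
	dispatch_hook(rq);		/* lock-dropping window */
	rq->in_balance = false;

	/* cf. maybe_queue_balance_callback(): publish after the window. */
	if (rq->cb_pending) {
		rq->balance_cb = deferred_cb;
		rq->cb_pending = false;
	}
	if (rq->balance_cb) {		/* run callbacks before unlocking */
		rq->balance_cb(rq);
		rq->balance_cb = NULL;
	}
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER };

	balance(&rq);
	return 0;
}

In the actual fix below, the publish step is maybe_queue_balance_callback(),
called from balance_scx() after rq_repin_lock(), once ops.dispatch() can no
longer drop the rq lock.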

Changelog

v2 -> v1 (https://lore.kernel.org/sched-ext/aOgOxtHCeyRT_7jn@gpd4)

- Fixed explanation in patch description (Andrea)
- Fixed scx_rq mask state updates (Andrea)
- Added Reviewed-by tag from Andrea

Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Emil Tsalapatis (Meta) <emil@etsalapatis.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
kernel/sched/ext.c
kernel/sched/sched.h

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b454206100ce5ab2ece213932c6ed7ab483d4679..d6d2eea9d1483e37964e370e481135fc743e1592 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -820,13 +820,23 @@ static void schedule_deferred(struct rq *rq)
        if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
                return;
 
+       /* Don't do anything if there already is a deferred operation. */
+       if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+               return;
+
        /*
         * If in balance, the balance callbacks will be called before rq lock is
         * released. Schedule one.
+        *
+        * We can't directly insert the callback into the rq's list:
+        * balance_one() can drop the rq lock during a BPF dispatch call,
+        * making the pending balance callback visible to unrelated code
+        * paths that call rq_pin_lock().
+        *
+        * Just let balance_one() know that it must do it itself.
         */
        if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
-               queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
-                                      deferred_bal_cb_workfn);
+               rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
                return;
        }
 
@@ -2043,6 +2053,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
        dspc->cursor = 0;
 }
 
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+       lockdep_assert_rq_held(rq);
+
+       if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+               return;
+
+       queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+                               deferred_bal_cb_workfn);
+
+       rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
        struct scx_sched *sch = scx_root;
@@ -2190,6 +2213,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
 #endif
        rq_repin_lock(rq, rf);
 
+       maybe_queue_balance_callback(rq);
+
        return ret;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72fb9129afb6a8800e70d030c203e71e82c8d61e..c7f67f54d4e3ee2438b4777af63c11d621e22dee 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -782,6 +782,7 @@ enum scx_rq_flags {
        SCX_RQ_BAL_KEEP         = 1 << 3, /* balance decided to keep current */
        SCX_RQ_BYPASSING        = 1 << 4,
        SCX_RQ_CLK_VALID        = 1 << 5, /* RQ clock is fresh and valid */
+       SCX_RQ_BAL_CB_PENDING   = 1 << 6, /* must queue a cb after dispatching */
 
        SCX_RQ_IN_WAKEUP        = 1 << 16,
        SCX_RQ_IN_BALANCE       = 1 << 17,