git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched: Fix pick_next_task_fair() vs try_to_wake_up() race
author Peter Zijlstra <peterz@infradead.org>
Wed, 23 Oct 2024 09:36:41 +0000 (11:36 +0200)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 23 Oct 2024 18:52:26 +0000 (20:52 +0200)
Syzkaller robot reported KCSAN tripping over the
ASSERT_EXCLUSIVE_WRITER(p->on_rq) in __block_task().

The report noted that both pick_next_task_fair() and try_to_wake_up()
were concurrently trying to write to the same p->on_rq, violating the
assertion -- even though both paths hold rq->__lock.

The logical consequence is that both code paths end up holding a
different rq->__lock. And looking through ttwu(), this is possible
when the __block_task() 'p->on_rq = 0' store is visible to the ttwu()
'p->on_rq' load, which then assumes the task is not queued and
continues to migrate it.

Rearrange things such that __block_task() releases @p with the store
and no code thereafter will use @p again.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Reported-by: syzbot+0ec1e96c2cdf5c0e512a@syzkaller.appspotmail.com
Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Marco Elver <elver@google.com>
Link: https://lkml.kernel.org/r/20241023093641.GE16066@noisy.programming.kicks-ass.net
kernel/sched/fair.c
kernel/sched/sched.h

index c157d4860a3bf3118c8daa7eaea12dde63761d90..879614686188d3758215b7c32310c55398275b4a 100644 (file)
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
        struct sched_entity *se = pick_eevdf(cfs_rq);
        if (se->sched_delayed) {
                dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-               SCHED_WARN_ON(se->sched_delayed);
-               SCHED_WARN_ON(se->on_rq);
+               /*
+                * Must not reference @se again, see __block_task().
+                */
                return NULL;
        }
        return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
                /* Fix-up what dequeue_task_fair() skipped */
                hrtick_update(rq);
 
-               /* Fix-up what block_task() skipped. */
+               /*
+                * Fix-up what block_task() skipped.
+                *
+                * Must be last, @p might not be valid after this.
+                */
                __block_task(rq, p);
        }
 
@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
                util_est_dequeue(&rq->cfs, p);
 
-       if (dequeue_entities(rq, &p->se, flags) < 0) {
-               util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+       util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+       if (dequeue_entities(rq, &p->se, flags) < 0)
                return false;
-       }
 
-       util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+       /*
+        * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+        */
+
        hrtick_update(rq);
        return true;
 }
index 081519ffab46405e3f1ea97b0f00414bc11915d4..9f9d1cc390b124954f5a2e294fa8f78e25125d54 100644 (file)
@@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 
 static inline void __block_task(struct rq *rq, struct task_struct *p)
 {
-       WRITE_ONCE(p->on_rq, 0);
-       ASSERT_EXCLUSIVE_WRITER(p->on_rq);
        if (p->sched_contributes_to_load)
                rq->nr_uninterruptible++;
 
@@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
                atomic_inc(&rq->nr_iowait);
                delayacct_blkio_start();
        }
+
+       ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+       /*
+        * The moment this write goes through, ttwu() can swoop in and migrate
+        * this task, rendering our rq->__lock ineffective.
+        *
+        * __schedule()                         try_to_wake_up()
+        *   LOCK rq->__lock                      LOCK p->pi_lock
+        *   pick_next_task()
+        *     pick_next_task_fair()
+        *       pick_next_entity()
+        *         dequeue_entities()
+        *           __block_task()
+        *             RELEASE p->on_rq = 0       if (p->on_rq && ...)
+        *                                          break;
+        *
+        *                                        ACQUIRE (after ctrl-dep)
+        *
+        *                                        cpu = select_task_rq();
+        *                                        set_task_cpu(p, cpu);
+        *                                        ttwu_queue()
+        *                                          ttwu_do_activate()
+        *                                            LOCK rq->__lock
+        *                                            activate_task()
+        *                                              STORE p->on_rq = 1
+        *   UNLOCK rq->__lock
+        *
+        * Callers must ensure to not reference @p after this -- we no longer
+        * own it.
+        */
+       smp_store_release(&p->on_rq, 0);
 }
 
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);