git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue
author: Vincent Guittot <vincent.guittot@linaro.org>
Wed, 22 Apr 2026 09:34:00 +0000 (11:34 +0200)
committer: Peter Zijlstra <peterz@infradead.org>
Tue, 28 Apr 2026 07:19:54 +0000 (09:19 +0200)
Similar to how pick_next_entity() must dequeue delayed entities, so too must
wakeup_preempt_fair(). Any delayed task being found means it is eligible and
hence past the 0-lag point, ready for removal.

Worse, by not removing delayed entities from consideration, it can skew the
preemption decision, with the end result that a short slice wakeup will not
result in a preemption.

                         tip/sched/core   tip/sched/core   +this patch
cyclictest slice  (ms) | (default) 2.8    8                8
hackbench slice   (ms) | (default) 2.8    20               20
Total Samples          |    22559            22595            22683
Average           (us) |      157               64 ( 59%)       59 (  8%)
Median (P50)      (us) |       57               57 (  0%)       58 (- 2%)
90th Percentile   (us) |       64               60 (  6%)       60 (  0%)
99th Percentile   (us) |     2407               67 ( 97%)       67 (  0%)
99.9th Percentile (us) |     3400             2288 ( 33%)      727 ( 68%)
Maximum           (us) |     5037             9252 (-84%)     7461 ( 19%)

Fixes: f12e148892ed ("sched/fair: Prepare pick_next_task() for delayed dequeue")
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260422093400.319251-1-vincent.guittot@linaro.org
kernel/sched/fair.c

index 615861d963571ec48ee1e1591906c02bfbb3d923..728965851842e189790173c110ecc8a4835b418a 100644 (file)
@@ -1104,7 +1104,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
  *
  * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
 {
        struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
        struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1175,11 +1175,6 @@ found:
        return best;
 }
 
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
-       return __pick_eevdf(cfs_rq, true);
-}
-
 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5754,11 +5749,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
 {
        struct sched_entity *se;
 
-       se = pick_eevdf(cfs_rq);
+       se = pick_eevdf(cfs_rq, protect);
        if (se->sched_delayed) {
                dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
                /*
@@ -9032,7 +9027,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
 {
        enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
        struct task_struct *donor = rq->donor;
-       struct sched_entity *se = &donor->se, *pse = &p->se;
+       struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
        struct cfs_rq *cfs_rq = task_cfs_rq(donor);
        int cse_is_idle, pse_is_idle;
 
@@ -9143,11 +9138,17 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
        }
 
 pick:
+       nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+       /* If @p has become the most eligible task, force preemption */
+       if (nse == pse)
+               goto preempt;
+
        /*
-        * If @p has become the most eligible task, force preemption.
+        * Because p is enqueued, nse being null can only mean that we
+        * dequeued a delayed task.
         */
-       if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
-               goto preempt;
+       if (!nse)
+               goto pick;
 
        if (sched_feat(RUN_TO_PARITY))
                update_protect_slice(cfs_rq, se);
@@ -9184,7 +9185,7 @@ again:
 
                throttled |= check_cfs_rq_runtime(cfs_rq);
 
-               se = pick_next_entity(rq, cfs_rq);
+               se = pick_next_entity(rq, cfs_rq, true);
                if (!se)
                        goto again;
                cfs_rq = group_cfs_rq(se);