]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
sched/fair: Update util_est after updating util_avg during dequeue
authorVincent Guittot <vincent.guittot@linaro.org>
Mon, 18 May 2026 10:23:45 +0000 (12:23 +0200)
committerPeter Zijlstra <peterz@infradead.org>
Tue, 19 May 2026 10:17:36 +0000 (12:17 +0200)
util_est_update() must be called after updating util_avg during the dequeue
of a task and only when the task is not delayed dequeue.

Move util_est_update() in update_load_avg().

Fixes: b55945c500c5 ("sched: Fix pick_next_task_fair() vs try_to_wake_up() race")
Closes: https://lore.kernel.org/all/20260512124653.305275-1-qyousef@layalina.io/
Reported-by: Qais Yousef <qyousef@layalina.io>
Reviewed-and-tested-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260518102345.268452-1-vincent.guittot@linaro.org
kernel/sched/fair.c

index 728965851842e189790173c110ecc8a4835b418a..09d3acd2d2bcf9b6845ea2d974f9995118151cd3 100644 (file)
@@ -4930,13 +4930,86 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        trace_pelt_cfs_tp(cfs_rq);
 }
 
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
+static inline void util_est_update(struct sched_entity *se)
+{
+       unsigned int ewma, dequeued, last_ewma_diff;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /* Get current estimate of utilization */
+       ewma = READ_ONCE(se->avg.util_est);
+
+       /*
+        * If the PELT values haven't changed since enqueue time,
+        * skip the util_est update.
+        */
+       if (ewma & UTIL_AVG_UNCHANGED)
+               return;
+
+       /* Get utilization at dequeue */
+       dequeued = READ_ONCE(se->avg.util_avg);
+
+       /*
+        * Reset EWMA on utilization increases, the moving average is used only
+        * to smooth utilization decreases.
+        */
+       if (ewma <= dequeued) {
+               ewma = dequeued;
+               goto done;
+       }
+
+       /*
+        * Skip update of task's estimated utilization when its members are
+        * already ~1% close to its last activation value.
+        */
+       last_ewma_diff = ewma - dequeued;
+       if (last_ewma_diff < UTIL_EST_MARGIN)
+               goto done;
+
+       /*
+        * To avoid underestimate of task utilization, skip updates of EWMA if
+        * we cannot grant that thread got all CPU time it wanted.
+        */
+       if ((dequeued + UTIL_EST_MARGIN) < READ_ONCE(se->avg.runnable_avg))
+               goto done;
+
+       /*
+        * Update Task's estimated utilization
+        *
+        * When *p completes an activation we can consolidate another sample
+        * of the task size. This is done by using this value to update the
+        * Exponential Weighted Moving Average (EWMA):
+        *
+        *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+        *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+        *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+        *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
+        *          = w * (-last_ewma_diff +  ewma(t-1) / w)
+        *
+        * Where 'w' is the weight of new samples, which is configured to be
+        * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+        */
+       ewma <<= UTIL_EST_WEIGHT_SHIFT;
+       ewma  -= last_ewma_diff;
+       ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
+       ewma |= UTIL_AVG_UNCHANGED;
+       WRITE_ONCE(se->avg.util_est, ewma);
+
+       trace_sched_util_est_se_tp(se);
+}
+
 /*
  * Optional action to be done while updating the load average
  */
-#define UPDATE_TG      0x1
-#define SKIP_AGE_LOAD  0x2
-#define DO_ATTACH      0x4
-#define DO_DETACH      0x8
+#define UPDATE_TG      0x01
+#define SKIP_AGE_LOAD  0x02
+#define DO_ATTACH      0x04
+#define DO_DETACH      0x08
+#define UPDATE_UTIL_EST        0x10
 
 /* Update task and its cfs_rq load average */
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4979,6 +5052,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                if (flags & UPDATE_TG)
                        update_tg_load_avg(cfs_rq);
        }
+
+       if (flags & UPDATE_UTIL_EST)
+               util_est_update(se);
 }
 
 /*
@@ -5037,11 +5113,6 @@ static inline unsigned long task_util(struct task_struct *p)
        return READ_ONCE(p->se.avg.util_avg);
 }
 
-static inline unsigned long task_runnable(struct task_struct *p)
-{
-       return READ_ONCE(p->se.avg.runnable_avg);
-}
-
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
        return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
@@ -5084,88 +5155,6 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
        trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
-#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-
-static inline void util_est_update(struct cfs_rq *cfs_rq,
-                                  struct task_struct *p,
-                                  bool task_sleep)
-{
-       unsigned int ewma, dequeued, last_ewma_diff;
-
-       if (!sched_feat(UTIL_EST))
-               return;
-
-       /*
-        * Skip update of task's estimated utilization when the task has not
-        * yet completed an activation, e.g. being migrated.
-        */
-       if (!task_sleep)
-               return;
-
-       /* Get current estimate of utilization */
-       ewma = READ_ONCE(p->se.avg.util_est);
-
-       /*
-        * If the PELT values haven't changed since enqueue time,
-        * skip the util_est update.
-        */
-       if (ewma & UTIL_AVG_UNCHANGED)
-               return;
-
-       /* Get utilization at dequeue */
-       dequeued = task_util(p);
-
-       /*
-        * Reset EWMA on utilization increases, the moving average is used only
-        * to smooth utilization decreases.
-        */
-       if (ewma <= dequeued) {
-               ewma = dequeued;
-               goto done;
-       }
-
-       /*
-        * Skip update of task's estimated utilization when its members are
-        * already ~1% close to its last activation value.
-        */
-       last_ewma_diff = ewma - dequeued;
-       if (last_ewma_diff < UTIL_EST_MARGIN)
-               goto done;
-
-       /*
-        * To avoid underestimate of task utilization, skip updates of EWMA if
-        * we cannot grant that thread got all CPU time it wanted.
-        */
-       if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
-               goto done;
-
-
-       /*
-        * Update Task's estimated utilization
-        *
-        * When *p completes an activation we can consolidate another sample
-        * of the task size. This is done by using this value to update the
-        * Exponential Weighted Moving Average (EWMA):
-        *
-        *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
-        *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
-        *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
-        *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
-        *          = w * (-last_ewma_diff +  ewma(t-1) / w)
-        *
-        * Where 'w' is the weight of new samples, which is configured to be
-        * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
-        */
-       ewma <<= UTIL_EST_WEIGHT_SHIFT;
-       ewma  -= last_ewma_diff;
-       ewma >>= UTIL_EST_WEIGHT_SHIFT;
-done:
-       ewma |= UTIL_AVG_UNCHANGED;
-       WRITE_ONCE(p->se.avg.util_est, ewma);
-
-       trace_sched_util_est_se_tp(&p->se);
-}
-
 static inline unsigned long get_actual_cpu_capacity(int cpu)
 {
        unsigned long capacity = arch_scale_cpu_capacity(cpu);
@@ -5618,7 +5607,7 @@ static bool
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        bool sleep = flags & DEQUEUE_SLEEP;
-       int action = UPDATE_TG;
+       int action = 0;
 
        update_curr(cfs_rq);
        clear_buddies(cfs_rq, se);
@@ -5638,15 +5627,23 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
                if (sched_feat(DELAY_DEQUEUE) && delay &&
                    !entity_eligible(cfs_rq, se)) {
-                       update_load_avg(cfs_rq, se, 0);
+                       if (entity_is_task(se))
+                               action |= UPDATE_UTIL_EST;
+                       update_load_avg(cfs_rq, se, action);
                        update_entity_lag(cfs_rq, se);
                        set_delayed(se);
                        return false;
                }
        }
 
-       if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
-               action |= DO_DETACH;
+       action = UPDATE_TG;
+       if (entity_is_task(se)) {
+               if (task_on_rq_migrating(task_of(se)))
+                       action |= DO_DETACH;
+
+               if (sleep && !(flags & DEQUEUE_DELAYED))
+                       action |= UPDATE_UTIL_EST;
+       }
 
        /*
         * When dequeuing a sched_entity, we must:
@@ -7409,7 +7406,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        if (!p->se.sched_delayed)
                util_est_dequeue(&rq->cfs, p);
 
-       util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
        if (dequeue_entities(rq, &p->se, flags) < 0)
                return false;