]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing...
authorPeter Zijlstra <peterz@infradead.org>
Mon, 26 Jan 2026 19:56:23 +0000 (20:56 +0100)
committerPeter Zijlstra <peterz@infradead.org>
Mon, 23 Feb 2026 17:04:10 +0000 (18:04 +0100)
Zicheng Qu reported that, because avg_vruntime() always includes
cfs_rq->curr, when ->on_rq, place_entity() doesn't work right.

Specifically, the lag scaling in place_entity() relies on
avg_vruntime() being the state *before* placement of the new entity.
However in this case avg_vruntime() will actually already include the
entity, which breaks things.

Also, Zicheng Qu argues that avg_vruntime should be invariant under
reweight. IOW commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") was wrong!

The issue reported in 6d71a9c61604 could possibly be explained by
rounding artifacts -- notably the extreme weight '2' is outside of the
range of avg_vruntime/sum_w_vruntime, since that uses
scale_load_down(). By scaling vruntime by the real weight, but
accounting it in vruntime with a factor 1024 more, the average moves
significantly. However, that is now cured.

Tested by reverting 66951e4860d3 ("sched/fair: Fix update_cfs_group()
vs DELAY_DEQUEUE") and tracing vruntime and vlag figures again.

Reported-by: Zicheng Qu <quzicheng@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
Link: https://patch.msgid.link/20260219080625.066102672%40infradead.org
kernel/sched/fair.c

index fdb98d2ea131c0d7752df3200bb0782961263063..2b98054cd7548dde372985af51ba91f6e919b58d 100644 (file)
@@ -822,17 +822,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
  *
  *   -r_max < lag < max(r_max, q)
  */
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
 {
        u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
        s64 vlag, limit;
 
-       WARN_ON_ONCE(!se->on_rq);
-
-       vlag = avg_vruntime(cfs_rq) - se->vruntime;
+       vlag = avruntime - se->vruntime;
        limit = calc_delta_fair(max_slice, se);
 
-       se->vlag = clamp(vlag, -limit, limit);
+       return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       WARN_ON_ONCE(!se->on_rq);
+
+       se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
 }
 
 /*
@@ -3898,23 +3903,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                    se_weight(se) * -se->avg.load_sum);
 }
 
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+       unsigned long old_weight = se->load.weight;
+
+       /*
+        * VRUNTIME
+        * --------
+        *
+        * COROLLARY #1: The virtual runtime of the entity needs to be
+        * adjusted if re-weight at !0-lag point.
+        *
+        * Proof: For contradiction assume this is not true, so we can
+        * re-weight without changing vruntime at !0-lag point.
+        *
+        *             Weight   VRuntime   Avg-VRuntime
+        *     before    w          v            V
+        *      after    w'         v'           V'
+        *
+        * Since lag needs to be preserved through re-weight:
+        *
+        *      lag = (V - v)*w = (V'- v')*w', where v = v'
+        *      ==>     V' = (V - v)*w/w' + v           (1)
+        *
+        * Let W be the total weight of the entities before reweight,
+        * since V' is the new weighted average of entities:
+        *
+        *      V' = (WV + w'v - wv) / (W + w' - w)     (2)
+        *
+        * by using (1) & (2) we obtain:
+        *
+        *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+        *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+        *      ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+        *      ==>     (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+        *
+        * Since we are doing at !0-lag point which means V != v, we
+        * can simplify (3):
+        *
+        *      ==>     W / (W + w' - w) = w / w'
+        *      ==>     Ww' = Ww + ww' - ww
+        *      ==>     W * (w' - w) = w * (w' - w)
+        *      ==>     W = w   (re-weight indicates w' != w)
+        *
+        * So the cfs_rq contains only one entity, hence vruntime of
+        * the entity @v should always equal to the cfs_rq's weighted
+        * average vruntime @V, which means we will always re-weight
+        * at 0-lag point, thus breach assumption. Proof completed.
+        *
+        *
+        * COROLLARY #2: Re-weight does NOT affect weighted average
+        * vruntime of all the entities.
+        *
+        * Proof: According to corollary #1, Eq. (1) should be:
+        *
+        *      (V - v)*w = (V' - v')*w'
+        *      ==>    v' = V' - (V - v)*w/w'           (4)
+        *
+        * According to the weighted average formula, we have:
+        *
+        *      V' = (WV - wv + w'v') / (W - w + w')
+        *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+        *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+        *         = (WV + w'V' - Vw) / (W - w + w')
+        *
+        *      ==>  V'*(W - w + w') = WV + w'V' - Vw
+        *      ==>     V' * (W - w) = (W - w) * V      (5)
+        *
+        * If the entity is the only one in the cfs_rq, then reweight
+        * always occurs at 0-lag point, so V won't change. Or else
+        * there are other entities, hence W != w, then Eq. (5) turns
+        * into V' = V. So V won't change in either case, proof done.
+        *
+        *
+        * So according to corollary #1 & #2, the effect of re-weight
+        * on vruntime should be:
+        *
+        *      v' = V' - (V - v) * w / w'              (4)
+        *         = V  - (V - v) * w / w'
+        *         = V  - vl * w / w'
+        *         = V  - vl'
+        */
+       se->vlag = div64_long(se->vlag * old_weight, weight);
+
+       /*
+        * DEADLINE
+        * --------
+        *
+        * When the weight changes, the virtual time slope changes and
+        * we should adjust the relative virtual deadline accordingly.
+        *
+        *      d' = v' + (d - v)*w/w'
+        *         = V' - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  + (d - V)*w/w'
+        */
+       if (se->rel_deadline)
+               se->deadline = div64_long(se->deadline * old_weight, weight);
+
+       if (rel_vprot)
+               se->vprot = div64_long(se->vprot * old_weight, weight);
+}
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
 {
        bool curr = cfs_rq->curr == se;
        bool rel_vprot = false;
-       u64 vprot;
+       u64 avruntime = 0;
 
        if (se->on_rq) {
                /* commit outstanding execution time */
                update_curr(cfs_rq);
-               update_entity_lag(cfs_rq, se);
-               se->deadline -= se->vruntime;
+               avruntime = avg_vruntime(cfs_rq);
+               se->vlag = entity_lag(cfs_rq, se, avruntime);
+               se->deadline -= avruntime;
                se->rel_deadline = 1;
                if (curr && protect_slice(se)) {
-                       vprot = se->vprot - se->vruntime;
+                       se->vprot -= avruntime;
                        rel_vprot = true;
                }
 
@@ -3925,30 +4032,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
        }
        dequeue_load_avg(cfs_rq, se);
 
-       /*
-        * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-        * we need to scale se->vlag when w_i changes.
-        */
-       se->vlag = div64_long(se->vlag * se->load.weight, weight);
-       if (se->rel_deadline)
-               se->deadline = div64_long(se->deadline * se->load.weight, weight);
-
-       if (rel_vprot)
-               vprot = div64_long(vprot * se->load.weight, weight);
+       rescale_entity(se, weight, rel_vprot);
 
        update_load_set(&se->load, weight);
 
        do {
                u32 divider = get_pelt_divider(&se->avg);
-
                se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
        } while (0);
 
        enqueue_load_avg(cfs_rq, se);
        if (se->on_rq) {
-               place_entity(cfs_rq, se, 0);
                if (rel_vprot)
-                       se->vprot = se->vruntime + vprot;
+                       se->vprot += avruntime;
+               se->deadline += avruntime;
+               se->rel_deadline = 0;
+               se->vruntime = avruntime - se->vlag;
+
                update_load_add(&cfs_rq->load, se->load.weight);
                if (!curr)
                        __enqueue_entity(cfs_rq, se);
@@ -5306,7 +5406,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        se->vruntime = vruntime - lag;
 
-       if (se->rel_deadline) {
+       if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
                se->deadline += se->vruntime;
                se->rel_deadline = 0;
                return;