sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing...

author Peter Zijlstra <peterz@infradead.org>

Mon, 26 Jan 2026 19:56:23 +0000 (20:56 +0100)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 23 Feb 2026 17:04:10 +0000 (18:04 +0100)
author Peter Zijlstra <peterz@infradead.org>
Mon, 26 Jan 2026 19:56:23 +0000 (20:56 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 23 Feb 2026 17:04:10 +0000 (18:04 +0100)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index fdb98d2ea131c0d7752df3200bb0782961263063..2b98054cd7548dde372985af51ba91f6e919b58d 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -822,17 +822,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
   *
   *   -r_max < lag < max(r_max, q)
   */
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
  {
         u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
         s64 vlag, limit;
  
-       WARN_ON_ONCE(!se->on_rq);
-
-       vlag = avg_vruntime(cfs_rq) - se->vruntime;
+       vlag = avruntime - se->vruntime;
         limit = calc_delta_fair(max_slice, se);
  
-       se->vlag = clamp(vlag, -limit, limit);
+       return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       WARN_ON_ONCE(!se->on_rq);
+
+       se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
  }
  
  /*
@@ -3898,23 +3903,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                     se_weight(se) * -se->avg.load_sum);
  }
  
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+       unsigned long old_weight = se->load.weight;
+
+       /*
+        * VRUNTIME
+        * --------
+        *
+        * COROLLARY #1: The virtual runtime of the entity needs to be
+        * adjusted if re-weight at !0-lag point.
+        *
+        * Proof: For contradiction assume this is not true, so we can
+        * re-weight without changing vruntime at !0-lag point.
+        *
+        *             Weight   VRuntime   Avg-VRuntime
+        *     before    w          v            V
+        *      after    w'         v'           V'
+        *
+        * Since lag needs to be preserved through re-weight:
+        *
+        *      lag = (V - v)*w = (V'- v')*w', where v = v'
+        *      ==>     V' = (V - v)*w/w' + v           (1)
+        *
+        * Let W be the total weight of the entities before reweight,
+        * since V' is the new weighted average of entities:
+        *
+        *      V' = (WV + w'v - wv) / (W + w' - w)     (2)
+        *
+        * by using (1) & (2) we obtain:
+        *
+        *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+        *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+        *      ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+        *      ==>     (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+        *
+        * Since we are doing at !0-lag point which means V != v, we
+        * can simplify (3):
+        *
+        *      ==>     W / (W + w' - w) = w / w'
+        *      ==>     Ww' = Ww + ww' - ww
+        *      ==>     W * (w' - w) = w * (w' - w)
+        *      ==>     W = w   (re-weight indicates w' != w)
+        *
+        * So the cfs_rq contains only one entity, hence vruntime of
+        * the entity @v should always equal to the cfs_rq's weighted
+        * average vruntime @V, which means we will always re-weight
+        * at 0-lag point, thus breach assumption. Proof completed.
+        *
+        *
+        * COROLLARY #2: Re-weight does NOT affect weighted average
+        * vruntime of all the entities.
+        *
+        * Proof: According to corollary #1, Eq. (1) should be:
+        *
+        *      (V - v)*w = (V' - v')*w'
+        *      ==>    v' = V' - (V - v)*w/w'           (4)
+        *
+        * According to the weighted average formula, we have:
+        *
+        *      V' = (WV - wv + w'v') / (W - w + w')
+        *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+        *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+        *         = (WV + w'V' - Vw) / (W - w + w')
+        *
+        *      ==>  V'*(W - w + w') = WV + w'V' - Vw
+        *      ==>     V' * (W - w) = (W - w) * V      (5)
+        *
+        * If the entity is the only one in the cfs_rq, then reweight
+        * always occurs at 0-lag point, so V won't change. Or else
+        * there are other entities, hence W != w, then Eq. (5) turns
+        * into V' = V. So V won't change in either case, proof done.
+        *
+        *
+        * So according to corollary #1 & #2, the effect of re-weight
+        * on vruntime should be:
+        *
+        *      v' = V' - (V - v) * w / w'              (4)
+        *         = V  - (V - v) * w / w'
+        *         = V  - vl * w / w'
+        *         = V  - vl'
+        */
+       se->vlag = div64_long(se->vlag * old_weight, weight);
+
+       /*
+        * DEADLINE
+        * --------
+        *
+        * When the weight changes, the virtual time slope changes and
+        * we should adjust the relative virtual deadline accordingly.
+        *
+        *      d' = v' + (d - v)*w/w'
+        *         = V' - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  + (d - V)*w/w'
+        */
+       if (se->rel_deadline)
+               se->deadline = div64_long(se->deadline * old_weight, weight);
+
+       if (rel_vprot)
+               se->vprot = div64_long(se->vprot * old_weight, weight);
+}
  
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                             unsigned long weight)
  {
         bool curr = cfs_rq->curr == se;
         bool rel_vprot = false;
-       u64 vprot;
+       u64 avruntime = 0;
  
         if (se->on_rq) {
                 /* commit outstanding execution time */
                 update_curr(cfs_rq);
-               update_entity_lag(cfs_rq, se);
-               se->deadline -= se->vruntime;
+               avruntime = avg_vruntime(cfs_rq);
+               se->vlag = entity_lag(cfs_rq, se, avruntime);
+               se->deadline -= avruntime;
                 se->rel_deadline = 1;
                 if (curr && protect_slice(se)) {
-                       vprot = se->vprot - se->vruntime;
+                       se->vprot -= avruntime;
                         rel_vprot = true;
                 }
  
@@ -3925,30 +4032,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
         }
         dequeue_load_avg(cfs_rq, se);
  
-       /*
-        * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-        * we need to scale se->vlag when w_i changes.
-        */
-       se->vlag = div64_long(se->vlag * se->load.weight, weight);
-       if (se->rel_deadline)
-               se->deadline = div64_long(se->deadline * se->load.weight, weight);
-
-       if (rel_vprot)
-               vprot = div64_long(vprot * se->load.weight, weight);
+       rescale_entity(se, weight, rel_vprot);
  
         update_load_set(&se->load, weight);
  
         do {
                 u32 divider = get_pelt_divider(&se->avg);
-
                 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
         } while (0);
  
         enqueue_load_avg(cfs_rq, se);
         if (se->on_rq) {
-               place_entity(cfs_rq, se, 0);
                 if (rel_vprot)
-                       se->vprot = se->vruntime + vprot;
+                       se->vprot += avruntime;
+               se->deadline += avruntime;
+               se->rel_deadline = 0;
+               se->vruntime = avruntime - se->vlag;
+
                 update_load_add(&cfs_rq->load, se->load.weight);
                 if (!curr)
                         __enqueue_entity(cfs_rq, se);
@@ -5306,7 +5406,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         se->vruntime = vruntime - lag;
  
-       if (se->rel_deadline) {
+       if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
                 se->deadline += se->vruntime;
                 se->rel_deadline = 0;
                 return;
author	Peter Zijlstra <peterz@infradead.org>
	Mon, 26 Jan 2026 19:56:23 +0000 (20:56 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 23 Feb 2026 17:04:10 +0000 (18:04 +0100)