--- /dev/null
+From 5f1e9bf4a001dc935b5c913507a27be28e80dccc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Jan 2025 11:59:59 +0100
+Subject: sched/fair: Fix EEVDF entity placement bug causing scheduling lag
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 6d71a9c6160479899ee744d2c6d6602a191deb1f ]
+
+I noticed this in my traces today:
+
+ turbostat-1222 [006] d..2. 311.935649: reweight_entity: (ffff888108f13e00-ffff88885ef38440-6)
+ { weight: 1048576 avg_vruntime: 3184159639071 vruntime: 3184159640194 (-1123) deadline: 3184162621107 } ->
+ { weight: 2 avg_vruntime: 3184177463330 vruntime: 3184748414495 (-570951165) deadline: 4747605329439 }
+ turbostat-1222 [006] d..2. 311.935651: reweight_entity: (ffff888108f13e00-ffff88885ef38440-6)
+ { weight: 2 avg_vruntime: 3184177463330 vruntime: 3184748414495 (-570951165) deadline: 4747605329439 } ->
+ { weight: 1048576 avg_vruntime: 3184176414812 vruntime: 3184177464419 (-1049607) deadline: 3184180445332 }
+
+Which is a weight transition: 1048576 -> 2 -> 1048576.
+
+One would expect the lag to shoot out *AND* come back, notably:
+
+ -1123*1048576/2 = -588775424
+ -588775424*2/1048576 = -1123
+
+Except the trace shows it is all off. Worse, subsequent cycles shoot it
+out further and further.
+
+This made me have a very hard look at reweight_entity(), and
+specifically the ->on_rq case, which is more prominent with
+DELAY_DEQUEUE.
+
+And indeed, it is all sorts of broken. While the computation of the new
+lag is correct, the computation for the new vruntime, using the new lag
+is broken for it does not consider the logic set out in place_entity().
+
+With the below patch, I now see things like:
+
+ migration/12-55 [012] d..3. 309.006650: reweight_entity: (ffff8881e0e6f600-ffff88885f235f40-12)
+ { weight: 977582 avg_vruntime: 4860513347366 vruntime: 4860513347908 (-542) deadline: 4860516552475 } ->
+ { weight: 2 avg_vruntime: 4860528915984 vruntime: 4860793840706 (-264924722) deadline: 6427157349203 }
+ migration/14-62 [014] d..3. 309.006698: reweight_entity: (ffff8881e0e6cc00-ffff88885f3b5f40-15)
+ { weight: 2 avg_vruntime: 4874472992283 vruntime: 4939833828823 (-65360836540) deadline: 6316614641111 } ->
+ { weight: 967149 avg_vruntime: 4874217684324 vruntime: 4874217688559 (-4235) deadline: 4874220535650 }
+
+Which isn't perfect yet, but much closer.
+
+Reported-by: Doug Smythies <dsmythies@telus.net>
+Reported-by: Ingo Molnar <mingo@kernel.org>
+Tested-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
+Link: https://lore.kernel.org/r/20250109105959.GA2981@noisy.programming.kicks-ass.net
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/fair.c | 145 ++++++--------------------------------------
+ 1 file changed, 18 insertions(+), 127 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 1ca96c99872f0..bf8153af96879 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -689,21 +689,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
++static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ {
+ s64 vlag, limit;
+
+- vlag = avruntime - se->vruntime;
+- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+-
+- return clamp(vlag, -limit, limit);
+-}
+-
+-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+-{
+ SCHED_WARN_ON(!se->on_rq);
+
+- se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
++ vlag = avg_vruntime(cfs_rq) - se->vruntime;
++ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
++
++ se->vlag = clamp(vlag, -limit, limit);
+ }
+
+ /*
+@@ -3774,137 +3769,32 @@ static inline void
+ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+ #endif
+
+-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
+- unsigned long weight)
+-{
+- unsigned long old_weight = se->load.weight;
+- s64 vlag, vslice;
+-
+- /*
+- * VRUNTIME
+- * --------
+- *
+- * COROLLARY #1: The virtual runtime of the entity needs to be
+- * adjusted if re-weight at !0-lag point.
+- *
+- * Proof: For contradiction assume this is not true, so we can
+- * re-weight without changing vruntime at !0-lag point.
+- *
+- * Weight VRuntime Avg-VRuntime
+- * before w v V
+- * after w' v' V'
+- *
+- * Since lag needs to be preserved through re-weight:
+- *
+- * lag = (V - v)*w = (V'- v')*w', where v = v'
+- * ==> V' = (V - v)*w/w' + v (1)
+- *
+- * Let W be the total weight of the entities before reweight,
+- * since V' is the new weighted average of entities:
+- *
+- * V' = (WV + w'v - wv) / (W + w' - w) (2)
+- *
+- * by using (1) & (2) we obtain:
+- *
+- * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+- * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+- * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+- * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+- *
+- * Since we are doing at !0-lag point which means V != v, we
+- * can simplify (3):
+- *
+- * ==> W / (W + w' - w) = w / w'
+- * ==> Ww' = Ww + ww' - ww
+- * ==> W * (w' - w) = w * (w' - w)
+- * ==> W = w (re-weight indicates w' != w)
+- *
+- * So the cfs_rq contains only one entity, hence vruntime of
+- * the entity @v should always equal to the cfs_rq's weighted
+- * average vruntime @V, which means we will always re-weight
+- * at 0-lag point, thus breach assumption. Proof completed.
+- *
+- *
+- * COROLLARY #2: Re-weight does NOT affect weighted average
+- * vruntime of all the entities.
+- *
+- * Proof: According to corollary #1, Eq. (1) should be:
+- *
+- * (V - v)*w = (V' - v')*w'
+- * ==> v' = V' - (V - v)*w/w' (4)
+- *
+- * According to the weighted average formula, we have:
+- *
+- * V' = (WV - wv + w'v') / (W - w + w')
+- * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+- * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+- * = (WV + w'V' - Vw) / (W - w + w')
+- *
+- * ==> V'*(W - w + w') = WV + w'V' - Vw
+- * ==> V' * (W - w) = (W - w) * V (5)
+- *
+- * If the entity is the only one in the cfs_rq, then reweight
+- * always occurs at 0-lag point, so V won't change. Or else
+- * there are other entities, hence W != w, then Eq. (5) turns
+- * into V' = V. So V won't change in either case, proof done.
+- *
+- *
+- * So according to corollary #1 & #2, the effect of re-weight
+- * on vruntime should be:
+- *
+- * v' = V' - (V - v) * w / w' (4)
+- * = V - (V - v) * w / w'
+- * = V - vl * w / w'
+- * = V - vl'
+- */
+- if (avruntime != se->vruntime) {
+- vlag = entity_lag(avruntime, se);
+- vlag = div_s64(vlag * old_weight, weight);
+- se->vruntime = avruntime - vlag;
+- }
+-
+- /*
+- * DEADLINE
+- * --------
+- *
+- * When the weight changes, the virtual time slope changes and
+- * we should adjust the relative virtual deadline accordingly.
+- *
+- * d' = v' + (d - v)*w/w'
+- * = V' - (V - v)*w/w' + (d - v)*w/w'
+- * = V - (V - v)*w/w' + (d - v)*w/w'
+- * = V + (d - V)*w/w'
+- */
+- vslice = (s64)(se->deadline - avruntime);
+- vslice = div_s64(vslice * old_weight, weight);
+- se->deadline = avruntime + vslice;
+-}
++static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+
+ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+ {
+ bool curr = cfs_rq->curr == se;
+- u64 avruntime;
+
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ update_curr(cfs_rq);
+- avruntime = avg_vruntime(cfs_rq);
++ update_entity_lag(cfs_rq, se);
++ se->deadline -= se->vruntime;
++ se->rel_deadline = 1;
+ if (!curr)
+ __dequeue_entity(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ }
+ dequeue_load_avg(cfs_rq, se);
+
+- if (se->on_rq) {
+- reweight_eevdf(se, avruntime, weight);
+- } else {
+- /*
+- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+- * we need to scale se->vlag when w_i changes.
+- */
+- se->vlag = div_s64(se->vlag * se->load.weight, weight);
+- }
++ /*
++ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
++ * we need to scale se->vlag when w_i changes.
++ */
++ se->vlag = div_s64(se->vlag * se->load.weight, weight);
++ if (se->rel_deadline)
++ se->deadline = div_s64(se->deadline * se->load.weight, weight);
+
+ update_load_set(&se->load, weight);
+
+@@ -3919,6 +3809,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ enqueue_load_avg(cfs_rq, se);
+ if (se->on_rq) {
+ update_load_add(&cfs_rq->load, se->load.weight);
++ place_entity(cfs_rq, se, 0);
+ if (!curr)
+ __enqueue_entity(cfs_rq, se);
+
+@@ -5359,7 +5250,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+
+ se->vruntime = vruntime - lag;
+
+- if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
++ if (se->rel_deadline) {
+ se->deadline += se->vruntime;
+ se->rel_deadline = 0;
+ return;
+--
+2.39.5
+
--- /dev/null
+From 21dbeb9d8096a5f19b1a69347dc518e95a9e3782 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Jan 2025 13:50:11 +0100
+Subject: sched/fair: Fix update_cfs_group() vs DELAY_DEQUEUE
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 66951e4860d3c688bfa550ea4a19635b57e00eca ]
+
+Normally dequeue_entities() will continue to dequeue an empty group entity;
+except DELAY_DEQUEUE changes things -- it retains empty entities such that they
+might continue to compete and burn off some lag.
+
+However, doing this results in update_cfs_group() re-computing the cgroup
+weight 'slice' for an empty group, which it (rightly) figures isn't much at
+all. This in turn means that the delayed entity is not competing at the
+expected weight. Worse, the very low weight causes its lag to be inflated,
+which combined with avg_vruntime() using scale_load_down(), leads to artifacts.
+
+As such, don't adjust the weight for empty group entities and let them compete
+at their original weight.
+
+Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20250110115720.GA17405@noisy.programming.kicks-ass.net
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/fair.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index bf8153af96879..7f8677ce83f9f 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -3956,7 +3956,11 @@ static void update_cfs_group(struct sched_entity *se)
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long shares;
+
+- if (!gcfs_rq)
++ /*
++ * When a group becomes empty, preserve its weight. This matters for
++ * DELAY_DEQUEUE.
++ */
++ if (!gcfs_rq || !gcfs_rq->load.weight)
+ return;
+
+ if (throttled_hierarchy(gcfs_rq))
+--
+2.39.5
+