--- /dev/null
+From a2e90611b9f425adbbfcdaa5b5e49958ddf6f61b Mon Sep 17 00:00:00 2001
+From: Vincent Guittot <vincent.guittot@linaro.org>
+Date: Wed, 1 Feb 2023 15:36:28 +0100
+Subject: sched/fair: Remove capacity inversion detection
+
+From: Vincent Guittot <vincent.guittot@linaro.org>
+
+commit a2e90611b9f425adbbfcdaa5b5e49958ddf6f61b upstream.
+
+Remove the capacity inversion detection, which is now handled by
+util_fits_cpu() returning -1 when we need to continue looking for a
+potential CPU with better performance.
+
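+For illustration only (a toy userspace sketch, not kernel code; the
+toy_* names are invented here): after this removal, the capacity used
+for the uclamp_min check is always the CPU's original capacity minus
+its thermal pressure, so a throttled big CPU (e.g. 1024 - 300 = 724)
+naturally ranks below an unthrottled medium CPU (e.g. 768 - 0 = 768)
+without any cached per-rq "inverted" value:
+
+  static inline unsigned long
+  toy_capacity_orig_thermal(unsigned long capacity_orig,
+                            unsigned long thermal_pressure)
+  {
+          /* What capacity_orig_of(cpu) - arch_scale_thermal_pressure(cpu) models. */
+          return capacity_orig - thermal_pressure;
+  }
+
+  static inline int toy_uclamp_min_fits(unsigned long uclamp_min,
+                                        unsigned long capacity_orig,
+                                        unsigned long thermal_pressure)
+  {
+          /* The uclamp_min hint fits if the thermally capped capacity covers it. */
+          return uclamp_min <=
+                 toy_capacity_orig_thermal(capacity_orig, thermal_pressure);
+  }
+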
+This ends up almost reverting the patches below, except for some comments:
+commit da07d2f9c153 ("sched/fair: Fixes for capacity inversion detection")
+commit aa69c36f31aa ("sched/fair: Consider capacity inversion in util_fits_cpu()")
+commit 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
+
+Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20230201143628.270912-3-vincent.guittot@linaro.org
+Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c | 84 +++------------------------------------------------
+ kernel/sched/sched.h | 19 -----------
+ 2 files changed, 5 insertions(+), 98 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -4464,17 +4464,9 @@ static inline int util_fits_cpu(unsigned
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+- *
+- * In case of capacity inversion we should honour the inverted capacity
+- * for both uclamp_min and uclamp_max all the time.
+ */
+- capacity_orig = cpu_in_capacity_inversion(cpu);
+- if (capacity_orig) {
+- capacity_orig_thermal = capacity_orig;
+- } else {
+- capacity_orig = capacity_orig_of(cpu);
+- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+- }
++ capacity_orig = capacity_orig_of(cpu);
++ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+@@ -8929,82 +8921,16 @@ static unsigned long scale_rt_capacity(i
+
+ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
+ {
+- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
+ struct sched_group *sdg = sd->groups;
+- struct rq *rq = cpu_rq(cpu);
+
+- rq->cpu_capacity_orig = capacity_orig;
++ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+
+ if (!capacity)
+ capacity = 1;
+
+- rq->cpu_capacity = capacity;
+-
+- /*
+- * Detect if the performance domain is in capacity inversion state.
+- *
+- * Capacity inversion happens when another perf domain with equal or
+- * lower capacity_orig_of() ends up having higher capacity than this
+- * domain after subtracting thermal pressure.
+- *
+- * We only take into account thermal pressure in this detection as it's
+- * the only metric that actually results in *real* reduction of
+- * capacity due to performance points (OPPs) being dropped/become
+- * unreachable due to thermal throttling.
+- *
+- * We assume:
+- * * That all cpus in a perf domain have the same capacity_orig
+- * (same uArch).
+- * * Thermal pressure will impact all cpus in this perf domain
+- * equally.
+- */
+- if (sched_energy_enabled()) {
+- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
+- struct perf_domain *pd;
+-
+- rcu_read_lock();
+-
+- pd = rcu_dereference(rq->rd->pd);
+- rq->cpu_capacity_inverted = 0;
+-
+- for (; pd; pd = pd->next) {
+- struct cpumask *pd_span = perf_domain_span(pd);
+- unsigned long pd_cap_orig, pd_cap;
+-
+- /* We can't be inverted against our own pd */
+- if (cpumask_test_cpu(cpu_of(rq), pd_span))
+- continue;
+-
+- cpu = cpumask_any(pd_span);
+- pd_cap_orig = arch_scale_cpu_capacity(cpu);
+-
+- if (capacity_orig < pd_cap_orig)
+- continue;
+-
+- /*
+- * handle the case of multiple perf domains have the
+- * same capacity_orig but one of them is under higher
+- * thermal pressure. We record it as capacity
+- * inversion.
+- */
+- if (capacity_orig == pd_cap_orig) {
+- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
+-
+- if (pd_cap > inv_cap) {
+- rq->cpu_capacity_inverted = inv_cap;
+- break;
+- }
+- } else if (pd_cap_orig > inv_cap) {
+- rq->cpu_capacity_inverted = inv_cap;
+- break;
+- }
+- }
+-
+- rcu_read_unlock();
+- }
+-
+- trace_sched_cpu_capacity_tp(rq);
++ cpu_rq(cpu)->cpu_capacity = capacity;
++ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
+ sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1041,7 +1041,6 @@ struct rq {
+
+ unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
+- unsigned long cpu_capacity_inverted;
+
+ struct balance_callback *balance_callback;
+
+@@ -2879,24 +2878,6 @@ static inline unsigned long capacity_ori
+ return cpu_rq(cpu)->cpu_capacity_orig;
+ }
+
+-/*
+- * Returns inverted capacity if the CPU is in capacity inversion state.
+- * 0 otherwise.
+- *
+- * Capacity inversion detection only considers thermal impact where actual
+- * performance points (OPPs) gets dropped.
+- *
+- * Capacity inversion state happens when another performance domain that has
+- * equal or lower capacity_orig_of() becomes effectively larger than the perf
+- * domain this CPU belongs to due to thermal pressure throttling it hard.
+- *
+- * See comment in update_cpu_capacity().
+- */
+-static inline unsigned long cpu_in_capacity_inversion(int cpu)
+-{
+- return cpu_rq(cpu)->cpu_capacity_inverted;
+-}
+-
+ /**
+ * enum cpu_util_type - CPU utilization type
+ * @FREQUENCY_UTIL: Utilization used to select frequency
--- /dev/null
+From e5ed0550c04c5469ecdc1634d8aa18c8609590f0 Mon Sep 17 00:00:00 2001
+From: Vincent Guittot <vincent.guittot@linaro.org>
+Date: Wed, 1 Feb 2023 15:36:27 +0100
+Subject: sched/fair: unlink misfit task from cpu overutilized
+
+From: Vincent Guittot <vincent.guittot@linaro.org>
+
+commit e5ed0550c04c5469ecdc1634d8aa18c8609590f0 upstream.
+
+By taking uclamp_min into account, the 1:1 relation between task misfit
+and CPU overutilization no longer holds: a task with a small util_avg may
+not fit a high-capacity CPU because of its uclamp_min constraint.
+
+Add a new state in util_fits_cpu() to reflect the case where a task would
+fit a CPU except for the uclamp_min hint, which is a performance
+requirement.
+
+Use -1 to reflect that a CPU doesn't fit only because of uclamp_min, so we
+can use this new value to take additional action and select the best CPU
+that doesn't match the uclamp_min hint.
+
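+A deliberately simplified sketch of that tri-state convention (a toy
+userspace model, not the kernel implementation; it ignores the fit
+margins and the max-capacity special cases handled by the real
+util_fits_cpu()):
+
+  static int toy_util_fits_cpu(unsigned long util,
+                               unsigned long uclamp_min,
+                               unsigned long uclamp_max,
+                               unsigned long capacity,
+                               unsigned long capacity_thermal)
+  {
+          /* Utilization, capped by uclamp_max, must fit the CPU. */
+          unsigned long capped = util < uclamp_max ? util : uclamp_max;
+          int fits = capped <= capacity;
+
+          /*
+           * Everything fits except the performance hint: uclamp_min asks
+           * for more capacity than is left after thermal pressure.
+           * Report -1 so the caller keeps looking for a better CPU.
+           */
+          if (fits && util < uclamp_min && uclamp_min > capacity_thermal)
+                  return -1;
+
+          return fits;
+  }
+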
+When util_fits_cpu() returns -1, we keep looking for a possible CPU with
+better performance. This replaces the capacity inversion detection with a
+comparison against capacity_orig_of() - thermal_load_avg().
+
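+For example, a caller can rank candidates in the spirit of the
+select_idle_capacity() change below, reusing the toy_util_fits_cpu()
+sketch above (the struct and function names are invented for this
+illustration):
+
+  struct toy_cpu {
+          int id;
+          unsigned long capacity_orig;   /* capacity_orig_of()  */
+          unsigned long thermal_avg;     /* thermal_load_avg()  */
+  };
+
+  static int toy_best_cpu(const struct toy_cpu *cpus, int nr,
+                          unsigned long util,
+                          unsigned long uclamp_min,
+                          unsigned long uclamp_max)
+  {
+          unsigned long best_cap = 0;
+          int best_cpu = -1, best_fits = 0;
+
+          for (int i = 0; i < nr; i++) {
+                  unsigned long cap = cpus[i].capacity_orig;
+                  int fits = toy_util_fits_cpu(util, uclamp_min, uclamp_max,
+                                               cap, cap - cpus[i].thermal_avg);
+
+                  if (fits > 0)           /* fits all requirements: done */
+                          return cpus[i].id;
+                  if (fits < 0)           /* only uclamp_min is missed   */
+                          cap -= cpus[i].thermal_avg;
+
+                  /* -1 beats 0; at equal fit, prefer the larger capacity. */
+                  if (fits < best_fits ||
+                      (fits == best_fits && cap > best_cap)) {
+                          best_cap = cap;
+                          best_cpu = cpus[i].id;
+                          best_fits = fits;
+                  }
+          }
+
+          return best_cpu;
+  }
+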
+Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
+Reviewed-and-tested-by: Qais Yousef <qyousef@layalina.io>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
+Link: https://lore.kernel.org/r/20230201143628.270912-2-vincent.guittot@linaro.org
+Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 82 insertions(+), 23 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -4549,8 +4549,8 @@ static inline int util_fits_cpu(unsigned
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
+- fits = fits && (uclamp_min <= capacity_orig_thermal);
++ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
++ return -1;
+
+ return fits;
+ }
+@@ -4560,7 +4560,11 @@ static inline int task_fits_cpu(struct t
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
++ /*
++ * Return true only if the cpu fully fits the task requirements, which
++ * include the utilization but also the performance hints.
++ */
++ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
+ }
+
+ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+@@ -6043,6 +6047,7 @@ static inline bool cpu_overutilized(int
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
++ /* Return true only if the utilization doesn't fit CPU's capacity */
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+ }
+
+@@ -6836,6 +6841,7 @@ static int
+ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+ {
+ unsigned long task_util, util_min, util_max, best_cap = 0;
++ int fits, best_fits = 0;
+ int cpu, best_cpu = -1;
+ struct cpumask *cpus;
+
+@@ -6851,12 +6857,28 @@ select_idle_capacity(struct task_struct
+
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ continue;
+- if (util_fits_cpu(task_util, util_min, util_max, cpu))
++
++ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
++
++ /* This CPU fits with all requirements */
++ if (fits > 0)
+ return cpu;
++ /*
++ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
++ * Look for the CPU with best capacity.
++ */
++ else if (fits < 0)
++ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+
+- if (cpu_cap > best_cap) {
++ /*
++ * First, select CPU which fits better (-1 being better than 0).
++ * Then, select the one with best capacity at same level.
++ */
++ if ((fits < best_fits) ||
++ ((fits == best_fits) && (cpu_cap > best_cap))) {
+ best_cap = cpu_cap;
+ best_cpu = cpu;
++ best_fits = fits;
+ }
+ }
+
+@@ -6869,7 +6891,11 @@ static inline bool asym_fits_cpu(unsigne
+ int cpu)
+ {
+ if (sched_asym_cpucap_active())
+- return util_fits_cpu(util, util_min, util_max, cpu);
++ /*
++ * Return true only if the cpu fully fits the task requirements
++ * which include the utilization and the performance hints.
++ */
++ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+
+ return true;
+ }
+@@ -7236,6 +7262,9 @@ static int find_energy_efficient_cpu(str
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
++ int prev_fits = -1, best_fits = -1;
++ unsigned long best_thermal_cap = 0;
++ unsigned long prev_thermal_cap = 0;
+ struct sched_domain *sd;
+ struct perf_domain *pd;
+ struct energy_env eenv;
+@@ -7271,6 +7300,7 @@ static int find_energy_efficient_cpu(str
+ unsigned long prev_spare_cap = 0;
+ int max_spare_cap_cpu = -1;
+ unsigned long base_energy;
++ int fits, max_fits = -1;
+
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+@@ -7320,7 +7350,9 @@ static int find_energy_efficient_cpu(str
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+- if (!util_fits_cpu(util, util_min, util_max, cpu))
++
++ fits = util_fits_cpu(util, util_min, util_max, cpu);
++ if (!fits)
+ continue;
+
+ lsub_positive(&cpu_cap, util);
+@@ -7328,7 +7360,9 @@ static int find_energy_efficient_cpu(str
+ if (cpu == prev_cpu) {
+ /* Always use prev_cpu as a candidate. */
+ prev_spare_cap = cpu_cap;
+- } else if (cpu_cap > max_spare_cap) {
++ prev_fits = fits;
++ } else if ((fits > max_fits) ||
++ ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * among the remaining CPUs in the performance
+@@ -7336,6 +7370,7 @@ static int find_energy_efficient_cpu(str
+ */
+ max_spare_cap = cpu_cap;
+ max_spare_cap_cpu = cpu;
++ max_fits = fits;
+ }
+ }
+
+@@ -7354,26 +7389,50 @@ static int find_energy_efficient_cpu(str
+ if (prev_delta < base_energy)
+ goto unlock;
+ prev_delta -= base_energy;
++ prev_thermal_cap = cpu_thermal_cap;
+ best_delta = min(best_delta, prev_delta);
+ }
+
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
++ /* Current best energy cpu fits better */
++ if (max_fits < best_fits)
++ continue;
++
++ /*
++ * Both don't fit performance hint (i.e. uclamp_min)
++ * but best energy cpu has better capacity.
++ */
++ if ((max_fits < 0) &&
++ (cpu_thermal_cap <= best_thermal_cap))
++ continue;
++
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
+ goto unlock;
+ cur_delta -= base_energy;
+- if (cur_delta < best_delta) {
+- best_delta = cur_delta;
+- best_energy_cpu = max_spare_cap_cpu;
+- }
++
++ /*
++ * Both fit for the task but best energy cpu has lower
++ * energy impact.
++ */
++ if ((max_fits > 0) && (best_fits > 0) &&
++ (cur_delta >= best_delta))
++ continue;
++
++ best_delta = cur_delta;
++ best_energy_cpu = max_spare_cap_cpu;
++ best_fits = max_fits;
++ best_thermal_cap = cpu_thermal_cap;
+ }
+ }
+ rcu_read_unlock();
+
+- if (best_delta < prev_delta)
++ if ((best_fits > prev_fits) ||
++ ((best_fits > 0) && (best_delta < prev_delta)) ||
++ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ target = best_energy_cpu;
+
+ return target;
+@@ -10183,24 +10242,23 @@ static struct sched_group *find_busiest_
+ */
+ update_sd_lb_stats(env, &sds);
+
+- if (sched_energy_enabled()) {
+- struct root_domain *rd = env->dst_rq->rd;
+-
+- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+- goto out_balanced;
+- }
+-
+- local = &sds.local_stat;
+- busiest = &sds.busiest_stat;
+-
+ /* There is no busy sibling group to pull tasks from */
+ if (!sds.busiest)
+ goto out_balanced;
+
++ busiest = &sds.busiest_stat;
++
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
++ if (sched_energy_enabled()) {
++ struct root_domain *rd = env->dst_rq->rd;
++
++ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
++ goto out_balanced;
++ }
++
+ /* ASYM feature bypasses nice load balance check */
+ if (busiest->group_type == group_asym_packing)
+ goto force_balance;
+@@ -10213,6 +10271,7 @@ static struct sched_group *find_busiest_
+ if (busiest->group_type == group_imbalanced)
+ goto force_balance;
+
++ local = &sds.local_stat;
+ /*
+ * If the local group is busier than the selected busiest group
+ * don't try and pull any tasks.