From 6c88e27cf4327ea95ab59fb4d478a21296987d36 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 13 May 2025 16:12:52 +0200
Subject: [PATCH] MEDIUM: cpu-topo: change "performance" to consider per-core
 capacity

Running the "performance" policy on highly heterogeneous systems yields
bad choices when there are sufficiently more small than big cores,
and/or when there are multiple cluster types, because on such setups,
the higher the frequency, the lower the number of cores, despite small
differences in frequencies. In such cases, we quickly end up with
"performance" only choosing the small or the medium cores, which is
contrary to the original intent, which was to select performance cores.
This is what happens on boards like the Orion O6 for example where only
the 4 medium cores and 2 big cores are chosen, evicting the 2 biggest
cores and the 4 smallest ones.

Here we're changing the sorting method to sort CPU clusters by average
per-CPU capacity, and we evict clusters whose per-CPU capacity falls
below 80% of the previous one. Per-core capacity allows us to detect
discrepancies between CPU cores, and to continue to focus on high
performance ones as a priority.
---
 doc/configuration.txt | 19 ++++++++++---------
 src/cpu_topo.c        | 18 +++++++++++-------
 2 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 430ab4f82..c84f7f10c 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -2098,15 +2098,16 @@ cpu-policy <policy>
                         admins to validate setups.
 
    - performance        exactly like group-by-cluster above, except that CPU
-                        clusters whose performance is less than half of the
-                        next more performant one are evicted. These are
-                        typically "little" or "efficient" cores, whose addition
-                        generally doesn't bring significant gains and can
-                        easily be counter-productive (e.g. TLS handshakes).
-                        Often, keeping such cores for other tasks such as
-                        network handling is much more effective. On development
-                        systems, these can also be used to run auxiliary tools
-                        such as load generators and monitoring tools.
+                        clusters composed of cores whose performance is less
+                        than 80% of those of the next more performant one are
+                        evicted. These are typically "little" or "efficient"
+                        cores, whose addition generally doesn't bring significant
+                        gains and can easily be counter-productive (e.g. TLS
+                        handshakes). Often, keeping such cores for other tasks
+                        such as network handling is much more effective. On
+                        development systems, these can also be used to run
+                        auxiliary tools such as load generators and monitoring
+                        tools.
 
    - resource           this is like group-by-cluster above, except that only
                         the smallest and most efficient CPU cluster will be
diff --git a/src/cpu_topo.c b/src/cpu_topo.c
index 8145309f5..759f9fab8 100644
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -1316,7 +1316,7 @@ static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int
 
 /* the "performance" cpu-policy:
  *  - does nothing if nbthread or thread-groups are set
- *  - eliminates clusters whose total capacity is below half of others
+ *  - eliminates clusters whose average capacity is less than 80% that of others
  *  - tries to create one thread-group per cluster, with as many
  *    threads as CPUs in the cluster, and bind all the threads of
  *    this group to all the CPUs of the cluster.
@@ -1329,22 +1329,26 @@ static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int
 	if (global.nbthread || global.nbtgroups)
 		return 0;
 
-	/* sort clusters by reverse capacity */
-	cpu_cluster_reorder_by_capa(ha_cpu_clusters, cpu_topo_maxcpus);
+	/* sort clusters by reverse average capacity */
+	cpu_cluster_reorder_by_avg_capa(ha_cpu_clusters, cpu_topo_maxcpus);
 
 	capa = 0;
 	for (cluster = 0; cluster < cpu_topo_maxcpus; cluster++) {
-		if (capa && ha_cpu_clusters[cluster].capa < capa / 2) {
-			/* This cluster is more than twice as slow as the
-			 * previous one, we're not interested in using it.
+		if (capa && ha_cpu_clusters[cluster].capa * 10 < ha_cpu_clusters[cluster].nb_cpu * capa * 8) {
+			/* This cluster is made of cores delivering less than
+			 * 80% of the performance of those of the previous
+			 * cluster, so we're not interested in
+			 * using it.
 			 */
 			for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
 				if (ha_cpu_topo[cpu].cl_gid == ha_cpu_clusters[cluster].idx)
 					ha_cpu_topo[cpu].st |= HA_CPU_F_IGNORED;
 			}
 		}
+		else if (ha_cpu_clusters[cluster].nb_cpu)
+			capa = ha_cpu_clusters[cluster].capa / ha_cpu_clusters[cluster].nb_cpu;
 		else
-			capa = ha_cpu_clusters[cluster].capa;
+			capa = 0;
 	}
 
 	cpu_cluster_reorder_by_index(ha_cpu_clusters, cpu_topo_maxcpus);
-- 
2.47.2