MINOR: cpu-topo: add cpu-policy "group-by-cluster"

author Willy Tarreau <w@1wt.eu>

Wed, 12 Mar 2025 14:00:42 +0000 (15:00 +0100)

committer Willy Tarreau <w@1wt.eu>

Fri, 14 Mar 2025 17:33:16 +0000 (18:33 +0100)
author Willy Tarreau <w@1wt.eu>
Wed, 12 Mar 2025 14:00:42 +0000 (15:00 +0100)
committer Willy Tarreau <w@1wt.eu>
Fri, 14 Mar 2025 17:33:16 +0000 (18:33 +0100)
diff --git a/doc/configuration.txt b/doc/configuration.txt

index 270e35d311a38b02c42a1b08fc0989af9ebefd5c..fd8af6bfad061883bb24b7c040b4cd80d8cfd409 100644 (file)
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -1981,6 +1981,16 @@ cpu-policy <policy>
                          the limit of 32 or 64 depending on the system. This is
                          the default policy.
  
+   - group-by-cluster   if neither "nbthread" nor "nbtgroups" were set, then
+                        one thread group is created for each CPU cluster with
+                        available CPUs, each with as many threads as CPUs. All
+                        threads of a group are bound to all CPUs of the cluster
+                        so that intra-group communications remain local to the
+                        cluster without enforcing too strong a binding. The
+                        per-group thread limits and thread-group limits are
+                        respected. This is recommended on multi-socket and NUMA
+                        systems, as well as CPUs with bad inter-CCX latencies.
+
    See also: "cpu-map", "cpu-set", "nbthread"
  
  cpu-set <directive>...
diff --git a/src/cpu_topo.c b/src/cpu_topo.c

index edb1708e3780e7d3951cb1c06e56e44553032536..cd823272fdcfeda65e82eda8fb1f29714d4bdba3 100644 (file)
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -52,10 +52,12 @@ static int cpu_policy = 1; // "first-usable-node"
  
  /* list of CPU policies for "cpu-policy", indexed by cpu_policy. The default one is "first-usable-node" (index 1). */
  static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
+static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
  
  static struct ha_cpu_policy ha_cpu_policy[] = {
         { .name = "none",               .desc = "use all available CPUs",                           .fct = NULL   },
         { .name = "first-usable-node",  .desc = "use only first usable node if nbthreads not set",  .fct = cpu_policy_first_usable_node  },
+       { .name = "group-by-cluster",   .desc = "make one thread group per core cluster",           .fct = cpu_policy_group_by_cluster   },
         { 0 } /* end */
  };
  
@@ -959,6 +961,94 @@ static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin
         return 0;
  }
  
+/* the "group-by-cluster" cpu-policy:
+ *  - does nothing if nbthread or thread-groups are set
+ *  - otherwise tries to create one thread-group per cluster, with as many
+ *    threads as CPUs in the cluster, and bind all the threads of this group
+ *    to all the CPUs of the cluster.
+ * Clusters are taken by increasing cl_gid regardless of the position of their
+ * CPUs in ha_cpu_topo[], so that none is skipped when IDs are not sorted there.
+ * Always returns 0. <policy>, <tmin>, <tmax>, <gmin>, <gmax>, <err> are unused.
+ */
+static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
+{
+       struct hap_cpuset node_cpu_set;
+       int cpu, cpu_count;
+       int cid, lcid;
+       int thr;
+
+       if (global.nbthread)
+               return 0;
+
+       if (global.nbtgroups)
+               return 0;
+
+       /* iterate over each new cluster; lcid is the last cluster_id done */
+       lcid = -1;
+       while (global.nbtgroups < MAX_TGROUPS) {
+               /* look for the lowest cluster_id above lcid among usable CPUs */
+               cid = -1;
+               for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
+                       /* skip disabled CPUs and already visited clusters */
+                       if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
+                               continue;
+                       if (ha_cpu_topo[cpu].cl_gid <= lcid)
+                               continue;
+                       if (cid < 0 || ha_cpu_topo[cpu].cl_gid < cid)
+                               cid = ha_cpu_topo[cpu].cl_gid;
+               }
+               /* now cid = next cluster_id or -1 if none */
+               if (cid < 0)
+                       break;
+
+               /* make a mask of all of this cluster's CPUs and count them */
+               ha_cpuset_zero(&node_cpu_set);
+               cpu_count = 0;
+               for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
+                       if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
+                               continue;
+                       if (ha_cpu_topo[cpu].cl_gid != cid)
+                               continue;
+                       ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
+                       cpu_count++;
+               }
+
+               /* check that we're still within limits */
+               if (cpu_count > MAX_THREADS_PER_GROUP)
+                       cpu_count = MAX_THREADS_PER_GROUP;
+
+               if (cpu_count + global.nbthread > MAX_THREADS)
+                       cpu_count = MAX_THREADS - global.nbthread;
+
+               if (cpu_count <= 0)
+                       break;
+
+               /* let's create the new thread group */
+               ha_tgroup_info[global.nbtgroups].base  = global.nbthread;
+               ha_tgroup_info[global.nbtgroups].count = cpu_count;
+
+               /* assign to this group the required number of threads */
+               for (thr = 0; thr < cpu_count; thr++) {
+                       ha_thread_info[thr + global.nbthread].tgid = global.nbtgroups + 1;
+                       ha_thread_info[thr + global.nbthread].tg = &ha_tgroup_info[global.nbtgroups];
+                       ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];
+                       /* map these threads to all the CPUs of the cluster */
+                       ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
+               }
+
+               lcid = cid; // last cluster_id
+               global.nbthread += cpu_count;
+               global.nbtgroups++;
+       }
+
+       if (global.nbthread)
+               ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
+       else
+               ha_diag_warning("Could not determine any CPU cluster\n");
+
+       return 0;
+}
+
  /* apply the chosen CPU policy if no cpu-map was forced. Returns < 0 on failure
   * with a message in *err that must be freed by the caller if non-null.
   */
author	Willy Tarreau <w@1wt.eu>
	Wed, 12 Mar 2025 14:00:42 +0000 (15:00 +0100)
committer	Willy Tarreau <w@1wt.eu>
	Fri, 14 Mar 2025 17:33:16 +0000 (18:33 +0100)
doc/configuration.txt		patch \| blob \| blame \| history
src/cpu_topo.c		patch \| blob \| blame \| history