the limit of 32 or 64 depending on the system. This is
the default policy.
+ - group-by-2-ccx same as "group-by-ccx" below but create a group every
+ two CCX. This can make sense on CPUs having many CCX of
+ few cores each, to avoid creating many groups, or to
+ smooth the distribution a little bit when not all cores
+ are in use. Please note that it can have very bad
+ performance effects when the communication between CCX
+ is slow. This is generally recommended against.
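+
+ For illustration, assuming a hypothetical CPU with 4 CCX
+ of 8 cores each, this would create 2 groups of 16
+ threads instead of the 4 groups of 8 that "group-by-ccx"
+ would produce:
+
+     global
+         cpu-policy group-by-2-ccx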
+
- group-by-2-clusters same as "group-by-cluster" but create a group every
two clusters. This can make sense on CPUs having many
clusters of few cores each, to avoid creating many
groups, or to smooth the distribution a little bit when
not all cores are in use. Please note that it can have
very bad performance effects when the communication
between clusters is slow. This is generally recommended
against.
+ - group-by-3-ccx same as "group-by-ccx" below but create a group every
+ three CCX. This can make sense on CPUs having many CCX
+ of few cores each, to avoid creating many groups, or to
+ smooth the distribution a little bit when not all cores
+ are in use. Please note that it can have very bad
+ performance effects when the communication between CCX
+ is slow. This is generally recommended against.
+
- group-by-3-clusters same as "group-by-cluster" but create a group every
three clusters. This can make sense on CPUs having many
clusters of few cores each, to avoid creating many
groups, or to smooth the distribution a little bit when
not all cores are in use. Please note that it can have
very bad performance effects when the communication
between clusters is slow. This is generally recommended
against.
+ - group-by-4-ccx same as "group-by-ccx" below but create a group every
+ four CCX. This can make sense on CPUs having many CCX
+ of few cores each, to avoid creating many groups, or to
+ smooth the distribution a little bit when not all cores
+ are in use. Please note that it can have very bad
+ performance effects when the communication between CCX
+ is slow. This is generally recommended against.
+
- group-by-4-clusters same as "group-by-cluster" but create a group every
four clusters. This can make sense on CPUs having many
clusters of few cores each, to avoid creating many
groups, or to smooth the distribution a little bit when
not all cores are in use. Please note that it can have
very bad performance effects when the communication
between clusters is slow. This is generally recommended
against.
+ - group-by-ccx if neither "nbthread" nor "thread-groups" were set, then
+ one thread group is created for each CPU core complex
+ ("CCX") with available CPUs, each with as many threads
+ as CPUs. A CCX groups CPUs with similarly fast
+ access to the last level cache ("LLC"), typically the
+ L3 cache. On most modern machines, it is critical for
+ performance not to mix CPUs from distant CCX in the
+ same thread group. All threads of a group are then
+ bound to all CPUs of the CCX so that intra-group
+ communications remain local to the CCX without
+ enforcing too strong a binding. The per-group thread
+ limits and thread-group limits are respected. This is
+ recommended on multi-socket and NUMA systems, as well
+ as CPUs with bad inter-CCX latencies.
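+
+ For illustration, on a hypothetical CPU with 2 CCX of 8
+ cores each, this would create 2 groups of 8 threads,
+ each group being bound to the 8 CPUs of its own CCX:
+
+     global
+         cpu-policy group-by-ccx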
+
- group-by-cluster if neither "nbthread" nor "thread-groups" were set, then
one thread group is created for each CPU cluster with
available CPUs, each with as many threads as CPUs. All
per-group thread limits and thread-group limits are
respected. This is recommended on multi-socket and NUMA
systems, as well as CPUs with bad inter-CCX latencies.
+ On most server machines, clusters and CCX are the same,
+ but on heterogeneous machines ("performance" vs
+ "efficiency" or "big" vs "little"), a cluster will
+ generally cover only the part of a CCX composed
+ of very similar CPUs (same type, +/-5% frequency
+ difference max). The difference is visible on modern
+ laptops and desktop machines used by developers and
+ admins to validate setups.
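+
+ For illustration, on a hypothetical hybrid CPU made of 6
+ performance cores and 8 efficiency cores in two clusters
+ sharing a single L3 cache, this policy would create two
+ groups of 6 and 8 threads, where "group-by-ccx" would
+ create a single group of 14 threads.
+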
- performance exactly like group-by-cluster above, except that CPU
clusters whose performance is less than half of the
/* list of CPU policies for "cpu-policy". The default one is the first one. */
static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
+static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_efficiency(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static struct ha_cpu_policy ha_cpu_policy[] = {
{ .name = "none", .desc = "use all available CPUs", .fct = NULL },
{ .name = "first-usable-node", .desc = "use only first usable node if nbthreads not set", .fct = cpu_policy_first_usable_node, .arg = 0 },
+ { .name = "group-by-ccx", .desc = "make one thread group per CCX", .fct = cpu_policy_group_by_ccx , .arg = 1 },
+ { .name = "group-by-2-ccx", .desc = "make one thread group per 2 CCX", .fct = cpu_policy_group_by_ccx , .arg = 2 },
+ { .name = "group-by-3-ccx", .desc = "make one thread group per 3 CCX", .fct = cpu_policy_group_by_ccx , .arg = 3 },
+ { .name = "group-by-4-ccx", .desc = "make one thread group per 4 CCX", .fct = cpu_policy_group_by_ccx , .arg = 4 },
{ .name = "group-by-cluster", .desc = "make one thread group per core cluster", .fct = cpu_policy_group_by_cluster , .arg = 1 },
{ .name = "group-by-2-clusters",.desc = "make one thread group per 2 core clusters", .fct = cpu_policy_group_by_cluster , .arg = 2 },
{ .name = "group-by-3-clusters",.desc = "make one thread group per 3 core clusters", .fct = cpu_policy_group_by_cluster , .arg = 3 },
return 0;
}
+/* the "group-by-ccx" cpu-policy:
+ * - does nothing if nbthread or thread-groups are set
+ * - otherwise tries to create one thread-group per CCX (defined as the ID of
+ * the last level cache), with as many threads as CPUs in the CCX, and bind
+ * all the threads of this group to all the CPUs of the CCX. In practice, an
+ * L3 cache ID will have been assigned to each CPU, so this is what we use.
+ * Also implements the variants "group-by-2-ccx", "group-by-3-ccx" and
+ * "group-by-4-ccx".
+ */
+static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
+{
+ struct hap_cpuset visited_ccx_set;
+ struct hap_cpuset node_cpu_set;
+ int cpu, cpu_start;
+ int cpu_count;
+ int l3id;
+ int thr_per_grp, nb_grp;
+ int thr;
+ int div;
+
+ if (global.nbthread)
+ return 0;
+
+ if (global.nbtgroups)
+ return 0;
+
+ ha_cpuset_zero(&visited_ccx_set);
+
+ /* iterate over each new ccx */
+ cpu_start = 0;
+
+ /* number of CCX fused per thread group (used as a divisor of the L3 ID) */
+ div = ha_cpu_policy[policy].arg;
+ div = div ? div : 1;
+
+ while (global.nbtgroups < MAX_TGROUPS && global.nbthread < MAX_THREADS) {
+ ha_cpuset_zero(&node_cpu_set);
+ l3id = -1; cpu_count = 0;
+
+ for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
+ /* skip disabled and already visited CPUs */
+ if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
+ continue;
+
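+ /* also skip CPUs whose (possibly fused) CCX was already grouped */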
+ if (ha_cpuset_isset(&visited_ccx_set, ha_cpu_topo[cpu].ca_id[3] / div))
+ continue;
+
+ if (l3id < 0) {
+ l3id = ha_cpu_topo[cpu].ca_id[3] / div;
+ cpu_start = cpu + 1;
+ }
+ else if (l3id != ha_cpu_topo[cpu].ca_id[3] / div)
+ continue;
+
+ /* make a mask of all of this CCX's CPUs */
+ ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
+ cpu_count++;
+ }
+
+ /* now l3id = next L3 ID or -1 if none; cpu_count is the
+ * number of CPUs in this CCX, and cpu_start is the next
+ * cpu to restart from to scan for new CCX.
+ */
+ if (l3id < 0 || !cpu_count)
+ break;
+
+ ha_cpuset_set(&visited_ccx_set, l3id);
+
+ /* check that we're still within limits. If there are too many
+ * CPUs but enough groups left, we'll try to make more smaller
+ * groups, of the closest size each.
+ */
+ nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
+ if (nb_grp > MAX_TGROUPS - global.nbtgroups)
+ nb_grp = MAX_TGROUPS - global.nbtgroups;
+ thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
+ if (thr_per_grp > MAX_THREADS_PER_GROUP)
+ thr_per_grp = MAX_THREADS_PER_GROUP;
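+ /* e.g. assuming MAX_THREADS_PER_GROUP is 64, a fused CCX of
+ * 96 CPUs gives nb_grp=2 and thr_per_grp=48, i.e. two even
+ * groups of 48 threads rather than 64+32.
+ */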
+
+ while (nb_grp && cpu_count > 0) {
+ /* create at most thr_per_grp threads */
+ if (thr_per_grp > cpu_count)
+ thr_per_grp = cpu_count;
+
+ if (thr_per_grp + global.nbthread > MAX_THREADS)
+ thr_per_grp = MAX_THREADS - global.nbthread;
+
+ /* let's create the new thread group */
+ ha_tgroup_info[global.nbtgroups].base = global.nbthread;
+ ha_tgroup_info[global.nbtgroups].count = thr_per_grp;
+
+ /* assign to this group the required number of threads */
+ for (thr = 0; thr < thr_per_grp; thr++) {
+ ha_thread_info[thr + global.nbthread].tgid = global.nbtgroups + 1;
+ ha_thread_info[thr + global.nbthread].tg = &ha_tgroup_info[global.nbtgroups];
+ ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];
+ /* map these threads to all the CPUs */
+ ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
+ }
+
+ cpu_count -= thr_per_grp;
+ global.nbthread += thr_per_grp;
+ global.nbtgroups++;
+ if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS)
+ break;
+ }
+ }
+
+ if (global.nbthread)
+ ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
+ else
+ ha_diag_warning("Could not determine any CPU cluster\n");
+
+ return 0;
+}
+
/* the "performance" cpu-policy:
* - does nothing if nbthread or thread-groups are set
* - eliminates clusters whose total capacity is below half of others