#define CPU_AFFINITY_PER_THREAD (1 << 2)
#define CPU_AFFINITY_PER_CCX (1 << 3)
+/*
+ * Optional flag specific to the per-group affinity: with "loose", the
+ * threads of a group are mapped to the whole available CPU set instead
+ * of a dedicated per-group subset.
+ */
+#define CPU_AFFINITY_PER_GROUP_LOOSE (1 << 8)
+
/* CPU topology information, ha_cpuset_size() entries, allocated at boot */
int cpu_topo_maxcpus = -1; // max number of CPUs supported by OS/haproxy
int cpu_topo_lastcpu = -1; // last supposed online CPU (no need to look beyond)
} cpu_policy_conf = {
1, /* "performance" policy */
0, /* Default flags */
- 0 /* Default affinity */
+ 0, /* Default affinity */
+};
+
+struct cpu_affinity_optional {
+ char *name;
+ int affinity_flag;
+};
+
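+/* Optional second-argument values accepted after "per-group"; "auto" keeps
+ * the default behaviour.
+ */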
+static struct cpu_affinity_optional per_group_optional[] = {
+ {"loose", CPU_AFFINITY_PER_GROUP_LOOSE},
+ {"auto", 0},
+ {NULL, 0}
};
static struct cpu_affinity {
char *name;
int affinity_flags;
+ struct cpu_affinity_optional *optional;
} ha_cpu_affinity[] = {
- {"per-core", CPU_AFFINITY_PER_CORE},
- {"per-group", CPU_AFFINITY_PER_GROUP},
- {"per-thread", CPU_AFFINITY_PER_THREAD},
- {"per-ccx", CPU_AFFINITY_PER_CCX},
- {"auto", 0},
- {NULL, 0}
+ {"per-core", CPU_AFFINITY_PER_CORE, NULL},
+ {"per-group", CPU_AFFINITY_PER_GROUP, per_group_optional},
+ {"per-thread", CPU_AFFINITY_PER_THREAD, NULL},
+ {"per-ccx", CPU_AFFINITY_PER_CCX, NULL},
+ {"auto", 0, NULL},
+ {NULL, 0, NULL}
};
/* list of CPU policies for "cpu-policy". The default one is the first one. */
}
static void
-cpu_policy_assign_threads(int cpu_count, struct hap_cpuset node_cpu_set, struct hap_cpuset touse_tsid, struct hap_cpuset touse_ccx)
+cpu_policy_assign_threads(int cpu_count, int thr_count, struct hap_cpuset node_cpu_set, struct hap_cpuset touse_tsid, struct hap_cpuset touse_ccx)
{
struct hap_cpuset thrset;
+ struct hap_cpuset saved_touse_ccx;
int nb_grp;
int thr_per_grp;
int thr;
int same_core = 0;
+ int cpu_per_group;
ha_cpuset_zero(&thrset);
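+ /*
+ * Keep a private copy of the usable CCXs: the per-group selection
+ * below progressively drops exhausted CCXs from this copy only.
+ */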
+ ha_cpuset_assign(&saved_touse_ccx, &touse_ccx);
/* check that we're still within limits. If there are too many
* CPUs but enough groups left, we'll try to make more smaller
* groups, of the closest size each.
*/
- nb_grp = (cpu_count + global.maxthrpertgroup - 1) / global.maxthrpertgroup;
+ nb_grp = (thr_count + global.maxthrpertgroup - 1) / global.maxthrpertgroup;
if (nb_grp > MAX_TGROUPS - global.nbtgroups)
nb_grp = MAX_TGROUPS - global.nbtgroups;
- thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
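+ /*
+ * cpu_per_group is the number of CPUs each group may draw from,
+ * thr_per_grp the number of threads it will actually start (they
+ * differ when only one thread per core is created).
+ */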
+ cpu_per_group = (cpu_count + nb_grp - 1) / nb_grp;
+ thr_per_grp = (thr_count + nb_grp - 1) / nb_grp;
if (thr_per_grp > global.maxthrpertgroup)
thr_per_grp = global.maxthrpertgroup;
- while (nb_grp && cpu_count > 0) {
+ while (nb_grp && thr_count > 0) {
+ struct hap_cpuset group_cpuset;
+ struct hap_cpuset current_tsid;
+ struct hap_cpuset current_ccx;
+
+ ha_cpuset_zero(&group_cpuset);
+ ha_cpuset_zero(&current_tsid);
+ ha_cpuset_zero(&current_ccx);
+
/* create at most thr_per_grp threads */
- if (thr_per_grp > cpu_count)
- thr_per_grp = cpu_count;
+ if (thr_per_grp > thr_count)
+ thr_per_grp = thr_count;
if (thr_per_grp + global.nbthread > MAX_THREADS)
thr_per_grp = MAX_THREADS - global.nbthread;
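+ /*
+ * Strict per-group affinity (no "loose" option): build a dedicated
+ * CPU set for this group instead of sharing the whole node set.
+ */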
+ if ((cpu_policy_conf.affinity & (CPU_AFFINITY_PER_GROUP | CPU_AFFINITY_PER_GROUP_LOOSE)) == CPU_AFFINITY_PER_GROUP) {
+ int i = 0;
+ int next_ccx;
+
+ /*
+ * Decide which CPUs to use for this group: try to
+ * allocate them from the same CCX, and within it pick
+ * whole thread sets (same TSID) so that SMT siblings
+ * stay together.
+ */
+ while (i < cpu_per_group) {
+ int next_cpu = 0;
+ int got_cpu;
+
+ next_ccx = ha_cpuset_ffs(&saved_touse_ccx) - 1;
+
+ if (next_ccx == -1)
+ break;
+
+ while (i < cpu_per_group && (got_cpu = find_next_cpu_ccx(next_cpu, next_ccx)) != -1) {
+ int tsid;
+ int got_cpu_tsid;
+ int next_cpu_tsid = 0;
+ next_cpu = got_cpu + 1;
+ if (!ha_cpuset_isset(&node_cpu_set, ha_cpu_topo[got_cpu].idx))
+ continue;
+ tsid = ha_cpu_topo[got_cpu].ts_id;
+
+ while (i < cpu_per_group && (got_cpu_tsid = find_next_cpu_tsid(next_cpu_tsid, tsid)) != -1) {
+ next_cpu_tsid = got_cpu_tsid + 1;
+ if (!ha_cpuset_isset(&node_cpu_set, ha_cpu_topo[got_cpu_tsid].idx))
+ continue;
+ ha_cpuset_set(&group_cpuset, ha_cpu_topo[got_cpu_tsid].idx);
+ ha_cpuset_clr(&node_cpu_set, ha_cpu_topo[got_cpu_tsid].idx);
+ ha_cpuset_set(&current_tsid, tsid);
+ ha_cpuset_set(&current_ccx, next_ccx);
+ i++;
+ }
+ }
+ /*
+ * At this point there is nothing left
+ * for us in that CCX, forget about it.
+ */
+ if (i < cpu_per_group)
+ ha_cpuset_clr(&saved_touse_ccx, next_ccx);
+
+ }
+ ha_cpuset_assign(&touse_tsid, &current_tsid);
+ ha_cpuset_assign(&touse_ccx, &current_ccx);
+ } else {
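+ /*
+ * "loose" per-group or any other affinity: the group may use any
+ * CPU from the whole node/cluster set.
+ */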
+ ha_cpuset_assign(&group_cpuset, &node_cpu_set);
+ }
/* let's create the new thread group */
ha_tgroup_info[global.nbtgroups].base = global.nbthread;
ha_tgroup_info[global.nbtgroups].count = thr_per_grp;
int got_cpu;
while ((got_cpu = find_next_cpu_tsid(next_try, tsid)) != -1) {
next_try = got_cpu + 1;
- if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK)) {
+ if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK) &&
+ ha_cpuset_isset(&group_cpuset, ha_cpu_topo[got_cpu].idx)) {
ha_cpuset_set(&thrset, ha_cpu_topo[got_cpu].idx);
corenb++;
}
tsid = ha_cpuset_ffs(&touse_tsid) - 1;
while ((got_cpu = find_next_cpu_tsid(next_cpu, tsid)) != -1) {
- if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK))
+ if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK) &&
+ ha_cpuset_isset(&group_cpuset, ha_cpu_topo[got_cpu].idx))
break;
next_cpu = got_cpu + 1;
}
ha_cpuset_clr(&touse_tsid, tsid);
}
} else {
- int tid = ha_cpuset_ffs(&node_cpu_set) - 1;
+ int tid = ha_cpuset_ffs(&group_cpuset) - 1;
if (tid != -1) {
ha_cpuset_set(&thrset, tid);
ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &thrset);
} else if (cpu_policy_conf.affinity & CPU_AFFINITY_PER_CCX) {
- if (same_core == 0) {
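+ /*
+ * Walk the CCXs until one provides usable CPUs within this
+ * group's set, or none remain.
+ */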
+ while (same_core == 0) {
int l3id = ha_cpuset_ffs(&touse_ccx) - 1;
int got_cpu;
int next_try = 0;
- ha_cpuset_zero(&thrset);
+ if (l3id == -1)
+ break;
+ ha_cpuset_zero(&thrset);
while ((got_cpu = find_next_cpu_ccx(next_try, l3id)) != -1) {
next_try = got_cpu + 1;
- same_core++;
- ha_cpuset_set(&thrset, ha_cpu_topo[got_cpu].idx);
+ if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK) &&
+ ha_cpuset_isset(&group_cpuset, ha_cpu_topo[got_cpu].idx)) {
+ same_core++;
+ ha_cpuset_set(&thrset, ha_cpu_topo[got_cpu].idx);
+ }
}
ha_cpuset_clr(&touse_ccx, l3id);
}
- BUG_ON(same_core == 0);
if (ha_cpuset_ffs(&thrset) != 0)
ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &thrset);
same_core--;
} else {
/* map these threads to all the CPUs */
- ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
+ ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &group_cpuset);
}
}
- cpu_count -= thr_per_grp;
+ thr_count -= thr_per_grp;
global.nbthread += thr_per_grp;
global.nbtgroups++;
if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS)
struct hap_cpuset touse_ccx;
int cpu, cpu_start;
int cpu_count;
+ int thr_count;
int cid;
int div;
ha_cpuset_zero(&node_cpu_set);
ha_cpuset_zero(&touse_tsid);
ha_cpuset_zero(&touse_ccx);
- cid = -1; cpu_count = 0;
+ cid = -1; cpu_count = 0; thr_count = 0;
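+ /*
+ * cpu_count counts every usable CPU of the cluster; thr_count only
+ * counts the threads to create (one per core when a single thread
+ * per core is requested).
+ */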
for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
/* skip disabled and already visited CPUs */
/* make a mask of all of this cluster's CPUs */
ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
ha_cpuset_set(&touse_ccx, ha_cpu_topo[cpu].ca_id[3]);
+ cpu_count++;
if (!ha_cpuset_isset(&touse_tsid, ha_cpu_topo[cpu].ts_id)) {
- cpu_count++;
+ thr_count++;
ha_cpuset_set(&touse_tsid, ha_cpu_topo[cpu].ts_id);
} else if (!(cpu_policy_conf.flags & CPU_POLICY_ONE_THREAD_PER_CORE))
- cpu_count++;
+ thr_count++;
}
/* now cid = next cluster_id or -1 if none; cpu_count is the
ha_cpuset_set(&visited_cl_set, cid);
- cpu_policy_assign_threads(cpu_count, node_cpu_set, touse_tsid, touse_ccx);
+ cpu_policy_assign_threads(cpu_count, thr_count, node_cpu_set, touse_tsid, touse_ccx);
}
if (global.nbthread)
struct hap_cpuset touse_ccx; /* List of CCXs we'll currently use */
int cpu, cpu_start;
int cpu_count;
+ int thr_count;
int l3id;
int div;
ha_cpuset_zero(&node_cpu_set);
ha_cpuset_zero(&touse_tsid);
ha_cpuset_zero(&touse_ccx);
- l3id = -1; cpu_count = 0;
+ l3id = -1; cpu_count = 0; thr_count = 0;
for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
/* skip disabled and already visited CPUs */
/* make a mask of all of this cluster's CPUs */
ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
ha_cpuset_set(&touse_ccx, ha_cpu_topo[cpu].ca_id[3]);
+ cpu_count++;
if (!ha_cpuset_isset(&touse_tsid, ha_cpu_topo[cpu].ts_id)) {
- cpu_count++;
+ thr_count++;
ha_cpuset_set(&touse_tsid, ha_cpu_topo[cpu].ts_id);
} else if (!(cpu_policy_conf.flags & CPU_POLICY_ONE_THREAD_PER_CORE))
- cpu_count++;
+ thr_count++;
}
/* now l3id = next L3 ID or -1 if none; cpu_count is the
ha_cpuset_set(&visited_ccx_set, l3id);
- cpu_policy_assign_threads(cpu_count, node_cpu_set, touse_tsid, touse_ccx);
+ cpu_policy_assign_threads(cpu_count, thr_count, node_cpu_set, touse_tsid, touse_ccx);
}
if (global.nbthread)
{
int i;
- if (too_many_args(1, args, err, NULL))
+ if (too_many_args(2, args, err, NULL))
return -1;
for (i = 0; ha_cpu_affinity[i].name != NULL; i++) {
if (!strcmp(args[1], ha_cpu_affinity[i].name)) {
cpu_policy_conf.affinity |= ha_cpu_affinity[i].affinity_flags;
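+ /* an optional second argument may refine some affinities (e.g. "per-group loose") */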
+ if (*args[2] != 0) {
+ struct cpu_affinity_optional *optional = ha_cpu_affinity[i].optional;
+
+ if (optional) {
+ for (i = 0; optional[i].name; i++) {
+ if (!strcmp(args[2], optional[i].name)) {
+ cpu_policy_conf.affinity |= optional[i].affinity_flag;
+ return 0;
+ }
+ }
+ }
+ memprintf(err, "'%s' provided with unknown optional argument '%s'. ", args[1], args[2]);
+ if (optional) {
+ memprintf(err, "%s Known values are :", *err);
+ for (i = 0; optional[i].name != NULL; i++)
+ memprintf(err, "%s %s", *err, optional[i].name);
+ }
+ return -1;
+ }
return 0;
}
}