MINOR: cpu-topo: add CPU topology detection for linux

author Willy Tarreau <w@1wt.eu>

Wed, 12 Jul 2023 13:41:51 +0000 (15:41 +0200)

committer Willy Tarreau <w@1wt.eu>

Fri, 14 Mar 2025 17:30:30 +0000 (18:30 +0100)
author Willy Tarreau <w@1wt.eu>
Wed, 12 Jul 2023 13:41:51 +0000 (15:41 +0200)
committer Willy Tarreau <w@1wt.eu>
Fri, 14 Mar 2025 17:30:30 +0000 (18:30 +0100)
diff --git a/include/haproxy/cpu_topo.h b/include/haproxy/cpu_topo.h

index 74a7e70d2008f14f00bff7bca4398ccfcf27a7ee..1ef64189839b746d653bd55c6489ef3fdb7ba073 100644 (file)
--- a/include/haproxy/cpu_topo.h
+++ b/include/haproxy/cpu_topo.h
@@ -22,6 +22,9 @@ int ha_cpuset_detect_online(struct hap_cpuset *set);
   */
  int cpu_detect_usable(void);
  
+/* Detects the CPU topology based on info in /sys. The detection itself is
+ * only implemented for Linux; on other systems the function does nothing.
+ * In both cases it returns 1.
+ */
+int cpu_detect_topology(void);
+
  /* Detects CPUs that are bound to the current process. Returns the number of
   * CPUs detected or 0 if the detection failed.
   */
diff --git a/src/cpu_topo.c b/src/cpu_topo.c

index 8d47fd01f38d9ca6675d81521f4dd33afe43ff87..9166c1195b9a95c2d6e74f414b5a18e0740b013b 100644 (file)
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -208,6 +208,181 @@ static int cpu_topo_get_maxcpus(void)
         return abs_max;
  }
  
+/* CPU topology detection below, OS-specific */
+
+#if defined(__linux__)
+
+/* Detects the CPU topology based on info in /sys. Walks all CPUs from 0 to
+ * cpu_topo_lastcpu, skipping those flagged HA_CPU_F_OFFLINE, and fills the
+ * following ha_cpu_topo[] fields from the files found under
+ * NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu<N>/": ca_id[] (cache/index*),
+ * ts_id and th_cnt (topology/thread_siblings_list), cl_lid and cl_gid
+ * (topology/cluster_cpus_list or core_siblings_list), pk_id
+ * (topology/package_cpus_list, core_siblings_list or physical_package_id)
+ * and capa (cpu_capacity). Detection of a given property is only attempted
+ * for a CPU whose corresponding field is still negative; when a list of CPUs
+ * is read, all listed CPUs up to cpu_topo_lastcpu receive the same ID at
+ * once. A negative result from read_line_to_trash() simply skips the
+ * property. Always returns 1.
+ */
+int cpu_detect_topology(void)
+{
+       const char *parse_cpu_set_args[2]; // argument vector handed to parse_cpu_set()
+       struct ha_cpu_topo cpu_id = { }; /* all zeroes; holds the next ID to assign for each property */
+       int cpu;
+
+       /* now let's only focus on bound CPUs to learn more about their
+        * topology, their siblings, their cache affinity etc. We can stop
+        * at lastcpu which matches the ID of the last known bound CPU
+        * when it's set. We'll pre-assign and auto-increment indexes for
+        * thread_set_id, cluster_id, l1/l2/l3 id, etc. We don't revisit entries
+        * already filled from the list provided by another CPU.
+        */
+       for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
+               struct hap_cpuset cpus_list;
+               int next_level = 1; // assume L1 if unknown
+               int idx, level;
+               int cpu2;
+
+               if (ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE)
+                       continue;
+
+               /* First, let's check the cache hierarchy. On systems exposing
+                * it, index0 generally is the L1D cache, index1 the L1I, index2
+                * the L2 and index3 the L3. But sometimes L1I/D are reversed,
+                * and some CPUs also have L0 or L4. Maybe some heterogeneous
+                * SoCs even have inconsistent levels between clusters... Thus
+                * we'll scan all entries that we can find for each CPU and
+                * assign levels based on what is reported. The types generally
+                * are "Data", "Instruction", "Unified". We just ignore inst if
+                * found.
+                */
+               for (idx = 0; idx < 10; idx++) { // probe at most 10 cache indexes
+                       if (!is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cache/index%d", cpu, idx))
+                               break; // first missing index ends the scan for this CPU
+
+                       if (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH
+                                              "/cpu/cpu%d/cache/index%d/type", cpu, idx) >= 0 &&
+                           strcmp(trash.area, "Instruction") == 0)
+                               continue; // instruction cache, ignored
+
+                       level = next_level; // used when the "level" file cannot be read
+                       if (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH
+                                              "/cpu/cpu%d/cache/index%d/level", cpu, idx) >= 0) {
+                               level = atoi(trash.area); // note: atoi() yields 0 on non-numeric content
+                               next_level = level + 1; // default guess for the following index
+                       }
+
+                       if (level < 0 || level > 4)
+                               continue; // level out of bounds
+
+                       if (ha_cpu_topo[cpu].ca_id[level] >= 0)
+                               continue; // already filled
+
+                       if (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH
+                                              "/cpu/cpu%d/cache/index%d/shared_cpu_list", cpu, idx) >= 0) {
+                               parse_cpu_set_args[0] = trash.area;
+                               parse_cpu_set_args[1] = "\0"; // presumably marks the end of the arguments -- TODO confirm against parse_cpu_set()
+                               if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
+                                       for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
+                                               if (ha_cpuset_isset(&cpus_list, cpu2))
+                                                       ha_cpu_topo[cpu2].ca_id[level] = cpu_id.ca_id[level];
+                                       }
+                                       cpu_id.ca_id[level]++; // the next distinct cache at this level gets a new ID
+                               }
+                       }
+               }
+
+               /* Now let's try to get more info about how the cores are
+                * arranged in packages, clusters, cores, threads etc. It
+                * overlaps a bit with the cache above, but as not all systems
+                * provide all of these, they're quite complementary in fact.
+                */
+
+               /* The thread siblings list lets us figure out which CPU threads
+                * share the same cores, and also tell apart cores that
+                * support SMT from those which do not. When mixed, generally
+                * the ones with SMT are big cores and the ones without are the
+                * small ones.
+                */
+               if (ha_cpu_topo[cpu].ts_id < 0 &&
+                   read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/thread_siblings_list", cpu) >= 0) {
+                       parse_cpu_set_args[0] = trash.area;
+                       parse_cpu_set_args[1] = "\0";
+                       if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
+                               cpu_id.th_cnt = ha_cpuset_count(&cpus_list); // number of CPUs in the siblings list
+                               for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
+                                       if (ha_cpuset_isset(&cpus_list, cpu2)) {
+                                               ha_cpu_topo[cpu2].ts_id  = cpu_id.ts_id;
+                                               ha_cpu_topo[cpu2].th_cnt = cpu_id.th_cnt;
+                                       }
+                               }
+                               cpu_id.ts_id++;
+                       }
+               }
+
+               /* clusters of cores when they exist, can be smaller and more
+                * precise than core lists (e.g. big.little), otherwise use
+                * core lists as a fall back, which may also have been used
+                * above as a fallback for package but we don't care here. We
+                * only consider these values if there's more than one CPU per
+                * cluster (some kernels such as 6.1 report one cluster per CPU).
+                */
+               if (ha_cpu_topo[cpu].cl_gid < 0 &&
+                   (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/cluster_cpus_list", cpu) >= 0 ||
+                    read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu) >= 0)) { // 2nd file is only read if the 1st read failed
+                       parse_cpu_set_args[0] = trash.area;
+                       parse_cpu_set_args[1] = "\0";
+                       if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0 && ha_cpuset_count(&cpus_list) > 1) {
+                               for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
+                                       if (ha_cpuset_isset(&cpus_list, cpu2)) {
+                                               ha_cpu_topo[cpu2].cl_lid = cpu_id.cl_lid;
+                                               ha_cpu_topo[cpu2].cl_gid = cpu_id.cl_gid;
+                                       }
+                               }
+                               cpu_id.cl_lid++; // both counters advance together here
+                               cpu_id.cl_gid++;
+                       }
+               }
+
+               /* package CPUs list, like nodes, are generally a hard limit
+                * for groups, which must not span over multiple of them. On
+                * some systems, the package_cpus_list is not always provided,
+                * so we may first fall back to core_siblings_list which also
+                * exists, then to the physical package id from each CPU, whose
+                * number starts at 0. The first one is preferred because it
+                * provides a list in a single read().
+                */
+               if (ha_cpu_topo[cpu].pk_id < 0 &&
+                   (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/package_cpus_list", cpu) >= 0 ||
+                    read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu) >= 0)) { // 2nd file is only read if the 1st read failed
+                       parse_cpu_set_args[0] = trash.area;
+                       parse_cpu_set_args[1] = "\0";
+                       if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
+                               for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
+                                       if (ha_cpuset_isset(&cpus_list, cpu2))
+                                               ha_cpu_topo[cpu2].pk_id = cpu_id.pk_id;
+                               }
+                               cpu_id.pk_id++;
+                       }
+               }
+
+               if (ha_cpu_topo[cpu].pk_id < 0 && // last resort: neither list above gave a package ID
+                   read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/physical_package_id", cpu) >= 0) {
+                       if (trash.data)
+                               ha_cpu_topo[cpu].pk_id = str2uic(trash.area); // value read from the file, not taken from the cpu_id.pk_id counter
+               }
+
+               /* CPU capacity is a relative notion to compare little and big
+                * cores. Usually the values encountered in the field set the
+                * big CPU's nominal capacity to 1024 and the other ones below.
+                */
+               if (ha_cpu_topo[cpu].capa < 0 &&
+                   read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cpu_capacity", cpu) >= 0) {
+                       if (trash.data)
+                               ha_cpu_topo[cpu].capa = str2uic(trash.area);
+               }
+       }
+       return 1;
+}
+
+#else // __linux__
+/* No topology detection is implemented for this OS: this variant touches
+ * nothing and returns 1, like the Linux implementation does.
+ */
+int cpu_detect_topology(void)
+{
+       return 1;
+}
+
+#endif // OS-specific cpu_detect_topology()
+
  /* Allocates everything needed to store CPU topology at boot.
   * Returns non-zero on success, zero on failure.
   */
diff --git a/src/haproxy.c b/src/haproxy.c

index c77f88ffdac1f56d70d0c9c1fd17880e52080291..f7a389ccde2773f54b99df72df63817e9be52851 100644 (file)
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -2055,6 +2055,9 @@ static void step_init_2(int argc, char** argv)
          * to be used. Let's check which of these are usable.
          */
         cpu_detect_usable();
+
+       /* Now detect how CPUs are arranged */
+       cpu_detect_topology();
  #endif
  
         /* Note: global.nbthread will be initialized as part of this call */
author	Willy Tarreau <w@1wt.eu>
	Wed, 12 Jul 2023 13:41:51 +0000 (15:41 +0200)
committer	Willy Tarreau <w@1wt.eu>
	Fri, 14 Mar 2025 17:30:30 +0000 (18:30 +0100)
include/haproxy/cpu_topo.h		patch \| blob \| blame \| history
src/cpu_topo.c		patch \| blob \| blame \| history
src/haproxy.c		patch \| blob \| blame \| history