perf pmu intel: Adjust cpumaks for sub-NUMA clusters on graniterapids

author Ian Rogers <irogers@google.com>

Thu, 15 May 2025 18:14:17 +0000 (11:14 -0700)

committer Arnaldo Carvalho de Melo <acme@redhat.com>

Fri, 23 May 2025 02:15:48 +0000 (23:15 -0300)
author Ian Rogers <irogers@google.com>
Thu, 15 May 2025 18:14:17 +0000 (11:14 -0700)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Fri, 23 May 2025 02:15:48 +0000 (23:15 -0300)
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c

index 8712cbbbc712239559a6e42a0249d3876fa9c68b..58113482654b79d85e3373f4dcb320872ee8cdf4 100644 (file)
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -8,6 +8,8 @@
  #include <linux/perf_event.h>
  #include <linux/zalloc.h>
  #include <api/fs/fs.h>
+#include <api/io_dir.h>
+#include <internal/cpumap.h>
  #include <errno.h>
  
  #include "../../../util/intel-pt.h"
@@ -16,7 +18,256 @@
  #include "../../../util/fncache.h"
  #include "../../../util/pmus.h"
  #include "mem-events.h"
+#include "util/debug.h"
  #include "util/env.h"
+#include "util/header.h"
+
+static bool x86__is_intel_graniterapids(void)
+{
+       static bool checked_if_graniterapids;
+       static bool is_graniterapids;
+
+       if (!checked_if_graniterapids) {
+               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
+               char *cpuid = get_cpuid_str((struct perf_cpu){0});
+
+               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
+               free(cpuid);
+               checked_if_graniterapids = true;
+       }
+       return is_graniterapids;
+}
+
+static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
+{
+       struct perf_cpu_map *cpus;
+       char *buf = NULL;
+       size_t buf_len;
+
+       if (sysfs__read_str(sysfs_path, &buf, &buf_len) < 0)
+               return NULL;
+
+       cpus = perf_cpu_map__new(buf);
+       free(buf);
+       return cpus;
+}
+
+static int snc_nodes_per_l3_cache(void)
+{
+       static bool checked_snc;
+       static int snc_nodes;
+
+       if (!checked_snc) {
+               struct perf_cpu_map *node_cpus =
+                       read_sysfs_cpu_map("devices/system/node/node0/cpulist");
+               struct perf_cpu_map *cache_cpus =
+                       read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
+
+               snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
+               perf_cpu_map__put(cache_cpus);
+               perf_cpu_map__put(node_cpus);
+               checked_snc = true;
+       }
+       return snc_nodes;
+}
+
+static bool starts_with(const char *str, const char *prefix)
+{
+       return !strncmp(prefix, str, strlen(prefix));
+}
+
+static int num_chas(void)
+{
+       static bool checked_chas;
+       static int num_chas;
+
+       if (!checked_chas) {
+               int fd = perf_pmu__event_source_devices_fd();
+               struct io_dir dir;
+               struct io_dirent64 *dent;
+
+               if (fd < 0)
+                       return -1;
+
+               io_dir__init(&dir, fd);
+
+               while ((dent = io_dir__readdir(&dir)) != NULL) {
+                       /* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
+                       if (starts_with(dent->d_name, "uncore_cha_"))
+                               num_chas++;
+               }
+               close(fd);
+               checked_chas = true;
+       }
+       return num_chas;
+}
+
+#define MAX_SNCS 6
+
+static int uncore_cha_snc(struct perf_pmu *pmu)
+{
+       // CHA SNC numbers are ordered correspond to the CHAs number.
+       unsigned int cha_num;
+       int num_cha, chas_per_node, cha_snc;
+       int snc_nodes = snc_nodes_per_l3_cache();
+
+       if (snc_nodes <= 1)
+               return 0;
+
+       num_cha = num_chas();
+       if (num_cha <= 0) {
+               pr_warning("Unexpected: no CHAs found\n");
+               return 0;
+       }
+
+       /* Compute SNC for PMU. */
+       if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) {
+               pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name);
+               return 0;
+       }
+       chas_per_node = num_cha / snc_nodes;
+       cha_snc = cha_num / chas_per_node;
+
+       /* Range check cha_snc. for unexpected out of bounds. */
+       return cha_snc >= MAX_SNCS ? 0 : cha_snc;
+}
+
+static int uncore_imc_snc(struct perf_pmu *pmu)
+{
+       // Compute the IMC SNC using lookup tables.
+       unsigned int imc_num;
+       int snc_nodes = snc_nodes_per_l3_cache();
+       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
+       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
+       const u8 *snc_map;
+       size_t snc_map_len;
+
+       switch (snc_nodes) {
+       case 2:
+               snc_map = snc2_map;
+               snc_map_len = ARRAY_SIZE(snc2_map);
+               break;
+       case 3:
+               snc_map = snc3_map;
+               snc_map_len = ARRAY_SIZE(snc3_map);
+               break;
+       default:
+               /* Error or no lookup support for SNC with >3 nodes. */
+               return 0;
+       }
+
+       /* Compute SNC for PMU. */
+       if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) {
+               pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
+               return 0;
+       }
+       if (imc_num >= snc_map_len) {
+               pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
+               return 0;
+       }
+       return snc_map[imc_num];
+}
+
+static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
+{
+       static bool checked_cpu_adjust[MAX_SNCS];
+       static int cpu_adjust[MAX_SNCS];
+       struct perf_cpu_map *node_cpus;
+       char node_path[] = "devices/system/node/node0/cpulist";
+
+       /* Was adjust already computed? */
+       if (checked_cpu_adjust[pmu_snc])
+               return cpu_adjust[pmu_snc];
+
+       /* SNC0 doesn't need an adjust. */
+       if (pmu_snc == 0) {
+               cpu_adjust[0] = 0;
+               checked_cpu_adjust[0] = true;
+               return 0;
+       }
+
+       /*
+        * Use NUMA topology to compute first CPU of the NUMA node, we want to
+        * adjust CPU 0 to be this and similarly for other CPUs if there is >1
+        * socket.
+        */
+       assert(pmu_snc >= 0 && pmu_snc <= 9);
+       node_path[24] += pmu_snc; // Shift node0 to be node<pmu_snc>.
+       node_cpus = read_sysfs_cpu_map(node_path);
+       cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu;
+       if (cpu_adjust[pmu_snc] < 0) {
+               pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path);
+               cpu_adjust[pmu_snc] = 0;
+       } else {
+               checked_cpu_adjust[pmu_snc] = true;
+       }
+       perf_cpu_map__put(node_cpus);
+       return cpu_adjust[pmu_snc];
+}
+
+static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
+{
+       // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
+       // topology. For example, a two socket graniterapids machine may be set
+       // up with 3-way SNC meaning there are 6 NUMA nodes that should be
+       // displayed with --per-node. The cpumask of the CHA and IMC PMUs
+       // reflects per-socket information meaning, for example, uncore_cha_60
+       // on a two socket graniterapids machine with 120 cores per socket will
+       // have a cpumask of "0,120". This cpumask needs adjusting to "40,160"
+       // to reflect that uncore_cha_60 is used for the 2nd SNC of each
+       // socket. Without the adjustment events on uncore_cha_60 will appear in
+       // node 0 and node 3 (in our example 2 socket 3-way set up), but with
+       // the adjustment they will appear in node 1 and node 4. The number of
+       // CHAs is typically larger than the number of cores. The CHA numbers
+       // are assumed to split evenly and inorder wrt core numbers. There are
+       // fewer memory IMC PMUs than cores and mapping is handled using lookup
+       // tables.
+       static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
+       static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
+       struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
+       int idx, pmu_snc, cpu_adjust;
+       struct perf_cpu cpu;
+       bool alloc;
+
+       // Cpus from the kernel holds first CPU of each socket. e.g. 0,120.
+       if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) {
+               pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name);
+               return;
+       }
+
+       pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu);
+       if (pmu_snc == 0) {
+               // No adjustment necessary for the first SNC.
+               return;
+       }
+
+       alloc = adjusted[pmu_snc] == NULL;
+       if (alloc) {
+               // Hold onto the perf_cpu_map globally to avoid recomputation.
+               cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc);
+               adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus));
+               if (!adjusted[pmu_snc])
+                       return;
+       }
+
+       perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) {
+               // Compute the new cpu map values or if not allocating, assert
+               // that they match expectations. asserts will be removed to
+               // avoid overhead in NDEBUG builds.
+               if (alloc) {
+                       RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust;
+               } else if (idx == 0) {
+                       cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu;
+                       assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust);
+               } else {
+                       assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu ==
+                              cpu.cpu + cpu_adjust);
+               }
+       }
+
+       perf_cpu_map__put(pmu->cpus);
+       pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]);
+}
  
  void perf_pmu__arch_init(struct perf_pmu *pmu)
  {
@@ -49,10 +300,17 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
  
                 perf_mem_events__loads_ldlat = 0;
                 pmu->mem_events = perf_mem_events_amd_ldlat;
-       } else if (pmu->is_core) {
-               if (perf_pmu__have_event(pmu, "mem-loads-aux"))
-                       pmu->mem_events = perf_mem_events_intel_aux;
-               else
-                       pmu->mem_events = perf_mem_events_intel;
+       } else {
+               if (pmu->is_core) {
+                       if (perf_pmu__have_event(pmu, "mem-loads-aux"))
+                               pmu->mem_events = perf_mem_events_intel_aux;
+                       else
+                               pmu->mem_events = perf_mem_events_intel;
+               } else if (x86__is_intel_graniterapids()) {
+                       if (starts_with(pmu->name, "uncore_cha_"))
+                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
+                       else if (starts_with(pmu->name, "uncore_imc_"))
+                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
+               }
         }
  }
author	Ian Rogers <irogers@google.com>
	Thu, 15 May 2025 18:14:17 +0000 (11:14 -0700)
committer	Arnaldo Carvalho de Melo <acme@redhat.com>
	Fri, 23 May 2025 02:15:48 +0000 (23:15 -0300)