perf vendor events: Update sandybridge metrics

author Ian Rogers <irogers@google.com>

Fri, 28 Mar 2025 17:49:57 +0000 (10:49 -0700)

committer Arnaldo Carvalho de Melo <acme@redhat.com>

Fri, 25 Apr 2025 15:30:04 +0000 (12:30 -0300)
author Ian Rogers <irogers@google.com>
Fri, 28 Mar 2025 17:49:57 +0000 (10:49 -0700)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Fri, 25 Apr 2025 15:30:04 +0000 (12:30 -0300)
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json b/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json

index e95d1005e22f5b9ca69d8eb56374b17cf32f6b0a..5c9ab7680762888bc16d1aa69a945ac376dc9a60 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json
@@ -278,5 +278,13 @@
          "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE",
          "SampleAfterValue": "2000003",
          "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Valid instructions written to IQ per cycle.",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x17",
+        "EventName": "INSTS_WRITTEN_TO_IQ.INSTS",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1"
      }
  ]
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json

index 7dc7eb0d3dd3c1d1202ccf58f9438f8eeecbc334..eb8fbd14138a8daaaf445fa1784af81ee38e8612 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json
@@ -9,6 +9,7 @@
      "BvCB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "BvFB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "BvIO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BvMB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "BvML": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "BvMP": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "BvMS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -33,6 +34,7 @@
      "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "LockCont": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -48,6 +50,7 @@
      "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
      "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -75,6 +78,7 @@
      "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
      "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
      "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
+    "tma_divider_group": "Metrics contributing to tma_divider category",
      "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
      "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category",
      "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category",
@@ -99,6 +103,7 @@
      "tma_issueSmSt": "Metrics related by the issue $issueSmSt",
      "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn",
      "tma_issueTLB": "Metrics related by the issue $issueTLB",
+    "tma_itlb_misses_group": "Metrics contributing to tma_itlb_misses category",
      "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category",
      "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
      "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/other.json b/tools/perf/pmu-events/arch/x86/sandybridge/other.json

index 42692fa24b6c5fa4247cec29a0d63a430f8ecf30..970839a9c786c6c101945948a66834e231f988cd 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/sandybridge/other.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/other.json
@@ -33,14 +33,6 @@
          "SampleAfterValue": "2000003",
          "UMask": "0x2"
      },
-    {
-        "BriefDescription": "Valid instructions written to IQ per cycle.",
-        "Counter": "0,1,2,3",
-        "EventCode": "0x17",
-        "EventName": "INSTS_WRITTEN_TO_IQ.INSTS",
-        "SampleAfterValue": "2000003",
-        "UMask": "0x1"
-    },
      {
          "BriefDescription": "Cycles when L1 and L2 are locked due to UC or split lock.",
          "Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json

index ff2e515c744abebe050454e0558dc4349d0136f5..823d8b7c422466c7e994d917ff5b35e306b031cf 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
@@ -127,7 +127,7 @@
          "MetricGroup": "BvCB;TopdownL3;tma_L3_group;tma_core_bound_group",
          "MetricName": "tma_divider",
          "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
-        "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS",
+        "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
          "ScaleUnit": "100%"
      },
      {
@@ -211,7 +211,7 @@
          "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
          "MetricName": "tma_fp_vector_128b",
          "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
-        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting prior to LNL. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
          "ScaleUnit": "100%"
      },
      {
@@ -220,7 +220,7 @@
          "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
          "MetricName": "tma_fp_vector_256b",
          "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
-        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting prior to LNL. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
          "ScaleUnit": "100%"
      },
      {
@@ -240,7 +240,7 @@
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
          "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.([ICL+] Note this may overcount due to approximation using indirect events; [ADL+])",
          "ScaleUnit": "100%"
      },
      {
@@ -275,6 +275,12 @@
          "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35",
          "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_lcp"
      },
+    {
+        "BriefDescription": "Taken Branches retired Per Cycle",
+        "MetricExpr": "BR_INST_RETIRED.NEAR_TAKEN / tma_info_thread_clks",
+        "MetricGroup": "Branches;FetchBW",
+        "MetricName": "tma_info_frontend_tbpc"
+    },
      {
          "BriefDescription": "Total number of retired Instructions",
          "MetricExpr": "INST_RETIRED.ANY",
@@ -290,7 +296,7 @@
      },
      {
          "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
-        "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
+        "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time",
          "MetricGroup": "Power;Summary",
          "MetricName": "tma_info_system_core_frequency"
      },
@@ -308,14 +314,14 @@
      },
      {
          "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
-        "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
+        "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / tma_info_system_time / 1e3",
          "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
          "MetricName": "tma_info_system_dram_bw_use",
          "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth"
      },
      {
          "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / tma_info_system_time",
          "MetricGroup": "Cor;Flops;HPC",
          "MetricName": "tma_info_system_gflops",
          "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
@@ -340,6 +346,13 @@
          "MetricName": "tma_info_system_kernel_utilization",
          "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
      },
+    {
+        "BriefDescription": "PerfMon Event Multiplexing accuracy indicator",
+        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P / CPU_CLK_UNHALTED.THREAD",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_mux",
+        "MetricThreshold": "tma_info_system_mux > 1.1 | tma_info_system_mux < 0.9"
+    },
      {
          "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
          "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
@@ -352,6 +365,13 @@
          "MetricGroup": "SoC",
          "MetricName": "tma_info_system_socket_clks"
      },
+    {
+        "BriefDescription": "Run duration time in seconds",
+        "MetricExpr": "duration_time",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_time",
+        "MetricThreshold": "tma_info_system_time < 1"
+    },
      {
          "BriefDescription": "Average Frequency Utilization relative nominal frequency",
          "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
@@ -448,7 +468,7 @@
      {
          "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
          "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
-        "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
+        "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
          "MetricName": "tma_mem_bandwidth",
          "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
          "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
author	Ian Rogers <irogers@google.com>
	Fri, 28 Mar 2025 17:49:57 +0000 (10:49 -0700)
committer	Arnaldo Carvalho de Melo <acme@redhat.com>
	Fri, 25 Apr 2025 15:30:04 +0000 (12:30 -0300)
tools/perf/pmu-events/arch/x86/sandybridge/frontend.json		patch | blob | blame | history
tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json		patch | blob | blame | history
tools/perf/pmu-events/arch/x86/sandybridge/other.json		patch | blob | blame | history
tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json		patch | blob | blame | history