git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: add "throttled_by" label to frequency metric
author: Eero Tamminen <eero.t.tamminen@intel.com>
Fri, 16 Sep 2022 11:58:19 +0000 (14:58 +0300)
committer: Matthias Runge <mrunge@matthias-runge.de>
Wed, 1 Feb 2023 06:55:27 +0000 (07:55 +0100)
The "throttled_by" label is empty/missing when frequency is not throttled.

Already in L0 spec v1.0.

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
src/gpu_sysman.c
src/gpu_sysman_test.c

index 95b499235737c63f5a346777a368cecca2658fd1..b9e2170db220b9f37a62e59c7c67f1bbbc67e13c 100644 (file)
@@ -1284,6 +1284,41 @@ static ze_result_t set_freq_labels(zes_freq_handle_t freq, metric_t *metric,
   return ZE_RESULT_SUCCESS;
 }
 
+/* set "throttled_by" label from 'reasons' flags; no label if 'reasons' is 0 */
+static void set_freq_throttled_label(metric_t *metric,
+                                     zes_freq_throttle_reason_flags_t reasons) {
+  static const struct { /* recognized flag -> string used for the label */
+    zes_freq_throttle_reason_flags_t flag;
+    const char *reason;
+  } flags[] = {
+      {ZES_FREQ_THROTTLE_REASON_FLAG_AVE_PWR_CAP, "average-power"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_BURST_PWR_CAP, "burst-power"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT, "current"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_THERMAL_LIMIT, "temperature"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_PSU_ALERT, "PSU-alert"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_SW_RANGE, "SW-freq-range"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_HW_RANGE, "HW-freq-range"},
+  };
+  bool found = false; /* true once one flags[] entry has matched */
+  const char *reason = NULL;
+  for (unsigned int i = 0; i < STATIC_ARRAY_SIZE(flags); i++) {
+    if (reasons & flags[i].flag) {
+      if (found) { /* second match: use "many" instead of a single name */
+        reason = "many";
+        break;
+      }
+      reason = flags[i].reason;
+      found = true;
+    }
+  }
+  if (reasons) { /* label is set only when at least one bit is set */
+    if (!found) { /* bits set, but none of them is listed in flags[] */
+      reason = "unknown";
+    }
+    metric_label_set(metric, "throttled_by", reason);
+  }
+}
+
 /* Report frequency domains request & actual frequency, return true for success
  *
  * See gpu_read() on 'cache_idx' usage.
@@ -1355,6 +1390,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
     double value;
 
     if (config.samples < 2) {
+      set_freq_throttled_label(&metric, gpu->frequency[0][i].throttleReasons);
       /* negative value = unsupported:
        * https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv416zes_freq_state_t
        */
@@ -1388,7 +1424,9 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
        */
       double req_min = 1.0e12, req_max = -1.0e12;
       double act_min = 1.0e12, act_max = -1.0e12;
+      zes_freq_throttle_reason_flags_t reasons = 0;
       for (uint32_t j = 0; j < config.samples; j++) {
+        reasons |= gpu->frequency[j][i].throttleReasons;
         value = gpu->frequency[j][i].request;
         if (value < req_min) {
           req_min = value;
@@ -1404,6 +1442,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
           act_max = value;
         }
       }
+      set_freq_throttled_label(&metric, reasons);
       if (req_max >= 0.0) {
         metric.value.gauge = req_min;
         metric_label_set(&metric, "type", "request");
index 235a44c73a505508411acaa362c3aabdb772b2ee..4ddcd40f196f8ef86c60e535c76c28e7294fb5b3 100644 (file)
@@ -370,8 +370,10 @@ ADD_METRIC(0, zesDeviceEnumEngineGroups, zes_engine_handle_t,
            engine_stats.timestamp += TIME_INC)
 
 static zes_freq_properties_t freq_props = {.max = FREQ_LIMIT};
-static zes_freq_state_t freq_state = {.request = FREQ_INIT,
-                                      .actual = FREQ_INIT};
+static zes_freq_state_t freq_state = {
+    /* set_freq_throttled_label() maps this flag to the string "current" */
+    .throttleReasons = ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT,
+    .request = FREQ_INIT,
+    .actual = FREQ_INIT};
 
 ADD_METRIC(3, zesDeviceEnumFrequencyDomains, zes_freq_handle_t,
            zesFrequencyGetProperties, zes_freq_properties_t, freq_props,
@@ -525,26 +527,29 @@ typedef struct {
 static metrics_validation_t valid_metrics[] = {
     /* gauge value changes */
     {"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0},
-    {"frequency_mhz/actual/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0, 0.0},
-    {"frequency_mhz/actual/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0, 0.0},
-    {"frequency_mhz/actual/gpu", false, false, FREQ_INIT, FREQ_INC, 0, 0.0},
-    {"frequency_mhz/request/gpu/min", true, true, FREQ_INIT, 2 * FREQ_INC, 0,
+    {"frequency_mhz/actual/current/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0,
      0.0},
-    {"frequency_mhz/request/gpu/max", true, true, FREQ_INIT, 2 * FREQ_INC, 0,
+    {"frequency_mhz/actual/current/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0,
      0.0},
-    {"frequency_mhz/request/gpu", false, false, FREQ_INIT, 2 * FREQ_INC, 0,
+    {"frequency_mhz/actual/current/gpu", false, false, FREQ_INIT, FREQ_INC, 0,
      0.0},
-    {"frequency_ratio/actual/gpu/min", true, true, FREQ_RATIO_INIT,
+    {"frequency_mhz/request/current/gpu/min", true, true, FREQ_INIT,
+     2 * FREQ_INC, 0, 0.0},
+    {"frequency_mhz/request/current/gpu/max", true, true, FREQ_INIT,
+     2 * FREQ_INC, 0, 0.0},
+    {"frequency_mhz/request/current/gpu", false, false, FREQ_INIT, 2 * FREQ_INC,
+     0, 0.0},
+    {"frequency_ratio/actual/current/gpu/min", true, true, FREQ_RATIO_INIT,
      FREQ_RATIO_INC, 0, 0.0},
-    {"frequency_ratio/actual/gpu/max", true, true, FREQ_RATIO_INIT,
+    {"frequency_ratio/actual/current/gpu/max", true, true, FREQ_RATIO_INIT,
      FREQ_RATIO_INC, 0, 0.0},
-    {"frequency_ratio/actual/gpu", false, false, FREQ_RATIO_INIT,
+    {"frequency_ratio/actual/current/gpu", false, false, FREQ_RATIO_INIT,
      FREQ_RATIO_INC, 0, 0.0},
-    {"frequency_ratio/request/gpu/min", true, true, FREQ_RATIO_INIT,
+    {"frequency_ratio/request/current/gpu/min", true, true, FREQ_RATIO_INIT,
      2 * FREQ_RATIO_INC, 0, 0.0},
-    {"frequency_ratio/request/gpu/max", true, true, FREQ_RATIO_INIT,
+    {"frequency_ratio/request/current/gpu/max", true, true, FREQ_RATIO_INIT,
      2 * FREQ_RATIO_INC, 0, 0.0},
-    {"frequency_ratio/request/gpu", false, false, FREQ_RATIO_INIT,
+    {"frequency_ratio/request/current/gpu", false, false, FREQ_RATIO_INIT,
      2 * FREQ_RATIO_INC, 0, 0.0},
     {"memory_used_bytes/HBM/system/min", true, true, MEMORY_INIT, MEMORY_INC, 0,
      0.0},