return ZE_RESULT_SUCCESS;
}
+/* Add a "throttled_by" label telling why the frequency is throttled.
+ *
+ * Nothing is added when 'reasons' has no bits set. When exactly one
+ * table entry matches, its name is used; as soon as a second entry
+ * matches, the label value is "many". Non-zero 'reasons' matching no
+ * table entry at all gives "unknown".
+ */
+static void set_freq_throttled_label(metric_t *metric,
+                                     zes_freq_throttle_reason_flags_t reasons) {
+  static const struct {
+    zes_freq_throttle_reason_flags_t bit;
+    const char *name;
+  } known[] = {
+      {ZES_FREQ_THROTTLE_REASON_FLAG_AVE_PWR_CAP, "average-power"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_BURST_PWR_CAP, "burst-power"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT, "current"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_THERMAL_LIMIT, "temperature"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_PSU_ALERT, "PSU-alert"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_SW_RANGE, "SW-freq-range"},
+      {ZES_FREQ_THROTTLE_REASON_FLAG_HW_RANGE, "HW-freq-range"},
+  };
+
+  if (!reasons) {
+    return; /* no reason bits set, metric is left without the label */
+  }
+
+  /* fallback for bits that none of the table entries cover */
+  const char *label = "unknown";
+  unsigned int matches = 0;
+  for (unsigned int idx = 0; idx < STATIC_ARRAY_SIZE(known); idx++) {
+    if (!(reasons & known[idx].bit)) {
+      continue;
+    }
+    if (++matches > 1) {
+      /* second hit: individual names are no longer reported */
+      label = "many";
+      break;
+    }
+    label = known[idx].name;
+  }
+  metric_label_set(metric, "throttled_by", label);
+}
+
/* Report frequency domains request & actual frequency, return true for success
*
* See gpu_read() on 'cache_idx' usage.
double value;
if (config.samples < 2) {
+ set_freq_throttled_label(&metric, gpu->frequency[0][i].throttleReasons);
/* negative value = unsupported:
* https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv416zes_freq_state_t
*/
*/
double req_min = 1.0e12, req_max = -1.0e12;
double act_min = 1.0e12, act_max = -1.0e12;
+ zes_freq_throttle_reason_flags_t reasons = 0;
for (uint32_t j = 0; j < config.samples; j++) {
+ reasons |= gpu->frequency[j][i].throttleReasons;
value = gpu->frequency[j][i].request;
if (value < req_min) {
req_min = value;
act_max = value;
}
}
+ set_freq_throttled_label(&metric, reasons);
if (req_max >= 0.0) {
metric.value.gauge = req_min;
metric_label_set(&metric, "type", "request");
engine_stats.timestamp += TIME_INC)
static zes_freq_properties_t freq_props = {.max = FREQ_LIMIT};
-static zes_freq_state_t freq_state = {.request = FREQ_INIT,
- .actual = FREQ_INIT};
+static zes_freq_state_t freq_state = { /* throttled by current limit */
+ .throttleReasons = ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT,
+ .request = FREQ_INIT, /* request and actual start at the same value */
+ .actual = FREQ_INIT};
ADD_METRIC(3, zesDeviceEnumFrequencyDomains, zes_freq_handle_t,
zesFrequencyGetProperties, zes_freq_properties_t, freq_props,
static metrics_validation_t valid_metrics[] = {
/* gauge value changes */
{"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0},
- {"frequency_mhz/actual/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0, 0.0},
- {"frequency_mhz/actual/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0, 0.0},
- {"frequency_mhz/actual/gpu", false, false, FREQ_INIT, FREQ_INC, 0, 0.0},
- {"frequency_mhz/request/gpu/min", true, true, FREQ_INIT, 2 * FREQ_INC, 0,
+ {"frequency_mhz/actual/current/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0,
0.0},
- {"frequency_mhz/request/gpu/max", true, true, FREQ_INIT, 2 * FREQ_INC, 0,
+ {"frequency_mhz/actual/current/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0,
0.0},
- {"frequency_mhz/request/gpu", false, false, FREQ_INIT, 2 * FREQ_INC, 0,
+ {"frequency_mhz/actual/current/gpu", false, false, FREQ_INIT, FREQ_INC, 0,
0.0},
- {"frequency_ratio/actual/gpu/min", true, true, FREQ_RATIO_INIT,
+ {"frequency_mhz/request/current/gpu/min", true, true, FREQ_INIT,
+ 2 * FREQ_INC, 0, 0.0},
+ {"frequency_mhz/request/current/gpu/max", true, true, FREQ_INIT,
+ 2 * FREQ_INC, 0, 0.0},
+ {"frequency_mhz/request/current/gpu", false, false, FREQ_INIT, 2 * FREQ_INC,
+ 0, 0.0},
+ {"frequency_ratio/actual/current/gpu/min", true, true, FREQ_RATIO_INIT,
FREQ_RATIO_INC, 0, 0.0},
- {"frequency_ratio/actual/gpu/max", true, true, FREQ_RATIO_INIT,
+ {"frequency_ratio/actual/current/gpu/max", true, true, FREQ_RATIO_INIT,
FREQ_RATIO_INC, 0, 0.0},
- {"frequency_ratio/actual/gpu", false, false, FREQ_RATIO_INIT,
+ {"frequency_ratio/actual/current/gpu", false, false, FREQ_RATIO_INIT,
FREQ_RATIO_INC, 0, 0.0},
- {"frequency_ratio/request/gpu/min", true, true, FREQ_RATIO_INIT,
+ {"frequency_ratio/request/current/gpu/min", true, true, FREQ_RATIO_INIT,
2 * FREQ_RATIO_INC, 0, 0.0},
- {"frequency_ratio/request/gpu/max", true, true, FREQ_RATIO_INIT,
+ {"frequency_ratio/request/current/gpu/max", true, true, FREQ_RATIO_INIT,
2 * FREQ_RATIO_INC, 0, 0.0},
- {"frequency_ratio/request/gpu", false, false, FREQ_RATIO_INIT,
+ {"frequency_ratio/request/current/gpu", false, false, FREQ_RATIO_INIT,
2 * FREQ_RATIO_INC, 0, 0.0},
{"memory_used_bytes/HBM/system/min", true, true, MEMORY_INIT, MEMORY_INC, 0,
0.0},