From 543147761e99026df30cf2f5ccee71e4787bfab1 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Fri, 16 Sep 2022 14:58:19 +0300 Subject: [PATCH] gpu_sysman: add "throttled_by" label to frequency metric Which is empty/missing when frequency is not throttled. Already in L0 spec v1.0. Signed-off-by: Eero Tamminen --- src/gpu_sysman.c | 39 +++++++++++++++++++++++++++++++++++++++ src/gpu_sysman_test.c | 33 +++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c index 95b499235..b9e2170db 100644 --- a/src/gpu_sysman.c +++ b/src/gpu_sysman.c @@ -1284,6 +1284,41 @@ static ze_result_t set_freq_labels(zes_freq_handle_t freq, metric_t *metric, return ZE_RESULT_SUCCESS; } +/* set "throttled_by" label explaining frequency throttling reason(s): name of the single matching known flag, "many" if several known flags are set, "unknown" if bits are set but none of them is a known flag; no label is set when reasons is zero */ +static void set_freq_throttled_label(metric_t *metric, + zes_freq_throttle_reason_flags_t reasons) { + static const struct { + zes_freq_throttle_reason_flags_t flag; + const char *reason; + } flags[] = { + {ZES_FREQ_THROTTLE_REASON_FLAG_AVE_PWR_CAP, "average-power"}, + {ZES_FREQ_THROTTLE_REASON_FLAG_BURST_PWR_CAP, "burst-power"}, + {ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT, "current"}, + {ZES_FREQ_THROTTLE_REASON_FLAG_THERMAL_LIMIT, "temperature"}, + {ZES_FREQ_THROTTLE_REASON_FLAG_PSU_ALERT, "PSU-alert"}, + {ZES_FREQ_THROTTLE_REASON_FLAG_SW_RANGE, "SW-freq-range"}, + {ZES_FREQ_THROTTLE_REASON_FLAG_HW_RANGE, "HW-freq-range"}, + }; + bool found = false; + const char *reason = NULL; + for (unsigned int i = 0; i < STATIC_ARRAY_SIZE(flags); i++) { + if (reasons & flags[i].flag) { + if (found) { /* a second known flag is set */ + reason = "many"; + break; + } + reason = flags[i].reason; + found = true; + } + } + if (reasons) { + if (!found) { /* bits set, but none listed in flags[] */ + reason = "unknown"; + } + metric_label_set(metric, "throttled_by", reason); + } +} + /* Report frequency domains request & actual frequency, return true for success * * See gpu_read() on 'cache_idx' usage. 
@@ -1355,6 +1390,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { double value; if (config.samples < 2) { + set_freq_throttled_label(&metric, gpu->frequency[0][i].throttleReasons); /* negative value = unsupported: * https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv416zes_freq_state_t */ @@ -1388,7 +1424,9 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { */ double req_min = 1.0e12, req_max = -1.0e12; double act_min = 1.0e12, act_max = -1.0e12; + zes_freq_throttle_reason_flags_t reasons = 0; for (uint32_t j = 0; j < config.samples; j++) { + reasons |= gpu->frequency[j][i].throttleReasons; /* union over all samples */ value = gpu->frequency[j][i].request; if (value < req_min) { req_min = value; @@ -1404,6 +1442,7 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { act_max = value; } } + set_freq_throttled_label(&metric, reasons); if (req_max >= 0.0) { metric.value.gauge = req_min; metric_label_set(&metric, "type", "request"); diff --git a/src/gpu_sysman_test.c b/src/gpu_sysman_test.c index 235a44c73..4ddcd40f1 100644 --- a/src/gpu_sysman_test.c +++ b/src/gpu_sysman_test.c @@ -370,8 +370,10 @@ ADD_METRIC(0, zesDeviceEnumEngineGroups, zes_engine_handle_t, engine_stats.timestamp += TIME_INC) static zes_freq_properties_t freq_props = {.max = FREQ_LIMIT}; -static zes_freq_state_t freq_state = {.request = FREQ_INIT, - .actual = FREQ_INIT}; +static zes_freq_state_t freq_state = { /* non-zero throttleReasons; presumably the source of "current" in the expected frequency metric names below -- TODO confirm */ + .throttleReasons = ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT, + .request = FREQ_INIT, + .actual = FREQ_INIT}; ADD_METRIC(3, zesDeviceEnumFrequencyDomains, zes_freq_handle_t, zesFrequencyGetProperties, zes_freq_properties_t, freq_props, @@ -525,26 +527,29 @@ typedef struct { static metrics_validation_t valid_metrics[] = { /* gauge value changes */ {"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0}, - {"frequency_mhz/actual/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0, 0.0}, - {"frequency_mhz/actual/gpu/max", true, true, FREQ_INIT, FREQ_INC, 
0, 0.0}, - {"frequency_mhz/actual/gpu", false, false, FREQ_INIT, FREQ_INC, 0, 0.0}, - {"frequency_mhz/request/gpu/min", true, true, FREQ_INIT, 2 * FREQ_INC, 0, + {"frequency_mhz/actual/current/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0, 0.0}, - {"frequency_mhz/request/gpu/max", true, true, FREQ_INIT, 2 * FREQ_INC, 0, + {"frequency_mhz/actual/current/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0, 0.0}, - {"frequency_mhz/request/gpu", false, false, FREQ_INIT, 2 * FREQ_INC, 0, + {"frequency_mhz/actual/current/gpu", false, false, FREQ_INIT, FREQ_INC, 0, 0.0}, - {"frequency_ratio/actual/gpu/min", true, true, FREQ_RATIO_INIT, + {"frequency_mhz/request/current/gpu/min", true, true, FREQ_INIT, + 2 * FREQ_INC, 0, 0.0}, + {"frequency_mhz/request/current/gpu/max", true, true, FREQ_INIT, + 2 * FREQ_INC, 0, 0.0}, + {"frequency_mhz/request/current/gpu", false, false, FREQ_INIT, 2 * FREQ_INC, + 0, 0.0}, + {"frequency_ratio/actual/current/gpu/min", true, true, FREQ_RATIO_INIT, FREQ_RATIO_INC, 0, 0.0}, - {"frequency_ratio/actual/gpu/max", true, true, FREQ_RATIO_INIT, + {"frequency_ratio/actual/current/gpu/max", true, true, FREQ_RATIO_INIT, FREQ_RATIO_INC, 0, 0.0}, - {"frequency_ratio/actual/gpu", false, false, FREQ_RATIO_INIT, + {"frequency_ratio/actual/current/gpu", false, false, FREQ_RATIO_INIT, FREQ_RATIO_INC, 0, 0.0}, - {"frequency_ratio/request/gpu/min", true, true, FREQ_RATIO_INIT, + {"frequency_ratio/request/current/gpu/min", true, true, FREQ_RATIO_INIT, 2 * FREQ_RATIO_INC, 0, 0.0}, - {"frequency_ratio/request/gpu/max", true, true, FREQ_RATIO_INIT, + {"frequency_ratio/request/current/gpu/max", true, true, FREQ_RATIO_INIT, 2 * FREQ_RATIO_INC, 0, 0.0}, - {"frequency_ratio/request/gpu", false, false, FREQ_RATIO_INIT, + {"frequency_ratio/request/current/gpu", false, false, FREQ_RATIO_INIT, 2 * FREQ_RATIO_INC, 0, 0.0}, {"memory_used_bytes/HBM/system/min", true, true, MEMORY_INIT, MEMORY_INC, 0, 0.0}, -- 2.47.2