From: Eero Tamminen Date: Tue, 13 Sep 2022 09:15:58 +0000 (+0300) Subject: gpu_sysman: Add ratio variant for frequency metric type X-Git-Tag: 6.0.0-rc0~106 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f1afd7e4ad38f62dd47bfa1337047d3a423ec215;p=thirdparty%2Fcollectd.git gpu_sysman: Add ratio variant for frequency metric type Signed-off-by: Eero Tamminen --- diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c index a86107d8e..08c86b5f1 100644 --- a/src/gpu_sysman.c +++ b/src/gpu_sysman.c @@ -1190,11 +1190,13 @@ static bool gpu_mems_bw(gpu_device_t *gpu) { /* set frequency metric labels based on its properties, return true for success */ -static bool set_freq_labels(zes_freq_handle_t freq, metric_t *metric) { +static bool set_freq_labels(zes_freq_handle_t freq, metric_t *metric, + double *maxfreq) { zes_freq_properties_t props; if (zesFrequencyGetProperties(freq, &props) != ZE_RESULT_SUCCESS) { return false; } + *maxfreq = props.max; const char *type; switch (props.type) { case ZES_FREQ_DOMAIN_GPU: @@ -1240,14 +1242,19 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { assert(gpu->frequency); } - metric_family_t fam = { + metric_family_t fam_freq = { .help = "Sampled HW frequency (in MHz)", .name = METRIC_PREFIX "frequency_mhz", .type = METRIC_TYPE_GAUGE, }; + metric_family_t fam_ratio = { + .help = "Sampled HW frequency ratio vs (non-overclocked) max frequency", + .name = METRIC_PREFIX "frequency_ratio", + .type = METRIC_TYPE_GAUGE, + }; metric_t metric = {0}; - bool reported = false, ok = false; + bool reported_ratio = false, reported = false, ok = false; for (i = 0; i < freq_count; i++) { /* fetch freq samples */ if (zesFrequencyGetState(freqs[i], &(gpu->frequency[cache_idx][i])) != @@ -1261,7 +1268,8 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { continue; } /* process samples */ - if (!set_freq_labels(freqs[i], &metric)) { + double maxfreq; + if (!set_freq_labels(freqs[i], &metric, &maxfreq)) { ERROR(PLUGIN_NAME ": failed to get frequency domain %d properties", i); ok = false; break; @@ -1278,14 +1286,24 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { if (value >= 0) { metric.value.gauge = value; metric_label_set(&metric, "type", "request"); - metric_family_metric_append(&fam, metric); + metric_family_metric_append(&fam_freq, metric); + if ((config.output & OUTPUT_RATIO) && maxfreq > 0) { + metric.value.gauge = value / maxfreq; + metric_family_metric_append(&fam_ratio, metric); + reported_ratio = true; + } freq_ok = true; } value = gpu->frequency[0][i].actual; if (value >= 0) { metric.value.gauge = value; metric_label_set(&metric, "type", "actual"); - metric_family_metric_append(&fam, metric); + metric_family_metric_append(&fam_freq, metric); + if ((config.output & OUTPUT_RATIO) && maxfreq > 0) { + metric.value.gauge = value / maxfreq; + metric_family_metric_append(&fam_ratio, metric); + reported_ratio = true; + } freq_ok = true; } } else { @@ -1314,22 +1332,40 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { metric.value.gauge = req_min; metric_label_set(&metric, "type", "request"); metric_label_set(&metric, "function", "min"); - metric_family_metric_append(&fam, metric); - + metric_family_metric_append(&fam_freq, metric); + if ((config.output & OUTPUT_RATIO) && maxfreq > 0) { + metric.value.gauge = req_min / maxfreq; + metric_family_metric_append(&fam_ratio, metric); + reported_ratio = true; + } metric.value.gauge = req_max; metric_label_set(&metric, "function", "max"); - metric_family_metric_append(&fam, metric); + metric_family_metric_append(&fam_freq, metric); + if ((config.output & OUTPUT_RATIO) && maxfreq > 0) { + metric.value.gauge = req_max / maxfreq; + metric_family_metric_append(&fam_ratio, metric); + reported_ratio = true; + } freq_ok = true; } if (act_max >= 0.0) { metric.value.gauge = act_min; metric_label_set(&metric, "type", "actual"); metric_label_set(&metric, "function", "min"); - metric_family_metric_append(&fam, metric); - + metric_family_metric_append(&fam_freq, metric); + if ((config.output & OUTPUT_RATIO) && maxfreq > 0) { + metric.value.gauge = act_min / maxfreq; + metric_family_metric_append(&fam_ratio, metric); + reported_ratio = true; + } metric.value.gauge = act_max; metric_label_set(&metric, "function", "max"); - metric_family_metric_append(&fam, metric); + metric_family_metric_append(&fam_freq, metric); + if ((config.output & OUTPUT_RATIO) && maxfreq > 0) { + metric.value.gauge = act_max / maxfreq; + metric_family_metric_append(&fam_ratio, metric); + reported_ratio = true; + } freq_ok = true; } } @@ -1345,7 +1381,10 @@ static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) { } if (reported) { metric_reset(&metric); - gpu_submit(gpu, &fam); + gpu_submit(gpu, &fam_freq); + if (reported_ratio) { + gpu_submit(gpu, &fam_ratio); + } } free(freqs); return ok; @@ -1411,7 +1450,8 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) { ok = false; break; } - if (!set_freq_labels(freqs[i], &metric)) { + double dummy; + if (!set_freq_labels(freqs[i], &metric, &dummy)) { ERROR(PLUGIN_NAME ": failed to get frequency domain %d properties", i); ok = false; break; diff --git a/src/gpu_sysman_test.c b/src/gpu_sysman_test.c index 960f73bfc..37c2b79fb 100644 --- a/src/gpu_sysman_test.c +++ b/src/gpu_sysman_test.c @@ -294,6 +294,7 @@ static ze_result_t metric_args_check(int callbit, const char *name, #define COUNTER_MAX_RATIO \ (1.0e6 * COUNTER_INC / ((double)COUNTER_MAX * TIME_INC)) +#define FREQ_LIMIT 1600 #define FREQ_INIT 300 #define FREQ_INC 50 @@ -366,7 +367,7 @@ ADD_METRIC(0, zesDeviceEnumEngineGroups, zes_engine_handle_t, engine_stats.activeTime += COUNTER_INC, engine_stats.timestamp += TIME_INC) -static zes_freq_properties_t freq_props; +static zes_freq_properties_t freq_props = {.max = FREQ_LIMIT}; static zes_freq_state_t freq_state = {.request = FREQ_INIT, .actual = FREQ_INIT}; @@ -481,6 +482,9 @@ typedef struct { double last; } metrics_validation_t; +#define FREQ_RATIO_INIT ((double)(FREQ_INIT) / (FREQ_LIMIT)) +#define FREQ_RATIO_INC ((double)(FREQ_INC) / (FREQ_LIMIT)) + #define TEMP_RATIO_INIT ((double)(TEMP_INIT) / (TEMP_LIMIT)) #define TEMP_RATIO_INC ((double)(TEMP_INC) / (TEMP_LIMIT)) @@ -499,6 +503,18 @@ static metrics_validation_t valid_metrics[] = { 0.0}, {"frequency_mhz/request/gpu", false, false, FREQ_INIT, 2 * FREQ_INC, 0, 0.0}, + {"frequency_ratio/actual/gpu/min", true, true, FREQ_RATIO_INIT, + FREQ_RATIO_INC, 0, 0.0}, + {"frequency_ratio/actual/gpu/max", true, true, FREQ_RATIO_INIT, + FREQ_RATIO_INC, 0, 0.0}, + {"frequency_ratio/actual/gpu", false, false, FREQ_RATIO_INIT, + FREQ_RATIO_INC, 0, 0.0}, + {"frequency_ratio/request/gpu/min", true, true, FREQ_RATIO_INIT, + 2 * FREQ_RATIO_INC, 0, 0.0}, + {"frequency_ratio/request/gpu/max", true, true, FREQ_RATIO_INIT, + 2 * FREQ_RATIO_INC, 0, 0.0}, + {"frequency_ratio/request/gpu", false, false, FREQ_RATIO_INIT, + 2 * FREQ_RATIO_INC, 0, 0.0}, {"memory_used_bytes/HBM/system/min", true, true, MEMORY_INIT, MEMORY_INC, 0, 0.0}, {"memory_used_bytes/HBM/system/max", true, true, MEMORY_INIT, MEMORY_INC, 0,