From: Eero Tamminen Date: Wed, 7 Sep 2022 17:05:56 +0000 (+0300) Subject: gpu_sysman: Report rate variant for memory bandwidth X-Git-Tag: 6.0.0-rc0~111 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4a525a2170914ce80b8fbbbbf628c121b936b328;p=thirdparty%2Fcollectd.git gpu_sysman: Report rate variant for memory bandwidth Signed-off-by: Eero Tamminen --- diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c index f0e97abfc..76c2ac4df 100644 --- a/src/gpu_sysman.c +++ b/src/gpu_sysman.c @@ -1062,6 +1062,17 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) { return ok; } +static void add_bw_gauges(metric_t *metric, metric_family_t *fam, double reads, + double writes) { + metric->value.gauge = reads; + metric_label_set(metric, "direction", "read"); + metric_family_metric_append(fam, *metric); + + metric->value.gauge = writes; + metric_label_set(metric, "direction", "write"); + metric_family_metric_append(fam, *metric); +} + /* Report memory modules bandwidth usage, return true for success. */ static bool gpu_mems_bw(gpu_device_t *gpu) { @@ -1094,6 +1105,11 @@ static bool gpu_mems_bw(gpu_device_t *gpu) { .name = METRIC_PREFIX "memory_bw_ratio", .type = METRIC_TYPE_GAUGE, }; + metric_family_t fam_rate = { + .help = "Memory bandwidth usage rate (in bytes per second)", + .name = METRIC_PREFIX "memory_bw_bytes_per_second", + .type = METRIC_TYPE_GAUGE, + }; metric_family_t fam_counter = { .help = "Memory bandwidth usage total (in bytes)", .name = METRIC_PREFIX "memory_bw_bytes_total", @@ -1101,7 +1117,9 @@ static bool gpu_mems_bw(gpu_device_t *gpu) { }; metric_t metric = {0}; - bool reported_ratio = false, reported_counter = false, ok = false; + bool reported_rate = false, reported_ratio = false, reported_counter = false; + + bool ok = false; for (i = 0; i < mem_count; i++) { ze_result_t ret; zes_mem_bandwidth_t bw; @@ -1127,23 +1145,24 @@ static bool gpu_mems_bw(gpu_device_t *gpu) { reported_counter = true; } zes_mem_bandwidth_t *old = &gpu->membw[i]; - if (old->maxBandwidth && (config.output & OUTPUT_RATIO) && - bw.timestamp > old->timestamp) { + if (old->timestamp && bw.timestamp > old->timestamp && + (config.output & (OUTPUT_RATIO | OUTPUT_RATE))) { /* https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv419zes_mem_bandwidth_t */ uint64_t writes = bw.writeCounter - old->writeCounter; uint64_t reads = bw.readCounter - old->readCounter; uint64_t timediff = bw.timestamp - old->timestamp; - double factor = 1.0e6 / (old->maxBandwidth * timediff); - - metric.value.gauge = factor * writes; - metric_label_set(&metric, "direction", "write"); - metric_family_metric_append(&fam_ratio, metric); - metric.value.gauge = factor * reads; - metric_label_set(&metric, "direction", "read"); - metric_family_metric_append(&fam_ratio, metric); - reported_ratio = true; + if (config.output & OUTPUT_RATE) { + double factor = 1.0e6 / timediff; + add_bw_gauges(&metric, &fam_rate, factor * reads, factor * writes); + reported_rate = true; + } + if ((config.output & OUTPUT_RATIO) && old->maxBandwidth) { + double factor = 1.0e6 / (old->maxBandwidth * timediff); + add_bw_gauges(&metric, &fam_ratio, factor * reads, factor * writes); + reported_ratio = true; + } } *old = bw; ok = true; @@ -1153,6 +1172,9 @@ static bool gpu_mems_bw(gpu_device_t *gpu) { if (reported_ratio) { gpu_submit(gpu, &fam_ratio); } + if (reported_rate) { + gpu_submit(gpu, &fam_rate); + } if (reported_counter) { gpu_submit(gpu, &fam_counter); } diff --git a/src/gpu_sysman_test.c b/src/gpu_sysman_test.c index 6ac59d304..6a18a72f3 100644 --- a/src/gpu_sysman_test.c +++ b/src/gpu_sysman_test.c @@ -285,11 +285,14 @@ static ze_result_t metric_args_check(int callbit, const char *name, #define COUNTER_START 100000 // 100ms #define COUNTER_INC 20000 // 20ms #define TIME_START 5000000 // 5s in us -#define TIME_INC 1000000 // 1s in us +#define TIME_INC 2000000 // 2s in us #define COUNTER_MAX TIME_INC /* what should get reported as result of above */ #define COUNTER_RATIO ((double)COUNTER_INC / TIME_INC) +#define COUNTER_RATE (1.0e6 * COUNTER_INC / TIME_INC) +#define COUNTER_MAX_RATIO \ + (1.0e6 * COUNTER_INC / ((double)COUNTER_MAX * TIME_INC)) #define FREQ_INIT 300 #define FREQ_INC 50 @@ -502,9 +505,14 @@ static metrics_validation_t valid_metrics[] = { 2 * COUNTER_INC, 0, 0.0}, {"memory_bw_bytes_total/HBM/system/write", true, false, COUNTER_START, COUNTER_INC, 0, 0.0}, - {"memory_bw_ratio/HBM/system/read", true, false, 2 * COUNTER_RATIO, 0, 0, + {"memory_bw_bytes_per_second/HBM/system/read", true, false, + 2 * COUNTER_RATE, 0, 0, 0.0}, + {"memory_bw_bytes_per_second/HBM/system/write", true, false, COUNTER_RATE, + 0, 0, 0.0}, + {"memory_bw_ratio/HBM/system/read", true, false, 2 * COUNTER_MAX_RATIO, 0, + 0, 0.0}, + {"memory_bw_ratio/HBM/system/write", true, false, COUNTER_MAX_RATIO, 0, 0, 0.0}, - {"memory_bw_ratio/HBM/system/write", true, false, COUNTER_RATIO, 0, 0, 0.0}, {"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0}, {"throttled_usecs_total/gpu", true, false, COUNTER_START, COUNTER_INC, 0, 0.0},