git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: Report rate variant for memory bandwidth
author: Eero Tamminen <eero.t.tamminen@intel.com>
Wed, 7 Sep 2022 17:05:56 +0000 (20:05 +0300)
committer: Matthias Runge <mrunge@matthias-runge.de>
Tue, 8 Nov 2022 12:24:36 +0000 (13:24 +0100)
Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
src/gpu_sysman.c
src/gpu_sysman_test.c

diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c
index f0e97abfc74a6fc820f1a8cbc3b348a9cc23ab4c..76c2ac4df77e3a711d94768f2fd8c6883ca36f23 100644
--- a/src/gpu_sysman.c
+++ b/src/gpu_sysman.c
@@ -1062,6 +1062,17 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {
   return ok;
 }
 
+static void add_bw_gauges(metric_t *metric, metric_family_t *fam, double reads,
+                          double writes) {
+  metric->value.gauge = reads;
+  metric_label_set(metric, "direction", "read");
+  metric_family_metric_append(fam, *metric);
+
+  metric->value.gauge = writes;
+  metric_label_set(metric, "direction", "write");
+  metric_family_metric_append(fam, *metric);
+}
+
 /* Report memory modules bandwidth usage, return true for success.
  */
 static bool gpu_mems_bw(gpu_device_t *gpu) {
@@ -1094,6 +1105,11 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
       .name = METRIC_PREFIX "memory_bw_ratio",
       .type = METRIC_TYPE_GAUGE,
   };
+  metric_family_t fam_rate = {
+      .help = "Memory bandwidth usage rate (in bytes per second)",
+      .name = METRIC_PREFIX "memory_bw_bytes_per_second",
+      .type = METRIC_TYPE_GAUGE,
+  };
   metric_family_t fam_counter = {
       .help = "Memory bandwidth usage total (in bytes)",
       .name = METRIC_PREFIX "memory_bw_bytes_total",
@@ -1101,7 +1117,9 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
   };
   metric_t metric = {0};
 
-  bool reported_ratio = false, reported_counter = false, ok = false;
+  bool reported_rate = false, reported_ratio = false, reported_counter = false;
+
+  bool ok = false;
   for (i = 0; i < mem_count; i++) {
     ze_result_t ret;
     zes_mem_bandwidth_t bw;
@@ -1127,23 +1145,24 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
       reported_counter = true;
     }
     zes_mem_bandwidth_t *old = &gpu->membw[i];
-    if (old->maxBandwidth && (config.output & OUTPUT_RATIO) &&
-        bw.timestamp > old->timestamp) {
+    if (old->timestamp && bw.timestamp > old->timestamp &&
+        (config.output & (OUTPUT_RATIO | OUTPUT_RATE))) {
       /* https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv419zes_mem_bandwidth_t
        */
       uint64_t writes = bw.writeCounter - old->writeCounter;
       uint64_t reads = bw.readCounter - old->readCounter;
       uint64_t timediff = bw.timestamp - old->timestamp;
-      double factor = 1.0e6 / (old->maxBandwidth * timediff);
-
-      metric.value.gauge = factor * writes;
-      metric_label_set(&metric, "direction", "write");
-      metric_family_metric_append(&fam_ratio, metric);
 
-      metric.value.gauge = factor * reads;
-      metric_label_set(&metric, "direction", "read");
-      metric_family_metric_append(&fam_ratio, metric);
-      reported_ratio = true;
+      if (config.output & OUTPUT_RATE) {
+        double factor = 1.0e6 / timediff;
+        add_bw_gauges(&metric, &fam_rate, factor * reads, factor * writes);
+        reported_rate = true;
+      }
+      if ((config.output & OUTPUT_RATIO) && old->maxBandwidth) {
+        double factor = 1.0e6 / (old->maxBandwidth * timediff);
+        add_bw_gauges(&metric, &fam_ratio, factor * reads, factor * writes);
+        reported_ratio = true;
+      }
     }
     *old = bw;
     ok = true;
@@ -1153,6 +1172,9 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
     if (reported_ratio) {
       gpu_submit(gpu, &fam_ratio);
     }
+    if (reported_rate) {
+      gpu_submit(gpu, &fam_rate);
+    }
     if (reported_counter) {
       gpu_submit(gpu, &fam_counter);
     }
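diff --git a/src/gpu_sysman_test.c b/src/gpu_sysman_test.c
index 6ac59d3040077f98df34c0e89b3f6babe9126a51..6a18a72f3c53fdc33407596482a85caee60b988c 100644
--- a/src/gpu_sysman_test.c
+++ b/src/gpu_sysman_test.c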
@@ -285,11 +285,14 @@ static ze_result_t metric_args_check(int callbit, const char *name,
 #define COUNTER_START 100000 // 100ms
 #define COUNTER_INC 20000    // 20ms
 #define TIME_START 5000000   // 5s in us
-#define TIME_INC 1000000     // 1s in us
+#define TIME_INC 2000000     // 2s in us
 #define COUNTER_MAX TIME_INC
 
 /* what should get reported as result of above */
 #define COUNTER_RATIO ((double)COUNTER_INC / TIME_INC)
+#define COUNTER_RATE (1.0e6 * COUNTER_INC / TIME_INC)
+#define COUNTER_MAX_RATIO                                                      \
+  (1.0e6 * COUNTER_INC / ((double)COUNTER_MAX * TIME_INC))
 
 #define FREQ_INIT 300
 #define FREQ_INC 50
@@ -502,9 +505,14 @@ static metrics_validation_t valid_metrics[] = {
      2 * COUNTER_INC, 0, 0.0},
     {"memory_bw_bytes_total/HBM/system/write", true, false, COUNTER_START,
      COUNTER_INC, 0, 0.0},
-    {"memory_bw_ratio/HBM/system/read", true, false, 2 * COUNTER_RATIO, 0, 0,
+    {"memory_bw_bytes_per_second/HBM/system/read", true, false,
+     2 * COUNTER_RATE, 0, 0, 0.0},
+    {"memory_bw_bytes_per_second/HBM/system/write", true, false, COUNTER_RATE,
+     0, 0, 0.0},
+    {"memory_bw_ratio/HBM/system/read", true, false, 2 * COUNTER_MAX_RATIO, 0,
+     0, 0.0},
+    {"memory_bw_ratio/HBM/system/write", true, false, COUNTER_MAX_RATIO, 0, 0,
      0.0},
-    {"memory_bw_ratio/HBM/system/write", true, false, COUNTER_RATIO, 0, 0, 0.0},
     {"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
     {"throttled_usecs_total/gpu", true, false, COUNTER_START, COUNTER_INC, 0,
      0.0},