]> git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: usecs/ujoules -> seconds/joules metric types
authorEero Tamminen <eero.t.tamminen@intel.com>
Tue, 16 Apr 2024 15:54:10 +0000 (18:54 +0300)
committerMatthias Runge <mrunge@matthias-runge.de>
Tue, 14 May 2024 15:40:45 +0000 (17:40 +0200)
With the new floating point counter type, joules & seconds can be used
as metric units instead of microjoules & microseconds.

(Frequencies are still left as MHz though, for convenience and
compatibility with Intel XPU Manager.)

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
src/gpu_sysman.c
src/gpu_sysman_test.c

index 3cf2d1697764d54459132f7863ea62fc07aa3c8b..0660e1d9fdfb25ddf8e4649397997eed6857e312 100644 (file)
@@ -1767,9 +1767,9 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) {
       .type = METRIC_TYPE_GAUGE,
   };
   metric_family_t fam_counter = {
-      .help = "Total time HW frequency has been throttled (in microseconds)",
-      .name = METRIC_PREFIX "throttled_usecs_total",
-      .type = METRIC_TYPE_COUNTER,
+      .help = "Total time HW frequency has been throttled (in seconds)",
+      .name = METRIC_PREFIX "throttled_seconds_total",
+      .type = METRIC_TYPE_COUNTER_FP,
   };
   metric_t metric = {0};
 
@@ -1793,15 +1793,17 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) {
       break;
     }
     if (config.output & OUTPUT_BASE) {
-      /* cannot convert microsecs to secs as counters are integers */
-      metric.value.counter = throttle.throttleTime;
+      /* times are in microseconds:
+       * https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-freq-throttle-time-t
+       */
+      metric.value.counter_fp = throttle.throttleTime / 1e6;
       metric_family_metric_append(&fam_counter, metric);
       reported_base = true;
     }
     zes_freq_throttle_time_t *old = &gpu->throttle[i];
     if (old->timestamp && throttle.timestamp > old->timestamp &&
         (config.output & OUTPUT_RATIO)) {
-      /* micro seconds => throttle ratio */
+      /* throttle time & timestamp are both in microsecs */
       metric.value.gauge = (throttle.throttleTime - old->throttleTime) /
                            (double)(throttle.timestamp - old->timestamp);
       metric_family_metric_append(&fam_ratio, metric);
@@ -2209,9 +2211,9 @@ static bool gpu_powers(gpu_device_t *gpu) {
       .type = METRIC_TYPE_UP_DOWN_FP,
   };
   metric_family_t fam_energy = {
-      .help = "Total energy consumption since boot (in microjoules)",
-      .name = METRIC_PREFIX "energy_ujoules_total",
-      .type = METRIC_TYPE_COUNTER,
+      .help = "Total energy consumption since boot (in joules)",
+      .name = METRIC_PREFIX "energy_joules_total",
+      .type = METRIC_TYPE_COUNTER_FP,
   };
   metric_t metric = {0};
 
@@ -2240,7 +2242,7 @@ static bool gpu_powers(gpu_device_t *gpu) {
     }
     metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
     if (config.output & OUTPUT_BASE) {
-      metric.value.counter = counter.energy;
+      metric.value.counter_fp = counter.energy / 1e6;
       metric_family_metric_append(&fam_energy, metric);
       reported_base = true;
     }
@@ -2359,10 +2361,10 @@ static bool gpu_engines(gpu_device_t *gpu) {
       .type = METRIC_TYPE_GAUGE,
   };
   metric_family_t fam_counter = {
-      .help = "GPU engine / group execution time (activity) total (in "
-              "microseconds)",
-      .name = METRIC_PREFIX "engine_use_usecs_total",
-      .type = METRIC_TYPE_COUNTER,
+      .help = "GPU engine / group execution (use / activity) time total (in "
+              "seconds)",
+      .name = METRIC_PREFIX "engine_use_seconds_total",
+      .type = METRIC_TYPE_COUNTER_FP,
   };
   metric_t metric = {0};
 
@@ -2462,7 +2464,10 @@ static bool gpu_engines(gpu_device_t *gpu) {
     metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
     metric_label_set(&metric, "type", vname);
     if (config.output & OUTPUT_BASE) {
-      metric.value.counter = stats.activeTime;
+      /* Intel L0 backend provides times in microsecs:
+       * https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-engine-stats-t
+       */
+      metric.value.counter_fp = stats.activeTime / 1e6;
       metric_family_metric_append(&fam_counter, metric);
       reported_base = true;
     }
index 5850e5b8a4299550d85b8c32f2416c0547cd7446..6c7010cfa1cfabbc7d67bebbe159a43738bb790f 100644 (file)
@@ -74,6 +74,7 @@
 
 #define SYSMAN_UNIT_TEST_BUILD 1
 #include "gpu_sysman.c" /* test this */
+#include "testing.h"
 
 /* include metric functions + their dependencies directly, instead of
  * building & linking libcommon.a (like normal collectd builds do)?
@@ -623,10 +624,11 @@ static metrics_validation_t valid_metrics[] = {
     {"temperature_ratio", true, false, TEMP_RATIO_INIT, TEMP_RATIO_INC, 0, 0.0},
 
     /* while counters increase, per-time incremented value should stay same */
-    {"energy_ujoules_total", true, false, COUNTER_START, COUNTER_INC, 0, 0.0},
+    {"energy_joules_total", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6,
+     0, 0.0},
     {"engine_ratio/all", true, false, COUNTER_RATIO, 0, 0, 0.0},
-    {"engine_use_usecs_total/all", true, false, COUNTER_START, COUNTER_INC, 0,
-     0.0},
+    {"engine_use_seconds_total/all", true, false, COUNTER_START / 1e6,
+     COUNTER_INC / 1e6, 0, 0.0},
     {"fabric_port_bytes_total/healthy/off/read", true, false, 2 * COUNTER_START,
      2 * COUNTER_INC, 0, 0.0},
     {"fabric_port_bytes_total/healthy/off/write", true, false, COUNTER_START,
@@ -654,11 +656,20 @@ static metrics_validation_t valid_metrics[] = {
     {"power_ratio", true, false, COUNTER_INC / POWER_LIMIT / TIME_INC, 0, 0,
      0.0},
     {"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
-    {"throttled_usecs_total/gpu", true, false, COUNTER_START, COUNTER_INC, 0,
-     0.0},
+    {"throttled_seconds_total/gpu", true, false, COUNTER_START / 1e6,
+     COUNTER_INC / 1e6, 0, 0.0},
     {"throttled_ratio/gpu", true, false, COUNTER_RATIO, 0, 0, 0.0},
 };
 
+static int expect_double_eq(double expect, double actual) {
+  /* workaround for the "unused-variable" warning from testing.h */
+  fail_count__++;
+  /* macro returns -1 from this function on mismatch, else falls through */
+  EXPECT_EQ_DOUBLE(expect, actual);
+  fail_count__--;
+  return 0;
+}
+
 /* VALIDATE: reset tracked metrics values and return count of how many
  * metrics were not set since last reset.
  *
@@ -729,7 +740,7 @@ static int validate_and_reset_saved_metrics(unsigned int base_rounds,
       incrounds += multisampled / config.samples;
     }
     double expected = metric->value_init + incrounds * metric->value_inc;
-    if (last != expected) {
+    if (expect_double_eq(expected, last) != 0) {
       fprintf(
           stderr,
           "ERROR: expected %g, but got value %g for metric '%s' on round %d\n",