git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: remove _total suffix for monotonic metrics
author: Eero Tamminen <eero.t.tamminen@intel.com>
Tue, 16 Apr 2024 18:05:26 +0000 (21:05 +0300)
committer: Matthias Runge <mrunge@matthias-runge.de>
Tue, 14 May 2024 15:40:45 +0000 (17:40 +0200)
The "write_prometheus" plugin already appends the "_total" suffix
unconditionally to the names of all monotonic metric types, so the
gpu_sysman plugin no longer adds it itself.

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
src/gpu_sysman.c
src/gpu_sysman_test.c

index 0660e1d9fdfb25ddf8e4649397997eed6857e312..ae582277b33f8652e87deb1d2b9285811e393c60 100644 (file)
@@ -1101,46 +1101,46 @@ static bool gpu_ras(gpu_device_t *gpu) {
         // https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
       case ZES_RAS_ERROR_CAT_RESET:
         help = "Total count of HW accelerator resets attempted by the driver";
-        catname = METRIC_PREFIX "resets_total";
+        catname = METRIC_PREFIX "resets";
         correctable = false;
         break;
       case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
         help =
             "Total count of (non-correctable) HW exceptions generated by the "
             "way workloads program the HW";
-        catname = METRIC_PREFIX "programming_errors_total";
+        catname = METRIC_PREFIX "programming_errors";
         correctable = false;
         break;
       case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
         help =
             "total count of (non-correctable) low-level driver communication "
             "errors";
-        catname = METRIC_PREFIX "driver_errors_total";
+        catname = METRIC_PREFIX "driver_errors";
         correctable = false;
         break;
         // categories which can have both correctable and uncorrectable errors
       case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
         help = "Total count of errors that have occurred in the (shader) "
                "accelerator HW";
-        catname = METRIC_PREFIX "compute_errors_total";
+        catname = METRIC_PREFIX "compute_errors";
         break;
       case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
         help = "Total count of errors that have occurred in the fixed-function "
                "accelerator HW";
-        catname = METRIC_PREFIX "fixed_function_errors_total";
+        catname = METRIC_PREFIX "fixed_function_errors";
         break;
       case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
         help = "Total count of ECC errors that have occurred in the on-chip "
                "caches";
-        catname = METRIC_PREFIX "cache_errors_total";
+        catname = METRIC_PREFIX "cache_errors";
         break;
       case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
         help = "Total count of ECC errors that have occurred in the display";
-        catname = METRIC_PREFIX "display_errors_total";
+        catname = METRIC_PREFIX "display_errors";
         break;
       default:
         help = "Total count of errors in unsupported categories";
-        catname = METRIC_PREFIX "unknown_errors_total";
+        catname = METRIC_PREFIX "unknown_errors";
       }
       if (correctable) {
         ras_submit(gpu, catname, help, type, subdev, value);
@@ -1148,7 +1148,7 @@ static bool gpu_ras(gpu_device_t *gpu) {
         ras_submit(gpu, catname, help, NULL, subdev, value);
       }
     }
-    catname = METRIC_PREFIX "all_errors_total";
+    catname = METRIC_PREFIX "all_errors";
     help = "Total count of errors in all categories";
     ras_submit(gpu, catname, help, type, subdev, total);
     ok = true;
@@ -1390,7 +1390,7 @@ static bool gpu_mems_bw(gpu_device_t *gpu) {
   };
   metric_family_t fam_counter = {
       .help = "Memory bandwidth usage total (in bytes)",
-      .name = METRIC_PREFIX "memory_bw_bytes_total",
+      .name = METRIC_PREFIX "memory_bw_bytes",
       .type = METRIC_TYPE_COUNTER,
   };
   metric_t metric = {0};
@@ -1768,7 +1768,7 @@ static bool gpu_freqs_throttle(gpu_device_t *gpu) {
   };
   metric_family_t fam_counter = {
       .help = "Total time HW frequency has been throttled (in seconds)",
-      .name = METRIC_PREFIX "throttled_seconds_total",
+      .name = METRIC_PREFIX "throttled_seconds",
       .type = METRIC_TYPE_COUNTER_FP,
   };
   metric_t metric = {0};
@@ -2034,7 +2034,7 @@ static bool gpu_fabrics(gpu_device_t *gpu) {
   };
   metric_family_t fam_counter = {
       .help = "Fabric port throughput total (in bytes)",
-      .name = METRIC_PREFIX "fabric_port_bytes_total",
+      .name = METRIC_PREFIX "fabric_port_bytes",
       .type = METRIC_TYPE_COUNTER,
   };
   metric_t metric = {0};
@@ -2212,7 +2212,7 @@ static bool gpu_powers(gpu_device_t *gpu) {
   };
   metric_family_t fam_energy = {
       .help = "Total energy consumption since boot (in joules)",
-      .name = METRIC_PREFIX "energy_joules_total",
+      .name = METRIC_PREFIX "energy_joules",
       .type = METRIC_TYPE_COUNTER_FP,
   };
   metric_t metric = {0};
@@ -2363,7 +2363,7 @@ static bool gpu_engines(gpu_device_t *gpu) {
   metric_family_t fam_counter = {
       .help = "GPU engine / group execution (use / activity) time total (in "
               "seconds)",
-      .name = METRIC_PREFIX "engine_use_seconds_total",
+      .name = METRIC_PREFIX "engine_use_seconds",
       .type = METRIC_TYPE_COUNTER_FP,
   };
   metric_t metric = {0};
index 6c7010cfa1cfabbc7d67bebbe159a43738bb790f..3f70695b8de343b8bd0bbc5b2bd977aba2adb49a 100644 (file)
@@ -583,7 +583,7 @@ typedef struct {
 
 static metrics_validation_t valid_metrics[] = {
     /* gauge value changes */
-    {"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0},
+    {"all_errors", true, false, RAS_INIT, RAS_INC, 0, 0.0},
     {"frequency_mhz/actual/current/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0,
      0.0},
     {"frequency_mhz/actual/current/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0,
@@ -624,14 +624,14 @@ static metrics_validation_t valid_metrics[] = {
     {"temperature_ratio", true, false, TEMP_RATIO_INIT, TEMP_RATIO_INC, 0, 0.0},
 
     /* while counters increase, per-time incremented value should stay same */
-    {"energy_joules_total", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6,
-     0, 0.0},
+    {"energy_joules", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6, 0,
+     0.0},
     {"engine_ratio/all", true, false, COUNTER_RATIO, 0, 0, 0.0},
-    {"engine_use_seconds_total/all", true, false, COUNTER_START / 1e6,
+    {"engine_use_seconds/all", true, false, COUNTER_START / 1e6,
      COUNTER_INC / 1e6, 0, 0.0},
-    {"fabric_port_bytes_total/healthy/off/read", true, false, 2 * COUNTER_START,
+    {"fabric_port_bytes/healthy/off/read", true, false, 2 * COUNTER_START,
      2 * COUNTER_INC, 0, 0.0},
-    {"fabric_port_bytes_total/healthy/off/write", true, false, COUNTER_START,
+    {"fabric_port_bytes/healthy/off/write", true, false, COUNTER_START,
      COUNTER_INC, 0, 0.0},
     {"fabric_port_bytes_per_second/healthy/off/read", true, false,
      2 * COUNTER_RATE, 0, 0, 0.0},
@@ -641,9 +641,9 @@ static metrics_validation_t valid_metrics[] = {
      0, 0, 0.0},
     {"fabric_port_ratio/healthy/off/write", true, false, COUNTER_MAX_RATIO, 0,
      0, 0.0},
-    {"memory_bw_bytes_total/HBM/system/read", true, false, 2 * COUNTER_START,
+    {"memory_bw_bytes/HBM/system/read", true, false, 2 * COUNTER_START,
      2 * COUNTER_INC, 0, 0.0},
-    {"memory_bw_bytes_total/HBM/system/write", true, false, COUNTER_START,
+    {"memory_bw_bytes/HBM/system/write", true, false, COUNTER_START,
      COUNTER_INC, 0, 0.0},
     {"memory_bw_bytes_per_second/HBM/system/read", true, false,
      2 * COUNTER_RATE, 0, 0, 0.0},
@@ -656,7 +656,7 @@ static metrics_validation_t valid_metrics[] = {
     {"power_ratio", true, false, COUNTER_INC / POWER_LIMIT / TIME_INC, 0, 0,
      0.0},
     {"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
-    {"throttled_seconds_total/gpu", true, false, COUNTER_START / 1e6,
+    {"throttled_seconds/gpu", true, false, COUNTER_START / 1e6,
      COUNTER_INC / 1e6, 0, 0.0},
     {"throttled_ratio/gpu", true, false, COUNTER_RATIO, 0, 0, 0.0},
 };