git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: Add memory "health" label if memory health is known
author Eero Tamminen <eero.t.tamminen@intel.com>
Mon, 14 Feb 2022 17:23:52 +0000 (19:23 +0200)
committer Matthias Runge <mrunge@matthias-runge.de>
Wed, 1 Feb 2023 06:55:27 +0000 (07:55 +0100)
Memory health is already part of the Level Zero (L0) spec v1.0.

The label is added only to the memory usage metrics, which already
query the memory state (unlike the memory BW metrics).

src/gpu_sysman.c

index 1f87bbfeeb3545801c8a48e2f5236370f615580d..af4f9790ec6d0694e3daa73fb933088cc803bafd 100644 (file)
@@ -1025,6 +1025,28 @@ static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {
       ok = false;
       break;
     }
+    /* get health status from last i.e. zeroeth sample */
+    zes_mem_health_t value = gpu->memory[0][i].health;
+    if (value != ZES_MEM_HEALTH_UNKNOWN) {
+      const char *health;
+      switch (value) {
+      case ZES_MEM_HEALTH_OK:
+        health = "ok";
+        break;
+      case ZES_MEM_HEALTH_DEGRADED:
+        health = "degraded";
+        break;
+      case ZES_MEM_HEALTH_CRITICAL:
+        health = "critical";
+        break;
+      case ZES_MEM_HEALTH_REPLACE:
+        health = "replace";
+        break;
+      default:
+        health = "unknown";
+      }
+      metric_label_set(&metric, "health", health);
+    }
     double mem_used;
     if (config.samples < 2) {
       const uint64_t mem_free = gpu->memory[0][i].free;