git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: Output device ECC state with other GPU info at start
author Eero Tamminen <eero.t.tamminen@intel.com>
Thu, 22 Sep 2022 18:44:34 +0000 (21:44 +0300)
committer Matthias Runge <mrunge@matthias-runge.de>
Fri, 5 May 2023 05:51:08 +0000 (07:51 +0200)
The ECC state query (zesDeviceGetEccState) was added in L0 spec v1.4.

Requires loader version 1.8.0, released in May 2022.

(With minor cleanup comments from Alexey applied.)

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
src/gpu_sysman.c
src/gpu_sysman_test.c

index 69ec4b8af235d68526a33d162fe02846231ea17c..1910fc68a07fb34e378d76101be4a5d79242e41d 100644 (file)
@@ -411,6 +411,22 @@ static bool gpu_info(zes_device_handle_t dev, char **pci_bdf, char **pci_dev) {
     WARNING(PLUGIN_NAME ": failed to get GPU device state => 0x%x", ret);
   }
 
+  const char *eccstate = "unavailable";
+  zes_device_ecc_properties_t ecc = {.pNext = NULL};
+  if (zesDeviceGetEccState(dev, &ecc) == ZE_RESULT_SUCCESS) {
+    switch (ecc.currentState) {
+    case ZES_DEVICE_ECC_STATE_ENABLED:
+      eccstate = "enabled";
+      break;
+    case ZES_DEVICE_ECC_STATE_DISABLED:
+      eccstate = "disabled";
+      break;
+    default:
+      break;
+    }
+  }
+  INFO("- ECC state: %s", eccstate);
+
   INFO("HW identification:");
   zes_device_properties_t props = {.pNext = NULL};
   if (ret = zesDeviceGetProperties(dev, &props), ret == ZE_RESULT_SUCCESS) {
index 8d2887fb3933e628fd5db53d59371c4be6410f01..4b9389b62e4e6893e6200cb8601fcc43c102af40 100644 (file)
@@ -250,21 +250,26 @@ ze_result_t zeDeviceGetMemoryProperties(ze_device_handle_t dev, uint32_t *count,
 
 /* mock up level-zero sysman device handling API, called during gpu_init() */
 
-#define DEV_GET_ZEROED_STRUCT(callbit, getname, structtype)                    \
+#define DEV_GET_SET_STRUCT(callbit, getname, structtype, setval)               \
   ze_result_t getname(zes_device_handle_t dev, structtype *to_zero) {          \
     ze_result_t ret = dev_args_check(callbit, #getname, dev, to_zero);         \
     if (ret == ZE_RESULT_SUCCESS) {                                            \
       assert(!to_zero->pNext);                                                 \
       memset(to_zero, 0, sizeof(*to_zero));                                    \
+      setval;                                                                  \
     }                                                                          \
     return ret;                                                                \
   }
 
-DEV_GET_ZEROED_STRUCT(5, zesDeviceGetProperties, zes_device_properties_t)
-DEV_GET_ZEROED_STRUCT(6, zesDevicePciGetProperties, zes_pci_properties_t)
-DEV_GET_ZEROED_STRUCT(7, zesDeviceGetState, zes_device_state_t)
+DEV_GET_SET_STRUCT(5, zesDeviceGetProperties, zes_device_properties_t, )
+DEV_GET_SET_STRUCT(6, zesDevicePciGetProperties, zes_pci_properties_t, )
+DEV_GET_SET_STRUCT(7, zesDeviceGetState, zes_device_state_t,
+                   to_zero->reset = (ZES_RESET_REASON_FLAG_WEDGED |
+                                     ZES_RESET_REASON_FLAG_REPAIR))
+DEV_GET_SET_STRUCT(8, zesDeviceGetEccState, zes_device_ecc_properties_t,
+                   to_zero->currentState = ZES_DEVICE_ECC_STATE_ENABLED)
 
-#define INIT_CALL_FUNCS 8
+#define INIT_CALL_FUNCS 9
 #define INIT_CALL_BITS (((uint64_t)1 << INIT_CALL_FUNCS) - 1)
 
 /* ------------------------------------------------------------------------- */