git.ipfire.org Git - thirdparty/collectd.git/commitdiff
gpu_sysman: Fine-tune RAS error counter descriptions
author Eero Tamminen <eero.t.tamminen@intel.com>
Wed, 23 Feb 2022 14:42:37 +0000 (16:42 +0200)
committer Matthias Runge <mrunge@matthias-runge.de>
Mon, 12 Sep 2022 12:10:55 +0000 (14:10 +0200)
* "number" -> "count" (as they are counters)
* "occurred" -> "that have occurred" (consistency with Sysman spec)

src/gpu_sysman.c

index e98ff7a87c5238569bad232d30f71999453f0544..ea9653e12b6c6fbc106dc28b3fda0a123202e9ae 100644 (file)
@@ -806,43 +806,46 @@ static bool gpu_ras(gpu_device_t *gpu) {
         // categories which are not correctable, see:
         // https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
       case ZES_RAS_ERROR_CAT_RESET:
-        help = "Total number of GPU reset attempts by the driver";
+        help = "Total count of HW accelerator resets attempted by the driver";
         catname = METRIC_PREFIX "resets_total";
         correctable = false;
         break;
       case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
-        help = "Total number of non-correctable HW exceptions generated by the "
-               "way workloads have programmed the HW";
+        help =
+            "Total count of (non-correctable) HW exceptions generated by the "
+            "way workloads program the HW";
         catname = METRIC_PREFIX "programming_errors_total";
         correctable = false;
         break;
       case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
-        help = "total number of non-correctable low level driver communication "
-               "errors";
+        help =
+            "total count of (non-correctable) low-level driver communication "
+            "errors";
         catname = METRIC_PREFIX "driver_errors_total";
         correctable = false;
         break;
         // categories which can have both correctable and uncorrectable errors
       case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
-        help = "Total number of errors occurrend in the accelerator HW";
+        help = "Total count of errors that have occurred in the (shader) "
+               "accelerator HW";
         catname = METRIC_PREFIX "compute_errors_total";
         break;
       case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
-        help = "Total number of errors occurred in the fixed-function "
+        help = "Total count of errors that have occurred in the fixed-function "
                "accelerator HW";
         catname = METRIC_PREFIX "fixed_function_errors_total";
         break;
       case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
-        help = "Total number of ECC errors that have occurred in the on-chip "
+        help = "Total count of ECC errors that have occurred in the on-chip "
                "caches";
         catname = METRIC_PREFIX "cache_errors_total";
         break;
       case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
-        help = "Total number of ECC errors that have occurred in the display";
+        help = "Total count of ECC errors that have occurred in the display";
         catname = METRIC_PREFIX "display_errors_total";
         break;
       default:
-        help = "Total number of errors in unsupported categories";
+        help = "Total count of errors in unsupported categories";
         catname = METRIC_PREFIX "unknown_errors_total";
       }
       if (correctable) {
@@ -852,7 +855,7 @@ static bool gpu_ras(gpu_device_t *gpu) {
       }
     }
     catname = METRIC_PREFIX "all_errors_total";
-    help = "Total number of errors in all categories";
+    help = "Total count of errors in all categories";
     ras_submit(gpu, catname, help, type, subdev, total);
     ok = true;
   }