// categories which are not correctable, see:
// https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
case ZES_RAS_ERROR_CAT_RESET:
- help = "Total number of GPU reset attempts by the driver";
+ help = "Total count of HW accelerator resets attempted by the driver";
catname = METRIC_PREFIX "resets_total";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
- help = "Total number of non-correctable HW exceptions generated by the "
- "way workloads have programmed the HW";
+ help =
+ "Total count of (non-correctable) HW exceptions generated by the "
+ "way workloads program the HW";
catname = METRIC_PREFIX "programming_errors_total";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
- help = "total number of non-correctable low level driver communication "
- "errors";
+ help =
+ "total count of (non-correctable) low-level driver communication "
+ "errors";
catname = METRIC_PREFIX "driver_errors_total";
correctable = false;
break;
// categories which can have both correctable and uncorrectable errors
case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
- help = "Total number of errors occurrend in the accelerator HW";
+ help = "Total count of errors that have occurred in the (shader) "
+ "accelerator HW";
catname = METRIC_PREFIX "compute_errors_total";
break;
case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
- help = "Total number of errors occurred in the fixed-function "
+ help = "Total count of errors that have occurred in the fixed-function "
"accelerator HW";
catname = METRIC_PREFIX "fixed_function_errors_total";
break;
case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
- help = "Total number of ECC errors that have occurred in the on-chip "
+ help = "Total count of ECC errors that have occurred in the on-chip "
"caches";
catname = METRIC_PREFIX "cache_errors_total";
break;
case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
- help = "Total number of ECC errors that have occurred in the display";
+ help = "Total count of ECC errors that have occurred in the display";
catname = METRIC_PREFIX "display_errors_total";
break;
default:
- help = "Total number of errors in unsupported categories";
+ help = "Total count of errors in unsupported categories";
catname = METRIC_PREFIX "unknown_errors_total";
}
if (correctable) {
}
}
catname = METRIC_PREFIX "all_errors_total";
- help = "Total number of errors in all categories";
+ help = "Total count of errors in all categories";
ras_submit(gpu, catname, help, type, subdev, total);
ok = true;
}