// https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
case ZES_RAS_ERROR_CAT_RESET:
help = "Total count of HW accelerator resets attempted by the driver";
- catname = METRIC_PREFIX "resets_total";
+ catname = METRIC_PREFIX "resets";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
help =
"Total count of (non-correctable) HW exceptions generated by the "
"way workloads program the HW";
- catname = METRIC_PREFIX "programming_errors_total";
+ catname = METRIC_PREFIX "programming_errors";
correctable = false;
break;
case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
help =
"total count of (non-correctable) low-level driver communication "
"errors";
- catname = METRIC_PREFIX "driver_errors_total";
+ catname = METRIC_PREFIX "driver_errors";
correctable = false;
break;
// categories which can have both correctable and uncorrectable errors
case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
help = "Total count of errors that have occurred in the (shader) "
"accelerator HW";
- catname = METRIC_PREFIX "compute_errors_total";
+ catname = METRIC_PREFIX "compute_errors";
break;
case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
help = "Total count of errors that have occurred in the fixed-function "
"accelerator HW";
- catname = METRIC_PREFIX "fixed_function_errors_total";
+ catname = METRIC_PREFIX "fixed_function_errors";
break;
case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
help = "Total count of ECC errors that have occurred in the on-chip "
"caches";
- catname = METRIC_PREFIX "cache_errors_total";
+ catname = METRIC_PREFIX "cache_errors";
break;
case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
help = "Total count of ECC errors that have occurred in the display";
- catname = METRIC_PREFIX "display_errors_total";
+ catname = METRIC_PREFIX "display_errors";
break;
default:
help = "Total count of errors in unsupported categories";
- catname = METRIC_PREFIX "unknown_errors_total";
+ catname = METRIC_PREFIX "unknown_errors";
}
if (correctable) {
ras_submit(gpu, catname, help, type, subdev, value);
ras_submit(gpu, catname, help, NULL, subdev, value);
}
}
- catname = METRIC_PREFIX "all_errors_total";
+ catname = METRIC_PREFIX "all_errors";
help = "Total count of errors in all categories";
ras_submit(gpu, catname, help, type, subdev, total);
ok = true;
};
metric_family_t fam_counter = {
.help = "Memory bandwidth usage total (in bytes)",
- .name = METRIC_PREFIX "memory_bw_bytes_total",
+ .name = METRIC_PREFIX "memory_bw_bytes",
.type = METRIC_TYPE_COUNTER,
};
metric_t metric = {0};
};
metric_family_t fam_counter = {
.help = "Total time HW frequency has been throttled (in seconds)",
- .name = METRIC_PREFIX "throttled_seconds_total",
+ .name = METRIC_PREFIX "throttled_seconds",
.type = METRIC_TYPE_COUNTER_FP,
};
metric_t metric = {0};
};
metric_family_t fam_counter = {
.help = "Fabric port throughput total (in bytes)",
- .name = METRIC_PREFIX "fabric_port_bytes_total",
+ .name = METRIC_PREFIX "fabric_port_bytes",
.type = METRIC_TYPE_COUNTER,
};
metric_t metric = {0};
};
metric_family_t fam_energy = {
.help = "Total energy consumption since boot (in joules)",
- .name = METRIC_PREFIX "energy_joules_total",
+ .name = METRIC_PREFIX "energy_joules",
.type = METRIC_TYPE_COUNTER_FP,
};
metric_t metric = {0};
metric_family_t fam_counter = {
.help = "GPU engine / group execution (use / activity) time total (in "
"seconds)",
- .name = METRIC_PREFIX "engine_use_seconds_total",
+ .name = METRIC_PREFIX "engine_use_seconds",
.type = METRIC_TYPE_COUNTER_FP,
};
metric_t metric = {0};
static metrics_validation_t valid_metrics[] = {
/* gauge value changes */
- {"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0},
+ {"all_errors", true, false, RAS_INIT, RAS_INC, 0, 0.0},
{"frequency_mhz/actual/current/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0,
0.0},
{"frequency_mhz/actual/current/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0,
{"temperature_ratio", true, false, TEMP_RATIO_INIT, TEMP_RATIO_INC, 0, 0.0},
/* while counters increase, per-time incremented value should stay same */
- {"energy_joules_total", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6,
- 0, 0.0},
+ {"energy_joules", true, false, COUNTER_START / 1e6, COUNTER_INC / 1e6, 0,
+ 0.0},
{"engine_ratio/all", true, false, COUNTER_RATIO, 0, 0, 0.0},
- {"engine_use_seconds_total/all", true, false, COUNTER_START / 1e6,
+ {"engine_use_seconds/all", true, false, COUNTER_START / 1e6,
COUNTER_INC / 1e6, 0, 0.0},
- {"fabric_port_bytes_total/healthy/off/read", true, false, 2 * COUNTER_START,
+ {"fabric_port_bytes/healthy/off/read", true, false, 2 * COUNTER_START,
2 * COUNTER_INC, 0, 0.0},
- {"fabric_port_bytes_total/healthy/off/write", true, false, COUNTER_START,
+ {"fabric_port_bytes/healthy/off/write", true, false, COUNTER_START,
COUNTER_INC, 0, 0.0},
{"fabric_port_bytes_per_second/healthy/off/read", true, false,
2 * COUNTER_RATE, 0, 0, 0.0},
0, 0, 0.0},
{"fabric_port_ratio/healthy/off/write", true, false, COUNTER_MAX_RATIO, 0,
0, 0.0},
- {"memory_bw_bytes_total/HBM/system/read", true, false, 2 * COUNTER_START,
+ {"memory_bw_bytes/HBM/system/read", true, false, 2 * COUNTER_START,
2 * COUNTER_INC, 0, 0.0},
- {"memory_bw_bytes_total/HBM/system/write", true, false, COUNTER_START,
+ {"memory_bw_bytes/HBM/system/write", true, false, COUNTER_START,
COUNTER_INC, 0, 0.0},
{"memory_bw_bytes_per_second/HBM/system/read", true, false,
2 * COUNTER_RATE, 0, 0, 0.0},
{"power_ratio", true, false, COUNTER_INC / POWER_LIMIT / TIME_INC, 0, 0,
0.0},
{"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
- {"throttled_seconds_total/gpu", true, false, COUNTER_START / 1e6,
+ {"throttled_seconds/gpu", true, false, COUNTER_START / 1e6,
COUNTER_INC / 1e6, 0, 0.0},
{"throttled_ratio/gpu", true, false, COUNTER_RATIO, 0, 0, 0.0},
};