gpu->temp_count = temp_count;
}
- metric_family_t fam = {
+ metric_family_t fam_temp = {
.help = "Temperature sensor value (in Celsius) when queried",
.name = METRIC_PREFIX "temperature_celsius",
.type = METRIC_TYPE_GAUGE,
};
+ metric_family_t fam_ratio = {
+ .help = "Temperature sensor value ratio to its max value when queried",
+ .name = METRIC_PREFIX "temperature_ratio",
+ .type = METRIC_TYPE_GAUGE,
+ };
metric_t metric = {0};
- bool ok = false;
+ bool reported_ratio = false, ok = false;
for (i = 0; i < temp_count; i++) {
zes_temp_properties_t props;
if (zesTemperatureGetProperties(temps[i], &props) != ZE_RESULT_SUCCESS) {
metric.value.gauge = value;
metric_label_set(&metric, "location", type);
metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
- metric_family_metric_append(&fam, metric);
+ metric_family_metric_append(&fam_temp, metric);
+
+ if (props.maxTemperature > 0 && (config.output & OUTPUT_RATIO)) {
+ metric.value.gauge = value / props.maxTemperature;
+ metric_family_metric_append(&fam_ratio, metric);
+ reported_ratio = true;
+ }
ok = true;
}
if (ok) {
metric_reset(&metric);
- gpu_submit(gpu, &fam);
+ gpu_submit(gpu, &fam_temp);
+ if (reported_ratio) {
+ gpu_submit(gpu, &fam_ratio);
+ }
}
free(temps);
return ok;
#define RAS_INIT 0
#define RAS_INC 1
+#define TEMP_LIMIT 95 /* initial temp_props.maxTemperature; divisor of TEMP_RATIO_* */
#define TEMP_INIT 10
#define TEMP_INC 5
power_counter.energy += COUNTER_INC,
power_counter.timestamp += TIME_INC)
-static zes_temp_properties_t temp_props;
+static zes_temp_properties_t temp_props = {.maxTemperature = TEMP_LIMIT};
static double temperature = TEMP_INIT;
static int dummy;
double last;
} metrics_validation_t;
+#define TEMP_RATIO_INIT ((double)(TEMP_INIT) / (TEMP_LIMIT)) /* TEMP_INIT as a fraction of TEMP_LIMIT; cast avoids integer division */
+#define TEMP_RATIO_INC ((double)(TEMP_INC) / (TEMP_LIMIT)) /* TEMP_INC as a fraction of TEMP_LIMIT; cast avoids integer division */
+
#define MEM_RATIO_INIT ((double)MEMORY_INIT / MEMORY_SIZE)
#define MEM_RATIO_INC ((double)MEMORY_INC / MEMORY_SIZE)
{"memory_usage_ratio/HBM/system", false, false, MEM_RATIO_INIT,
MEM_RATIO_INC, 0, 0.0},
{"temperature_celsius", true, false, TEMP_INIT, TEMP_INC, 0, 0.0},
+ {"temperature_ratio", true, false, TEMP_RATIO_INIT, TEMP_RATIO_INC, 0, 0.0},
/* while counters increase, per-time incremented value should stay same */
{"energy_ujoules_total", true, false, COUNTER_START, COUNTER_INC, 0, 0.0},