bool mem;
bool membw;
bool power;
+ bool power_ratio; // needs extra Sysman data compared to power
bool ras;
bool ras_separate;
bool temp;
gpu->power = scalloc(power_count, sizeof(*gpu->power));
gpu->power_count = power_count;
}
- if (!(config.output & (OUTPUT_COUNTER | OUTPUT_RATE))) {
- ERROR(PLUGIN_NAME ": no power output variants selected");
- free(powers);
- return false;
- }
+ metric_family_t fam_ratio = {
+ .help = "Ratio of average power usage vs sustained or burst "
+ "power limit",
+ .name = METRIC_PREFIX "power_ratio",
+ .type = METRIC_TYPE_GAUGE,
+ };
metric_family_t fam_power = {
.help = "Average power usage (in Watts) over query interval",
.name = METRIC_PREFIX "power_watts",
};
metric_t metric = {0};
- bool reported_power = false, reported_energy = false, ok = false;
+ bool reported_ratio = false, reported_power = false, reported_energy = false;
+
+ bool ok = false;
for (i = 0; i < power_count; i++) {
zes_power_properties_t props;
if (zesPowerGetProperties(powers[i], &props) != ZE_RESULT_SUCCESS) {
reported_energy = true;
}
zes_power_energy_counter_t *old = &gpu->power[i];
- if (old->timestamp && (config.output & OUTPUT_RATE) &&
- counter.timestamp > old->timestamp) {
- /* microJoules / microSeconds => watts */
- metric.value.gauge = (double)(counter.energy - old->energy) /
- (counter.timestamp - old->timestamp);
- metric_family_metric_append(&fam_power, metric);
- reported_power = true;
+ if (old->timestamp && counter.timestamp > old->timestamp &&
+ (config.output & (OUTPUT_RATIO | OUTPUT_RATE))) {
+
+ uint64_t energy_diff = counter.energy - old->energy;
+ double time_diff = counter.timestamp - old->timestamp;
+
+ if (config.output & OUTPUT_RATE) {
+ /* microJoules / microSeconds => watts */
+ metric.value.gauge = energy_diff / time_diff;
+ metric_family_metric_append(&fam_power, metric);
+ reported_power = true;
+ }
+ if ((config.output & OUTPUT_RATIO) && !gpu->disabled.power_ratio) {
+ const char *name;
+ int32_t limit = 0;
+
+ zes_power_burst_limit_t burst;
+ zes_power_sustained_limit_t sustain;
+ if (zesPowerGetLimits(powers[i], &sustain, &burst, NULL) !=
+ ZE_RESULT_SUCCESS) {
+ WARNING(PLUGIN_NAME ": disabling power ratio, failed to get power "
+ "domain %d limits",
+ i);
+ gpu->disabled.power_ratio = true;
+ } else {
+ /* Multiply by 1000, as sustain interval is in ms & power in mJ/s,
+ * whereas energy is in uJ and its timestamp in us:
+ * https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-power-energy-counter-t
+ */
+ if (sustain.enabled &&
+ (time_diff >= 1000 * sustain.interval || !burst.enabled)) {
+ name = "sustained";
+ limit = sustain.power;
+ } else if (burst.enabled) {
+ name = "burst";
+ limit = burst.power;
+ } else {
+ gpu->disabled.power_ratio = true;
+ }
+ }
+ if (limit > 0) {
+ metric_label_set(&metric, "limit", name);
+ metric.value.gauge = 1000 * energy_diff / (limit * time_diff);
+ metric_family_metric_append(&fam_ratio, metric);
+ reported_ratio = true;
+ }
+ }
}
*old = counter;
ok = true;
if (reported_power) {
gpu_submit(gpu, &fam_power);
}
+ if (reported_ratio) {
+ gpu_submit(gpu, &fam_ratio);
+ }
}
free(powers);
return ok;
#define MEMORY_INIT (MEMORY_SIZE / 2) // so that both free & used get same value
#define MEMORY_INC (MEMORY_SIZE / 64)
+#define POWER_LIMIT (2.0 * COUNTER_INC / TIME_INC) // in Watts (the zesPowerGetLimits() mock scales this to mW)
+
#define RAS_INIT 0
#define RAS_INC 1
return ZE_RESULT_SUCCESS;
}
-#define QUERY_CALL_FUNCS 20
+ze_result_t zesPowerGetLimits(zes_pwr_handle_t handle,
+ zes_power_sustained_limit_t *sustained,
+ zes_power_burst_limit_t *burst,
+ zes_power_peak_limit_t *peak) { /* Test mock of the Sysman call:
+ * fills in fixed, enabled sustained and/or burst limits for whichever of
+ * those output structs the caller passed (non-NULL), and returns the
+ * result of metric_args_check() for call index 20. Requesting the peak
+ * limit resets the checked pointer to NULL. */
+ void *check = NULL; // set to a requested limit struct below; something must be requested
+ if (sustained) {
+ check = sustained;
+ sustained->enabled = true;
+ sustained->interval = 2 * TIME_INC / 1000; // ms; 2x to get this skipped by the plugin's interval test, so burst is used
+ sustained->power = 2 * 1000 * POWER_LIMIT; // mW, twice the burst limit below
+ }
+ if (burst) {
+ check = burst;
+ burst->enabled = true;
+ burst->power = 1000 * POWER_LIMIT; // mW
+ }
+ if (peak) {
+ check = NULL; // peak limits are not supported by this mock: pass NULL to the args check
+ }
+ return metric_args_check(20, "zesPowerGetLimits", handle, check); // 20 = this call's index (bit 20 in QUERY_MULTI_BITS)
+}
+
+#define QUERY_CALL_FUNCS 21
#define QUERY_CALL_BITS (((uint64_t)1 << QUERY_CALL_FUNCS) - 1)
+/* ------------------------------------------------------------------------- */
+/* Bitmask of the calls that are skipped on the first query round and
+ * happen only on subsequent rounds, because they are made inside the
+ * 'old->timestamp' check:
+ * - zesPowerGetLimits (bit 20)
+ */
+#define QUERY_MULTI_BITS (1 << 20)
+
/* ------------------------------------------------------------------------- */
/* mock up metrics reporting and validation */
0, 0.0},
{"memory_bw_ratio/HBM/system/write", true, false, COUNTER_MAX_RATIO, 0, 0,
0.0},
+ {"power_ratio", true, false, COUNTER_INC / POWER_LIMIT / TIME_INC, 0, 0,
+ 0.0},
{"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
{"throttled_usecs_total/gpu", true, false, COUNTER_START, COUNTER_INC, 0,
0.0},
} flags[] = {
{"engine", &disabled->engine}, {"frequency", &disabled->freq},
{"memory", &disabled->mem}, {"membw", &disabled->membw},
- {"power", &disabled->power}, {"errors", &disabled->ras},
- {"temperature", &disabled->temp}, {"throttle", &disabled->throttle}};
+ {"power", &disabled->power}, {"power_ratio", &disabled->power_ratio},
+ {"errors", &disabled->ras}, {"temperature", &disabled->temp},
+ {"throttle", &disabled->throttle}};
*all = 0;
int count = 0;
for (int i = 0; i < (int)STATIC_ARRAY_SIZE(flags); i++) {
globs.warnings = globs.api_calls = globs.callbits = 0;
assert(registry.read() == 0);
/* all Sysman metric query first round functions got successfully called? */
- check_call_counts("query", QUERY_CALL_BITS);
+ check_call_counts("query", QUERY_CALL_BITS ^ QUERY_MULTI_BITS);
assert(globs.warnings == 0);
/* per-time counters do not report on first round */
assert(validate_and_reset_saved_metrics(1, 0) > 0);