for the sampled values and submitted instead of the read values
themselves.
-Other metrics values are either counters, or change much slower, and
-are read only at submit intervals. If collecting of the sampled
-values is disabled, it is better to set Samples to 1 (default).
+Most of the other metric values are either counters, or change much
+more slowly, and are read only at submit intervals. If collecting of
+the sampled values is disabled, it is better to set Samples to 1
+(default).
=item B<LogGpuInfo>
=item B<MetricsOutput>
-Either "raw", "derived" or "both".
+Set of "counter", "rate", and "ratio" values, separated by comma,
+colon, slash or space.
-Specifies whether metrics should be reported as raw values provided
-by Sysman (e.g. HW energy usage counter value in Joules) which is
-preferred for use in Prometheus, as more human-readable and easier
-to debug derived values (e.g. power usage gauge value in Watts), or
-whether to increase number of produced metrics by reporting both.
+Several of the metric types support multiple variants for their
+values. This option specifies which ones of them are to be reported.
+
+Default is to report all variants ("counter:rate:ratio"). To reduce
+the amount of data, it is better to configure just the ones relevant
+for the given use. Note that some of the metric types support only two
+of these variants, whereas metrics supporting only a single variant
+ignore this option.
+
+Counter metric variant (e.g. HW energy usage as Joules counter) is
+preferred by Prometheus as doing rate calculations in Prometheus is
+more flexible. However, because collectd stores counters internally as
+integers, counter metrics cannot use base units (seconds, joules) as
+required by OpenMetrics spec, but use microseconds and microjoules.
+
+Rate metric variant is directly human readable, and available for
+metrics where it makes sense (e.g. bytes per second, Watts and RPMs).
+
+Ratio variant is a utilization metric. It can be reported only for
+metric types which are either based on time (e.g. GPU engine use
+time), or for which Sysman provides a limit / maximum value. Some
+metrics may give over 100% ratios if their limit applies over a longer
+time than the query interval (could happen e.g. with power limits).
=item B<DisableMemory>
} gpu_device_t;
typedef enum {
- OUTPUT_UNSET = 0,
- OUTPUT_RAW,
- OUTPUT_DERIVED,
- OUTPUT_BOTH, /* 3 = 1 | 2 mask */
- OUTPUT_TYPES
+ OUTPUT_COUNTER = 1,
+ OUTPUT_RATE = 2,
+ OUTPUT_RATIO = 4,
+ OUTPUT_ALL = 7
} output_t;
-static const char *metrics_output[OUTPUT_TYPES] = {"unset", "raw", "derived",
- "both"};
+static const struct {
+ const char *name;
+ output_t value;
+} metrics_output[] = {{"counter", OUTPUT_COUNTER},
+ {"rate", OUTPUT_RATE},
+ {"ratio", OUTPUT_RATIO}};
static gpu_device_t *gpus;
static uint32_t gpu_count;
* if at least some metric is enabled, otherwise error code
*/
static int gpu_config_check(void) {
- if (config.output == OUTPUT_UNSET) {
- config.output = OUTPUT_BOTH;
+ if (!config.output) {
+ config.output = OUTPUT_ALL;
}
- assert(config.output < STATIC_ARRAY_SIZE(metrics_output));
if (config.gpuinfo) {
double interval = CDTIME_T_TO_DOUBLE(plugin_get_interval());
} else {
INFO("- query / submit interval: %.2f", interval);
}
- INFO("- " KEY_METRICS_OUTPUT ": %s", metrics_output[config.output]);
+ for (unsigned i = 0; i < STATIC_ARRAY_SIZE(metrics_output); i++) {
+ if (config.output & metrics_output[i].value) {
+ INFO("- " KEY_METRICS_OUTPUT ": %s", metrics_output[i].name);
+ }
+ }
INFO("Disabled metrics:");
}
struct {
ok = false;
break;
}
- if (config.output & OUTPUT_RAW) {
+ if (config.output & OUTPUT_COUNTER) {
metric.value.counter = bw.writeCounter;
metric_label_set(&metric, "direction", "write");
metric_family_metric_append(&fam_counter, metric);
reported_counter = true;
}
zes_mem_bandwidth_t *old = &gpu->membw[i];
- if (old->maxBandwidth && (config.output & OUTPUT_DERIVED) &&
+ if (old->maxBandwidth && (config.output & OUTPUT_RATIO) &&
bw.timestamp > old->timestamp) {
/* https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv419zes_mem_bandwidth_t
*/
ok = false;
break;
}
- if (config.output & OUTPUT_RAW) {
+ if (config.output & OUTPUT_COUNTER) {
/* cannot convert microsecs to secs as counters are integers */
metric.value.counter = throttle.throttleTime;
metric_family_metric_append(&fam_counter, metric);
reported_counter = true;
}
zes_freq_throttle_time_t *old = &gpu->throttle[i];
- if (old->timestamp && (config.output & OUTPUT_DERIVED) &&
- throttle.timestamp > old->timestamp) {
+ if (old->timestamp && throttle.timestamp > old->timestamp &&
+ (config.output & OUTPUT_RATIO)) {
/* micro seconds => throttle ratio */
metric.value.gauge = (throttle.throttleTime - old->throttleTime) /
(double)(throttle.timestamp - old->timestamp);
break;
}
metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
- if (config.output & OUTPUT_RAW) {
+ if (config.output & OUTPUT_COUNTER) {
metric.value.counter = counter.energy;
metric_family_metric_append(&fam_energy, metric);
reported_energy = true;
}
zes_power_energy_counter_t *old = &gpu->power[i];
- if (old->timestamp && (config.output & OUTPUT_DERIVED) &&
+ if (old->timestamp && (config.output & OUTPUT_RATE) &&
counter.timestamp > old->timestamp) {
/* microJoules / microSeconds => watts */
metric.value.gauge = (double)(counter.energy - old->energy) /
}
metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
metric_label_set(&metric, "type", vname);
- if (config.output & OUTPUT_RAW) {
+ if (config.output & OUTPUT_COUNTER) {
metric.value.counter = stats.activeTime;
metric_family_metric_append(&fam_counter, metric);
reported_counter = true;
}
zes_engine_stats_t *old = &gpu->engine[i];
- if (old->timestamp && (config.output & OUTPUT_DERIVED) &&
- stats.timestamp > old->timestamp) {
+ if (old->timestamp && stats.timestamp > old->timestamp &&
+ (config.output & OUTPUT_RATIO)) {
metric.value.gauge = (double)(stats.activeTime - old->activeTime) /
(stats.timestamp - old->timestamp);
metric_family_metric_append(&fam_ratio, metric);
} else if (strcasecmp(key, KEY_LOG_GPU_INFO) == 0) {
config.gpuinfo = IS_TRUE(value);
} else if (strcasecmp(key, KEY_METRICS_OUTPUT) == 0) {
- config.output = OUTPUT_UNSET;
- for (unsigned i = 0; i < STATIC_ARRAY_SIZE(metrics_output); i++) {
- if (strcasecmp(value, metrics_output[i]) == 0) {
- config.output = i;
- break;
+ config.output = 0;
+ static const char delim[] = ",:/ ";
+ char *save, *flag, *flags = strdup(value);
+ for (flag = strtok_r(flags, delim, &save); flag;
+ flag = strtok_r(NULL, delim, &save)) {
+ unsigned i;
+ for (i = 0; i < STATIC_ARRAY_SIZE(metrics_output); i++) {
+ if (strcasecmp(flag, metrics_output[i].name) == 0) {
+ config.output |= metrics_output[i].value;
+ break;
+ }
+ }
+ if (i >= STATIC_ARRAY_SIZE(metrics_output)) {
+ free(flags);
+ return RET_INVALID_CONFIG;
}
}
- if (config.output == OUTPUT_UNSET) {
+ free(flags);
+ if (!config.output) {
ERROR(PLUGIN_NAME ": Invalid '%s' config key value '%s'", key, value);
return RET_INVALID_CONFIG;
}
const char *value;
bool success;
} test[] = {
- {"MetricsOutput", "derived", true},
- {"MetricsOutput", "raW", true},
- {"MetricsOutput", "Foobar", false},
+ {"MetricsOutput", "counter", true},
+ {"MetricsOutput", "rate", true},
+ {"MetricsOutput", "RatiO", true},
+ {"MetricsOutput", "RatiO/fooBAR", false},
{"MetricsOutput", "1", false},
{"Foobar", "Foobar", false},
{"Samples", "999", false},
{"Samples", "-1", false},
{"Samples", "8", true},
/* set back to default */
- {"MetricsOutput", "Both", true},
+ {"MetricsOutput", "counter:rate:ratio", true},
{"Samples", "1", true},
};
unsigned int i, j;