[collectd 6] Add 'gpu_sysman' plugin for (Intel) GPU metrics (#3968)

author Eero Tamminen <eero.t.tamminen@intel.com>

Tue, 7 Jun 2022 17:55:14 +0000 (20:55 +0300)

committer GitHub <noreply@github.com>

Tue, 7 Jun 2022 17:55:14 +0000 (19:55 +0200)
author Eero Tamminen <eero.t.tamminen@intel.com>
Tue, 7 Jun 2022 17:55:14 +0000 (20:55 +0300)
committer GitHub <noreply@github.com>
Tue, 7 Jun 2022 17:55:14 +0000 (19:55 +0200)
diff --git a/Makefile.am b/Makefile.am

index 396a8c11c741dc0f7356eb7f50828231dbf568af..35a3678b3a031f871c0db47e9f00e0d66ddec080 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -1121,6 +1121,21 @@ gpu_nvidia_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_CUDA_LDFLAGS)
  gpu_nvidia_la_LIBADD = $(BUILD_WITH_CUDA_LIBS)
  endif
  
+# Level-Zero Sysman GPU plugin and its unit test, built only when the
+# BUILD_PLUGIN_GPU_SYSMAN conditional is true
+if BUILD_PLUGIN_GPU_SYSMAN
+pkglib_LTLIBRARIES += gpu_sysman.la
+gpu_sysman_la_SOURCES = src/gpu_sysman.c
+gpu_sysman_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_CPPFLAGS) $(BUILD_WITH_SYSMAN_CPPFLAGS)
+gpu_sysman_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_SYSMAN_LDFLAGS)
+gpu_sysman_la_LIBADD = $(BUILD_WITH_SYSMAN_LIBS)
+
+# The test program is built from its own source file and has an empty
+# LDADD, i.e. no libraries are listed for linking here
+test_plugin_gpu_sysman_SOURCES = src/gpu_sysman_test.c
+test_plugin_gpu_sysman_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_CPPFLAGS) $(BUILD_WITH_SYSMAN_CPPFLAGS)
+test_plugin_gpu_sysman_LDFLAGS = $(PLUGIN_LDFLAGS)
+test_plugin_gpu_sysman_LDADD =
+check_PROGRAMS += test_plugin_gpu_sysman
+TESTS += test_plugin_gpu_sysman
+endif
+
  if BUILD_PLUGIN_GRPC
  pkglib_LTLIBRARIES += grpc.la
  grpc_la_SOURCES = src/grpc.cc
diff --git a/README b/README

index 14993038230784a761d6a4030b35fec7ea9740f4..6d1c078a31aefb2de4ca45bad423e63145134465 100644 (file)
--- a/README
+++ b/README
@@ -158,6 +158,9 @@ Features
      - gpu_nvidia
        Monitor NVIDIA GPU statistics available through NVML.
  
+    - gpu_sysman
+      Monitor GPU statistics available through Level-Zero Sysman API.
+
      - hddtemp
        Hard disk temperatures using hddtempd.
  
@@ -793,6 +796,10 @@ Prerequisites
      Used by the `gpu_nvidia' plugin
      <https://developer.nvidia.com/cuda-downloads>
  
+  * Level-Zero / Sysman (optional)
+    Used by the `gpu_sysman' plugin
+    <https://github.com/oneapi-src/level-zero>
+
    * libatasmart (optional)
      Used by the `smart' plugin.
      <http://git.0pointer.de/?p=libatasmart.git>
diff --git a/configure.ac b/configure.ac

index 5fdef90fae672b88fe1f2099120b74fafa15047a..a29ef348b5d9fd37dc323ed4b156e8629e0feb49 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -2105,6 +2105,54 @@ AC_SUBST([BUILD_WITH_CUDA_LIBS])
  
  # }}}
  
+# --with-sysman {{{
+# With a PREFIX argument, PREFIX/include is added to the preprocessor
+# flags and PREFIX/lib to the linker search path.  Plain "yes" / "no"
+# only enables / disables the checks below.  Default is "no".
+AC_ARG_WITH([sysman],
+  [AS_HELP_STRING([--with-sysman@<:@=PREFIX@:>@], [Path to Level-Zero sysman.])],
+  [
+    if test "x$withval" != "xno" && test "x$withval" != "xyes"; then
+      with_sysman_cppflags="-I$withval/include"
+      with_sysman_ldflags="-L$withval/lib"
+      with_sysman="yes"
+    else
+      with_sysman="$withval"
+    fi
+  ],
+  [with_sysman="no"]
+)
+
+if test "x$with_sysman" = "xyes"; then
+  SAVE_CPPFLAGS="$CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $with_sysman_cppflags"
+
+  AC_CHECK_HEADERS([level_zero/ze_api.h level_zero/zes_api.h],
+    [with_sysman="yes"],
+    [with_sysman="no (ze_api.h / zes_api.h not found)"]
+  )
+  CPPFLAGS="$SAVE_CPPFLAGS"
+fi
+
+if test "x$with_sysman" = "xyes"; then
+  SAVE_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS $with_sysman_ldflags"
+
+  AC_CHECK_LIB([ze_loader], zeInit,
+    [with_sysman="yes"],
+    [with_sysman="no (libze_loader symbol 'zeInit' not found)"]
+  )
+  LDFLAGS="$SAVE_LDFLAGS"
+fi
+
+if test "x$with_sysman" = "xyes"; then
+  BUILD_WITH_SYSMAN_CPPFLAGS="$with_sysman_cppflags"
+  BUILD_WITH_SYSMAN_LDFLAGS="$with_sysman_ldflags"
+  BUILD_WITH_SYSMAN_LIBS="-lze_loader"
+fi
+
+AC_SUBST([BUILD_WITH_SYSMAN_CPPFLAGS])
+AC_SUBST([BUILD_WITH_SYSMAN_LDFLAGS])
+AC_SUBST([BUILD_WITH_SYSMAN_LIBS])
+# }}}
+
  # --with-libaquaero5 {{{
  AC_ARG_WITH([libaquaero5],
    [AS_HELP_STRING([--with-libaquaero5@<:@=PREFIX@:>@], [Path to aquatools-ng source code.])],
@@ -6546,6 +6594,7 @@ plugin_fhcount="no"
  plugin_fscache="no"
  plugin_gps="no"
  plugin_gpu_nvidia="no"
+plugin_gpu_sysman="no"
  plugin_grpc="no"
  plugin_hugepages="no"
  plugin_infiniband="no"
@@ -7006,6 +7055,7 @@ AC_PLUGIN([fscache],             [$plugin_fscache],           [fscache statistic
  AC_PLUGIN([gmond],               [$with_libganglia],          [Ganglia plugin])
  AC_PLUGIN([gps],                 [$plugin_gps],               [GPS plugin])
  AC_PLUGIN([gpu_nvidia],          [$with_cuda],                [NVIDIA GPU plugin])
+AC_PLUGIN([gpu_sysman],          [$with_sysman],              [Level-Zero Sysman GPU plugin])
  AC_PLUGIN([grpc],                [$plugin_grpc],              [gRPC plugin])
  AC_PLUGIN([hddtemp],             [yes],                       [Query hddtempd])
  AC_PLUGIN([hugepages],           [$plugin_hugepages],         [Hugepages statistics])
@@ -7385,6 +7435,7 @@ AC_MSG_RESULT([    libsigrok   . . . . . $with_libsigrok])
  AC_MSG_RESULT([    libssl  . . . . . . . $with_libssl])
  AC_MSG_RESULT([    libslurm .  . . . . . $with_libslurm])
  AC_MSG_RESULT([    libstatgrab . . . . . $with_libstatgrab])
+AC_MSG_RESULT([    libsysman . . . . . . $with_sysman])
  AC_MSG_RESULT([    libtokyotyrant  . . . $with_libtokyotyrant])
  AC_MSG_RESULT([    libudev . . . . . . . $with_libudev])
  AC_MSG_RESULT([    libupsclient  . . . . $with_libupsclient])
@@ -7450,6 +7501,7 @@ AC_MSG_RESULT([    fscache . . . . . . . $enable_fscache])
  AC_MSG_RESULT([    gmond . . . . . . . . $enable_gmond])
  AC_MSG_RESULT([    gps . . . . . . . . . $enable_gps])
  AC_MSG_RESULT([    gpu_nvidia  . . . . . $enable_gpu_nvidia])
+AC_MSG_RESULT([    gpu_sysman  . . . . . $enable_gpu_sysman])
  AC_MSG_RESULT([    grpc  . . . . . . . . $enable_grpc])
  AC_MSG_RESULT([    hddtemp . . . . . . . $enable_hddtemp])
  AC_MSG_RESULT([    hugepages . . . . . . $enable_hugepages])
diff --git a/src/collectd.conf.in b/src/collectd.conf.in

index c3918b54e05d95110ccc0f0011ad4a35a1b65a7a..fd54ed7ad546642f176636ffaec22dde80e237db 100644 (file)
--- a/src/collectd.conf.in
+++ b/src/collectd.conf.in
@@ -133,6 +133,7 @@
  #@BUILD_PLUGIN_FSCACHE_TRUE@LoadPlugin fscache
  #@BUILD_PLUGIN_GMOND_TRUE@LoadPlugin gmond
  #@BUILD_PLUGIN_GPS_TRUE@LoadPlugin gps
+#@BUILD_PLUGIN_GPU_SYSMAN_TRUE@LoadPlugin gpu_sysman
  #@BUILD_PLUGIN_GRPC_TRUE@LoadPlugin grpc
  #@BUILD_PLUGIN_HDDTEMP_TRUE@LoadPlugin hddtemp
  #@BUILD_PLUGIN_HUGEPAGES_TRUE@LoadPlugin hugepages
@@ -792,6 +793,22 @@
  #   InstanceByGPUName true
  #</Plugin>
  
+#<Plugin gpu_sysman>
+#   Samples 1
+#   LogGpuInfo false
+#   MetricsOutput both
+#   DisableMemory false
+#   DisableMemoryBandwidth false
+#   DisableFrequency false
+#   DisableThrottleTime false
+#   DisableTemperature false
+#   DisablePower false
+#   DisableEngine false
+#   DisableEngineSingle false
+#   DisableErrors false
+#   DisableSeparateErrors false
+#</Plugin>
+
  #<Plugin grpc>
  #      <Server "example.com" "50051">
  #              EnableSSL true
diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod

index 7b261c2142e00c8bf742fdb74c27f7b94f6da122..9c3d961e4c569c33312c41d842a46e4bdbecb992 100644 (file)
--- a/src/collectd.conf.pod
+++ b/src/collectd.conf.pod
@@ -3690,6 +3690,98 @@ default is 'GPU ID'-'GPU name'
  
  =back
  
+=head2 Plugin C<gpu_sysman>
+
+This plugin is available on Linux only. It uses the Level-Zero Sysman API
+to read GPU information.
+
+Options below give overview of what information it could provide, but
+the information actually available through it depends on what is
+supported by the underlying GPU HW, its kernel driver, and Sysman
+backend user-space (compute) driver implementation.
+
+=over 4
+
+=item B<Samples>
+
+How many values to collect (at specified plugin Interval) for sampled
+metrics, before submitting their (potentially aggregate) metric
+values.
+
+This means that the actual GPU metrics submit interval is B<Interval>
+* B<Samples>.
+
+Currently GPU frequency and GPU memory values are sampled (like this),
+because their values can have very large fluctuations multiple times
+per second. When Samples is larger than 1, min + max are calculated
+for the sampled values and submitted instead of the read values
+themselves.
+
+Other metrics values are either counters, or change much slower, and
+are read only at submit intervals.  If collecting of the sampled
+values is disabled, it is better to set Samples to 1 (default).
+
+=item B<LogGpuInfo>
+
+If enabled, plugin logs at start some information about all the GPUs
+detected through Sysman API.
+
+=item B<MetricsOutput>
+
+Either "raw", "derived" or "both".
+
+Specifies whether metrics should be reported as raw values provided
+by Sysman (e.g. HW energy usage counter value in Joules), which is
+preferred for use in Prometheus, as derived values (e.g. power usage
+gauge value in Watts), which are more human-readable and easier to
+debug, or as both, which increases the number of produced metrics.
+
+=item B<DisableMemory>
+
+Disable memory usage metrics collection.
+
+=item B<DisableMemoryBandwidth>
+
+Disable memory bandwidth metrics collection.
+
+=item B<DisableFrequency>
+
+Disable actual / requested frequency metrics collection.
+
+=item B<DisableThrottleTime>
+
+Disable frequency throttling time metric collection.
+
+=item B<DisablePower>
+
+Disable power usage metrics collection.
+
+=item B<DisableTemperature>
+
+Disable temperature metrics collection.
+
+=item B<DisableEngine>
+
+Disable engine utilization metrics collection.
+
+=item B<DisableEngineSingle>
+
+Disable utilization metrics collection for single engines i.e. provide
+utilization information only for engine groups.
+
+=item B<DisableErrors>
+
+Disable RAS (Reliability, Availability, and Serviceability) error
+metrics collection.
+
+=item B<DisableSeparateErrors>
+
+Disable each error sub-category being reported separately, and just
+report total error counters for higher level "correctable" and
+"uncorrectable" errors.
+
+=back
+
  =head2 Plugin C<grpc>
  
  The I<grpc> plugin provides an RPC interface to submit values to or query
diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c

new file mode 100644 (file)

index 0000000..19dde63
--- /dev/null
+++ b/src/gpu_sysman.c
@@ -0,0 +1,1913 @@
+/**
+ * collectd - src/gpu_sysman.c
+ *
+ * Copyright(c) 2020-2022 Intel Corporation. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * - Eero Tamminen <eero.t.tamminen@intel.com>
+ *
+ * See: https://spec.oneapi.com/level-zero/latest/sysman/PROG.html
+ *
+ * Error handling:
+ * - All allocation checking is done with asserts, so plugin will abort
+ *   if any allocation fails
+ * - All Sysman API call errors are logged
+ * - Sysman errors do not cause plugin initialization failure if even
+ *   one GPU device is available with PCI ID
+ * - Sysman errors in metrics queries cause just given metric to be
+ *   disabled (for given GPU)
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <level_zero/ze_api.h>
+#include <level_zero/zes_api.h>
+
+/* whether to add "dev_file" label to metrics for Kubernetes Intel GPU plugin,
+ * needs (POSIX.1-2001) basename() + glob() and (POSIX.1-2008) getline()
+ * functions.
+ */
+#define ADD_DEV_FILE 1
+#if ADD_DEV_FILE
+#include <glob.h>
+#include <libgen.h>
+#endif
+
+#include "collectd.h"
+#include "plugin.h"
+#include "utils/common/common.h"
+
+#define PLUGIN_NAME "gpu_sysman"
+#define METRIC_PREFIX "collectd_" PLUGIN_NAME "_"
+
+/* collectd plugin API callback finished OK */
+#define RET_OK 0
+/* plugin specific callback error return values */
+#define RET_NO_METRICS -1
+#define RET_INVALID_CONFIG -2
+#define RET_ZE_INIT_FAIL -3
+#define RET_NO_DRIVERS -4
+#define RET_ZE_DRIVER_GET_FAIL -5
+#define RET_ZE_DEVICE_GET_FAIL -6
+#define RET_ZE_DEVICE_PROPS_FAIL -7
+#define RET_NO_GPUS -9
+
+/* GPU metrics to disable.
+ *
+ * One copy holds the global configuration (config.disabled), and each GPU
+ * gets its own copy of it in gpu_config_init().  Except for 'all', each
+ * flag corresponds to the KEY_DISABLE_* config option named next to it.
+ */
+typedef struct {
+  bool all; /* no metrics from whole GPU */
+  bool engine;        /* DisableEngine */
+  bool engine_single; /* DisableEngineSingle */
+  bool freq;          /* DisableFrequency */
+  bool mem;           /* DisableMemory */
+  bool membw;         /* DisableMemoryBandwidth */
+  bool power;         /* DisablePower */
+  bool ras;           /* DisableErrors */
+  bool ras_separate;  /* DisableSeparateErrors */
+  bool temp;          /* DisableTemperature */
+  bool throttle;      /* DisableThrottleTime */
+} gpu_disable_t;
+
+/* handles for the GPU devices discovered by Sysman library,
+ * plus per-GPU metric state; all heap members are released in
+ * gpu_config_free()
+ */
+typedef struct {
+  /* PCI address string in BDF notation, used as "pci_bdf" metric label */
+  char *pci_bdf;
+  /* device file name used as "dev_file" metric label, NULL if none found */
+  char *dev_file;
+  /* number of types for metrics without allocs */
+  uint32_t ras_count;
+  uint32_t temp_count;
+  /* number of types for each counter metric */
+  uint32_t engine_count;
+  uint32_t membw_count;
+  uint32_t power_count;
+  uint32_t throttle_count;
+  /* number of types for each sampled metric */
+  uint32_t frequency_count;
+  uint32_t memory_count;
+  /* previous values for counters */
+  zes_engine_stats_t *engine;
+  zes_mem_bandwidth_t *membw;
+  zes_power_energy_counter_t *power;
+  zes_freq_throttle_time_t *throttle;
+  /* types * samples sized array of values, used for aggregate outputs */
+  zes_freq_state_t **frequency;
+  zes_mem_state_t **memory;
+  /* GPU specific disable flags, initialized from config.disabled */
+  gpu_disable_t disabled;
+  /* Sysman handle for the device, set in gpu_fetch() */
+  zes_device_handle_t handle;
+  /* report counter, zeroed in gpu_config_init() */
+  uint64_t counter;
+} gpu_device_t;
+
+/* Which kind of metric values are output.  OUTPUT_UNSET is replaced with
+ * OUTPUT_BOTH in gpu_config_check(), and OUTPUT_TYPES is the number of
+ * values, used for sizing the metrics_output name array.
+ */
+typedef enum {
+  OUTPUT_UNSET = 0,
+  OUTPUT_RAW,
+  OUTPUT_DERIVED,
+  OUTPUT_BOTH, /* 3 = 1 | 2 mask */
+  OUTPUT_TYPES
+} output_t;
+
+/* names for the output_t values, indexed by the enum value */
+static const char *metrics_output[OUTPUT_TYPES] = {"unset", "raw", "derived",
+                                                   "both"};
+
+/* array of GPU structs, allocated in gpu_fetch(), freed in gpu_config_free() */
+static gpu_device_t *gpus;
+/* number of GPUs in use from 'gpus' array, set in gpu_config_init() */
+static uint32_t gpu_count;
+/* global plugin configuration */
+static struct {
+  bool gpuinfo;           /* when set, GPU & config information is logged */
+  gpu_disable_t disabled; /* copied to each GPU in gpu_config_init() */
+  output_t output;        /* defaults to OUTPUT_BOTH when unset */
+  uint32_t samples;       /* defaults to 1 when zero */
+} config;
+
+/* Sysman GPU plugin config options (defines to ease catching typos) */
+#define KEY_DISABLE_ENGINE "DisableEngine"
+#define KEY_DISABLE_ENGINE_SINGLE "DisableEngineSingle"
+#define KEY_DISABLE_FREQ "DisableFrequency"
+#define KEY_DISABLE_MEM "DisableMemory"
+#define KEY_DISABLE_MEMBW "DisableMemoryBandwidth"
+#define KEY_DISABLE_POWER "DisablePower"
+#define KEY_DISABLE_RAS "DisableErrors"
+#define KEY_DISABLE_RAS_SEPARATE "DisableSeparateErrors"
+#define KEY_DISABLE_TEMP "DisableTemperature"
+#define KEY_DISABLE_THROTTLE "DisableThrottleTime"
+
+#define KEY_METRICS_OUTPUT "MetricsOutput"
+#define KEY_LOG_GPU_INFO "LogGpuInfo"
+#define KEY_SAMPLES "Samples"
+#define MAX_SAMPLES 64
+
+/* Release an array of arrays created with gpu_subarray_realloc().
+ *
+ * The number of subarrays released is taken from config.samples, so that
+ * value must be the same as it was at allocation time.
+ *
+ * Return false when 'mem' is NULL (nothing to free), true otherwise
+ */
+static bool gpu_subarray_free(void **mem) {
+  if (mem == NULL) {
+    return false;
+  }
+  for (uint32_t idx = 0; idx != config.samples; idx++) {
+    void **slot = &mem[idx];
+    free(*slot);
+    *slot = NULL;
+  }
+  free(mem);
+  return true;
+}
+
+/* Return a new array of 'config.samples' subarrays, each holding 'count'
+ * zeroed items of 'size' bytes.  A previously allocated 'mem' array (and
+ * its subarrays) is freed before the new allocation is done
+ */
+static void **gpu_subarray_realloc(void **mem, int count, int size) {
+  (void)gpu_subarray_free(mem);
+  void **rows = smalloc(config.samples * sizeof(*rows));
+  uint32_t row = 0;
+  while (row < config.samples) {
+    rows[row] = scalloc(count, size);
+    row++;
+  }
+  return rows;
+}
+
+/* Free GPU allocations and zero counters, including the global GPU count,
+ * so that a stale count can never be used to index a later (re)allocated
+ * 'gpus' array.
+ *
+ * Return RET_OK for shutdown callback success, RET_NO_GPUS if there
+ * was nothing to free
+ */
+static int gpu_config_free(void) {
+#define FREE_GPU_ARRAY(i, member)                                              \
+  if (gpus[i].member) {                                                        \
+    free(gpus[i].member);                                                      \
+    gpus[i].member##_count = 0;                                                \
+    gpus[i].member = NULL;                                                     \
+  }
+#define FREE_GPU_SAMPLING_ARRAYS(i, member)                                    \
+  if (gpus[i].member) {                                                        \
+    gpu_subarray_free((void **)gpus[i].member);                                \
+    gpus[i].member##_count = 0;                                                \
+    gpus[i].member = NULL;                                                     \
+  }
+  if (!gpus) {
+    /* gpu_init() should have failed with no GPUs, so no need for this */
+    WARNING(PLUGIN_NAME
+            ": gpu_config_free() (shutdown) called with no GPUs initialized");
+    gpu_count = 0;
+    return RET_NO_GPUS;
+  }
+  for (uint32_t i = 0; i < gpu_count; i++) {
+    /* free previous values for counters & zero their counts */
+    FREE_GPU_ARRAY(i, engine);
+    FREE_GPU_ARRAY(i, membw);
+    FREE_GPU_ARRAY(i, power);
+    FREE_GPU_ARRAY(i, throttle);
+    /* and similar for sampling arrays */
+    FREE_GPU_SAMPLING_ARRAYS(i, frequency);
+    FREE_GPU_SAMPLING_ARRAYS(i, memory);
+    /* zero rest of counters & free name */
+    gpus[i].ras_count = 0;
+    gpus[i].temp_count = 0;
+    free(gpus[i].pci_bdf);
+    gpus[i].pci_bdf = NULL;
+    free(gpus[i].dev_file);
+    gpus[i].dev_file = NULL;
+  }
+#undef FREE_GPU_SAMPLING_ARRAYS
+#undef FREE_GPU_ARRAY
+  free(gpus);
+  gpus = NULL;
+  /* count describes the array that was just freed */
+  gpu_count = 0;
+  return RET_OK;
+}
+
+/* Validate metrics configuration, logging it when GPU info logging is
+ * enabled.  An unset output type is changed to "both".
+ *
+ * Return RET_OK if at least one metric is enabled, RET_NO_METRICS if
+ * every metric has been disabled
+ */
+static int gpu_config_check(void) {
+  if (config.output == OUTPUT_UNSET) {
+    config.output = OUTPUT_BOTH;
+  }
+  assert(config.output < STATIC_ARRAY_SIZE(metrics_output));
+
+  const bool verbose = config.gpuinfo;
+  if (verbose) {
+    INFO("Sysman '" KEY_SAMPLES "': %d", config.samples);
+    INFO(KEY_METRICS_OUTPUT ": %s", metrics_output[config.output]);
+    INFO("Disabled metrics:");
+  }
+  /* config option name + whether that metric is disabled */
+  const struct {
+    const char *name;
+    bool value;
+  } flags[] = {
+      {KEY_DISABLE_ENGINE, config.disabled.engine},
+      {KEY_DISABLE_ENGINE_SINGLE, config.disabled.engine_single},
+      {KEY_DISABLE_FREQ, config.disabled.freq},
+      {KEY_DISABLE_MEM, config.disabled.mem},
+      {KEY_DISABLE_MEMBW, config.disabled.membw},
+      {KEY_DISABLE_POWER, config.disabled.power},
+      {KEY_DISABLE_RAS, config.disabled.ras},
+      {KEY_DISABLE_RAS_SEPARATE, config.disabled.ras_separate},
+      {KEY_DISABLE_TEMP, config.disabled.temp},
+      {KEY_DISABLE_THROTTLE, config.disabled.throttle},
+  };
+  const unsigned int total = STATIC_ARRAY_SIZE(flags);
+  unsigned int disabled = 0;
+  for (unsigned int idx = 0; idx < total; idx++) {
+    if (!flags[idx].value) {
+      continue;
+    }
+    if (verbose) {
+      INFO("- %s", flags[idx].name);
+    }
+    disabled++;
+  }
+  if (disabled >= total) {
+    ERROR(PLUGIN_NAME ": all metrics disabled");
+    return RET_NO_METRICS;
+  }
+  if (!verbose) {
+    return RET_OK;
+  }
+  if (disabled) {
+    INFO("=> %d disabled metrics", disabled);
+  } else {
+    INFO("- no disabled metrics");
+  }
+  return RET_OK;
+}
+
+/* Set GPU specific flags to initial global configuration values
+ * for each GPU.  Allocations of metrics arrays are done when metrics
+ * are queried for the first time (not here), and re-allocated if
+ * number of types for given metric changes.
+ *
+ * 'count' is the number of valid items in the (already allocated)
+ * 'gpus' array.  On config failure all GPU allocations are freed.
+ *
+ * Return RET_OK if config is OK, (negative) error value otherwise
+ */
+static int gpu_config_init(unsigned int count) {
+  if (!config.samples) {
+    config.samples = 1;
+  }
+  /* gpu_config_free() uses gpu_count to find the per-GPU allocations
+   * (e.g. label strings), so it needs to be valid before a failing
+   * config check triggers the free, otherwise those would leak
+   */
+  gpu_count = count;
+  if (gpu_config_check()) {
+    gpu_config_free();
+    gpu_count = 0;
+    return RET_NO_METRICS;
+  }
+  for (unsigned int i = 0; i < count; i++) {
+    gpus[i].disabled = config.disabled;
+    gpus[i].counter = 0;
+  }
+  return RET_OK;
+}
+
+/* Log 'prefix' followed by the 'len' given bytes as lowercase hex digits,
+ * i.e. UUID without dashes:
+ * https://en.wikipedia.org/wiki/Universally_unique_identifier
+ */
+static void log_uuid(const char *prefix, const uint8_t *byte, int len) {
+  int prefix_len = strlen(prefix);
+  /* prefix + two hex digits per byte + terminating NUL */
+  char text[prefix_len + 2 * len + 1];
+  sstrncpy(text, prefix, sizeof(text));
+  for (int i = 0; i < len; i++) {
+    sprintf(text + prefix_len + 2 * i, "%02x", byte[i]);
+  }
+  INFO("%s", text);
+}
+
+/* Log Sysman API provided info for given GPU if logging is enabled
+ * and on success, return GPU PCI ID as string in BDF notation:
+ *   https://wiki.xen.org/wiki/Bus:Device.Function_(BDF)_Notation
+ *
+ * 'idx' is used only for log messages, 'dev' is the device handle to query.
+ *
+ * Returned string is allocated and owned by the caller.  NULL is returned
+ * only when PCI properties query fails; failures in the other (logging
+ * only) queries are just logged.
+ */
+static char *gpu_info(int idx, zes_device_handle_t dev) {
+  char *pci_bdf, buf[32];
+
+  /* Property structs are given to the API with a defined (NULL) extension
+   * pointer, instead of whatever happens to be on the stack
+   */
+  zes_pci_properties_t pci = {.pNext = NULL};
+  ze_result_t ret = zesDevicePciGetProperties(dev, &pci);
+  if (ret != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get GPU %d PCI device properties => 0x%x",
+          idx, ret);
+    return NULL;
+  }
+  const zes_pci_address_t *addr = &pci.address;
+  snprintf(buf, sizeof(buf), "%04x:%02x:%02x.%x", addr->domain, addr->bus,
+           addr->device, addr->function);
+
+  pci_bdf = strdup(buf);
+  assert(pci_bdf);
+  if (!config.gpuinfo) {
+    return pci_bdf;
+  }
+
+  INFO("Level-Zero Sysman API GPU %d info", idx);
+  INFO("==================================");
+
+  /* PCI query is known to have succeeded at this point */
+  INFO("PCI info:");
+  INFO("- PCI B/D/F:  %s", pci_bdf);
+  const zes_pci_speed_t *speed = &pci.maxSpeed;
+  INFO("- PCI gen:    %d", speed->gen);
+  INFO("- PCI width:  %d", speed->width);
+  double max = speed->maxBandwidth / (double)(1024 * 1024 * 1024);
+  INFO("- max BW:     %.2f GiB/s (all lines)", max);
+
+  INFO("HW state:");
+  zes_device_state_t state = {.pNext = NULL};
+  /* Note: there's also zesDevicePciGetState() for PCI link status */
+  if (ret = zesDeviceGetState(dev, &state), ret == ZE_RESULT_SUCCESS) {
+    INFO("- repaired: %s",
+         (state.repaired == ZES_REPAIR_STATUS_PERFORMED) ? "yes" : "no");
+    if (state.reset != 0) {
+      INFO("- device RESET required");
+      if (state.reset & ZES_RESET_REASON_FLAG_WEDGED) {
+        INFO(" - HW is wedged");
+      }
+      if (state.reset & ZES_RESET_REASON_FLAG_REPAIR) {
+        INFO(" - HW needs to complete repairs");
+      }
+    } else {
+      INFO("- no RESET required");
+    }
+  } else {
+    INFO("- unavailable");
+    WARNING(PLUGIN_NAME ": failed to get GPU %d device state => 0x%x", idx,
+            ret);
+  }
+
+  INFO("HW identification:");
+  zes_device_properties_t props = {.pNext = NULL};
+  if (ret = zesDeviceGetProperties(dev, &props), ret == ZE_RESULT_SUCCESS) {
+    const ze_device_properties_t *core = &props.core;
+    INFO("- name:       %s", core->name);
+    INFO("- vendor ID:  0x%x", core->vendorId);
+    INFO("- device ID:  0x%x", core->deviceId);
+    log_uuid("- UUID:       0x", core->uuid.id, sizeof(core->uuid.id));
+    INFO("- serial#:    %s", props.serialNumber);
+    INFO("- board#:     %s", props.boardNumber);
+    INFO("- brand:      %s", props.brandName);
+    INFO("- model:      %s", props.modelName);
+    INFO("- vendor:     %s", props.vendorName);
+
+    INFO("UMD/KMD driver info:");
+    INFO("- version:    %s", props.driverVersion);
+    /* 64-bit value, explicit cast keeps format correct also when
+     * 'long' is 32-bit */
+    INFO("- max alloc:  %llu MiB",
+         (unsigned long long)(core->maxMemAllocSize / (1024 * 1024)));
+
+    INFO("HW info:");
+    INFO("- # sub devs: %u", props.numSubdevices);
+    INFO("- core clock: %u", core->coreClockRate);
+    INFO("- EUs:        %u", core->numEUsPerSubslice *
+                                 core->numSubslicesPerSlice * core->numSlices);
+  } else {
+    INFO("- unavailable");
+    WARNING(PLUGIN_NAME ": failed to get GPU %d device properties => 0x%x", idx,
+            ret);
+  }
+
+  /* HW info for all memories */
+  uint32_t i, mem_count = 0;
+  ze_device_handle_t mdev = (ze_device_handle_t)dev;
+  if (zeDeviceGetMemoryProperties(mdev, &mem_count, NULL) !=
+      ZE_RESULT_SUCCESS) {
+    WARNING(PLUGIN_NAME ": failed to get memory properties count");
+    return pci_bdf;
+  }
+  if (!mem_count) {
+    /* nothing to log, and avoids a zero-sized allocation request
+     * (NOTE(review): scalloc() presumably treats a NULL calloc() result
+     * as fatal, which a zero-sized request may legally return) */
+    return pci_bdf;
+  }
+  ze_device_memory_properties_t *mems;
+  /* zeroed allocation => extension pointers in items are NULL */
+  mems = scalloc(mem_count, sizeof(*mems));
+  if (zeDeviceGetMemoryProperties(mdev, &mem_count, mems) !=
+      ZE_RESULT_SUCCESS) {
+    WARNING(PLUGIN_NAME ": failed to get %d memory properties", mem_count);
+    free(mems);
+    return pci_bdf;
+  }
+  for (i = 0; i < mem_count; i++) {
+    const char *memname = mems[i].name;
+    if (!(memname && *memname)) {
+      memname = "Unknown";
+    }
+    INFO("Memory - %s:", memname);
+    INFO("- size:       %llu MiB",
+         (unsigned long long)(mems[i].totalSize / (1024 * 1024)));
+    INFO("- bus width:  %u", mems[i].maxBusWidth);
+    INFO("- max clock:  %u", mems[i].maxClockRate);
+  }
+  free(mems);
+  return pci_bdf;
+}
+
+/* Add (given) BDF string and device file name to GPU struct for metric labels.
+ *
+ * Ownership of the 'pci_bdf' string moves to the 'gpu' struct.  Device file
+ * name is set only if a matching device is found, otherwise it is left as is.
+ *
+ * Return false if (required) BDF string is missing, true otherwise.
+ */
+static bool add_gpu_labels(gpu_device_t *gpu, char *pci_bdf) {
+  assert(gpu);
+  if (!pci_bdf) {
+    return false;
+  }
+  gpu->pci_bdf = pci_bdf;
+  /*
+   * scan devfs and sysfs to find primary GPU device file node matching
+   * given BDF, and if one is found, use that as device file name.
+   *
+   * NOTE: scanning can log only INFO messages, because ERRORs and WARNINGs
+   * would FAIL unit test that are run as part of build, if build environment
+   * has no GPU access.
+   */
+#if ADD_DEV_FILE
+#define BDF_LINE "PCI_SLOT_NAME="
+#define DEVFS_GLOB "/dev/dri/card*"
+  glob_t devfs;
+  if (glob(DEVFS_GLOB, 0, NULL, &devfs) != 0) {
+    INFO(PLUGIN_NAME ": device <-> BDF mapping, no matches for: " DEVFS_GLOB);
+    globfree(&devfs);
+    return true;
+  }
+  const size_t prefix_size = strlen(BDF_LINE);
+  for (size_t i = 0; i < devfs.gl_pathc; i++) {
+    char path[PATH_MAX], *dev_file;
+    dev_file = basename(devfs.gl_pathv[i]);
+
+    FILE *fp;
+    snprintf(path, sizeof(path), "/sys/class/drm/%s/device/uevent", dev_file);
+    if (!(fp = fopen(path, "r"))) {
+      INFO(PLUGIN_NAME ": device <-> BDF mapping, file missing: %s", path);
+      continue;
+    }
+    ssize_t nread;
+    size_t len = 0;
+    char *line = NULL;
+    while ((nread = getline(&line, &len, fp)) > 0) {
+      if (strncmp(line, BDF_LINE, prefix_size) != 0) {
+        continue;
+      }
+      /* remove newline, if there is one; last line of a file may come
+       * without it, and then last BDF character must not be dropped */
+      if (line[nread - 1] == '\n') {
+        line[nread - 1] = '\0';
+      }
+      if (strcmp(line + prefix_size, pci_bdf) == 0) {
+        INFO(PLUGIN_NAME ": %s <-> %s", dev_file, pci_bdf);
+        gpu->dev_file = strdup(dev_file);
+        break;
+      }
+    }
+    /* getline() buffer needs to be freed also when reading failed */
+    free(line);
+    fclose(fp);
+    if (gpu->dev_file) {
+      break;
+    }
+  }
+  globfree(&devfs);
+#undef DEVFS_GLOB
+#undef BDF_LINE
+#endif
+  return true;
+}
+
+/* Sum up the device counts that each of the given 'driver_count' drivers
+ * reports, into 'scan_count'.
+ *
+ * Return RET_OK when at least one device was counted, otherwise
+ * (negative) error value; RET_ZE_DEVICE_GET_FAIL as soon as any
+ * device count query fails, RET_NO_GPUS if the total is zero
+ */
+static int gpu_scan(ze_driver_handle_t *drivers, uint32_t driver_count,
+                    uint32_t *scan_count) {
+  assert(!gpus);
+  *scan_count = 0;
+
+  uint32_t drv_idx = 0;
+  while (drv_idx < driver_count) {
+    uint32_t dev_count = 0;
+    ze_result_t res = zeDeviceGet(drivers[drv_idx], &dev_count, NULL);
+    if (res != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get device count for driver %d", drv_idx);
+      return RET_ZE_DEVICE_GET_FAIL;
+    }
+    if (config.gpuinfo) {
+      INFO("driver %d: %d devices", drv_idx, dev_count);
+    }
+    *scan_count += dev_count;
+    drv_idx++;
+  }
+
+  if (*scan_count == 0) {
+    ERROR(PLUGIN_NAME ": scan for GPU devices failed");
+    return RET_NO_GPUS;
+  }
+  if (config.gpuinfo) {
+    INFO("scan: %d GPUs in total from %d L0 drivers", *scan_count,
+         driver_count);
+  }
+  return RET_OK;
+}
+
+/* Allocate 'scan_count' GPU structs to 'gpus' and fetch Sysman handle & name
+ * for them.
+ *
+ * Counts of still found & ignored GPUs are set to 'scan_count' and
+ * 'scan_ignored' arguments before returning.  Devices are ignored when
+ * there are more of them than were allocated, or when their (required)
+ * PCI info is not available.
+ *
+ * Return RET_OK for success if at least one GPU device info fetch succeeded,
+ * otherwise (negative) error value for last error encountered.  On failure
+ * the 'gpus' allocation is freed.
+ */
+static int gpu_fetch(ze_driver_handle_t *drivers, uint32_t driver_count,
+                     uint32_t *scan_count, uint32_t *scan_ignored) {
+  assert(!gpus);
+  assert(*scan_count > 0);
+  gpus = scalloc(*scan_count, sizeof(*gpus));
+
+  uint32_t ignored = 0, count = 0;
+  int retval = RET_NO_GPUS;
+
+  for (uint32_t drv_idx = 0; drv_idx < driver_count; drv_idx++) {
+    uint32_t dev_count = 0;
+    if (zeDeviceGet(drivers[drv_idx], &dev_count, NULL) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get device count for driver %d", drv_idx);
+      retval = RET_ZE_DEVICE_GET_FAIL;
+      continue;
+    }
+    if (!dev_count) {
+      /* driver without devices, skip to avoid zero-sized allocation
+       * (NOTE(review): scalloc() presumably treats a NULL calloc() result
+       * as fatal, which a zero-sized request may legally return) */
+      continue;
+    }
+    ze_device_handle_t *devs;
+    devs = scalloc(dev_count, sizeof(*devs));
+    if (zeDeviceGet(drivers[drv_idx], &dev_count, devs) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get %d devices for driver %d", dev_count,
+            drv_idx);
+      free(devs);
+      devs = NULL;
+      retval = RET_ZE_DEVICE_GET_FAIL;
+      continue;
+    }
+    /* Get all GPU devices for the driver */
+    for (uint32_t dev_idx = 0; dev_idx < dev_count; dev_idx++) {
+      /* struct is given to the API with a defined (NULL) extension
+       * pointer, instead of whatever happens to be on the stack */
+      ze_device_properties_t props = {.pNext = NULL};
+      if (zeDeviceGetProperties(devs[dev_idx], &props) != ZE_RESULT_SUCCESS) {
+        ERROR(PLUGIN_NAME ": failed to get driver %d device %d properties",
+              drv_idx, dev_idx);
+        retval = RET_ZE_DEVICE_PROPS_FAIL;
+        continue;
+      }
+      assert(ZE_DEVICE_TYPE_GPU == props.type);
+      if (count >= *scan_count) {
+        /* more devices than there were at scan => no room for it */
+        ignored++;
+        continue;
+      }
+      gpus[count].handle = (zes_device_handle_t)devs[dev_idx];
+      if (!add_gpu_labels(&(gpus[count]), gpu_info(count, devs[dev_idx]))) {
+        /* slot is reused for the next device */
+        ignored++;
+        continue;
+      }
+      count++;
+    }
+    free(devs);
+    devs = NULL;
+  }
+  if (count > 0) {
+    retval = RET_OK;
+    if (config.gpuinfo) {
+      INFO("fetch: %d/%d GPUs in total from %d L0 drivers", count, *scan_count,
+           driver_count);
+    }
+  } else {
+    ERROR(PLUGIN_NAME ": fetch for GPU devices failed");
+    gpu_config_free();
+  }
+  *scan_ignored = ignored;
+  *scan_count = count;
+  return retval;
+}
+
+/* Initialize Level-Zero, find the GPU devices its drivers provide, and set
+ * up plugin GPU configuration for them.  Does nothing if GPUs have already
+ * been set up.
+ *
+ * Return RET_OK for success, (negative) error value otherwise
+ */
+static int gpu_init(void) {
+  if (gpus) {
+    NOTICE(PLUGIN_NAME ": skipping extra gpu_init() call");
+    return RET_OK;
+  }
+  setenv("ZES_ENABLE_SYSMAN", "1", 1);
+  if (zeInit(ZE_INIT_FLAG_GPU_ONLY) != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": Level Zero API init failed");
+    return RET_ZE_INIT_FAIL;
+  }
+
+  /* Discover all the drivers: first their count, then their handles */
+  uint32_t driver_count = 0;
+  if (zeDriverGet(&driver_count, NULL) != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get L0 GPU drivers count");
+    return RET_ZE_DRIVER_GET_FAIL;
+  }
+  if (driver_count == 0) {
+    ERROR(PLUGIN_NAME ": no drivers found with Level-Zero Sysman API");
+    return RET_NO_DRIVERS;
+  }
+  ze_driver_handle_t *drivers = scalloc(driver_count, sizeof(*drivers));
+  if (zeDriverGet(&driver_count, drivers) != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d L0 drivers", driver_count);
+    free(drivers);
+    return RET_ZE_DRIVER_GET_FAIL;
+  }
+
+  /* scan number of Sysman provided GPUs, and when there are some,
+   * allocate & fetch data for them */
+  uint32_t count = 0;
+  int status = gpu_scan(drivers, driver_count, &count);
+  const uint32_t scanned = count;
+  uint32_t ignored = 0;
+  if (status >= 0 && count > 0) {
+    status = gpu_fetch(drivers, driver_count, &count, &ignored);
+  }
+  /* driver handles are not needed after this, regardless of the outcome */
+  free(drivers);
+  if (status < 0) {
+    return status;
+  }
+
+  if (scanned > count) {
+    WARNING(PLUGIN_NAME ": %d GPUs disappeared after first scan",
+            scanned - count);
+  }
+  if (ignored) {
+    WARNING(PLUGIN_NAME ": %d GPUs appeared after first scan (are ignored)",
+            ignored);
+  }
+  if (count == 0) {
+    ERROR(PLUGIN_NAME ": no GPU devices found with Level-Zero Sysman API");
+    return RET_NO_GPUS;
+  }
+  return gpu_config_init(count);
+}
+
+/* Label every metric in given family with the PCI BDF of given GPU (and
+ * with its device file name, when GPU has one), dispatch the family to
+ * collectd and reset the family metrics afterwards.  Dispatch failure is
+ * logged, but not otherwise acted on */
+static void gpu_submit(gpu_device_t *gpu, metric_family_t *fam) {
+  for (size_t idx = 0; idx < fam->metric.num; idx++) {
+    metric_t *cur = &fam->metric.ptr[idx];
+    metric_label_set(cur, "pci_bdf", gpu->pci_bdf);
+    if (gpu->dev_file != NULL) {
+      metric_label_set(cur, "dev_file", gpu->dev_file);
+    }
+  }
+  const int err = plugin_dispatch_metric_family(fam);
+  if (err != 0) {
+    ERROR(PLUGIN_NAME ": gpu_submit(%s, %s) failed: %s", gpu->pci_bdf,
+          fam->name, strerror(err));
+  }
+  metric_family_metric_reset(fam);
+}
+
+/* Submit a single RAS counter value as its own metric family.
+ *
+ * Family name differs for each RAS metric, so each of them is submitted
+ * (and reset) separately through a temporary counter family using given
+ * 'name' & 'help'.  "type" and "sub_dev" labels are added only when the
+ * corresponding argument is non-NULL.
+ */
+static void ras_submit(gpu_device_t *gpu, const char *name, const char *help,
+                       const char *type, const char *subdev, double value) {
+  /*
+   * Family .name & .help members are not const, whereas the strings
+   * given to this function are, hence the casts below.
+   *
+   * Assigning string literals directly to those members does the same
+   * conversion implicitly, GCC just does not warn about it unless
+   * "-Wwrite-strings" warning is enabled, and that is not included even
+   * in "-Wall -Wextra".
+   *
+   * Casts are safe as long as metric_family_free() is not called for
+   * these families (which is the case).
+   */
+  metric_family_t family = {
+      .name = (char *)name,
+      .help = (char *)help,
+      .type = METRIC_TYPE_COUNTER,
+  };
+  metric_t sample = {0};
+  sample.value.counter = value;
+
+  if (type != NULL) {
+    metric_label_set(&sample, "type", type);
+  }
+  if (subdev != NULL) {
+    metric_label_set(&sample, "sub_dev", subdev);
+  }
+  metric_family_metric_append(&family, sample);
+  /* local metric is released before the family gets submitted */
+  metric_reset(&sample);
+  gpu_submit(gpu, &family);
+}
+
+/* Report error set types, return true for success */
+static bool gpu_ras(gpu_device_t *gpu) {
+  uint32_t i, ras_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumRasErrorSets(dev, &ras_count, NULL) != ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get RAS error sets count");
+    return false;
+  }
+  zes_ras_handle_t *ras;
+  ras = scalloc(ras_count, sizeof(*ras));
+  if (zesDeviceEnumRasErrorSets(dev, &ras_count, ras) != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d RAS error sets", ras_count);
+    free(ras);
+    return false;
+  }
+  if (gpu->ras_count != ras_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d RAS error sets", ras_count);
+    gpu->ras_count = ras_count;
+  }
+
+  bool ok = false;
+  for (i = 0; i < ras_count; i++) {
+    zes_ras_properties_t props;
+    if (zesRasGetProperties(ras[i], &props) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get RAS set %d properties", i);
+      ok = false;
+      break;
+    }
+    const char *type;
+    switch (props.type) {
+    case ZES_RAS_ERROR_TYPE_CORRECTABLE:
+      type = "correctable";
+      break;
+    case ZES_RAS_ERROR_TYPE_UNCORRECTABLE:
+      type = "uncorrectable";
+      break;
+    default:
+      type = "unknown";
+    }
+    char buf[8];
+    const char *subdev = NULL;
+    if (props.onSubdevice) {
+      snprintf(buf, sizeof(buf), "%d", props.subdeviceId);
+      subdev = buf;
+    }
+    zes_ras_state_t values;
+    const bool clear = false;
+    if (zesRasGetState(ras[i], clear, &values) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get RAS set %d (%s) state", i, type);
+      ok = false;
+      break;
+    }
+
+    bool correctable;
+    uint64_t value, total = 0;
+    const char *catname, *help;
+    for (int cat_idx = 0; cat_idx < ZES_MAX_RAS_ERROR_CATEGORY_COUNT;
+         cat_idx++) {
+      value = values.category[cat_idx];
+      total += value;
+      if (gpu->disabled.ras_separate) {
+        continue;
+      }
+      correctable = true;
+      switch (cat_idx) {
+        // categories which are not correctable, see:
+        // https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-ras-errors
+      case ZES_RAS_ERROR_CAT_RESET:
+        help = "Total number of GPU reset attempts by the driver";
+        catname = METRIC_PREFIX "resets_total";
+        correctable = false;
+        break;
+      case ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS:
+        help = "Total number of non-correctable HW exceptions generated by the "
+               "way workloads have programmed the HW";
+        catname = METRIC_PREFIX "programming_errors_total";
+        correctable = false;
+        break;
+      case ZES_RAS_ERROR_CAT_DRIVER_ERRORS:
+        help = "total number of non-correctable low level driver communication "
+               "errors";
+        catname = METRIC_PREFIX "driver_errors_total";
+        correctable = false;
+        break;
+        // categories which can have both correctable and uncorrectable errors
+      case ZES_RAS_ERROR_CAT_COMPUTE_ERRORS:
+        help = "Total number of errors occurrend in the accelerator HW";
+        catname = METRIC_PREFIX "compute_errors_total";
+        break;
+      case ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS:
+        help = "Total number of errors occurred in the fixed-function "
+               "accelerator HW";
+        catname = METRIC_PREFIX "fixed_function_errors_total";
+        break;
+      case ZES_RAS_ERROR_CAT_CACHE_ERRORS:
+        help = "Total number of ECC errors that have occurred in the on-chip "
+               "caches";
+        catname = METRIC_PREFIX "cache_errors_total";
+        break;
+      case ZES_RAS_ERROR_CAT_DISPLAY_ERRORS:
+        help = "Total number of ECC errors that have occurred in the display";
+        catname = METRIC_PREFIX "display_errors_total";
+        break;
+      default:
+        help = "Total number of errors in unsupported categories";
+        catname = METRIC_PREFIX "unknown_errors_total";
+      }
+      if (correctable) {
+        ras_submit(gpu, catname, help, type, subdev, value);
+      } else if (props.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
+        ras_submit(gpu, catname, help, NULL, subdev, value);
+      }
+    }
+    catname = METRIC_PREFIX "all_errors_total";
+    help = "Total number of errors in all categories";
+    ras_submit(gpu, catname, help, type, subdev, total);
+    ok = true;
+  }
+  free(ras);
+  return ok;
+}
+
+/* Set "sub_dev" label of given metric to given sub-device ID when 'onsub'
+ * is true.  Metric is not touched otherwise.
+ *
+ * NOTE(review): this means that a "sub_dev" label set earlier is retained
+ * when same metric is re-used for a non-sub-device item - confirm that
+ * callers are fine with that.
+ */
+static void metric_set_subdev(metric_t *m, bool onsub, uint32_t subid) {
+  if (!onsub) {
+    return;
+  }
+  /* room for all (max 10) digits of an unsigned 32-bit ID + terminator,
+   * formatted as unsigned to match the argument type */
+  char buf[12];
+  snprintf(buf, sizeof(buf), "%u", (unsigned int)subid);
+  metric_label_set(m, "sub_dev", buf);
+}
+
+/* Set memory metric "type" and "location" labels (and "sub_dev" one for
+ * sub-device memory) based on properties of given memory module.
+ *
+ * Return true for success, false if properties query failed (metric is
+ * not modified in that case).
+ */
+static bool set_mem_labels(zes_mem_handle_t mem, metric_t *metric) {
+  /* struct has 'pNext' extension pointer, zero-initialize it so that API
+   * is not given an indeterminate pointer */
+  zes_mem_properties_t props = {.pNext = NULL};
+  if (zesMemoryGetProperties(mem, &props) != ZE_RESULT_SUCCESS) {
+    return false;
+  }
+  const char *location;
+  switch (props.location) {
+  case ZES_MEM_LOC_SYSTEM:
+    location = "system";
+    break;
+  case ZES_MEM_LOC_DEVICE:
+    location = "device";
+    break;
+  default:
+    location = "unknown";
+  }
+  const char *type;
+  switch (props.type) {
+  case ZES_MEM_TYPE_HBM:
+    type = "HBM";
+    break;
+  case ZES_MEM_TYPE_DDR:
+    type = "DDR";
+    break;
+  case ZES_MEM_TYPE_DDR3:
+    type = "DDR3";
+    break;
+  case ZES_MEM_TYPE_DDR4:
+    type = "DDR4";
+    break;
+  case ZES_MEM_TYPE_DDR5:
+    type = "DDR5";
+    break;
+  case ZES_MEM_TYPE_LPDDR:
+    type = "LPDDR";
+    break;
+  case ZES_MEM_TYPE_LPDDR3:
+    type = "LPDDR3";
+    break;
+  case ZES_MEM_TYPE_LPDDR4:
+    type = "LPDDR4";
+    break;
+  case ZES_MEM_TYPE_LPDDR5:
+    type = "LPDDR5";
+    break;
+  case ZES_MEM_TYPE_SRAM:
+    type = "SRAM";
+    break;
+  case ZES_MEM_TYPE_L1:
+    type = "L1";
+    break;
+  case ZES_MEM_TYPE_L3:
+    type = "L3";
+    break;
+  case ZES_MEM_TYPE_GRF:
+    type = "GRF";
+    break;
+  case ZES_MEM_TYPE_SLM:
+    type = "SLM";
+    break;
+  default:
+    type = "unknown";
+  }
+  metric_label_set(metric, "type", type);
+  metric_label_set(metric, "location", location);
+  metric_set_subdev(metric, props.onSubdevice, props.subdeviceId);
+  return true;
+}
+
+/* Report memory usage for memory modules, return true for success.
+ *
+ * State of each module is fetched to the 'cache_idx' sample slot.  Metrics
+ * are processed and submitted only when 'cache_idx' is zero: either a single
+ * used bytes / ratio value per module, or when multiple samples are
+ * configured, the min & max of those over all the sample slots.
+ *
+ * False is returned when there are no memory modules or querying / handling
+ * any of them fails, nothing is submitted in that case.
+ *
+ * See gpu_read() on 'cache_idx' usage.
+ */
+static bool gpu_mems(gpu_device_t *gpu, unsigned int cache_idx) {
+  uint32_t i, mem_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumMemoryModules(dev, &mem_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get memory modules count");
+    return false;
+  }
+  zes_mem_handle_t *mems;
+  mems = scalloc(mem_count, sizeof(*mems));
+  if (zesDeviceEnumMemoryModules(dev, &mem_count, mems) != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d memory modules", mem_count);
+    free(mems);
+    return false;
+  }
+
+  /* (re-)size the sample slots when module count changes */
+  if (gpu->memory_count != mem_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d memory modules", mem_count);
+    gpu->memory = (zes_mem_state_t **)gpu_subarray_realloc(
+        (void **)gpu->memory, mem_count, sizeof(gpu->memory[0][0]));
+    gpu->memory_count = mem_count;
+    assert(gpu->memory);
+  }
+
+  metric_family_t fam_bytes = {
+      .help = "Memory usage (in bytes)",
+      .name = METRIC_PREFIX "memory_used_bytes",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  metric_family_t fam_ratio = {
+      .help = "Memory usage ratio (0-1)",
+      .name = METRIC_PREFIX "memory_usage_ratio",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  /* template from which metrics are appended to the families */
+  metric_t metric = {0};
+
+  bool ok = false;
+  for (i = 0; i < mem_count; i++) {
+    /* fetch memory samples */
+    if (zesMemoryGetState(mems[i], &(gpu->memory[cache_idx][i])) !=
+        ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get memory module %d state", i);
+      ok = false;
+      break;
+    }
+    ok = true;
+    if (cache_idx > 0) {
+      continue;
+    }
+    const uint64_t mem_size = gpu->memory[0][i].size;
+    if (!mem_size) {
+      ERROR(PLUGIN_NAME ": invalid (zero) memory module %d size", i);
+      ok = false;
+      break;
+    }
+    /* process samples */
+    if (!set_mem_labels(mems[i], &metric)) {
+      ERROR(PLUGIN_NAME ": failed to get memory module %d properties", i);
+      ok = false;
+      break;
+    }
+    double mem_used;
+    if (config.samples < 2) {
+      const uint64_t mem_free = gpu->memory[0][i].free;
+      /* Sysman reports just memory size & free amounts => calculate used */
+      mem_used = mem_size - mem_free;
+      metric.value.gauge = mem_used;
+      metric_family_metric_append(&fam_bytes, metric);
+      metric.value.gauge = mem_used / mem_size;
+      metric_family_metric_append(&fam_ratio, metric);
+    } else {
+      /* find min & max values for memory free from (the configured
+       * number of) samples.  Search is seeded with the first sample
+       * instead of a fixed limit, so it works for any memory size
+       */
+      uint64_t free_min = gpu->memory[0][i].free;
+      uint64_t free_max = free_min;
+      for (uint32_t j = 1; j < config.samples; j++) {
+        const uint64_t mem_free = gpu->memory[j][i].free;
+        if (mem_free < free_min) {
+          free_min = mem_free;
+        }
+        if (mem_free > free_max) {
+          free_max = mem_free;
+        }
+      }
+      /* most free => smallest used amount of memory */
+      mem_used = mem_size - free_max;
+      metric.value.gauge = mem_used;
+      metric_label_set(&metric, "function", "min");
+      metric_family_metric_append(&fam_bytes, metric);
+      metric.value.gauge = mem_used / mem_size;
+      metric_family_metric_append(&fam_ratio, metric);
+
+      /* least free => largest used amount of memory */
+      mem_used = mem_size - free_min;
+      metric.value.gauge = mem_used;
+      metric_label_set(&metric, "function", "max");
+      metric_family_metric_append(&fam_bytes, metric);
+      metric.value.gauge = mem_used / mem_size;
+      metric_family_metric_append(&fam_ratio, metric);
+    }
+  }
+  /* template labels are released on every path, not just on success */
+  metric_reset(&metric);
+  if (ok && cache_idx == 0) {
+    gpu_submit(gpu, &fam_bytes);
+    gpu_submit(gpu, &fam_ratio);
+  } else {
+    /* drop metrics appended before a failure instead of leaking them */
+    metric_family_metric_reset(&fam_bytes);
+    metric_family_metric_reset(&fam_ratio);
+  }
+  free(mems);
+  return ok;
+}
+
+/* Report memory modules bandwidth usage, return true for success.
+ *
+ * Depending on configured output, raw read / write counters and/or
+ * bandwidth usage ratios derived from the difference to the previous
+ * (per-module cached) sample are submitted.  Ratios are available only
+ * after there is a previous sample with a max bandwidth value.
+ *
+ * False is returned when there are no memory modules or querying any
+ * of them fails, nothing is submitted in that case.
+ */
+static bool gpu_mems_bw(gpu_device_t *gpu) {
+  uint32_t i, mem_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumMemoryModules(dev, &mem_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get memory (BW) modules count");
+    return false;
+  }
+  zes_mem_handle_t *mems;
+  mems = scalloc(mem_count, sizeof(*mems));
+  if (zesDeviceEnumMemoryModules(dev, &mem_count, mems) != ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d memory (BW) modules", mem_count);
+    free(mems);
+    return false;
+  }
+
+  /* start with fresh (zeroed) previous samples when module count changes */
+  if (gpu->membw_count != mem_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d memory (BW) modules", mem_count);
+    free(gpu->membw);
+    gpu->membw = scalloc(mem_count, sizeof(*gpu->membw));
+    gpu->membw_count = mem_count;
+  }
+
+  metric_family_t fam_ratio = {
+      .help = "Average memory bandwidth usage ratio (0-1) over query interval",
+      .name = METRIC_PREFIX "memory_bw_ratio",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  metric_family_t fam_counter = {
+      .help = "Memory bandwidth usage total (in bytes)",
+      .name = METRIC_PREFIX "memory_bw_bytes_total",
+      .type = METRIC_TYPE_COUNTER,
+  };
+  /* template from which metrics are appended to the families */
+  metric_t metric = {0};
+
+  bool reported_ratio = false, reported_counter = false, ok = false;
+  for (i = 0; i < mem_count; i++) {
+    zes_mem_bandwidth_t bw;
+    ze_result_t ret = zesMemoryGetBandwidth(mems[i], &bw);
+    if (ret != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get memory module %d bandwidth => 0x%x", i,
+            ret);
+      ok = false;
+      break;
+    }
+    if (!set_mem_labels(mems[i], &metric)) {
+      ERROR(PLUGIN_NAME ": failed to get memory module %d properties", i);
+      ok = false;
+      break;
+    }
+    if (config.output & OUTPUT_RAW) {
+      metric.value.counter = bw.writeCounter;
+      metric_label_set(&metric, "direction", "write");
+      metric_family_metric_append(&fam_counter, metric);
+
+      metric.value.counter = bw.readCounter;
+      metric_label_set(&metric, "direction", "read");
+      metric_family_metric_append(&fam_counter, metric);
+      reported_counter = true;
+    }
+    zes_mem_bandwidth_t *old = &gpu->membw[i];
+    if (old->maxBandwidth && (config.output & OUTPUT_DERIVED) &&
+        bw.timestamp > old->timestamp) {
+      /* https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv419zes_mem_bandwidth_t
+       */
+      uint64_t writes = bw.writeCounter - old->writeCounter;
+      uint64_t reads = bw.readCounter - old->readCounter;
+      uint64_t timediff = bw.timestamp - old->timestamp;
+      /* product is calculated in floating point as max bandwidth times
+       * a long enough interval does not fit into 64-bit integer */
+      double factor = 1.0e6 / ((double)old->maxBandwidth * (double)timediff);
+
+      metric.value.gauge = factor * writes;
+      metric_label_set(&metric, "direction", "write");
+      metric_family_metric_append(&fam_ratio, metric);
+
+      metric.value.gauge = factor * reads;
+      metric_label_set(&metric, "direction", "read");
+      metric_family_metric_append(&fam_ratio, metric);
+      reported_ratio = true;
+    }
+    /* current sample is the previous one on next round */
+    *old = bw;
+    ok = true;
+  }
+  /* template labels are released on every path, not just on success */
+  metric_reset(&metric);
+  if (ok) {
+    if (reported_ratio) {
+      gpu_submit(gpu, &fam_ratio);
+    }
+    if (reported_counter) {
+      gpu_submit(gpu, &fam_counter);
+    }
+  } else {
+    /* drop metrics appended before the failure instead of leaking them */
+    metric_family_metric_reset(&fam_ratio);
+    metric_family_metric_reset(&fam_counter);
+  }
+  free(mems);
+  return ok;
+}
+
+/* Set frequency metric "location" label (and "sub_dev" one for sub-device
+ * domains) based on properties of given frequency domain.
+ *
+ * Return true for success, false if properties query failed (metric is
+ * not modified in that case).
+ */
+static bool set_freq_labels(zes_freq_handle_t freq, metric_t *metric) {
+  /* struct has 'pNext' extension pointer, zero-initialize it so that API
+   * is not given an indeterminate pointer */
+  zes_freq_properties_t props = {.pNext = NULL};
+  if (zesFrequencyGetProperties(freq, &props) != ZE_RESULT_SUCCESS) {
+    return false;
+  }
+  const char *type;
+  switch (props.type) {
+  case ZES_FREQ_DOMAIN_GPU:
+    type = "gpu";
+    break;
+  case ZES_FREQ_DOMAIN_MEMORY:
+    type = "memory";
+    break;
+  default:
+    type = "unknown";
+  }
+  metric_label_set(metric, "location", type);
+  metric_set_subdev(metric, props.onSubdevice, props.subdeviceId);
+  return true;
+}
+
+/* Report frequency domains request & actual frequency, return true for success
+ *
+ * State of each domain is fetched to the 'cache_idx' sample slot.  Metrics
+ * are processed and submitted only when 'cache_idx' is zero: either single
+ * request / actual values per domain, or when multiple samples are
+ * configured, the min & max of those over all the sample slots.  Negative
+ * (= unsupported) values are not reported.
+ *
+ * False is returned when there are no frequency domains, querying any of
+ * them fails, or a domain supports neither of the values.  Metrics
+ * collected before such failure are still submitted.
+ *
+ * See gpu_read() on 'cache_idx' usage.
+ */
+static bool gpu_freqs(gpu_device_t *gpu, unsigned int cache_idx) {
+  uint32_t i, freq_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumFrequencyDomains(dev, &freq_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get frequency domains count");
+    return false;
+  }
+  zes_freq_handle_t *freqs;
+  freqs = scalloc(freq_count, sizeof(*freqs));
+  if (zesDeviceEnumFrequencyDomains(dev, &freq_count, freqs) !=
+      ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d frequency domains", freq_count);
+    free(freqs);
+    return false;
+  }
+
+  /* (re-)size the sample slots when domain count changes */
+  if (gpu->frequency_count != freq_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d frequency domains", freq_count);
+    gpu->frequency = (zes_freq_state_t **)gpu_subarray_realloc(
+        (void **)gpu->frequency, freq_count, sizeof(gpu->frequency[0][0]));
+    gpu->frequency_count = freq_count;
+    assert(gpu->frequency);
+  }
+
+  metric_family_t fam = {
+      .help = "HW frequency (in MHz)",
+      .name = METRIC_PREFIX "frequency_mhz",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  /* template from which metrics are appended to the family */
+  metric_t metric = {0};
+
+  bool reported = false, ok = false;
+  for (i = 0; i < freq_count; i++) {
+    /* fetch freq samples */
+    if (zesFrequencyGetState(freqs[i], &(gpu->frequency[cache_idx][i])) !=
+        ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get frequency domain %d state", i);
+      ok = false;
+      break;
+    }
+    ok = true;
+    if (cache_idx > 0) {
+      continue;
+    }
+    /* process samples */
+    if (!set_freq_labels(freqs[i], &metric)) {
+      ERROR(PLUGIN_NAME ": failed to get frequency domain %d properties", i);
+      ok = false;
+      break;
+    }
+
+    bool freq_ok = false;
+    double value;
+
+    if (config.samples < 2) {
+      /* negative value = unsupported:
+       * https://spec.oneapi.com/level-zero/latest/sysman/api.html#_CPPv416zes_freq_state_t
+       */
+      value = gpu->frequency[0][i].request;
+      if (value >= 0) {
+        metric.value.gauge = value;
+        metric_label_set(&metric, "type", "request");
+        metric_family_metric_append(&fam, metric);
+        freq_ok = true;
+      }
+      value = gpu->frequency[0][i].actual;
+      if (value >= 0) {
+        metric.value.gauge = value;
+        metric_label_set(&metric, "type", "actual");
+        metric_family_metric_append(&fam, metric);
+        freq_ok = true;
+      }
+    } else {
+      /* find min & max values for actual frequency & its request
+       * from (the configured number of) samples
+       */
+      double req_min = 1.0e12, req_max = -1.0e12;
+      double act_min = 1.0e12, act_max = -1.0e12;
+      for (uint32_t j = 0; j < config.samples; j++) {
+        value = gpu->frequency[j][i].request;
+        if (value < req_min) {
+          req_min = value;
+        }
+        if (value > req_max) {
+          req_max = value;
+        }
+        value = gpu->frequency[j][i].actual;
+        if (value < act_min) {
+          act_min = value;
+        }
+        if (value > act_max) {
+          act_max = value;
+        }
+      }
+      /* values are reported if at least the largest one is valid */
+      if (req_max >= 0.0) {
+        metric.value.gauge = req_min;
+        metric_label_set(&metric, "type", "request");
+        metric_label_set(&metric, "function", "min");
+        metric_family_metric_append(&fam, metric);
+
+        metric.value.gauge = req_max;
+        metric_label_set(&metric, "function", "max");
+        metric_family_metric_append(&fam, metric);
+        freq_ok = true;
+      }
+      if (act_max >= 0.0) {
+        metric.value.gauge = act_min;
+        metric_label_set(&metric, "type", "actual");
+        metric_label_set(&metric, "function", "min");
+        metric_family_metric_append(&fam, metric);
+
+        metric.value.gauge = act_max;
+        metric_label_set(&metric, "function", "max");
+        metric_family_metric_append(&fam, metric);
+        freq_ok = true;
+      }
+    }
+    if (freq_ok) {
+      reported = true;
+    } else {
+      ERROR(PLUGIN_NAME ": neither requests nor actual frequencies supported "
+                        "for domain %d",
+            i);
+      ok = false;
+      break;
+    }
+  }
+  /* template may have labels set although nothing got reported, so it is
+   * released on every path */
+  metric_reset(&metric);
+  if (reported) {
+    gpu_submit(gpu, &fam);
+  }
+  free(freqs);
+  return ok;
+}
+
+/* Report throttling time, return true for success
+ *
+ * Depending on configured output, raw throttle time counters and/or
+ * throttling ratios derived from the difference to the previous
+ * (per-domain cached) sample are submitted.  Ratios are available only
+ * after there is a previous sample.
+ *
+ * False is returned when there are no frequency domains or querying any
+ * of them fails, nothing is submitted in that case.
+ */
+static bool gpu_freqs_throttle(gpu_device_t *gpu) {
+  uint32_t i, freq_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumFrequencyDomains(dev, &freq_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get frequency (throttling) domains count");
+    return false;
+  }
+  zes_freq_handle_t *freqs;
+  freqs = scalloc(freq_count, sizeof(*freqs));
+  if (zesDeviceEnumFrequencyDomains(dev, &freq_count, freqs) !=
+      ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d frequency (throttling) domains",
+          freq_count);
+    free(freqs);
+    return false;
+  }
+
+  /* start with fresh (zeroed) previous samples when domain count changes */
+  if (gpu->throttle_count != freq_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d frequency (throttling) domains",
+         freq_count);
+    free(gpu->throttle);
+    gpu->throttle = scalloc(freq_count, sizeof(*gpu->throttle));
+    gpu->throttle_count = freq_count;
+  }
+
+  metric_family_t fam_ratio = {
+      .help =
+          "Ratio (0-1) of HW frequency being throttled during query interval",
+      .name = METRIC_PREFIX "throttled_ratio",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  metric_family_t fam_counter = {
+      .help = "Total time HW frequency has been throttled (in microseconds)",
+      .name = METRIC_PREFIX "throttled_usecs_total",
+      .type = METRIC_TYPE_COUNTER,
+  };
+  /* template from which metrics are appended to the families */
+  metric_t metric = {0};
+
+  bool reported_ratio = false, reported_counter = false, ok = false;
+  for (i = 0; i < freq_count; i++) {
+    zes_freq_throttle_time_t throttle;
+    ze_result_t ret = zesFrequencyGetThrottleTime(freqs[i], &throttle);
+    if (ret != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME
+            ": failed to get frequency domain %d throttle time => 0x%x",
+            i, ret);
+      ok = false;
+      break;
+    }
+    if (!set_freq_labels(freqs[i], &metric)) {
+      ERROR(PLUGIN_NAME ": failed to get frequency domain %d properties", i);
+      ok = false;
+      break;
+    }
+    if (config.output & OUTPUT_RAW) {
+      /* cannot convert microsecs to secs as counters are integers */
+      metric.value.counter = throttle.throttleTime;
+      metric_family_metric_append(&fam_counter, metric);
+      reported_counter = true;
+    }
+    zes_freq_throttle_time_t *old = &gpu->throttle[i];
+    if (old->timestamp && (config.output & OUTPUT_DERIVED) &&
+        throttle.timestamp > old->timestamp) {
+      /* micro seconds => throttle ratio */
+      metric.value.gauge = (throttle.throttleTime - old->throttleTime) /
+                           (double)(throttle.timestamp - old->timestamp);
+      metric_family_metric_append(&fam_ratio, metric);
+      reported_ratio = true;
+    }
+    /* current sample is the previous one on next round */
+    *old = throttle;
+    ok = true;
+  }
+  /* template labels are released on every path, not just on success */
+  metric_reset(&metric);
+  if (ok) {
+    if (reported_ratio) {
+      gpu_submit(gpu, &fam_ratio);
+    }
+    if (reported_counter) {
+      gpu_submit(gpu, &fam_counter);
+    }
+  } else {
+    /* drop metrics appended before the failure instead of leaking them */
+    metric_family_metric_reset(&fam_ratio);
+    metric_family_metric_reset(&fam_counter);
+  }
+  free(freqs);
+  return ok;
+}
+
+/* Report relevant temperature sensor values, return true for success
+ *
+ * Value of each sensor is submitted with a "location" label derived from
+ * the sensor type (and "sub_dev" label for sub-device sensors).
+ *
+ * False is returned when there are no sensors or querying any of them
+ * fails, nothing is submitted in that case.
+ */
+static bool gpu_temps(gpu_device_t *gpu) {
+  uint32_t i, temp_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumTemperatureSensors(dev, &temp_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get temperature sensors count");
+    return false;
+  }
+  zes_temp_handle_t *temps;
+  temps = scalloc(temp_count, sizeof(*temps));
+  if (zesDeviceEnumTemperatureSensors(dev, &temp_count, temps) !=
+      ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d temperature sensors", temp_count);
+    free(temps);
+    return false;
+  }
+  if (gpu->temp_count != temp_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d temperature sensors", temp_count);
+    gpu->temp_count = temp_count;
+  }
+
+  metric_family_t fam = {
+      .help = "Temperature sensor value (in Celsius) when queried",
+      .name = METRIC_PREFIX "temperature_celsius",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  /* template from which metrics are appended to the family */
+  metric_t metric = {0};
+
+  bool ok = false;
+  for (i = 0; i < temp_count; i++) {
+    /* struct has 'pNext' extension pointer, zero-initialize it so that API
+     * is not given an indeterminate pointer */
+    zes_temp_properties_t props = {.pNext = NULL};
+    if (zesTemperatureGetProperties(temps[i], &props) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get temperature sensor %d properties", i);
+      ok = false;
+      break;
+    }
+    const char *type;
+    /*
+     * https://spec.oneapi.io/level-zero/latest/sysman/PROG.html#querying-temperature
+     */
+    switch (props.type) {
+    /* max temperatures */
+    case ZES_TEMP_SENSORS_GLOBAL:
+      type = "global-max";
+      break;
+    case ZES_TEMP_SENSORS_GPU:
+      type = "gpu-max";
+      break;
+    case ZES_TEMP_SENSORS_MEMORY:
+      type = "memory-max";
+      break;
+    /* min temperatures */
+    case ZES_TEMP_SENSORS_GLOBAL_MIN:
+      type = "global-min";
+      break;
+    case ZES_TEMP_SENSORS_GPU_MIN:
+      type = "gpu-min";
+      break;
+    case ZES_TEMP_SENSORS_MEMORY_MIN:
+      type = "memory-min";
+      break;
+    default:
+      type = "unknown";
+    }
+
+    double value;
+    if (zesTemperatureGetState(temps[i], &value) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get temperature sensor %d (%s) state", i,
+            type);
+      ok = false;
+      break;
+    }
+    metric.value.gauge = value;
+    metric_label_set(&metric, "location", type);
+    metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
+    metric_family_metric_append(&fam, metric);
+    ok = true;
+  }
+  /* template labels are released on every path, not just on success */
+  metric_reset(&metric);
+  if (ok) {
+    gpu_submit(gpu, &fam);
+  } else {
+    /* drop metrics appended before the failure instead of leaking them */
+    metric_family_metric_reset(&fam);
+  }
+  free(temps);
+  return ok;
+}
+
+/* Report power usage for relevant domains, return true for success
+ *
+ * Depending on configured output, raw energy counters and/or average
+ * power usage derived from the difference to the previous (per-domain
+ * cached) sample are submitted.  Power usage is available only after
+ * there is a previous sample.
+ *
+ * False is returned when there are no power domains or querying any of
+ * them fails, nothing is submitted in that case.
+ */
+static bool gpu_powers(gpu_device_t *gpu) {
+  uint32_t i, power_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumPowerDomains(dev, &power_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get power domains count");
+    return false;
+  }
+  zes_pwr_handle_t *powers;
+  powers = scalloc(power_count, sizeof(*powers));
+  if (zesDeviceEnumPowerDomains(dev, &power_count, powers) !=
+      ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d power domains", power_count);
+    free(powers);
+    return false;
+  }
+
+  /* start with fresh (zeroed) previous samples when domain count changes */
+  if (gpu->power_count != power_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d power domains", power_count);
+    free(gpu->power);
+    gpu->power = scalloc(power_count, sizeof(*gpu->power));
+    gpu->power_count = power_count;
+  }
+
+  metric_family_t fam_power = {
+      .help = "Average power usage (in Watts) over query interval",
+      .name = METRIC_PREFIX "power_watts",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  metric_family_t fam_energy = {
+      .help = "Total energy consumption since boot (in microjoules)",
+      .name = METRIC_PREFIX "energy_ujoules_total",
+      .type = METRIC_TYPE_COUNTER,
+  };
+  /* template from which metrics are appended to the families */
+  metric_t metric = {0};
+
+  bool reported_power = false, reported_energy = false, ok = false;
+  for (i = 0; i < power_count; i++) {
+    /* struct has 'pNext' extension pointer, zero-initialize it so that API
+     * is not given an indeterminate pointer */
+    zes_power_properties_t props = {.pNext = NULL};
+    if (zesPowerGetProperties(powers[i], &props) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get power domain %d properties", i);
+      ok = false;
+      break;
+    }
+    zes_power_energy_counter_t counter;
+    if (zesPowerGetEnergyCounter(powers[i], &counter) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get power domain %d energy counter", i);
+      ok = false;
+      break;
+    }
+    metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
+    if (config.output & OUTPUT_RAW) {
+      metric.value.counter = counter.energy;
+      metric_family_metric_append(&fam_energy, metric);
+      reported_energy = true;
+    }
+    zes_power_energy_counter_t *old = &gpu->power[i];
+    if (old->timestamp && (config.output & OUTPUT_DERIVED) &&
+        counter.timestamp > old->timestamp) {
+      /* microJoules / microSeconds => watts */
+      metric.value.gauge = (double)(counter.energy - old->energy) /
+                           (counter.timestamp - old->timestamp);
+      metric_family_metric_append(&fam_power, metric);
+      reported_power = true;
+    }
+    /* current sample is the previous one on next round */
+    *old = counter;
+    ok = true;
+  }
+  /* template labels are released on every path, not just on success */
+  metric_reset(&metric);
+  if (ok) {
+    if (reported_energy) {
+      gpu_submit(gpu, &fam_energy);
+    }
+    if (reported_power) {
+      gpu_submit(gpu, &fam_power);
+    }
+  } else {
+    /* drop metrics appended before the failure instead of leaking them */
+    metric_family_metric_reset(&fam_energy);
+    metric_family_metric_reset(&fam_power);
+  }
+  free(powers);
+  return ok;
+}
+
+/* Report engine activity in relevant groups, return true for success */
+static bool gpu_engines(gpu_device_t *gpu) {
+  uint32_t i, engine_count = 0;
+  zes_device_handle_t dev = gpu->handle;
+  if ((zesDeviceEnumEngineGroups(dev, &engine_count, NULL) !=
+       ZE_RESULT_SUCCESS)) {
+    ERROR(PLUGIN_NAME ": failed to get engine groups count");
+    return false;
+  }
+  zes_engine_handle_t *engines;
+  engines = scalloc(engine_count, sizeof(*engines));
+  if (zesDeviceEnumEngineGroups(dev, &engine_count, engines) !=
+      ZE_RESULT_SUCCESS) {
+    ERROR(PLUGIN_NAME ": failed to get %d engine groups", engine_count);
+    free(engines);
+    return false;
+  }
+
+  if (gpu->engine_count != engine_count) {
+    INFO(PLUGIN_NAME ": Sysman reports %d engine groups", engine_count);
+    if (gpu->engine) {
+      free(gpu->engine);
+    }
+    gpu->engine = scalloc(engine_count, sizeof(*gpu->engine));
+    gpu->engine_count = engine_count;
+  }
+
+  metric_family_t fam_ratio = {
+      .help = "Average GPU engine / group utilization ratio (0-1) over query "
+              "interval",
+      .name = METRIC_PREFIX "engine_ratio",
+      .type = METRIC_TYPE_GAUGE,
+  };
+  metric_family_t fam_counter = {
+      .help = "GPU engine / group execution time (activity) total (in "
+              "microseconds)",
+      .name = METRIC_PREFIX "engine_use_usecs_total",
+      .type = METRIC_TYPE_COUNTER,
+  };
+  metric_t metric = {0};
+
+  int type_idx[16] = {0};
+  bool reported_ratio = false, reported_counter = false, ok = false;
+  for (i = 0; i < engine_count; i++) {
+    zes_engine_properties_t props;
+    if (zesEngineGetProperties(engines[i], &props) != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get engine group %d properties", i);
+      ok = false;
+      break;
+    }
+    bool all = false;
+    const char *type;
+    switch (props.type) {
+    case ZES_ENGINE_GROUP_ALL:
+      type = "all";
+      all = true;
+      break;
+      /* multiple engines */
+    case ZES_ENGINE_GROUP_COMPUTE_ALL:
+      type = "compute";
+      all = true;
+      break;
+    case ZES_ENGINE_GROUP_MEDIA_ALL:
+      type = "media";
+      all = true;
+      break;
+    case ZES_ENGINE_GROUP_COPY_ALL:
+      type = "copy";
+      all = true;
+      break;
+      /* individual engines */
+    case ZES_ENGINE_GROUP_COMPUTE_SINGLE:
+      type = "compute";
+      break;
+    case ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE:
+      type = "decode";
+      break;
+    case ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE:
+      type = "encode";
+      break;
+    case ZES_ENGINE_GROUP_COPY_SINGLE:
+      type = "copy";
+      break;
+    case ZES_ENGINE_GROUP_RENDER_SINGLE:
+      type = "render";
+      break;
+
+    /* Following defines require at least Level-Zero relase v1.1 */
+    case ZES_ENGINE_GROUP_RENDER_ALL:
+      type = "render";
+      all = true;
+      break;
+    case ZES_ENGINE_GROUP_3D_ALL:
+      type = "3d";
+      all = true;
+      break;
+    case ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL:
+      type = "3d-render-compute";
+      all = true;
+      break;
+    case ZES_ENGINE_GROUP_MEDIA_ENHANCEMENT_SINGLE:
+      type = "enhance";
+      break;
+    case ZES_ENGINE_GROUP_3D_SINGLE:
+      type = "3d";
+      break;
+
+    default:
+      type = "unknown";
+    }
+    const char *vname;
+    char buf[32];
+    if (all) {
+      vname = type;
+    } else {
+      if (gpu->disabled.engine_single) {
+        continue;
+      }
+      assert(props.type < sizeof(type_idx));
+      /* include engine index as there can be multiple engines of same type */
+      snprintf(buf, sizeof(buf), "%s-%03d", type, type_idx[props.type]);
+      type_idx[props.type]++;
+      vname = buf;
+    }
+    ze_result_t ret;
+    zes_engine_stats_t stats;
+    if (ret = zesEngineGetActivity(engines[i], &stats),
+        ret != ZE_RESULT_SUCCESS) {
+      ERROR(PLUGIN_NAME ": failed to get engine %d (%s) group activity => 0x%x",
+            i, vname, ret);
+      ok = false;
+      break;
+    }
+    metric_set_subdev(&metric, props.onSubdevice, props.subdeviceId);
+    metric_label_set(&metric, "type", vname);
+    if (config.output & OUTPUT_RAW) {
+      metric.value.counter = stats.activeTime;
+      metric_family_metric_append(&fam_counter, metric);
+      reported_counter = true;
+    }
+    zes_engine_stats_t *old = &gpu->engine[i];
+    if (old->timestamp && (config.output & OUTPUT_DERIVED) &&
+        stats.timestamp > old->timestamp) {
+      metric.value.gauge = (double)(stats.activeTime - old->activeTime) /
+                           (stats.timestamp - old->timestamp);
+      metric_family_metric_append(&fam_ratio, metric);
+      reported_ratio = true;
+    }
+    *old = stats;
+    ok = true;
+  }
+  if (ok) {
+    metric_reset(&metric);
+    if (reported_ratio) {
+      gpu_submit(gpu, &fam_ratio);
+    }
+    if (reported_counter) {
+      gpu_submit(gpu, &fam_counter);
+    }
+  }
+  free(engines);
+  return ok;
+}
+
+/* Plugin read callback, goes through all GPUs that have not been disabled.
+ *
+ * Frequency and memory metrics are queried on every call.  Rest of the
+ * metrics are queried only on calls where the per-GPU sampling index has
+ * gone down to zero.  When a metric query function returns false, that
+ * metric is disabled for the given GPU, and once all metrics for a GPU are
+ * disabled, querying of that GPU is disabled too.
+ *
+ * Returns RET_OK if at least one GPU is still being queried, and
+ * RET_NO_METRICS otherwise.
+ */
+static int gpu_read(void) {
+  int status = RET_NO_METRICS;
+
+  for (uint32_t idx = 0; idx < gpu_count; idx++) {
+    gpu_device_t *gpu = &gpus[idx];
+    gpu_disable_t *off = &gpu->disabled;
+    if (off->all) {
+      /* nothing is queried from this GPU any more */
+      continue;
+    }
+    if (gpu->counter == 0) {
+      INFO(PLUGIN_NAME ": GPU-%d queries:", idx);
+    }
+
+    /* 'cache_idx' is the high frequency sampling slot for this call.  It
+     * counts down from 'config.samples - 1' to zero as 'gpu->counter'
+     * increases, and then starts over.
+     *
+     * Query functions taking it as an argument are expected to store their
+     * sample to that slot (see gpu_subarray_realloc()), and to provide
+     * metrics aggregated over all 'config.samples' slots to gpu_submit()
+     * when it reaches zero.
+     */
+    const unsigned int cache_idx =
+        (config.samples - 1) - gpu->counter % config.samples;
+
+    /* potentially high-frequency metrics, sampled on every call */
+    if (!off->freq) {
+      if (!gpu_freqs(gpu, cache_idx)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d frequency query fail / no domains => disabled",
+                idx);
+        off->freq = true;
+      }
+    }
+    if (!off->mem) {
+      if (!gpu_mems(gpu, cache_idx)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d memory query fail / no modules => disabled",
+                idx);
+        off->mem = true;
+      }
+    }
+
+    gpu->counter++;
+    if (cache_idx > 0) {
+      /* not yet time for the lower frequency metrics */
+      if (!off->all) {
+        /* there are still valid counters at least for this GPU */
+        status = RET_OK;
+      }
+      continue;
+    }
+
+    /* sampling index is zero => query also the lower frequency metrics */
+    if (config.samples > 1 && gpu->counter <= config.samples) {
+      INFO(PLUGIN_NAME ": GPU-%d queries:", idx);
+    }
+    if (!off->engine) {
+      if (!gpu_engines(gpu)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d engine query fail / no groups => disabled",
+                idx);
+        off->engine = true;
+      }
+    }
+    if (!off->membw) {
+      if (!gpu_mems_bw(gpu)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d mem BW query fail / no modules => disabled",
+                idx);
+        off->membw = true;
+      }
+    }
+    if (!off->power) {
+      if (!gpu_powers(gpu)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d power query fail / no domains => disabled",
+                idx);
+        off->power = true;
+      }
+    }
+    if (!off->ras) {
+      if (!gpu_ras(gpu)) {
+        WARNING(PLUGIN_NAME ": GPU-%d errors query fail / no sets => disabled",
+                idx);
+        off->ras = true;
+      }
+    }
+    if (!off->temp) {
+      if (!gpu_temps(gpu)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d temperature query fail / no sensors => disabled",
+                idx);
+        off->temp = true;
+      }
+    }
+    if (!off->throttle) {
+      if (!gpu_freqs_throttle(gpu)) {
+        WARNING(PLUGIN_NAME
+                ": GPU-%d throttle time query fail / no domains => disabled",
+                idx);
+        off->throttle = true;
+      }
+    }
+
+    const bool metrics_left = !off->engine || !off->mem || !off->freq ||
+                              !off->membw || !off->power || !off->ras ||
+                              !off->temp || !off->throttle;
+    if (metrics_left) {
+      status = RET_OK;
+    } else {
+      /* all metrics missing -> disable use of that GPU */
+      ERROR(PLUGIN_NAME ": No metrics from GPU-%d, disabling its querying",
+            idx);
+      off->all = true;
+    }
+  }
+  return status;
+}
+
+/* Config callback: apply a single 'key' / 'value' config option.
+ *
+ * All metrics are enabled by default, KEY_DISABLE_* options let user
+ * disable them.  Key names are matched case-insensitively.
+ *
+ * Returns RET_OK when option was applied, and RET_INVALID_CONFIG for
+ * an unknown key, or for an invalid KEY_METRICS_OUTPUT / KEY_SAMPLES value.
+ */
+static int gpu_config_parse(const char *key, const char *value) {
+  if (!strcasecmp(key, KEY_DISABLE_ENGINE)) {
+    config.disabled.engine = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_ENGINE_SINGLE)) {
+    config.disabled.engine_single = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_FREQ)) {
+    config.disabled.freq = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_MEM)) {
+    config.disabled.mem = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_MEMBW)) {
+    config.disabled.membw = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_POWER)) {
+    config.disabled.power = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_RAS)) {
+    config.disabled.ras = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_RAS_SEPARATE)) {
+    config.disabled.ras_separate = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_TEMP)) {
+    config.disabled.temp = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_DISABLE_THROTTLE)) {
+    config.disabled.throttle = IS_TRUE(value);
+    return RET_OK;
+  }
+  if (!strcasecmp(key, KEY_LOG_GPU_INFO)) {
+    config.gpuinfo = IS_TRUE(value);
+    return RET_OK;
+  }
+
+  if (!strcasecmp(key, KEY_METRICS_OUTPUT)) {
+    /* value is index of the matching name in 'metrics_output' */
+    config.output = OUTPUT_UNSET;
+    unsigned pos = 0;
+    while (pos < STATIC_ARRAY_SIZE(metrics_output) &&
+           strcasecmp(value, metrics_output[pos]) != 0) {
+      pos++;
+    }
+    if (pos < STATIC_ARRAY_SIZE(metrics_output)) {
+      config.output = pos;
+    }
+    if (config.output == OUTPUT_UNSET) {
+      ERROR(PLUGIN_NAME ": Invalid '%s' config key value '%s'", key, value);
+      return RET_INVALID_CONFIG;
+    }
+    return RET_OK;
+  }
+
+  if (!strcasecmp(key, KEY_SAMPLES)) {
+    /* collectd converts config values to floating point strings, so
+     * strtol() can't be used to verify that value is an integer.
+     * Integer part of the value is simply taken instead
+     */
+    const int count = atoi(value);
+    if (count < 1 || count > MAX_SAMPLES) {
+      ERROR(PLUGIN_NAME ": Invalid " KEY_SAMPLES " value '%s'", value);
+      return RET_INVALID_CONFIG;
+    }
+    /* changing number of samples would require freeing per-GPU metrics
+     * cache arrays & members, and zeroing metric counters and GPU cache
+     * index counter.  As this parse function should get called only
+     * before GPU structures have been initialized, just assert that
+     */
+    assert(gpus == NULL);
+    config.samples = count;
+    return RET_OK;
+  }
+
+  ERROR(PLUGIN_NAME ": Invalid '%s' config key", key);
+  return RET_INVALID_CONFIG;
+}
+
+/* Register plugin config keys, and its config, init, read and shutdown
+ * callbacks.
+ */
+void module_register(void) {
+  /* NOTE: key strings *must* be static */
+  static const char *keys[] = {
+      KEY_DISABLE_ENGINE,
+      KEY_DISABLE_ENGINE_SINGLE,
+      KEY_DISABLE_FREQ,
+      KEY_DISABLE_MEM,
+      KEY_DISABLE_MEMBW,
+      KEY_DISABLE_POWER,
+      KEY_DISABLE_RAS,
+      KEY_DISABLE_RAS_SEPARATE,
+      KEY_DISABLE_TEMP,
+      KEY_DISABLE_THROTTLE,
+      KEY_METRICS_OUTPUT,
+      KEY_LOG_GPU_INFO,
+      KEY_SAMPLES,
+  };
+  const int keys_count = STATIC_ARRAY_SIZE(keys);
+
+  plugin_register_config(PLUGIN_NAME, gpu_config_parse, keys, keys_count);
+  plugin_register_init(PLUGIN_NAME, gpu_init);
+  plugin_register_read(PLUGIN_NAME, gpu_read);
+  plugin_register_shutdown(PLUGIN_NAME, gpu_config_free);
+} /* void module_register */
diff --git a/src/gpu_sysman_test.c b/src/gpu_sysman_test.c

new file mode 100644 (file)

index 0000000..03545ff
--- /dev/null
+++ b/src/gpu_sysman_test.c
@@ -0,0 +1,1376 @@
+/**
+ * collectd - src/gpu_sysman_test.c
+ *
+ * Copyright(c) 2020-2022 Intel Corporation. All rights reserved.
+ *
+ * Licensed under the same terms and conditions as src/gpu_sysman.c.
+ *
+ * Authors:
+ * - Eero Tamminen <eero.t.tamminen@intel.com>
+ *
+ * Testing for gpu_sysman.c Sysman API and its error handling.
+ *
+ * See: https://spec.oneapi.com/level-zero/latest/sysman/PROG.html
+ *
+ * Building unit-tests:
+ *   gcc -I. -Idaemon  -I/path/to/level-zero -O3 -g --coverage -Werror \
+ *       -Wall -Wextra -Wpedantic -Wcast-align=strict -Wformat-security \
+ *       gpu_sysman_test.c -o test_plugin_gpu_sysman
+ *
+ * Running unit-tests:
+ *     ./test_plugin_gpu_sysman
+ *
+ * Testing for memory leakage:
+ *     valgrind --error-exitcode=1 --leak-check=full test_plugin_gpu_sysman
+ *
+ * Test coverage:
+ *     ./test_plugin_gpu_sysman
+ *     gcov gpu_sysman_test.*
+ * Untested lines:
+ *     grep '###' gpu_sysman.c.gcov
+ *
+ * Note:
+ * - Code lines run coverage is best with code compiled using -O3 because
+ *   it causes gcc to convert switch-cases to lookup tables.  Builds without
+ *   optimizations have significantly lower coverage due to each (trivial
+ *   and build-time verifiable) switch-case being considered separately
+ *
+ *
+ * Mock up functionality details:
+ * - All functions return only a single property or metric item,
+ *   until hitting earlier set call limit, after which they return error
+ * - All metric property functions report them coming from subdevice 0
+ *   (as non-subdevice cases can be tested on more easily available real HW)
+ * - Except for device.prop.type, subdev type in metric property, and
+ *   actual metric values in metric state structs, all other struct members
+ *   are zeroed
+ * - Memory free metric is decreased, all other metric values are increased
+ *   after each query
+ *
+ * Testing validates that:
+ * - All registered config variables work and invalid config values are rejected
+ * - All mocked up Sysman functions get called when no errors are returned and
+ *   count of Sysman calls is always same for plugin init() and read() callbacks
+ * - Plugin dispatch API receives correct values for all metrics both in
+ *   single-sampling and multi-sampling configurations
+ * - Single Sysman call failing during init or metrics queries causes logging
+ *   of the failure, and in case of metric queries, disabling of the (only)
+ *   relevant metric, and that working for all metrics and Sysman APIs they call
+ * - Plugin init, shutdown and re-init works without problems
+ */
+
+#define KERNEL_LINUX 1
+#define FP_LAYOUT_NEED_NOTHING 1
+#include "gpu_sysman.c" /* test this */
+
+/* Bits for the 'globs.verbose' bitmask.
+ *
+ * Sysman call tracing: VERBOSE_CALLS is the bit checked when logging
+ * (see call_limit()), the other bits are per-phase bits that
+ * set_verbose() uses to decide whether VERBOSE_CALLS gets set
+ */
+#define VERBOSE_CALLS 1
+#define VERBOSE_CALLS_INIT 2
+#define VERBOSE_CALLS_INIT_LIMIT 4
+#define VERBOSE_CALLS_METRICS 8
+#define VERBOSE_CALLS_METRICS_LIMIT 16
+#define VERBOSE_CALLS_METRICS_SAMPLED 32
+
+/* Metric value tracing: VERBOSE_METRICS is the bit checked when logging,
+ * the other bits are per-phase bits that set_verbose() uses to decide
+ * whether VERBOSE_METRICS gets set
+ */
+#define VERBOSE_METRICS 64
+#define VERBOSE_METRICS_NORMAL 128
+#define VERBOSE_METRICS_LIMIT 256
+#define VERBOSE_METRICS_SAMPLED 512
+
+/* Test state used by the mocked up functions and the test helpers */
+static struct {
+  /* bitmask of enabled verbosity areas (VERBOSE_* bits) */
+  unsigned int verbose;
+
+  /* to be able to count & limit Sysman API calls: 'api_calls' is increased
+   * on every call_limit() call, and when 'api_limit' is non-zero, the call
+   * on which 'api_calls' reaches it gets failed
+   */
+  unsigned int api_calls, api_limit;
+
+  /* to verify that all mocked Level-Zero/Sysman functions get called,
+   * call_limit() sets bit given to it in this
+   */
+  unsigned int callbits;
+
+  /* how many errors & warnings have been logged */
+  unsigned int warnings;
+
+  /* how many messages have been logged regardless of log level */
+  unsigned int messages;
+} globs;
+
+/* Update VERBOSE_CALLS and VERBOSE_METRICS logging bits in 'globs.verbose'.
+ *
+ * VERBOSE_CALLS is set when any of the 'callmask' bits is enabled in
+ * 'globs.verbose' and cleared otherwise.  After that, VERBOSE_METRICS is
+ * handled the same way based on the 'metricmask' bits.  Enabling of either
+ * is notified on stderr.
+ */
+static void set_verbose(unsigned int callmask, unsigned int metricmask) {
+  const bool trace_calls = (globs.verbose & callmask) != 0;
+  if (trace_calls) {
+    globs.verbose |= VERBOSE_CALLS;
+    fprintf(stderr, "Enabling call tracing...\n\n");
+  } else {
+    globs.verbose &= ~VERBOSE_CALLS;
+  }
+
+  /* checked only after the call tracing bit has been updated */
+  const bool trace_metrics = (globs.verbose & metricmask) != 0;
+  if (!trace_metrics) {
+    globs.verbose &= ~VERBOSE_METRICS;
+    return;
+  }
+  fprintf(stderr, "Enabling metrics value tracing...\n\n");
+  globs.verbose |= VERBOSE_METRICS;
+}
+
+/* Book-keeping for a mocked up API call: set bit number 'callbit' in the
+ * 'globs.callbits' call type tracking bitmask, increase 'globs.api_calls'
+ * call counter, and log the call if call tracing is enabled.
+ *
+ * Returns true if caller should fail this call, i.e. when a non-zero
+ * 'globs.api_limit' is set and the call counter just reached it.
+ */
+static bool call_limit(int callbit, const char *name) {
+  globs.callbits |= 1u << callbit;
+  globs.api_calls++;
+
+  if (globs.verbose & VERBOSE_CALLS) {
+    fprintf(stderr, "CALL %d: %s()\n", globs.api_calls, name);
+  }
+
+  const bool limit_hit =
+      globs.api_limit && globs.api_calls == globs.api_limit;
+  if (limit_hit) {
+    fprintf(stderr, "LIMIT @ %d: %s()\n", globs.api_calls, name);
+  }
+  return limit_hit;
+}
+
+/* ------------------------------------------------------------------------- */
+/* mock up level-zero init/driver/device handling API, called during gpu_init()
+ */
+
+/* mock up handle values to set & check against: driver handle, device
+ * handle, and value cast to handle type of each metric (see ADD_METRIC)
+ */
+#define DRV_HANDLE ((ze_driver_handle_t)(0x123456))
+#define DEV_HANDLE ((ze_device_handle_t)(0xecced))
+#define VAL_HANDLE 0xcaffa
+
+/* Mock zeInit(): fails when call limit is hit, or when 'flags' is something
+ * else than zero or ZE_INIT_FLAG_GPU_ONLY.
+ */
+ze_result_t zeInit(ze_init_flags_t flags) {
+  if (call_limit(0, "zeInit")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  const bool flags_ok = (flags == 0 || flags == ZE_INIT_FLAG_GPU_ONLY);
+  return flags_ok ? ZE_RESULT_SUCCESS : ZE_RESULT_ERROR_INVALID_ENUMERATION;
+}
+
+/* Mock zeDriverGet(): reports a single driver.
+ *
+ * With zero '*count', sets it to 1.  With '*count' of 1, stores DRV_HANDLE
+ * to 'handles'.  Any other count, a NULL pointer, or hitting the call
+ * limit is an error.
+ */
+ze_result_t zeDriverGet(uint32_t *count, ze_driver_handle_t *handles) {
+  if (call_limit(1, "zeDriverGet")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (count == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  switch (*count) {
+  case 0:
+    /* count query */
+    *count = 1;
+    return ZE_RESULT_SUCCESS;
+  case 1:
+    break;
+  default:
+    return ZE_RESULT_ERROR_INVALID_SIZE;
+  }
+  if (handles == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  handles[0] = DRV_HANDLE;
+  return ZE_RESULT_SUCCESS;
+}
+
+/* Mock zeDeviceGet(): reports a single device for the DRV_HANDLE driver.
+ *
+ * With zero '*count', sets it to 1.  With '*count' of 1, stores DEV_HANDLE
+ * to 'handles'.  Wrong driver handle, any other count, a NULL pointer, or
+ * hitting the call limit is an error.
+ */
+ze_result_t zeDeviceGet(ze_driver_handle_t drv, uint32_t *count,
+                        ze_device_handle_t *handles) {
+  if (call_limit(2, "zeDeviceGet")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (drv != DRV_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (count == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  switch (*count) {
+  case 0:
+    /* count query */
+    *count = 1;
+    return ZE_RESULT_SUCCESS;
+  case 1:
+    break;
+  default:
+    return ZE_RESULT_ERROR_INVALID_SIZE;
+  }
+  if (handles == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  handles[0] = DEV_HANDLE;
+  return ZE_RESULT_SUCCESS;
+}
+
+/* Mock zeDeviceGetProperties(): for the DEV_HANDLE device, zeroes 'props'
+ * and sets its type to ZE_DEVICE_TYPE_GPU.  Wrong device handle, NULL
+ * 'props', or hitting the call limit is an error.
+ */
+ze_result_t zeDeviceGetProperties(ze_device_handle_t dev,
+                                  ze_device_properties_t *props) {
+  if (call_limit(3, "zeDeviceGetProperties")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (dev != DEV_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (props == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  memset(props, 0, sizeof(*props));
+  props->type = ZE_DEVICE_TYPE_GPU;
+  return ZE_RESULT_SUCCESS;
+}
+
+/* Mock zeDeviceGetMemoryProperties(): reports a single, zeroed memory
+ * properties item for the DEV_HANDLE device.
+ *
+ * With zero '*count', sets it to 1.  With '*count' of 1, zeroes 'props'.
+ * Wrong device handle, any other count, a NULL pointer, or hitting the
+ * call limit is an error.
+ */
+ze_result_t zeDeviceGetMemoryProperties(ze_device_handle_t dev, uint32_t *count,
+                                        ze_device_memory_properties_t *props) {
+  if (call_limit(4, "zeDeviceGetMemoryProperties")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (dev != DEV_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (count == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  switch (*count) {
+  case 0:
+    /* count query */
+    *count = 1;
+    return ZE_RESULT_SUCCESS;
+  case 1:
+    break;
+  default:
+    return ZE_RESULT_ERROR_INVALID_SIZE;
+  }
+  if (props == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  memset(props, 0, sizeof(*props));
+  return ZE_RESULT_SUCCESS;
+}
+
+/* mock up level-zero sysman device handling API, called during gpu_init() */
+
+/* Mock zesDeviceGetProperties(): zeroes 'props' for the DEV_HANDLE device.
+ * Wrong device handle, NULL 'props', or hitting the call limit is an error.
+ */
+ze_result_t zesDeviceGetProperties(zes_device_handle_t dev,
+                                   zes_device_properties_t *props) {
+  if (call_limit(5, "zesDeviceGetProperties")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (dev != DEV_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (props == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  memset(props, 0, sizeof(*props));
+  return ZE_RESULT_SUCCESS;
+}
+
+/* Mock zesDevicePciGetProperties(): zeroes 'props' for the DEV_HANDLE
+ * device.  Wrong device handle, NULL 'props', or hitting the call limit
+ * is an error.
+ */
+ze_result_t zesDevicePciGetProperties(zes_device_handle_t dev,
+                                      zes_pci_properties_t *props) {
+  if (call_limit(6, "zesDevicePciGetProperties")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (dev != DEV_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (props == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  memset(props, 0, sizeof(*props));
+  return ZE_RESULT_SUCCESS;
+}
+
+/* Mock zesDeviceGetState(): zeroes 'state' for the DEV_HANDLE device.
+ * Wrong device handle, NULL 'state', or hitting the call limit is an error.
+ */
+ze_result_t zesDeviceGetState(zes_device_handle_t dev,
+                              zes_device_state_t *state) {
+  if (call_limit(7, "zesDeviceGetState")) {
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+  if (dev != DEV_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (state == NULL) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  memset(state, 0, sizeof(*state));
+  return ZE_RESULT_SUCCESS;
+}
+
+/* number of mocked up init phase functions above (they use call bits 0-7),
+ * and bitmask with the call bits for all of them set
+ */
+#define INIT_CALL_FUNCS 8
+#define INIT_CALL_BITS (((uint64_t)1 << INIT_CALL_FUNCS) - 1)
+
+/* ------------------------------------------------------------------------- */
+/* mock up Sysman API metrics querying functions */
+
+/* Initial values (*_START / *_INIT) and per-query increments (*_INC) for
+ * the values reported by the mocked up metric query functions below
+ */
+#define COUNTER_START 100000 // 100ms
+#define COUNTER_INC 20000    // 20ms
+#define TIME_START 5000000   // 5s in us
+#define TIME_INC 1000000     // 1s in us
+#define COUNTER_MAX TIME_INC
+
+/* what should get reported as result of above */
+#define COUNTER_RATIO ((double)COUNTER_INC / TIME_INC)
+
+#define FREQ_INIT 300
+#define FREQ_INC 50
+
+#define MEMORY_SIZE (1024 * 1024 * 1024)
+#define MEMORY_INIT (MEMORY_SIZE / 2) // so that both free & used get same value
+#define MEMORY_INC (MEMORY_SIZE / 64)
+
+#define RAS_INIT 0
+#define RAS_INC 1
+
+#define TEMP_INIT 10
+#define TEMP_INC 5
+
+/* Define the three mocked up Sysman functions for one metric type:
+ * - 'getname': handle enumeration.  With zero '*count' sets it to 1, with
+ *   '*count' of 1 stores VAL_HANDLE (cast to handle type) to 'handles'
+ * - 'propname': properties query, provides otherwise zeroed properties
+ *   struct with 'onSubdevice' member set
+ * - 'statename': state query, provides current 'statevar' value and then
+ *   applies 'stateinc1' and 'stateinc2' to it
+ *
+ * They use call bits 'callbit', 'callbit + 1' and 'callbit + 2', fail with
+ * ZE_RESULT_ERROR_NOT_AVAILABLE when call_limit() tells them to, and
+ * return an error also for unexpected handle, count or NULL pointer.
+ *
+ * Macro arguments:
+ * Call bit, metric enumeration function name, its handle type,
+ * corresponding zes*GetProperties() function name, its property struct type,
+ * corresponding zes*GetState() function name, its state struct type, global
+ * variable for initial state values, two increment operations for the global
+ * state variable members (or void)
+ */
+#define ADD_METRIC(callbit, getname, handletype, propname, proptype,           \
+                   statename, statetype, statevar, stateinc1, stateinc2)       \
+  ze_result_t getname(zes_device_handle_t dev, uint32_t *count,                \
+                      handletype *handles) {                                   \
+    if (call_limit(callbit, #getname))                                         \
+      return ZE_RESULT_ERROR_NOT_AVAILABLE;                                    \
+    if (dev != DEV_HANDLE)                                                     \
+      return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;                              \
+    if (!count)                                                                \
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;                             \
+    if (!*count) {                                                             \
+      *count = 1;                                                              \
+      return ZE_RESULT_SUCCESS;                                                \
+    }                                                                          \
+    if (*count != 1)                                                           \
+      return ZE_RESULT_ERROR_INVALID_SIZE;                                     \
+    if (!handles)                                                              \
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;                             \
+    handles[0] = (handletype)VAL_HANDLE;                                       \
+    return ZE_RESULT_SUCCESS;                                                  \
+  }                                                                            \
+  ze_result_t propname(handletype handle, proptype *prop) {                    \
+    proptype value = {.onSubdevice = true};                                    \
+    if (call_limit(callbit + 1, #propname))                                    \
+      return ZE_RESULT_ERROR_NOT_AVAILABLE;                                    \
+    if (handle != (handletype)VAL_HANDLE)                                      \
+      return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;                              \
+    if (!prop)                                                                 \
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;                             \
+    *prop = value;                                                             \
+    return ZE_RESULT_SUCCESS;                                                  \
+  }                                                                            \
+  ze_result_t statename(handletype handle, statetype *state) {                 \
+    if (call_limit(callbit + 2, #statename))                                   \
+      return ZE_RESULT_ERROR_NOT_AVAILABLE;                                    \
+    if (handle != (handletype)VAL_HANDLE)                                      \
+      return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;                              \
+    if (!state)                                                                \
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;                             \
+    *state = statevar;                                                         \
+    stateinc1;                                                                 \
+    stateinc2;                                                                 \
+    return ZE_RESULT_SUCCESS;                                                  \
+  }
+
+/* engine activity: both active time and timestamp increase on each query
+ * (call bits 0-2)
+ */
+static zes_engine_stats_t engine_stats = {.activeTime = COUNTER_START,
+                                          .timestamp = TIME_START};
+
+ADD_METRIC(0, zesDeviceEnumEngineGroups, zes_engine_handle_t,
+           zesEngineGetProperties, zes_engine_properties_t,
+           zesEngineGetActivity, zes_engine_stats_t, engine_stats,
+           engine_stats.activeTime += COUNTER_INC,
+           engine_stats.timestamp += TIME_INC)
+
+/* frequency: requested frequency increases twice as fast as the actual one
+ * (call bits 3-5)
+ */
+static zes_freq_state_t freq_state = {.request = FREQ_INIT,
+                                      .actual = FREQ_INIT};
+
+ADD_METRIC(3, zesDeviceEnumFrequencyDomains, zes_freq_handle_t,
+           zesFrequencyGetProperties, zes_freq_properties_t,
+           zesFrequencyGetState, zes_freq_state_t, freq_state,
+           freq_state.request += 2 * FREQ_INC, freq_state.actual += FREQ_INC)
+
+/* memory: free memory decreases on each query, and health value is XORed
+ * with ZES_MEM_HEALTH_OK on each query (call bits 6-8)
+ */
+static zes_mem_state_t mem_state = {.free = MEMORY_SIZE - MEMORY_INIT,
+                                    .size = MEMORY_SIZE};
+
+ADD_METRIC(6, zesDeviceEnumMemoryModules, zes_mem_handle_t,
+           zesMemoryGetProperties, zes_mem_properties_t, zesMemoryGetState,
+           zes_mem_state_t, mem_state, mem_state.free -= MEMORY_INC,
+           mem_state.health ^= ZES_MEM_HEALTH_OK)
+
+/* power: both energy counter and timestamp increase on each query
+ * (call bits 9-11)
+ */
+static zes_power_energy_counter_t power_counter = {.energy = COUNTER_START,
+                                                   .timestamp = TIME_START};
+
+ADD_METRIC(9, zesDeviceEnumPowerDomains, zes_pwr_handle_t,
+           zesPowerGetProperties, zes_power_properties_t,
+           zesPowerGetEnergyCounter, zes_power_energy_counter_t, power_counter,
+           power_counter.energy += COUNTER_INC,
+           power_counter.timestamp += TIME_INC)
+
+/* 'dummy' is used as no-op increment operation, and as RAS state variable */
+static int dummy;
+/* temperature: increases on each query (call bits 12-14) */
+static double temperature = TEMP_INIT;
+
+ADD_METRIC(12, zesDeviceEnumTemperatureSensors, zes_temp_handle_t,
+           zesTemperatureGetProperties, zes_temp_properties_t,
+           zesTemperatureGetState, double, temperature, temperature += TEMP_INC,
+           dummy = 0)
+
+/* RAS errors: only enumeration and properties functions generated here
+ * are Sysman API ones (call bits 15-16).  Generated zesRasGetDummy() state
+ * function is just a placeholder that macro requires
+ */
+ADD_METRIC(15, zesDeviceEnumRasErrorSets, zes_ras_handle_t, zesRasGetProperties,
+           zes_ras_properties_t, zesRasGetDummy, int,
+           dummy, // dummy as state API differs from others
+           dummy = 0, dummy = 0)
+
+ze_result_t zesRasGetState(zes_ras_handle_t handle, ze_bool_t clear,
+                           zes_ras_state_t *state) {
+  if (call_limit(17, "zesRasGetState")) {
+    return ZE_RESULT_ERROR_NOT_AVAILABLE;
+  }
+  if (handle != (zes_ras_handle_t)VAL_HANDLE) {
+    return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  if (clear) {
+    return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  }
+  if (!state) {
+    return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  static uint64_t count = RAS_INIT;
+  memset(state, 0, sizeof(zes_ras_state_t));
+  /* props default to zeroes i.e. correctable error type,
+   * so this needs to be a correctable category
+   */
+  state->category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS] = count;
+  count += RAS_INC;
+  return ZE_RESULT_SUCCESS;
+}
+
+/* Mock zesFrequencyGetThrottleTime(): reports throttle time & timestamp
+ * that start from COUNTER_START & TIME_START, and increase by COUNTER_INC
+ * & TIME_INC on each successful call.
+ *
+ * Hitting the call limit, wrong handle, or NULL 'state' is an error.
+ */
+ze_result_t zesFrequencyGetThrottleTime(zes_freq_handle_t handle,
+                                        zes_freq_throttle_time_t *state) {
+  /* values reported by the next successful call */
+  static zes_freq_throttle_time_t next = {.throttleTime = COUNTER_START,
+                                          .timestamp = TIME_START};
+  ze_result_t ret = ZE_RESULT_SUCCESS;
+
+  if (call_limit(18, "zesFrequencyGetThrottleTime")) {
+    ret = ZE_RESULT_ERROR_NOT_AVAILABLE;
+  } else if (handle != (zes_freq_handle_t)VAL_HANDLE) {
+    ret = ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  } else if (state == NULL) {
+    ret = ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  } else {
+    *state = next;
+    next.throttleTime += COUNTER_INC;
+    next.timestamp += TIME_INC;
+  }
+  return ret;
+}
+
+/* Mock zesMemoryGetBandwidth(): reports read & write counters and
+ * timestamp that increase on each successful call, read counter both
+ * starting from and increasing by twice the write counter amount.
+ * Max bandwidth stays at COUNTER_MAX.
+ *
+ * Hitting the call limit, wrong handle, or NULL 'state' is an error.
+ */
+ze_result_t zesMemoryGetBandwidth(zes_mem_handle_t handle,
+                                  zes_mem_bandwidth_t *state) {
+  /* values reported by the next successful call */
+  static zes_mem_bandwidth_t next = {.readCounter = 2 * COUNTER_START,
+                                     .writeCounter = COUNTER_START,
+                                     .maxBandwidth = COUNTER_MAX,
+                                     .timestamp = TIME_START};
+  ze_result_t ret = ZE_RESULT_SUCCESS;
+
+  if (call_limit(19, "zesMemoryGetBandwidth")) {
+    ret = ZE_RESULT_ERROR_NOT_AVAILABLE;
+  } else if (handle != (zes_mem_handle_t)VAL_HANDLE) {
+    ret = ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  } else if (state == NULL) {
+    ret = ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+  } else {
+    *state = next;
+    next.readCounter += 2 * COUNTER_INC;
+    next.writeCounter += COUNTER_INC;
+    next.timestamp += TIME_INC;
+  }
+  return ret;
+}
+
+/* number of call bits used by the mocked up metric query functions above
+ * (bits 0-19), and bitmask with all of those bits set
+ */
+#define QUERY_CALL_FUNCS 20
+#define QUERY_CALL_BITS (((uint64_t)1 << QUERY_CALL_FUNCS) - 1)
+
+/* ------------------------------------------------------------------------- */
+/* mock up metrics reporting and validation */
+
+/* Expectations and tracked state for one reported metric */
+typedef struct {
+  /* metric name to match */
+  const char *name;
+  /* present also when multisampling */
+  const bool multipresent;
+  /* metric values are multisampled and present only when multisampling */
+  const bool multisampled;
+  /* expected value on first round, and how much it changes per round */
+  const double value_init;
+  const double value_inc;
+  /* tracked state, reset by validate_and_reset_saved_metrics(): metric is
+   * considered not reported when 'count' is zero, 'last' is the value that
+   * gets compared against the expected one
+   */
+  unsigned int count;
+  double last;
+} metrics_validation_t;
+
+/* initial memory usage ratio and its per-query increase */
+#define RATIO_INIT ((double)MEMORY_INIT / MEMORY_SIZE)
+#define RATIO_INC ((double)MEMORY_INC / MEMORY_SIZE)
+
+/* Metrics expected from the plugin.  Columns are: name, multipresent,
+ * multisampled, value_init, value_inc, count, last (see
+ * metrics_validation_t).
+ *
+ * Items with "/min" & "/max" suffix are both multipresent and
+ * multisampled.  Their counterparts without the suffix are neither.
+ */
+static metrics_validation_t valid_metrics[] = {
+    {"all_errors_total", true, false, RAS_INIT, RAS_INC, 0, 0.0},
+    {"frequency_mhz/actual/gpu/min", true, true, FREQ_INIT, FREQ_INC, 0, 0.0},
+    {"frequency_mhz/actual/gpu/max", true, true, FREQ_INIT, FREQ_INC, 0, 0.0},
+    {"frequency_mhz/actual/gpu", false, false, FREQ_INIT, FREQ_INC, 0, 0.0},
+    {"frequency_mhz/request/gpu/min", true, true, FREQ_INIT, 2 * FREQ_INC, 0,
+     0.0},
+    {"frequency_mhz/request/gpu/max", true, true, FREQ_INIT, 2 * FREQ_INC, 0,
+     0.0},
+    {"frequency_mhz/request/gpu", false, false, FREQ_INIT, 2 * FREQ_INC, 0,
+     0.0},
+    /* used memory increases as mocked up free memory decreases */
+    {"memory_used_bytes/HBM/system/min", true, true, MEMORY_INIT, +MEMORY_INC,
+     0, 0.0},
+    {"memory_used_bytes/HBM/system/max", true, true, MEMORY_INIT, +MEMORY_INC,
+     0, 0.0},
+    {"memory_used_bytes/HBM/system", false, false, MEMORY_INIT, +MEMORY_INC, 0,
+     0.0},
+    {"memory_usage_ratio/HBM/system/min", true, true, RATIO_INIT, +RATIO_INC, 0,
+     0.0},
+    {"memory_usage_ratio/HBM/system/max", true, true, RATIO_INIT, +RATIO_INC, 0,
+     0.0},
+    {"memory_usage_ratio/HBM/system", false, false, RATIO_INIT, +RATIO_INC, 0,
+     0.0},
+    {"temperature_celsius", true, false, TEMP_INIT, TEMP_INC, 0, 0.0},
+
+    /* while counters increase, per-time incremented value should stay same,
+     * i.e. the ratio (and power) items have zero 'value_inc'
+     */
+    {"engine_use_usecs_total/all", true, false, COUNTER_START, COUNTER_INC, 0,
+     0.0},
+    {"engine_ratio/all", true, false, COUNTER_RATIO, 0, 0, 0.0},
+    {"throttled_usecs_total/gpu", true, false, COUNTER_START, COUNTER_INC, 0,
+     0.0},
+    {"throttled_ratio/gpu", true, false, COUNTER_RATIO, 0, 0, 0.0},
+    {"memory_bw_bytes_total/HBM/system/read", true, false, 2 * COUNTER_START,
+     2 * COUNTER_INC, 0, 0.0},
+    {"memory_bw_bytes_total/HBM/system/write", true, false, COUNTER_START,
+     COUNTER_INC, 0, 0.0},
+    {"memory_bw_ratio/HBM/system/read", true, false, 2 * COUNTER_RATIO, 0, 0,
+     0.0},
+    {"memory_bw_ratio/HBM/system/write", true, false, COUNTER_RATIO, 0, 0, 0.0},
+    {"energy_ujoules_total", true, false, COUNTER_START, COUNTER_INC, 0, 0.0},
+    {"power_watts", true, false, COUNTER_RATIO, 0, 0, 0.0},
+};
+
+/* VALIDATE: check and reset the tracked metric values, and return the
+ * number of problems found (missing metrics + metrics with wrong value).
+ *
+ * A metric with zero 'count' was not reported since the last reset.
+ * Whether that counts as missing depends on the mode: when 'multisampled'
+ * is non-zero, only metrics flagged 'multipresent' are required,
+ * otherwise only metrics NOT flagged 'multisampled' are required.
+ *
+ * For non-zero 'base_rounds' parameter values, the last reported value
+ * of each metric is compared to the value expected for that round
+ * ('value_init' + rounds * 'value_inc'). On mismatch an error is logged
+ * and the metric is included in the returned count.
+ *
+ * If 'multisampled' is non-zero, the round count is increased by a
+ * suitable amount based on the 'config.samples' value and the metric
+ * 'multisampled' flag.
+ */
+static int validate_and_reset_saved_metrics(unsigned int base_rounds,
+                                            unsigned int multisampled) {
+  assert(config.samples > 0);
+  int wrong = 0, missing = 0;
+  for (int i = 0; i < (int)STATIC_ARRAY_SIZE(valid_metrics); i++) {
+    metrics_validation_t *metric = &valid_metrics[i];
+    if (!metric->count) {
+      /* metric was not reported, decide whether it should have been */
+      bool missed = false;
+      if (multisampled) {
+        if (metric->multipresent) {
+          missed = true;
+        }
+      } else {
+        if (!metric->multisampled) {
+          missed = true;
+        }
+      }
+      if (missed) {
+        fprintf(stderr, "expected metric type '%s' not reported\n",
+                metric->name);
+        missing++;
+      }
+      /* nothing was stored for this metric, so nothing to reset or check */
+      continue;
+    }
+    /* verify metrics array above is correctly filled: a metric reported
+     * on a multisampled run must be flagged 'multipresent', otherwise
+     * dump the offending entry and abort the whole test */
+    if (multisampled && !metric->multipresent) {
+      fprintf(stderr, "%s: %s / %s = %g (%d)\n", metric->name,
+              metric->multipresent ? "multipresent" : "-",
+              metric->multisampled ? "multisampled" : "-", metric->last,
+              metric->count);
+      abort();
+    }
+
+    /* take a copy of the value for checking and reset the tracked state */
+    double last = metric->last;
+    metric->last = 0.0;
+    metric->count = 0;
+    if (!base_rounds) {
+      /* no metric value checking requested */
+      continue;
+    }
+    /* round 1 is expected to report 'value_init', i.e. zero increments */
+    int incrounds = base_rounds - 1;
+    if (multisampled && metric->multisampled) {
+      /* min for increasing metrics is first value in given multisample round */
+      if (metric->value_inc > 0.0 && strstr(metric->name, "/min")) {
+        incrounds += multisampled - config.samples + 1;
+      }
+      /* max for decreasing metrics is first value in given multisample round */
+      else if (metric->value_inc < 0.0 && strstr(metric->name, "/max")) {
+        incrounds += multisampled - config.samples + 1;
+      } else {
+        /* for all others, it's the last value sampled */
+        incrounds += multisampled;
+      }
+    } else {
+      /* other metrics are sampled only at sample intervals */
+      incrounds += multisampled / config.samples;
+    }
+    double expected = metric->value_init + incrounds * metric->value_inc;
+    /* NOTE(review): exact floating point comparison, this relies on the
+     * expected values being exactly representable -- confirm against the
+     * constants used in the validation table */
+    if (last != expected) {
+      fprintf(
+          stderr,
+          "ERROR: expected %g, but got value %g for metric '%s' on round %d\n",
+          expected, last, metric->name, incrounds);
+      wrong++;
+    } else if (globs.verbose & VERBOSE_METRICS) {
+      fprintf(stderr, "round %d metric value verified for '%s' (%.2f)\n",
+              incrounds, metric->name, expected);
+    }
+  }
+  /* individual misses were already logged above, this is just a summary */
+  if (missing && (globs.verbose & VERBOSE_METRICS)) {
+    fprintf(stderr, "%d metric(s) missing\n", missing);
+  }
+  return missing + wrong;
+}
+
+/* qsort() comparator ordering label pairs by name in descending order,
+ * so that the 'type' label comes first */
+static int cmp_labels(const void *a, const void *b) {
+  const label_pair_t *lhs = a;
+  const label_pair_t *rhs = b;
+  /* operands are swapped on purpose to get the reverse ordering */
+  return strcmp(rhs->name, lhs->name);
+}
+
+/* Constructs metric name from metric family name and metric label values.
+ *
+ * 'buf' (of 'bufsize' bytes) receives the family 'name' followed by
+ * "/<value>" for each label of 'metric', except for the "pci_bdf" and
+ * "sub_dev" labels. The labels of 'metric' are sorted in place first,
+ * so the caller's label array gets reordered.
+ *
+ * The name must fit into 'buf'. This is asserted after every append, so
+ * that a truncated append can never make the remaining size wrap around
+ * and let the next append write outside of the buffer.
+ */
+static void compose_name(char *buf, size_t bufsize, const char *name,
+                         metric_t *metric) {
+  label_pair_t *label = metric->label.ptr;
+  size_t num = metric->label.num;
+  assert(num && label);
+
+  /* guarantee stable label ordering i.e. names */
+  qsort(label, num, sizeof(*label), cmp_labels);
+
+  /* compose names (metric family + metric label values) */
+  size_t len = strlen(name);
+  assert(len < bufsize);
+  sstrncpy(buf, name, bufsize);
+  for (size_t i = 0; i < num; i++) {
+    const char *key = label[i].name;
+    const char *value = label[i].value;
+    assert(key && value);
+    if (strcmp(key, "pci_bdf") == 0 || strcmp(key, "sub_dev") == 0) {
+      /* do not add device PCI ID / sub device IDs to metric name */
+      continue;
+    }
+    int added = snprintf(buf + len, bufsize - len, "/%s", value);
+    /* snprintf() returns the length it would have needed, so check it
+     * before adding it to the offset used by the next round */
+    assert(added > 0 && (size_t)added < bufsize - len);
+    len += (size_t)added;
+  }
+  assert(len < bufsize);
+}
+
+/* Returns the 'value' member matching the given metric 'type' as double.
+ *
+ * Only counter and gauge types are supported. Any other type trips an
+ * assert, and 0.0 is returned if asserts are compiled out, so that the
+ * function never falls off its end without a return value.
+ */
+static double get_value(metric_type_t type, value_t value) {
+  switch (type) {
+  case METRIC_TYPE_COUNTER:
+    return value.counter;
+  case METRIC_TYPE_GAUGE:
+    return value.gauge;
+  default:
+    break;
+  }
+  /* unsupported metric type */
+  assert(0);
+  return 0.0;
+}
+
+/* Mock of the collectd dispatch function: composes a name for each metric
+ * in the family, and stores the metric value + increases the report count
+ * of the first validation array entry whose name is part of that name.
+ *
+ * Dispatching stops as soon as a name containing "errors" but not
+ * "all_errors" is seen. Otherwise at least one metric must have matched.
+ * Always returns 0.
+ */
+int plugin_dispatch_metric_family(metric_family_t const *fam) {
+  assert(fam && fam->name && fam->metric.num && fam->metric.ptr);
+
+  bool matched = false;
+  char fullname[128];
+  const size_t valid_count = STATIC_ARRAY_SIZE(valid_metrics);
+
+  for (size_t idx = 0; idx < fam->metric.num; idx++) {
+    metric_t *item = &fam->metric.ptr[idx];
+    double value = get_value(fam->type, item->value);
+    compose_name(fullname, sizeof(fullname), fam->name, item);
+    if (globs.verbose & VERBOSE_METRICS) {
+      fprintf(stderr, "METRIC: %s: %.2f\n", fullname, value);
+    }
+    /* for now, ignore other errors than for all_errors */
+    if (strstr(fullname, "errors") && !strstr(fullname, "all_errors")) {
+      return 0;
+    }
+    /* find first validation entry that is a substring of the name */
+    size_t v = 0;
+    while (v < valid_count && !strstr(fullname, valid_metrics[v].name)) {
+      v++;
+    }
+    if (v < valid_count) {
+      valid_metrics[v].last = value;
+      valid_metrics[v].count++;
+      matched = true;
+    }
+  }
+  assert(matched);
+  return 0;
+}
+
+#define MAX_LABELS 8
+
+/* Mock function setting label 'name' of metric 'm' to a copy of 'value',
+ * either by replacing the value of an existing label with that name, or
+ * by adding a new label. Always returns 0.
+ *
+ * The mock uses just one large enough labels array (MAX_LABELS items,
+ * for testing) instead of increasing it one-by-one, like the real
+ * collectd metrics code does.
+ */
+int metric_label_set(metric_t *m, char const *name, char const *value) {
+  assert(m && name);
+  /* removing label with NULL 'value' is not supported, check that before
+   * anything in the metric gets changed */
+  assert(value);
+  size_t num = m->label.num;
+  label_pair_t *pair = m->label.ptr;
+  if (num) {
+    assert(num < MAX_LABELS);
+    assert(pair);
+  } else {
+    assert(!pair);
+    pair = scalloc(MAX_LABELS, sizeof(*pair));
+    m->label.ptr = pair;
+  }
+  int i;
+  for (i = 0; i < MAX_LABELS; i++) {
+    if (!pair[i].name) {
+      /* not found -> new label */
+      pair[i].name = strdup(name);
+      m->label.num++;
+      break;
+    }
+    if (strcmp(name, pair[i].name) == 0) {
+      break;
+    }
+  }
+  /* loop must have stopped at a free or a matching slot */
+  assert(i < MAX_LABELS);
+  /* value is NULL for new labels, and free(NULL) is a no-op */
+  free(pair[i].value);
+  pair[i].value = strdup(value);
+  return 0;
+}
+
+/* Mock function freeing all labels of metric 'm' and the label array
+ * itself, leaving the metric without labels. Always returns 0.
+ */
+int metric_reset(metric_t *m) {
+  assert(m);
+  label_pair_t *labels = m->label.ptr;
+  size_t remaining = m->label.num;
+  if (remaining == 0) {
+    /* no labels means that no array got allocated either */
+    assert(labels == NULL);
+    return 0;
+  }
+  assert(labels != NULL);
+  /* labels are stored without gaps, first unnamed slot ends them */
+  for (label_pair_t *cur = labels; cur < labels + MAX_LABELS; cur++) {
+    if (cur->name == NULL) {
+      break;
+    }
+    free(cur->name);
+    free(cur->value);
+    cur->name = NULL;
+    cur->value = NULL;
+    remaining--;
+  }
+  /* label count must match the number of labels found */
+  assert(remaining == 0);
+  free(labels);
+  m->label.num = 0;
+  m->label.ptr = NULL;
+  return 0;
+}
+
+#define MAX_METRICS 8
+
+/* Mock function appending a copy of metric 'm' to metric family 'fam'.
+ * The copy gets its own duplicates of the labels, and its 'family'
+ * member is set to point to 'fam'. Always returns 0.
+ *
+ * The mock uses just one large enough metrics array (MAX_METRICS items,
+ * for testing) instead of increasing it one-by-one, like the real
+ * collectd metrics code does.
+ */
+int metric_family_metric_append(metric_family_t *fam, metric_t m) {
+  assert(fam);
+  size_t num = fam->metric.num;
+  metric_t *metric = fam->metric.ptr;
+  if (num) {
+    assert(num < MAX_METRICS);
+    assert(metric);
+  } else {
+    assert(!metric);
+    metric = scalloc(MAX_METRICS, sizeof(*metric));
+    fam->metric.ptr = metric;
+  }
+  /* link the metric to its family before copying it, setting this
+   * afterwards would change only the by-value argument */
+  m.family = fam;
+  /* copy metric and pointers to its labels */
+  metric[num] = m;
+  label_pair_t *src = m.label.ptr;
+  if (src) {
+    /* alloc max size as labels can be added also to family metrics copies */
+    label_pair_t *dst = scalloc(MAX_LABELS, sizeof(*dst));
+    metric[num].label.ptr = dst;
+    for (size_t i = 0; i < m.label.num; i++) {
+      dst[i].name = strdup(src[i].name);
+      dst[i].value = strdup(src[i].value);
+    }
+  }
+  fam->metric.num++;
+  return 0;
+}
+
+/* Mock function freeing all metrics of family 'fam', including their
+ * labels, leaving the family without metrics. Always returns 0.
+ */
+int metric_family_metric_reset(metric_family_t *fam) {
+  metric_t *items = fam->metric.ptr;
+  size_t count = fam->metric.num;
+
+  for (size_t idx = 0; idx < count; idx++) {
+    metric_t *item = &items[idx];
+    label_pair_t *labels = item->label.ptr;
+    size_t label_count = item->label.num;
+    /* label strings first, then the array holding them */
+    for (size_t l = 0; l < label_count; l++) {
+      free(labels[l].name);
+      free(labels[l].value);
+    }
+    free(labels);
+    item->label.num = 0;
+    item->label.ptr = NULL;
+  }
+  free(items);
+  fam->metric.num = 0;
+  fam->metric.ptr = NULL;
+  return 0;
+}
+
+/* ------------------------------------------------------------------------- */
+/* mock up of collectd plugin API */
+
+/* what the plugin under test registered through the mocked
+ * plugin_register_*() functions */
+static struct {
+  /* copy of the plugin name given at config registration */
+  char *name;
+  /* copies of the config key names given at config registration */
+  char **keys;
+  /* number of items in 'keys' */
+  unsigned int key_count;
+  /* registered plugin callbacks */
+  int (*config)(const char *key, const char *val);
+  plugin_init_cb init;
+  int (*read)(void);
+  plugin_shutdown_cb shutdown;
+} registry;
+
+/* Mock function storing the plugin config 'callback', and copies of the
+ * plugin 'name' and its 'keys_num' config 'keys', to the test registry.
+ * Always returns 0.
+ */
+int plugin_register_config(const char *name,
+                           int (*callback)(const char *key, const char *val),
+                           const char **keys, int keys_num) {
+  assert(name && callback && keys && keys_num > 0);
+
+  char **copies = scalloc(keys_num, sizeof(*copies));
+  for (int k = 0; k < keys_num; k++) {
+    assert(keys[k]);
+    copies[k] = strdup(keys[k]);
+  }
+
+  registry.name = strdup(name);
+  registry.config = callback;
+  registry.keys = copies;
+  registry.key_count = keys_num;
+  return 0;
+}
+/* Mock function storing the plugin init 'callback' to the test registry.
+ * 'name' must match the name given at config registration. Returns 0.
+ */
+int plugin_register_init(const char *name, plugin_init_cb callback) {
+  assert(name != NULL);
+  assert(callback != NULL);
+  assert(!strcmp(name, registry.name));
+  registry.init = callback;
+  return 0;
+}
+/* Mock function storing the plugin read 'callback' to the test registry.
+ * 'name' must match the name given at config registration. Returns 0.
+ */
+int plugin_register_read(const char *name, int (*callback)(void)) {
+  assert(name != NULL);
+  assert(callback != NULL);
+  assert(!strcmp(name, registry.name));
+  registry.read = callback;
+  return 0;
+}
+/* Mock function storing the plugin shutdown 'callback' to the test
+ * registry. 'name' must match the name given at config registration.
+ * Returns 0.
+ */
+int plugin_register_shutdown(const char *name, plugin_shutdown_cb callback) {
+  assert(name != NULL);
+  assert(callback != NULL);
+  assert(!strcmp(name, registry.name));
+  registry.shutdown = callback;
+  return 0;
+}
+
+/* ------------------------------------------------------------------------- */
+/* helper code partially copied from collectd (initially Copyright Florian
+ * Forster) */
+
+/* Log level names, indexed by the log level value. Entries for values
+ * below LOG_ERR are placeholders, plugin_log() does not accept those.
+ */
+static const struct {
+  int level;
+  const char *name;
+} log_levels[] = {
+    {0, "???"},
+    {1, "???"},
+    {2, "???"},
+    {LOG_ERR, "ERROR"},
+    {LOG_WARNING, "WARN"},
+    {LOG_NOTICE, "NOTICE"},
+    {LOG_INFO, "INFO"},
+    {LOG_DEBUG, "DEBUG"},
+};
+
+/* Mock of the collectd logging function (half based on daemon/plugin.c).
+ *
+ * Counts every message, separately counts the ones at LOG_WARNING
+ * level or more severe, and prints the formatted message to stderr
+ * followed by the log level name.
+ */
+void plugin_log(int level, const char *format, ...) {
+  assert(level >= LOG_ERR && level < (int)STATIC_ARRAY_SIZE(log_levels));
+
+  globs.messages++;
+  if (level <= LOG_WARNING) {
+    globs.warnings++;
+  }
+
+  char text[1024];
+  va_list args;
+  va_start(args, format);
+  vsnprintf(text, sizeof(text), format, args);
+  va_end(args);
+
+  fprintf(stderr, "%s (%s)\n", text, log_levels[level].name);
+}
+
+/* Safe function wrapper from utils/common/common.c: copies at most
+ * 'n' - 1 characters from 'src' to 'dest' and always terminates 'dest',
+ * truncating the string when it does not fit. Returns 'dest'.
+ *
+ * With zero 'n' there is no room even for the terminator, so 'dest' is
+ * left untouched instead of writing to the byte before it.
+ */
+char *sstrncpy(char *dest, const char *src, size_t n) {
+  if (n == 0) {
+    return dest;
+  }
+  strncpy(dest, src, n);
+  dest[n - 1] = '\0';
+  return dest;
+}
+/* calloc() wrapper for the tests, asserts that the allocation succeeded */
+void *scalloc(size_t nmemb, size_t size) {
+  void *mem = calloc(nmemb, size);
+  assert(mem != NULL);
+  return mem;
+}
+/* malloc() wrapper for the tests, asserts that the allocation succeeded */
+void *smalloc(size_t size) {
+  void *mem = malloc(size);
+  assert(mem != NULL);
+  return mem;
+}
+
+/* ------------------------------------------------------------------------- */
+/* TEST: plugin setup & teardown */
+
+/* Verifies that the log level table is indexed by the log level value,
+ * lets the plugin register itself, and checks that it registered all
+ * of the config, init, read and shutdown callbacks.
+ */
+static void plugin_register(void) {
+  const int levels = (int)STATIC_ARRAY_SIZE(log_levels);
+  for (int lvl = 0; lvl < levels; lvl++) {
+    /* verify log levels match expected */
+    assert(log_levels[lvl].level == lvl);
+  }
+  module_register();
+  assert(registry.config);
+  assert(registry.init);
+  assert(registry.read);
+  assert(registry.shutdown);
+}
+
+/* Free test code registry struct allocs after config checks are done.
+ *
+ * The key count is reset along with the key array, so that the registry
+ * does not claim to have keys after they have been freed. Registered
+ * callbacks are left as they are.
+ */
+static void plugin_register_free(void) {
+  for (unsigned int i = 0; i < registry.key_count; i++) {
+    free(registry.keys[i]);
+  }
+  free(registry.keys);
+  registry.keys = NULL;
+  registry.key_count = 0;
+  free(registry.name);
+  registry.name = NULL;
+}
+
+/* ------------------------------------------------------------------------- */
+
+/* TEST: config keys. 'check_nonbool' checks non-boolean config keys,
+ * 'enable_metrics' enables querying of all metrics, and 'enable_logs'
+ * enables all logs as part of testing. Returns number of failed checks,
+ * i.e. 0 for success.
+ *
+ * Every registered key starting with "Disable" or "Log" is first set to
+ * "true". "Disable" keys are then set back to "false" when
+ * 'enable_metrics' is set, and "Log" keys when 'enable_logs' is not set.
+ */
+static int test_config_keys(bool check_nonbool, bool enable_metrics,
+                            bool enable_logs) {
+  struct {
+    bool set_false;
+    const char *prefix;
+  } bool_checks[] = {{enable_metrics, "Disable"}, {!enable_logs, "Log"}};
+  /* tests for non-bool config keys */
+  struct {
+    const char *key;
+    const char *value;
+    bool success;
+  } test[] = {
+      {"MetricsOutput", "derived", true},
+      {"MetricsOutput", "raW", true},
+      {"MetricsOutput", "Foobar", false},
+      {"MetricsOutput", "1", false},
+      {"Foobar", "Foobar", false},
+      {"Samples", "999", false},
+      {"Samples", "-1", false},
+      {"Samples", "8", true},
+      /* set back to default */
+      {"MetricsOutput", "Both", true},
+      {"Samples", "1", true},
+  };
+  unsigned int i, j;
+  int ret, fails = 0;
+
+  if (check_nonbool) {
+    for (i = 0; i < STATIC_ARRAY_SIZE(test); i++) {
+      ret = registry.config(test[i].key, test[i].value);
+      /* both unexpected success and unexpected failure are errors */
+      if ((ret == 0) != test[i].success) {
+        fprintf(stderr, "ERROR: unexpected config %s with '%s'='%s'\n",
+                ret ? "fail" : "success", test[i].key, test[i].value);
+        fails++;
+      }
+    }
+  }
+
+  /* make sure that also bool values work */
+  for (i = 0; i < registry.key_count; i++) {
+    const char *key = registry.keys[i];
+
+    /* loop bound follows the table size instead of a hard-coded count */
+    for (j = 0; j < STATIC_ARRAY_SIZE(bool_checks); j++) {
+      const char *prefix = bool_checks[j].prefix;
+
+      if (strncmp(key, prefix, strlen(prefix))) {
+        continue;
+      }
+      ret = registry.config(key, "true");
+      if (bool_checks[j].set_false) {
+        ret += registry.config(key, "false");
+      }
+      if (ret != 0) {
+        fprintf(stderr, "ERROR: unexpected '%s' bool config set fail\n", key);
+        fails++;
+      }
+    }
+  }
+  return fails;
+}
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Sets all GPU metrics Disable* flags in 'disabled' to 'value'.
+ *
+ * Bits for the flags that had a different value are added to '*mask',
+ * '*all' is set to the bitmask covering every flag, and the number of
+ * changed flags is returned.
+ */
+static int get_reset_disabled(gpu_disable_t *disabled, bool value, int *mask,
+                              int *all) {
+  struct {
+    const char *name;
+    bool *flag;
+  } flags[] = {
+      {"engine", &disabled->engine},    {"frequency", &disabled->freq},
+      {"memory", &disabled->mem},       {"membw", &disabled->membw},
+      {"power", &disabled->power},      {"errors", &disabled->ras},
+      {"temperature", &disabled->temp}, {"throttle", &disabled->throttle}};
+  const int total = (int)STATIC_ARRAY_SIZE(flags);
+  int changed = 0;
+
+  *all = 0;
+  for (int bit = 0; bit < total; bit++) {
+    *all |= (1 << bit);
+    if (*(flags[bit].flag) == value) {
+      /* already at requested value */
+      continue;
+    }
+    if (globs.verbose & VERBOSE_METRICS) {
+      fprintf(stderr, "=> %s: %s\n", value ? "DISABLED" : "ENABLED",
+              flags[bit].name);
+    }
+    *(flags[bit].flag) = value;
+    *mask |= (1 << bit);
+    changed++;
+  }
+  return changed;
+}
+
+/* TEST: metrics queries error handling, return 0 for success.
+ *
+ * For each value from 'limit' down to 1, sets that as the API call limit
+ * and does a metrics query, which is expected to succeed, log at least
+ * one warning, and disable exactly one metric type. Over all the rounds
+ * every metric type is expected to have been disabled at least once.
+ * Finally verifies that a query fails when all metrics are disabled.
+ *
+ * NOTE(review): 'globs.api_limit' presumably makes the mocked Sysman
+ * call with that number fail -- confirm against the mock code.
+ */
+static int test_query_errors(unsigned int limit) {
+  assert(gpu_count == 1);
+  gpu_disable_t *disabled = &(gpus[0].disabled);
+
+  /* enable all metrics */
+  int fails, all, mask = 0;
+  get_reset_disabled(disabled, false, &mask, &all);
+
+  /* start collecting bits for the disabled metric types from scratch */
+  mask = fails = 0;
+  for (; limit > 0; limit--) {
+    int count;
+
+    globs.warnings = 0;
+    globs.api_calls = 0;
+    globs.api_limit = limit;
+
+    if (registry.read() != 0) {
+      fprintf(stderr,
+              "ERROR: metrics query failed completely with single call fail\n");
+      fails++;
+    }
+    /* there were logged call failures? */
+    if (globs.warnings == 0) {
+      fprintf(stderr, "ERROR: no errors/warnings reported when call %d fails\n",
+              limit);
+      fails++;
+    }
+    /* enable all metrics again & check that exactly one metric type got
+     * disabled? */
+    count = get_reset_disabled(disabled, false, &mask, &all);
+    if (count != 1) {
+      fprintf(stderr, "ERROR: %d metric types disabled instead of 1\n", count);
+      fails++;
+    }
+  }
+  /* 'mask' has a bit for each metric type re-enabled in above loop */
+  if (mask != all) {
+    fprintf(stderr,
+            "ERROR: all metric types were not disabled, expected %x, got %x\n",
+            all, mask);
+    fails++;
+  }
+  /* disable all metrics & check read fail */
+  globs.warnings = 0;
+  get_reset_disabled(disabled, true, &mask, &all);
+  /* NOTE(review): query is done twice and only the second result is
+   * checked -- confirm whether the first call is needed */
+  registry.read();
+  if (registry.read() == 0) {
+    fprintf(
+        stderr,
+        "ERROR: metrics query succceeded although all metrics were disabled\n");
+    fails++;
+  }
+  /* do not leave call limit or warnings from this test to later tests */
+  globs.warnings = globs.api_limit = 0;
+  return fails;
+}
+
+/* Change sampling rate to given 'samples' value (a positive integer as
+ * string), which implies plugin reset: plugin is shut down, configured
+ * with the new "Samples" value and initialized again. Every step must
+ * succeed.
+ *
+ * NOTE(review): the plugin calls are done inside assert(), so they would
+ * be dropped if this was built with NDEBUG defined.
+ */
+static void change_sampling_reset(const char *samples) {
+  fprintf(stderr, "Setting 'Samples' to '%s' and reseting plugin\n", samples);
+  assert(registry.shutdown() == 0);
+  assert(atoi(samples) > 0);
+  assert(registry.config("Samples", samples) == 0);
+  assert(registry.init() == 0);
+}
+
+/* TEST: metrics queries with multiple samples, return number of fails.
+ *
+ * 'samples' is the sampling factor (> 1) to test with. NOTE(review): it
+ * is presumably expected to match the plugin "Samples" setting done by
+ * the caller -- confirm. 'prev_rounds' is passed on as the base round
+ * count for the metric value validation.
+ *
+ * Does 3 x 'samples' queries: the first 'samples' ones prime the counter
+ * metrics and record the API call counts, the rest verify that call
+ * counts stay consistent, and on every 'samples'th round, that metric
+ * values are as expected. Finally plugin is reset back to "Samples" = 1.
+ */
+static int test_multisampled_queries(unsigned int prev_rounds,
+                                     unsigned int samples) {
+  assert(samples > 1);
+  /* first 'samples' rounds to prime counter metrics & count API calls */
+  if (globs.verbose & VERBOSE_METRICS) {
+    fprintf(stderr, "METRIC: first %d multisample rounds for query priming:\n",
+            samples);
+  }
+  unsigned int i, calls_sampled = 0;
+  for (i = 1; i <= samples; i++) {
+    globs.api_calls = 0;
+    assert(registry.read() == 0);
+    assert(globs.warnings == 0);
+    /* call count of the first round is used as the count for rounds
+     * where only sampled metrics are read */
+    if (!calls_sampled) {
+      calls_sampled = globs.api_calls;
+    }
+  }
+  /* call count of the last priming round, where all metrics are read */
+  unsigned int calls_all = globs.api_calls;
+  fprintf(stderr,
+          "expect %d API calls for %dx multisampled metrics, >= %d for all\n",
+          calls_sampled, samples, calls_all);
+
+  /* additional 2x 'samples' rounds to verify the results */
+  if (globs.verbose & VERBOSE_METRICS) {
+    fprintf(stderr,
+            "METRIC: additional %d+%d multisample rounds for verification:\n",
+            samples, samples);
+  }
+  int fails = 0;
+  for (/* i == samples + 1 */; i <= 3 * samples; i++) {
+    globs.api_calls = 0;
+    assert(registry.read() == 0);
+    assert(globs.warnings == 0);
+    /* verify same amount of calls on every run, separately for
+     * the case when only sampled metrics are read, and when all are
+     */
+    if (i % samples > 0) {
+      if (calls_sampled != globs.api_calls) {
+        fprintf(stderr, "ERROR: expected %d API calls, got %d\n", calls_sampled,
+                globs.api_calls);
+        fails++;
+      }
+      continue;
+    }
+    /* number of calls may differ on multisampled rounds, so just
+     * check that at least expected number of them is done
+     */
+    if (calls_all < calls_sampled || calls_all > globs.api_calls) {
+      fprintf(stderr, "ERROR: expected >= %d (and > %d) API calls, got %d\n",
+              calls_all, calls_sampled, globs.api_calls);
+      fails++;
+    }
+    /* metric values are validated only on rounds where all are read */
+    fails += validate_and_reset_saved_metrics(prev_rounds, i);
+  }
+  /* back to single sample */
+  assert(registry.shutdown() == 0);
+  assert(registry.config("Samples", "1") == 0);
+  assert(registry.init() == 0);
+  return fails;
+}
+
+/* TEST: error handling for Sysman calls during plugin init, return 0 for
+ * success.
+ *
+ * Plugin init is run once for each API call limit value from 'limit'
+ * down to 1. Every run is expected to fail and to log at least one
+ * warning. Call limit and warning count are zeroed at the end.
+ */
+static int test_init_errors(unsigned int limit) {
+  int failures = 0;
+
+  while (limit > 0) {
+    globs.api_limit = limit;
+    globs.api_calls = 0;
+    globs.warnings = 0;
+
+    int status = registry.init();
+    if (status == 0) {
+      fprintf(stderr, "ERROR: metrics init succeeded despite call %d failing\n",
+              limit);
+      failures++;
+      /* init went through, so plugin needs to be shut down again */
+      status = registry.shutdown();
+      if (status != 0) {
+        fprintf(stderr, "ERROR: plugin shutdown failed after init succeeded\n");
+        failures++;
+      }
+    }
+    if (!globs.warnings) {
+      fprintf(stderr, "ERROR: no errors/warnings reported when call %d fails\n",
+              limit);
+      failures++;
+    }
+    limit--;
+  }
+
+  globs.api_limit = 0;
+  globs.warnings = 0;
+  return failures;
+}
+
+/* ------------------------------------------------------------------------- */
+/* options parsing & main */
+
+/* Parses command line options and sets the matching verbosity bits to
+ * 'globs.verbose'. On an unknown option, prints usage with all supported
+ * options to stderr, and exits with status 1.
+ *
+ * Program name shown in the usage is 'argv[0]' without its directory
+ * part, or 'argv[0]' as-is when it has no directory part.
+ */
+static void parse_options(int argc, const char **argv) {
+  static const struct {
+    const char *opt;
+    unsigned int bit;
+    const char *desc;
+  } opts[] = {{"-ci", VERBOSE_CALLS_INIT, "Trace calls during metric inits"},
+              {"-cil", VERBOSE_CALLS_INIT_LIMIT,
+               "Trace calls during N call-limited init runs"},
+              {"-cm", VERBOSE_CALLS_METRICS,
+               "Trace calls during normal metric query runs"},
+              {"-cms", VERBOSE_CALLS_METRICS_SAMPLED,
+               "Trace calls during N sampled metric runs"},
+              {"-cml", VERBOSE_CALLS_METRICS_LIMIT,
+               "Trace calls during N call-limited metric runs"},
+              {"-mn", VERBOSE_METRICS_NORMAL,
+               "Log metric values in normal (samples=1) runs"},
+              {"-ms", VERBOSE_METRICS_SAMPLED,
+               "Log metric values in N sampled (samples>1) runs"},
+              {"-ml", VERBOSE_METRICS_LIMIT,
+               "Log metric values in N call-limited runs"}};
+  int i, j, count = STATIC_ARRAY_SIZE(opts);
+
+  for (i = 1; i < argc; i++) {
+    for (j = 0; j < count; j++) {
+      if (strcmp(argv[i], opts[j].opt) != 0) {
+        continue;
+      }
+      globs.verbose |= opts[j].bit;
+      break;
+    }
+    if (j < count) {
+      /* option was recognized */
+      continue;
+    }
+    /* strrchr() returns NULL when there's no directory separator, and
+     * otherwise points to the separator, not to the name after it */
+    const char *slash = strrchr(argv[0], '/');
+    const char *progname = slash ? slash + 1 : argv[0];
+    fprintf(stderr, "\nUsage: %s [options]\n\nOptions:\n", progname);
+    for (j = 0; j < count; j++) {
+      fprintf(stderr, "\t%s\t%s\n", opts[j].opt, opts[j].desc);
+    }
+    fprintf(stderr, "\n\t(Only Sysman API calls are traced.)\n");
+    exit(1);
+  }
+}
+
+/* Test program entry point: parses the verbosity options, lets the plugin
+ * register itself, and then runs the config, init, metrics query, metrics
+ * sampling and error handling tests in sequence. Returns 0 when all of
+ * them pass, first failing check terminates the program through assert().
+ *
+ * Test phases depend on plugin and 'globs' state left by earlier phases,
+ * so their order matters.
+ *
+ * NOTE(review): plugin calls are done inside assert(), so nothing would
+ * get tested if this was built with NDEBUG defined.
+ */
+int main(int argc, const char **argv) {
+  parse_options(argc, argv);
+
+  plugin_register();
+
+  /* config & minimal init checks */
+
+  set_verbose(VERBOSE_CALLS_INIT, 0);
+
+  fprintf(stderr, "Default plugin config + 2*init + shutdown...\n");
+  assert(registry.init() == 0);
+  /* 2nd init call should be no-op with log message about that */
+  globs.messages = 0;
+  assert(registry.init() == 0);
+  assert(globs.messages > 0);
+  assert(registry.shutdown() == 0);
+  fprintf(stderr, "default init/shutdown: PASS\n\n");
+
+  /* check misc config options, enable all metrics & extra plugin logging */
+  fprintf(stderr, "Misc config options checks...\n");
+  globs.warnings = 0;
+  assert(test_config_keys(true, true, true) == 0);
+  /* invalid config values in the checks are expected to log warnings */
+  assert(globs.warnings > 0);
+  /* more coverage by disabling only some of metrics at init */
+  globs.warnings = 0;
+  assert(registry.config("DisablePower", "true") == 0);
+  assert(registry.init() == 0);
+  assert(registry.shutdown() == 0);
+  assert(globs.warnings == 0);
+  fprintf(stderr, "misc config: PASS\n\n");
+
+  /* init should fail when every metric is disabled */
+  globs.warnings = 0;
+  fprintf(stderr, "All metrics & logs disabled + init/shutdown...\n");
+  assert(test_config_keys(false, false, false) == 0);
+  assert(registry.init() != 0);
+  assert(globs.warnings > 0);
+  /* undefined whether shutdown() returns fail or success after failed init */
+  registry.shutdown();
+  fprintf(stderr, "metrics disabled init/shutdown: PASS\n\n");
+
+  /* config tests done, re-enable metrics */
+  globs.warnings = 0;
+  assert(test_config_keys(false, true, false) == 0);
+  /* registered key names are not needed after this, callbacks remain */
+  plugin_register_free();
+
+  /* full init checks */
+
+  /* make sure all Sysman functions are called at init */
+  assert(registry.config("LogGpuInfo", "true") == 0);
+  assert(globs.warnings == 0);
+
+  fprintf(stderr,
+          "Check whether init with GPU info does all Sysman calls...\n");
+  globs.warnings = globs.api_calls = globs.callbits = 0;
+  assert(registry.init() == 0);
+  /* all Sysman metric init functions got called? */
+  assert(globs.callbits == INIT_CALL_BITS);
+  fprintf(stderr, "%d calls to all %d Sysman metric init functions\n",
+          globs.api_calls, INIT_CALL_FUNCS);
+  assert(registry.shutdown() == 0);
+  assert(globs.warnings == 0);
+  fprintf(stderr, "full init: PASS\n\n");
+
+  /* skip Sysman functions which failure isn't fatal for init */
+  assert(registry.config("LogGpuInfo", "false") == 0);
+
+  /* count relevant API calls */
+  globs.warnings = globs.api_calls = 0;
+  fprintf(stderr, "No init errors/warnings with GPU info disabled...\n");
+  assert(registry.init() == 0);
+  assert(registry.shutdown() == 0);
+  assert(globs.warnings == 0);
+  fprintf(stderr, "init warnings: PASS\n\n");
+
+  set_verbose(VERBOSE_CALLS_INIT_LIMIT, 0);
+
+  /* call count from above init + shutdown is the number of calls for
+   * which error handling gets tested next */
+  unsigned int api_calls = globs.api_calls;
+
+  fprintf(stderr,
+          "Error handling for each of %d relevant init Sysman calls...\n",
+          api_calls);
+  assert(test_init_errors(api_calls) == 0);
+  /* undefined whether shutdown() returns fail or success after failed init */
+  registry.shutdown();
+  fprintf(stderr, "init error handling: PASS\n\n");
+
+  /* metrics query & value checks */
+
+  assert(registry.config("DisableSeparateErrors", "false") == 0);
+  set_verbose(VERBOSE_CALLS_METRICS, VERBOSE_METRICS_NORMAL);
+  assert(registry.init() == 0);
+
+  fprintf(stderr, "Query all metrics for the first time, with separate errors "
+                  "enabled...\n");
+  globs.warnings = globs.api_calls = globs.callbits = 0;
+  assert(registry.read() == 0);
+  /* all Sysman metric query functions got successfully called? */
+  assert(globs.callbits == QUERY_CALL_BITS);
+  assert(globs.warnings == 0);
+  fprintf(stderr, "%d calls to all %d Sysman metric query functions\n",
+          globs.api_calls, QUERY_CALL_FUNCS);
+  /* per-time counters do not report on first round */
+  assert(validate_and_reset_saved_metrics(1, 0) > 0);
+  fprintf(stderr, "metrics query round 1: PASS\n\n");
+
+  /* remember call count of the first round for comparing with next one */
+  api_calls = globs.api_calls;
+  globs.api_calls = 0;
+
+  fprintf(stderr, "Another query for per-timediff metric values + validation "
+                  "for all values...\n");
+  assert(registry.read() == 0);
+  /* make sure second round does (successfully) same (amount of) calls */
+  assert(globs.warnings == 0);
+  /* second round may make additional calls */
+  assert(globs.api_calls >= api_calls);
+  /* make sure metrics values were correct and all metric types were now
+   * reported */
+  assert(validate_and_reset_saved_metrics(2, 0) == 0);
+  fprintf(stderr, "metrics query round 2: PASS\n\n");
+
+  /* just report total count of errors (should not affect calls) */
+  assert(registry.config("DisableSeparateErrors", "true") == 0);
+
+  /* third round is expected to do exactly as many calls as the second */
+  api_calls = globs.api_calls;
+  globs.api_calls = 0;
+
+  fprintf(stderr, "One more query to verify increment handling, with only "
+                  "error totals...\n");
+  assert(registry.read() == 0);
+  assert(globs.warnings == 0);
+  assert(globs.api_calls == api_calls);
+  /* make sure metrics values were correct and all metric types were reported */
+  assert(validate_and_reset_saved_metrics(3, 0) == 0);
+  fprintf(stderr, "metrics query round 3: PASS\n\n");
+
+  /* queries with metrics sampling enabled */
+
+  set_verbose(VERBOSE_CALLS_METRICS_SAMPLED, VERBOSE_METRICS_SAMPLED);
+  fprintf(stderr, "Check metrics with >1 'Samples' sampling factor...\n");
+  change_sampling_reset("8");
+  /* sampling factor given here needs to match the one set above */
+  assert(test_multisampled_queries(3, 8) == 0);
+  fprintf(stderr, "metrics sampling: PASS\n\n");
+
+  /* metrics error handling checks */
+
+  set_verbose(VERBOSE_CALLS_METRICS_LIMIT, VERBOSE_METRICS_LIMIT);
+  fprintf(stderr,
+          "Test error handling separately for each of the %d query calls...\n",
+          api_calls);
+  /* disable multisampling & do one query round to guarantee
+   * that all L0 calls are done on every read */
+  change_sampling_reset("1");
+  assert(registry.read() == 0);
+  assert(test_query_errors(api_calls) == 0);
+  assert(registry.shutdown() == 0);
+  fprintf(stderr, "metrics query error handling: PASS\n\n");
+
+  fprintf(stderr, "=> SUCCESS, all tests PASSed!\n");
+  return 0;
+}
diff --git a/src/utils/common/common.c b/src/utils/common/common.c

index a9b87612ca8ad8a4d8f1f19bf2ace893e830e7d4..c1d0d14a6f1bb23e6334635ce23f621df6aa548f 100644 (file)
--- a/src/utils/common/common.c
+++ b/src/utils/common/common.c
@@ -239,6 +239,17 @@ char *sstrerror(int errnum, char *buf, size_t buflen) {
    return buf;
  } /* char *sstrerror */
  
+void *scalloc(size_t nmemb, size_t size) {
+  /* zeroed allocation which exits the process instead of returning NULL */
+  void *mem = calloc(nmemb, size);
+  if (mem == NULL) {
+    ERROR("Not enough memory.");
+    exit(3);
+  }
+
+  return mem;
+} /* void *scalloc */
+
  void *smalloc(size_t size) {
    void *r;
  
diff --git a/src/utils/common/common.h b/src/utils/common/common.h

index 2812644a257b8d00caaae72368187906973fb0e2..cc702272d5d4240e43b09971dc3d56036ba6154c 100644 (file)
--- a/src/utils/common/common.h
+++ b/src/utils/common/common.h
@@ -79,6 +79,7 @@ size_t strnlen(const char *s, size_t maxlen);
  #endif
  
  char *sstrndup(const char *s, size_t n);
+void *scalloc(size_t nmemb, size_t size);
  void *smalloc(size_t size);
  char *sstrerror(int errnum, char *buf, size_t buflen);
author	Eero Tamminen <eero.t.tamminen@intel.com>
	Tue, 7 Jun 2022 17:55:14 +0000 (20:55 +0300)
committer	GitHub <noreply@github.com>
	Tue, 7 Jun 2022 17:55:14 +0000 (19:55 +0200)
Makefile.am		patch \| blob \| blame \| history
README		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
src/collectd.conf.in		patch \| blob \| blame \| history
src/collectd.conf.pod		patch \| blob \| blame \| history
src/gpu_sysman.c	[new file with mode: 0644]	patch \| blob
src/gpu_sysman_test.c	[new file with mode: 0644]	patch \| blob
src/utils/common/common.c		patch \| blob \| blame \| history
src/utils/common/common.h		patch \| blob \| blame \| history