From: Eero Tamminen Date: Thu, 17 Nov 2022 20:19:07 +0000 (+0200) Subject: gpu_sysman: initialize struct .pNext members before use X-Git-Tag: 6.0.0-rc0~89 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b18aed6930d9ce91534c41ce90453588ec006950;p=thirdparty%2Fcollectd.git gpu_sysman: initialize struct .pNext members before use Next Sysman spec will explictly state that they need be initialized: https://github.com/oneapi-src/level-zero-spec/commit/98dfaaf041dedfd8c9bcf9a3957f334836e859e4 And latest Sysman backend versions corrupt memory / crash unless .pNext values in some of the structs given to Get functions are initialized. (Releases before fall 2022 did not use .pNext values in get* calls, and worked fine. It just took a long time until I was able to verify whether this was a regression that will be fixed, or intended change.) Additionally, validate in test code that .pNext values are set to NULL (because some structs lack those pointer members, ADD_METRIC() macro cannot do that check for the functions given for it, but otherwise everything is covered). Signed-off-by: Eero Tamminen --- diff --git a/src/gpu_sysman.c b/src/gpu_sysman.c index 720c24223..dc6e101e6 100644 --- a/src/gpu_sysman.c +++ b/src/gpu_sysman.c @@ -188,6 +188,8 @@ static void **gpu_subarray_realloc(void **mem, int count, int size) { gpu_subarray_free(mem); mem = smalloc(config.samples * sizeof(void *)); for (i = 0; i < config.samples; i++) { + // (s)calloc used so pointers in structs are initialized to + // NULLs for Sysman metric state/property Get calls mem[i] = scalloc(count, size); } return mem; @@ -351,7 +353,7 @@ static bool gpu_info(zes_device_handle_t dev, char **pci_bdf, char **pci_dev) { char buf[32]; *pci_bdf = *pci_dev = NULL; - zes_pci_properties_t pci; + zes_pci_properties_t pci = {.pNext = NULL}; ze_result_t ret = zesDevicePciGetProperties(dev, &pci); if (ret == ZE_RESULT_SUCCESS) { const zes_pci_address_t *addr = &pci.address; @@ -382,7 +384,7 @@ static bool gpu_info(zes_device_handle_t dev, char **pci_bdf, char **pci_dev) { } INFO("HW state:"); - zes_device_state_t state; + zes_device_state_t state = {.pNext = NULL}; /* Note: there's also zesDevicePciGetState() for PCI link status */ if (ret = zesDeviceGetState(dev, &state), ret == ZE_RESULT_SUCCESS) { INFO("- repaired: %s", @@ -404,7 +406,7 @@ static bool gpu_info(zes_device_handle_t dev, char **pci_bdf, char **pci_dev) { } INFO("HW identification:"); - zes_device_properties_t props; + zes_device_properties_t props = {.pNext = NULL}; if (ret = zesDeviceGetProperties(dev, &props), ret == ZE_RESULT_SUCCESS) { const ze_device_properties_t *core = &props.core; snprintf(buf, sizeof(buf), "0x%x", core->deviceId); @@ -441,8 +443,7 @@ static bool gpu_info(zes_device_handle_t dev, char **pci_bdf, char **pci_dev) { WARNING(PLUGIN_NAME ": failed to get memory properties count => 0x%x", ret); return true; } - ze_device_memory_properties_t *mems; - mems = scalloc(mem_count, sizeof(*mems)); + ze_device_memory_properties_t *mems = scalloc(mem_count, sizeof(*mems)); if (ret = zeDeviceGetMemoryProperties(mdev, &mem_count, mems), ret != ZE_RESULT_SUCCESS) { WARNING(PLUGIN_NAME ": failed to get %d memory properties => 0x%x", @@ -607,7 +608,7 @@ static int gpu_fetch(ze_driver_handle_t *drivers, uint32_t driver_count, } /* Get all GPU devices for the driver */ for (uint32_t dev_idx = 0; dev_idx < dev_count; dev_idx++) { - ze_device_properties_t props; + ze_device_properties_t props = {.pNext = NULL}; if (ret = zeDeviceGetProperties(devs[dev_idx], &props), ret != ZE_RESULT_SUCCESS) { ERROR(PLUGIN_NAME @@ -794,7 +795,7 @@ static bool gpu_ras(gpu_device_t *gpu) { bool ok = false; for (i = 0; i < ras_count; i++) { - zes_ras_properties_t props; + zes_ras_properties_t props = {.pNext = NULL}; if (ret = zesRasGetProperties(ras[i], &props), ret != ZE_RESULT_SUCCESS) { ERROR(PLUGIN_NAME ": failed to get RAS set %d properties => 0x%x", i, ret); @@ -818,8 +819,8 @@ static bool gpu_ras(gpu_device_t *gpu) { snprintf(buf, sizeof(buf), "%d", props.subdeviceId); subdev = buf; } - zes_ras_state_t values; const bool clear = false; + zes_ras_state_t values = {.pNext = NULL}; if (ret = zesRasGetState(ras[i], clear, &values), ret != ZE_RESULT_SUCCESS) { ERROR(PLUGIN_NAME ": failed to get RAS set %d (%s) state => 0x%x", i, @@ -912,7 +913,7 @@ static void metric_set_subdev(metric_t *m, bool onsub, uint32_t subid) { * for success */ static ze_result_t set_mem_labels(zes_mem_handle_t mem, metric_t *metric) { - zes_mem_properties_t props; + zes_mem_properties_t props = {.pNext = NULL}; ze_result_t ret = zesMemoryGetProperties(mem, &props); if (ret != ZE_RESULT_SUCCESS) { return ret; @@ -1260,7 +1261,7 @@ static bool gpu_mems_bw(gpu_device_t *gpu) { */ static ze_result_t set_freq_labels(zes_freq_handle_t freq, metric_t *metric, double *maxfreq) { - zes_freq_properties_t props; + zes_freq_properties_t props = {.pNext = NULL}; ze_result_t ret = zesFrequencyGetProperties(freq, &props); if (ret != ZE_RESULT_SUCCESS) { return ret; @@ -1637,7 +1638,7 @@ static bool gpu_temps(gpu_device_t *gpu) { bool reported_ratio = false, ok = false; for (i = 0; i < temp_count; i++) { - zes_temp_properties_t props; + zes_temp_properties_t props = {.pNext = NULL}; if (ret = zesTemperatureGetProperties(temps[i], &props), ret != ZE_RESULT_SUCCESS) { ERROR(PLUGIN_NAME @@ -1758,7 +1759,7 @@ static bool gpu_powers(gpu_device_t *gpu) { bool ok = false; for (i = 0; i < power_count; i++) { - zes_power_properties_t props; + zes_power_properties_t props = {.pNext = NULL}; if (ret = zesPowerGetProperties(powers[i], &props), ret != ZE_RESULT_SUCCESS) { ERROR(PLUGIN_NAME ": failed to get power domain %d properties => 0x%x", i, @@ -1905,7 +1906,7 @@ static bool gpu_engines(gpu_device_t *gpu) { int type_idx[16] = {0}; bool reported_ratio = false, reported_counter = false, ok = false; for (i = 0; i < engine_count; i++) { - zes_engine_properties_t props; + zes_engine_properties_t props = {.pNext = NULL}; if (ret = zesEngineGetProperties(engines[i], &props), ret != ZE_RESULT_SUCCESS) { ERROR(PLUGIN_NAME ": failed to get engine group %d properties => 0x%x", i, diff --git a/src/gpu_sysman_test.c b/src/gpu_sysman_test.c index e5f248cd2..2cb697731 100644 --- a/src/gpu_sysman_test.c +++ b/src/gpu_sysman_test.c @@ -55,6 +55,7 @@ * - All registered config variables work and invalid config values are rejected * - All mocked up Sysman functions get called when no errors are returned and * count of Sysman calls is always same for plugin init() and read() callbacks + * - .pNext pointers in structs given to (most) Get functions are initialized * - Plugin dispatch API receives correct values for all metrics, both in * single-sampling, and in multi-sampling configurations * - Every Sysman call failure during init or metrics queries is logged, and @@ -222,6 +223,7 @@ ze_result_t zeDeviceGetProperties(ze_device_handle_t dev, ze_device_properties_t *props) { ze_result_t ret = dev_args_check(3, "zeDeviceGetProperties", dev, props); if (ret == ZE_RESULT_SUCCESS) { + assert(!props->pNext); memset(props, 0, sizeof(*props)); props->type = ZE_DEVICE_TYPE_GPU; } @@ -241,6 +243,7 @@ ze_result_t zeDeviceGetMemoryProperties(ze_device_handle_t dev, uint32_t *count, *count = 1; if (!props) return ZE_RESULT_ERROR_INVALID_NULL_POINTER; + assert(!props->pNext); memset(props, 0, sizeof(*props)); return ZE_RESULT_SUCCESS; } @@ -250,8 +253,10 @@ ze_result_t zeDeviceGetMemoryProperties(ze_device_handle_t dev, uint32_t *count, #define DEV_GET_ZEROED_STRUCT(callbit, getname, structtype) \ ze_result_t getname(zes_device_handle_t dev, structtype *to_zero) { \ ze_result_t ret = dev_args_check(callbit, #getname, dev, to_zero); \ - if (ret == ZE_RESULT_SUCCESS) \ + if (ret == ZE_RESULT_SUCCESS) { \ + assert(!to_zero->pNext); \ memset(to_zero, 0, sizeof(*to_zero)); \ + } \ return ret; \ } @@ -343,6 +348,7 @@ static ze_result_t metric_args_check(int callbit, const char *name, ze_result_t propname(handletype handle, proptype *prop) { \ ze_result_t ret = metric_args_check(callbit + 1, #propname, handle, prop); \ if (ret == ZE_RESULT_SUCCESS) { \ + assert(!prop->pNext); \ *prop = propvar; \ prop->onSubdevice = true; \ } \ @@ -425,6 +431,7 @@ ze_result_t zesRasGetState(zes_ras_handle_t handle, ze_bool_t clear, if (clear) { return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; } + assert(!state->pNext); static uint64_t count = RAS_INIT; memset(state, 0, sizeof(zes_ras_state_t)); /* props default to zeroes i.e. correctable error type,