return ret;
}
+#define AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY 512
+/* Max bad page slots allowed for SRIOV*/
+#define AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY 10665U
+
+/**
+ * amdgpu_virt_ras_realloc_eh_data_space - alloc/realloc VF bad-page @data->bps and @data->bps_bo
+ * @adev: amdgpu device
+ * @data: VF RAS error-handler data
+ * @pages: minimum number of new slots to add beyond @data->capacity
+ *
+ * Return: 0 on success, %-ENOMEM on failure.
+ */
+static int amdgpu_virt_ras_realloc_eh_data_space(struct amdgpu_device *adev,
+ struct amdgpu_virt_ras_err_handler_data *data,
+ int pages)
+{
+ struct eeprom_table_record *new_bps;
+ struct amdgpu_bo **new_bo;
+ unsigned int old_space;
+ unsigned int new_space;
+ unsigned int align_space;
+
+ old_space = (unsigned int)data->capacity;
+ new_space = old_space + max_t(unsigned int, (unsigned int)pages,
+ (unsigned int)AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY);
+ if (new_space < old_space || new_space > AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY)
+ return -ENOMEM;
+
+ align_space = ALIGN(new_space, AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY);
+ if (align_space > AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY)
+ return -ENOMEM;
+
+ new_bps = kmalloc_array(align_space, sizeof(*data->bps), GFP_KERNEL);
+ new_bo = kcalloc(align_space, sizeof(*data->bps_bo), GFP_KERNEL);
+ if (!new_bps || !new_bo) {
+ kfree(new_bps);
+ kfree(new_bo);
+ dev_warn_ratelimited(adev->dev,
+ "RAS WARN: failed to grow bad page table to %u slots\n",
+ align_space);
+ return -ENOMEM;
+ }
+
+ memcpy(new_bps, data->bps, data->count * sizeof(*data->bps));
+ memcpy(new_bo, data->bps_bo, data->count * sizeof(*data->bps_bo));
+
+ kfree(data->bps);
+ kfree(data->bps_bo);
+ data->bps = new_bps;
+ data->bps_bo = new_bo;
+ data->capacity = (int)align_space;
+
+ return 0;
+}
+
static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
{
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data;
- /* GPU will be marked bad on host if bp count more then 10,
- * so alloc 512 is enough.
- */
- unsigned int align_space = 512;
+ unsigned int align_space = AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY;
void *bps = NULL;
struct amdgpu_bo **bps_bo = NULL;
if (!bps)
goto bps_failure;
- bps_bo = kmalloc_objs(*(*data)->bps_bo, align_space);
+ bps_bo = kcalloc(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
if (!bps_bo)
goto bps_bo_failure;
(*data)->bps = bps;
(*data)->bps_bo = bps_bo;
+ (*data)->capacity = align_space;
(*data)->count = 0;
(*data)->last_reserved = 0;
virt->virt_eh_data = NULL;
}
-static void amdgpu_virt_ras_add_bps(struct amdgpu_device *adev,
- struct eeprom_table_record *bps, int pages)
+static bool amdgpu_virt_ras_add_bps(struct amdgpu_device *adev,
+ const struct eeprom_table_record *bps, int pages)
{
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
+ int need;
- if (!data)
- return;
+ if (!data || pages <= 0)
+ return false;
+
+ if (pages > AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY - data->count) {
+ dev_warn_ratelimited(adev->dev,
+ "RAS WARN: bad page table at capacity (count=%d pages=%d max=%u)\n",
+ data->count, pages,
+ AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY);
+ return false;
+ }
+
+ need = data->count + pages;
+ if (need > data->capacity &&
+ amdgpu_virt_ras_realloc_eh_data_space(adev, data, need - data->capacity))
+ return false;
memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
data->count += pages;
+
+ return true;
}
static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
memset(&bp, 0, sizeof(bp));
- if (bp_block_size) {
- bp_cnt = bp_block_size / sizeof(uint64_t);
- for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) {
- retired_page = *(uint64_t *)(vram_usage_va +
- bp_block_offset + bp_idx * sizeof(uint64_t));
- bp.retired_page = retired_page;
+ if (!bp_block_size)
+ return;
- if (amdgpu_virt_ras_check_bad_page(adev, retired_page))
- continue;
+ bp_cnt = bp_block_size / sizeof(uint64_t);
+ for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) {
+ retired_page = *(uint64_t *)(vram_usage_va +
+ bp_block_offset + bp_idx * sizeof(uint64_t));
+ bp.retired_page = retired_page;
- amdgpu_virt_ras_add_bps(adev, &bp, 1);
+ if (amdgpu_virt_ras_check_bad_page(adev, retired_page))
+ continue;
- amdgpu_virt_ras_reserve_bps(adev);
- }
+ if (!amdgpu_virt_ras_add_bps(adev, &bp, 1))
+ break;
+
+ amdgpu_virt_ras_reserve_bps(adev);
}
}