From 7fb41ab3c94828ad48e1a6d2237e8a7e682c74b9 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 6 Nov 2025 16:26:56 +0800 Subject: [PATCH] drm/amdgpu: optimize timeout implementation in ras_eeprom_update_record_num The busy status returned by ras_eeprom_update_record_num may not be an error; increase the timeout to exclude false busy status. Also add more comments to make the code readable. v2: define a macro for the timeout value. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 99aa1908833d3..64dd7a81bff5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -124,6 +124,8 @@ RAS_TABLE_V2_1_INFO_SIZE) \ / RAS_TABLE_RECORD_SIZE) +#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */ + /* Given a zero-based index of an EEPROM RAS record, yields the EEPROM * offset off of RAS_TABLE_START. That is, this is something you can * add to control->i2c_address, and then tell I2C layer to read @@ -874,7 +876,7 @@ Out: int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); - int ret, timeout = 1000; + int ret, retry = 20; if (!amdgpu_ras_smu_eeprom_supported(adev)) return 0; @@ -882,17 +884,23 @@ int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *contro control->ras_num_recs_old = control->ras_num_recs; do { + /* 1000ms timeout is long enough, smu_get_badpage_count won't + * return -EBUSY before timeout. 
+ */ ret = amdgpu_ras_smu_get_badpage_count(adev, - &(control->ras_num_recs), 12); + &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS); if (!ret && (control->ras_num_recs_old == control->ras_num_recs)) { - /* record number update in PMFW needs some time */ + /* record number update in PMFW needs some time, + * smu_get_badpage_count may return immediately without + * count update, sleep for a while and retry again. + */ msleep(50); - timeout -= 50; + retry--; } else { break; } - } while (timeout); + } while (retry); /* no update of record number is not a real failure, * don't print warning here -- 2.47.3