From: Gangliang Xie
Date: Mon, 15 Dec 2025 07:54:35 +0000 (+0800)
Subject: drm/amd/ras: add check func for pmfw eeprom
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e82f9aac39885c220168e0ac11ff866e1aebbca1;p=thirdparty%2Flinux.git

drm/amd/ras: add check func for pmfw eeprom

add check func for pmfw eeprom

Review fixups on top of the original change: terminate the "exceed
threshold" error message with a newline, print the signed bad page count
with %d in the 90% warning, and add kernel-doc for the two new functions.

Signed-off-by: Gangliang Xie
Reviewed-by: Tao Zhou
Signed-off-by: Alex Deucher
---

diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
index 45ed8c3b5563b..7d728e5236042 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -137,7 +137,8 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
 		break;
 	case RAS_EVENT_ID__DEVICE_RMA:
 		ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
-		ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
+		if (!ras_fw_eeprom_supported(ras_core))
+			ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
 		break;
 	case RAS_EVENT_ID__RESET_GPU:
 		ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h
index 04f9e09884aa9..6449d7b8627d6 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -50,6 +50,13 @@
 #define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002)
 #define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004)
 
+enum ras_gpu_health_status {
+	RAS_GPU_HEALTH_NONE = 0,
+	RAS_GPU_HEALTH_USABLE = 1,
+	RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
+	RAS_GPU_IN_BAD_STATUS = 3,
+};
+
 enum ras_core_fw_feature_flags {
 	RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
 };
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
index 05c7575b76e8e..36d9bb8ca9e45 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -403,7 +403,10 @@ int ras_core_hw_init(struct ras_core_context *ras_core)
 		goto init_err6;
 	}
 
-	ret = ras_eeprom_check_storage_status(ras_core);
+	if (ras_fw_eeprom_supported(ras_core))
+		ret = ras_fw_eeprom_check_storage_status(ras_core);
+	else
+		ret = ras_eeprom_check_storage_status(ras_core);
 	if (ret)
 		goto init_err6;
 
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
index 2abe566c18b67..f2c001ef64e1f 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
@@ -57,13 +57,6 @@ do { \
 	(RECORD)->retired_row_pfn = tmp; \
 } while (0)
 
-enum ras_gpu_health_status {
-	RAS_GPU_HEALTH_NONE = 0,
-	RAS_GPU_HEALTH_USABLE = 1,
-	RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
-	RAS_GPU_IN_BAD_STATUS = 3,
-};
-
 enum ras_eeprom_err_type {
 	RAS_EEPROM_ERR_NA,
 	RAS_EEPROM_ERR_RECOVERABLE,
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
index 4362b8a0f3c46..70bbf1334c4f7 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
@@ -453,3 +453,72 @@ int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core)
 
 	return 0;
 }
+
+/**
+ * ras_fw_eeprom_check_storage_status - check bad page count against threshold
+ * @ras_core: RAS core context
+ *
+ * When the bad page count is above record_threshold_count and
+ * record_threshold_config is non-zero, log an error and set
+ * ras_core->is_rma unless the config is one of the NONSTOP modes, which
+ * only warn. Otherwise log the record count and warn once it is at 90% of
+ * the threshold or above.
+ *
+ * Return: always 0.
+ */
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core)
+{
+	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+	int bad_page_count;
+
+	bad_page_count = ras_umc_get_badpage_count(ras_core);
+
+	if ((control->record_threshold_count < bad_page_count) &&
+	    (control->record_threshold_config != 0)) {
+		RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d\n",
+			bad_page_count, control->record_threshold_count);
+		if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
+			(control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
+			RAS_DEV_WARN(ras_core->dev,
+			"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
+		} else {
+			ras_core->is_rma = true;
+			RAS_DEV_ERR(ras_core->dev,
+			"User defined threshold is set, runtime service will be halt when threshold is reached\n");
+		}
+		return 0;
+	}
+
+	RAS_DEV_INFO(ras_core->dev,
+		"Found existing EEPROM table with %d records\n",
+		bad_page_count);
+	/* Warn if we are at 90% of the threshold or above. */
+	if (10 * bad_page_count >= 9 * control->record_threshold_count)
+		RAS_DEV_WARN(ras_core->dev,
+			"RAS records:%d exceeds 90%% of threshold:%d\n",
+			bad_page_count,
+			control->record_threshold_count);
+
+	return 0;
+}
+
+/**
+ * ras_fw_eeprom_check_gpu_status - report GPU health from the threshold state
+ * @ras_core: RAS core context
+ *
+ * Return: RAS_GPU_HEALTH_NONE when record_threshold_config is zero,
+ * RAS_GPU_RETIRED__ECC_REACH_THRESHOLD when ras_core->is_rma is set,
+ * RAS_GPU_HEALTH_USABLE otherwise.
+ */
+enum ras_gpu_health_status
+	ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core)
+{
+	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+
+	if (!control->record_threshold_config)
+		return RAS_GPU_HEALTH_NONE;
+
+	if (ras_core->is_rma)
+		return RAS_GPU_RETIRED__ECC_REACH_THRESHOLD;
+
+	return RAS_GPU_HEALTH_USABLE;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
index cb92e6a63cf54..75d8b95c6923d 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
@@ -79,5 +79,8 @@ int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
 	struct ras_bank_ecc *ras_ecc);
 int ras_fw_eeprom_hw_init(struct ras_core_context *ras_core);
 int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core);
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core);
+enum ras_gpu_health_status
+	ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core);
 
 #endif