drm/amd/ras: add check func for pmfw eeprom

author Gangliang Xie <ganglxie@amd.com>

Mon, 15 Dec 2025 07:54:35 +0000 (15:54 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Wed, 4 Mar 2026 16:43:43 +0000 (11:43 -0500)
author Gangliang Xie <ganglxie@amd.com>
Mon, 15 Dec 2025 07:54:35 +0000 (15:54 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 4 Mar 2026 16:43:43 +0000 (11:43 -0500)
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c

index 45ed8c3b5563bf47776f67a5bf405603e22a4cd7..7d728e52360421e9c7e8d4681811adc5e598b467 100644 (file)
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -137,7 +137,8 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
                 break;
         case RAS_EVENT_ID__DEVICE_RMA:
                 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
-               ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
+               if (!ras_fw_eeprom_supported(ras_core))
+                       ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
                 break;
         case RAS_EVENT_ID__RESET_GPU:
                 ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h

index 04f9e09884aa94f4c6400d98efb4ee28d77e0b7a..6449d7b8627d6a18cc46aabfc0257cfd375bba92 100644 (file)
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -50,6 +50,13 @@
  #define GPU_RESET_CAUSE_FATAL   (RAS_CORE_RESET_GPU | 0x0002)
  #define GPU_RESET_CAUSE_RMA     (RAS_CORE_RESET_GPU | 0x0004)
  
+enum ras_gpu_health_status {
+       RAS_GPU_HEALTH_NONE = 0, /* fw eeprom path: record_threshold_config is 0 */
+       RAS_GPU_HEALTH_USABLE = 1, /* fw eeprom path: config set, is_rma not set */
+       RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2, /* fw eeprom path: is_rma set */
+       RAS_GPU_IN_BAD_STATUS = 3, /* NOTE(review): producer not visible here */
+};
+
  enum ras_core_fw_feature_flags {
         RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
  };
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c

index 05c7575b76e8e609a9f3e0a7022867527747452c..36d9bb8ca9e45ea6a8d51fa2f374bf51b621bc0e 100644 (file)
--- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -403,7 +403,10 @@ int ras_core_hw_init(struct ras_core_context *ras_core)
                 goto init_err6;
         }
  
-       ret = ras_eeprom_check_storage_status(ras_core);
+       if (ras_fw_eeprom_supported(ras_core))
+               ret = ras_fw_eeprom_check_storage_status(ras_core);
+       else
+               ret = ras_eeprom_check_storage_status(ras_core);
         if (ret)
                 goto init_err6;
  
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h

index 2abe566c18b670b3007fd1efd6c1895eb6f7a8db..f2c001ef64e1f8991143759c440c615ee4a22b18 100644 (file)
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
@@ -57,13 +57,6 @@ do { \
         (RECORD)->retired_row_pfn = tmp; \
  } while (0)
  
-enum ras_gpu_health_status {
-       RAS_GPU_HEALTH_NONE = 0,
-       RAS_GPU_HEALTH_USABLE = 1,
-       RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
-       RAS_GPU_IN_BAD_STATUS = 3,
-};
-
  enum ras_eeprom_err_type {
         RAS_EEPROM_ERR_NA,
         RAS_EEPROM_ERR_RECOVERABLE,
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c

index 4362b8a0f3c46e5d9c9b592f790f95e78d58673a..70bbf1334c4f7550f839c94c81772bd7aaab8680 100644 (file)
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
@@ -453,3 +453,54 @@ int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core)
  
         return 0;
  }
+
+/*
+ * Report how the UMC bad page count compares with the record threshold.
+ * When over threshold (and threshold config is non-zero), log the excess and
+ * set ras_core->is_rma unless a NONSTOP policy is configured. Always returns 0.
+ */
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core)
+{
+       struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+       int bad_page_count = ras_umc_get_badpage_count(ras_core);
+
+       if ((control->record_threshold_count < bad_page_count) &&
+           (control->record_threshold_config != 0)) {
+               RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d\n",
+                               bad_page_count, control->record_threshold_count);
+               if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
+                       (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
+                       RAS_DEV_WARN(ras_core->dev,
+                       "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
+               } else {
+                       ras_core->is_rma = true;
+                       RAS_DEV_ERR(ras_core->dev,
+                       "User defined threshold is set, runtime service will be halt when threshold is reached\n");
+               }
+               return 0;
+       }
+
+       RAS_DEV_INFO(ras_core->dev, "Found existing EEPROM table with %d records\n",
+                       bad_page_count);
+       /* Warn if we are at 90% of the threshold or above */
+       if (10 * bad_page_count >= 9 * control->record_threshold_count)
+               RAS_DEV_WARN(ras_core->dev,
+                       "RAS records:%d exceeds 90%% of threshold:%d\n",
+                       bad_page_count, control->record_threshold_count);
+
+       return 0;
+}
+
+/* Map threshold config and the is_rma flag to a GPU health status. */
+enum ras_gpu_health_status
+       ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core)
+{
+       const struct ras_fw_eeprom_control *fw_ctl = &ras_core->ras_fw_eeprom;
+
+       /* A zero threshold config yields no health verdict. */
+       if (fw_ctl->record_threshold_config == 0)
+               return RAS_GPU_HEALTH_NONE;
+
+       return ras_core->is_rma ? RAS_GPU_RETIRED__ECC_REACH_THRESHOLD :
+                                 RAS_GPU_HEALTH_USABLE;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h

index cb92e6a63cf54def7e011c680ed738cbdd9f00ef..75d8b95c6923d558a02903b0ca9109d659b66078 100644 (file)
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
@@ -79,5 +79,8 @@ int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
                                 struct ras_bank_ecc *ras_ecc);
  int ras_fw_eeprom_hw_init(struct ras_core_context *ras_core);
  int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core);
+int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core);
+enum ras_gpu_health_status
+       ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core);
  
  #endif
author	Gangliang Xie <ganglxie@amd.com>
	Mon, 15 Dec 2025 07:54:35 +0000 (15:54 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 4 Mar 2026 16:43:43 +0000 (11:43 -0500)
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/ras/rascore/ras.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/ras/rascore/ras_core.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h		patch \| blob \| blame \| history