drm/amdgpu: load RAS bad page from PMFW in page retirement
author    Tao Zhou <tao.zhou1@amd.com>
          Fri, 25 Jul 2025 02:47:35 +0000 (10:47 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
          Wed, 12 Nov 2025 02:53:26 +0000 (21:53 -0500)
In the legacy path, bad pages are queried from MCA registers; switch to
getting them from the PMFW when the PMFW manages the eeprom data.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
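
For orientation, a condensed sketch of the new control flow in
amdgpu_umc_handle_bad_pages() (abridged from the diff below; the legacy
query branches are elided, and this is a reading of the patch rather
than the literal hunk):

	if (!amdgpu_ras_smu_eeprom_supported(adev)) {
		/* legacy: query new bad pages from MCA registers or
		 * the cached ECC info, as before
		 */
	} else if (!amdgpu_ras_eeprom_update_record_num(control)) {
		/* PMFW has already appended the new records to the
		 * eeprom table; read back only the delta since the
		 * last sync
		 */
		err_data->err_addr_cnt = err_data->de_count =
			control->ras_num_recs - control->ras_num_recs_old;
		amdgpu_ras_eeprom_read_idx(control, err_data->err_addr,
			control->ras_num_recs_old, err_data->de_count);
	}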
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

index 055a9bbabbdbb01afcdb6af40f5a9eca2732eeae..36a5393d6b74eb73d1f0f8c12e1704eff04cb5d7 100644
@@ -3300,7 +3300,13 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
        mutex_lock(&con->recovery_lock);
        control = &con->eeprom_control;
        data = con->eh_data;
-       unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
+       if (amdgpu_ras_smu_eeprom_supported(adev))
+               unit_num = control->ras_num_recs -
+                       control->ras_num_recs_old;
+       else
+               unit_num = data->count / adev->umc.retire_unit -
+                       control->ras_num_recs;
+
        save_count = con->bad_page_num - control->ras_num_bad_pages;
        mutex_unlock(&con->recovery_lock);
 
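To make the bookkeeping change concrete (a hypothetical worked example, not
taken from the patch): with adev->umc.retire_unit = 4, data->count = 16
retired pages cached in eh_data, and control->ras_num_recs = 2 units already
recorded, the legacy path computes unit_num = 16 / 4 - 2 = 2 new units to
save. On the PMFW-managed path the firmware already accounts in eeprom
records, so the same delta falls out of the counters directly: ras_num_recs
= 4 against ras_num_recs_old = 2 likewise gives unit_num = 2.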
index 2e039fb778ea8b288a63aa9247a171151e2e7872..3eb252de343bb74185acbc15126de6ccbfc0848c 100644
@@ -96,67 +96,96 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_eeprom_control *control = &con->eeprom_control;
        unsigned int error_query_mode;
        int ret = 0;
        unsigned long err_count;
 
        amdgpu_ras_get_error_query_mode(adev, &error_query_mode);
 
+       err_data->err_addr =
+               kcalloc(adev->umc.max_ras_err_cnt_per_query,
+                       sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+       /* still call query_ras_error_address to clear error status
+        * even NOMEM error is encountered
+        */
+       if (!err_data->err_addr)
+               dev_warn(adev->dev,
+                       "Failed to alloc memory for umc error address record!\n");
+       else
+               err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
+
        mutex_lock(&con->page_retirement_lock);
-       ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
-       if (ret == -EOPNOTSUPP &&
-           error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
-               if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
-                   adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
-                   adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
-
-               if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
-                   adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
-                   adev->umc.max_ras_err_cnt_per_query) {
-                       err_data->err_addr =
-                               kcalloc(adev->umc.max_ras_err_cnt_per_query,
-                                       sizeof(struct eeprom_table_record), GFP_KERNEL);
-
-                       /* still call query_ras_error_address to clear error status
-                        * even NOMEM error is encountered
-                        */
-                       if(!err_data->err_addr)
-                               dev_warn(adev->dev, "Failed to alloc memory for "
-                                               "umc error address record!\n");
-                       else
-                               err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
-
-                       /* umc query_ras_error_address is also responsible for clearing
-                        * error status
-                        */
-                       adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
+       if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+               ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
+               if (ret == -EOPNOTSUPP &&
+                   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+                       if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+                           adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
+                               adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev,
+                                                               ras_error_status);
+
+                       if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+                           adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
+                           adev->umc.max_ras_err_cnt_per_query) {
+                               err_data->err_addr =
+                                       kcalloc(adev->umc.max_ras_err_cnt_per_query,
+                                               sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+                               /* still call query_ras_error_address to clear error status
+                                * even NOMEM error is encountered
+                                */
+                               if (!err_data->err_addr)
+                                       dev_warn(adev->dev,
+                                               "Failed to alloc memory for umc error address record!\n");
+                               else
+                                       err_data->err_addr_len =
+                                               adev->umc.max_ras_err_cnt_per_query;
+
+                               /* umc query_ras_error_address is also responsible for clearing
+                                * error status
+                                */
+                               adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev,
+                                                               ras_error_status);
+                       }
+               } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+                   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
+                       if (adev->umc.ras &&
+                           adev->umc.ras->ecc_info_query_ras_error_count)
+                               adev->umc.ras->ecc_info_query_ras_error_count(adev,
+                                                               ras_error_status);
+
+                       if (adev->umc.ras &&
+                           adev->umc.ras->ecc_info_query_ras_error_address &&
+                           adev->umc.max_ras_err_cnt_per_query) {
+                               err_data->err_addr =
+                                       kcalloc(adev->umc.max_ras_err_cnt_per_query,
+                                               sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+                               /* still call query_ras_error_address to clear error status
+                                * even NOMEM error is encountered
+                                */
+                               if (!err_data->err_addr)
+                                       dev_warn(adev->dev,
+                                               "Failed to alloc memory for umc error address record!\n");
+                               else
+                                       err_data->err_addr_len =
+                                               adev->umc.max_ras_err_cnt_per_query;
+
+                               /* umc query_ras_error_address is also responsible for clearing
+                                * error status
+                                */
+                               adev->umc.ras->ecc_info_query_ras_error_address(adev,
+                                                               ras_error_status);
+                       }
                }
-       } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
-           (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
-               if (adev->umc.ras &&
-                   adev->umc.ras->ecc_info_query_ras_error_count)
-                   adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
-
-               if (adev->umc.ras &&
-                   adev->umc.ras->ecc_info_query_ras_error_address &&
-                   adev->umc.max_ras_err_cnt_per_query) {
-                       err_data->err_addr =
-                               kcalloc(adev->umc.max_ras_err_cnt_per_query,
-                                       sizeof(struct eeprom_table_record), GFP_KERNEL);
-
-                       /* still call query_ras_error_address to clear error status
-                        * even NOMEM error is encountered
-                        */
-                       if(!err_data->err_addr)
-                               dev_warn(adev->dev, "Failed to alloc memory for "
-                                               "umc error address record!\n");
-                       else
-                               err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
-
-                       /* umc query_ras_error_address is also responsible for clearing
-                        * error status
-                        */
-                       adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
+       } else {
+               if (!amdgpu_ras_eeprom_update_record_num(control)) {
+                       err_data->err_addr_cnt = err_data->de_count =
+                               control->ras_num_recs - control->ras_num_recs_old;
+                       amdgpu_ras_eeprom_read_idx(control, err_data->err_addr,
+                               control->ras_num_recs_old, err_data->de_count);
                }
        }
 
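Note that the err_addr buffer is now allocated up front, before the branch:
the new PMFW path performs no allocation of its own, so
amdgpu_ras_eeprom_read_idx() reads the record delta into the buffer sized by
adev->umc.max_ras_err_cnt_per_query, which is implicitly assumed to bound
ras_num_recs - ras_num_recs_old.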
@@ -166,7 +195,7 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
                if ((amdgpu_bad_page_threshold != 0) &&
                        err_data->err_addr_cnt) {
                        amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
-                                               err_data->err_addr_cnt, false);
+                               err_data->err_addr_cnt, amdgpu_ras_smu_eeprom_supported(adev));
                        amdgpu_ras_save_bad_pages(adev, &err_count);
 
                        amdgpu_dpm_send_hbm_bad_pages_num(adev,
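
The trailing Boolean passed to amdgpu_ras_add_bad_pages() also flips on the
PMFW path: records read back from the PMFW-managed eeprom table are flagged
as originating from the table rather than from a fresh hardware query (in
current kernel sources this parameter is named from_rom, though that name
does not appear in the hunk and is an assumption here).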