git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: Send applicable RMA CPERs at end of RAS init
author Kent Russell <kent.russell@amd.com>
Tue, 3 Feb 2026 14:48:23 +0000 (09:48 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 5 Feb 2026 22:28:34 +0000 (17:28 -0500)
Firmware and monitoring tools may not be ready to receive a CPER when we
read the bad pages, so send the CPERs at the end of RAS initialization
to ensure that the FW is ready to receive and process the CPER. This
removes the previous CPER submission that was added during bad page
load, and sends both in-band and out-of-band at the same time.

Signed-off-by: Kent Russell <kent.russell@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index b28fcf932f7ea250ac4635c44a5a55f19476991a..856b1bf83533d55535d039b0f4203f681ed78162 100644 (file)
@@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
                        amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
        }
 
+       amdgpu_ras_check_bad_page_status(adev);
+
        return 0;
 }
 
index 469d04a39d7d0e77a6ba2a5d685da74135c30f1d..2c5d7f87e593ed233e4bec5e8afefb1752d92855 100644 (file)
@@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                        dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
                                        control->ras_num_bad_pages,
                                        ras->bad_page_cnt_threshold);
-               if (amdgpu_bad_page_threshold != 0 &&
-                       control->ras_num_bad_pages >= ras->bad_page_cnt_threshold)
-                       amdgpu_dpm_send_rma_reason(adev);
-
        } else if (hdr->header == RAS_TABLE_HDR_BAD &&
                   amdgpu_bad_page_threshold != 0) {
                if (hdr->version >= RAS_TABLE_VER_V2_1) {
@@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
                                                                           result);
        return -EOPNOTSUPP;
 }
+
+/**
+ * amdgpu_ras_check_bad_page_status - send RMA CPERs once the bad page
+ * count has reached the threshold
+ * @adev: amdgpu device whose RAS EEPROM control state is inspected
+ *
+ * Does nothing when no RAS context is available for @adev or when
+ * amdgpu_bad_page_threshold is 0. Otherwise, if the recorded number of bad
+ * pages is at or above the RAS bad page count threshold, an out-of-band RMA
+ * CPER is requested through amdgpu_dpm_send_rma_reason() and, when CPER is
+ * enabled and uniras is not, an in-band one through
+ * amdgpu_cper_generate_bp_threshold_record().
+ *
+ * Failures are only logged; nothing is returned to the caller.
+ */
+void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       /* control is non-NULL only when ras is, so ras is safe to use below. */
+       struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
+
+       if (!control || amdgpu_bad_page_threshold == 0)
+               return;
+
+       if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
+               /* Out-of-band path: a non-zero return is treated as failure. */
+               if (amdgpu_dpm_send_rma_reason(adev))
+                       dev_warn(adev->dev, "Unable to send out-of-band RMA CPER");
+               else
+                       dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
+
+               /*
+                * In-band path: attempted regardless of the out-of-band
+                * result, but only with CPER enabled and uniras disabled.
+                */
+               if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
+                       if (amdgpu_cper_generate_bp_threshold_record(adev))
+                               dev_warn(adev->dev, "Unable to send in-band RMA CPER");
+                       else
+                               dev_dbg(adev->dev, "Sent in-band RMA CPER");
+               }
+       }
+}
index 2e5d63957e714336f4a1c10452843c3550a2d79f..a62114800a92a7c33db98c55cf15b62ce630a5ec 100644 (file)
@@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
 
 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
 
+void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
+
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;