]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: refine eeprom data check
authorganglxie <ganglxie@amd.com>
Wed, 2 Jul 2025 08:56:22 +0000 (16:56 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 15 Jul 2025 18:07:53 +0000 (14:07 -0400)
Add an EEPROM data checksum check before driver unload. Reset the EEPROM
and save the correct data to the EEPROM when the check fails.

Signed-off-by: ganglxie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index 1c54b2e5a225d76aa791ff5cf240e9ffbbc7e0cc..648a829559e197506173d2ead9b9a35255456709 100644 (file)
@@ -2512,6 +2512,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
 
+       amdgpu_ras_eeprom_check_and_recover(adev);
        amdgpu_xcp_dev_unplug(adev);
        amdgpu_gmc_prepare_nps_mode_change(adev);
        drm_dev_unplug(dev);
index 54838746f97dd317d889a1a683596c33d0edc2a9..9bda9ad13f882e50b9e55973d116412bfca8bfd7 100644 (file)
@@ -1531,3 +1531,31 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
 
        return res < 0 ? res : 0;
 }
+
+void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) /* verify the RAS EEPROM table checksum; on failure try to rebuild the table */
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_eeprom_control *control;
+       int res;
+
+       if (!__is_ras_eeprom_supported(adev) || !ras) /* nothing to check without EEPROM support or a RAS context */
+               return;
+       control = &ras->eeprom_control;
+       if (!control->is_eeprom_valid) /* skip a table that is not currently marked valid */
+               return;
+       res = __verify_ras_table_checksum(control);
+       if (res) { /* non-zero: bad checksum or an error, as reported in the warning below */
+               dev_warn(adev->dev,
+                       "RAS table incorrect checksum or error:%d, try to recover\n",
+                       res);
+               if (!amdgpu_ras_eeprom_reset_table(control)) /* step 1: reset the table; 0 is treated as success */
+                       if (!amdgpu_ras_save_bad_pages(adev, NULL)) /* step 2: save the bad pages again */
+                               if (!__verify_ras_table_checksum(control)) { /* step 3: the checksum must now verify */
+                                       dev_info(adev->dev, "RAS table recovery succeed\n");
+                                       return;
+                               }
+               dev_err(adev->dev, "RAS table recovery failed\n"); /* reached when any recovery step returned non-zero */
+               control->is_eeprom_valid = false; /* flag the table invalid after a failed recovery */
+       }
+       return;
+}
\ No newline at end of file
index 35c69ac3dbeb5e836c22b326401442cc3b361dac..ebfca4cb5688b69a000c103707f155a33ef39f46 100644 (file)
@@ -161,6 +161,8 @@ void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
 
 int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control);
 
+void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev); /* verify the RAS EEPROM table checksum and attempt recovery on failure */
+
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;