git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: Extend bus status check to more cases
author Lijo Lazar <lijo.lazar@amd.com>
Fri, 13 Jun 2025 11:00:30 +0000 (16:30 +0530)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 18 Jun 2025 16:19:21 +0000 (12:19 -0400)
In case of unexpected errors, check if device is alive on the bus.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c

index 5e2f086d2c99e7db41245fb3306af0b8a428e05b..42278e9a2d75a844e7f0e3659039d9fec831a529 100644 (file)
@@ -1762,4 +1762,19 @@ extern const struct attribute_group amdgpu_flash_attr_group;
 
 void amdgpu_set_init_level(struct amdgpu_device *adev,
                           enum amdgpu_init_lvl_id lvl);
+/* Probe PCI config space to check the device is still on the bus; 0 if so, else -ENODEV. */
+static inline int amdgpu_device_bus_status_check(struct amdgpu_device *adev)
+{
+       u32 status; /* value read back from the PCI_COMMAND config register */
+       int r; /* return code of the config-space read; non-zero is treated as failure */
+
+       r = pci_read_config_dword(adev->pdev, PCI_COMMAND, &status);
+       if (r || PCI_POSSIBLE_ERROR(status)) { /* read failed, or value flagged as a possible error */
+               dev_err(adev->dev, "device lost from bus!");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
 #endif
index dbe36fdb0a3ccf28f23b0dab2b0561c2f6bf5ed9..c2d10be53d3e8dbad0817f549c922a51aaaad365 100644 (file)
@@ -6071,14 +6071,9 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
 {
        struct amdgpu_device *tmp_adev;
        int ret = 0;
-       u32 status;
 
        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-               pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
-               if (PCI_POSSIBLE_ERROR(status)) {
-                       dev_err(tmp_adev->dev, "device lost from bus!");
-                       ret = -ENODEV;
-               }
+               ret |= amdgpu_device_bus_status_check(tmp_adev);
        }
 
        return ret;
index 1c083304ae77676aaa825609317bb66cca7de7fb..41acc1ce0b3ed00bca1b4d0192bd9f22b6edfee9 100644 (file)
@@ -353,11 +353,14 @@ static int aqua_vanjaram_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr)
 
        if (adev->nbio.funcs->get_compute_partition_mode) {
                mode = adev->nbio.funcs->get_compute_partition_mode(adev);
-               if (mode != derv_mode)
+               if (mode != derv_mode) {
                        dev_warn(
                                adev->dev,
                                "Mismatch in compute partition mode - reported : %d derived : %d",
                                mode, derv_mode);
+                       if (derv_mode == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
+                               amdgpu_device_bus_status_check(adev);
+               }
        }
 
        return mode;
index 7eaf58fd7f9ae509280b9b2c087b1d938190b0e1..59f9abd0f7b8cefa0c6bea45cd38decd425bf067 100644 (file)
@@ -86,6 +86,7 @@ static void smu_cmn_read_arg(struct smu_context *smu,
 #define SMU_RESP_BUSY_OTHER     0xFC
 #define SMU_RESP_DEBUG_END      0xFB
 
+#define SMU_RESP_UNEXP (~0U)
 /**
  * __smu_cmn_poll_stat -- poll for a status from the SMU
  * @smu: a pointer to SMU context
@@ -171,6 +172,15 @@ static void __smu_cmn_reg_print_error(struct smu_context *smu,
                dev_err_ratelimited(adev->dev,
                                    "SMU: I'm debugging!");
                break;
+       case SMU_RESP_UNEXP:
+               if (amdgpu_device_bus_status_check(smu->adev)) {
+                       /* print error immediately if device is off the bus */
+                       dev_err(adev->dev,
+                               "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
+                               reg_c2pmsg_90, msg_index, param, message);
+                       break;
+               }
+               fallthrough;
        default:
                dev_err_ratelimited(adev->dev,
                                    "SMU: response:0x%08X for index:%d param:0x%08X message:%s?",