git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amdgpu: Fix error handling in slot reset
author: Lijo Lazar <lijo.lazar@amd.com>
Tue, 24 Feb 2026 04:48:51 +0000 (10:18 +0530)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 25 Feb 2026 22:57:55 +0000 (17:57 -0500)
If the device has not recovered after slot reset is called, the code goes
to the out label for error handling. There it could make a decision based
on an uninitialized hive pointer, which could result in accessing an
uninitialized list.

Initialize the list and hive properly so that the error path handles this
situation and also releases the reset domain lock, which is acquired
during the error_detected callback.

Fixes: 732c6cefc1ec ("drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset")
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Ce Sun <cesun102@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit bb71362182e59caa227e4192da5a612b09349696)

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index d9789e0b52015b2791a41181fb5512631cb2ff1a..3e19b51a276380588951bee1a66c12d05b2393e0 100644 (file)
@@ -7059,6 +7059,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
        dev_info(adev->dev, "PCI error: slot reset callback!!\n");
 
        memset(&reset_context, 0, sizeof(reset_context));
+       INIT_LIST_HEAD(&device_list);
+       hive = amdgpu_get_xgmi_hive(adev);
+       if (hive) {
+               mutex_lock(&hive->hive_lock);
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+                       list_add_tail(&tmp_adev->reset_list, &device_list);
+       } else {
+               list_add_tail(&adev->reset_list, &device_list);
+       }
 
        if (adev->pcie_reset_ctx.swus)
                link_dev = adev->pcie_reset_ctx.swus;
@@ -7099,19 +7108,13 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
        reset_context.reset_req_dev = adev;
        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
        set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
-       INIT_LIST_HEAD(&device_list);
 
-       hive = amdgpu_get_xgmi_hive(adev);
        if (hive) {
-               mutex_lock(&hive->hive_lock);
                reset_context.hive = hive;
-               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
                        tmp_adev->pcie_reset_ctx.in_link_reset = true;
-                       list_add_tail(&tmp_adev->reset_list, &device_list);
-               }
        } else {
                set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
-               list_add_tail(&adev->reset_list, &device_list);
        }
 
        r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);