]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amdgpu: Report individual reset error
authorLijo Lazar <lijo.lazar@amd.com>
Mon, 6 Oct 2025 05:09:03 +0000 (10:39 +0530)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 13 Nov 2025 20:37:39 +0000 (15:37 -0500)
[ Upstream commit 2e97663760e5fb7ee14f399c68e57b894f01e505 ]

If reinitialization of one of the GPUs fails after reset, it logs
failure on all subsequent GPUs eventhough they have resumed
successfully.

A sample log where only device at 0000:95:00.0 had a failure -

amdgpu 0000:15:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:65:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:75:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:85:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:95:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:e5:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:f5:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:05:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:15:00.0: amdgpu: GPU reset end with ret = -5

To avoid confusion, report the error for each device
separately and return the first error as the overall result.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 1115af343e01351fdd41deaa9f0f47826990f2c8..ddd0e7ab82be7eb2db0555890188eeef96e3e46e 100644 (file)
@@ -6337,23 +6337,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
                if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
                        drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 
-               if (tmp_adev->asic_reset_res)
-                       r = tmp_adev->asic_reset_res;
-
-               tmp_adev->asic_reset_res = 0;
-
-               if (r) {
+               if (tmp_adev->asic_reset_res) {
                        /* bad news, how to tell it to userspace ?
                         * for ras error, we should report GPU bad status instead of
                         * reset failure
                         */
                        if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
                            !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
-                               dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
-                                       atomic_read(&tmp_adev->gpu_reset_counter));
-                       amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+                               dev_info(
+                                       tmp_adev->dev,
+                                       "GPU reset(%d) failed with error %d \n",
+                                       atomic_read(
+                                               &tmp_adev->gpu_reset_counter),
+                                       tmp_adev->asic_reset_res);
+                       amdgpu_vf_error_put(tmp_adev,
+                                           AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
+                                           tmp_adev->asic_reset_res);
+                       if (!r)
+                               r = tmp_adev->asic_reset_res;
+                       tmp_adev->asic_reset_res = 0;
                } else {
-                       dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+                       dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
+                                atomic_read(&tmp_adev->gpu_reset_counter));
                        if (amdgpu_acpi_smart_shift_update(tmp_adev,
                                                           AMDGPU_SS_DEV_D0))
                                dev_warn(tmp_adev->dev,