git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: re-add the bad job to the pending list for ring resets
authorAlex Deucher <alexander.deucher@amd.com>
Fri, 30 Jan 2026 16:19:49 +0000 (11:19 -0500)
committerAlex Deucher <alexander.deucher@amd.com>
Thu, 5 Feb 2026 22:25:44 +0000 (17:25 -0500)
Returning DRM_GPU_SCHED_STAT_NO_HANG causes the scheduler
to add the bad job back to the pending list.  We've already
set the errors on the fence and killed the bad job at this point
so it's the correct behavior.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index aaf5477fcd7ac6a0eb08174fbb9a906c8e56967b..2c82d9e8c0be1e490e283b5582ff5579b36f3d31 100644 (file)
@@ -92,6 +92,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
        struct drm_wedge_task_info *info = NULL;
        struct amdgpu_task_info *ti = NULL;
        struct amdgpu_device *adev = ring->adev;
+       enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_RESET;
        int idx, r;
 
        if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
@@ -135,13 +136,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
            ring->funcs->reset) {
                dev_err(adev->dev, "Starting %s ring reset\n",
                        s_job->sched->name);
+               /* Stop the scheduler to prevent anybody else from touching the ring buffer. */
+               drm_sched_wqueue_stop(&ring->sched);
                r = amdgpu_ring_reset(ring, job->vmid, job->hw_fence);
                if (!r) {
+                       /* Start the scheduler again */
+                       drm_sched_wqueue_start(&ring->sched);
                        atomic_inc(&ring->adev->gpu_reset_counter);
                        dev_err(adev->dev, "Ring %s reset succeeded\n",
                                ring->sched.name);
                        drm_dev_wedged_event(adev_to_drm(adev),
                                             DRM_WEDGE_RECOVERY_NONE, info);
+                       /* This is needed to add the job back to the pending list */
+                       status = DRM_GPU_SCHED_STAT_NO_HANG;
                        goto exit;
                }
                dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
@@ -177,7 +184,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 exit:
        amdgpu_vm_put_task_info(ti);
        drm_dev_exit(idx);
-       return DRM_GPU_SCHED_STAT_RESET;
+       return status;
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
index b82357c657237a49bdb963629a2638de4b533384..129ad51386535038117403cf511f41ef3a0d9d8f 100644 (file)
@@ -868,8 +868,6 @@ bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
 void amdgpu_ring_reset_helper_begin(struct amdgpu_ring *ring,
                                    struct amdgpu_fence *guilty_fence)
 {
-       /* Stop the scheduler to prevent anybody else from touching the ring buffer. */
-       drm_sched_wqueue_stop(&ring->sched);
        /* back up the non-guilty commands */
        amdgpu_ring_backup_unprocessed_commands(ring, guilty_fence);
 }
@@ -895,8 +893,6 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
                        amdgpu_ring_write(ring, ring->ring_backup[i]);
                amdgpu_ring_commit(ring);
        }
-       /* Start the scheduler again */
-       drm_sched_wqueue_start(&ring->sched);
        return 0;
 }