git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: set an error on all fences from a bad context
author Alex Deucher <alexander.deucher@amd.com>
Wed, 3 Sep 2025 17:48:23 +0000 (13:48 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 13 Oct 2025 18:14:15 +0000 (14:14 -0400)
When we back up ring contents to re-emit after a queue reset,
we don't back up ring contents from the bad context.  When
we signal the fences, we should also set an error on all
fences from that bad context.

v2: misc cleanups
v3: add locking for fence error, fix comment (Christian)
v4: fix wrap around, locking (Christian)

Fixes: 77cc0da39c7c ("drm/amdgpu: track ring state associated with a fence")
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index e270df30c27907c301e2512f958ab5c0c43c4146..18a7829122d246c8be936abe9d4c930d2e81f1b0 100644 (file)
@@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
  * @fence: fence of the ring to signal
  *
  */
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 {
-       dma_fence_set_error(&fence->base, -ETIME);
-       amdgpu_fence_write(fence->ring, fence->seq);
-       amdgpu_fence_process(fence->ring);
+       struct dma_fence *unprocessed;
+       struct dma_fence __rcu **ptr;
+       struct amdgpu_fence *fence;
+       struct amdgpu_ring *ring = af->ring;
+       unsigned long flags;
+       u32 seq, last_seq;
+
+       last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+       seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
+
+       /* mark all fences from the guilty context with an error */
+       spin_lock_irqsave(&ring->fence_drv.lock, flags);
+       do {
+               last_seq++;
+               last_seq &= ring->fence_drv.num_fences_mask;
+
+               ptr = &ring->fence_drv.fences[last_seq];
+               rcu_read_lock();
+               unprocessed = rcu_dereference(*ptr);
+
+               if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
+                       fence = container_of(unprocessed, struct amdgpu_fence, base);
+
+                       if (fence == af)
+                               dma_fence_set_error(&fence->base, -ETIME);
+                       else if (fence->context == af->context)
+                               dma_fence_set_error(&fence->base, -ECANCELED);
+               }
+               rcu_read_unlock();
+       } while (last_seq != seq);
+       spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
+       /* signal the guilty fence */
+       amdgpu_fence_write(ring, af->seq);
+       amdgpu_fence_process(ring);
 }
 
 void amdgpu_fence_save_wptr(struct dma_fence *fence)
index 8f6ce948c6841d751ad82712dfe0976ed2ef1713..5ec5c3ff22bb07768bf9536f03a247b17565eb0b 100644 (file)
@@ -811,7 +811,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
        if (r)
                return r;
 
-       /* signal the fence of the bad job */
+       /* signal the guilty fence and set an error on all fences from the context */
        if (guilty_fence)
                amdgpu_fence_driver_guilty_force_completion(guilty_fence);
        /* Re-emit the non-guilty commands */
index b6b6491797761546aa0b20eac87cc8df7962d3e4..4b46e3c26ff39f040c2527887721e168aad12d7c 100644 (file)
@@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
 void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
 void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
 void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
 void amdgpu_fence_save_wptr(struct dma_fence *fence);
 
 int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);