]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: don't reemit ring contents more than once
author Alex Deucher <alexander.deucher@amd.com>
Thu, 13 Nov 2025 18:24:10 +0000 (13:24 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 5 Jan 2026 21:59:57 +0000 (16:59 -0500)
If we cancel a bad job and reemit the ring contents, and
we get another timeout, cancel everything rather than reemitting.
The wptr markers are only relevant for the original emit.  If
we reemit, the wptr markers are no longer correct.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index c7843e336310906ec63f0601ffb7d5edc2290b8a..4f74a02a9a05c4f09fffef28fc5bd5ab6183f3db 100644 (file)
@@ -709,6 +709,7 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
        struct amdgpu_ring *ring = af->ring;
        unsigned long flags;
        u32 seq, last_seq;
+       bool reemitted = false;
 
        last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
        seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
@@ -726,7 +727,9 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
                if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
                        fence = container_of(unprocessed, struct amdgpu_fence, base);
 
-                       if (fence == af)
+                       if (fence->reemitted > 1)
+                               reemitted = true;
+                       else if (fence == af)
                                dma_fence_set_error(&fence->base, -ETIME);
                        else if (fence->context == af->context)
                                dma_fence_set_error(&fence->base, -ECANCELED);
@@ -734,9 +737,16 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
                rcu_read_unlock();
        } while (last_seq != seq);
        spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
-       /* signal the guilty fence */
-       amdgpu_fence_write(ring, (u32)af->base.seqno);
-       amdgpu_fence_process(ring);
+
+       if (reemitted) {
+               /* if we've already reemitted once then just cancel everything */
+               amdgpu_fence_driver_force_completion(af->ring);
+               af->ring->ring_backup_entries_to_copy = 0;
+       } else {
+               /* signal the guilty fence */
+               amdgpu_fence_write(ring, (u32)af->base.seqno);
+               amdgpu_fence_process(ring);
+       }
 }
 
 void amdgpu_fence_save_wptr(struct amdgpu_fence *af)
@@ -784,10 +794,12 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
                        /* save everything if the ring is not guilty, otherwise
                         * just save the content from other contexts.
                         */
-                       if (!guilty_fence || (fence->context != guilty_fence->context))
+                       if (!fence->reemitted &&
+                           (!guilty_fence || (fence->context != guilty_fence->context)))
                                amdgpu_ring_backup_unprocessed_command(ring, wptr,
                                                                       fence->wptr);
                        wptr = fence->wptr;
+                       fence->reemitted++;
                }
                rcu_read_unlock();
        } while (last_seq != seq);
index a1fb0fadb6eab14334e4d68c49ed21d1db16e8e2..d8818295289768be4d5664ecf933d05bdf0cfd4d 100644 (file)
@@ -150,6 +150,8 @@ struct amdgpu_fence {
        u64                             wptr;
        /* fence context for resets */
        u64                             context;
+       /* has this fence been reemitted */
+       unsigned int                    reemitted;
 };
 
 extern const struct drm_sched_backend_ops amdgpu_sched_ops;