git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amd/ras: reset CPER ring on corrupt entry size
author: Xiang Liu <xiang.liu@amd.com>
Mon, 11 May 2026 07:48:55 +0000 (15:48 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Tue, 19 May 2026 15:49:25 +0000 (11:49 -0400)
When CPER ring overflow handling advances the read pointer, it trusts the
parsed entry size from the current ring contents. Corrupt CPER data can
produce an entry size that does not advance rptr after dword conversion
and pointer masking.

In that case the recovery loop keeps testing the same location while
holding the CPER ring mutex. This can hang the worker that is writing the
next CPER record.

Detect a no-progress rptr update and reset the CPER ring to an empty
state instead. This drops the corrupt contents and lets the writer leave
the recovery path without spinning.

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c

index 004edc28d0cc007922945284f4abba62ed66feeb..d5e59c24d907d7273dad59cd2b6f2e9d5b1e3da5 100644 (file)
@@ -484,7 +484,7 @@ calc:
 
 void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
 {
-       u64 pos, wptr_old, rptr;
+       u64 pos, wptr_old, rptr, next_rptr;
        int rec_cnt_dw = count >> 2;
        u32 chunk, ent_sz;
        u8 *s = (u8 *)src;
@@ -525,9 +525,19 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
 
                do {
                        ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);
-
-                       rptr += (ent_sz >> 2);
-                       rptr &= ring->ptr_mask;
+                       next_rptr = rptr;
+                       if (ent_sz >= sizeof(u32))
+                               next_rptr = (rptr + (ent_sz >> 2)) & ring->ptr_mask;
+
+                       if (next_rptr == rptr) {
+                               /* Corrupt entry size, reset the ring to avoid an infinite loop. */
+                               rptr = ring->wptr;
+                               *ring->rptr_cpu_addr = rptr;
+                               ring->count_dw = (ring->ring_size - 4) >> 2;
+                               goto out_unlock;
+                       }
+
+                       rptr = next_rptr;
                        *ring->rptr_cpu_addr = rptr;
 
                        pos = rptr;
@@ -536,6 +546,8 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
 
        if (ring->count_dw >= rec_cnt_dw)
                ring->count_dw -= rec_cnt_dw;
+
+out_unlock:
        mutex_unlock(&ring->adev->cper.ring_lock);
 }