]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdkfd: Fix queue preemption/eviction failures by aligning control stack size...
authorDonet Tom <donettom@linux.ibm.com>
Mon, 23 Mar 2026 04:28:39 +0000 (09:58 +0530)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 30 Mar 2026 20:22:44 +0000 (16:22 -0400)
The control stack size is calculated based on the number of CUs and
waves, and is then aligned to PAGE_SIZE. When the resulting control
stack size is aligned to 64 KB, GPU hangs and queue preemption
failures are observed while running RCCL unit tests on systems with
more than two GPUs.

amdgpu 0048:0f:00.0: amdgpu: Queue preemption failed for queue with
doorbell_id: 80030008
amdgpu 0048:0f:00.0: amdgpu: Failed to evict process queues
amdgpu 0048:0f:00.0: amdgpu: GPU reset begin!. Source: 4
amdgpu 0048:0f:00.0: amdgpu: Queue preemption failed for queue with
doorbell_id: 80030008
amdgpu 0048:0f:00.0: amdgpu: Failed to evict process queues
amdgpu 0048:0f:00.0: amdgpu: Failed to restore process queues

This issue is observed on both 4 KB and 64 KB system page-size
configurations.

This patch fixes the issue by aligning the control stack size to
AMDGPU_GPU_PAGE_SIZE instead of PAGE_SIZE, so the control stack size
will not be 64 KB on systems with a 64 KB page size and queue
preemption works correctly.

Additionally, In the current code, wg_data_size is aligned to PAGE_SIZE,
which can waste memory if the system page size is large. In this patch,
wg_data_size is aligned to AMDGPU_GPU_PAGE_SIZE. The cwsr_size, calculated
from wg_data_size and the control stack size, is aligned to PAGE_SIZE.

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit a3e14436304392fbada359edd0f1d1659850c9b7)

drivers/gpu/drm/amd/amdkfd/kfd_queue.c

index e1a922bb2ab75a29fa0e248a319a7eebdf6d5f82..28354a4e5dd5d153d49bcb3a3c2b0125b0ddea28 100644 (file)
@@ -492,10 +492,11 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
        cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
        wave_num = get_num_waves(props, gfxv, cu_num);
 
-       wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE);
+       wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props),
+                               AMDGPU_GPU_PAGE_SIZE);
        ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
        ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
-                              PAGE_SIZE);
+                              AMDGPU_GPU_PAGE_SIZE);
 
        if ((gfxv / 10000 * 10000) == 100000) {
                /* HW design limits control stack size to 0x7000.
@@ -507,7 +508,7 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
 
        props->ctl_stack_size = ctl_stack_size;
        props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
-       props->cwsr_size = ctl_stack_size + wg_data_size;
+       props->cwsr_size = ALIGN(ctl_stack_size + wg_data_size, PAGE_SIZE);
 
        if (gfxv == 80002)      /* GFX_VERSION_TONGA */
                props->eop_buffer_size = 0x8000;