git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: Fixup boost mes detect hang array size
author: Amber Lin <Amber.Lin@amd.com>
Fri, 13 Mar 2026 09:27:22 +0000 (05:27 -0400)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 17 Apr 2026 19:41:14 +0000 (15:41 -0400)
When allocating memory for the hung queues array, we need to take the
number of queues into account to cover the worst-case hang.

Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

index 0d4c77c1b4b5f9790e1993acc4d2a359d6a10472..0d75d1aa60ec508f4c2c67f9394cd777e69f9255 100644 (file)
@@ -103,7 +103,7 @@ static inline u32 amdgpu_mes_get_hqd_mask(u32 num_pipe,
 
 int amdgpu_mes_init(struct amdgpu_device *adev)
 {
-       int i, r, num_pipes;
+       int i, r, num_pipes, num_queues = 0;
        u32 total_vmid_mask, reserved_vmid_mask;
        int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
        u32 gfx_hqd_mask = amdgpu_mes_get_hqd_mask(adev->gfx.me.num_pipe_per_me,
@@ -159,7 +159,8 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
                adev->mes.compute_hqd_mask[i] = compute_hqd_mask;
        }
 
-       num_pipes = adev->sdma.num_instances;
+       num_pipes = adev->sdma.num_inst_per_xcc ?
+               adev->sdma.num_inst_per_xcc : adev->sdma.num_instances;
        if (num_pipes > AMDGPU_MES_MAX_SDMA_PIPES)
                dev_warn(adev->dev, "more SDMA pipes than supported by MES! (%d vs %d)\n",
                         num_pipes, AMDGPU_MES_MAX_SDMA_PIPES);
@@ -216,8 +217,27 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
        if (r)
                goto error_doorbell;
 
+       if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0)) {
+               /* When queue/pipe reset is done in MES instead of in the
+                * driver, MES passes hung queues information to the driver in
+                * hung_queue_hqd_info. Calculate required space to store this
+                * information.
+                */
+               for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
+                       num_queues += hweight32(adev->mes.gfx_hqd_mask[i]);
+
+               for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
+                       num_queues += hweight32(adev->mes.compute_hqd_mask[i]);
+
+               for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
+                       num_queues += hweight32(adev->mes.sdma_hqd_mask[i]) * num_xcc;
+
+               adev->mes.hung_queue_hqd_info_offset = num_queues;
+               adev->mes.hung_queue_db_array_size = num_queues * 2;
+       }
+
        if (adev->mes.hung_queue_db_array_size) {
-               for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
+               for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) {
                        r = amdgpu_bo_create_kernel(adev,
                                                    adev->mes.hung_queue_db_array_size * sizeof(u32),
                                                    PAGE_SIZE,
@@ -264,10 +284,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
                              &adev->mes.event_log_cpu_addr);
 
        for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
-               amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
-                                     &adev->mes.hung_queue_db_array_gpu_addr[i],
-                                     &adev->mes.hung_queue_db_array_cpu_addr[i]);
-
+               if (adev->mes.hung_queue_db_array_gpu_obj[i])
+                        amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
+                                        &adev->mes.hung_queue_db_array_gpu_addr[i],
+                                        &adev->mes.hung_queue_db_array_cpu_addr[i]);
                if (adev->mes.sch_ctx_ptr[i])
                        amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]);
                if (adev->mes.query_status_fence_ptr[i])