From: Amber Lin Date: Fri, 13 Mar 2026 10:15:24 +0000 (-0400) Subject: drm/amdkfd: Add detect+reset hangs to GC 12.1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=65f8a3676d12a76de62923ee5ee0248927b732c1;p=thirdparty%2Fkernel%2Flinux.git drm/amdkfd: Add detect+reset hangs to GC 12.1 Add detect_and_reset_hung_queues to user mode compute queues on GC 12.1. Signed-off-by: Amber Lin Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c index 600d7d6588263..1931562ea6b3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c @@ -47,6 +47,8 @@ static int mes_v12_1_self_test(struct amdgpu_device *adev, int xcc_id); static int mes_v12_1_setup_coop_mode(struct amdgpu_device *adev, int xcc_id); #define MES_EOP_SIZE 2048 +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd info */ +#define MES12_HUNG_HQD_INFO_OFFSET 4 #define regCP_HQD_IB_CONTROL_MES_12_1_DEFAULT 0x100000 #define XCC_MID_MASK 0x41000000 @@ -230,7 +232,7 @@ static int mes_v12_1_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, xcc_id, pipe, x_pkt->header.opcode); r = amdgpu_fence_wait_polling(ring, seq, timeout); - if (r < 1 || !*status_ptr) { + if (r < 1 || !lower_32_bits(*status_ptr)) { if (misc_op_str) dev_err(adev->dev, "MES(%d, %d) failed to respond to msg=%s (%s)\n", @@ -877,6 +879,33 @@ static int mes_v12_1_reset_legacy_queue(struct amdgpu_mes *mes, } #endif +static int mes_v12_1_detect_and_reset_hung_queues(struct amdgpu_mes *mes, + struct mes_detect_and_reset_queue_input *input) +{ + union MESAPI__RESET mes_reset_queue_pkt; + + memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt)); + + mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER; + mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET; + mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; + + mes_reset_queue_pkt.queue_type = + convert_to_mes_queue_type(input->queue_type); + mes_reset_queue_pkt.doorbell_offset_addr = + mes->hung_queue_db_array_gpu_addr[0]; + + if (input->detect_only) + mes_reset_queue_pkt.hang_detect_only = 1; + else + mes_reset_queue_pkt.hang_detect_then_reset = 1; + + return mes_v12_1_submit_pkt_and_poll_completion(mes, + input->xcc_id, AMDGPU_MES_SCHED_PIPE, + &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), + offsetof(union MESAPI__RESET, api_status)); +} + static int mes_v12_inv_tlb_convert_hub_id(uint8_t id) { /* @@ -934,6 +963,7 @@ static const struct amdgpu_mes_funcs mes_v12_1_funcs = { .resume_gang = mes_v12_1_resume_gang, .misc_op = mes_v12_1_misc_op, .reset_hw_queue = mes_v12_1_reset_hw_queue, + .detect_and_reset_hung_queues = mes_v12_1_detect_and_reset_hung_queues, .invalidate_tlbs_pasid = mes_v12_1_inv_tlbs_pasid, }; @@ -1936,6 +1966,9 @@ static int mes_v12_1_early_init(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int pipe, r; + adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE; + adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET; + for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { r = amdgpu_mes_init_microcode(adev, pipe); if (r)