From: Michal Wajdeczko Date: Fri, 10 Apr 2026 11:04:57 +0000 (+0200) Subject: drm/xe/guc: Add support for NO_RESPONSE_BUSY in CTB X-Git-Tag: v7.2-rc1~141^2~27^2~73 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4d33314decfeac8b82d771a1bd083a59f4ac6fae;p=thirdparty%2Fkernel%2Flinux.git drm/xe/guc: Add support for NO_RESPONSE_BUSY in CTB We only have support for G2H NO_RESPONSE_BUSY messages over MMIO, but it turned out that GuC also uses that type of messages in CTB. The following error was recently observed on BMG after adding VGT policy updates to the GT restart sequence: [] xe 0000:03:00.0: [drm] *ERROR* Tile0: GT1: G2H channel broken on read, type=3, reset required [] xe 0000:03:00.0: [drm] *ERROR* Tile0: GT1: CT dequeue failed: -95 ... [] xe 0000:03:00.0: [drm] *ERROR* Tile0: GT1: Timed out wait for G2H, fence 21965, action 5502, done no [] xe 0000:03:00.0: [drm] PF: Tile0: GT1: Failed to push 1 policy KLV (-ETIME) [] xe 0000:03:00.0: [drm] Tile0: GT1: { key 0x8004 : no value } # engine_group_config where type=3 was this unrecognized NO_RESPONSE_BUSY message. Note that GuC might send the real RESPONSE message right after the BUSY message, so we must be prepared to update our g2h_fence data twice before sender actually wakes up and clears the flags. Signed-off-by: Michal Wajdeczko Cc: Daniele Ceraolo Spurio Reviewed-by: Daniele Ceraolo Spurio Link: https://patch.msgid.link/20260410110457.573-1-michal.wajdeczko@intel.com --- diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index a11cff7a20be7..21e0dad9a4815 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -186,13 +186,16 @@ static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) { } struct g2h_fence { u32 *response_buffer; u32 seqno; + /* fields below this point are setup based on the response */ u32 response_data; u16 response_len; u16 error; u16 hint; u16 reason; + u32 counter; bool cancel; bool retry; + bool wait; bool fail; bool done; }; @@ -204,6 +207,11 @@ static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer) g2h_fence->seqno = ~0x0; } +static void g2h_fence_reinit(struct g2h_fence *g2h_fence) +{ + memset_after(g2h_fence, 0, seqno); +} + static void g2h_fence_cancel(struct g2h_fence *g2h_fence) { g2h_fence->cancel = true; @@ -1331,6 +1339,7 @@ retry_same_fence: /* READ_ONCEs pairs with WRITE_ONCEs in parse_g2h_response * and g2h_fence_cancel. */ +wait_again: ret = wait_event_timeout(ct->g2h_fence_wq, READ_ONCE(g2h_fence.done), HZ); if (!ret) { LNL_FLUSH_WORK(&ct->g2h_worker); @@ -1356,6 +1365,14 @@ retry_same_fence: return -ETIME; } + if (g2h_fence.wait) { + xe_gt_dbg(gt, "H2G action %#x busy: counter %u\n", + action[0], g2h_fence.counter); + /* we can't leave any response data if we want to wait again */ + g2h_fence_reinit(&g2h_fence); + mutex_unlock(&ct->lock); + goto wait_again; + } if (g2h_fence.retry) { xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n", action[0], g2h_fence.reason); @@ -1508,7 +1525,12 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) return -EPROTO; } - g2h_fence = xa_erase(&ct->fence_lookup, fence); + /* don't erase as we still expect a final response with the same fence */ + if (type == GUC_HXG_TYPE_NO_RESPONSE_BUSY) + g2h_fence = xa_load(&ct->fence_lookup, fence); + else + g2h_fence = xa_erase(&ct->fence_lookup, fence); + if (unlikely(!g2h_fence)) { /* Don't tear down channel, as send could've timed out */ /* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */ @@ -1519,6 +1541,12 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) xe_gt_assert(gt, fence == g2h_fence->seqno); + /* + * reinit as we might have already process this g2h_fence before + * if we received a NO_RESPONSE_BUSY reply + */ + g2h_fence_reinit(g2h_fence); + if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) { g2h_fence->fail = true; g2h_fence->error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]); @@ -1526,6 +1554,9 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) } else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) { g2h_fence->retry = true; g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]); + } else if (type == GUC_HXG_TYPE_NO_RESPONSE_BUSY) { + g2h_fence->wait = true; + g2h_fence->counter = FIELD_GET(GUC_HXG_BUSY_MSG_0_COUNTER, hxg[0]); } else if (g2h_fence->response_buffer) { g2h_fence->response_len = hxg_len; memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32)); @@ -1533,7 +1564,9 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]); } - g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); + /* don't release any space if it was an intermediate message */ + if (!g2h_fence->wait) + g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); /* WRITE_ONCE pairs with READ_ONCEs in guc_ct_send_recv. */ WRITE_ONCE(g2h_fence->done, true); @@ -1570,6 +1603,7 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) case GUC_HXG_TYPE_RESPONSE_SUCCESS: case GUC_HXG_TYPE_RESPONSE_FAILURE: case GUC_HXG_TYPE_NO_RESPONSE_RETRY: + case GUC_HXG_TYPE_NO_RESPONSE_BUSY: ret = parse_g2h_response(ct, msg, len); break; default: