From c85285b32cc697d7612ee28a9ea9ded5e53d2b57 Mon Sep 17 00:00:00 2001
From: Niranjana Vishwanathapura
Date: Wed, 10 Dec 2025 17:02:59 -0800
Subject: [PATCH] drm/xe/multi_queue: Handle CGP context error

Trigger multi-queue context cleanup upon CGP context error notification
from GuC.

v4: Fix error message

Signed-off-by: Niranjana Vishwanathapura
Reviewed-by: Matthew Brost
Link: https://patch.msgid.link/20251211010249.1647839-30-niranjana.vishwanathapura@intel.com
---
Review notes (not part of the commit message):
 - xe_guc_exec_queue_cgp_context_error_handler() now reads msg[2] only
   after the payload length has been validated.
 - The drm_err()/xe_gt_dbg() format strings are newline terminated.
 - kernel-doc added for the new handler.
 - The xe_guc_submit.c "index" blob ids predate these review changes.

 drivers/gpu/drm/xe/abi/guc_actions_abi.h |  1 +
 drivers/gpu/drm/xe/xe_guc_ct.c           |  4 +++
 drivers/gpu/drm/xe/xe_guc_submit.c       | 45 ++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_guc_submit.h       |  2 ++
 drivers/gpu/drm/xe/xe_trace.h            |  5 +++
 5 files changed, 57 insertions(+)

diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
index 3e9fbed9cda6b..8af3691626bfd 100644
--- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
@@ -142,6 +142,7 @@ enum xe_guc_action {
 	XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_QUEUE = 0x4602,
 	XE_GUC_ACTION_MULTI_QUEUE_CONTEXT_CGP_SYNC = 0x4603,
 	XE_GUC_ACTION_NOTIFY_MULTI_QUEUE_CONTEXT_CGP_SYNC_DONE = 0x4604,
+	XE_GUC_ACTION_NOTIFY_MULTI_QUEUE_CGP_CONTEXT_ERROR = 0x4605,
 	XE_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
 	XE_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	XE_GUC_ACTION_SET_DEVICE_ENGINE_ACTIVITY_BUFFER = 0x550C,
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 4d5b4ed357cc0..3e49e7fd00311 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -1618,6 +1618,10 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
 	case XE_GUC_ACTION_NOTIFY_MULTI_QUEUE_CONTEXT_CGP_SYNC_DONE:
 		ret = xe_guc_exec_queue_cgp_sync_done_handler(guc, payload, adj_len);
 		break;
+	case XE_GUC_ACTION_NOTIFY_MULTI_QUEUE_CGP_CONTEXT_ERROR:
+		ret = xe_guc_exec_queue_cgp_context_error_handler(guc, payload,
+								  adj_len);
+		break;
 	default:
 		xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
 	}
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index d38f5aab0a994..3be5e78485c7a 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -48,6 +48,8 @@
 #include "xe_uc_fw.h"
 #include "xe_vm.h"
 
+#define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN	6
+
 static struct xe_guc *
 exec_queue_to_guc(struct xe_exec_queue *q)
 {
@@ -3009,6 +3011,49 @@ int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 le
 	return 0;
 }
 
+/**
+ * xe_guc_exec_queue_cgp_context_error_handler - CGP context error handler
+ * @guc: guc
+ * @msg: G2H message payload reporting the CGP context error
+ * @len: length of @msg in dwords
+ *
+ * Looks up the exec queue identified by msg[2] and triggers the same
+ * cleanup as for an engine reset.
+ *
+ * Return: 0 on success, -EPROTO if @len is unexpected or no queue is found.
+ */
+int xe_guc_exec_queue_cgp_context_error_handler(struct xe_guc *guc, u32 *msg,
+						u32 len)
+{
+	struct xe_gt *gt = guc_to_gt(guc);
+	struct xe_device *xe = guc_to_xe(guc);
+	struct xe_exec_queue *q;
+	u32 guc_id;
+
+	if (unlikely(len != XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN)) {
+		drm_err(&xe->drm, "Invalid length %u\n", len);
+		return -EPROTO;
+	}
+
+	/* Only touch the payload once its length has been validated */
+	guc_id = msg[2];
+
+	q = g2h_exec_queue_lookup(guc, guc_id);
+	if (unlikely(!q))
+		return -EPROTO;
+
+	xe_gt_dbg(gt,
+		  "CGP context error: [%s] err=0x%x, q0_id=0x%x LRCA=0x%x guc_id=0x%x\n",
+		  msg[0] & 1 ? "uc" : "kmd", msg[1], msg[2], msg[3], msg[4]);
+
+	trace_xe_exec_queue_cgp_context_error(q);
+
+	/* Treat the same as engine reset */
+	xe_guc_exec_queue_reset_trigger_cleanup(q);
+
+	return 0;
+}
+
 /**
  * xe_guc_exec_queue_cgp_sync_done_handler - CGP synchronization done handler
  * @guc: guc
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index ad8c0e8e0415e..4d89b2975fe93 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -37,6 +37,8 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
 int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
 int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len);
 int xe_guc_exec_queue_cgp_sync_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
+int xe_guc_exec_queue_cgp_context_error_handler(struct xe_guc *guc, u32 *msg,
+						u32 len);
 
 struct xe_guc_submit_exec_queue_snapshot *
 xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index 79a97b086cb27..c9d0748dae9db 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -172,6 +172,11 @@ DEFINE_EVENT(xe_exec_queue, xe_exec_queue_memory_cat_error,
 	     TP_ARGS(q)
 );
 
+DEFINE_EVENT(xe_exec_queue, xe_exec_queue_cgp_context_error,
+	     TP_PROTO(struct xe_exec_queue *q),
+	     TP_ARGS(q)
+);
+
 DEFINE_EVENT(xe_exec_queue, xe_exec_queue_stop,
 	     TP_PROTO(struct xe_exec_queue *q),
 	     TP_ARGS(q)
-- 
2.47.3