From: Niranjana Vishwanathapura
Date: Thu, 11 Dec 2025 01:03:01 +0000 (-0800)
Subject: drm/xe/multi_queue: Teardown group upon job timeout
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b81c76885e8f61681cf4c7d6d0ce816809e3b2f;p=thirdparty%2Fkernel%2Flinux.git

drm/xe/multi_queue: Teardown group upon job timeout

Upon a job timeout, tear down the multi-queue group by triggering TDR
on all queues of the group and by skipping the timeout checks in them.

v5: Ban the group while triggering TDR for GuC-reported errors
    Add FIXME in TDR to take multi-queue group off HW (Matt Brost)
v6: Trigger cleanup of the group only for the multi-queue case

Signed-off-by: Niranjana Vishwanathapura
Reviewed-by: Matthew Brost
Link: https://patch.msgid.link/20251211010249.1647839-32-niranjana.vishwanathapura@intel.com
---

diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 8a954ee62505..5fc516b0bb77 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -64,6 +64,8 @@ struct xe_exec_queue_group {
 	struct mutex list_lock;
 	/** @sync_pending: CGP_SYNC_DONE g2h response pending */
 	bool sync_pending;
+	/** @banned: Group banned */
+	bool banned;
 };
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index e8bde976e4c8..f678b806acaa 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -602,6 +602,8 @@ static void xe_guc_exec_queue_group_trigger_cleanup(struct xe_exec_queue *q)
 	xe_gt_assert(guc_to_gt(exec_queue_to_guc(q)),
 		     xe_exec_queue_is_multi_queue(q));
 
+	/* Group banned, skip timeout check in TDR */
+	WRITE_ONCE(group->banned, true);
 	xe_guc_exec_queue_trigger_cleanup(primary);
 
 	mutex_lock(&group->list_lock);
@@ -617,6 +619,9 @@ static void xe_guc_exec_queue_reset_trigger_cleanup(struct xe_exec_queue *q)
 	struct xe_exec_queue_group *group = q->multi_queue.group;
 	struct xe_exec_queue *eq;
 
+	/* Group banned, skip timeout check in TDR */
+	WRITE_ONCE(group->banned, true);
+
 	set_exec_queue_reset(primary);
 	if (!exec_queue_banned(primary) && !exec_queue_check_timeout(primary))
 		xe_guc_exec_queue_trigger_cleanup(primary);
@@ -1487,6 +1492,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 		exec_queue_killed_or_banned_or_wedged(q) ||
 		exec_queue_destroyed(q);
 
+	/* Skip timeout check if multi-queue group is banned */
+	if (xe_exec_queue_is_multi_queue(q) &&
+	    READ_ONCE(q->multi_queue.group->banned))
+		skip_timeout_check = true;
+
+	/*
+	 * FIXME: In multi-queue scenario, the TDR must ensure that the whole
+	 * multi-queue group is off the HW before signaling the fences to avoid
+	 * possible memory corruptions. This means disabling scheduling on the
+	 * primary queue before or during the secondary queue's TDR. Need to
+	 * implement this in least obtrusive way.
+	 */
+
 	/*
 	 * If devcoredump not captured and GuC capture for the job is not ready
 	 * do manual capture first and decide later if we need to use it
@@ -1639,7 +1657,10 @@ trigger_reset:
 	xe_sched_add_pending_job(sched, job);
 	xe_sched_submission_start(sched);
 
-	xe_guc_exec_queue_trigger_cleanup(q);
+	if (xe_exec_queue_is_multi_queue(q))
+		xe_guc_exec_queue_group_trigger_cleanup(q);
+	else
+		xe_guc_exec_queue_trigger_cleanup(q);
 
 	/* Mark all outstanding jobs as bad, thus completing them */
 	spin_lock(&sched->base.job_list_lock);
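
To make the handshake in the patch concrete: the new banned flag is a
one-way latch. The teardown paths publish it with WRITE_ONCE() before
kicking the per-queue TDRs, and guc_exec_queue_timedout_job() polls it
with READ_ONCE() so every queue in the group force-skips its own
timeout check. Below is a minimal standalone sketch of that pattern,
assuming C11 relaxed atomics as a userspace stand-in for the kernel's
WRITE_ONCE()/READ_ONCE(); every name in it (queue_group,
group_trigger_cleanup, tdr_skip_timeout_check) is invented for
illustration and is not the driver's API.

/*
 * Illustrative sketch only -- not xe driver code. Models the
 * banned-flag latch: the teardown side publishes "banned" once,
 * and each queue's timeout handler reads it to decide whether to
 * skip its own timeout check.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue_group {
	atomic_bool banned;	/* plays the role of group->banned */
};

/* Teardown side, cf. xe_guc_exec_queue_group_trigger_cleanup(): */
static void group_trigger_cleanup(struct queue_group *g)
{
	/* Latch the ban before kicking per-queue cleanup/TDR. */
	atomic_store_explicit(&g->banned, true, memory_order_relaxed);
	/* ...then trigger TDR on the primary and each secondary queue... */
}

/* TDR side, cf. the skip_timeout_check logic in timedout_job: */
static bool tdr_skip_timeout_check(struct queue_group *g)
{
	return atomic_load_explicit(&g->banned, memory_order_relaxed);
}

int main(void)
{
	struct queue_group g = { false };

	printf("before ban, skip check: %d\n", tdr_skip_timeout_check(&g));
	group_trigger_cleanup(&g);
	printf("after ban,  skip check: %d\n", tdr_skip_timeout_check(&g));
	return 0;
}

Relaxed ordering suffices for the latch itself: the flag only ever
goes from false to true, and a stale read merely means one more
ordinary timeout check. The harder ordering problem, getting the whole
group off the hardware before any fence signals, is exactly what the
FIXME added at line 1492 defers.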
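The other half of the teardown, fanning TDR out to every queue in the
group, is only partially visible above: the hunk in
xe_guc_exec_queue_group_trigger_cleanup() cuts off right after
mutex_lock(&group->list_lock). Assuming the walk that follows is a
conventional lock-protected list traversal, here is a hypothetical
sketch, with pthread_mutex_t standing in for struct mutex and all
names invented.

/*
 * Hypothetical sketch of the fan-out -- the loop body is not shown
 * in the hunk above, so this is an assumed shape, not driver code.
 */
#include <pthread.h>
#include <stdio.h>

struct queue {
	int id;
	struct queue *next;		/* link in the group's queue list */
};

struct group_list {
	pthread_mutex_t list_lock;	/* plays the role of group->list_lock */
	struct queue *head;
};

static void trigger_cleanup(struct queue *q)
{
	printf("TDR kicked on queue %d\n", q->id);	/* placeholder */
}

static void group_fan_out(struct group_list *g)
{
	/* Walk the queue list under the lock, kicking each queue's TDR. */
	pthread_mutex_lock(&g->list_lock);
	for (struct queue *q = g->head; q; q = q->next)
		trigger_cleanup(q);
	pthread_mutex_unlock(&g->list_lock);
}

int main(void)
{
	struct queue q1 = { 1, NULL }, q0 = { 0, &q1 };
	struct group_list g = { PTHREAD_MUTEX_INITIALIZER, &q0 };

	group_fan_out(&g);
	return 0;
}

Holding the lock across the walk keeps queue add/remove from racing
with the fan-out, which is presumably why the driver takes list_lock
at that point.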