drm/xe/multi_queue: Teardown group upon job timeout

author Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>

Thu, 11 Dec 2025 01:03:01 +0000 (17:03 -0800)

committer Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>

Fri, 12 Dec 2025 03:21:53 +0000 (19:21 -0800)
author Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Thu, 11 Dec 2025 01:03:01 +0000 (17:03 -0800)
committer Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Fri, 12 Dec 2025 03:21:53 +0000 (19:21 -0800)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h

index 8a954ee6250599b9c87d315fdb03db1aa5354e0d..5fc516b0bb77f7706268e4be3918f7c01e350cd6 100644 (file)
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -64,6 +64,8 @@ struct xe_exec_queue_group {
         struct mutex list_lock;
         /** @sync_pending: CGP_SYNC_DONE g2h response pending */
         bool sync_pending;
+       /** @banned: Group banned */
+       bool banned;
  };
  
  /**
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c

index e8bde976e4c863c33984a8d39f0ced75e8e4b11a..f678b806acaafc77425ad46dfb3500a294b982d5 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -602,6 +602,8 @@ static void xe_guc_exec_queue_group_trigger_cleanup(struct xe_exec_queue *q)
         xe_gt_assert(guc_to_gt(exec_queue_to_guc(q)),
                      xe_exec_queue_is_multi_queue(q));
  
+       /* Group banned, skip timeout check in TDR */
+       WRITE_ONCE(group->banned, true);
         xe_guc_exec_queue_trigger_cleanup(primary);
  
         mutex_lock(&group->list_lock);
@@ -617,6 +619,9 @@ static void xe_guc_exec_queue_reset_trigger_cleanup(struct xe_exec_queue *q)
                 struct xe_exec_queue_group *group = q->multi_queue.group;
                 struct xe_exec_queue *eq;
  
+               /* Group banned, skip timeout check in TDR */
+               WRITE_ONCE(group->banned, true);
+
                 set_exec_queue_reset(primary);
                 if (!exec_queue_banned(primary) && !exec_queue_check_timeout(primary))
                         xe_guc_exec_queue_trigger_cleanup(primary);
@@ -1487,6 +1492,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
                 exec_queue_killed_or_banned_or_wedged(q) ||
                 exec_queue_destroyed(q);
  
+       /* Skip timeout check if multi-queue group is banned */
+       if (xe_exec_queue_is_multi_queue(q) &&
+           READ_ONCE(q->multi_queue.group->banned))
+               skip_timeout_check = true;
+
+       /*
+        * FIXME: In a multi-queue scenario, the TDR must ensure that the whole
+        * multi-queue group is off the HW before signaling the fences to avoid
+        * possible memory corruption. This means disabling scheduling on the
+        * primary queue before or during the secondary queue's TDR. This needs
+        * to be implemented in the least obtrusive way.
+        */
+
         /*
          * If devcoredump not captured and GuC capture for the job is not ready
          * do manual capture first and decide later if we need to use it
@@ -1639,7 +1657,10 @@ trigger_reset:
         xe_sched_add_pending_job(sched, job);
         xe_sched_submission_start(sched);
  
-       xe_guc_exec_queue_trigger_cleanup(q);
+       if (xe_exec_queue_is_multi_queue(q))
+               xe_guc_exec_queue_group_trigger_cleanup(q);
+       else
+               xe_guc_exec_queue_trigger_cleanup(q);
  
         /* Mark all outstanding jobs as bad, thus completing them */
         spin_lock(&sched->base.job_list_lock);
author	Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
	Thu, 11 Dec 2025 01:03:01 +0000 (17:03 -0800)
committer	Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
	Fri, 12 Dec 2025 03:21:53 +0000 (19:21 -0800)
drivers/gpu/drm/xe/xe_exec_queue_types.h		patch \| blob \| blame \| history
drivers/gpu/drm/xe/xe_guc_submit.c		patch \| blob \| blame \| history