git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/multi_queue: Teardown group upon job timeout
author: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Thu, 11 Dec 2025 01:03:01 +0000 (17:03 -0800)
committer: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Fri, 12 Dec 2025 03:21:53 +0000 (19:21 -0800)
Upon a job timeout, tear down the multi-queue group by
triggering TDR on all queues of the multi-queue group
and by skipping the timeout checks for them.

v5: Ban the group while triggering TDR for GuC-reported
    errors
    Add FIXME in TDR to take multi-queue group off HW
    (Matt Brost)
v6: Trigger cleanup of group only for multi-queue case

Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/20251211010249.1647839-32-niranjana.vishwanathapura@intel.com
drivers/gpu/drm/xe/xe_exec_queue_types.h
drivers/gpu/drm/xe/xe_guc_submit.c

index 8a954ee6250599b9c87d315fdb03db1aa5354e0d..5fc516b0bb77f7706268e4be3918f7c01e350cd6 100644 (file)
@@ -64,6 +64,8 @@ struct xe_exec_queue_group {
        struct mutex list_lock;
        /** @sync_pending: CGP_SYNC_DONE g2h response pending */
        bool sync_pending;
+       /** @banned: Group banned */
+       bool banned;
 };
 
 /**
index e8bde976e4c863c33984a8d39f0ced75e8e4b11a..f678b806acaafc77425ad46dfb3500a294b982d5 100644 (file)
@@ -602,6 +602,8 @@ static void xe_guc_exec_queue_group_trigger_cleanup(struct xe_exec_queue *q)
        xe_gt_assert(guc_to_gt(exec_queue_to_guc(q)),
                     xe_exec_queue_is_multi_queue(q));
 
+       /* Group banned, skip timeout check in TDR */
+       WRITE_ONCE(group->banned, true);
        xe_guc_exec_queue_trigger_cleanup(primary);
 
        mutex_lock(&group->list_lock);
@@ -617,6 +619,9 @@ static void xe_guc_exec_queue_reset_trigger_cleanup(struct xe_exec_queue *q)
                struct xe_exec_queue_group *group = q->multi_queue.group;
                struct xe_exec_queue *eq;
 
+               /* Group banned, skip timeout check in TDR */
+               WRITE_ONCE(group->banned, true);
+
                set_exec_queue_reset(primary);
                if (!exec_queue_banned(primary) && !exec_queue_check_timeout(primary))
                        xe_guc_exec_queue_trigger_cleanup(primary);
@@ -1487,6 +1492,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
                exec_queue_killed_or_banned_or_wedged(q) ||
                exec_queue_destroyed(q);
 
+       /* Skip timeout check if multi-queue group is banned */
+       if (xe_exec_queue_is_multi_queue(q) &&
+           READ_ONCE(q->multi_queue.group->banned))
+               skip_timeout_check = true;
+
+       /*
+        * FIXME: In multi-queue scenario, the TDR must ensure that the whole
+        * multi-queue group is off the HW before signaling the fences to avoid
+        * possible memory corruptions. This means disabling scheduling on the
+        * primary queue before or during the secondary queue's TDR. This needs
+        * to be implemented in the least obtrusive way.
+        */
+
        /*
         * If devcoredump not captured and GuC capture for the job is not ready
         * do manual capture first and decide later if we need to use it
@@ -1639,7 +1657,10 @@ trigger_reset:
        xe_sched_add_pending_job(sched, job);
        xe_sched_submission_start(sched);
 
-       xe_guc_exec_queue_trigger_cleanup(q);
+       if (xe_exec_queue_is_multi_queue(q))
+               xe_guc_exec_queue_group_trigger_cleanup(q);
+       else
+               xe_guc_exec_queue_trigger_cleanup(q);
 
        /* Mark all outstanding jobs as bad, thus completing them */
        spin_lock(&sched->base.job_list_lock);