drm/xe: Stop abusing DRM scheduler internals
author    Matthew Brost <matthew.brost@intel.com>
          Sat, 10 Jan 2026 01:27:34 +0000 (17:27 -0800)
committer Matthew Brost <matthew.brost@intel.com>
          Sat, 10 Jan 2026 21:39:50 +0000 (13:39 -0800)
Use the new pending job list iterator and the new helper functions in Xe
to avoid reaching into DRM scheduler internals.
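
A minimal sketch of the resulting pattern (helper prototypes assumed from
the preceding scheduler patch in this series; the wrapper function below is
hypothetical and only illustrates the iteration style): drm_sched_is_stopped()
replaces direct reads of sched->base.pause_submit, while
drm_sched_for_each_pending_job() and drm_sched_job_is_signaled() replace
open-coded walks of sched->base.pending_list under job_list_lock.

    /*
     * Hypothetical example: mark every unsignaled pending job as
     * cancelled without touching pending_list or job_list_lock.
     */
    static void example_cancel_unsignaled(struct xe_gpu_scheduler *sched)
    {
            struct drm_sched_job *s_job;

            drm_sched_for_each_pending_job(s_job, &sched->base, NULL)
                    if (!drm_sched_job_is_signaled(s_job))
                            xe_sched_job_set_error(to_xe_sched_job(s_job),
                                                   -ECANCELED);
    }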

Part of this change removes the pending-job debug information from
debugfs and devcoredump. As agreed, the pending job list should
only be accessed when the scheduler is stopped. However, it's not
straightforward to determine whether the scheduler is stopped from the
shared debugfs/devcoredump code path. Additionally, the pending job list
provides little useful information, as pending jobs can be inferred from
seqnos and ring head/tail positions. Therefore, this debug information
is being removed.

v4:
 - Add comment around DRM_GPU_SCHED_STAT_NO_HANG (Niranjana)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Link: https://patch.msgid.link/20260110012739.2888434-3-matthew.brost@intel.com
drivers/gpu/drm/xe/xe_gpu_scheduler.c
drivers/gpu/drm/xe/xe_gpu_scheduler.h
drivers/gpu/drm/xe/xe_guc_submit.c
drivers/gpu/drm/xe/xe_guc_submit_types.h
drivers/gpu/drm/xe/xe_hw_fence.c
drivers/gpu/drm/xe/xe_hw_fence.h

diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
index f4f23317191ff1811b9e4525aedfcd266fa50f63..9c8004d5dd915f7f56473fb5b7570d51baf30f94 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
@@ -7,7 +7,7 @@
 
 static void xe_sched_process_msg_queue(struct xe_gpu_scheduler *sched)
 {
-       if (!READ_ONCE(sched->base.pause_submit))
+       if (!drm_sched_is_stopped(&sched->base))
                queue_work(sched->base.submit_wq, &sched->work_process_msg);
 }
 
@@ -43,7 +43,7 @@ static void xe_sched_process_msg_work(struct work_struct *w)
                container_of(w, struct xe_gpu_scheduler, work_process_msg);
        struct xe_sched_msg *msg;
 
-       if (READ_ONCE(sched->base.pause_submit))
+       if (drm_sched_is_stopped(&sched->base))
                return;
 
        msg = xe_sched_get_msg(sched);
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
index dceb2cd0ee5bde6727f744e461db284ddef86fa7..664c2db56af3d5cb97dacac6a6cf02d91142fdaa 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
@@ -56,12 +56,9 @@ static inline void xe_sched_resubmit_jobs(struct xe_gpu_scheduler *sched)
        struct drm_sched_job *s_job;
        bool restore_replay = false;
 
-       list_for_each_entry(s_job, &sched->base.pending_list, list) {
-               struct drm_sched_fence *s_fence = s_job->s_fence;
-               struct dma_fence *hw_fence = s_fence->parent;
-
+       drm_sched_for_each_pending_job(s_job, &sched->base, NULL) {
                restore_replay |= to_xe_sched_job(s_job)->restore_replay;
-               if (restore_replay || (hw_fence && !dma_fence_is_signaled(hw_fence)))
+               if (restore_replay || !drm_sched_job_is_signaled(s_job))
                        sched->base.ops->run_job(s_job);
        }
 }
@@ -72,14 +69,6 @@ xe_sched_invalidate_job(struct xe_sched_job *job, int threshold)
        return drm_sched_invalidate_job(&job->drm, threshold);
 }
 
-static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
-                                           struct xe_sched_job *job)
-{
-       spin_lock(&sched->base.job_list_lock);
-       list_add(&job->drm.list, &sched->base.pending_list);
-       spin_unlock(&sched->base.job_list_lock);
-}
-
 /**
  * xe_sched_first_pending_job() - Find first pending job which is unsignaled
  * @sched: Xe GPU scheduler
@@ -89,21 +78,13 @@ static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
 static inline
 struct xe_sched_job *xe_sched_first_pending_job(struct xe_gpu_scheduler *sched)
 {
-       struct xe_sched_job *job, *r_job = NULL;
-
-       spin_lock(&sched->base.job_list_lock);
-       list_for_each_entry(job, &sched->base.pending_list, drm.list) {
-               struct drm_sched_fence *s_fence = job->drm.s_fence;
-               struct dma_fence *hw_fence = s_fence->parent;
+       struct drm_sched_job *job;
 
-               if (hw_fence && !dma_fence_is_signaled(hw_fence)) {
-                       r_job = job;
-                       break;
-               }
-       }
-       spin_unlock(&sched->base.job_list_lock);
+       drm_sched_for_each_pending_job(job, &sched->base, NULL)
+               if (!drm_sched_job_is_signaled(job))
+                       return to_xe_sched_job(job);
 
-       return r_job;
+       return NULL;
 }
 
 static inline int
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 45aa56b30a127a35381089f50d3923652f9603a6..b7824834ff337717125c31517bb56872ac32b096 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1374,7 +1374,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
        struct xe_exec_queue *q = ge->q;
        struct xe_guc *guc = exec_queue_to_guc(q);
        struct xe_gpu_scheduler *sched = &ge->sched;
-       struct xe_sched_job *job;
+       struct drm_sched_job *job;
        bool wedged = false;
 
        xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
@@ -1433,16 +1433,10 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
        if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
                xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
 
-       xe_hw_fence_irq_stop(q->fence_irq);
+       drm_sched_for_each_pending_job(job, &sched->base, NULL)
+               xe_sched_job_set_error(to_xe_sched_job(job), -ECANCELED);
 
        xe_sched_submission_start(sched);
-
-       spin_lock(&sched->base.job_list_lock);
-       list_for_each_entry(job, &sched->base.pending_list, drm.list)
-               xe_sched_job_set_error(job, -ECANCELED);
-       spin_unlock(&sched->base.job_list_lock);
-
-       xe_hw_fence_irq_start(q->fence_irq);
 }
 
 #define ADJUST_FIVE_PERCENT(__t)       mul_u64_u32_div(__t, 105, 100)
@@ -1570,7 +1564,7 @@ static enum drm_gpu_sched_stat
 guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 {
        struct xe_sched_job *job = to_xe_sched_job(drm_job);
-       struct xe_sched_job *tmp_job;
+       struct drm_sched_job *tmp_job;
        struct xe_exec_queue *q = job->q;
        struct xe_gpu_scheduler *sched = &q->guc->sched;
        struct xe_guc *guc = exec_queue_to_guc(q);
@@ -1578,7 +1572,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
        struct xe_device *xe = guc_to_xe(guc);
        int err = -ETIME;
        pid_t pid = -1;
-       int i = 0;
        bool wedged = false, skip_timeout_check;
 
        xe_gt_assert(guc_to_gt(guc), !xe_exec_queue_is_lr(q));
@@ -1756,14 +1749,11 @@ trigger_reset:
                __deregister_exec_queue(guc, q);
        }
 
-       /* Stop fence signaling */
-       xe_hw_fence_irq_stop(q->fence_irq);
+       /* Mark all outstanding jobs as bad, thus completing them */
+       xe_sched_job_set_error(job, err);
+       drm_sched_for_each_pending_job(tmp_job, &sched->base, NULL)
+               xe_sched_job_set_error(to_xe_sched_job(tmp_job), -ECANCELED);
 
-       /*
-        * Fence state now stable, stop / start scheduler which cleans up any
-        * fences that are complete
-        */
-       xe_sched_add_pending_job(sched, job);
        xe_sched_submission_start(sched);
 
        if (xe_exec_queue_is_multi_queue(q))
@@ -1771,16 +1761,11 @@ trigger_reset:
        else
                xe_guc_exec_queue_trigger_cleanup(q);
 
-       /* Mark all outstanding jobs as bad, thus completing them */
-       spin_lock(&sched->base.job_list_lock);
-       list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
-               xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
-       spin_unlock(&sched->base.job_list_lock);
-
-       /* Start fence signaling */
-       xe_hw_fence_irq_start(q->fence_irq);
-
-       return DRM_GPU_SCHED_STAT_RESET;
+       /*
+        * We want the job added back to the pending list so it gets freed; this
+        * is what DRM_GPU_SCHED_STAT_NO_HANG does.
+        */
+       return DRM_GPU_SCHED_STAT_NO_HANG;
 
 sched_enable:
        set_exec_queue_pending_tdr_exit(q);
@@ -2754,11 +2739,12 @@ static void guc_exec_queue_unpause_prepare(struct xe_guc *guc,
                                           struct xe_exec_queue *q)
 {
        struct xe_gpu_scheduler *sched = &q->guc->sched;
-       struct xe_sched_job *job = NULL, *__job;
+       struct xe_sched_job *job = NULL;
+       struct drm_sched_job *s_job;
        bool restore_replay = false;
 
-       list_for_each_entry(__job, &sched->base.pending_list, drm.list) {
-               job = __job;
+       drm_sched_for_each_pending_job(s_job, &sched->base, NULL) {
+               job = to_xe_sched_job(s_job);
                restore_replay |= job->restore_replay;
                if (restore_replay) {
                        xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d",
@@ -2882,7 +2868,7 @@ void xe_guc_submit_unpause_vf(struct xe_guc *guc)
                 * created after resfix done.
                 */
                if (q->guc->id != index ||
-                   !READ_ONCE(q->guc->sched.base.pause_submit))
+                   !drm_sched_is_stopped(&q->guc->sched.base))
                        continue;
 
                guc_exec_queue_unpause(guc, q);
@@ -3387,29 +3373,6 @@ xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
                snapshot->multi_queue.primary = xe_exec_queue_multi_queue_primary(q)->guc->id;
                snapshot->multi_queue.pos = q->multi_queue.pos;
        }
-       spin_lock(&sched->base.job_list_lock);
-       snapshot->pending_list_size = list_count_nodes(&sched->base.pending_list);
-       snapshot->pending_list = kmalloc_array(snapshot->pending_list_size,
-                                              sizeof(struct pending_list_snapshot),
-                                              GFP_ATOMIC);
-
-       if (snapshot->pending_list) {
-               struct xe_sched_job *job_iter;
-
-               i = 0;
-               list_for_each_entry(job_iter, &sched->base.pending_list, drm.list) {
-                       snapshot->pending_list[i].seqno =
-                               xe_sched_job_seqno(job_iter);
-                       snapshot->pending_list[i].fence =
-                               dma_fence_is_signaled(job_iter->fence) ? 1 : 0;
-                       snapshot->pending_list[i].finished =
-                               dma_fence_is_signaled(&job_iter->drm.s_fence->finished)
-                               ? 1 : 0;
-                       i++;
-               }
-       }
-
-       spin_unlock(&sched->base.job_list_lock);
 
        return snapshot;
 }
@@ -3473,13 +3436,6 @@ xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
                drm_printf(p, "\tMulti queue primary GuC ID: %d\n", snapshot->multi_queue.primary);
                drm_printf(p, "\tMulti queue position: %d\n", snapshot->multi_queue.pos);
        }
-
-       for (i = 0; snapshot->pending_list && i < snapshot->pending_list_size;
-            i++)
-               drm_printf(p, "\tJob: seqno=%d, fence=%d, finished=%d\n",
-                          snapshot->pending_list[i].seqno,
-                          snapshot->pending_list[i].fence,
-                          snapshot->pending_list[i].finished);
 }
 
 /**
@@ -3502,7 +3458,6 @@ void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *s
                        xe_lrc_snapshot_free(snapshot->lrc[i]);
                kfree(snapshot->lrc);
        }
-       kfree(snapshot->pending_list);
        kfree(snapshot);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_guc_submit_types.h b/drivers/gpu/drm/xe/xe_guc_submit_types.h
index 25e29e85502cbc3fd1ebe93e41314133567bf56a..5ccc5f959bb3a6b844fccd4c85a97c94cebb5031 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit_types.h
@@ -61,12 +61,6 @@ struct guc_submit_parallel_scratch {
        u32 wq[WQ_SIZE / sizeof(u32)];
 };
 
-struct pending_list_snapshot {
-       u32 seqno;
-       bool fence;
-       bool finished;
-};
-
 /**
  * struct xe_guc_submit_exec_queue_snapshot - Snapshot for devcoredump
  */
@@ -147,11 +141,6 @@ struct xe_guc_submit_exec_queue_snapshot {
                /** @valid: The exec queue is part of a multi queue group */
                bool valid;
        } multi_queue;
-
-       /** @pending_list_size: Size of the pending list snapshot array */
-       int pending_list_size;
-       /** @pending_list: snapshot of the pending list info */
-       struct pending_list_snapshot *pending_list;
 };
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c
index f6057456e4609345775996419c7bb9e91a1480c3..6b5bc67767d3e5f5250d88045335c666d634ba15 100644
--- a/drivers/gpu/drm/xe/xe_hw_fence.c
+++ b/drivers/gpu/drm/xe/xe_hw_fence.c
@@ -108,22 +108,6 @@ void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq)
        irq_work_queue(&irq->work);
 }
 
-void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq)
-{
-       spin_lock_irq(&irq->lock);
-       irq->enabled = false;
-       spin_unlock_irq(&irq->lock);
-}
-
-void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq)
-{
-       spin_lock_irq(&irq->lock);
-       irq->enabled = true;
-       spin_unlock_irq(&irq->lock);
-
-       irq_work_queue(&irq->work);
-}
-
 void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
                          struct xe_hw_fence_irq *irq, const char *name)
 {
diff --git a/drivers/gpu/drm/xe/xe_hw_fence.h b/drivers/gpu/drm/xe/xe_hw_fence.h
index f13a1c4982c7306dfced818c664082f783faf337..599492c13f8004deac10a5cb9ec86ad2dca746c7 100644
--- a/drivers/gpu/drm/xe/xe_hw_fence.h
+++ b/drivers/gpu/drm/xe/xe_hw_fence.h
@@ -17,8 +17,6 @@ void xe_hw_fence_module_exit(void);
 void xe_hw_fence_irq_init(struct xe_hw_fence_irq *irq);
 void xe_hw_fence_irq_finish(struct xe_hw_fence_irq *irq);
 void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq);
-void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq);
-void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq);
 
 void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
                          struct xe_hw_fence_irq *irq, const char *name);