drm/xe: Stop abusing DRM scheduler internals
author    Matthew Brost <matthew.brost@intel.com>
          Sat, 10 Jan 2026 01:27:34 +0000 (17:27 -0800)
committer Matthew Brost <matthew.brost@intel.com>
          Sat, 10 Jan 2026 21:39:50 +0000 (13:39 -0800)
Use the new pending job list iterator and the new helper functions in Xe
to avoid reaching into DRM scheduler internals.
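
A minimal sketch of the resulting pattern (helper prototypes assumed from
the preceding scheduler patch in this series; the wrapper function below is
hypothetical and only illustrates the iteration style): drm_sched_is_stopped()
replaces direct reads of sched->base.pause_submit, while
drm_sched_for_each_pending_job() and drm_sched_job_is_signaled() replace
open-coded walks of sched->base.pending_list under job_list_lock.

    /*
     * Hypothetical example: mark every unsignaled pending job as
     * cancelled without touching pending_list or job_list_lock.
     */
    static void example_cancel_unsignaled(struct xe_gpu_scheduler *sched)
    {
            struct drm_sched_job *s_job;

            drm_sched_for_each_pending_job(s_job, &sched->base, NULL)
                    if (!drm_sched_job_is_signaled(s_job))
                            xe_sched_job_set_error(to_xe_sched_job(s_job),
                                                   -ECANCELED);
    }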

Part of this change removes the pending-job debug information from
debugfs and devcoredump. As agreed, the pending job list should
only be accessed when the scheduler is stopped. However, it's not
straightforward to determine whether the scheduler is stopped from the
shared debugfs/devcoredump code path. Additionally, the pending job list
provides little useful information, as pending jobs can be inferred from
seqnos and ring head/tail positions. Therefore, this debug information
is being removed.

v4:
 - Add comment around DRM_GPU_SCHED_STAT_NO_HANG (Niranjana)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Link: https://patch.msgid.link/20260110012739.2888434-3-matthew.brost@intel.com
drivers/gpu/drm/xe/xe_gpu_scheduler.c
drivers/gpu/drm/xe/xe_gpu_scheduler.h
drivers/gpu/drm/xe/xe_guc_submit.c
drivers/gpu/drm/xe/xe_guc_submit_types.h
drivers/gpu/drm/xe/xe_hw_fence.c
drivers/gpu/drm/xe/xe_hw_fence.h

diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
index f4f23317191ff1811b9e4525aedfcd266fa50f63..9c8004d5dd915f7f56473fb5b7570d51baf30f94 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
@@ -7,7 +7,7 @@
 
 static void xe_sched_process_msg_queue(struct xe_gpu_scheduler *sched)
 {
-       if (!READ_ONCE(sched->base.pause_submit))
+       if (!drm_sched_is_stopped(&sched->base))
                queue_work(sched->base.submit_wq, &sched->work_process_msg);
 }
 
@@ -43,7 +43,7 @@ static void xe_sched_process_msg_work(struct work_struct *w)
                container_of(w, struct xe_gpu_scheduler, work_process_msg);
        struct xe_sched_msg *msg;
 
-       if (READ_ONCE(sched->base.pause_submit))
+       if (drm_sched_is_stopped(&sched->base))
                return;
 
        msg = xe_sched_get_msg(sched);
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
index dceb2cd0ee5bde6727f744e461db284ddef86fa7..664c2db56af3d5cb97dacac6a6cf02d91142fdaa 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
@@ -56,12 +56,9 @@ static inline void xe_sched_resubmit_jobs(struct xe_gpu_scheduler *sched)
        struct drm_sched_job *s_job;
        bool restore_replay = false;
 
-       list_for_each_entry(s_job, &sched->base.pending_list, list) {
-               struct drm_sched_fence *s_fence = s_job->s_fence;
-               struct dma_fence *hw_fence = s_fence->parent;
-
+       drm_sched_for_each_pending_job(s_job, &sched->base, NULL) {
                restore_replay |= to_xe_sched_job(s_job)->restore_replay;
-               if (restore_replay || (hw_fence && !dma_fence_is_signaled(hw_fence)))
+               if (restore_replay || !drm_sched_job_is_signaled(s_job))
                        sched->base.ops->run_job(s_job);
        }
 }
@@ -72,14 +69,6 @@ xe_sched_invalidate_job(struct xe_sched_job *job, int threshold)
        return drm_sched_invalidate_job(&job->drm, threshold);
 }
 
-static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
-                                           struct xe_sched_job *job)
-{
-       spin_lock(&sched->base.job_list_lock);
-       list_add(&job->drm.list, &sched->base.pending_list);
-       spin_unlock(&sched->base.job_list_lock);
-}
-
 /**
  * xe_sched_first_pending_job() - Find first pending job which is unsignaled
  * @sched: Xe GPU scheduler
@@ -89,21 +78,13 @@ static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
 static inline
 struct xe_sched_job *xe_sched_first_pending_job(struct xe_gpu_scheduler *sched)
 {
-       struct xe_sched_job *job, *r_job = NULL;
-
-       spin_lock(&sched->base.job_list_lock);
-       list_for_each_entry(job, &sched->base.pending_list, drm.list) {
-               struct drm_sched_fence *s_fence = job->drm.s_fence;
-               struct dma_fence *hw_fence = s_fence->parent;
+       struct drm_sched_job *job;
 
-               if (hw_fence && !dma_fence_is_signaled(hw_fence)) {
-                       r_job = job;
-                       break;
-               }
-       }
-       spin_unlock(&sched->base.job_list_lock);
+       drm_sched_for_each_pending_job(job, &sched->base, NULL)
+               if (!drm_sched_job_is_signaled(job))
+                       return to_xe_sched_job(job);
 
-       return r_job;
+       return NULL;
 }
 
 static inline int
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 45aa56b30a127a35381089f50d3923652f9603a6..b7824834ff337717125c31517bb56872ac32b096 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1374,7 +1374,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
        struct xe_exec_queue *q = ge->q;
        struct xe_guc *guc = exec_queue_to_guc(q);
        struct xe_gpu_scheduler *sched = &ge->sched;
-       struct xe_sched_job *job;
+       struct drm_sched_job *job;
        bool wedged = false;
 
        xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
@@ -1433,16 +1433,10 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
        if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
                xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
 
-       xe_hw_fence_irq_stop(q->fence_irq);
+       drm_sched_for_each_pending_job(job, &sched->base, NULL)
+               xe_sched_job_set_error(to_xe_sched_job(job), -ECANCELED);
 
        xe_sched_submission_start(sched);
-
-       spin_lock(&sched->base.job_list_lock);
-       list_for_each_entry(job, &sched->base.pending_list, drm.list)
-               xe_sched_job_set_error(job, -ECANCELED);
-       spin_unlock(&sched->base.job_list_lock);
-
-       xe_hw_fence_irq_start(q->fence_irq);
 }
 
 #define ADJUST_FIVE_PERCENT(__t)       mul_u64_u32_div(__t, 105, 100)
@@ -1570,7 +1564,7 @@ static enum drm_gpu_sched_stat
 guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 {
        struct xe_sched_job *job = to_xe_sched_job(drm_job);
-       struct xe_sched_job *tmp_job;
+       struct drm_sched_job *tmp_job;
        struct xe_exec_queue *q = job->q;
        struct xe_gpu_scheduler *sched = &q->guc->sched;
        struct xe_guc *guc = exec_queue_to_guc(q);
@@ -1578,7 +1572,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
        struct xe_device *xe = guc_to_xe(guc);
        int err = -ETIME;
        pid_t pid = -1;
-       int i = 0;
        bool wedged = false, skip_timeout_check;
 
        xe_gt_assert(guc_to_gt(guc), !xe_exec_queue_is_lr(q));
@@ -1756,14 +1749,11 @@ trigger_reset:
                __deregister_exec_queue(guc, q);
        }
 
-       /* Stop fence signaling */
-       xe_hw_fence_irq_stop(q->fence_irq);
+       /* Mark all outstanding jobs as bad, thus completing them */
+       xe_sched_job_set_error(job, err);
+       drm_sched_for_each_pending_job(tmp_job, &sched->base, NULL)
+               xe_sched_job_set_error(to_xe_sched_job(tmp_job), -ECANCELED);
 
-       /*
-        * Fence state now stable, stop / start scheduler which cleans up any
-        * fences that are complete
-        */
-       xe_sched_add_pending_job(sched, job);
        xe_sched_submission_start(sched);
 
        if (xe_exec_queue_is_multi_queue(q))
@@ -1771,16 +1761,11 @@ trigger_reset:
        else
                xe_guc_exec_queue_trigger_cleanup(q);
 
-       /* Mark all outstanding jobs as bad, thus completing them */
-       spin_lock(&sched->base.job_list_lock);
-       list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
-               xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
-       spin_unlock(&sched->base.job_list_lock);
-
-       /* Start fence signaling */
-       xe_hw_fence_irq_start(q->fence_irq);
-
-       return DRM_GPU_SCHED_STAT_RESET;
+       /*
+        * We want the job added back to the pending list so it gets freed; this
+        * is what DRM_GPU_SCHED_STAT_NO_HANG does.
+        */
+       return DRM_GPU_SCHED_STAT_NO_HANG;
 
 sched_enable:
        set_exec_queue_pending_tdr_exit(q);
@@ -2754,11 +2739,12 @@ static void guc_exec_queue_unpause_prepare(struct xe_guc *guc,
                                           struct xe_exec_queue *q)
 {
        struct xe_gpu_scheduler *sched = &q->guc->sched;
-       struct xe_sched_job *job = NULL, *__job;
+       struct xe_sched_job *job = NULL;
+       struct drm_sched_job *s_job;
        bool restore_replay = false;
 
-       list_for_each_entry(__job, &sched->base.pending_list, drm.list) {
-               job = __job;
+       drm_sched_for_each_pending_job(s_job, &sched->base, NULL) {
+               job = to_xe_sched_job(s_job);
                restore_replay |= job->restore_replay;
                if (restore_replay) {
                        xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d",
@@ -2882,7 +2868,7 @@ void xe_guc_submit_unpause_vf(struct xe_guc *guc)
                 * created after resfix done.
                 */
                if (q->guc->id != index ||
-                   !READ_ONCE(q->guc->sched.base.pause_submit))
+                   !drm_sched_is_stopped(&q->guc->sched.base))
                        continue;
 
                guc_exec_queue_unpause(guc, q);
@@ -3387,29 +3373,6 @@ xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
                snapshot->multi_queue.primary = xe_exec_queue_multi_queue_primary(q)->guc->id;
                snapshot->multi_queue.pos = q->multi_queue.pos;
        }
-       spin_lock(&sched->base.job_list_lock);
-       snapshot->pending_list_size = list_count_nodes(&sched->base.pending_list);
-       snapshot->pending_list = kmalloc_array(snapshot->pending_list_size,
-                                              sizeof(struct pending_list_snapshot),
-                                              GFP_ATOMIC);
-
-       if (snapshot->pending_list) {
-               struct xe_sched_job *job_iter;
-
-               i = 0;
-               list_for_each_entry(job_iter, &sched->base.pending_list, drm.list) {
-                       snapshot->pending_list[i].seqno =
-                               xe_sched_job_seqno(job_iter);
-                       snapshot->pending_list[i].fence =
-                               dma_fence_is_signaled(job_iter->fence) ? 1 : 0;
-                       snapshot->pending_list[i].finished =
-                               dma_fence_is_signaled(&job_iter->drm.s_fence->finished)
-                               ? 1 : 0;
-                       i++;
-               }
-       }
-
-       spin_unlock(&sched->base.job_list_lock);
 
        return snapshot;
 }
@@ -3473,13 +3436,6 @@ xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
                drm_printf(p, "\tMulti queue primary GuC ID: %d\n", snapshot->multi_queue.primary);
                drm_printf(p, "\tMulti queue position: %d\n", snapshot->multi_queue.pos);
        }
-
-       for (i = 0; snapshot->pending_list && i < snapshot->pending_list_size;
-            i++)
-               drm_printf(p, "\tJob: seqno=%d, fence=%d, finished=%d\n",
-                          snapshot->pending_list[i].seqno,
-                          snapshot->pending_list[i].fence,
-                          snapshot->pending_list[i].finished);
 }
 
 /**
@@ -3502,7 +3458,6 @@ void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *s
                        xe_lrc_snapshot_free(snapshot->lrc[i]);
                kfree(snapshot->lrc);
        }
-       kfree(snapshot->pending_list);
        kfree(snapshot);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_guc_submit_types.h b/drivers/gpu/drm/xe/xe_guc_submit_types.h
index 25e29e85502cbc3fd1ebe93e41314133567bf56a..5ccc5f959bb3a6b844fccd4c85a97c94cebb5031 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit_types.h
@@ -61,12 +61,6 @@ struct guc_submit_parallel_scratch {
        u32 wq[WQ_SIZE / sizeof(u32)];
 };
 
-struct pending_list_snapshot {
-       u32 seqno;
-       bool fence;
-       bool finished;
-};
-
 /**
  * struct xe_guc_submit_exec_queue_snapshot - Snapshot for devcoredump
  */
@@ -147,11 +141,6 @@ struct xe_guc_submit_exec_queue_snapshot {
                /** @valid: The exec queue is part of a multi queue group */
                bool valid;
        } multi_queue;
-
-       /** @pending_list_size: Size of the pending list snapshot array */
-       int pending_list_size;
-       /** @pending_list: snapshot of the pending list info */
-       struct pending_list_snapshot *pending_list;
 };
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c
index f6057456e4609345775996419c7bb9e91a1480c3..6b5bc67767d3e5f5250d88045335c666d634ba15 100644
--- a/drivers/gpu/drm/xe/xe_hw_fence.c
+++ b/drivers/gpu/drm/xe/xe_hw_fence.c
@@ -108,22 +108,6 @@ void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq)
        irq_work_queue(&irq->work);
 }
 
-void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq)
-{
-       spin_lock_irq(&irq->lock);
-       irq->enabled = false;
-       spin_unlock_irq(&irq->lock);
-}
-
-void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq)
-{
-       spin_lock_irq(&irq->lock);
-       irq->enabled = true;
-       spin_unlock_irq(&irq->lock);
-
-       irq_work_queue(&irq->work);
-}
-
 void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
                          struct xe_hw_fence_irq *irq, const char *name)
 {
diff --git a/drivers/gpu/drm/xe/xe_hw_fence.h b/drivers/gpu/drm/xe/xe_hw_fence.h
index f13a1c4982c7306dfced818c664082f783faf337..599492c13f8004deac10a5cb9ec86ad2dca746c7 100644
--- a/drivers/gpu/drm/xe/xe_hw_fence.h
+++ b/drivers/gpu/drm/xe/xe_hw_fence.h
@@ -17,8 +17,6 @@ void xe_hw_fence_module_exit(void);
 void xe_hw_fence_irq_init(struct xe_hw_fence_irq *irq);
 void xe_hw_fence_irq_finish(struct xe_hw_fence_irq *irq);
 void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq);
-void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq);
-void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq);
 
 void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
                          struct xe_hw_fence_irq *irq, const char *name);