drm/xe: Track LR jobs in DRM scheduler pending list

author Matthew Brost <matthew.brost@intel.com>

Wed, 8 Oct 2025 21:45:02 +0000 (14:45 -0700)

committer Matthew Brost <matthew.brost@intel.com>

Thu, 9 Oct 2025 10:22:19 +0000 (03:22 -0700)
author Matthew Brost <matthew.brost@intel.com>
Wed, 8 Oct 2025 21:45:02 +0000 (14:45 -0700)
committer Matthew Brost <matthew.brost@intel.com>
Thu, 9 Oct 2025 10:22:19 +0000 (03:22 -0700)
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c

index 83897950f0da3e0d976c6e6f68a45242bf78ab7b..0dc27476832b5d1dec6a99786d428db1615b71d1 100644 (file)
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -124,7 +124,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
         struct xe_validation_ctx ctx;
         struct xe_sched_job *job;
         struct xe_vm *vm;
-       bool write_locked, skip_retry = false;
+       bool write_locked;
         int err = 0;
         struct xe_hw_engine_group *group;
         enum xe_hw_engine_group_execution_mode mode, previous_mode;
@@ -266,12 +266,6 @@ retry:
                 goto err_exec;
         }
  
-       if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) {
-               err = -EWOULDBLOCK;     /* Aliased to -EAGAIN */
-               skip_retry = true;
-               goto err_exec;
-       }
-
         if (xe_exec_queue_uses_pxp(q)) {
                 err = xe_vm_validate_protected(q->vm);
                 if (err)
@@ -328,8 +322,6 @@ retry:
                 xe_sched_job_init_user_fence(job, &syncs[i]);
         }
  
-       if (xe_exec_queue_is_lr(q))
-               q->ring_ops->emit_job(job);
         if (!xe_vm_in_lr_mode(vm))
                 xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
         xe_sched_job_push(job);
@@ -355,7 +347,7 @@ err_exec:
                 xe_validation_ctx_fini(&ctx);
  err_unlock_list:
         up_read(&vm->lock);
-       if (err == -EAGAIN && !skip_retry)
+       if (err == -EAGAIN)
                 goto retry;
  err_hw_exec_mode:
         if (mode == EXEC_MODE_DMA_FENCE)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c

index df82463b19f6cae0cafacca9453744cadbacc075..7621089a47fe107797b9195ec9f177c3cad59619 100644 (file)
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -850,25 +850,6 @@ bool xe_exec_queue_is_lr(struct xe_exec_queue *q)
                 !(q->flags & EXEC_QUEUE_FLAG_VM);
  }
  
-static s32 xe_exec_queue_num_job_inflight(struct xe_exec_queue *q)
-{
-       return q->lrc[0]->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc[0]) - 1;
-}
-
-/**
- * xe_exec_queue_ring_full() - Whether an exec_queue's ring is full
- * @q: The exec_queue
- *
- * Return: True if the exec_queue's ring is full, false otherwise.
- */
-bool xe_exec_queue_ring_full(struct xe_exec_queue *q)
-{
-       struct xe_lrc *lrc = q->lrc[0];
-       s32 max_job = lrc->ring.size / MAX_JOB_SIZE_BYTES;
-
-       return xe_exec_queue_num_job_inflight(q) >= max_job;
-}
-
  /**
   * xe_exec_queue_is_idle() - Whether an exec_queue is idle.
   * @q: The exec_queue
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h

index 8821ceb838d0b1adc872ad42b138b4a79961459b..a4dfbe858bda2e384bb1cfac01e6771a3a1f7506 100644 (file)
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -64,8 +64,6 @@ static inline bool xe_exec_queue_uses_pxp(struct xe_exec_queue *q)
  
  bool xe_exec_queue_is_lr(struct xe_exec_queue *q);
  
-bool xe_exec_queue_ring_full(struct xe_exec_queue *q);
-
  bool xe_exec_queue_is_idle(struct xe_exec_queue *q);
  
  void xe_exec_queue_kill(struct xe_exec_queue *q);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c

index 13746f32b231961ea377f127dcacbf9cc7fdca52..3a534d93505f016e919b0619556de0057078f952 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -851,30 +851,31 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
         struct xe_sched_job *job = to_xe_sched_job(drm_job);
         struct xe_exec_queue *q = job->q;
         struct xe_guc *guc = exec_queue_to_guc(q);
-       struct dma_fence *fence = NULL;
-       bool lr = xe_exec_queue_is_lr(q);
+       bool lr = xe_exec_queue_is_lr(q), killed_or_banned_or_wedged =
+               exec_queue_killed_or_banned_or_wedged(q);
  
         xe_gt_assert(guc_to_gt(guc), !(exec_queue_destroyed(q) || exec_queue_pending_disable(q)) ||
                      exec_queue_banned(q) || exec_queue_suspended(q));
  
         trace_xe_sched_job_run(job);
  
-       if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
+       if (!killed_or_banned_or_wedged && !xe_sched_job_is_error(job)) {
                 if (!exec_queue_registered(q))
                         register_exec_queue(q, GUC_CONTEXT_NORMAL);
-               if (!lr)        /* LR jobs are emitted in the exec IOCTL */
-                       q->ring_ops->emit_job(job);
+               q->ring_ops->emit_job(job);
                 submit_exec_queue(q);
         }
  
-       if (lr) {
-               xe_sched_job_set_error(job, -EOPNOTSUPP);
-               dma_fence_put(job->fence);      /* Drop ref from xe_sched_job_arm */
-       } else {
-               fence = job->fence;
-       }
+       /*
+        * We don't care about job-fence ordering in LR VMs because these fences
+        * are never exported; they are used solely to keep jobs on the pending
+        * list. Once a queue enters an error state, there's no need to track
+        * them.
+        */
+       if (killed_or_banned_or_wedged && lr)
+               xe_sched_job_set_error(job, -ECANCELED);
  
-       return fence;
+       return job->fence;
  }
  
  static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
@@ -916,7 +917,8 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
                 xe_gt_warn(q->gt, "Pending enable/disable failed to respond\n");
                 xe_sched_submission_start(sched);
                 xe_gt_reset_async(q->gt);
-               xe_sched_tdr_queue_imm(sched);
+               if (!xe_exec_queue_is_lr(q))
+                       xe_sched_tdr_queue_imm(sched);
                 return;
         }
  
@@ -1008,6 +1010,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
         struct xe_exec_queue *q = ge->q;
         struct xe_guc *guc = exec_queue_to_guc(q);
         struct xe_gpu_scheduler *sched = &ge->sched;
+       struct xe_sched_job *job;
         bool wedged = false;
  
         xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
@@ -1058,7 +1061,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
         if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
                 xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
  
+       xe_hw_fence_irq_stop(q->fence_irq);
+
         xe_sched_submission_start(sched);
+
+       spin_lock(&sched->base.job_list_lock);
+       list_for_each_entry(job, &sched->base.pending_list, drm.list)
+               xe_sched_job_set_error(job, -ECANCELED);
+       spin_unlock(&sched->base.job_list_lock);
+
+       xe_hw_fence_irq_start(q->fence_irq);
  }
  
  #define ADJUST_FIVE_PERCENT(__t)       mul_u64_u32_div(__t, 105, 100)
@@ -1129,7 +1141,8 @@ static void enable_scheduling(struct xe_exec_queue *q)
                 xe_gt_warn(guc_to_gt(guc), "Schedule enable failed to respond");
                 set_exec_queue_banned(q);
                 xe_gt_reset_async(q->gt);
-               xe_sched_tdr_queue_imm(&q->guc->sched);
+               if (!xe_exec_queue_is_lr(q))
+                       xe_sched_tdr_queue_imm(&q->guc->sched);
         }
  }
  
@@ -1187,6 +1200,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
         int i = 0;
         bool wedged = false, skip_timeout_check;
  
+       xe_gt_assert(guc_to_gt(guc), !xe_exec_queue_is_lr(q));
+
         /*
          * TDR has fired before free job worker. Common if exec queue
          * immediately closed after last fence signaled. Add back to pending
author	Matthew Brost <matthew.brost@intel.com>
	Wed, 8 Oct 2025 21:45:02 +0000 (14:45 -0700)
committer	Matthew Brost <matthew.brost@intel.com>
	Thu, 9 Oct 2025 10:22:19 +0000 (03:22 -0700)
drivers/gpu/drm/xe/xe_exec.c		patch | blob | blame | history
drivers/gpu/drm/xe/xe_exec_queue.c		patch | blob | blame | history
drivers/gpu/drm/xe/xe_exec_queue.h		patch | blob | blame | history
drivers/gpu/drm/xe/xe_guc_submit.c		patch | blob | blame | history