A queue must be in the submission backend's tracking state before the
LRC is created to avoid a race condition where the LRC's GGTT addresses
are not properly fixed up during VF post-migration recovery.
Move the queue initialization, which adds the queue to the submission
backend's tracking state, before LRC creation.
Also wait for pending GGTT fixups to complete before allocating LRCs, so
that stale GGTT addresses are not written into them.
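
As a rough illustration (simplified; error handling and the WRITE_ONCE
publication of q->lrc[i] are elided), the resulting order in
__xe_exec_queue_init() becomes:

	err = q->ops->init(q);	/* queue now tracked by the backend */
	if (err)
		return err;

	for (i = 0; i < q->width; ++i) {
		/* don't write stale GGTT addresses into a new LRC */
		xe_gt_sriov_vf_wait_valid_ggtt(q->gt);
		q->lrc[i] = xe_lrc_create(q->hwe, q->vm, xe_lrc_ring_size(),
					  q->msix_vec, flags);
	}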
v2:
- Wait on VF GGTT fixes before creating LRC (testing)
v5:
- Adjust comment in code (Tomasz)
- Reduce race window
v7:
- Only wakeup waiters in recovery path (CI)
- Wakeup waiters on abort
- Use GT warn on (Michal)
- Fix kernel doc for LRC ring size function (Tomasz)
v8:
- Guard against migration not supported or no memirq (CI)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-28-matthew.brost@intel.com
#include "xe_dep_scheduler.h"
#include "xe_device.h"
#include "xe_gt.h"
+#include "xe_gt_sriov_vf.h"
#include "xe_hw_engine_class_sysfs.h"
#include "xe_hw_engine_group.h"
#include "xe_hw_fence.h"
if (!(exec_queue_flags & EXEC_QUEUE_FLAG_KERNEL))
flags |= XE_LRC_CREATE_USER_CTX;
+ err = q->ops->init(q);
+ if (err)
+ return err;
+
+ /*
+ * This must occur after q->ops->init to avoid race conditions during VF
+ * post-migration recovery, as the fixups for the LRC GGTT addresses
+ * depend on the queue being present in the backend tracking structure.
+ *
+ * In addition to the above, we must wait on inflight GGTT changes to
+ * avoid writing out stale values here. Such a wait is race-free only if
+ * this function can detect migration immediately from the moment the
+ * vCPU resumes execution.
+ */
for (i = 0; i < q->width; ++i) {
- q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K, q->msix_vec, flags);
- if (IS_ERR(q->lrc[i])) {
- err = PTR_ERR(q->lrc[i]);
+ struct xe_lrc *lrc;
+
+ xe_gt_sriov_vf_wait_valid_ggtt(q->gt);
+ lrc = xe_lrc_create(q->hwe, q->vm, xe_lrc_ring_size(),
+ q->msix_vec, flags);
+ if (IS_ERR(lrc)) {
+ err = PTR_ERR(lrc);
goto err_lrc;
}
- }
- err = q->ops->init(q);
- if (err)
- goto err_lrc;
+ /* Pairs with READ_ONCE in xe_exec_queue_contexts_hwsp_rebase */
+ WRITE_ONCE(q->lrc[i], lrc);
+ }
return 0;
int err = 0;
for (i = 0; i < q->width; ++i) {
- xe_lrc_update_memirq_regs_with_address(q->lrc[i], q->hwe, scratch);
- xe_lrc_update_hwctx_regs_with_address(q->lrc[i]);
- err = xe_lrc_setup_wa_bb_with_scratch(q->lrc[i], q->hwe, scratch);
+ struct xe_lrc *lrc;
+
+ /* Pairs with WRITE_ONCE in __xe_exec_queue_init */
+ lrc = READ_ONCE(q->lrc[i]);
+ if (!lrc)
+ continue;
+
+ xe_lrc_update_memirq_regs_with_address(lrc, q->hwe, scratch);
+ xe_lrc_update_hwctx_regs_with_address(lrc);
+ err = xe_lrc_setup_wa_bb_with_scratch(lrc, q->hwe, scratch);
if (err)
break;
}
const struct drm_sched_init_args args = {
.ops = &drm_sched_ops,
.num_rqs = 1,
- .credit_limit = q->lrc[0]->ring.size / MAX_JOB_SIZE_BYTES,
+ .credit_limit = xe_lrc_ring_size() / MAX_JOB_SIZE_BYTES,
.hang_limit = XE_SCHED_HANG_LIMIT,
.timeout = XE_SCHED_JOB_TIMEOUT,
.name = q->hwe->name,
xe_tile_sriov_vf_fixup_ggtt_nodes_locked(gt_to_tile(gt), shift);
}
+ if (xe_sriov_vf_migration_supported(gt_to_xe(gt))) {
+ WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, false);
+ smp_wmb(); /* Ensure above write visible before wake */
+ wake_up_all(&gt->sriov.vf.migration.wq);
+ }
+
return 0;
}
!gt->sriov.vf.migration.recovery_teardown) {
gt->sriov.vf.migration.recovery_queued = true;
WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true);
- smp_wmb(); /* Ensure above write visable before wake */
+ WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, true);
+ smp_wmb(); /* Ensure above writes visible before wake */
xe_guc_ct_wake_waiters(&gt->uc.guc.ct);
{
spin_lock_irq(&gt->sriov.vf.migration.lock);
WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
+ WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, false);
spin_unlock_irq(&gt->sriov.vf.migration.lock);
+ wake_up_all(&gt->sriov.vf.migration.wq);
+
xe_guc_submit_pause_abort(&gt->uc.guc);
}
gt->sriov.vf.migration.scratch = buf;
spin_lock_init(&gt->sriov.vf.migration.lock);
INIT_WORK(&gt->sriov.vf.migration.worker, migration_worker_func);
+ init_waitqueue_head(&gt->sriov.vf.migration.wq);
return 0;
}
return READ_ONCE(gt->sriov.vf.migration.recovery_inprogress);
}
+
+static bool vf_valid_ggtt(struct xe_gt *gt)
+{
+ struct xe_memirq *memirq = &gt_to_tile(gt)->memirq;
+ bool irq_pending = xe_device_uses_memirq(gt_to_xe(gt)) &&
+ xe_memirq_guc_sw_int_0_irq_pending(memirq, &gt->uc.guc);
+
+ xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
+
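+ /*
+ * A pending GuC SW interrupt may be a migration notification that has
+ * not been processed yet, so GGTT fixes may still be required even
+ * though ggtt_need_fixes has not been set.
+ */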
+ if (irq_pending || READ_ONCE(gt->sriov.vf.migration.ggtt_need_fixes))
+ return false;
+
+ return true;
+}
+
+/**
+ * xe_gt_sriov_vf_wait_valid_ggtt() - VF wait for valid GGTT addresses
+ * @gt: the &xe_gt
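+ *
+ * Wait, with a timeout, until no VF post-migration GGTT fixups are
+ * pending, so that GGTT addresses written after this call are not stale.
+ * Does nothing if the device is not a VF or VF migration is not
+ * supported.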
+ */
+void xe_gt_sriov_vf_wait_valid_ggtt(struct xe_gt *gt)
+{
+ int ret;
+
+ if (!IS_SRIOV_VF(gt_to_xe(gt)) ||
+ !xe_sriov_vf_migration_supported(gt_to_xe(gt)))
+ return;
+
+ ret = wait_event_interruptible_timeout(gt->sriov.vf.migration.wq,
+ vf_valid_ggtt(gt),
+ HZ * 5);
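+ /* ret == 0 means the wait timed out while GGTT fixes were still pending */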
+ xe_gt_WARN_ON(gt, !ret);
+}
void xe_gt_sriov_vf_print_runtime(struct xe_gt *gt, struct drm_printer *p);
void xe_gt_sriov_vf_print_version(struct xe_gt *gt, struct drm_printer *p);
+void xe_gt_sriov_vf_wait_valid_ggtt(struct xe_gt *gt);
+
#endif
#define _XE_GT_SRIOV_VF_TYPES_H_
#include <linux/types.h>
+#include <linux/wait.h>
#include <linux/workqueue.h>
#include "xe_uc_fw_types.h"
struct work_struct worker;
/** @lock: Protects recovery_queued, teardown */
spinlock_t lock;
+ /** @wq: wait queue for migration fixes */
+ wait_queue_head_t wq;
/** @scratch: Scratch memory for VF recovery */
void *scratch;
/** @recovery_teardown: VF post migration recovery is being torn down */
bool recovery_queued;
/** @recovery_inprogress: VF post migration recovery in progress */
bool recovery_inprogress;
+ /** @ggtt_need_fixes: VF GGTT addresses still need post-migration fixes */
+ bool ggtt_need_fixes;
};
/**
timeout = (q->vm && xe_vm_in_lr_mode(q->vm)) ? MAX_SCHEDULE_TIMEOUT :
msecs_to_jiffies(q->sched_props.job_timeout_ms);
err = xe_sched_init(&ge->sched, &drm_sched_ops, &xe_sched_ops,
- NULL, q->lrc[0]->ring.size / MAX_JOB_SIZE_BYTES, 64,
+ NULL, xe_lrc_ring_size() / MAX_JOB_SIZE_BYTES, 64,
timeout, guc_to_gt(guc)->ordered_wq, NULL,
q->name, gt_to_xe(q->gt)->drm.dev);
if (err)
kref_put(&lrc->refcount, xe_lrc_destroy);
}
+/**
+ * xe_lrc_ring_size() - Xe LRC ring size
+ *
+ * Return: Size of LRC ring buffer
+ */
+static inline size_t xe_lrc_ring_size(void)
+{
+ return SZ_16K;
+}
+
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class);
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc);
u32 xe_lrc_regs_offset(struct xe_lrc *lrc);