drm/xe/vf: Pause submissions during RESFIX fixups

author Tomasz Lis <tomasz.lis@intel.com>

Sat, 2 Aug 2025 03:10:39 +0000 (05:10 +0200)

committer Michał Winiarski <michal.winiarski@intel.com>

Mon, 4 Aug 2025 14:46:25 +0000 (16:46 +0200)
author Tomasz Lis <tomasz.lis@intel.com>
Sat, 2 Aug 2025 03:10:39 +0000 (05:10 +0200)
committer Michał Winiarski <michal.winiarski@intel.com>
Mon, 4 Aug 2025 14:46:25 +0000 (16:46 +0200)
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c b/drivers/gpu/drm/xe/xe_gpu_scheduler.c

index 869b43a4151d2ac7c3be50f64b096b67d66d09c5..455ccaf173147f2ce409a830f90109f175eaddfe 100644 (file)
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
@@ -101,6 +101,19 @@ void xe_sched_submission_stop(struct xe_gpu_scheduler *sched)
         cancel_work_sync(&sched->work_process_msg);
  }
  
+/**
+ * xe_sched_submission_stop_async - Stop further runs of submission tasks on a scheduler.
+ * @sched: the &xe_gpu_scheduler struct instance
+ *
+ * This call disables further runs of the scheduling work queue. It does not
+ * wait for any in-progress runs to finish, only makes sure no further runs
+ * happen afterwards.
+ */
+void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched)
+{
+       drm_sched_wqueue_stop(&sched->base);
+}
+
  void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched)
  {
         drm_sched_resume_timeout(&sched->base, sched->base.timeout);
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h

index 308061f0cf372aaffaaf6df342ee7a431f3e3bf1..e548b2aed95a37e48f1abd33818cc5e94e45966b 100644 (file)
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
@@ -21,6 +21,7 @@ void xe_sched_fini(struct xe_gpu_scheduler *sched);
  
  void xe_sched_submission_start(struct xe_gpu_scheduler *sched);
  void xe_sched_submission_stop(struct xe_gpu_scheduler *sched);
+void xe_sched_submission_stop_async(struct xe_gpu_scheduler *sched);
  
  void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched);
  
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c

index 679ae229cc827e18f88867f03c96086e2a49c66a..ebc137be4de20420bb6e62909a64c0d558ce66c6 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1836,6 +1836,19 @@ void xe_guc_submit_stop(struct xe_guc *guc)
  
  }
  
+/**
+ * xe_guc_submit_pause - Stop further runs of submission tasks on the given GuC.
+ * @guc: the &xe_guc struct instance whose exec queue schedulers are to be stopped
+ */
+void xe_guc_submit_pause(struct xe_guc *guc)
+{
+       struct xe_exec_queue *q;
+       unsigned long index;
+
+       xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+               xe_sched_submission_stop_async(&q->guc->sched);
+}
+
  static void guc_exec_queue_start(struct xe_exec_queue *q)
  {
         struct xe_gpu_scheduler *sched = &q->guc->sched;
@@ -1876,6 +1889,28 @@ int xe_guc_submit_start(struct xe_guc *guc)
         return 0;
  }
  
+static void guc_exec_queue_unpause(struct xe_exec_queue *q)
+{
+       struct xe_gpu_scheduler *sched = &q->guc->sched;
+
+       xe_sched_submission_start(sched); /* pairs with the stop in xe_guc_submit_pause() */
+}
+
+/**
+ * xe_guc_submit_unpause - Allow further runs of submission tasks on the given GuC.
+ * @guc: the &xe_guc struct instance whose exec queue schedulers are to be restarted
+ */
+void xe_guc_submit_unpause(struct xe_guc *guc)
+{
+       struct xe_exec_queue *q;
+       unsigned long index;
+
+       xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+               guc_exec_queue_unpause(q);
+
+       wake_up_all(&guc->ct.wq);
+}
+
  static struct xe_exec_queue *
  g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id)
  {
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h

index 8f64e799283bef38a61d7c20209f32121efb5f94..ff44500f3da2f823cb9bed7e02ca78311cb387a3 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -18,6 +18,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
  void xe_guc_submit_reset_wait(struct xe_guc *guc);
  void xe_guc_submit_stop(struct xe_guc *guc);
  int xe_guc_submit_start(struct xe_guc *guc);
+void xe_guc_submit_pause(struct xe_guc *guc);
+void xe_guc_submit_unpause(struct xe_guc *guc);
  void xe_guc_submit_wedge(struct xe_guc *guc);
  
  int xe_guc_read_stopped(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c

index 26e243c28994d746e58b655808911a53297a78a7..c66b17da1ce7aeeb5a7f43d5b2211b0ce186c85e 100644 (file)
--- a/drivers/gpu/drm/xe/xe_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
@@ -11,6 +11,8 @@
  #include "xe_gt_sriov_printk.h"
  #include "xe_gt_sriov_vf.h"
  #include "xe_guc_ct.h"
+#include "xe_guc_submit.h"
+#include "xe_irq.h"
  #include "xe_pm.h"
  #include "xe_sriov.h"
  #include "xe_sriov_printk.h"
@@ -147,6 +149,48 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
                 xe_sriov_info(xe, "migration not supported by this module version\n");
  }
  
+/**
+ * vf_post_migration_shutdown - Stop the driver activities after VF migration.
+ * @xe: the &xe_device struct instance
+ *
+ * After this VM is migrated and assigned to a new VF, it is running on new
+ * hardware, and therefore many hardware-dependent states and related structures
+ * require fixups. Without fixups, the hardware cannot do any work, and therefore
+ * all GPU pipelines are stalled.
+ * Stop some kernel activities to make the fixup process faster.
+ */
+static void vf_post_migration_shutdown(struct xe_device *xe)
+{
+       struct xe_gt *gt;
+       unsigned int id;
+
+       for_each_gt(gt, xe, id)
+               xe_guc_submit_pause(&gt->uc.guc);
+}
+
+/**
+ * vf_post_migration_kickstart - Re-start the driver activities on new hardware.
+ * @xe: the &xe_device struct instance
+ *
+ * After we have finished with all post-migration fixups, restart the driver
+ * activities to continue feeding the GPU with workloads.
+ */
+static void vf_post_migration_kickstart(struct xe_device *xe)
+{
+       struct xe_gt *gt;
+       unsigned int id;
+
+       /*
+        * Make sure interrupts on the new HW are properly set. The GuC IRQ
+        * must be working at this point, since the recovery has started,
+        * but the rest was not enabled using the procedure from the spec.
+        */
+       xe_irq_resume(xe);
+
+       for_each_gt(gt, xe, id)
+               xe_guc_submit_unpause(&gt->uc.guc);
+}
+
  static bool gt_vf_post_migration_needed(struct xe_gt *gt)
  {
         return test_bit(gt->info.id, &gt_to_xe(gt)->sriov.vf.migration.gt_flags);
@@ -230,6 +274,7 @@ static void vf_post_migration_recovery(struct xe_device *xe)
  
         drm_dbg(&xe->drm, "migration recovery in progress\n");
         xe_pm_runtime_get(xe);
+       vf_post_migration_shutdown(xe);
  
         if (!vf_migration_supported(xe)) {
                 xe_sriov_err(xe, "migration not supported by this module version\n");
@@ -247,6 +292,7 @@ static void vf_post_migration_recovery(struct xe_device *xe)
                 set_bit(id, &fixed_gts);
         }
  
+       vf_post_migration_kickstart(xe);
         err = vf_post_migration_notify_resfix_done(xe, fixed_gts);
         if (err)
                 goto fail;
author	Tomasz Lis <tomasz.lis@intel.com>
	Sat, 2 Aug 2025 03:10:39 +0000 (05:10 +0200)
committer	Michał Winiarski <michal.winiarski@intel.com>
	Mon, 4 Aug 2025 14:46:25 +0000 (16:46 +0200)
drivers/gpu/drm/xe/xe_gpu_scheduler.c		patch \| blob \| blame \| history
drivers/gpu/drm/xe/xe_gpu_scheduler.h		patch \| blob \| blame \| history
drivers/gpu/drm/xe/xe_guc_submit.c		patch \| blob \| blame \| history
drivers/gpu/drm/xe/xe_guc_submit.h		patch \| blob \| blame \| history
drivers/gpu/drm/xe/xe_sriov_vf.c		patch \| blob \| blame \| history