drm/xe/vf: Make multi-GT migration less error prone

author Tomasz Lis <tomasz.lis@intel.com>

Mon, 30 Jun 2025 15:21:55 +0000 (17:21 +0200)

committer Michał Winiarski <michal.winiarski@intel.com>

Thu, 3 Jul 2025 19:05:21 +0000 (21:05 +0200)
author Tomasz Lis <tomasz.lis@intel.com>
Mon, 30 Jun 2025 15:21:55 +0000 (17:21 +0200)
committer Michał Winiarski <michal.winiarski@intel.com>
Thu, 3 Jul 2025 19:05:21 +0000 (21:05 +0200)
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c

index 6526fe450e553d143ab7cf42636005be7e7aaf1e..26e243c28994d746e58b655808911a53297a78a7 100644 (file)
--- a/drivers/gpu/drm/xe/xe_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
@@ -147,127 +147,113 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
                 xe_sriov_info(xe, "migration not supported by this module version\n");
  }
  
-/**
- * vf_post_migration_requery_guc - Re-query GuC for current VF provisioning.
+static bool gt_vf_post_migration_needed(struct xe_gt *gt)
+{
+       return test_bit(gt->info.id, &gt_to_xe(gt)->sriov.vf.migration.gt_flags);
+}
+
+/*
+ * Notify the GuCs marked in @gt_flags that applying resource fixups has finished.
   * @xe: the &xe_device struct instance
- *
- * After migration, we need to re-query all VF configuration to make sure
- * they match previous provisioning. Note that most of VF provisioning
- * shall be the same, except GGTT range, since GGTT is not virtualized per-VF.
- *
- * Returns: 0 if the operation completed successfully, or a negative error
- * code otherwise.
+ * @gt_flags: flags marking to which GTs the notification shall be sent
   */
-static int vf_post_migration_requery_guc(struct xe_device *xe)
+static int vf_post_migration_notify_resfix_done(struct xe_device *xe, unsigned long gt_flags)
  {
         struct xe_gt *gt;
         unsigned int id;
-       int err, ret = 0;
+       int err = 0;
  
         for_each_gt(gt, xe, id) {
-               err = xe_gt_sriov_vf_query_config(gt);
-               ret = ret ?: err;
+               if (!test_bit(id, &gt_flags))
+                       continue;
+               /* skip asking GuC for RESFIX exit if a new recovery request has arrived */
+               if (gt_vf_post_migration_needed(gt))
+                       continue;
+               err = xe_gt_sriov_vf_notify_resfix_done(gt);
+               if (err)
+                       break;
+               clear_bit(id, &gt_flags);
         }
  
-       return ret;
+       if (gt_flags && !err)
+               drm_dbg(&xe->drm, "another recovery imminent, skipped some notifications\n");
+       return err;
  }
  
-static void vf_post_migration_fixup_ctb(struct xe_device *xe)
+static int vf_get_next_migrated_gt_id(struct xe_device *xe)
  {
         struct xe_gt *gt;
         unsigned int id;
  
-       xe_assert(xe, IS_SRIOV_VF(xe));
-
         for_each_gt(gt, xe, id) {
-               s32 shift = xe_gt_sriov_vf_ggtt_shift(gt);
-
-               xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, shift);
+               if (test_and_clear_bit(id, &xe->sriov.vf.migration.gt_flags))
+                       return id;
         }
+       return -1;
  }
  
-/*
- * vf_post_migration_imminent - Check if post-restore recovery is coming.
- * @xe: the &xe_device struct instance
+/*
+ * Perform post-migration fixups on a single GT.
   *
- * Return: True if migration recovery worker will soon be running. Any worker currently
- * executing does not affect the result.
+ * After migration, GuC needs to be re-queried for VF configuration to check
+ * if it matches previous provisioning. Most of VF provisioning shall be the
+ * same, except GGTT range, since GGTT is not virtualized per-VF. If GGTT
+ * range has changed, we have to perform fixups - shift all GGTT references
+ * used anywhere within the driver. After the fixups in this function succeed,
+ * it is allowed to ask the GuC bound to this GT to continue normal operation.
+ *
+ * Returns: 0 if the operation completed successfully, or a negative error
+ * code otherwise.
   */
-static bool vf_post_migration_imminent(struct xe_device *xe)
+static int gt_vf_post_migration_fixups(struct xe_gt *gt)
  {
-       return xe->sriov.vf.migration.gt_flags != 0 ||
-       work_pending(&xe->sriov.vf.migration.worker);
-}
-
-static bool vf_post_migration_fixup_ggtt_nodes(struct xe_device *xe)
-{
-       bool need_fixups = false;
-       struct xe_tile *tile;
-       unsigned int id;
-
-       for_each_tile(tile, xe, id) {
-               struct xe_gt *gt = tile->primary_gt;
-               s64 shift;
-
-               shift = xe_gt_sriov_vf_ggtt_shift(gt);
-               if (shift) {
-                       need_fixups = true;
-                       xe_tile_sriov_vf_fixup_ggtt_nodes(tile, shift);
-               }
-       }
-       return need_fixups;
-}
+       s64 shift;
+       int err;
  
-/*
- * Notify all GuCs about resource fixups apply finished.
- */
-static void vf_post_migration_notify_resfix_done(struct xe_device *xe)
-{
-       struct xe_gt *gt;
-       unsigned int id;
+       err = xe_gt_sriov_vf_query_config(gt);
+       if (err)
+               return err;
  
-       for_each_gt(gt, xe, id) {
-               if (vf_post_migration_imminent(xe))
-                       goto skip;
-               xe_gt_sriov_vf_notify_resfix_done(gt);
+       shift = xe_gt_sriov_vf_ggtt_shift(gt);
+       if (shift) {
+               xe_tile_sriov_vf_fixup_ggtt_nodes(gt_to_tile(gt), shift);
+               /* FIXME: add the recovery steps */
+               xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, shift);
         }
-       return;
-
-skip:
-       drm_dbg(&xe->drm, "another recovery imminent, skipping notifications\n");
+       return 0;
  }
  
  static void vf_post_migration_recovery(struct xe_device *xe)
  {
-       bool need_fixups;
-       int err;
+       unsigned long fixed_gts = 0;
+       int id, err;
  
         drm_dbg(&xe->drm, "migration recovery in progress\n");
         xe_pm_runtime_get(xe);
-       err = vf_post_migration_requery_guc(xe);
-       if (vf_post_migration_imminent(xe))
-               goto defer;
-       if (unlikely(err))
-               goto fail;
+
         if (!vf_migration_supported(xe)) {
                 xe_sriov_err(xe, "migration not supported by this module version\n");
                 err = -ENOTRECOVERABLE;
                 goto fail;
         }
  
-       need_fixups = vf_post_migration_fixup_ggtt_nodes(xe);
-       /* FIXME: add the recovery steps */
-       if (need_fixups)
-               vf_post_migration_fixup_ctb(xe);
+       while (id = vf_get_next_migrated_gt_id(xe), id >= 0) {
+               struct xe_gt *gt = xe_device_get_gt(xe, id);
+
+               err = gt_vf_post_migration_fixups(gt);
+               if (err)
+                       goto fail;
+
+               set_bit(id, &fixed_gts);
+       }
+
+       err = vf_post_migration_notify_resfix_done(xe, fixed_gts);
+       if (err)
+               goto fail;
  
-       vf_post_migration_notify_resfix_done(xe);
         xe_pm_runtime_put(xe);
         drm_notice(&xe->drm, "migration recovery ended\n");
         return;
-defer:
-       xe_pm_runtime_put(xe);
-       drm_dbg(&xe->drm, "migration recovery deferred\n");
-       return;
  fail:
         xe_pm_runtime_put(xe);
         drm_err(&xe->drm, "migration recovery failed (%pe)\n", ERR_PTR(err));
@@ -282,18 +268,23 @@ static void migration_worker_func(struct work_struct *w)
         vf_post_migration_recovery(xe);
  }
  
-static bool vf_ready_to_recovery_on_all_gts(struct xe_device *xe)
+/*
+ * Check if post-restore recovery is coming on any of the GTs.
+ * @xe: the &xe_device struct instance
+ *
+ * Return: True if migration recovery worker will soon be running. Any worker currently
+ * executing does not affect the result.
+ */
+static bool vf_ready_to_recovery_on_any_gts(struct xe_device *xe)
  {
         struct xe_gt *gt;
         unsigned int id;
  
         for_each_gt(gt, xe, id) {
-               if (!test_bit(id, &xe->sriov.vf.migration.gt_flags)) {
-                       xe_gt_sriov_dbg_verbose(gt, "still not ready to recover\n");
-                       return false;
-               }
+               if (test_bit(id, &xe->sriov.vf.migration.gt_flags))
+                       return true;
         }
-       return true;
+       return false;
  }
  
  /**
@@ -308,13 +299,9 @@ void xe_sriov_vf_start_migration_recovery(struct xe_device *xe)
  
         xe_assert(xe, IS_SRIOV_VF(xe));
  
-       if (!vf_ready_to_recovery_on_all_gts(xe))
+       if (!vf_ready_to_recovery_on_any_gts(xe))
                 return;
  
-       WRITE_ONCE(xe->sriov.vf.migration.gt_flags, 0);
-       /* Ensure other threads see that no flags are set now. */
-       smp_mb();
-
         started = queue_work(xe->sriov.wq, &xe->sriov.vf.migration.worker);
         drm_info(&xe->drm, "VF migration recovery %s\n", started ?
                  "scheduled" : "already in progress");
author	Tomasz Lis <tomasz.lis@intel.com>
	Mon, 30 Jun 2025 15:21:55 +0000 (17:21 +0200)
committer	Michał Winiarski <michal.winiarski@intel.com>
	Thu, 3 Jul 2025 19:05:21 +0000 (21:05 +0200)