git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/vf: Abort VF post migration recovery on failure
author Matthew Brost <matthew.brost@intel.com>
Wed, 8 Oct 2025 21:45:23 +0000 (14:45 -0700)
committer Matthew Brost <matthew.brost@intel.com>
Thu, 9 Oct 2025 10:22:49 +0000 (03:22 -0700)
If VF post-migration recovery fails, the device is wedged. However,
submission queues still need to be enabled for proper cleanup. In such
cases, call into the GuC submission backend to restart all queues that
were previously paused.

v3:
 - s/Avort/Abort (Tomasz)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-26-matthew.brost@intel.com
drivers/gpu/drm/xe/xe_gt_sriov_vf.c
drivers/gpu/drm/xe/xe_guc_submit.c
drivers/gpu/drm/xe/xe_guc_submit.h

index 321178b6022a66504e568f2d228b1ee69bdcaba1..3b6f56062e21b93bd92e3d602a0a9d579ba38711 100644 (file)
@@ -1144,6 +1144,15 @@ static void vf_post_migration_kickstart(struct xe_gt *gt)
        xe_guc_submit_unpause(&gt->uc.guc);
 }
 
+static void vf_post_migration_abort(struct xe_gt *gt)
+{
+       spin_lock_irq(&gt->sriov.vf.migration.lock);
+       WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
+       spin_unlock_irq(&gt->sriov.vf.migration.lock);
+
+       xe_guc_submit_pause_abort(&gt->uc.guc);
+}
+
 static int vf_post_migration_notify_resfix_done(struct xe_gt *gt)
 {
        bool skip_resfix = false;
@@ -1202,6 +1211,7 @@ static void vf_post_migration_recovery(struct xe_gt *gt)
        xe_gt_sriov_notice(gt, "migration recovery ended\n");
        return;
 fail:
+       vf_post_migration_abort(gt);
        xe_pm_runtime_put(xe);
        xe_gt_sriov_err(gt, "migration recovery failed (%pe)\n", ERR_PTR(err));
        xe_device_declare_wedged(xe);
index 7f0ea35f4f0a073cf98c1542efadcc9d5dfb8e63..be410a7126c7e208dc84a1d483c2d0cccc9d8caf 100644 (file)
@@ -2098,6 +2098,26 @@ void xe_guc_submit_unpause(struct xe_guc *guc)
        wake_up_all(&guc->ct.wq);
 }
 
+/**
+ * xe_guc_submit_pause_abort - Abort all paused submission task on given GuC.
+ * @guc: the &xe_guc struct instance whose scheduler is to be aborted
+ */
+void xe_guc_submit_pause_abort(struct xe_guc *guc)
+{
+       struct xe_exec_queue *q;
+       unsigned long index;
+
+       mutex_lock(&guc->submission_state.lock);
+       xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) {
+               struct xe_gpu_scheduler *sched = &q->guc->sched;
+
+               xe_sched_submission_start(sched);
+               if (exec_queue_killed_or_banned_or_wedged(q))
+                       xe_guc_exec_queue_trigger_cleanup(q);
+       }
+       mutex_unlock(&guc->submission_state.lock);
+}
+
 static struct xe_exec_queue *
 g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id)
 {
index f535fe3895e50c6589dc71a28ea058d685d1cb34..fe82c317048e70d17cb705322b730864415bdf41 100644 (file)
@@ -22,6 +22,7 @@ void xe_guc_submit_stop(struct xe_guc *guc);
 int xe_guc_submit_start(struct xe_guc *guc);
 void xe_guc_submit_pause(struct xe_guc *guc);
 void xe_guc_submit_unpause(struct xe_guc *guc);
+void xe_guc_submit_pause_abort(struct xe_guc *guc);
 void xe_guc_submit_wedge(struct xe_guc *guc);
 
 int xe_guc_read_stopped(struct xe_guc *guc);