]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/xe/vf: Add debugfs entries to test VF double migration
author: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Mon, 1 Dec 2025 09:50:16 +0000 (15:20 +0530)
committer: Michal Wajdeczko <michal.wajdeczko@intel.com>
Tue, 2 Dec 2025 15:18:05 +0000 (16:18 +0100)
VF migration sends a marker to the GuC before resource fixups begin,
and repeats the marker with the RESFIX_DONE notification. This prevents
the GuC from submitting jobs during double migration events.

To reliably test double migration, a second migration must be triggered
while fixups from the first migration are still in progress. Since fixups
complete quickly, reproducing this scenario is difficult. Introduce a
debugfs control that pauses post-migration recovery at selected stages,
creating a deterministic window for subsequent migrations.

New debugfs entries:
/sys/kernel/debug/dri/BDF/
├── tile0
│   ├── gt0
│   │   ├── vf
│   │   │   ├── resfix_stoppers

resfix_stoppers: Predefined checkpoints that allow the migration process
to pause at specific stages. The stages are given below.

VF_MIGRATION_WAIT_RESFIX_START - BIT(0)
VF_MIGRATION_WAIT_FIXUPS - BIT(1)
VF_MIGRATION_WAIT_RESTART_JOBS - BIT(2)
VF_MIGRATION_WAIT_RESFIX_DONE - BIT(3)

While a stage's bit is set, recovery pauses at that stage, sleeping in
1-second iterations until the corresponding bit is cleared.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Tomasz Lis <tomasz.lis@intel.com>
Acked-by: Adam Miszczak <adam.miszczak@linux.intel.com>
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patch.msgid.link/20251201095011.21453-10-satyanarayana.k.v.p@intel.com
drivers/gpu/drm/xe/xe_gt_sriov_vf.c
drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h

index 0b3ecb000ff7123892387a4566a26fe66b8f6b07..3c806c8e5f3e4532394cac172aa2b12eb9b42280 100644 (file)
@@ -5,6 +5,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/bsearch.h>
+#include <linux/delay.h>
 
 #include <drm/drm_managed.h>
 #include <drm/drm_print.h>
 
 #define make_u64_from_u32(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo)))
 
+#ifdef CONFIG_DRM_XE_DEBUG
+enum VF_MIGRATION_WAIT_POINTS { /* bit flags tested against migration.debug.resfix_stoppers */
+       VF_MIGRATION_WAIT_RESFIX_START  = BIT(0), /* vf_resfix_start(): before the start marker is sent */
+       VF_MIGRATION_WAIT_FIXUPS        = BIT(1), /* vf_post_migration_fixups(): before the config query */
+       VF_MIGRATION_WAIT_RESTART_JOBS  = BIT(2), /* vf_post_migration_rearm(): at function entry */
+       VF_MIGRATION_WAIT_RESFIX_DONE   = BIT(3), /* vf_post_migration_resfix_done(): before taking migration.lock */
+};
+
+/* Length of one polling step while a wait point is held, in milliseconds. */
+#define VF_MIGRATION_WAIT_DELAY_IN_MS  1000
+
+/*
+ * vf_post_migration_inject_wait - hold post-migration recovery at a wait point
+ * @gt: the GT whose migration.debug.resfix_stoppers mask is polled
+ * @wait: wait point bit(s) to test against that mask
+ *
+ * Returns immediately when @wait is not set in resfix_stoppers. Otherwise
+ * logs a debug message and sleeps VF_MIGRATION_WAIT_DELAY_IN_MS, repeating
+ * until the bit is cleared. There is no timeout: the caller stays blocked
+ * for as long as the bit remains set.
+ */
+static void vf_post_migration_inject_wait(struct xe_gt *gt,
+                                         enum VF_MIGRATION_WAIT_POINTS wait)
+{
+       u8 stoppers;
+
+       for (;;) {
+               /*
+                * The mask is rewritten through debugfs while we poll it.
+                * Take a single marked snapshot per iteration so that the
+                * value tested and the value logged are always the same.
+                */
+               stoppers = READ_ONCE(gt->sriov.vf.migration.debug.resfix_stoppers);
+               if (!(stoppers & wait))
+                       break;
+
+               xe_gt_dbg(gt,
+                         "*TESTING* injecting %u ms delay due to resfix_stoppers=%#x, to continue clear %#x\n",
+                         VF_MIGRATION_WAIT_DELAY_IN_MS, stoppers, wait);
+
+               msleep(VF_MIGRATION_WAIT_DELAY_IN_MS);
+       }
+}
+
+#define VF_MIGRATION_INJECT_WAIT(gt, _POS) ({                                  \
+       struct xe_gt *__gt = (gt);                                              \
+       vf_post_migration_inject_wait(__gt, VF_MIGRATION_WAIT_##_POS);          \
+       })
+
+#else
+#define VF_MIGRATION_INJECT_WAIT(_gt, ...)     typecheck(struct xe_gt *, (_gt))
+#endif
+
 static int guc_action_vf_reset(struct xe_guc *guc)
 {
        u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
@@ -320,6 +352,8 @@ static int vf_resfix_start(struct xe_gt *gt, u16 marker)
 
        xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
 
+       VF_MIGRATION_INJECT_WAIT(gt, RESFIX_START);
+
        xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);
 
        return guc_action_vf_resfix_start(guc, marker);
@@ -1158,6 +1192,8 @@ static int vf_post_migration_fixups(struct xe_gt *gt)
        void *buf = gt->sriov.vf.migration.scratch;
        int err;
 
+       VF_MIGRATION_INJECT_WAIT(gt, FIXUPS);
+
        /* xe_gt_sriov_vf_query_config will fixup the GGTT addresses */
        err = xe_gt_sriov_vf_query_config(gt);
        if (err)
@@ -1176,6 +1212,8 @@ static int vf_post_migration_fixups(struct xe_gt *gt)
 
 static void vf_post_migration_rearm(struct xe_gt *gt)
 {
+       VF_MIGRATION_INJECT_WAIT(gt, RESTART_JOBS);
+
        /*
         * Make sure interrupts on the new HW are properly set. The GuC IRQ
         * must be working at this point, since the recovery has started,
@@ -1206,6 +1244,8 @@ static void vf_post_migration_abort(struct xe_gt *gt)
 
 static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
 {
+       VF_MIGRATION_INJECT_WAIT(gt, RESFIX_DONE);
+
        spin_lock_irq(&gt->sriov.vf.migration.lock);
        if (gt->sriov.vf.migration.recovery_queued)
                xe_gt_sriov_dbg(gt, "another recovery imminent\n");
index 2ed5b6780d30d81154607e18cf5b021432e06018..507718326e1f6393f77f7cbf4dc5c3506ed13632 100644 (file)
@@ -69,4 +69,16 @@ void xe_gt_sriov_vf_debugfs_register(struct xe_gt *gt, struct dentry *root)
        vfdentry->d_inode->i_private = gt;
 
        drm_debugfs_create_files(vf_info, ARRAY_SIZE(vf_info), vfdentry, minor);
+
+       /*
+        *      /sys/kernel/debug/dri/BDF/
+        *      ├── tile0
+        *          ├── gt0
+        *              ├── vf
+        *                  ├── resfix_stoppers
+        */
+       if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
+               debugfs_create_x8("resfix_stoppers", 0600, vfdentry,
+                                 &gt->sriov.vf.migration.debug.resfix_stoppers);
+       }
 }
index db2f8b3ed3e93314f80d4c25bc799236fdabaadc..510c33116fbdb3820066531c25b2d0cbbaeda2bb 100644 (file)
@@ -52,6 +52,14 @@ struct xe_gt_sriov_vf_migration {
        wait_queue_head_t wq;
        /** @scratch: Scratch memory for VF recovery */
        void *scratch;
+       /** @debug: Debug hooks for delaying migration */
+       struct {
+               /**
+                * @debug.resfix_stoppers: Stop and wait at different stages
+                * during post migration recovery
+                */
+               u8 resfix_stoppers;
+       } debug;
        /**
         * @resfix_marker: Marker sent on start and on end of post-migration
         * steps.