#define VF2GUC_VF_RESET_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
/**
- * DOC: VF2GUC_NOTIFY_RESFIX_DONE
+ * DOC: VF2GUC_RESFIX_DONE
*
- * This action is used by VF to notify the GuC that the VF KMD has completed
- * post-migration recovery steps.
+ * This action is used by VF to inform the GuC that the VF KMD has completed
+ * post-migration recovery steps. From GuC VF compatibility 1.27.0 onwards, it
+ * shall only be sent after posting RESFIX_START, and the @MARKER fields of
+ * both messages must match.
*
* This message must be sent as `MMIO HXG Message`_.
*
+ * Updated since GuC VF compatibility 1.27.0.
+ *
* +---+-------+--------------------------------------------------------------+
* | | Bits | Description |
* +===+=======+==============================================================+
* | +-------+--------------------------------------------------------------+
* | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ |
* | +-------+--------------------------------------------------------------+
- * | | 27:16 | DATA0 = MBZ |
+ * | | 27:16 | DATA0 = MARKER = MBZ (only prior to 1.27.0) |
* | +-------+--------------------------------------------------------------+
- * | | 15:0 | ACTION = _`GUC_ACTION_VF2GUC_NOTIFY_RESFIX_DONE` = 0x5508 |
+ * | | 27:16 | DATA0 = MARKER - can't be zero (1.27.0+) |
+ * | +-------+--------------------------------------------------------------+
+ * | | 15:0 | ACTION = _`GUC_ACTION_VF2GUC_RESFIX_DONE` = 0x5508 |
* +---+-------+--------------------------------------------------------------+
*
* +---+-------+--------------------------------------------------------------+
* | | 27:0 | DATA0 = MBZ |
* +---+-------+--------------------------------------------------------------+
*/
-#define GUC_ACTION_VF2GUC_NOTIFY_RESFIX_DONE 0x5508u
+#define GUC_ACTION_VF2GUC_RESFIX_DONE 0x5508u
-#define VF2GUC_NOTIFY_RESFIX_DONE_REQUEST_MSG_LEN GUC_HXG_REQUEST_MSG_MIN_LEN
-#define VF2GUC_NOTIFY_RESFIX_DONE_REQUEST_MSG_0_MBZ GUC_HXG_REQUEST_MSG_0_DATA0
+#define VF2GUC_RESFIX_DONE_REQUEST_MSG_LEN GUC_HXG_REQUEST_MSG_MIN_LEN
+#define VF2GUC_RESFIX_DONE_REQUEST_MSG_0_MARKER GUC_HXG_REQUEST_MSG_0_DATA0
-#define VF2GUC_NOTIFY_RESFIX_DONE_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
-#define VF2GUC_NOTIFY_RESFIX_DONE_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
+#define VF2GUC_RESFIX_DONE_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
+#define VF2GUC_RESFIX_DONE_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
/**
* DOC: VF2GUC_QUERY_SINGLE_KLV
#define PF2GUC_SAVE_RESTORE_VF_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
#define PF2GUC_SAVE_RESTORE_VF_RESPONSE_MSG_0_USED GUC_HXG_RESPONSE_MSG_0_DATA0
+/**
+ * DOC: VF2GUC_RESFIX_START
+ *
+ * This action is used by VF to inform the GuC that the VF KMD will be starting
+ * post-migration recovery fixups. The @MARKER sent with this action must match
+ * the MARKER posted in the VF2GUC_RESFIX_DONE message.
+ *
+ * This message must be sent as `MMIO HXG Message`_.
+ *
+ * Available since GuC VF compatibility 1.27.0.
+ *
+ * +---+-------+--------------------------------------------------------------+
+ * | | Bits | Description |
+ * +===+=======+==============================================================+
+ * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_HOST_ |
+ * | +-------+--------------------------------------------------------------+
+ * | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ |
+ * | +-------+--------------------------------------------------------------+
+ * | | 27:16 | DATA0 = MARKER - can't be zero |
+ * | +-------+--------------------------------------------------------------+
+ * | | 15:0 | ACTION = _`GUC_ACTION_VF2GUC_RESFIX_START` = 0x550F |
+ * +---+-------+--------------------------------------------------------------+
+ *
+ * +---+-------+--------------------------------------------------------------+
+ * | | Bits | Description |
+ * +===+=======+==============================================================+
+ * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_GUC_ |
+ * | +-------+--------------------------------------------------------------+
+ * | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_SUCCESS_ |
+ * | +-------+--------------------------------------------------------------+
+ * | | 27:0 | DATA0 = MBZ |
+ * +---+-------+--------------------------------------------------------------+
+ */
+#define GUC_ACTION_VF2GUC_RESFIX_START 0x550Fu
+
+#define VF2GUC_RESFIX_START_REQUEST_MSG_LEN GUC_HXG_REQUEST_MSG_MIN_LEN
+#define VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER GUC_HXG_REQUEST_MSG_0_DATA0
+
+#define VF2GUC_RESFIX_START_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
+#define VF2GUC_RESFIX_START_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
+
#endif
*found = gt->sriov.vf.guc_version;
}
-static int guc_action_vf_notify_resfix_done(struct xe_guc *guc)
+static int guc_action_vf_resfix_start(struct xe_guc *guc, u16 marker)
{
u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
- FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_NOTIFY_RESFIX_DONE),
+ FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_START) |
+ FIELD_PREP(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER, marker),
};
int ret;
return ret > 0 ? -EPROTO : ret;
}
-/**
- * vf_notify_resfix_done - Notify GuC about resource fixups apply completed.
- * @gt: the &xe_gt struct instance linked to target GuC
- *
- * Returns: 0 if the operation completed successfully, or a negative error
- * code otherwise.
- */
-static int vf_notify_resfix_done(struct xe_gt *gt)
+static int vf_resfix_start(struct xe_gt *gt, u16 marker)
{
struct xe_guc *guc = >->uc.guc;
- int err;
xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
- err = guc_action_vf_notify_resfix_done(guc);
- if (unlikely(err))
- xe_gt_sriov_err(gt, "Failed to notify GuC about resource fixup done (%pe)\n",
- ERR_PTR(err));
- else
- xe_gt_sriov_dbg_verbose(gt, "sent GuC resource fixup done\n");
+ xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);
- return err;
+ return guc_action_vf_resfix_start(guc, marker);
+}
+
+static int guc_action_vf_resfix_done(struct xe_guc *guc, u16 marker)
+{
+ u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
+ FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
+ FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
+ FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_DONE) |
+ FIELD_PREP(VF2GUC_RESFIX_DONE_REQUEST_MSG_0_MARKER, marker),
+ };
+ int ret;
+
+ ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));
+
+ return ret > 0 ? -EPROTO : ret;
+}
+
+static int vf_resfix_done(struct xe_gt *gt, u16 marker)
+{
+ struct xe_guc *guc = >->uc.guc;
+
+ xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
+
+ xe_gt_sriov_dbg_verbose(gt, "Sending resfix done marker %u\n", marker);
+
+ return guc_action_vf_resfix_done(guc, marker);
}
static int guc_action_query_single_klv(struct xe_guc *guc, u32 key,
static void vf_post_migration_rearm(struct xe_gt *gt)
{
+ /*
+ * Make sure interrupts on the new HW are properly set. The GuC IRQ
+ * must be working at this point, since the recovery has started,
+ * but the rest was not enabled using the procedure from the spec.
+ */
+ xe_irq_resume(gt_to_xe(gt));
+
xe_guc_ct_restart(>->uc.guc.ct);
xe_guc_submit_unpause_prepare_vf(>->uc.guc);
}
xe_guc_submit_pause_abort(>->uc.guc);
}
-static int vf_post_migration_notify_resfix_done(struct xe_gt *gt)
+static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
{
- bool skip_resfix = false;
-
spin_lock_irq(>->sriov.vf.migration.lock);
- if (gt->sriov.vf.migration.recovery_queued) {
- skip_resfix = true;
- xe_gt_sriov_dbg(gt, "another recovery imminent, resfix skipped\n");
- } else {
+ if (gt->sriov.vf.migration.recovery_queued)
+ xe_gt_sriov_dbg(gt, "another recovery imminent\n");
+ else
WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
- }
spin_unlock_irq(>->sriov.vf.migration.lock);
- if (skip_resfix)
- return -EAGAIN;
+ return vf_resfix_done(gt, marker);
+}
- /*
- * Make sure interrupts on the new HW are properly set. The GuC IRQ
- * must be working at this point, since the recovery did started,
- * but the rest was not enabled using the procedure from spec.
- */
- xe_irq_resume(gt_to_xe(gt));
+static int vf_post_migration_resfix_start(struct xe_gt *gt, u16 marker)
+{
+ return vf_resfix_start(gt, marker);
+}
+
+static u16 vf_post_migration_next_resfix_marker(struct xe_gt *gt)
+{
+ xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
- return vf_notify_resfix_done(gt);
+ BUILD_BUG_ON(1 + ((typeof(gt->sriov.vf.migration.resfix_marker))~0) >
+ FIELD_MAX(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER));
+
+ /* add 1 to avoid zero-marker */
+ return 1 + gt->sriov.vf.migration.resfix_marker++;
}
static void vf_post_migration_recovery(struct xe_gt *gt)
{
struct xe_device *xe = gt_to_xe(gt);
- int err;
+ u16 marker;
bool retry;
+ int err;
xe_gt_sriov_dbg(gt, "migration recovery in progress\n");
goto fail;
}
+ marker = vf_post_migration_next_resfix_marker(gt);
+
+ err = vf_post_migration_resfix_start(gt, marker);
+ if (unlikely(err)) {
+ xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_START step (%pe)\n",
+ ERR_PTR(err));
+ goto fail;
+ }
+
err = vf_post_migration_fixups(gt);
if (err)
goto fail;
vf_post_migration_rearm(gt);
- err = vf_post_migration_notify_resfix_done(gt);
- if (err && err != -EAGAIN)
+ err = vf_post_migration_resfix_done(gt, marker);
+ if (err) {
+ xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_DONE step (%pe)\n",
+ ERR_PTR(err));
goto fail;
+ }
vf_post_migration_kickstart(gt);
wait_queue_head_t wq;
/** @scratch: Scratch memory for VF recovery */
void *scratch;
+ /**
+ * @resfix_marker: Marker sent on start and on end of post-migration
+ * steps.
+ */
+ u8 resfix_marker;
/** @recovery_teardown: VF post migration recovery is being torn down */
bool recovery_teardown;
/** @recovery_queued: VF post migration recovery in queued */
*
* As soon as Virtual GPU of the VM starts, the VF driver within receives
* the MIGRATED interrupt and schedules post-migration recovery worker.
- * That worker queries GuC for new provisioning (using MMIO communication),
+ * That worker sends the `VF2GUC_RESFIX_START` action along with a non-zero
+ * marker, queries GuC for new provisioning (using MMIO communication),
* and applies fixups to any non-virtualized resources used by the VF.
*
* When the VF driver is ready to continue operation on the newly connected
- * hardware, it sends `VF2GUC_NOTIFY_RESFIX_DONE` which causes it to
+ * hardware, it sends the `VF2GUC_RESFIX_DONE` action along with the same
+ * marker that was sent with `VF2GUC_RESFIX_START`, which causes it to
* enter the long awaited `VF_RUNNING` state, and therefore start handling
* CTB messages and scheduling workloads from the VF::
*
* | [ ] new VF provisioning [ ]
* | [ ]---------------------------> [ ]
* | | [ ]
+ * | | VF2GUC_RESFIX_START [ ]
+ * | [ ] <---------------------------[ ]
+ * | [ ] [ ]
+ * | [ ] success [ ]
+ * | [ ]---------------------------> [ ]
* | | VF driver applies post [ ]
* | | migration fixups -------[ ]
* | | | [ ]
* | | -----> [ ]
* | | [ ]
- * | | VF2GUC_NOTIFY_RESFIX_DONE [ ]
+ * | | VF2GUC_RESFIX_DONE [ ]
* | [ ] <---------------------------[ ]
* | [ ] [ ]
* | [ ] GuC sets new VF state to [ ]
* | [ ]---------------------------> [ ]
* | | |
* | | |
+ *
+ * Handling of VF double migration flow is shown below::
+ *
+ * GuC1 VF
+ * | |
+ * | [ ]<--- start fixups
+ * | VF2GUC_RESFIX_START(marker) [ ]
+ * [ ] <-------------------------------------------[ ]
+ * [ ] [ ]
+ * [ ]---\ [ ]
+ * [ ] store marker [ ]
+ * [ ]<--/ [ ]
+ * [ ] [ ]
+ * [ ] success [ ]
+ * [ ] ------------------------------------------> [ ]
+ * | [ ]
+ * | [ ]---\
+ * | [ ] do fixups
+ * | [ ]<--/
+ * | [ ]
+ * -------------- VF paused / saved ----------------
+ * :
+ *
+ * GuC2
+ * |
+ * ----------------- VF restored ------------------
+ * |
+ * [ ]
+ * [ ]---\
+ * [ ] reset marker
+ * [ ]<--/
+ * [ ]
+ * ----------------- VF resumed ------------------
+ * | [ ]
+ * | [ ]
+ * | VF2GUC_RESFIX_DONE(marker) [ ]
+ * [ ] <-------------------------------------------[ ]
+ * [ ] [ ]
+ * [ ]---\ [ ]
+ * [ ] check marker [ ]
+ * [ ] (mismatch) [ ]
+ * [ ]<--/ [ ]
+ * [ ] [ ]
+ * [ ] RESPONSE_VF_MIGRATED [ ]
+ * [ ] ------------------------------------------> [ ]
+ * | [ ]---\
+ * | [ ] reschedule fixups
+ * | [ ]<--/
+ * | |
*/
/**