return err;
}
+ if (IS_SRIOV_VF(gt_to_xe(gt))) {
+ err = xe_gt_sriov_vf_init_early(gt);
+ if (err)
+ return err;
+ }
+
xe_reg_sr_init(&gt->reg_sr, "GT", gt_to_xe(gt));
err = xe_wa_gt_init(gt);
#include "xe_guc.h"
#include "xe_guc_hxg_helpers.h"
#include "xe_guc_relay.h"
+#include "xe_guc_submit.h"
+#include "xe_irq.h"
#include "xe_lrc.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
+#include "xe_pm.h"
#include "xe_sriov.h"
#include "xe_sriov_vf.h"
+#include "xe_tile_sriov_vf.h"
#include "xe_uc_fw.h"
#include "xe_wopcm.h"
}
/**
- * xe_gt_sriov_vf_notify_resfix_done - Notify GuC about resource fixups apply completed.
+ * vf_notify_resfix_done - Notify GuC about resource fixups apply completed.
* @gt: the &xe_gt struct instance linked to target GuC
*
* Returns: 0 if the operation completed successfully, or a negative error
* code otherwise.
*/
-int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt)
+static int vf_notify_resfix_done(struct xe_gt *gt)
{
struct xe_guc *guc = &gt->uc.guc;
int err;
* xe_gt_sriov_vf_default_lrcs_hwsp_rebase - Update GGTT references in HWSP of default LRCs.
* @gt: the &xe_gt struct instance
*/
-void xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt *gt)
+static void xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt *gt)
{
struct xe_hw_engine *hwe;
enum xe_hw_engine_id id;
xe_default_lrc_update_memirq_regs_with_address(hwe);
}
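+/*
+ * Queue the VF post-migration recovery worker on this GT, unless a recovery
+ * run is already queued. Shall be called only on a VF.
+ */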
+static void vf_start_migration_recovery(struct xe_gt *gt)
+{
+ bool started;
+
+ xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
+
+ spin_lock(&gt->sriov.vf.migration.lock);
+
+ if (!gt->sriov.vf.migration.recovery_queued) {
+ gt->sriov.vf.migration.recovery_queued = true;
+ WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true);
+
+ started = queue_work(gt->ordered_wq, &gt->sriov.vf.migration.worker);
+ xe_gt_sriov_info(gt, "VF migration recovery %s\n", started ?
+ "scheduled" : "already in progress");
+ }
+
+ spin_unlock(&gt->sriov.vf.migration.lock);
+}
+
/**
* xe_gt_sriov_vf_migrated_event_handler - Start a VF migration recovery,
* or just mark that a GuC is ready for it.
xe_gt_assert(gt, IS_SRIOV_VF(xe));
xe_gt_assert(gt, xe_gt_sriov_vf_recovery_pending(gt));
- set_bit(gt->info.id, &xe->sriov.vf.migration.gt_flags);
- /*
- * We need to be certain that if all flags were set, at least one
- * thread will notice that and schedule the recovery.
- */
- smp_mb__after_atomic();
+ if (!xe_sriov_vf_migration_supported(xe)) {
+ xe_gt_sriov_err(gt, "migration not supported\n");
+ return;
+ }
xe_gt_sriov_info(gt, "ready for recovery after migration\n");
- xe_sriov_vf_start_migration_recovery(xe);
+ vf_start_migration_recovery(gt);
}
static bool vf_is_negotiated(struct xe_gt *gt, u16 major, u16 minor)
pf_version->major, pf_version->minor);
}
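+/*
+ * Stop the driver activities after VF migration. After this VM is migrated
+ * and assigned to a new VF, it is running on new hardware, and therefore many
+ * hardware-dependent states and related structures require fixups. Without
+ * fixups, the hardware cannot do any work, and all GPU pipelines are stalled.
+ * Stop some of the kernel activities to make the fixup process faster.
+ */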
+static void vf_post_migration_shutdown(struct xe_gt *gt)
+{
+ int ret = 0;
+
+ spin_lock_irq(&gt->sriov.vf.migration.lock);
+ gt->sriov.vf.migration.recovery_queued = false;
+ spin_unlock_irq(&gt->sriov.vf.migration.lock);
+
+ xe_guc_submit_pause(&gt->uc.guc);
+ ret |= xe_guc_submit_reset_block(&gt->uc.guc);
+
+ if (ret)
+ xe_gt_sriov_info(gt, "migration recovery encountered ongoing reset\n");
+}
+
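+/* Size of the scratch buffer, large enough for either LRC registers or the WA BB */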
+static size_t post_migration_scratch_size(struct xe_device *xe)
+{
+ return max(xe_lrc_reg_size(xe), LRC_WA_BB_SIZE);
+}
+
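+/*
+ * Perform post-migration fixups on a single GT.
+ *
+ * After migration, GuC needs to be re-queried for the VF configuration to
+ * check if it matches the previous provisioning. Most of the VF provisioning
+ * shall be the same, except the GGTT range, since GGTT is not virtualized
+ * per-VF. If the GGTT range has changed, perform fixups: shift all GGTT
+ * references used anywhere within the driver. After the fixups in this
+ * function succeed, it is allowed to ask the GuC bound to this GT to continue
+ * normal operation.
+ */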
+static int vf_post_migration_fixups(struct xe_gt *gt)
+{
+ s64 shift;
+ void *buf;
+ int err;
+
+ buf = kmalloc(post_migration_scratch_size(gt_to_xe(gt)), GFP_ATOMIC);
+ if (!buf)
+ return -ENOMEM;
+
+ err = xe_gt_sriov_vf_query_config(gt);
+ if (err)
+ goto out;
+
+ shift = xe_gt_sriov_vf_ggtt_shift(gt);
+ if (shift) {
+ xe_tile_sriov_vf_fixup_ggtt_nodes(gt_to_tile(gt), shift);
+ xe_gt_sriov_vf_default_lrcs_hwsp_rebase(gt);
+ err = xe_guc_contexts_hwsp_rebase(&gt->uc.guc, buf);
+ if (err)
+ goto out;
+ }
+
+out:
+ kfree(buf);
+ return err;
+}
+
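+/*
+ * Re-start the driver activities under the new hardware. Once all
+ * post-migration fixups are done, resume feeding the GPU with workloads.
+ */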
+static void vf_post_migration_kickstart(struct xe_gt *gt)
+{
+ /*
+ * Make sure interrupts on the new HW are properly set. The GuC IRQ
+ * must be working at this point, since the recovery has started, but
+ * the rest was not enabled using the procedure from the spec.
+ */
+ xe_irq_resume(gt_to_xe(gt));
+
+ xe_guc_submit_reset_unblock(&gt->uc.guc);
+ xe_guc_submit_unpause(&gt->uc.guc);
+}
+
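+/*
+ * Notify the GuC that resource fixups are done, unless another recovery
+ * request has already arrived and asking the GuC for RESFIX exit should be
+ * skipped.
+ */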
+static int vf_post_migration_notify_resfix_done(struct xe_gt *gt)
+{
+ bool skip_resfix = false;
+
+ spin_lock_irq(&gt->sriov.vf.migration.lock);
+ if (gt->sriov.vf.migration.recovery_queued) {
+ skip_resfix = true;
+ xe_gt_sriov_dbg(gt, "another recovery imminent, resfix skipped\n");
+ } else {
+ WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
+ }
+ spin_unlock_irq(&gt->sriov.vf.migration.lock);
+
+ if (skip_resfix)
+ return -EAGAIN;
+
+ return vf_notify_resfix_done(gt);
+}
+
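+/*
+ * Full post-migration recovery for a single GT: pause submissions, apply
+ * fixups, restart activities and notify the GuC that fixups are done.
+ */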
+static void vf_post_migration_recovery(struct xe_gt *gt)
+{
+ struct xe_device *xe = gt_to_xe(gt);
+ int err;
+
+ xe_gt_sriov_dbg(gt, "migration recovery in progress\n");
+
+ xe_pm_runtime_get(xe);
+ vf_post_migration_shutdown(gt);
+
+ if (!xe_sriov_vf_migration_supported(xe)) {
+ xe_gt_sriov_err(gt, "migration is not supported\n");
+ err = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ err = vf_post_migration_fixups(gt);
+ if (err)
+ goto fail;
+
+ vf_post_migration_kickstart(gt);
+ err = vf_post_migration_notify_resfix_done(gt);
+ if (err && err != -EAGAIN)
+ goto fail;
+
+ xe_pm_runtime_put(xe);
+ xe_gt_sriov_notice(gt, "migration recovery ended\n");
+ return;
+fail:
+ xe_pm_runtime_put(xe);
+ xe_gt_sriov_err(gt, "migration recovery failed (%pe)\n", ERR_PTR(err));
+ xe_device_declare_wedged(xe);
+}
+
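+/* Worker entry point for the per-GT VF post-migration recovery */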
+static void migration_worker_func(struct work_struct *w)
+{
+ struct xe_gt *gt = container_of(w, struct xe_gt,
+ sriov.vf.migration.worker);
+
+ vf_post_migration_recovery(gt);
+}
+
+/**
+ * xe_gt_sriov_vf_init_early() - Early GT-level VF initialization
+ * @gt: the &xe_gt
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+int xe_gt_sriov_vf_init_early(struct xe_gt *gt)
+{
+ if (!xe_sriov_vf_migration_supported(gt_to_xe(gt)))
+ return 0;
+
+ spin_lock_init(&gt->sriov.vf.migration.lock);
+ INIT_WORK(&gt->sriov.vf.migration.worker, migration_worker_func);
+
+ return 0;
+}
+
/**
* xe_gt_sriov_vf_recovery_pending() - VF post migration recovery pending
* @gt: the &xe_gt
int xe_gt_sriov_vf_query_config(struct xe_gt *gt);
int xe_gt_sriov_vf_connect(struct xe_gt *gt);
int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt);
-void xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt *gt);
-int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt);
void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt);
+int xe_gt_sriov_vf_init_early(struct xe_gt *gt);
bool xe_gt_sriov_vf_recovery_pending(struct xe_gt *gt);
u32 xe_gt_sriov_vf_gmdid(struct xe_gt *gt);
#define _XE_GT_SRIOV_VF_TYPES_H_
#include <linux/types.h>
+#include <linux/workqueue.h>
#include "xe_uc_fw_types.h"
/**
* xe_gt_sriov_vf_migration - VF migration data.
*/
struct xe_gt_sriov_vf_migration {
+ /** @worker: VF migration recovery worker */
+ struct work_struct worker;
+ /** @lock: protects @recovery_queued */
+ spinlock_t lock;
+ /** @recovery_queued: VF post migration recovery is queued */
+ bool recovery_queued;
/** @recovery_inprogress: VF post migration recovery in progress */
bool recovery_inprogress;
};
#include <drm/drm_debugfs.h>
#include <drm/drm_managed.h>
-#include "xe_assert.h"
-#include "xe_device.h"
#include "xe_gt.h"
-#include "xe_gt_sriov_printk.h"
#include "xe_gt_sriov_vf.h"
#include "xe_guc.h"
-#include "xe_guc_submit.h"
-#include "xe_irq.h"
-#include "xe_lrc.h"
-#include "xe_pm.h"
-#include "xe_sriov.h"
#include "xe_sriov_printk.h"
#include "xe_sriov_vf.h"
#include "xe_sriov_vf_ccs.h"
-#include "xe_tile_sriov_vf.h"
/**
* DOC: VF restore procedure in PF KMD and VF KMD
xe->sriov.vf.migration.enabled = false;
}
-static void migration_worker_func(struct work_struct *w);
-
static void vf_migration_init_early(struct xe_device *xe)
{
/*
guc_version.major, guc_version.minor);
}
- INIT_WORK(&xe->sriov.vf.migration.worker, migration_worker_func);
-
xe->sriov.vf.migration.enabled = true;
xe_sriov_dbg(xe, "migration support enabled\n");
}
vf_migration_init_early(xe);
}
-/**
- * vf_post_migration_shutdown - Stop the driver activities after VF migration.
- * @xe: the &xe_device struct instance
- *
- * After this VM is migrated and assigned to a new VF, it is running on a new
- * hardware, and therefore many hardware-dependent states and related structures
- * require fixups. Without fixups, the hardware cannot do any work, and therefore
- * all GPU pipelines are stalled.
- * Stop some of kernel activities to make the fixup process faster.
- */
-static void vf_post_migration_shutdown(struct xe_device *xe)
-{
- struct xe_gt *gt;
- unsigned int id;
- int ret = 0;
-
- for_each_gt(gt, xe, id) {
- xe_guc_submit_pause(&gt->uc.guc);
- ret |= xe_guc_submit_reset_block(&gt->uc.guc);
- }
-
- if (ret)
- drm_info(&xe->drm, "migration recovery encountered ongoing reset\n");
-}
-
-/**
- * vf_post_migration_kickstart - Re-start the driver activities under new hardware.
- * @xe: the &xe_device struct instance
- *
- * After we have finished with all post-migration fixups, restart the driver
- * activities to continue feeding the GPU with workloads.
- */
-static void vf_post_migration_kickstart(struct xe_device *xe)
-{
- struct xe_gt *gt;
- unsigned int id;
-
- /*
- * Make sure interrupts on the new HW are properly set. The GuC IRQ
- * must be working at this point, since the recovery did started,
- * but the rest was not enabled using the procedure from spec.
- */
- xe_irq_resume(xe);
-
- for_each_gt(gt, xe, id) {
- xe_guc_submit_reset_unblock(&gt->uc.guc);
- xe_guc_submit_unpause(&gt->uc.guc);
- }
-}
-
-static bool gt_vf_post_migration_needed(struct xe_gt *gt)
-{
- return test_bit(gt->info.id, &gt_to_xe(gt)->sriov.vf.migration.gt_flags);
-}
-
-/*
- * Notify GuCs marked in flags about resource fixups apply finished.
- * @xe: the &xe_device struct instance
- * @gt_flags: flags marking to which GTs the notification shall be sent
- */
-static int vf_post_migration_notify_resfix_done(struct xe_device *xe, unsigned long gt_flags)
-{
- struct xe_gt *gt;
- unsigned int id;
- int err = 0;
-
- for_each_gt(gt, xe, id) {
- if (!test_bit(id, &gt_flags))
- continue;
- /* skip asking GuC for RESFIX exit if new recovery request arrived */
- if (gt_vf_post_migration_needed(gt))
- continue;
- err = xe_gt_sriov_vf_notify_resfix_done(gt);
- if (err)
- break;
- clear_bit(id, &gt_flags);
- }
-
- if (gt_flags && !err)
- drm_dbg(&xe->drm, "another recovery imminent, skipped some notifications\n");
- return err;
-}
-
-static int vf_get_next_migrated_gt_id(struct xe_device *xe)
-{
- struct xe_gt *gt;
- unsigned int id;
-
- for_each_gt(gt, xe, id) {
- if (test_and_clear_bit(id, &xe->sriov.vf.migration.gt_flags))
- return id;
- }
- return -1;
-}
-
-static size_t post_migration_scratch_size(struct xe_device *xe)
-{
- return max(xe_lrc_reg_size(xe), LRC_WA_BB_SIZE);
-}
-
-/**
- * Perform post-migration fixups on a single GT.
- *
- * After migration, GuC needs to be re-queried for VF configuration to check
- * if it matches previous provisioning. Most of VF provisioning shall be the
- * same, except GGTT range, since GGTT is not virtualized per-VF. If GGTT
- * range has changed, we have to perform fixups - shift all GGTT references
- * used anywhere within the driver. After the fixups in this function succeed,
- * it is allowed to ask the GuC bound to this GT to continue normal operation.
- *
- * Returns: 0 if the operation completed successfully, or a negative error
- * code otherwise.
- */
-static int gt_vf_post_migration_fixups(struct xe_gt *gt)
-{
- s64 shift;
- void *buf;
- int err;
-
- buf = kmalloc(post_migration_scratch_size(gt_to_xe(gt)), GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- err = xe_gt_sriov_vf_query_config(gt);
- if (err)
- goto out;
-
- shift = xe_gt_sriov_vf_ggtt_shift(gt);
- if (shift) {
- xe_tile_sriov_vf_fixup_ggtt_nodes(gt_to_tile(gt), shift);
- xe_gt_sriov_vf_default_lrcs_hwsp_rebase(gt);
- err = xe_guc_contexts_hwsp_rebase(&gt->uc.guc, buf);
- if (err)
- goto out;
- }
-
-out:
- kfree(buf);
- return err;
-}
-
-static void vf_post_migration_recovery(struct xe_device *xe)
-{
- unsigned long fixed_gts = 0;
- int id, err;
-
- drm_dbg(&xe->drm, "migration recovery in progress\n");
- xe_pm_runtime_get(xe);
- vf_post_migration_shutdown(xe);
-
- if (!xe_sriov_vf_migration_supported(xe)) {
- xe_sriov_err(xe, "migration is not supported\n");
- err = -ENOTRECOVERABLE;
- goto fail;
- }
-
- while (id = vf_get_next_migrated_gt_id(xe), id >= 0) {
- struct xe_gt *gt = xe_device_get_gt(xe, id);
-
- err = gt_vf_post_migration_fixups(gt);
- if (err)
- goto fail;
-
- set_bit(id, &fixed_gts);
- }
-
- vf_post_migration_kickstart(xe);
- err = vf_post_migration_notify_resfix_done(xe, fixed_gts);
- if (err)
- goto fail;
-
- xe_pm_runtime_put(xe);
- drm_notice(&xe->drm, "migration recovery ended\n");
- return;
-fail:
- xe_pm_runtime_put(xe);
- drm_err(&xe->drm, "migration recovery failed (%pe)\n", ERR_PTR(err));
- xe_device_declare_wedged(xe);
-}
-
-static void migration_worker_func(struct work_struct *w)
-{
- struct xe_device *xe = container_of(w, struct xe_device,
- sriov.vf.migration.worker);
-
- vf_post_migration_recovery(xe);
-}
-
-/*
- * Check if post-restore recovery is coming on any of GTs.
- * @xe: the &xe_device struct instance
- *
- * Return: True if migration recovery worker will soon be running. Any worker currently
- * executing does not affect the result.
- */
-static bool vf_ready_to_recovery_on_any_gts(struct xe_device *xe)
-{
- struct xe_gt *gt;
- unsigned int id;
-
- for_each_gt(gt, xe, id) {
- if (test_bit(id, &xe->sriov.vf.migration.gt_flags))
- return true;
- }
- return false;
-}
-
-/**
- * xe_sriov_vf_start_migration_recovery - Start VF migration recovery.
- * @xe: the &xe_device to start recovery on
- *
- * This function shall be called only by VF.
- */
-void xe_sriov_vf_start_migration_recovery(struct xe_device *xe)
-{
- bool started;
-
- xe_assert(xe, IS_SRIOV_VF(xe));
-
- if (!vf_ready_to_recovery_on_any_gts(xe))
- return;
-
- started = queue_work(xe->sriov.wq, &xe->sriov.vf.migration.worker);
- drm_info(&xe->drm, "VF migration recovery %s\n", started ?
- "scheduled" : "already in progress");
-}
-
/**
* xe_sriov_vf_init_late() - SR-IOV VF late initialization functions.
* @xe: the &xe_device to initialize
void xe_sriov_vf_init_early(struct xe_device *xe);
int xe_sriov_vf_init_late(struct xe_device *xe);
-void xe_sriov_vf_start_migration_recovery(struct xe_device *xe);
bool xe_sriov_vf_migration_supported(struct xe_device *xe);
void xe_sriov_vf_debugfs_register(struct xe_device *xe, struct dentry *root);
/** @migration: VF Migration state data */
struct {
- /** @migration.worker: VF migration recovery worker */
- struct work_struct worker;
- /** @migration.gt_flags: Per-GT request flags for VF migration recovery */
- unsigned long gt_flags;
/**
* @migration.enabled: flag indicating if migration support
* was enabled or not due to missing prerequisites