#include "xe_gt.h"
#include "xe_gt_clock.h"
#include "xe_gt_printk.h"
-#include "xe_gt_sriov_vf.h"
#include "xe_guc.h"
#include "xe_guc_capture.h"
#include "xe_guc_ct.h"
return (WQ_SIZE - q->guc->wqi_tail);
}
+static bool vf_recovery(struct xe_guc *guc)
+{
+ return xe_gt_recovery_pending(guc_to_gt(guc));
+}
+
static int wq_wait_for_space(struct xe_exec_queue *q, u32 wqi_size)
{
struct xe_guc *guc = exec_queue_to_guc(q);
#define AVAILABLE_SPACE \
CIRC_SPACE(q->guc->wqi_tail, q->guc->wqi_head, WQ_SIZE)
- if (wqi_size > AVAILABLE_SPACE) {
+ if (wqi_size > AVAILABLE_SPACE && !vf_recovery(guc)) {
try_again:
q->guc->wqi_head = parallel_read(xe, map, wq_desc.head);
if (wqi_size > AVAILABLE_SPACE) {
ret = wait_event_timeout(guc->ct.wq,
(!exec_queue_pending_enable(q) &&
!exec_queue_pending_disable(q)) ||
- xe_guc_read_stopped(guc),
+ xe_guc_read_stopped(guc) ||
+ vf_recovery(guc),
HZ * 5);
- if (!ret) {
+ if (!ret && !vf_recovery(guc)) {
struct xe_gpu_scheduler *sched = &q->guc->sched;
xe_gt_warn(q->gt, "Pending enable/disable failed to respond\n");
bool wedged = false;
xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
+
+ if (vf_recovery(guc))
+ return;
+
trace_xe_exec_queue_lr_cleanup(q);
if (!exec_queue_killed(q))
*/
ret = wait_event_timeout(guc->ct.wq,
!exec_queue_pending_disable(q) ||
- xe_guc_read_stopped(guc), HZ * 5);
+ xe_guc_read_stopped(guc) ||
+ vf_recovery(guc), HZ * 5);
+ if (vf_recovery(guc))
+ return;
+
if (!ret) {
xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n",
q->guc->id);
ret = wait_event_timeout(guc->ct.wq,
!exec_queue_pending_enable(q) ||
- xe_guc_read_stopped(guc), HZ * 5);
- if (!ret || xe_guc_read_stopped(guc)) {
+ xe_guc_read_stopped(guc) ||
+ vf_recovery(guc), HZ * 5);
+ if ((!ret && !vf_recovery(guc)) || xe_guc_read_stopped(guc)) {
xe_gt_warn(guc_to_gt(guc), "Schedule enable failed to respond");
set_exec_queue_banned(q);
xe_gt_reset_async(q->gt);
* list so job can be freed and kick scheduler ensuring free job is not
* lost.
*/
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags))
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags) ||
+ vf_recovery(guc))
return DRM_GPU_SCHED_STAT_NO_HANG;
/* Kill the run_job entry point */
ret = wait_event_timeout(guc->ct.wq,
(!exec_queue_pending_enable(q) &&
!exec_queue_pending_disable(q)) ||
- xe_guc_read_stopped(guc), HZ * 5);
+ xe_guc_read_stopped(guc) ||
+ vf_recovery(guc), HZ * 5);
+ if (vf_recovery(guc))
+ goto handle_vf_resume;
if (!ret || xe_guc_read_stopped(guc))
goto trigger_reset;
smp_rmb();
ret = wait_event_timeout(guc->ct.wq,
!exec_queue_pending_disable(q) ||
- xe_guc_read_stopped(guc), HZ * 5);
+ xe_guc_read_stopped(guc) ||
+ vf_recovery(guc), HZ * 5);
+ if (vf_recovery(guc))
+ goto handle_vf_resume;
if (!ret || xe_guc_read_stopped(guc)) {
trigger_reset:
if (!ret)
* some thought, do this in a follow up.
*/
xe_sched_submission_start(sched);
+handle_vf_resume:
return DRM_GPU_SCHED_STAT_NO_HANG;
}
static void __suspend_fence_signal(struct xe_exec_queue *q)
{
+ struct xe_guc *guc = exec_queue_to_guc(q);
+ struct xe_device *xe = guc_to_xe(guc);
+
if (!q->guc->suspend_pending)
return;
WRITE_ONCE(q->guc->suspend_pending, false);
- wake_up(&q->guc->suspend_wait);
+
+ /*
+ * We use a GuC shared wait queue for VFs because the VF resfix start
+ * interrupt must be able to wake all instances of suspend_wait. This
+ * prevents the VF migration worker from being starved during
+ * scheduling.
+ */
+ if (IS_SRIOV_VF(xe))
+ wake_up_all(&guc->ct.wq);
+ else
+ wake_up(&q->guc->suspend_wait);
}
static void suspend_fence_signal(struct xe_exec_queue *q)
if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) &&
exec_queue_enabled(q)) {
- wait_event(guc->ct.wq, (q->guc->resume_time != RESUME_PENDING ||
- xe_guc_read_stopped(guc)) && !exec_queue_pending_disable(q));
+ wait_event(guc->ct.wq, vf_recovery(guc) ||
+ ((q->guc->resume_time != RESUME_PENDING ||
+ xe_guc_read_stopped(guc)) && !exec_queue_pending_disable(q)));
if (!xe_guc_read_stopped(guc)) {
s64 since_resume_ms =
q->entity = &ge->entity;
- if (xe_guc_read_stopped(guc))
+ if (xe_guc_read_stopped(guc) || vf_recovery(guc))
xe_sched_stop(sched);
mutex_unlock(&guc->submission_state.lock);
static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
{
struct xe_guc *guc = exec_queue_to_guc(q);
+ struct xe_device *xe = guc_to_xe(guc);
int ret;
/*
* suspend_pending upon kill but to be paranoid but races in which
* suspend_pending is set after kill also check kill here.
*/
- ret = wait_event_interruptible_timeout(q->guc->suspend_wait,
- !READ_ONCE(q->guc->suspend_pending) ||
- exec_queue_killed(q) ||
- xe_guc_read_stopped(guc),
- HZ * 5);
+#define WAIT_COND \
+ (!READ_ONCE(q->guc->suspend_pending) || exec_queue_killed(q) || \
+ xe_guc_read_stopped(guc))
+
+retry:
+ if (IS_SRIOV_VF(xe))
+ ret = wait_event_interruptible_timeout(guc->ct.wq, WAIT_COND ||
+ vf_recovery(guc),
+ HZ * 5);
+ else
+ ret = wait_event_interruptible_timeout(q->guc->suspend_wait,
+ WAIT_COND, HZ * 5);
+
+ if (vf_recovery(guc) && !xe_device_wedged((guc_to_xe(guc))))
+ return -EAGAIN;
if (!ret) {
xe_gt_warn(guc_to_gt(guc),
q->guc->id);
/* XXX: Trigger GT reset? */
return -ETIME;
+ } else if (IS_SRIOV_VF(xe) && !WAIT_COND) {
+ /* Corner case on RESFIX DONE where vf_recovery() changes */
+ goto retry;
}
+#undef WAIT_COND
+
return ret < 0 ? ret : 0;
}
{
int ret;
- if (xe_gt_WARN_ON(guc_to_gt(guc),
- xe_gt_sriov_vf_recovery_pending(guc_to_gt(guc))))
+ if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
return 0;
if (!guc->submission_state.initialized)