The current implementation of drm_sched_start uses a hardcoded
-ECANCELED to dispose of a job when the parent/hw fence is NULL.
This results in drm_sched_job_done being called with -ECANCELED for
each job with a NULL parent in the pending list, making it difficult
to distinguish which recovery method was used: a queue reset or a
full GPU reset.
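For reference, the disposal path in question, simplified from the
scheduler hunk below:

	if (!fence) {
		/* No hw fence: the job is unconditionally completed
		 * with a hardcoded -ECANCELED. */
		drm_sched_job_done(s_job, -ECANCELED);
		continue;
	}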
To improve this, we first try a soft recovery for timeout jobs and
use the error code -ENODATA. If soft recovery fails, we proceed with
a queue reset, where the error code remains -ENODATA for the job.
Finally, for a full GPU reset, we use error codes -ECANCELED or
-ETIME. This patch adds an error code parameter to drm_sched_start,
allowing us to differentiate between queue reset and GPU reset
failures. This enables user-mode and test applications to verify
that the requested recovery operation actually took place. After a
successful queue reset, the only way to continue normal operation is
to call drm_sched_job_done with the specific error code -ENODATA.
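A minimal sketch of the intended calling convention; the per-queue
reset handler here is hypothetical, and the in-tree callers updated
below all pass 0 to keep the old -ECANCELED behavior:

	/* Hypothetical per-queue reset handler: jobs disposed of here
	 * complete with -ENODATA, telling user mode that a queue reset
	 * rather than a full GPU reset was performed. */
	drm_sched_stop(&ring->sched, bad_job);
	/* ... reset the single queue ... */
	drm_sched_start(&ring->sched, -ENODATA);

	/* Full GPU reset path: pass 0 so jobs without a hw fence fall
	 * back to -ECANCELED and already-signaled hw fences keep their
	 * own error code (e.g. -ETIME). */
	drm_sched_start(&ring->sched, 0);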
v1: Initial implementation by Jesse utilized amdgpu_device_lock_reset_domain
and amdgpu_device_unlock_reset_domain to allow user mode to track
the queue reset status and distinguish between queue reset and
GPU reset.
v2: Christian suggested using the error codes -ENODATA for queue reset
and -ECANCELED or -ETIME for GPU reset, returned to
amdgpu_cs_wait_ioctl.
v3: To meet the requirements, we introduce a new function
drm_sched_start_ex with an additional parameter passed to
dma_fence_set_error, allowing us to handle the specific error
codes appropriately and dispose of bad jobs with the selected
error code depending on whether it was a queue reset or GPU reset.
v4: Alex suggested using a new name, drm_sched_start_with_recovery_error,
which more accurately describes the function's purpose.
Additionally, it was recommended to add documentation details
about the new method.
v5: Fixed declaration of new function drm_sched_start_with_recovery_error. (Alex)
v6 (chk): rebase on upstream changes, clean up the commit message,
drop the new function again and update all callers,
apply the errno also to scheduler fences with hw fences
v7 (chk): rebased
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240826122541.85663-1-christian.koenig@amd.com
if (r)
goto out;
} else {
- drm_sched_start(&ring->sched);
+ drm_sched_start(&ring->sched, 0);
}
}
if (!amdgpu_ring_sched_ready(ring))
continue;
- drm_sched_start(&ring->sched);
+ drm_sched_start(&ring->sched, 0);
}
if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
if (!amdgpu_ring_sched_ready(ring))
continue;
- drm_sched_start(&ring->sched);
+ drm_sched_start(&ring->sched, 0);
}
amdgpu_device_unset_mp1_state(adev);
atomic_inc(&ring->adev->gpu_reset_counter);
amdgpu_fence_driver_force_completion(ring);
if (amdgpu_ring_sched_ready(ring))
- drm_sched_start(&ring->sched);
+ drm_sched_start(&ring->sched, 0);
goto exit;
}
}
drm_sched_resubmit_jobs(&gpu->sched);
- drm_sched_start(&gpu->sched);
+ drm_sched_start(&gpu->sched, 0);
return DRM_GPU_SCHED_STAT_NOMINAL;
out_no_timeout:
}
}
- drm_sched_start(&queue->scheduler);
+ drm_sched_start(&queue->scheduler, 0);
}
/**
}
mutex_unlock(&pvr_dev->queues.lock);
- drm_sched_start(sched);
+ drm_sched_start(sched, 0);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
lima_pm_idle(ldev);
drm_sched_resubmit_jobs(&pipe->base);
- drm_sched_start(&pipe->base);
+ drm_sched_start(&pipe->base, 0);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
else
NV_PRINTK(warn, job->cli, "Generic job timeout.\n");
- drm_sched_start(sched);
+ drm_sched_start(sched, 0);
return stat;
}
/* Restart the schedulers */
for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_start(&pfdev->js->queue[i].sched);
+ drm_sched_start(&pfdev->js->queue[i].sched, 0);
/* Re-enable job interrupts now that everything has been restarted. */
job_write(pfdev, JOB_INT_MASK,
static void panthor_vm_start(struct panthor_vm *vm)
{
- drm_sched_start(&vm->sched);
+ drm_sched_start(&vm->sched, 0);
}
/**
list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
job->base.s_fence->parent = dma_fence_get(job->done_fence);
- drm_sched_start(&queue->scheduler);
+ drm_sched_start(&queue->scheduler, 0);
}
static void panthor_group_stop(struct panthor_group *group)
* drm_sched_start - recover jobs after a reset
*
* @sched: scheduler instance
+ * @errno: error to set on the pending fences
*
*/
-void drm_sched_start(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno)
{
struct drm_sched_job *s_job, *tmp;
atomic_add(s_job->credits, &sched->credit_count);
if (!fence) {
- drm_sched_job_done(s_job, -ECANCELED);
+ drm_sched_job_done(s_job, errno ?: -ECANCELED);
continue;
}
if (dma_fence_add_callback(fence, &s_job->cb,
drm_sched_job_done_cb))
- drm_sched_job_done(s_job, fence->error);
+ drm_sched_job_done(s_job, fence->error ?: errno);
}
drm_sched_start_timeout_unlocked(sched);
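The resulting completion error per pending job, as a minimal sketch
(the helper name is hypothetical; this assumes drm_sched_job_done()
forwards its result to dma_fence_set_error() on the finished fence,
and covers the NULL-fence and already-signaled paths above):

	static int sched_start_errno(struct dma_fence *hw_fence, int errno)
	{
		if (!hw_fence)		/* job never ran: errno, else -ECANCELED */
			return errno ?: -ECANCELED;
		if (hw_fence->error)	/* signaled fence keeps its own error */
			return hw_fence->error;
		return errno;		/* otherwise the recovery errno applies */
	}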
/* Unblock schedulers and restart their jobs. */
for (q = 0; q < V3D_MAX_QUEUES; q++) {
- drm_sched_start(&v3d->queue[q].sched);
+ drm_sched_start(&v3d->queue[q].sched, 0);
}
mutex_unlock(&v3d->reset_lock);
void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched);
void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched);
void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
-void drm_sched_start(struct drm_gpu_scheduler *sched);
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno);
void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
void drm_sched_increase_karma(struct drm_sched_job *bad);
void drm_sched_reset_karma(struct drm_sched_job *bad);