#include <drm/drm_auth.h>
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
#include "amdgpu.h"
+#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_hmm.h"
return userq_ip_mask;
}
+static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
+ enum amdgpu_ring_type ring_type, int reset_type)
+{
+
+ if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
+ return false;
+
+ switch (ring_type) {
+ case AMDGPU_RING_TYPE_GFX:
+ if (adev->gfx.gfx_supported_reset & reset_type)
+ return true;
+ break;
+ case AMDGPU_RING_TYPE_COMPUTE:
+ if (adev->gfx.compute_supported_reset & reset_type)
+ return true;
+ break;
+ case AMDGPU_RING_TYPE_SDMA:
+ if (adev->sdma.supported_reset & reset_type)
+ return true;
+ break;
+ case AMDGPU_RING_TYPE_VCN_DEC:
+ case AMDGPU_RING_TYPE_VCN_ENC:
+ if (adev->vcn.supported_reset & reset_type)
+ return true;
+ break;
+ case AMDGPU_RING_TYPE_VCN_JPEG:
+ if (adev->jpeg.supported_reset & reset_type)
+ return true;
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->userq_reset_work);
+ /* Wait for the reset job to complete */
+ flush_work(&adev->userq_reset_work);
+ }
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const int queue_types[] = {
+ AMDGPU_RING_TYPE_COMPUTE,
+ AMDGPU_RING_TYPE_GFX,
+ AMDGPU_RING_TYPE_SDMA
+ };
+ const int num_queue_types = ARRAY_SIZE(queue_types);
+ bool gpu_reset = false;
+ int r = 0;
+ int i;
+
+ /* Warning if current process mutex is not held */
+ WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ return 0;
+ }
+
+ /*
+ * If GPU recovery feature is disabled system-wide,
+ * skip all reset detection logic
+ */
+ if (!amdgpu_gpu_recovery)
+ return 0;
+
+ /*
+ * Iterate through all queue types to detect and reset problematic queues
+ * Process each queue type in the defined order
+ */
+ for (i = 0; i < num_queue_types; i++) {
+ int ring_type = queue_types[i];
+ const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
+
+ if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
+ continue;
+
+ if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
+ funcs && funcs->detect_and_reset) {
+ r = funcs->detect_and_reset(adev, ring_type);
+ if (r) {
+ gpu_reset = true;
+ break;
+ }
+ }
+ }
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ return r;
+}
+
static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
struct amdgpu_bo_va_mapping *va_map, u64 addr)
{
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool found_hung_queue = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->preempt(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ found_hung_queue = true;
} else {
queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
}
}
+ if (found_hung_queue)
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
return r;
}
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool found_hung_queue = false;
int r = 0;
if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
r = userq_funcs->unmap(uq_mgr, queue);
- if (r)
+ if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
- else
+ found_hung_queue = true;
+ } else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+ }
}
+
+ if (found_hung_queue)
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
return r;
}
r = userq_funcs->map(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
+
return r;
}
amdgpu_bo_unreserve(queue->db_obj.obj);
}
amdgpu_bo_unref(&queue->db_obj.obj);
-
+ atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
#if defined(CONFIG_DEBUG_FS)
debugfs_remove_recursive(queue->debugfs_queue);
#endif
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
/*TODO: It requires a reset for userq hw unmap error*/
if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
kfree(queue_name);
args->out.queue_id = qid;
+ atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
unlock:
mutex_unlock(&uq_mgr->userq_mutex);
unsigned long queue_id;
int ret = 0, r;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
r = amdgpu_userq_preempt_helper(uq_mgr, queue);
return ret;
}
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+ struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+ userq_reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
{
- int ret;
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ int ret;
/* Wait for any pending userqueue fence work to finish */
ret = amdgpu_userq_wait_for_signal(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
ret = amdgpu_userq_evict_all(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
cancel_delayed_work_sync(&userq_mgr->resume_work);
mutex_lock(&userq_mgr->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(userq_mgr);
xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
amdgpu_userq_unmap_helper(userq_mgr, queue);
uqm = queue->userq_mgr;
cancel_delayed_work_sync(&uqm->resume_work);
guard(mutex)(&uqm->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(uqm);
if (adev->in_s0ix)
r = amdgpu_userq_preempt_helper(uqm, queue);
else
if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
(queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
(queue->xcp_id == idx)) {
+ amdgpu_userq_detect_and_reset_queues(uqm);
r = amdgpu_userq_preempt_helper(uqm, queue);
if (r)
ret = r;
return 0;
}
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+ const struct amdgpu_userq_funcs *userq_funcs;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm;
+ unsigned long queue_id;
+
+ xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+ uqm = queue->userq_mgr;
+ cancel_delayed_work_sync(&uqm->resume_work);
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ amdgpu_userq_wait_for_last_fence(uqm, queue);
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ userq_funcs->unmap(uqm, queue);
+ /* just mark all queues as hung at this point.
+ * if unmap succeeds, we could map again
+ * in amdgpu_userq_post_reset() if vram is not lost
+ */
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_fence_driver_force_completion(queue);
+ }
+ }
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+ /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+ * at this point, we should be able to map it again
+ * and continue if vram is not lost.
+ */
+ struct amdgpu_userq_mgr *uqm;
+ struct amdgpu_usermode_queue *queue;
+ const struct amdgpu_userq_funcs *userq_funcs;
+ unsigned long queue_id;
+ int r = 0;
+
+ xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
+ uqm = queue->userq_mgr;
+ if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ /* Re-map queue */
+ r = userq_funcs->map(uqm, queue);
+ if (r) {
+ dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
+ continue;
+ }
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+
+ return r;
+}