git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues
author: Christian König <christian.koenig@amd.com>
Mon, 20 Apr 2026 13:13:57 +0000 (15:13 +0200)
committer: Alex Deucher <alexander.deucher@amd.com>
Mon, 11 May 2026 21:47:04 +0000 (17:47 -0400)
Well, the reset handling seems broken on multiple levels.

As a first step toward fixing this, remove most calls to the hang detection.
That function should only be called after we run into a timeout, and *NOT*
as a random check spread over the code in multiple places.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 71bea36b54ccfb14cbc90f94267af6369af4e702)

drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c

index 2d4f159f33623e551c1d5db3a29b893fe68dc5ab..ba03d2a42e1ef9dfa5cd37ec414ea34676c57c16 100644 (file)
@@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue)
        struct amdgpu_device *adev = uq_mgr->adev;
        const struct amdgpu_userq_funcs *userq_funcs =
                adev->userq_funcs[queue->queue_type];
-       bool found_hung_queue = false;
-       int r = 0;
+       int r;
 
        if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
                r = userq_funcs->preempt(queue);
                if (r) {
                        queue->state = AMDGPU_USERQ_STATE_HUNG;
-                       found_hung_queue = true;
+                       return r;
                } else {
                        queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
                }
        }
-
-       if (found_hung_queue)
-               amdgpu_userq_detect_and_reset_queues(uq_mgr);
-
-       return r;
+       return 0;
 }
 
 static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue)
@@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue)
        struct amdgpu_device *adev = uq_mgr->adev;
        const struct amdgpu_userq_funcs *userq_funcs =
                adev->userq_funcs[queue->queue_type];
-       bool found_hung_queue = false;
-       int r = 0;
+       int r;
 
        if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
-               (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
+           (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
+
                r = userq_funcs->unmap(queue);
                if (r) {
                        queue->state = AMDGPU_USERQ_STATE_HUNG;
-                       found_hung_queue = true;
+                       return r;
                } else {
                        queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
                }
        }
 
-       if (found_hung_queue)
-               amdgpu_userq_detect_and_reset_queues(uq_mgr);
-
-       return r;
+       return 0;
 }
 
 static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
@@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
        struct amdgpu_device *adev = uq_mgr->adev;
        const struct amdgpu_userq_funcs *userq_funcs =
                adev->userq_funcs[queue->queue_type];
-       int r = 0;
+       int r;
 
        if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
                r = userq_funcs->map(queue);
                if (r) {
                        queue->state = AMDGPU_USERQ_STATE_HUNG;
-                       amdgpu_userq_detect_and_reset_queues(uq_mgr);
+                       return r;
                } else {
                        queue->state = AMDGPU_USERQ_STATE_MAPPED;
                }
        }
 
-       return r;
+       return 0;
 }
 
 static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue)
@@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
 #if defined(CONFIG_DEBUG_FS)
        debugfs_remove_recursive(queue->debugfs_queue);
 #endif
-       amdgpu_userq_detect_and_reset_queues(uq_mgr);
        r = amdgpu_userq_unmap_helper(queue);
        atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
        amdgpu_userq_cleanup(queue);
@@ -1264,7 +1255,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
        unsigned long queue_id;
        int ret = 0, r;
 
-       amdgpu_userq_detect_and_reset_queues(uq_mgr);
        /* Try to unmap all the queues in this process ctx */
        xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
                r = amdgpu_userq_preempt_helper(queue);
@@ -1272,9 +1262,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
                        ret = r;
        }
 
-       if (ret)
+       if (ret) {
                drm_file_err(uq_mgr->file,
                             "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
+               amdgpu_userq_detect_and_reset_queues(uq_mgr);
+       }
        return ret;
 }
 
@@ -1374,7 +1366,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
                uqm = queue->userq_mgr;
                cancel_delayed_work_sync(&uqm->resume_work);
                guard(mutex)(&uqm->userq_mutex);
-               amdgpu_userq_detect_and_reset_queues(uqm);
                if (adev->in_s0ix)
                        r = amdgpu_userq_preempt_helper(queue);
                else
@@ -1433,7 +1424,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
                if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
                     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
                    (queue->xcp_id == idx)) {
-                       amdgpu_userq_detect_and_reset_queues(uqm);
                        r = amdgpu_userq_preempt_helper(queue);
                        if (r)
                                ret = r;