git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: rework userq reset work handling
author: Christian König <christian.koenig@amd.com>
Tue, 21 Apr 2026 10:39:54 +0000 (12:39 +0200)
committer: Alex Deucher <alexander.deucher@amd.com>
Tue, 19 May 2026 16:07:42 +0000 (12:07 -0400)
It is illegal to schedule reset work from another reset work!

Fix this by scheduling the userq reset work directly on the work queue
of the reset domain.

Not fully tested, I leave that to the IGT test cases.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Prike Liang <Prike.Liang@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit fd9200ccefab94f27877d1943761d6b0ccbd89c8)

drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h

index 8bc591deb546d10ce5f7c5bb37a8fa9f17153e63..fd50da4c7b186cf9e00278cb7f168d3be4ed2427 100644 (file)
@@ -1190,7 +1190,6 @@ struct amdgpu_device {
        bool                            apu_prefer_gtt;
 
        bool                            userq_halt_for_enforce_isolation;
-       struct work_struct              userq_reset_work;
        struct amdgpu_uid *uid_info;
 
        struct amdgpu_uma_carveout_info uma_info;
index 66ca043658ff8eb8a1eb5e0144ed32818946810d..1424c98d2006a83415fea2b0762aee07ffee628f 100644 (file)
@@ -3787,7 +3787,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        }
 
        INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
-       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
 
        amdgpu_coredump_init(adev);
 
@@ -5478,7 +5477,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
        if (!amdgpu_sriov_vf(adev))
                cancel_work(&adev->reset_work);
 #endif
-       cancel_work(&adev->userq_reset_work);
+       amdgpu_userq_mgr_cancel_reset_work(adev);
 
        if (adev->kfd.dev)
                cancel_work(&adev->kfd.reset_work);
index 8841955927bbf1dd6c2afe58e142c45fd8c7ea46..aa6d4c71fba69d18d50e667d1cd673b85f7b7749 100644 (file)
@@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
        return false;
 }
 
-static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
-{
-       if (amdgpu_device_should_recover_gpu(adev)) {
-               amdgpu_reset_domain_schedule(adev->reset_domain,
-                                            &adev->userq_reset_work);
-               /* Wait for the reset job to complete */
-               flush_work(&adev->userq_reset_work);
-       }
-}
-
-static int
-amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
 {
+       struct amdgpu_userq_mgr *uq_mgr =
+               container_of(work, struct amdgpu_userq_mgr,
+                            reset_work);
        struct amdgpu_device *adev = uq_mgr->adev;
        const int queue_types[] = {
                AMDGPU_RING_TYPE_COMPUTE,
@@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
        };
        const int num_queue_types = ARRAY_SIZE(queue_types);
        bool gpu_reset = false;
-       int r = 0;
-       int i;
+       int i, r;
 
        if (unlikely(adev->debug_disable_gpu_ring_reset)) {
                dev_err(adev->dev, "userq reset disabled by debug mask\n");
-               return 0;
+               return;
        }
 
        /*
@@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
         * skip all reset detection logic
         */
        if (!amdgpu_gpu_recovery)
-               return 0;
+               return;
 
        /*
         * Iterate through all queue types to detect and reset problematic queues
@@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
                }
        }
 
-       if (gpu_reset)
-               amdgpu_userq_gpu_reset(adev);
+       if (gpu_reset) {
+               struct amdgpu_reset_context reset_context;
 
-       return r;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               reset_context.src = AMDGPU_RESET_SRC_USERQ;
+               set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+       }
 }
 
 static void amdgpu_userq_hang_detect_work(struct work_struct *work)
@@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
                container_of(work, struct amdgpu_usermode_queue,
                             hang_detect_work.work);
 
-       amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
+       /*
+        * Don't schedule the work here! Scheduling or queueing work from one reset
+        * handler to another is illegal if you don't take extra precautions!
+        */
+       amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
 }
 
 /*
@@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
                break;
        }
 
-       schedule_delayed_work(&queue->hang_detect_work,
-                    msecs_to_jiffies(timeout_ms));
+       queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
+                          msecs_to_jiffies(timeout_ms));
 }
 
 void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
@@ -1259,28 +1263,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
        if (ret) {
                drm_file_err(uq_mgr->file,
                             "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
-               amdgpu_userq_detect_and_reset_queues(uq_mgr);
+               amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
+                                            &uq_mgr->reset_work);
+               flush_work(&uq_mgr->reset_work);
        }
        return ret;
 }
 
-void amdgpu_userq_reset_work(struct work_struct *work)
-{
-       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
-                                                 userq_reset_work);
-       struct amdgpu_reset_context reset_context;
-
-       memset(&reset_context, 0, sizeof(reset_context));
-
-       reset_context.method = AMD_RESET_METHOD_NONE;
-       reset_context.reset_req_dev = adev;
-       reset_context.src = AMDGPU_RESET_SRC_USERQ;
-       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
-
-       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-}
-
 static void
 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -1314,9 +1303,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
        userq_mgr->file = file_priv;
 
        INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
+       INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
        return 0;
 }
 
+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
+{
+       struct xarray *xa = &adev->userq_doorbell_xa;
+       struct amdgpu_usermode_queue *queue;
+       unsigned long flags, queue_id;
+
+       xa_lock_irqsave(xa, flags);
+       xa_for_each(xa, queue_id, queue) {
+               cancel_delayed_work(&queue->hang_detect_work);
+               cancel_work(&queue->userq_mgr->reset_work);
+       }
+       xa_unlock_irqrestore(xa, flags);
+}
+
 void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
 {
        cancel_delayed_work_sync(&userq_mgr->resume_work);
index 85f460e7c31b24e79836c2905ded34a7f4dea04e..49b33e2d6932f87af65fcd1f49d6ec360030c385 100644 (file)
@@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
        u32                     xcp_id;
        int                     priority;
        struct dentry           *debugfs_queue;
-       struct delayed_work hang_detect_work;
+
+       /**
+        * @hang_detect_work:
+        *
+        * Delayed work which runs when userq_fences time out.
+        */
+       struct delayed_work     hang_detect_work;
        struct kref             refcount;
 
        struct list_head        userq_va_list;
@@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
        struct amdgpu_device            *adev;
        struct delayed_work             resume_work;
        struct drm_file                 *file;
+
+       /**
+        * @reset_work:
+        *
+        * Reset work, scheduled when eviction fails and run directly on hang detect.
+        */
+       struct work_struct              reset_work;
        atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
 };
 
@@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp
 int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
                          struct amdgpu_device *adev);
 
+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
 void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
 void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);