drm/amdgpu: rework userq reset work handling

author Christian König <christian.koenig@amd.com>

Tue, 21 Apr 2026 10:39:54 +0000 (12:39 +0200)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 19 May 2026 16:07:42 +0000 (12:07 -0400)
author Christian König <christian.koenig@amd.com>
Tue, 21 Apr 2026 10:39:54 +0000 (12:39 +0200)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 May 2026 16:07:42 +0000 (12:07 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 8bc591deb546d10ce5f7c5bb37a8fa9f17153e63..fd50da4c7b186cf9e00278cb7f168d3be4ed2427 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1190,7 +1190,6 @@ struct amdgpu_device {
         bool                            apu_prefer_gtt;
  
         bool                            userq_halt_for_enforce_isolation;
-       struct work_struct              userq_reset_work;
         struct amdgpu_uid *uid_info;
  
         struct amdgpu_uma_carveout_info uma_info;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 66ca043658ff8eb8a1eb5e0144ed32818946810d..1424c98d2006a83415fea2b0762aee07ffee628f 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3787,7 +3787,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         }
  
         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
-       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
  
         amdgpu_coredump_init(adev);
  
@@ -5478,7 +5477,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
         if (!amdgpu_sriov_vf(adev))
                 cancel_work(&adev->reset_work);
  #endif
-       cancel_work(&adev->userq_reset_work);
+       amdgpu_userq_mgr_cancel_reset_work(adev);
  
         if (adev->kfd.dev)
                 cancel_work(&adev->kfd.reset_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c

index 8841955927bbf1dd6c2afe58e142c45fd8c7ea46..aa6d4c71fba69d18d50e667d1cd673b85f7b7749 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
         return false;
  }
  
-static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
-{
-       if (amdgpu_device_should_recover_gpu(adev)) {
-               amdgpu_reset_domain_schedule(adev->reset_domain,
-                                            &adev->userq_reset_work);
-               /* Wait for the reset job to complete */
-               flush_work(&adev->userq_reset_work);
-       }
-}
-
-static int
-amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
  {
+       struct amdgpu_userq_mgr *uq_mgr =
+               container_of(work, struct amdgpu_userq_mgr,
+                            reset_work);
         struct amdgpu_device *adev = uq_mgr->adev;
         const int queue_types[] = {
                 AMDGPU_RING_TYPE_COMPUTE,
@@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
         };
         const int num_queue_types = ARRAY_SIZE(queue_types);
         bool gpu_reset = false;
-       int r = 0;
-       int i;
+       int i, r;
  
         if (unlikely(adev->debug_disable_gpu_ring_reset)) {
                 dev_err(adev->dev, "userq reset disabled by debug mask\n");
-               return 0;
+               return;
         }
  
         /*
@@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
          * skip all reset detection logic
          */
         if (!amdgpu_gpu_recovery)
-               return 0;
+               return;
  
         /*
          * Iterate through all queue types to detect and reset problematic queues
@@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
                 }
         }
  
-       if (gpu_reset)
-               amdgpu_userq_gpu_reset(adev);
+       if (gpu_reset) {
+               struct amdgpu_reset_context reset_context;
  
-       return r;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               reset_context.src = AMDGPU_RESET_SRC_USERQ;
+               set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+       }
  }
  
  static void amdgpu_userq_hang_detect_work(struct work_struct *work)
@@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
                 container_of(work, struct amdgpu_usermode_queue,
                              hang_detect_work.work);
  
-       amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
+       /*
+        * Don't schedule the work here! Scheduling or queueing work from one reset
+        * handler to another is illegal if you don't take extra precautions!
+        */
+       amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
  }
  
  /*
@@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
                 break;
         }
  
-       schedule_delayed_work(&queue->hang_detect_work,
-                    msecs_to_jiffies(timeout_ms));
+       queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
+                          msecs_to_jiffies(timeout_ms));
  }
  
  void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
@@ -1259,28 +1263,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
         if (ret) {
                 drm_file_err(uq_mgr->file,
                              "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
-               amdgpu_userq_detect_and_reset_queues(uq_mgr);
+               amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
+                                            &uq_mgr->reset_work);
+               flush_work(&uq_mgr->reset_work);
         }
         return ret;
  }
  
-void amdgpu_userq_reset_work(struct work_struct *work)
-{
-       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
-                                                 userq_reset_work);
-       struct amdgpu_reset_context reset_context;
-
-       memset(&reset_context, 0, sizeof(reset_context));
-
-       reset_context.method = AMD_RESET_METHOD_NONE;
-       reset_context.reset_req_dev = adev;
-       reset_context.src = AMDGPU_RESET_SRC_USERQ;
-       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
-
-       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-}
-
  static void
  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
  {
@@ -1314,9 +1303,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
         userq_mgr->file = file_priv;
  
         INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
+       INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
         return 0;
  }
  
+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
+{
+       struct xarray *xa = &adev->userq_doorbell_xa;
+       struct amdgpu_usermode_queue *queue;
+       unsigned long flags, queue_id;
+
+       xa_lock_irqsave(xa, flags);
+       xa_for_each(xa, queue_id, queue) {
+               cancel_delayed_work(&queue->hang_detect_work);
+               cancel_work(&queue->userq_mgr->reset_work);
+       }
+       xa_unlock_irqrestore(xa, flags);
+}
+
  void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
  {
         cancel_delayed_work_sync(&userq_mgr->resume_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h

index 85f460e7c31b24e79836c2905ded34a7f4dea04e..49b33e2d6932f87af65fcd1f49d6ec360030c385 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
         u32                     xcp_id;
         int                     priority;
         struct dentry           *debugfs_queue;
-       struct delayed_work hang_detect_work;
+
+       /**
+        * @hang_detect_work:
+        *
+        * Delayed work which runs when userq_fences time out.
+        */
+       struct delayed_work     hang_detect_work;
         struct kref             refcount;
  
         struct list_head        userq_va_list;
@@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
         struct amdgpu_device            *adev;
         struct delayed_work             resume_work;
         struct drm_file                 *file;
+
+       /**
+        * @reset_work:
+        *
+        * Reset work for failed evictions; also run by the hang detect work.
+        */
+       struct work_struct              reset_work;
         atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
  };
  
@@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp
  int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
                           struct amdgpu_device *adev);
  
+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
  void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
  void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
author	Christian König <christian.koenig@amd.com>
	Tue, 21 Apr 2026 10:39:54 +0000 (12:39 +0200)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 19 May 2026 16:07:42 +0000 (12:07 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h		patch \| blob \| blame \| history