drm/amdgpu/sdma: allow caller to handle kernel rings in engine reset
author    Alex Deucher <alexander.deucher@amd.com>
          Thu, 26 Jun 2025 12:58:21 +0000 (08:58 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
          Mon, 7 Jul 2025 17:48:25 +0000 (13:48 -0400)
Add a parameter to amdgpu_sdma_reset_engine() to let the
caller handle the kernel rings itself.  This allows the
unprocessed state of the kernel rings to be backed up when
the reset comes in via the drm scheduler rather than KFD.

Reviewed-by: Jesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
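
For context, a minimal caller-side sketch (not part of this commit) of how a
scheduler-initiated reset path might use the new parameter.  The
example_backup_unprocessed_state() and example_restore_unprocessed_state()
helpers are hypothetical placeholders standing in for whatever state handling
the caller does; they are not amdgpu APIs:

/*
 * Illustrative sketch only: a reset path that manages the kernel
 * rings itself and therefore passes caller_handles_kernel_queues =
 * true, so amdgpu_sdma_reset_engine() skips its internal
 * drm_sched_wqueue_stop()/start() and fence handling.
 */
static int example_sched_sdma_reset(struct amdgpu_device *adev,
				    uint32_t instance_id)
{
	struct amdgpu_sdma_instance *sdma_instance =
		&adev->sdma.instance[instance_id];
	struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
	int r;

	/* Caller stops the scheduler and saves any unprocessed state.
	 * A real caller would treat the page ring the same way when
	 * adev->sdma.has_page_queue is set.
	 */
	drm_sched_wqueue_stop(&gfx_ring->sched);
	example_backup_unprocessed_state(gfx_ring);	/* hypothetical */

	/* The engine reset skips its kernel-queue handling. */
	r = amdgpu_sdma_reset_engine(adev, instance_id, true);

	/* Caller restores state and restarts the scheduler. */
	if (!r)
		example_restore_unprocessed_state(gfx_ring);	/* hypothetical */
	drm_sched_wqueue_start(&gfx_ring->sched);

	return r;
}

All of the call sites converted in the diff below pass false, preserving the
previous behavior in which the reset itself manages the kernel queues.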
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 56939bb1d1a951fadf9bd04c133880bb589443de..8b8a04138711cf3c929ef8be2ac3b374e7aa750f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -545,10 +545,13 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
  * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
  * @adev: Pointer to the AMDGPU device
  * @instance_id: Logical ID of the SDMA engine instance to reset
+ * @caller_handles_kernel_queues: Skip kernel queue processing. Caller
+ * will handle it.
  *
  * Returns: 0 on success, or a negative error code on failure.
  */
-int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
+int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
+                            bool caller_handles_kernel_queues)
 {
        int ret = 0;
        struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
@@ -556,14 +559,17 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
        struct amdgpu_ring *page_ring = &sdma_instance->page;
 
        mutex_lock(&sdma_instance->engine_reset_mutex);
-       /* Stop the scheduler's work queue for the GFX and page rings if they are running.
-       * This ensures that no new tasks are submitted to the queues while
-       * the reset is in progress.
-       */
-       drm_sched_wqueue_stop(&gfx_ring->sched);
 
-       if (adev->sdma.has_page_queue)
-               drm_sched_wqueue_stop(&page_ring->sched);
+       if (!caller_handles_kernel_queues) {
+               /* Stop the scheduler's work queue for the GFX and page rings if they are running.
+                * This ensures that no new tasks are submitted to the queues while
+                * the reset is in progress.
+                */
+               drm_sched_wqueue_stop(&gfx_ring->sched);
+
+               if (adev->sdma.has_page_queue)
+                       drm_sched_wqueue_stop(&page_ring->sched);
+       }
 
        if (sdma_instance->funcs->stop_kernel_queue) {
                sdma_instance->funcs->stop_kernel_queue(gfx_ring);
@@ -585,16 +591,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
        }
 
 exit:
-       /* Restart the scheduler's work queue for the GFX and page rings
-        * if they were stopped by this function. This allows new tasks
-        * to be submitted to the queues after the reset is complete.
-        */
-       if (!ret) {
-               amdgpu_fence_driver_force_completion(gfx_ring);
-               drm_sched_wqueue_start(&gfx_ring->sched);
-               if (adev->sdma.has_page_queue) {
-                       amdgpu_fence_driver_force_completion(page_ring);
-                       drm_sched_wqueue_start(&page_ring->sched);
+       if (!caller_handles_kernel_queues) {
+               /* Restart the scheduler's work queue for the GFX and page rings
+                * if they were stopped by this function. This allows new tasks
+                * to be submitted to the queues after the reset is complete.
+                */
+               if (!ret) {
+                       amdgpu_fence_driver_force_completion(gfx_ring);
+                       drm_sched_wqueue_start(&gfx_ring->sched);
+                       if (adev->sdma.has_page_queue) {
+                               amdgpu_fence_driver_force_completion(page_ring);
+                               drm_sched_wqueue_start(&page_ring->sched);
+                       }
                }
        }
        mutex_unlock(&sdma_instance->engine_reset_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index e5f8951bbb6f49534732460c8a02ad09909f0b44..34311f32be4c6ddc92ba6872cda64440015e4990 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -172,7 +172,8 @@ struct amdgpu_buffer_funcs {
                                 uint32_t byte_count);
 };
 
-int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id);
+int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
+                            bool caller_handles_kernel_queues);
 
 #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
 #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index a7e1dbe03b2942006de7b8a9a451c0fde03074d9..20fad2525969b2d386fc37d62605c4604055809e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1668,7 +1668,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring,
                return -EOPNOTSUPP;
 
        amdgpu_amdkfd_suspend(adev, true);
-       r = amdgpu_sdma_reset_engine(adev, id);
+       r = amdgpu_sdma_reset_engine(adev, id, false);
        amdgpu_amdkfd_resume(adev, true);
        return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index ed1706da7deecd1521b74849055f7656af9efdf3..5a1098bdd825631b885d14c6efad796c96fd2a38 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1548,7 +1548,7 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring,
        int r;
 
        amdgpu_amdkfd_suspend(adev, true);
-       r = amdgpu_sdma_reset_engine(adev, inst_id);
+       r = amdgpu_sdma_reset_engine(adev, inst_id, false);
        amdgpu_amdkfd_resume(adev, true);
 
        return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index b87a4b44fa939c549bbea9a513402c8374121789..6843c2c3d71f5d1d55baa9fb70cf40f14a530290 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1461,7 +1461,7 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring,
        int r;
 
        amdgpu_amdkfd_suspend(adev, true);
-       r = amdgpu_sdma_reset_engine(adev, inst_id);
+       r = amdgpu_sdma_reset_engine(adev, inst_id, false);
        amdgpu_amdkfd_resume(adev, true);
 
        return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 500f5155203892440fbe3ef187a8d8fe724c0ae7..2d91027e2a747dc18600c2bc4a4a42895a26723a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2312,7 +2312,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
                                continue;
 
                        /* Reset engine and check. */
-                       if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) ||
+                       if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
                            dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
                            !set_sdma_queue_as_reset(dqm, doorbell_off)) {
                                r = -ENOTRECOVERABLE;