]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amdgpu: Fix map/unmap queue logic
authorLijo Lazar <lijo.lazar@amd.com>
Tue, 5 Nov 2024 05:00:20 +0000 (10:30 +0530)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 5 Dec 2024 12:53:24 +0000 (13:53 +0100)
[ Upstream commit fa31798582882740f2b13d19e1bd43b4ef918e2f ]

In current logic, it calls ring_alloc followed by a ring_test. ring_test
in turn will call another ring_alloc. This is illegal usage as a
ring_alloc is expected to be closed properly with a ring_commit. Change
to commit the map/unmap queue packet first followed by a ring_test. Add a
comment about the usage of ring_test.

Also, reorder the current pre-condition checks of job hang or kiq ring
scheduler not ready. Without them being met, it is not useful to attempt
ring or memory allocations.

Fixes tag refers to the original patch which introduced this issue which
then got carried over into newer code.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Le Ma <le.ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c

index c272461d70a9a0bbeca229c77deb3ddf1e02bd5a..b397c783867391a8f8742d0881000c75238e4a62 100644 (file)
@@ -850,6 +850,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
        if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
                return -EINVAL;
 
+       if (!kiq_ring->sched.ready || adev->job_hang)
+               return 0;
+
        ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
        if (!ring_funcs)
                return -ENOMEM;
@@ -874,8 +877,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 
        kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);
 
-       if (kiq_ring->sched.ready && !adev->job_hang)
-               r = amdgpu_ring_test_helper(kiq_ring);
+       /* Submit unmap queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that unmap queues that is submitted before got
+        * processed successfully before returning.
+        */
+       r = amdgpu_ring_test_helper(kiq_ring);
 
        spin_unlock(&kiq->ring_lock);
 
index 3ff39d3ec317c0dbc8f30046d14b062d999489e5..0058f1796c75c96e57231a3acfad78275f9e6cd1 100644 (file)
@@ -522,6 +522,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
        if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
                return -EINVAL;
 
+       if (!kiq_ring->sched.ready || adev->job_hang)
+               return 0;
+       /**
+        * This is workaround: only skip kiq_ring test
+        * during ras recovery in suspend stage for gfx9.4.3
+        */
+       if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+            amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+           amdgpu_ras_in_recovery(adev))
+               return 0;
+
        spin_lock(&kiq->ring_lock);
        if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
                                        adev->gfx.num_compute_rings)) {
@@ -535,20 +546,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
                                           &adev->gfx.compute_ring[j],
                                           RESET_QUEUES, 0, 0);
        }
-
-       /**
-        * This is workaround: only skip kiq_ring test
-        * during ras recovery in suspend stage for gfx9.4.3
+       /* Submit unmap queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that unmap queues that is submitted before got
+        * processed successfully before returning.
         */
-       if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-           amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-           amdgpu_ras_in_recovery(adev)) {
-               spin_unlock(&kiq->ring_lock);
-               return 0;
-       }
+       r = amdgpu_ring_test_helper(kiq_ring);
 
-       if (kiq_ring->sched.ready && !adev->job_hang)
-               r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&kiq->ring_lock);
 
        return r;
@@ -576,8 +582,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
        if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
                return -EINVAL;
 
-       spin_lock(&kiq->ring_lock);
+       if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
+               return 0;
+
        if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
+               spin_lock(&kiq->ring_lock);
                if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
                                                adev->gfx.num_gfx_rings)) {
                        spin_unlock(&kiq->ring_lock);
@@ -590,11 +599,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
                                                   &adev->gfx.gfx_ring[j],
                                                   PREEMPT_QUEUES, 0, 0);
                }
-       }
+               /* Submit unmap queue packet */
+               amdgpu_ring_commit(kiq_ring);
 
-       if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang)
+               /*
+                * Ring test will do a basic scratch register change check.
+                * Just run this to ensure that unmap queues that is submitted
+                * before got processed successfully before returning.
+                */
                r = amdgpu_ring_test_helper(kiq_ring);
-       spin_unlock(&kiq->ring_lock);
+               spin_unlock(&kiq->ring_lock);
+       }
 
        return r;
 }
@@ -699,7 +714,13 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
                kiq->pmf->kiq_map_queues(kiq_ring,
                                         &adev->gfx.compute_ring[j]);
        }
-
+       /* Submit map queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that map queues that is submitted before got
+        * processed successfully before returning.
+        */
        r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&kiq->ring_lock);
        if (r)
@@ -750,7 +771,13 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id)
                                                 &adev->gfx.gfx_ring[j]);
                }
        }
-
+       /* Submit map queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that map queues that is submitted before got
+        * processed successfully before returning.
+        */
        r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&kiq->ring_lock);
        if (r)
index b4658c7db0e1671bffa0f669bd11d741b332b0f3..618972c43aac5378a97e9d27fbeb42f32097c061 100644 (file)
@@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device *adev)
                amdgpu_ring_write(kiq_ring, 0);
                amdgpu_ring_write(kiq_ring, 0);
        }
+       /* Submit unmap queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that unmap queues that is submitted before got
+        * processed successfully before returning.
+        */
        r = amdgpu_ring_test_helper(kiq_ring);
        if (r)
                DRM_ERROR("KCQ disable failed\n");