git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdkfd: Handle GPU reset and drain retry fault race
author Philip Yang <Philip.Yang@amd.com>
Wed, 19 Nov 2025 21:32:45 +0000 (16:32 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 10 Dec 2025 22:37:45 +0000 (17:37 -0500)
Only check and drain IH1 ring if CAM is not enabled.

If the GPU is under reset, don't access the IH rings to drain retry faults.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 015fc952fa8111b8338a72eee096700f3309c36d..3235774f3b64c9ec35099e66878d4bf7c2957335 100644 (file)
@@ -33,6 +33,7 @@
 #include "amdgpu_hmm.h"
 #include "amdgpu.h"
 #include "amdgpu_xgmi.h"
+#include "amdgpu_reset.h"
 #include "kfd_priv.h"
 #include "kfd_svm.h"
 #include "kfd_migrate.h"
@@ -2369,6 +2370,9 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
 
                pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
 
+               if (!down_read_trylock(&pdd->dev->adev->reset_domain->sem))
+                       continue;
+
                amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
                                pdd->dev->adev->irq.retry_cam_enabled ?
                                &pdd->dev->adev->irq.ih :
@@ -2378,6 +2382,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
                        amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
                                &pdd->dev->adev->irq.ih_soft);
 
+               up_read(&pdd->dev->adev->reset_domain->sem);
 
                pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
        }
@@ -2561,7 +2566,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
                adev = pdd->dev->adev;
 
                /* Check and drain ih1 ring if cam not available */
-               if (adev->irq.ih1.ring_size) {
+               if (!adev->irq.retry_cam_enabled && adev->irq.ih1.ring_size) {
                        ih = &adev->irq.ih1;
                        checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
                        if (ih->rptr != checkpoint_wptr) {