git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: add a helper for processing recoverable GPUVM faults
author: Alex Deucher <alexander.deucher@amd.com>
Mon, 1 Dec 2025 19:46:53 +0000 (14:46 -0500)
committer: Alex Deucher <alexander.deucher@amd.com>
Mon, 8 Dec 2025 19:14:38 +0000 (14:14 -0500)
Add a common helper to remove the repeated logic from each
gmc module.

Suggested-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 4abed753fc2dfcffe1ce8fe1f290ec23eb03c9e1..8ac92e7bed31506e583735a73963291afb2ded69 100644 (file)
@@ -524,6 +524,54 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
        } while (fault->timestamp < tmp);
 }
 
+int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
+                                 struct amdgpu_iv_entry *entry,
+                                 u64 addr,
+                                 u32 cam_index,
+                                 u32 node_id,
+                                 bool write_fault)
+{
+       int ret;
+
+       if (adev->irq.retry_cam_enabled) {
+               /* Delegate it to a different ring if the hardware hasn't
+                * already done it.
+                */
+               if (entry->ih == &adev->irq.ih) {
+                       amdgpu_irq_delegate(adev, entry, 8);
+                       return 1;
+               }
+
+               ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
+                                            addr, entry->timestamp, write_fault);
+               WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
+               if (ret)
+                       return 1;
+       } else {
+               /* Process it only if it's the first fault for this address */
+               if (entry->ih != &adev->irq.ih_soft &&
+                   amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
+                                            entry->timestamp))
+                       return 1;
+
+               /* Delegate it to a different ring if the hardware hasn't
+                * already done it.
+                */
+               if (entry->ih == &adev->irq.ih) {
+                       amdgpu_irq_delegate(adev, entry, 8);
+                       return 1;
+               }
+
+               /* Try to handle the recoverable page faults by filling page
+                * tables
+                */
+               if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
+                                          addr, entry->timestamp, write_fault))
+                       return 1;
+       }
+       return 0;
+}
+
 int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
 {
        int r;
index b62fa7e92c79d57da02e79d149a95f7f2ea12963..e8e8bfa098c3ecf88417de99fc4e982f06b4fd12 100644 (file)
@@ -425,6 +425,12 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
                              uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
                                     uint16_t pasid);
+int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
+                                 struct amdgpu_iv_entry *entry,
+                                 u64 addr,
+                                 u32 cam_index,
+                                 u32 node_id,
+                                 bool write_fault);
 int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
index 47558e572553ad3bd8e9779e13020b908f85ecf3..0b385a15194d98ab89471682cb5793dfc5e7e3e8 100644 (file)
@@ -115,27 +115,10 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
        addr |= ((u64)entry->src_data[1] & 0xf) << 44;
 
        if (retry_fault) {
+               int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0,
+                                                       write_fault);
                /* Returning 1 here also prevents sending the IV to the KFD */
-
-               /* Process it only if it's the first fault for this address */
-               if (entry->ih != &adev->irq.ih_soft &&
-                   amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-                                            entry->timestamp))
-                       return 1;
-
-               /* Delegate it to a different ring if the hardware hasn't
-                * already done it.
-                */
-               if (entry->ih == &adev->irq.ih) {
-                       amdgpu_irq_delegate(adev, entry, 8);
-                       return 1;
-               }
-
-               /* Try to handle the recoverable page faults by filling page
-                * tables
-                */
-               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
-                                          entry->timestamp, write_fault))
+               if (ret == 1)
                        return 1;
        }
 
index ba59ee8e398a807ed54dd44fdc6d4ec7c89c02d0..7a1f0742754a6977caad4212ac2a01dd687f73f3 100644 (file)
@@ -114,27 +114,10 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
        addr |= ((u64)entry->src_data[1] & 0xf) << 44;
 
        if (retry_fault) {
+               int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0,
+                                                       write_fault);
                /* Returning 1 here also prevents sending the IV to the KFD */
-
-               /* Process it only if it's the first fault for this address */
-               if (entry->ih != &adev->irq.ih_soft &&
-                   amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-                                            entry->timestamp))
-                       return 1;
-
-               /* Delegate it to a different ring if the hardware hasn't
-                * already done it.
-                */
-               if (entry->ih == &adev->irq.ih) {
-                       amdgpu_irq_delegate(adev, entry, 8);
-                       return 1;
-               }
-
-               /* Try to handle the recoverable page faults by filling page
-                * tables
-                */
-               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
-                                          entry->timestamp, write_fault))
+               if (ret == 1)
                        return 1;
        }
 
index dfb06baea1ff191fe8249f3837e5cd7792d1e137..145fcefd1c7839d4df581bb9702a527b68ceb94d 100644 (file)
@@ -110,27 +110,10 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev,
                hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
 
        if (retry_fault) {
+               int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0,
+                                                       write_fault);
                /* Returning 1 here also prevents sending the IV to the KFD */
-
-               /* Process it only if it's the first fault for this address */
-               if (entry->ih != &adev->irq.ih_soft &&
-                   amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-                                            entry->timestamp))
-                       return 1;
-
-               /* Delegate it to a different ring if the hardware hasn't
-                * already done it.
-                */
-               if (entry->ih == &adev->irq.ih) {
-                       amdgpu_irq_delegate(adev, entry, 8);
-                       return 1;
-               }
-
-               /* Try to handle the recoverable page faults by filling page
-                * tables
-                */
-               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
-                                          entry->timestamp, write_fault))
+               if (ret == 1)
                        return 1;
        }
 
index 778ad7ac6d086cd6021254218b4cc0006788c4dc..97a04e3171f2dfb29085606fb43928fb3bcbeb67 100644 (file)
@@ -583,44 +583,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
        hub = &adev->vmhub[vmhub];
 
        if (retry_fault) {
-               if (adev->irq.retry_cam_enabled) {
-                       /* Delegate it to a different ring if the hardware hasn't
-                        * already done it.
-                        */
-                       if (entry->ih == &adev->irq.ih) {
-                               amdgpu_irq_delegate(adev, entry, 8);
-                               return 1;
-                       }
-
-                       cam_index = entry->src_data[2] & 0x3ff;
+               cam_index = entry->src_data[2] & 0x3ff;
 
-                       ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-                                                    addr, entry->timestamp, write_fault);
-                       WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
-                       if (ret)
-                               return 1;
-               } else {
-                       /* Process it only if it's the first fault for this address */
-                       if (entry->ih != &adev->irq.ih_soft &&
-                           amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-                                            entry->timestamp))
-                               return 1;
-
-                       /* Delegate it to a different ring if the hardware hasn't
-                        * already done it.
-                        */
-                       if (entry->ih == &adev->irq.ih) {
-                               amdgpu_irq_delegate(adev, entry, 8);
-                               return 1;
-                       }
-
-                       /* Try to handle the recoverable page faults by filling page
-                        * tables
-                        */
-                       if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-                                                  addr, entry->timestamp, write_fault))
-                               return 1;
-               }
+               ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, cam_index, node_id,
+                                                   write_fault);
+               /* Returning 1 here also prevents sending the IV to the KFD */
+               if (ret == 1)
+                       return 1;
        }
 
        if (kgd2kfd_vmfault_fast_path(adev, entry, retry_fault))