From d3ff65243a52afa85166abaa8d00a44c17691dbd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 1 Dec 2025 14:46:53 -0500 Subject: [PATCH] drm/amdgpu: add a helper for processing recoverable GPUVM faults Add a common helper to remove the repeated logic from each gmc module. Suggested-by: Lijo Lazar Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 ++++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 23 ++---------- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 23 ++---------- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 23 ++---------- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 43 ++++------------------ 6 files changed, 69 insertions(+), 97 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 4abed753fc2df..8ac92e7bed315 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -524,6 +524,54 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, } while (fault->timestamp < tmp); } +int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry, + u64 addr, + u32 cam_index, + u32 node_id, + bool write_fault) +{ + int ret; + + if (adev->irq.retry_cam_enabled) { + /* Delegate it to a different ring if the hardware hasn't + * already done it. + */ + if (entry->ih == &adev->irq.ih) { + amdgpu_irq_delegate(adev, entry, 8); + return 1; + } + + ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, + addr, entry->timestamp, write_fault); + WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); + if (ret) + return 1; + } else { + /* Process it only if it's the first fault for this address */ + if (entry->ih != &adev->irq.ih_soft && + amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, + entry->timestamp)) + return 1; + + /* Delegate it to a different ring if the hardware hasn't + * already done it. + */ + if (entry->ih == &adev->irq.ih) { + amdgpu_irq_delegate(adev, entry, 8); + return 1; + } + + /* Try to handle the recoverable page faults by filling page + * tables + */ + if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, + addr, entry->timestamp, write_fault)) + return 1; + } + return 0; +} + int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev) { int r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index b62fa7e92c79d..e8e8bfa098c3e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -425,6 +425,12 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint16_t pasid, uint64_t timestamp); void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, uint16_t pasid); +int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry, + u64 addr, + u32 cam_index, + u32 node_id, + bool write_fault); int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev); void amdgpu_gmc_ras_fini(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 47558e572553a..0b385a15194d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -115,27 +115,10 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, addr |= ((u64)entry->src_data[1] & 0xf) << 44; if (retry_fault) { + int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0, + write_fault); /* Returning 1 here also prevents sending the IV to the KFD */ - - /* Process it only if it's the first fault for this address */ - if (entry->ih != &adev->irq.ih_soft && - amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, - entry->timestamp)) - return 1; - - /* Delegate it to a different ring if the hardware hasn't - * already done it. - */ - if (entry->ih == &adev->irq.ih) { - amdgpu_irq_delegate(adev, entry, 8); - return 1; - } - - /* Try to handle the recoverable page faults by filling page - * tables - */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, - entry->timestamp, write_fault)) + if (ret == 1) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index ba59ee8e398a8..7a1f0742754a6 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -114,27 +114,10 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev, addr |= ((u64)entry->src_data[1] & 0xf) << 44; if (retry_fault) { + int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0, + write_fault); /* Returning 1 here also prevents sending the IV to the KFD */ - - /* Process it only if it's the first fault for this address */ - if (entry->ih != &adev->irq.ih_soft && - amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, - entry->timestamp)) - return 1; - - /* Delegate it to a different ring if the hardware hasn't - * already done it. - */ - if (entry->ih == &adev->irq.ih) { - amdgpu_irq_delegate(adev, entry, 8); - return 1; - } - - /* Try to handle the recoverable page faults by filling page - * tables - */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, - entry->timestamp, write_fault)) + if (ret == 1) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index dfb06baea1ff1..145fcefd1c783 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -110,27 +110,10 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev, hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; if (retry_fault) { + int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0, + write_fault); /* Returning 1 here also prevents sending the IV to the KFD */ - - /* Process it only if it's the first fault for this address */ - if (entry->ih != &adev->irq.ih_soft && - amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, - entry->timestamp)) - return 1; - - /* Delegate it to a different ring if the hardware hasn't - * already done it. - */ - if (entry->ih == &adev->irq.ih) { - amdgpu_irq_delegate(adev, entry, 8); - return 1; - } - - /* Try to handle the recoverable page faults by filling page - * tables - */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, - entry->timestamp, write_fault)) + if (ret == 1) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 778ad7ac6d086..97a04e3171f2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -583,44 +583,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, hub = &adev->vmhub[vmhub]; if (retry_fault) { - if (adev->irq.retry_cam_enabled) { - /* Delegate it to a different ring if the hardware hasn't - * already done it. - */ - if (entry->ih == &adev->irq.ih) { - amdgpu_irq_delegate(adev, entry, 8); - return 1; - } - - cam_index = entry->src_data[2] & 0x3ff; + cam_index = entry->src_data[2] & 0x3ff; - ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, entry->timestamp, write_fault); - WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); - if (ret) - return 1; - } else { - /* Process it only if it's the first fault for this address */ - if (entry->ih != &adev->irq.ih_soft && - amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, - entry->timestamp)) - return 1; - - /* Delegate it to a different ring if the hardware hasn't - * already done it. - */ - if (entry->ih == &adev->irq.ih) { - amdgpu_irq_delegate(adev, entry, 8); - return 1; - } - - /* Try to handle the recoverable page faults by filling page - * tables - */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, entry->timestamp, write_fault)) - return 1; - } + ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, cam_index, node_id, + write_fault); + /* Returning 1 here also prevents sending the IV to the KFD */ + if (ret == 1) + return 1; } if (kgd2kfd_vmfault_fast_path(adev, entry, retry_fault)) -- 2.47.3