From: YiPeng Chai Date: Mon, 21 Jul 2025 07:14:03 +0000 (+0800) Subject: drm/amdgpu: Improve ras fatal error handling function X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=408bd841ad24fd9891a1cd9ca11cf4f48d9667dc;p=thirdparty%2Fkernel%2Flinux.git drm/amdgpu: Improve ras fatal error handling function In multi-gpu case, a fatal error will generate several fatal error interrupts. After improving this function, the ras module can reuse this function to only handle the first interrupt. V3: Initialize event_id using RAS_EVENT_INVALID_ID. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9aa4b93ac6afd..9e632adc7746c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4650,20 +4650,18 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type return id; } -void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); enum ras_event_type type = RAS_EVENT_TYPE_FATAL; - u64 event_id; + u64 event_id = RAS_EVENT_INVALID_ID; - if (amdgpu_ras_mark_ras_event(adev, type)) { - dev_err(adev->dev, - "uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n"); - return; - } + if (amdgpu_uniras_enabled(adev)) + return 0; - event_id = amdgpu_ras_acquire_event_id(adev, type); + if (!amdgpu_ras_mark_ras_event(adev, type)) + event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); @@ -4672,6 +4670,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; amdgpu_ras_reset_gpu(adev); } + + return -EBUSY; } bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 9f21b6cf87245..556cf4d7b5ef8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -910,7 +910,7 @@ static inline void amdgpu_ras_intr_cleared(void) atomic_set(&amdgpu_ras_in_intr, 0); } -void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); +int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c index 40071b8763336..f21cd55a25be9 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c @@ -29,8 +29,13 @@ static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data) { struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int ret; uint64_t seq_no; + ret = amdgpu_ras_global_ras_isr(adev); + if (ret) + return ret; + seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE); RAS_DEV_INFO(adev, "{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",