From: YiPeng Chai Date: Tue, 28 Oct 2025 08:18:31 +0000 (+0800) Subject: drm/amdgpu: suspend ras module before gpu reset X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=d95ca7f515cfd2e721de07e86aa79adb17575a52;p=thirdparty%2Fkernel%2Flinux.git drm/amdgpu: suspend ras module before gpu reset During gpu reset, all GPU-related resources are inaccessible. To avoid affecting ras functionality, suspend ras module before gpu reset and resume it after gpu reset is complete. V2: Rename functions to avoid misunderstanding. V3: Move flush_delayed_work to amdgpu_ras_process_pause, Move schedule_delayed_work to amdgpu_ras_process_unpause. V4: Rename functions. V5: Move the function to amdgpu_ras.c. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Reviewed-by: Hawking Zhang Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 95f7ae36e4f19..dcf6fce1c5a23 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -71,6 +71,7 @@ #include "amdgpu_xgmi.h" #include "amdgpu_ras.h" +#include "amdgpu_ras_mgr.h" #include "amdgpu_pmu.h" #include "amdgpu_fru_eeprom.h" #include "amdgpu_reset.h" @@ -6660,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, goto end_reset; } + /* Cannot be called after locking reset domain */ + amdgpu_ras_pre_reset(adev, &device_list); + /* We need to lock reset domain only once both for XGMI and single device */ amdgpu_device_recovery_get_reset_lock(adev, &device_list); @@ -6691,6 +6695,7 @@ skip_sched_resume: reset_unlock: amdgpu_device_recovery_put_reset_lock(adev, &device_list); end_reset: + amdgpu_ras_post_reset(adev, &device_list); if (hive) { mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 37999b3679573..62d2f988d88f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) type = amdgpu_ras_get_fatal_error_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { - amdgpu_ras_query_err_status(remote_adev); - amdgpu_ras_log_on_err_counter(remote_adev, type); + if (amdgpu_uniras_enabled(remote_adev)) { + amdgpu_ras_mgr_update_ras_ecc(remote_adev); + } else { + amdgpu_ras_query_err_status(remote_adev); + amdgpu_ras_log_on_err_counter(remote_adev, type); + } } } @@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr return ret; } + +void amdgpu_ras_pre_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_pre_reset(tmp_adev); + } +} + +void amdgpu_ras_post_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_post_reset(tmp_adev); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 674bcd3c814c0..ff44190d7d98e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -1039,4 +1039,9 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, const char *fmt, ...); bool amdgpu_ras_is_rma(struct amdgpu_device *adev); + +void amdgpu_ras_pre_reset(struct amdgpu_device *adev, + struct list_head *device_list); +void amdgpu_ras_post_reset(struct amdgpu_device *adev, + struct list_head *device_list); #endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c index adb01bdee0037..afe8135b62586 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -624,3 +624,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, return ret; } + +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) { + RAS_DEV_ERR(adev, "Invalid ras suspend!\n"); + return -EPERM; + } + + amdgpu_ras_process_pre_reset(adev); + return 0; +} + +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) { + RAS_DEV_ERR(adev, "Invalid ras resume!\n"); + return -EPERM; + } + + amdgpu_ras_process_post_reset(adev); + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h index 42f190a8feb94..8fb7eb4b8f132 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h @@ -52,6 +52,9 @@ struct amdgpu_ras_mgr { struct ras_event_manager ras_event_mgr; uint64_t last_poison_consumption_seqno; bool ras_is_ready; + + bool is_paused; + struct completion ras_event_done; }; extern const struct amdgpu_ip_block_version ras_v1_0_ip_block; @@ -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev); int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, uint32_t cmd_id, void *input, uint32_t input_size, void *output, uint32_t out_size); +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev); +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c index 6727fc9a2b9b7..5782c007de71c 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c @@ -29,6 +29,7 @@ #include "amdgpu_ras_process.h" #define RAS_MGR_RETIRE_PAGE_INTERVAL 100 +#define RAS_EVENT_PROCESS_TIMEOUT 1200 static void ras_process_retire_page_dwork(struct work_struct *work) { @@ -57,6 +58,9 @@ int amdgpu_ras_process_init(struct amdgpu_device *adev) { struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + ras_mgr->is_paused = false; + init_completion(&ras_mgr->ras_event_done); + INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork); return 0; @@ -66,6 +70,7 @@ int amdgpu_ras_process_fini(struct amdgpu_device *adev) { struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + ras_mgr->is_paused = false; /* Save all cached bad pages to eeprom */ flush_delayed_work(&ras_mgr->retire_page_dwork); cancel_delayed_work_sync(&ras_mgr->retire_page_dwork); @@ -124,3 +129,62 @@ int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false); } + +int amdgpu_ras_process_begin(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (ras_mgr->is_paused) + return -EAGAIN; + + reinit_completion(&ras_mgr->ras_event_done); + return 0; +} + +int amdgpu_ras_process_end(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + complete(&ras_mgr->ras_event_done); + return 0; +} + +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + long rc; + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + if (!ras_mgr->ras_core->is_initialized) + return -EPERM; + + ras_mgr->is_paused = true; + + /* Wait for RAS event processing to complete */ + rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done, + msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); + if (rc <= 0) + RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n", + rc ? "interrupted" : "timeout"); + + flush_delayed_work(&ras_mgr->retire_page_dwork); + return 0; +} + +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + if (!ras_mgr->ras_core->is_initialized) + return -EPERM; + + ras_mgr->is_paused = false; + + schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h index b9502bd21bebe..d55cdaeac4410 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h @@ -34,4 +34,8 @@ int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data); int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data); +int amdgpu_ras_process_begin(struct amdgpu_device *adev); +int amdgpu_ras_process_end(struct amdgpu_device *adev); +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev); +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c index f21cd55a25be9..45ed8c3b5563b 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c @@ -142,6 +142,12 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core, case RAS_EVENT_ID__RESET_GPU: ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data); break; + case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN: + ret = amdgpu_ras_process_begin(ras_core->dev); + break; + case RAS_EVENT_ID__RAS_EVENT_PROC_END: + ret = amdgpu_ras_process_end(ras_core->dev); + break; default: RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id); break; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h index fa224b36e3f2d..3396b2e0949df 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h @@ -115,6 +115,8 @@ enum ras_notify_event { RAS_EVENT_ID__FATAL_ERROR_DETECTED, RAS_EVENT_ID__RESET_GPU, RAS_EVENT_ID__RESET_VF, + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, + RAS_EVENT_ID__RAS_EVENT_PROC_END, }; enum ras_gpu_status { diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c b/drivers/gpu/drm/amd/ras/rascore/ras_process.c index 02f0657f78a39..3267dcdb169cd 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_process.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c @@ -162,6 +162,11 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core) uint32_t umc_event_count; int ret; + ret = ras_core_event_notify(ras_core, + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL); + if (ret) + return ret; + ras_aca_clear_fatal_flag(ras_core); ras_umc_log_pending_bad_bank(ras_core); @@ -185,6 +190,8 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core) atomic_set(&ras_proc->umc_interrupt_count, 0); } + ras_core_event_notify(ras_core, + RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL); return ret; }