From: Tao Zhou Date: Wed, 29 May 2024 07:39:41 +0000 (+0800) Subject: drm/amdgpu: create amdgpu_ras_in_recovery to simplify code X-Git-Tag: v6.11-rc1~141^2~18^2~56 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7e4371676e5e58739ffc884b1b5d6bbf1cce3d17;p=thirdparty%2Fkernel%2Flinux.git drm/amdgpu: create amdgpu_ras_in_recovery to simplify code Reduce redundant code and user doesn't need to pay attention to RAS details. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7df5544ac9839..3fb02f5b91c96 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6276,20 +6276,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct amdgpu_reset_context reset_context; u32 memsize; struct list_head device_list; - struct amdgpu_hive_info *hive; - int hive_ras_recovery = 0; - struct amdgpu_ras *ras; /* PCI error slot reset should be skipped During RAS recovery */ - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - ras = amdgpu_ras_get_context(adev); if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) return PCI_ERS_RESULT_RECOVERED; DRM_INFO("PCI error: slot reset callback!!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 19b1817b55d71..82452606ae6ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -506,9 +506,6 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) { struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; struct amdgpu_ring *kiq_ring = &kiq->ring; - struct amdgpu_hive_info *hive; - struct amdgpu_ras *ras; - int hive_ras_recovery = 0; int i, r = 0; int j; @@ -533,16 +530,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) * This is workaround: only skip kiq_ring test * during ras recovery in suspend stage for gfx9.4.3 */ - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - - ras = amdgpu_ras_get_context(adev); if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) { + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) { spin_unlock(&kiq->ring_lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 671bbb19995ae..02d9ef988e81a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1409,11 +1409,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; - struct amdgpu_hive_info *hive; - int hive_ras_recovery = 0; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1425,15 +1422,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_aca_debug_mode(adev)) return -EOPNOTSUPP; - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - /* skip ras error reset in gpu reset */ - if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || - hive_ras_recovery) && + if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode))) return -EOPNOTSUPP; @@ -2461,6 +2451,23 @@ static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev, } } +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) +{ + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int hive_ras_recovery = 0; + + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + + if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + return true; + + return false; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -2821,7 +2828,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) struct ras_err_data err_data; unsigned long err_cnt; - if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) + if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) return; amdgpu_ras_error_data_init(&err_data); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e70c45712ddb4..83437fef9df5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -954,6 +954,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint16_t pasid, pasid_notify pasid_fn, void *data, uint32_t reset); +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); + __printf(3, 4) void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, const char *fmt, ...); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 825786fc849e1..04533f99f1e33 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1863,7 +1863,6 @@ static int aldebaran_mode1_reset(struct smu_context *smu) u32 fatal_err, param; int ret = 0; struct amdgpu_device *adev = smu->adev; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); fatal_err = 0; param = SMU_RESET_MODE_1; @@ -1876,8 +1875,8 @@ static int aldebaran_mode1_reset(struct smu_context *smu) } else { /* fatal error triggered by ras, PMFW supports the flag from 68.44.0 */ - if ((smu->smc_fw_version >= 0x00442c00) && ras && - atomic_read(&ras->in_recovery)) + if ((smu->smc_fw_version >= 0x00442c00) && + amdgpu_ras_in_recovery(adev)) fatal_err = 1; param |= (fatal_err << 16); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 6c24e23063834..ce9b77074cb11 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2786,10 +2786,9 @@ static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu, uint32_t *param) { struct amdgpu_device *adev = smu->adev; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if ((smu->smc_fw_version >= supported_version) && - ras && atomic_read(&ras->in_recovery)) + amdgpu_ras_in_recovery(adev)) /* Set RAS fatal error reset flag */ *param = 1 << 16; else diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index ffe3528e92cb3..6790095a2fa77 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2574,24 +2574,14 @@ failed: static int smu_v13_0_6_mode1_reset(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - struct amdgpu_hive_info *hive = NULL; - u32 hive_ras_recovery = 0; - struct amdgpu_ras *ras; u32 fatal_err, param; int ret = 0; - hive = amdgpu_get_xgmi_hive(adev); - ras = amdgpu_ras_get_context(adev); fatal_err = 0; param = SMU_RESET_MODE_1; - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - /* fatal error triggered by ras, PMFW supports the flag */ - if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + if (amdgpu_ras_in_recovery(adev)) fatal_err = 1; param |= (fatal_err << 16);