From: Yunxiang Li
Date: Wed, 27 May 2026 18:06:00 +0000 (-0400)
Subject: drm/amdgpu/ras: add ras_suspend callback and use it for cp_ecc_error_irq
X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=e3829992dd9fa0a82511af4f01733fc854cd15a5;p=thirdparty%2Flinux.git

drm/amdgpu/ras: add ras_suspend callback and use it for cp_ecc_error_irq

cp_ecc_error_irq is acquired in amdgpu_gfx_ras_late_init() but released
in gfx_v9_0_hw_fini(), so the put site has to query amdgpu_irq_enabled()
because the get is skipped on SR-IOV VF.

ras_late_init / ras_fini have no suspend counterpart, so move the put to
amdgpu_gfx_ras_suspend() / amdgpu_gfx_ras_fini() and add a matching
ras_suspend callback that is invoked from amdgpu_ras_suspend() before
disable_all_features(). The get and put now sit in the same place and
check the same condition (not VF, funcs registered), no refcount
querying needed.

An active flag gates ras_fini so the suspend-then-unload-without-resume
path falls into amdgpu_ras_block_late_fini_default() instead of
double-releasing what ras_suspend already cleaned up.

Drop the cp_ecc_error_irq put from gfx_v9_0_hw_fini(). gfx_v8_0 manages
cp_ecc_error_irq locally and is unaffected; no other GFX generation has
this IRQ.

Signed-off-by: Yunxiang Li Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 515cc4a2aeb4..1e190fb54a97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -990,10 +990,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r if (r) return r; - if (amdgpu_sriov_vf(adev)) - return r; - - if (adev->gfx.cp_ecc_error_irq.funcs) { + if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) { r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); if (r) goto late_fini; @@ -1008,6 +1005,21 @@ late_fini: return r; } +void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) + amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); +} + +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) + amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); + amdgpu_ras_block_late_fini(adev, ras_block); +} + int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev) { int err = 0; @@ -1036,6 +1048,12 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev) if (!ras->ras_block.ras_late_init) ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init; + if (!ras->ras_block.ras_suspend) + ras->ras_block.ras_suspend = amdgpu_gfx_ras_suspend; + + if (!ras->ras_block.ras_fini) + ras->ras_block.ras_fini = amdgpu_gfx_ras_fini; + /* If not defined special ras_cb function, use default ras_cb */ if (!ras->ras_block.ras_cb) ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 77050f9884f2..54c1eb9c499b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -615,7 
+615,8 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable); void amdgpu_gfx_off_ctrl_immediate(struct amdgpu_device *adev, bool enable); int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value); int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_gfx_ras_fini(struct amdgpu_device *adev); +void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev, struct ras_common_if *ras_block); +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u64 *value); int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32 *residency); int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool value); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 093c0bf760c1..764cd4950408 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -95,6 +95,9 @@ struct amdgpu_ras_block_list { struct list_head node; struct amdgpu_ras_block_object *ras_obj; + + /* set by ras_late_init, cleared by ras_suspend/ras_fini */ + bool active; }; const char *get_ras_block_str(struct ras_common_if *ras_block) @@ -4637,10 +4640,23 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) void amdgpu_ras_suspend(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_block_list *node; + struct amdgpu_ras_block_object *obj; if (!adev->ras_enabled || !con) return; + /* run per-block ras_suspend before tearing down the RAS context */ + list_for_each_entry(node, &adev->ras_list, node) { + if (!node->active) + continue; + + obj = node->ras_obj; + if (obj && obj->ras_suspend) + obj->ras_suspend(adev, &obj->ras_comm); + node->active = false; + } + amdgpu_ras_disable_all_features(adev, 0); /* Make sure all ras objects are disabled. 
*/ if (AMDGPU_RAS_GET_FEATURES(con->features)) @@ -4694,8 +4710,15 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) obj->ras_comm.name, r); return r; } - } else - amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); + } else { + r = amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); + if (r) { + dev_err(adev->dev, "%s failed to execute ras_block_late_init_default! ret:%d\n", + obj->ras_comm.name, r); + return r; + } + } + node->active = true; } amdgpu_ras_check_bad_page_status(adev); @@ -4734,11 +4757,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { if (ras_node->ras_obj) { obj = ras_node->ras_obj; - if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && - obj->ras_fini) + /* fall back to default cleanup if ras_suspend already ran */ + if (ras_node->active && obj->ras_fini) obj->ras_fini(adev, &obj->ras_comm); else amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); + ras_node->active = false; } /* Clear ras blocks from ras_list and free ras block list node */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ff44190d7d98..a86ab65aa2f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -762,6 +762,7 @@ struct amdgpu_ras_block_object { int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block); + void (*ras_suspend)(struct amdgpu_device *adev, struct ras_common_if *ras_block); void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block); ras_ih_cb ras_cb; const struct amdgpu_ras_block_hw_ops *hw_ops; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index bf270e605949..60376d43e81d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4057,8 +4057,6 @@ static int gfx_v9_0_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) - amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);