]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu/ras: add ras_suspend callback and use it for cp_ecc_error_irq
author: Yunxiang Li <Yunxiang.Li@amd.com>
Wed, 27 May 2026 18:06:00 +0000 (14:06 -0400)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 3 Jun 2026 17:59:29 +0000 (13:59 -0400)
cp_ecc_error_irq is acquired in amdgpu_gfx_ras_late_init() but
released in gfx_v9_0_hw_fini(), so the put site has to query
amdgpu_irq_enabled() because the get is skipped on SR-IOV VF.

ras_late_init / ras_fini have no suspend counterpart, so move the
put to amdgpu_gfx_ras_suspend() / amdgpu_gfx_ras_fini() and add a
matching ras_suspend callback that is invoked from
amdgpu_ras_suspend() before amdgpu_ras_disable_all_features().  The get
and put now sit in the same place and check the same condition (not
VF, funcs registered), so no refcount querying is needed.

An active flag gates ras_fini so the
suspend-then-unload-without-resume path falls into
amdgpu_ras_block_late_fini_default() instead of double-releasing
what ras_suspend already cleaned up.

Drop the cp_ecc_error_irq put from gfx_v9_0_hw_fini().  gfx_v8_0
manages cp_ecc_error_irq locally and is unaffected; no other GFX
generation has this IRQ.

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index 515cc4a2aeb4d24d91cf27114c53ea1f703ceb5e..1e190fb54a9776f3f963015eaf76db2f61d256a3 100644 (file)
@@ -990,10 +990,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
                if (r)
                        return r;
 
-               if (amdgpu_sriov_vf(adev))
-                       return r;
-
-               if (adev->gfx.cp_ecc_error_irq.funcs) {
+               if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) {
                        r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
                        if (r)
                                goto late_fini;
@@ -1008,6 +1005,21 @@ late_fini:
        return r;
 }
 
+void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev,
+                           struct ras_common_if *ras_block)
+{
+       if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+}
+
+void amdgpu_gfx_ras_fini(struct amdgpu_device *adev,
+                        struct ras_common_if *ras_block)
+{
+       if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+       amdgpu_ras_block_late_fini(adev, ras_block);
+}
+
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
 {
        int err = 0;
@@ -1036,6 +1048,12 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
        if (!ras->ras_block.ras_late_init)
                ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init;
 
+       if (!ras->ras_block.ras_suspend)
+               ras->ras_block.ras_suspend = amdgpu_gfx_ras_suspend;
+
+       if (!ras->ras_block.ras_fini)
+               ras->ras_block.ras_fini = amdgpu_gfx_ras_fini;
+
        /* If not defined special ras_cb function, use default ras_cb */
        if (!ras->ras_block.ras_cb)
                ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb;
index 77050f9884f20173411f5686c298a511ffff4ed3..54c1eb9c499ba077239fa1e5bd06a0bbf68397a5 100644 (file)
@@ -615,7 +615,8 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable);
 void amdgpu_gfx_off_ctrl_immediate(struct amdgpu_device *adev, bool enable);
 int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value);
 int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-void amdgpu_gfx_ras_fini(struct amdgpu_device *adev);
+void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev, struct ras_common_if *ras_block);
+void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block);
 int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u64 *value);
 int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32 *residency);
 int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool value);
index 093c0bf760c161554c4f170ff7e097adc51e45c6..764cd49504083c077c7952b8408e293eebbaf190 100644 (file)
@@ -95,6 +95,9 @@ struct amdgpu_ras_block_list {
        struct list_head node;
 
        struct amdgpu_ras_block_object *ras_obj;
+
+       /* set by ras_late_init, cleared by ras_suspend/ras_fini */
+       bool active;
 };
 
 const char *get_ras_block_str(struct ras_common_if *ras_block)
@@ -4637,10 +4640,23 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
 void amdgpu_ras_suspend(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_block_list *node;
+       struct amdgpu_ras_block_object *obj;
 
        if (!adev->ras_enabled || !con)
                return;
 
+       /* run per-block ras_suspend before tearing down the RAS context */
+       list_for_each_entry(node, &adev->ras_list, node) {
+               if (!node->active)
+                       continue;
+
+               obj = node->ras_obj;
+               if (obj && obj->ras_suspend)
+                       obj->ras_suspend(adev, &obj->ras_comm);
+               node->active = false;
+       }
+
        amdgpu_ras_disable_all_features(adev, 0);
        /* Make sure all ras objects are disabled. */
        if (AMDGPU_RAS_GET_FEATURES(con->features))
@@ -4694,8 +4710,15 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
                                        obj->ras_comm.name, r);
                                return r;
                        }
-               } else
-                       amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
+               } else {
+                       r = amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
+                       if (r) {
+                               dev_err(adev->dev, "%s failed to execute ras_block_late_init_default! ret:%d\n",
+                                       obj->ras_comm.name, r);
+                               return r;
+                       }
+               }
+               node->active = true;
        }
 
        amdgpu_ras_check_bad_page_status(adev);
@@ -4734,11 +4757,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
        list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
                if (ras_node->ras_obj) {
                        obj = ras_node->ras_obj;
-                       if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
-                           obj->ras_fini)
+                       /* fall back to default cleanup if ras_suspend already ran */
+                       if (ras_node->active && obj->ras_fini)
                                obj->ras_fini(adev, &obj->ras_comm);
                        else
                                amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+                       ras_node->active = false;
                }
 
                /* Clear ras blocks from ras_list and free ras block list node */
index ff44190d7d98ee97a22d111acdb00fccebfbafaa..a86ab65aa2f07d6d249340c8b219c3fc48266fe5 100644 (file)
@@ -762,6 +762,7 @@ struct amdgpu_ras_block_object {
        int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
                                enum amdgpu_ras_block block, uint32_t sub_block_index);
        int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
+       void (*ras_suspend)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
        void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
        ras_ih_cb ras_cb;
        const struct amdgpu_ras_block_hw_ops *hw_ops;
index bf270e605949f12ed3fb6b7ff78f5467eb2ee287..60376d43e81d1b5a83ef81440fb47266c579f425 100644 (file)
@@ -4057,8 +4057,6 @@ static int gfx_v9_0_hw_fini(struct amdgpu_ip_block *ip_block)
 {
        struct amdgpu_device *adev = ip_block->adev;
 
-       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
-               amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);