]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdgpu: fix gpu page fault after hibernation on PF passthrough
author: Samuel Zhang <guoqing.zhang@amd.com>
Wed, 5 Nov 2025 03:04:08 +0000 (03:04 +0000)
committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 6 Nov 2025 14:57:11 +0000 (09:57 -0500)
In a PF passthrough environment, running coralgemm after a hibernate and
resume cycle causes a GPU page fault.

A mode1 reset happens during hibernation, but the partition mode is not
restored on resume, so the registers mmCP_HYP_XCP_CTL and mmCP_PSP_XCP_CTL
hold incorrect values after resume. When the CP accesses the MQD BO, the
wrong stride size is used; this causes an out-of-bounds access on the MQD BO,
resulting in a page fault.

The fix is to ensure gfx_v9_4_3_switch_compute_partition() is called
when resuming from hibernation.
KFD resume is called separately during a reset recovery or a
resume-from-suspend sequence. Hence it does not need to be called as part
of the partition switch.

Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c

index 811124ff88a88446e498ed83b08f30c3243ebf24..f9e2edf5260bc5f5783db68115c53c88af24be25 100644 (file)
@@ -407,7 +407,8 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr,
                return -EINVAL;
        }
 
-       if (adev->kfd.init_complete && !amdgpu_in_reset(adev))
+       if (adev->kfd.init_complete && !amdgpu_in_reset(adev) &&
+               !adev->in_suspend)
                flags |= AMDGPU_XCP_OPS_KFD;
 
        if (flags & AMDGPU_XCP_OPS_KFD) {
index c4c551ef6b874da8baa0495d0d66936d45778aba..cbb74ffc479257cba532dad925e71aa09d018aaa 100644 (file)
@@ -2291,7 +2291,9 @@ static int gfx_v9_4_3_cp_resume(struct amdgpu_device *adev)
                r = amdgpu_xcp_init(adev->xcp_mgr, num_xcp, mode);
 
        } else {
-               if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
+               if (adev->in_suspend)
+                       amdgpu_xcp_restore_partition_mode(adev->xcp_mgr);
+               else if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
                                                    AMDGPU_XCP_FL_NONE) ==
                    AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
                        r = amdgpu_xcp_switch_partition_mode(