git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amdgpu: fix gpu page fault after hibernation on PF passthrough
author: Samuel Zhang <guoqing.zhang@amd.com>
Wed, 5 Nov 2025 03:04:08 +0000 (03:04 +0000)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 1 Dec 2025 10:43:38 +0000 (11:43 +0100)
[ Upstream commit eb6e7f520d6efa4d4ebf1671455abe4a681f7a05 ]

In a PF passthrough environment, running coralgemm after a hibernate and
resume cycle causes a GPU page fault.

A mode1 reset happens during hibernation, but the partition mode is not
restored on resume, so the registers mmCP_HYP_XCP_CTL and mmCP_PSP_XCP_CTL
hold incorrect values after resume. When the CP accesses the MQD BO, the
wrong stride size is used, which causes an out-of-bounds access on the
MQD BO and results in a page fault.

The fix is to ensure gfx_v9_4_3_switch_compute_partition() is called
when resuming from hibernation.
KFD resume is called separately during a reset recovery or a resume from
suspend sequence. Hence it does not need to be called as part of the
partition switch.

Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 5d1b32cfe4a676fe552416cb5ae847b215463a1a)
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c

index ccfd2a4b4acc850c6f5a12ce9b334c1a7559e36e..9c89e234c7869a5c06bac0623b1bc1ddbb83ccb6 100644 (file)
@@ -555,7 +555,8 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr,
                return -EINVAL;
        }
 
-       if (adev->kfd.init_complete && !amdgpu_in_reset(adev))
+       if (adev->kfd.init_complete && !amdgpu_in_reset(adev) &&
+               !adev->in_suspend)
                flags |= AMDGPU_XCP_OPS_KFD;
 
        if (flags & AMDGPU_XCP_OPS_KFD) {
index f27ccb8f3c8c573635aabe2671f584078d1c61b1..26c2d8d9e2463d24c11928fede885f5bb8178437 100644 (file)
@@ -2297,7 +2297,9 @@ static int gfx_v9_4_3_cp_resume(struct amdgpu_device *adev)
                r = amdgpu_xcp_init(adev->xcp_mgr, num_xcp, mode);
 
        } else {
-               if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
+               if (adev->in_suspend)
+                       amdgpu_xcp_restore_partition_mode(adev->xcp_mgr);
+               else if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
                                                    AMDGPU_XCP_FL_NONE) ==
                    AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
                        r = amdgpu_xcp_switch_partition_mode(