git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: Fix CPER error handling on VFs
author: Victor Skvortsov <Victor.Skvortsov@amd.com>
Sun, 30 Mar 2025 18:54:06 +0000 (13:54 -0500)
committer: Alex Deucher <alexander.deucher@amd.com>
Mon, 7 Apr 2025 22:00:40 +0000 (18:00 -0400)
CPER read will loop infinitely if an error is encountered and
the more bit is set. Add error checks to break upon failure.

v2: added function pointer checks

Suggested-by: Tony Yi <Tony.Yi@amd.com>
Signed-off-by: Victor Skvortsov <Victor.Skvortsov@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

index 0bb8cbe0dcc05fda2e6c73500339eeccfbb28a62..83f3334b39312db45ce3c66d9bbbc916c21a14c4 100644 (file)
@@ -1323,6 +1323,9 @@ static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bo
 {
        struct amdgpu_virt *virt = &adev->virt;
 
+       if (!virt->ops || !virt->ops->req_ras_err_count)
+               return -EOPNOTSUPP;
+
        /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host
         * will ignore incoming guest messages. Ratelimit the guest messages to
         * prevent guest self DOS.
@@ -1378,14 +1381,16 @@ amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev,
        used_size = host_telemetry->header.used_size;
 
        if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
-               return 0;
+               return -EINVAL;
 
        cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL);
        if (!cper_dump)
                return -ENOMEM;
 
-       if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0))
+       if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) {
+               ret = -EINVAL;
                goto out;
+       }
 
        *more = cper_dump->more;
 
@@ -1425,7 +1430,7 @@ static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev)
        int ret = 0;
        uint32_t more = 0;
 
-       if (!amdgpu_sriov_ras_cper_en(adev))
+       if (!virt->ops || !virt->ops->req_ras_cper_dump)
                return -EOPNOTSUPP;
 
        do {
@@ -1434,7 +1439,7 @@ static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev)
                                adev, virt->fw_reserve.ras_telemetry, &more);
                else
                        ret = 0;
-       } while (more);
+       } while (more && !ret);
 
        return ret;
 }
@@ -1444,6 +1449,9 @@ int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update)
        struct amdgpu_virt *virt = &adev->virt;
        int ret = 0;
 
+       if (!amdgpu_sriov_ras_cper_en(adev))
+               return -EOPNOTSUPP;
+
        if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) &&
            down_read_trylock(&adev->reset_domain->sem)) {
                mutex_lock(&virt->ras.ras_telemetry_mutex);