From: Xiang Liu Date: Tue, 11 Feb 2025 11:45:52 +0000 (+0800) Subject: drm/amdgpu: Generate bad page threshold cper records X-Git-Tag: v6.15-rc1~120^2~17^2~25 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f9d35b945c599e8dbed17f484e82b4ad3d21721a;p=thirdparty%2Fkernel%2Flinux.git drm/amdgpu: Generate bad page threshold cper records Generate CPER record when bad page threshold exceed and commit to CPER ring. v2: return -ENOMEM instead of false v2: check return value of fill section function Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c index 26e0655e7ed49..8805381e19b99 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c @@ -207,7 +207,7 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false, - CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN, + CPER_SEV_NUM, RUNTIME, NONSTD_SEC_LEN, NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); section->hdr.valid_bits.err_info_cnt = 1; @@ -308,6 +308,28 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev, return 0; } +int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev) +{ + struct cper_hdr *bp_threshold = NULL; + struct amdgpu_ring *ring = &adev->cper.ring_buf; + int ret; + + bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1); + if (!bp_threshold) { + dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n"); + return -ENOMEM; + } + + amdgpu_cper_entry_fill_hdr(adev, bp_threshold, AMDGPU_CPER_TYPE_BP_THRESHOLD, CPER_SEV_NUM); + ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0); + if (ret) + return ret; + + amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length); + + return 0; +} + static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev, enum aca_error_type aca_err_type) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h index d35d1ddac7ccf..bcb97d245673b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h @@ -95,6 +95,8 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev, int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev, struct aca_banks *banks, uint16_t bank_count); +/* Bad page threshold is encoded into separated cper entry */ +int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev); void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count); int amdgpu_cper_init(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 7a22aef6e59c3..faae9bf48aa4f 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -716,6 +716,9 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev) ret = smu_send_rma_reason(smu); mutex_unlock(&adev->pm.mutex); + if (amdgpu_cper_generate_bp_threshold_record(adev)) + dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); + return ret; }