res = __write_table_ras_info(control);
control->ras_num_recs = 0;
+ control->ras_num_bad_pages = 0;
control->ras_fri = 0;
- amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
+ amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
if (amdgpu_bad_page_threshold == -1) {
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
- con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+ con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
dev_warn(adev->dev,
"But GPU can be operated due to bad_page_threshold = -1.\n");
return false;
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
+ struct amdgpu_device *adev = to_amdgpu_device(control);
u32 a, b, i;
u8 *buf, *pp;
int res;
control->ras_num_recs = 1 + (control->ras_max_record_count + b
- control->ras_fri)
% control->ras_max_record_count;
+
+ if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
+ control->ras_num_bad_pages = control->ras_num_recs;
+ else
+ control->ras_num_bad_pages =
+ control->ras_num_recs * adev->umc.retire_unit;
Out:
kfree(buf);
return res;
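/* Mark the table header as bad when amdgpu_bad_page_threshold is
 * non-zero and the number of saved bad pages has reached
 * ras->bad_page_cnt_threshold.
 */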
if (amdgpu_bad_page_threshold != 0 &&
- control->ras_num_recs >= ras->bad_page_cnt_threshold) {
+ control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
dev_warn(adev->dev,
"Saved bad pages %d reaches threshold value %d\n",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
*/
if (amdgpu_bad_page_threshold != 0 &&
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
- control->ras_num_recs < ras->bad_page_cnt_threshold)
+ control->ras_num_bad_pages < ras->bad_page_cnt_threshold)
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
- control->ras_num_recs) * 100) /
+ control->ras_num_bad_pages) * 100) /
ras->bad_page_cnt_threshold;
/* Recalc the checksum.
if (!__get_eeprom_i2c_addr(adev, control))
return -EINVAL;
+ if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
+ control->ras_num_bad_pages = control->ras_num_recs;
+ else
+ control->ras_num_bad_pages =
+ control->ras_num_recs * adev->umc.retire_unit;
+
if (hdr->header == RAS_TABLE_HDR_VAL) {
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
- control->ras_num_recs);
+ control->ras_num_bad_pages);
if (hdr->version == RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
/* Warn if we are at 90% of the threshold or above
*/
- if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
+ if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
- control->ras_num_recs,
+ control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
res);
return -EINVAL;
}
- if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
+ if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
/* This means that, the threshold was increased since
* the last time the system was booted, and now,
* ras->bad_page_cnt_threshold - control->ras_num_bad_pages > 0,
dev_info(adev->dev,
"records:%d threshold:%d, resetting "
"RAS table header signature",
- control->ras_num_recs,
+ control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
if (amdgpu_bad_page_threshold == -1) {
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
res = 0;
dev_err(adev->dev,
"RAS records:%d exceed threshold:%d, "
"GPU will not be initialized. Replace this GPU or increase the threshold",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
}
}
} else {