Add a new type to distinguish between the ACA error type and the SMU MCA type.
For example:
ACA_ERROR_TYPE_DEFERRED does not match any valid SMU MCA bank
channel, so add a new type, 'aca_smu_type', to distinguish the ACA error type
from the SMU MCA type.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}
-typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data);
+typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
struct aca_banks {
int nr_banks;
}
}
-static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count)
+static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
{
struct amdgpu_aca *aca = &adev->aca;
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
}
-static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type,
+static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
int start, int count,
struct aca_banks *banks)
{
return -EOPNOTSUPP;
switch (type) {
- case ACA_ERROR_TYPE_UE:
+ case ACA_SMU_TYPE_UE:
max_count = smu_funcs->max_ue_bank_count;
break;
- case ACA_ERROR_TYPE_CE:
+ case ACA_SMU_TYPE_CE:
max_count = smu_funcs->max_ce_bank_count;
break;
- case ACA_ERROR_TYPE_DEFERRED:
default:
return -EINVAL;
}
if (ret)
return ret;
+ bank.type = type;
+
aca_smu_bank_dump(adev, i, count, &bank);
ret = aca_banks_add_bank(banks, &bank);
return hwip->hwid == hwid && hwip->mcatype == mcatype;
}
-static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type)
+static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
const struct aca_bank_ops *bank_ops = handle->bank_ops;
}
static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_error_type type, struct aca_bank_report *report)
+ enum aca_smu_type type, struct aca_bank_report *report)
{
const struct aca_bank_ops *bank_ops = handle->bank_ops;
}
static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_error_type type, void *data)
+ enum aca_smu_type smu_type, void *data)
{
struct aca_bank_report report;
+ enum aca_error_type type;
int ret;
- ret = aca_generate_bank_report(handle, bank, type, &report);
+ switch (smu_type) {
+ case ACA_SMU_TYPE_UE:
+ type = ACA_ERROR_TYPE_UE;
+ break;
+ case ACA_SMU_TYPE_CE:
+ type = ACA_ERROR_TYPE_CE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ret = aca_generate_bank_report(handle, bank, smu_type, &report);
if (ret)
return ret;
}
static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
- enum aca_error_type type, bank_handler_t handler, void *data)
+ enum aca_smu_type type, bank_handler_t handler, void *data)
{
struct aca_handle *handle;
int ret;
}
static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
- enum aca_error_type type, bank_handler_t handler, void *data)
+ enum aca_smu_type type, bank_handler_t handler, void *data)
{
struct aca_bank_node *node;
struct aca_bank *bank;
return 0;
}
-static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type,
+static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
bank_handler_t handler, void *data)
{
struct amdgpu_aca *aca = &adev->aca;
if (list_empty(&aca->mgr.list))
return 0;
- /* NOTE: pmfw is only support UE and CE */
- if (type == ACA_ERROR_TYPE_DEFERRED)
- type = ACA_ERROR_TYPE_CE;
-
ret = aca_smu_get_valid_aca_count(adev, type, &count);
if (ret)
return ret;
static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
struct ras_err_data *err_data)
{
+ enum aca_smu_type smu_type;
int ret;
+ switch (type) {
+ case ACA_ERROR_TYPE_UE:
+ smu_type = ACA_SMU_TYPE_UE;
+ break;
+ case ACA_ERROR_TYPE_CE:
+ smu_type = ACA_SMU_TYPE_CE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
/* udpate aca bank to aca source error_cache first */
- ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL);
+ ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, NULL);
if (ret)
return ret;
return 0;
}
-static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx)
+static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
{
struct aca_bank_info info;
int i, ret;
if (ret)
return;
- seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_ERROR_TYPE_UE ? "UE" : "CE");
+ seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE");
seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
idx, info.socket_id, info.die_id, info.hwid, info.mcatype);
};
static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_error_type type, void *data)
+ enum aca_smu_type type, void *data)
{
struct aca_dump_context *ctx = (struct aca_dump_context *)data;
return handler_aca_log_bank_error(handle, bank, type, NULL);
}
-static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
+static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
{
struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
struct aca_dump_context context = {
static int aca_dump_ce_show(struct seq_file *m, void *unused)
{
- return aca_dump_show(m, ACA_ERROR_TYPE_CE);
+ return aca_dump_show(m, ACA_SMU_TYPE_CE);
}
static int aca_dump_ce_open(struct inode *inode, struct file *file)
static int aca_dump_ue_show(struct seq_file *m, void *unused)
{
- return aca_dump_show(m, ACA_ERROR_TYPE_UE);
+ return aca_dump_show(m, ACA_SMU_TYPE_UE);
}
static int aca_dump_ue_open(struct inode *inode, struct file *file)
ACA_ERROR_TYPE_COUNT
};
+enum aca_smu_type {
+ ACA_SMU_TYPE_UE = 0,
+ ACA_SMU_TYPE_CE,
+ ACA_SMU_TYPE_COUNT,
+};
+
struct aca_bank {
+ enum aca_smu_type type;
u64 regs[ACA_MAX_REGS_COUNT];
};
};
struct aca_bank_ops {
- int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
+ int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
struct aca_bank_report *report, void *data);
- bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
+ bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
void *data);
};
int max_ue_bank_count;
int max_ce_bank_count;
int (*set_debug_mode)(struct amdgpu_device *adev, bool enable);
- int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count);
- int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank);
+ int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count);
+ int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank);
};
struct amdgpu_aca {
return 0;
}
-static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
+static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
struct aca_bank_report *report, void *data)
{
struct amdgpu_device *adev = handle->adev;
const char *error_str;
- u64 status;
+ u64 status, count;
int ret, ext_error_code;
ret = aca_bank_info_decode(bank, &report->info);
if (error_str)
dev_info(adev->dev, "%s detected\n", error_str);
- if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
- (type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
- report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+ count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+ switch (type) {
+ case ACA_SMU_TYPE_UE:
+ report->count[ACA_ERROR_TYPE_UE] = ext_error_code == 0 ? count : 0ULL;
+ break;
+ case ACA_SMU_TYPE_CE:
+ report->count[ACA_ERROR_TYPE_CE] = ext_error_code == 6 ? count : 0ULL;
+ break;
+ default:
+ return -EINVAL;
+ }
return 0;
}
};
static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle,
- struct aca_bank *bank, enum aca_error_type type,
+ struct aca_bank *bank, enum aca_smu_type type,
struct aca_bank_report *report, void *data)
{
- u64 status, misc0;
+ u64 misc0;
u32 instlo;
int ret;
- status = bank->regs[ACA_REG_IDX_STATUS];
- if ((type == ACA_ERROR_TYPE_UE &&
- ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
- (type == ACA_ERROR_TYPE_CE &&
- ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
+ ret = aca_bank_info_decode(bank, &report->info);
+ if (ret)
+ return ret;
- ret = aca_bank_info_decode(bank, &report->info);
- if (ret)
- return ret;
+ /* NOTE: overwrite info.die_id with xcd id for gfx */
+ instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
+ instlo &= GENMASK(31, 1);
+ report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
- /* NOTE: overwrite info.die_id with xcd id for gfx */
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
- report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
+ misc0 = bank->regs[ACA_REG_IDX_MISC0];
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
+ switch (type) {
+ case ACA_SMU_TYPE_UE:
+ report->count[ACA_ERROR_TYPE_UE] = 1ULL;
+ break;
+ case ACA_SMU_TYPE_CE:
+ report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0);
+ break;
+ default:
+ return -EINVAL;
}
return 0;
}
static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_error_type type, void *data)
+ enum aca_smu_type type, void *data)
{
u32 instlo;
};
static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle,
- struct aca_bank *bank, enum aca_error_type type,
+ struct aca_bank *bank, enum aca_smu_type type,
struct aca_bank_report *report, void *data)
{
- u64 status, misc0;
+ u64 misc0;
int ret;
- status = bank->regs[ACA_REG_IDX_STATUS];
- if ((type == ACA_ERROR_TYPE_UE &&
- ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
- (type == ACA_ERROR_TYPE_CE &&
- ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
-
- ret = aca_bank_info_decode(bank, &report->info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
+ ret = aca_bank_info_decode(bank, &report->info);
+ if (ret)
+ return ret;
+
+ misc0 = bank->regs[ACA_REG_IDX_MISC0];
+
+ switch (type) {
+ case ACA_SMU_TYPE_UE:
+ report->count[ACA_ERROR_TYPE_UE] = 1ULL;
+ break;
+ case ACA_SMU_TYPE_CE:
+ report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0);
+ break;
+ default:
+ return -EINVAL;
}
return 0;
};
static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_error_type type, void *data)
+ enum aca_smu_type type, void *data)
{
u32 instlo;
};
static int sdma_v4_4_2_aca_bank_generate_report(struct aca_handle *handle,
- struct aca_bank *bank, enum aca_error_type type,
+ struct aca_bank *bank, enum aca_smu_type type,
struct aca_bank_report *report, void *data)
{
- u64 status, misc0;
+ u64 misc0;
int ret;
- status = bank->regs[ACA_REG_IDX_STATUS];
- if ((type == ACA_ERROR_TYPE_UE &&
- ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
- (type == ACA_ERROR_TYPE_CE &&
- ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
+ ret = aca_bank_info_decode(bank, &report->info);
+ if (ret)
+ return ret;
- ret = aca_bank_info_decode(bank, &report->info);
- if (ret)
- return ret;
+ misc0 = bank->regs[ACA_REG_IDX_MISC0];
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
+ switch (type) {
+ case ACA_SMU_TYPE_UE:
+ report->count[ACA_ERROR_TYPE_UE] = 1ULL;
+ break;
+ case ACA_SMU_TYPE_CE:
+ report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0);
+ break;
+ default:
+ return -EINVAL;
}
return 0;
static int sdma_v4_4_2_err_codes[] = { 33, 34, 35, 36 };
static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_error_type type, void *data)
+ enum aca_smu_type type, void *data)
{
u32 instlo;
.query_ras_error_address = umc_v12_0_query_ras_error_address,
};
-static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
+static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
struct aca_bank_report *report, void *data)
{
struct amdgpu_device *adev = handle->adev;
status = bank->regs[ACA_REG_IDX_STATUS];
switch (type) {
- case ACA_ERROR_TYPE_UE:
+ case ACA_SMU_TYPE_UE:
if (umc_v12_0_is_uncorrectable_error(adev, status)) {
- report->count[type] = 1;
+ report->count[ACA_ERROR_TYPE_UE] = 1;
}
break;
- case ACA_ERROR_TYPE_CE:
+ case ACA_SMU_TYPE_CE:
if (umc_v12_0_is_correctable_error(adev, status)) {
- report->count[type] = 1;
+ report->count[ACA_ERROR_TYPE_CE] = 1;
}
break;
default:
return smu_v13_0_6_mca_set_debug_mode(smu, enable);
}
-static int smu_v13_0_6_get_valid_aca_count(struct smu_context *smu, enum aca_error_type type, u32 *count)
+static int smu_v13_0_6_get_valid_aca_count(struct smu_context *smu, enum aca_smu_type type, u32 *count)
{
uint32_t msg;
int ret;
return -EINVAL;
switch (type) {
- case ACA_ERROR_TYPE_UE:
+ case ACA_SMU_TYPE_UE:
msg = SMU_MSG_QueryValidMcaCount;
break;
- case ACA_ERROR_TYPE_CE:
+ case ACA_SMU_TYPE_CE:
msg = SMU_MSG_QueryValidMcaCeCount;
break;
default:
}
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev,
- enum aca_error_type type, u32 *count)
+ enum aca_smu_type type, u32 *count)
{
struct smu_context *smu = adev->powerplay.pp_handle;
int ret;
switch (type) {
- case ACA_ERROR_TYPE_UE:
- case ACA_ERROR_TYPE_CE:
+ case ACA_SMU_TYPE_UE:
+ case ACA_SMU_TYPE_CE:
ret = smu_v13_0_6_get_valid_aca_count(smu, type, count);
break;
default:
return ret;
}
-static int __smu_v13_0_6_aca_bank_dump(struct smu_context *smu, enum aca_error_type type,
+static int __smu_v13_0_6_aca_bank_dump(struct smu_context *smu, enum aca_smu_type type,
int idx, int offset, u32 *val)
{
uint32_t msg, param;
switch (type) {
- case ACA_ERROR_TYPE_UE:
+ case ACA_SMU_TYPE_UE:
msg = SMU_MSG_McaBankDumpDW;
break;
- case ACA_ERROR_TYPE_CE:
+ case ACA_SMU_TYPE_CE:
msg = SMU_MSG_McaBankCeDumpDW;
break;
default:
return smu_cmn_send_smc_msg_with_param(smu, msg, param, (uint32_t *)val);
}
-static int smu_v13_0_6_aca_bank_dump(struct smu_context *smu, enum aca_error_type type,
+static int smu_v13_0_6_aca_bank_dump(struct smu_context *smu, enum aca_smu_type type,
int idx, int offset, u32 *val, int count)
{
int ret, i;
return 0;
}
-static int aca_bank_read_reg(struct amdgpu_device *adev, enum aca_error_type type,
+static int aca_bank_read_reg(struct amdgpu_device *adev, enum aca_smu_type type,
int idx, int reg_idx, u64 *val)
{
struct smu_context *smu = adev->powerplay.pp_handle;
*val = (u64)data[1] << 32 | data[0];
dev_dbg(adev->dev, "mca read bank reg: type:%s, index: %d, reg_idx: %d, val: 0x%016llx\n",
- type == ACA_ERROR_TYPE_UE ? "UE" : "CE", idx, reg_idx, *val);
+ type == ACA_SMU_TYPE_UE ? "UE" : "CE", idx, reg_idx, *val);
return 0;
}
static int aca_smu_get_valid_aca_bank(struct amdgpu_device *adev,
- enum aca_error_type type, int idx, struct aca_bank *bank)
+ enum aca_smu_type type, int idx, struct aca_bank *bank)
{
int i, ret, count;