/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v12_0.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"
#include "umc/umc_12_0_0_offset.h"
#include "umc/umc_12_0_0_sh_mask.h"

/*
 * Channel index lookup table:
 * [node][UMC instance][channel instance] -> global channel index.
 * Consumed through adev->umc.channel_idx_tbl when translating an MCA
 * error address to a SoC physical address (see
 * umc_v12_0_convert_error_address()).
 */
const uint32_t
	umc_v12_0_channel_idx_tbl[]
			[UMC_V12_0_UMC_INSTANCE_NUM]
			[UMC_V12_0_CHANNEL_INSTANCE_NUM] = {
	{{3,   7,   11,  15,  2,   6,   10,  14},  {1,   5,   9,   13,  0,   4,   8,   12},
	 {19,  23,  27,  31,  18,  22,  26,  30},  {17,  21,  25,  29,  16,  20,  24,  28}},
	{{47,  43,  39,  35,  46,  42,  38,  34},  {45,  41,  37,  33,  44,  40,  36,  32},
	 {63,  59,  55,  51,  62,  58,  54,  50},  {61,  57,  53,  49,  60,  56,  52,  48}},
	{{79,  75,  71,  67,  78,  74,  70,  66},  {77,  73,  69,  65,  76,  72,  68,  64},
	 {95,  91,  87,  83,  94,  90,  86,  82},  {93,  89,  85,  81,  92,  88,  84,  80}},
	{{99,  103, 107, 111, 98,  102, 106, 110}, {97,  101, 105, 109, 96,  100, 104, 108},
	 {115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}}
};

/* mapping of MCA error address to normalized address
 * (index = MCA address bit position, value = normalized address bit
 * position; index 0 is unused since the conversion loop starts at bit 1)
 * NOTE(review): value 24 appears at both index 19 and index 24 — looks
 * intentional per the address swizzle, but worth confirming against the
 * hardware spec.
 */
static const uint32_t umc_v12_0_ma2na_mapping[] = {
	0,  5,  6,  8,  9,  14, 12, 13,
	10, 11, 15, 16, 17, 18, 19, 20,
	21, 22, 23, 24, 25, 26, 27, 28,
	24, 7,  29, 30,
};

7e6ec099 CL |
52 | static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, |
53 | uint32_t node_inst, | |
54 | uint32_t umc_inst, | |
55 | uint32_t ch_inst) | |
56 | { | |
57 | uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst; | |
58 | uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET; | |
59 | ||
60 | umc_inst = index / 4; | |
61 | ch_inst = index % 4; | |
62 | ||
63 | return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst + | |
64 | UMC_V12_0_NODE_DIST * node_inst + cross_node_offset; | |
65 | } | |
66 | ||
67 | static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev, | |
68 | uint32_t node_inst, uint32_t umc_inst, | |
69 | uint32_t ch_inst, void *data) | |
70 | { | |
71 | uint64_t odecc_err_cnt_addr; | |
72 | uint64_t umc_reg_offset = | |
73 | get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); | |
74 | ||
75 | odecc_err_cnt_addr = | |
76 | SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt); | |
77 | ||
78 | /* clear error count */ | |
79 | WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, | |
80 | UMC_V12_0_CE_CNT_INIT); | |
81 | ||
82 | return 0; | |
83 | } | |
84 | ||
/* Reset the on-die ECC error counters on every UMC channel of the device. */
static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_reset_error_count_per_channel, NULL);
}

bf13da6a | 91 | bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status) |
d59fcfb0 CL |
92 | { |
93 | return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && | |
e020d015 | 94 | (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || |
d59fcfb0 CL |
95 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || |
96 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)); | |
97 | } | |
98 | ||
bf13da6a | 99 | bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status) |
d59fcfb0 CL |
100 | { |
101 | return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && | |
102 | (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 || | |
103 | (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 && | |
104 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 0) || | |
105 | /* Identify data parity error in replay mode */ | |
106 | ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0x5 || | |
107 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0xb) && | |
108 | !(umc_v12_0_is_uncorrectable_error(mc_umc_status))))); | |
109 | } | |
110 | ||
7e6ec099 CL |
111 | static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev, |
112 | uint64_t umc_reg_offset, | |
113 | unsigned long *error_count) | |
114 | { | |
115 | uint64_t mc_umc_status; | |
116 | uint64_t mc_umc_status_addr; | |
117 | ||
118 | mc_umc_status_addr = | |
119 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); | |
120 | ||
121 | /* Rely on MCUMC_STATUS for correctable error counter | |
122 | * MCUMC_STATUS is a 64 bit register | |
123 | */ | |
124 | mc_umc_status = | |
125 | RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); | |
126 | ||
d59fcfb0 | 127 | if (umc_v12_0_is_correctable_error(mc_umc_status)) |
7e6ec099 CL |
128 | *error_count += 1; |
129 | } | |
130 | ||
131 | static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev, | |
132 | uint64_t umc_reg_offset, | |
133 | unsigned long *error_count) | |
134 | { | |
135 | uint64_t mc_umc_status; | |
136 | uint64_t mc_umc_status_addr; | |
137 | ||
138 | mc_umc_status_addr = | |
139 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); | |
140 | ||
141 | /* Check the MCUMC_STATUS. */ | |
142 | mc_umc_status = | |
143 | RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); | |
144 | ||
d59fcfb0 | 145 | if (umc_v12_0_is_uncorrectable_error(mc_umc_status)) |
7e6ec099 CL |
146 | *error_count += 1; |
147 | } | |
148 | ||
149 | static int umc_v12_0_query_error_count(struct amdgpu_device *adev, | |
150 | uint32_t node_inst, uint32_t umc_inst, | |
151 | uint32_t ch_inst, void *data) | |
152 | { | |
153 | struct ras_err_data *err_data = (struct ras_err_data *)data; | |
3bba4bc6 YW |
154 | unsigned long ue_count = 0, ce_count = 0; |
155 | ||
156 | /* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3], | |
157 | * which can be used as die ID directly */ | |
158 | struct amdgpu_smuio_mcm_config_info mcm_info = { | |
159 | .socket_id = adev->smuio.funcs->get_socket_id(adev), | |
160 | .die_id = node_inst, | |
161 | }; | |
162 | ||
7e6ec099 CL |
163 | uint64_t umc_reg_offset = |
164 | get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); | |
165 | ||
3bba4bc6 YW |
166 | umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count); |
167 | umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count); | |
168 | ||
169 | amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); | |
170 | amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); | |
7e6ec099 CL |
171 | |
172 | return 0; | |
173 | } | |
174 | ||
/*
 * RAS hw_ops entry: accumulate CE/UE error counts from every UMC channel
 * into @ras_error_status, then clear the hardware counters so the next
 * query reports fresh values.
 */
static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_query_error_count, ras_error_status);

	umc_v12_0_reset_error_count(adev);
}

/*
 * XOR-reduce all bits of @val, i.e. return its (odd) parity:
 * true when an odd number of bits are set, false otherwise.
 */
static bool umc_v12_0_bit_wise_xor(uint32_t val)
{
	uint32_t parity = 0;

	while (val) {
		parity ^= val & 0x1;
		val >>= 1;
	}

	return parity;
}

/*
 * Translate an MCA error address into retired SoC physical pages and log
 * them into @err_data.
 *
 * Steps visible below:
 *  1. de-hash the bank bits of the MCA address (bank hash XOR network),
 *  2. convert the MCA address to a normalized address via
 *     umc_v12_0_ma2na_mapping,
 *  3. combine the normalized address with the channel index into a SoC
 *     physical address,
 *  4. for every combination of column bits [C4 C3 C2] (and the R13 bit
 *     flip), record a retired page.
 *
 * NOTE(review): bit positions (col = addr[5:1], row = addr[23:10],
 * row_xor flips bit 13) come from the UMC v12 address layout — confirm
 * against the hardware addressing spec when modifying.
 */
static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data, uint64_t err_addr,
					uint32_t ch_inst, uint32_t umc_inst,
					uint32_t node_inst)
{
	uint32_t channel_index, i;
	uint64_t soc_pa, na, retired_page, column;
	uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row, row_xor;
	uint32_t bank0, bank1, bank2, bank3, bank;

	/* extract the hashed bank bits and the column/row fields */
	bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
	bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
	bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
	bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
	col = (err_addr >> 1) & 0x1fULL;
	row = (err_addr >> 10) & 0x3fffULL;

	/* apply bank hash algorithm */
	bank0 =
		bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
	bank1 =
		bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
	bank2 =
		bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
	bank3 =
		bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));

	/* write the de-hashed bank back into the error address */
	bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
	err_addr &= ~0x3c0ULL;
	err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);

	na = 0x0;
	/* convert mca error address to normalized address */
	for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
		na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];

	channel_index =
		adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
			adev->umc.channel_inst_num +
			umc_inst * adev->umc.channel_inst_num +
			ch_inst];
	/* translate umc channel address to soc pa, 3 parts are included */
	soc_pa = ADDR_OF_32KB_BLOCK(na) |
		ADDR_OF_256B_BLOCK(channel_index) |
		OFFSET_IN_256B_BLOCK(na);

	/* the umc channel bits are not original values, they are hashed */
	UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);

	/* clear [C3 C2] in soc physical address */
	soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
	/* clear [C4] in soc physical address */
	soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);

	/* row address with bit 13 flipped, reported for the R13-shifted page */
	row_xor = row ^ (0x1ULL << 13);
	/* loop for all possibilities of [C4 C3 C2] */
	for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
		retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
		retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
		/* include column bit 0 and 1 */
		col &= 0x3;
		col |= (column << 2);
		dev_info(adev->dev,
			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
			retired_page, row, col, bank, channel_index);
		amdgpu_umc_fill_error_record(err_data, err_addr,
			retired_page, channel_index, umc_inst);

		/* shift R13 bit */
		retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
		dev_info(adev->dev,
			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
			retired_page, row_xor, col, bank, channel_index);
		amdgpu_umc_fill_error_record(err_data, err_addr,
			retired_page, channel_index, umc_inst);
	}
}

/*
 * Per-channel callback: if MCUMC_STATUS reports a valid uncorrectable
 * error with a valid address, read the MCA error address, convert it to
 * retired pages (via umc_v12_0_convert_error_address()) and record them
 * into the ras_err_data passed through @data. The status register is
 * cleared on every path where it was non-zero. Always returns 0 so the
 * channel iteration continues.
 */
static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint64_t mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr;
	uint64_t mc_umc_addrt0;
	struct ras_err_data *err_data = (struct ras_err_data *)data;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);

	/* nothing logged on this channel */
	if (mc_umc_status == 0)
		return 0;

	/* caller supplied no address buffer: just acknowledge the error */
	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

		return 0;
	}

	/* calculate error address if ue error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) {

		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

		err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4);

		/* keep only the ErrorAddr field of the raw register value */
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		umc_v12_0_convert_error_address(adev, err_data, err_addr,
					ch_inst, umc_inst, node_inst);
	}

	/* clear umc status */
	WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

	return 0;
}

/*
 * RAS hw_ops entry: collect error addresses from every UMC channel into
 * @ras_error_status.
 */
static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
					void *ras_error_status)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_query_error_address, ras_error_status);
}

336 | static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, | |
337 | uint32_t node_inst, uint32_t umc_inst, | |
338 | uint32_t ch_inst, void *data) | |
339 | { | |
340 | uint32_t odecc_cnt_sel; | |
341 | uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr; | |
342 | uint64_t umc_reg_offset = | |
343 | get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); | |
344 | ||
345 | odecc_cnt_sel_addr = | |
346 | SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel); | |
347 | odecc_err_cnt_addr = | |
348 | SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt); | |
349 | ||
350 | odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4); | |
351 | ||
352 | /* set ce error interrupt type to APIC based interrupt */ | |
353 | odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel, | |
354 | OdEccErrInt, 0x1); | |
355 | WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel); | |
356 | ||
357 | /* set error count to initial value */ | |
358 | WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT); | |
359 | ||
360 | return 0; | |
361 | } | |
362 | ||
/* Initialize error counters and CE interrupt routing on all UMC channels. */
static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_err_cnt_init_per_channel, NULL);
}

/* Report whether the UMC operates in RAS poison mode. */
static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
{
	/*
	 * Force return true, because regUMCCH0_EccCtrl
	 * is not accessible from host side
	 */
	return true;
}

/* Hardware-level RAS callbacks for UMC v12.0. */
const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
	.query_ras_error_count = umc_v12_0_query_ras_error_count,
	.query_ras_error_address = umc_v12_0_query_ras_error_address,
};

/* UMC v12.0 RAS block descriptor registered with the amdgpu RAS core. */
struct amdgpu_umc_ras umc_v12_0_ras = {
	.ras_block = {
		.hw_ops = &umc_v12_0_ras_hw_ops,
	},
	.err_cnt_init = umc_v12_0_err_cnt_init,
	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
};