/* drivers/edac/mce_amd.c — AMD machine-check exception decoder */
1 #include <linux/module.h>
2 #include <linux/slab.h>
3
4 #include "mce_amd.h"
5
/* Per-family decoder callbacks, selected once in mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask applied to the extended error code; overridden per family in mce_amd_init(). */
static u8 xec_mask = 0xf;

/* When false (default), amd_filter_mce() drops NB GART TLB error records. */
static bool report_gart_errors;
/* Optional DRAM ECC decoder installed via amd_register_ecc_decoder(). */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
12
/*
 * Enable/disable reporting of GART TLB errors. Consulted by amd_filter_mce(),
 * which drops bank 4 / xec 0x5 records when reporting is off.
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18
/*
 * Install a DRAM ECC decoder callback; it is invoked with the node id and
 * the MCE record from decode_mc4_mce() and decode_smca_errors().
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
26 {
27 if (decode_dram_ecc) {
28 WARN_ON(decode_dram_ecc != f);
29
30 decode_dram_ecc = NULL;
31 }
32 }
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 *
 * These tables are indexed with the corresponding EC bit-field extractors
 * (see the *_MSG() macros used throughout this file).
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
63
/*
 * Extended-error-code description tables for the legacy (pre-SMCA) banks.
 * Indexing is not always 1:1 with xec — see the family decoders below for
 * the applied offsets.
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error", /* xec = 0xd */
	"Microcode Patch Buffer", /* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills", /* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error", /* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error", /* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
149
/*
 * Scalable MCA error strings. One table per SMCA bank type, indexed
 * directly by the extended error code; lengths are bounds-checked via
 * smca_mce_descs[] in decode_smca_errors().
 */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};

static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};

static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};

static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};

static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};

static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};

static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};

static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};
285
/* Pairs a description table with its length for bounds checking. */
struct smca_mce_desc {
	const char * const *descs;	/* per-xec error strings */
	unsigned int num_descs;		/* number of valid entries */
};

/* SMCA bank type -> description table, used by decode_smca_errors(). */
static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
};
306
307 static bool f12h_mc0_mce(u16 ec, u8 xec)
308 {
309 bool ret = false;
310
311 if (MEM_ERROR(ec)) {
312 u8 ll = LL(ec);
313 ret = true;
314
315 if (ll == LL_L2)
316 pr_cont("during L1 linefill from L2.\n");
317 else if (ll == LL_L1)
318 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
319 else
320 ret = false;
321 }
322 return ret;
323 }
324
325 static bool f10h_mc0_mce(u16 ec, u8 xec)
326 {
327 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
328 pr_cont("during data scrub.\n");
329 return true;
330 }
331 return f12h_mc0_mce(ec, xec);
332 }
333
334 static bool k8_mc0_mce(u16 ec, u8 xec)
335 {
336 if (BUS_ERROR(ec)) {
337 pr_cont("during system linefill.\n");
338 return true;
339 }
340
341 return f10h_mc0_mce(ec, xec);
342 }
343
/*
 * MC0 decoder for the "cat" families (F14h and F16h, see mce_amd_init()).
 * Returns true when the signature was recognized and a message printed.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Only L1 data errors are decoded on this path. */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		/* pr_cont continues this line with the specific cause below. */
		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
395
/*
 * F15h MC0 decoder. Memory errors are decoded by xec value; bus and
 * internal errors get generic messages. Returns false for signatures
 * that are not recognized.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}
448
/*
 * Decode a bank 0 error: TLB signatures are handled here directly,
 * everything else goes through the per-family mc0_mce() callback.
 */
static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
		/*
		 * NOTE(review): a TLB error with TT != DATA prints nothing
		 * beyond the banner and is not reported as corrupted —
		 * confirm that is intentional.
		 */
	} else if (fam_ops->mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}
469
/*
 * K8 MC1 (instruction cache) decoder: only memory errors at L1/L2 are
 * recognized. Returns true when a message was printed.
 */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}
503
/*
 * MC1 decoder for the "cat" families (F14h/F16h). Note the check order:
 * the R4 signatures are matched before the xec-based ones.
 */
static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	/* Only instruction-side errors are decoded here. */
	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}
528
/*
 * F15h MC1 decoder. The xec ranges are sparse, so fixed offsets map
 * them into f15h_mc1_mce_desc[]:
 *   0x0..0xa  -> index xec        (first 11 entries)
 *   0xd       -> index xec - 2    ("PFB valid bit parity error")
 *   0x10      -> index xec - 4    ("Microcode Patch Buffer")
 *   0x11..0x15-> index xec - 4    (decoder-array parity entries)
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
558
/*
 * Decode a bank 1 error: TLB, bus, and internal signatures are handled
 * here; the remainder goes through the per-family mc1_mce() callback.
 */
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		/*
		 * On K8 (family 0xf), status bit 58 distinguishes the system
		 * linefill case — presumably a K8-specific status encoding;
		 * confirm against the BKDG.
		 */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops->mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}
588
/*
 * K8 MC2 (L2 cache) decoder. Note: the leading messages continue the
 * "MC2 Error" banner printed by decode_mc2_mce(), hence the leading
 * space/colon in the format strings.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}
624
/*
 * F15h MC2 decoder. Memory errors index f15h_mc2_mce_desc[]:
 * xec 0x4..0xc map directly (offset 0x4); 0x10..0x14 map with
 * offset 0x7 to the entries starting at "L2 Tag ECC error".
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		/*
		 * NOTE(review): this line is printed even when xec > 2 has
		 * just flagged the record as unrecognized, so the caller's
		 * "Corrupted" message is appended after it — confirm intended.
		 */
		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}
663
/*
 * F16h MC2 decoder: only memory errors are recognized; the message
 * qualifier is derived from the memory transaction type (R4).
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}
704
705 static void decode_mc2_mce(struct mce *m)
706 {
707 u16 ec = EC(m->status);
708 u8 xec = XEC(m->status, xec_mask);
709
710 pr_emerg(HW_ERR "MC2 Error: ");
711
712 if (!fam_ops->mc2_mce(ec, xec))
713 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
714 }
715
/*
 * Decode a bank 3 error. MC3 is only expected on families below 0x14;
 * the only recognized signature is a bus-error data read/write (xec 0).
 */
static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}
744
/*
 * Decode a bank 4 (northbridge) error. The xec indexes mc4_mce_desc[];
 * for xec 0x1c..0x1f an offset of 13 maps onto the trailing L3/probe-
 * filter entries of that table. DRAM ECC errors (xec 0x0/0x8) are also
 * forwarded to the registered DRAM ECC decoder.
 */
static void decode_mc4_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);	/* MC4 always uses a 5-bit xec mask */
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (c->x86 == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		/* 0x1c - 13 == 0xf: first L3 entry in mc4_mce_desc[] */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}
802
/*
 * Decode a bank 5 error. MC5 is not decoded on K8 (0xf) and F11h.
 * The xec indexes mc5_mce_desc[]; entries 0x0 and 0xc are printed
 * verbatim, the rest get a " parity error" suffix.
 */
static void decode_mc5_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (c->x86 == 0xf || c->x86 == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}
834
835 static void decode_mc6_mce(struct mce *m)
836 {
837 u8 xec = XEC(m->status, xec_mask);
838
839 pr_emerg(HW_ERR "MC6 Error: ");
840
841 if (xec > 0x5)
842 goto wrong_mc6_mce;
843
844 pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
845 return;
846
847 wrong_mc6_mce:
848 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
849 }
850
/* Decode errors according to Scalable MCA specification */
static void decode_smca_errors(struct mce *m)
{
	struct smca_hwid *hwid;
	unsigned int bank_type;
	const char *ip_name;
	u8 xec = XEC(m->status, xec_mask);

	if (m->bank >= ARRAY_SIZE(smca_banks))
		return;

	if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");

	/* Bank not populated/recognized during SMCA setup: nothing to decode. */
	hwid = smca_banks[m->bank].hwid;
	if (!hwid)
		return;

	bank_type = hwid->bank_type;
	ip_name = smca_get_long_name(bank_type);

	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);

	/* Only print the decode of valid error codes */
	if (xec < smca_mce_descs[bank_type].num_descs &&
	    (hwid->xec_bitmap & BIT_ULL(xec))) {
		pr_emerg(HW_ERR "%s Error: ", ip_name);
		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
	}

	/*
	 * amd_get_nb_id() returns the last level cache id.
	 * The last level cache on Fam17h is 1 level below the node.
	 */
	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
		decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
}
888
/*
 * Print the generic decomposition of the 16-bit error code: cache level,
 * transaction/IO type, memory transaction, and (for bus errors) the
 * participation/timeout fields.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}
912
913 /*
914 * Filter out unwanted MCE signatures here.
915 */
916 static bool amd_filter_mce(struct mce *m)
917 {
918 u8 xec = (m->status >> 16) & 0x1f;
919
920 /*
921 * NB GART TLB error reporting is disabled by default.
922 */
923 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
924 return true;
925
926 return false;
927 }
928
929 static const char *decode_error_status(struct mce *m)
930 {
931 if (m->status & MCI_STATUS_UC) {
932 if (m->status & MCI_STATUS_PCC)
933 return "System Fatal error.";
934 if (m->mcgstatus & MCG_STATUS_RIPV)
935 return "Uncorrected, software restartable error.";
936 return "Uncorrected, software containable error.";
937 }
938
939 if (m->status & MCI_STATUS_DEFERRED)
940 return "Deferred error, no action required.";
941
942 return "Corrected error, no action required.";
943 }
944
/*
 * Notifier-chain entry point: pretty-print one MCE record. Builds the
 * status line piece by piece with pr_cont (statement order is the output
 * order), then dispatches to the SMCA or per-bank legacy decoders, and
 * finally prints the generic error-code decomposition.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	if (c->x86 >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (c->x86 != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

		/* TCC is only meaningful when the bank runs in MCAX mode. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		/* SMCA systems bypass the legacy per-bank decoders entirely. */
		decode_smca_errors(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* No per-family ops means mce_amd_init() failed or has not run. */
	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

err_code:
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
1053
/* Hooked onto the MCE decode chain (with EDAC priority) in mce_amd_init(). */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
1058
/*
 * Select the per-family decoder callbacks and xec mask, then register
 * on the MCE decode chain. Families without known decoders (or F17h
 * without SMCA) bail out and free the ops allocation.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* Model 0x60+ uses a wider extended error code field. */
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
		/* F17h decoding goes through decode_smca_errors() only. */
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);
1141
#ifdef MODULE
/* Module teardown: unhook from the decode chain and drop the family ops. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif