2 * Copyright 2018 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
24 #include <linux/debugfs.h>
25 #include <linux/list.h>
26 #include <linux/module.h>
27 #include <linux/uaccess.h>
28 #include <linux/reboot.h>
29 #include <linux/syscalls.h>
30 #include <linux/pm_runtime.h>
33 #include "amdgpu_ras.h"
34 #include "amdgpu_atomfirmware.h"
35 #include "amdgpu_xgmi.h"
36 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
37 #include "nbio_v4_3.h"
38 #include "nbio_v7_9.h"
40 #include "amdgpu_reset.h"
42 #ifdef CONFIG_X86_MCE_AMD
45 static bool notifier_registered
;
47 static const char *RAS_FS_NAME
= "ras";
49 const char *ras_error_string
[] = {
53 "multi_uncorrectable",
57 const char *ras_block_string
[] = {
77 const char *ras_mca_block_string
[] = {
84 struct amdgpu_ras_block_list
{
86 struct list_head node
;
88 struct amdgpu_ras_block_object
*ras_obj
;
91 const char *get_ras_block_str(struct ras_common_if
*ras_block
)
96 if (ras_block
->block
>= AMDGPU_RAS_BLOCK_COUNT
)
97 return "OUT OF RANGE";
99 if (ras_block
->block
== AMDGPU_RAS_BLOCK__MCA
)
100 return ras_mca_block_string
[ras_block
->sub_block_index
];
102 return ras_block_string
[ras_block
->block
];
105 #define ras_block_str(_BLOCK_) \
106 (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
108 #define ras_err_str(i) (ras_error_string[ffs(i)])
110 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
112 /* inject address is 52 bits */
113 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
115 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
116 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
118 enum amdgpu_ras_retire_page_reservation
{
119 AMDGPU_RAS_RETIRE_PAGE_RESERVED
,
120 AMDGPU_RAS_RETIRE_PAGE_PENDING
,
121 AMDGPU_RAS_RETIRE_PAGE_FAULT
,
124 atomic_t amdgpu_ras_in_intr
= ATOMIC_INIT(0);
126 static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras
*con
,
128 static bool amdgpu_ras_check_bad_page(struct amdgpu_device
*adev
,
130 #ifdef CONFIG_X86_MCE_AMD
131 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device
*adev
);
132 struct mce_notifier_adev_list
{
133 struct amdgpu_device
*devs
[MAX_GPU_INSTANCE
];
136 static struct mce_notifier_adev_list mce_adev_list
;
139 void amdgpu_ras_set_error_query_ready(struct amdgpu_device
*adev
, bool ready
)
141 if (adev
&& amdgpu_ras_get_context(adev
))
142 amdgpu_ras_get_context(adev
)->error_query_ready
= ready
;
145 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device
*adev
)
147 if (adev
&& amdgpu_ras_get_context(adev
))
148 return amdgpu_ras_get_context(adev
)->error_query_ready
;
153 static int amdgpu_reserve_page_direct(struct amdgpu_device
*adev
, uint64_t address
)
155 struct ras_err_data err_data
;
156 struct eeprom_table_record err_rec
;
159 if ((address
>= adev
->gmc
.mc_vram_size
) ||
160 (address
>= RAS_UMC_INJECT_ADDR_LIMIT
)) {
162 "RAS WARN: input address 0x%llx is invalid.\n",
167 if (amdgpu_ras_check_bad_page(adev
, address
)) {
169 "RAS WARN: 0x%llx has already been marked as bad page!\n",
174 ret
= amdgpu_ras_error_data_init(&err_data
);
178 memset(&err_rec
, 0x0, sizeof(struct eeprom_table_record
));
179 err_data
.err_addr
= &err_rec
;
180 amdgpu_umc_fill_error_record(&err_data
, address
, address
, 0, 0);
182 if (amdgpu_bad_page_threshold
!= 0) {
183 amdgpu_ras_add_bad_pages(adev
, err_data
.err_addr
,
184 err_data
.err_addr_cnt
);
185 amdgpu_ras_save_bad_pages(adev
, NULL
);
188 amdgpu_ras_error_data_fini(&err_data
);
190 dev_warn(adev
->dev
, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
191 dev_warn(adev
->dev
, "Clear EEPROM:\n");
192 dev_warn(adev
->dev
, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
197 static ssize_t
amdgpu_ras_debugfs_read(struct file
*f
, char __user
*buf
,
198 size_t size
, loff_t
*pos
)
200 struct ras_manager
*obj
= (struct ras_manager
*)file_inode(f
)->i_private
;
201 struct ras_query_if info
= {
207 if (amdgpu_ras_query_error_status(obj
->adev
, &info
))
210 /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
211 if (amdgpu_ip_version(obj
->adev
, MP0_HWIP
, 0) != IP_VERSION(11, 0, 2) &&
212 amdgpu_ip_version(obj
->adev
, MP0_HWIP
, 0) != IP_VERSION(11, 0, 4)) {
213 if (amdgpu_ras_reset_error_status(obj
->adev
, info
.head
.block
))
214 dev_warn(obj
->adev
->dev
, "Failed to reset error counter and error status");
217 s
= snprintf(val
, sizeof(val
), "%s: %lu\n%s: %lu\n",
219 "ce", info
.ce_count
);
224 s
= min_t(u64
, s
, size
);
227 if (copy_to_user(buf
, &val
[*pos
], s
))
235 static const struct file_operations amdgpu_ras_debugfs_ops
= {
236 .owner
= THIS_MODULE
,
237 .read
= amdgpu_ras_debugfs_read
,
239 .llseek
= default_llseek
242 static int amdgpu_ras_find_block_id_by_name(const char *name
, int *block_id
)
246 for (i
= 0; i
< ARRAY_SIZE(ras_block_string
); i
++) {
248 if (strcmp(name
, ras_block_string
[i
]) == 0)
254 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file
*f
,
255 const char __user
*buf
, size_t size
,
256 loff_t
*pos
, struct ras_debug_if
*data
)
258 ssize_t s
= min_t(u64
, 64, size
);
266 /* default value is 0 if the mask is not set by user */
267 u32 instance_mask
= 0;
273 memset(str
, 0, sizeof(str
));
274 memset(data
, 0, sizeof(*data
));
276 if (copy_from_user(str
, buf
, s
))
279 if (sscanf(str
, "disable %32s", block_name
) == 1)
281 else if (sscanf(str
, "enable %32s %8s", block_name
, err
) == 2)
283 else if (sscanf(str
, "inject %32s %8s", block_name
, err
) == 2)
285 else if (strstr(str
, "retire_page") != NULL
)
287 else if (str
[0] && str
[1] && str
[2] && str
[3])
288 /* ascii string, but commands are not matched. */
293 if (sscanf(str
, "%*s 0x%llx", &address
) != 1 &&
294 sscanf(str
, "%*s %llu", &address
) != 1)
298 data
->inject
.address
= address
;
303 if (amdgpu_ras_find_block_id_by_name(block_name
, &block_id
))
306 data
->head
.block
= block_id
;
307 /* only ue and ce errors are supported */
308 if (!memcmp("ue", err
, 2))
309 data
->head
.type
= AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE
;
310 else if (!memcmp("ce", err
, 2))
311 data
->head
.type
= AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE
;
318 if (sscanf(str
, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
319 &sub_block
, &address
, &value
, &instance_mask
) != 4 &&
320 sscanf(str
, "%*s %*s %*s %u %llu %llu %u",
321 &sub_block
, &address
, &value
, &instance_mask
) != 4 &&
322 sscanf(str
, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
323 &sub_block
, &address
, &value
) != 3 &&
324 sscanf(str
, "%*s %*s %*s %u %llu %llu",
325 &sub_block
, &address
, &value
) != 3)
327 data
->head
.sub_block_index
= sub_block
;
328 data
->inject
.address
= address
;
329 data
->inject
.value
= value
;
330 data
->inject
.instance_mask
= instance_mask
;
333 if (size
< sizeof(*data
))
336 if (copy_from_user(data
, buf
, sizeof(*data
)))
343 static void amdgpu_ras_instance_mask_check(struct amdgpu_device
*adev
,
344 struct ras_debug_if
*data
)
346 int num_xcc
= adev
->gfx
.xcc_mask
? NUM_XCC(adev
->gfx
.xcc_mask
) : 1;
347 uint32_t mask
, inst_mask
= data
->inject
.instance_mask
;
349 /* no need to set instance mask if there is only one instance */
350 if (num_xcc
<= 1 && inst_mask
) {
351 data
->inject
.instance_mask
= 0;
353 "RAS inject mask(0x%x) isn't supported and force it to 0.\n",
359 switch (data
->head
.block
) {
360 case AMDGPU_RAS_BLOCK__GFX
:
361 mask
= GENMASK(num_xcc
- 1, 0);
363 case AMDGPU_RAS_BLOCK__SDMA
:
364 mask
= GENMASK(adev
->sdma
.num_instances
- 1, 0);
366 case AMDGPU_RAS_BLOCK__VCN
:
367 case AMDGPU_RAS_BLOCK__JPEG
:
368 mask
= GENMASK(adev
->vcn
.num_vcn_inst
- 1, 0);
375 /* remove invalid bits in instance mask */
376 data
->inject
.instance_mask
&= mask
;
377 if (inst_mask
!= data
->inject
.instance_mask
)
379 "Adjust RAS inject mask 0x%x to 0x%x\n",
380 inst_mask
, data
->inject
.instance_mask
);
384 * DOC: AMDGPU RAS debugfs control interface
386 * The control interface accepts struct ras_debug_if which has two members.
388 * First member: ras_debug_if::head or ras_debug_if::inject.
390 * head is used to indicate which IP block will be under control.
392 * head has four members, they are block, type, sub_block_index, name.
393 * block: which IP will be under control.
394 * type: what kind of error will be enabled/disabled/injected.
395 * sub_block_index: some IPs have subcomponets. say, GFX, sDMA.
396 * name: the name of IP.
398 * inject has three more members than head, they are address, value and mask.
399 * As their names indicate, inject operation will write the
400 * value to the address.
402 * The second member: struct ras_debug_if::op.
403 * It has three kinds of operations.
405 * - 0: disable RAS on the block. Take ::head as its data.
406 * - 1: enable RAS on the block. Take ::head as its data.
407 * - 2: inject errors on the block. Take ::inject as its data.
409 * How to use the interface?
413 * Copy the struct ras_debug_if in your code and initialize it.
414 * Write the struct to the control interface.
418 * .. code-block:: bash
420 * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
421 * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
422 * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
424 * Where N, is the card which you want to affect.
426 * "disable" requires only the block.
427 * "enable" requires the block and error type.
428 * "inject" requires the block, error type, address, and value.
430 * The block is one of: umc, sdma, gfx, etc.
431 * see ras_block_string[] for details
433 * The error type is one of: ue, ce, where,
434 * ue is multi-uncorrectable
435 * ce is single-correctable
437 * The sub-block is a the sub-block index, pass 0 if there is no sub-block.
438 * The address and value are hexadecimal numbers, leading 0x is optional.
439 * The mask means instance mask, is optional, default value is 0x1.
443 * .. code-block:: bash
445 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
446 * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
447 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
449 * How to check the result of the operation?
451 * To check disable/enable, see "ras" features at,
452 * /sys/class/drm/card[0/1/2...]/device/ras/features
454 * To check inject, see the corresponding error count at,
455 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
458 * Operations are only allowed on blocks which are supported.
459 * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
460 * to see which blocks support RAS on a particular asic.
463 static ssize_t
amdgpu_ras_debugfs_ctrl_write(struct file
*f
,
464 const char __user
*buf
,
465 size_t size
, loff_t
*pos
)
467 struct amdgpu_device
*adev
= (struct amdgpu_device
*)file_inode(f
)->i_private
;
468 struct ras_debug_if data
;
471 if (!amdgpu_ras_get_error_query_ready(adev
)) {
472 dev_warn(adev
->dev
, "RAS WARN: error injection "
473 "currently inaccessible\n");
477 ret
= amdgpu_ras_debugfs_ctrl_parse_data(f
, buf
, size
, pos
, &data
);
482 ret
= amdgpu_reserve_page_direct(adev
, data
.inject
.address
);
489 if (!amdgpu_ras_is_supported(adev
, data
.head
.block
))
494 ret
= amdgpu_ras_feature_enable(adev
, &data
.head
, 0);
497 ret
= amdgpu_ras_feature_enable(adev
, &data
.head
, 1);
500 if ((data
.inject
.address
>= adev
->gmc
.mc_vram_size
&&
501 adev
->gmc
.mc_vram_size
) ||
502 (data
.inject
.address
>= RAS_UMC_INJECT_ADDR_LIMIT
)) {
503 dev_warn(adev
->dev
, "RAS WARN: input address "
504 "0x%llx is invalid.",
505 data
.inject
.address
);
510 /* umc ce/ue error injection for a bad page is not allowed */
511 if ((data
.head
.block
== AMDGPU_RAS_BLOCK__UMC
) &&
512 amdgpu_ras_check_bad_page(adev
, data
.inject
.address
)) {
513 dev_warn(adev
->dev
, "RAS WARN: inject: 0x%llx has "
514 "already been marked as bad!\n",
515 data
.inject
.address
);
519 amdgpu_ras_instance_mask_check(adev
, &data
);
521 /* data.inject.address is offset instead of absolute gpu address */
522 ret
= amdgpu_ras_error_inject(adev
, &data
.inject
);
536 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
538 * Some boards contain an EEPROM which is used to persistently store a list of
539 * bad pages which experiences ECC errors in vram. This interface provides
540 * a way to reset the EEPROM, e.g., after testing error injection.
544 * .. code-block:: bash
546 * echo 1 > ../ras/ras_eeprom_reset
548 * will reset EEPROM table to 0 entries.
551 static ssize_t
amdgpu_ras_debugfs_eeprom_write(struct file
*f
,
552 const char __user
*buf
,
553 size_t size
, loff_t
*pos
)
555 struct amdgpu_device
*adev
=
556 (struct amdgpu_device
*)file_inode(f
)->i_private
;
559 ret
= amdgpu_ras_eeprom_reset_table(
560 &(amdgpu_ras_get_context(adev
)->eeprom_control
));
563 /* Something was written to EEPROM.
565 amdgpu_ras_get_context(adev
)->flags
= RAS_DEFAULT_FLAGS
;
572 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops
= {
573 .owner
= THIS_MODULE
,
575 .write
= amdgpu_ras_debugfs_ctrl_write
,
576 .llseek
= default_llseek
579 static const struct file_operations amdgpu_ras_debugfs_eeprom_ops
= {
580 .owner
= THIS_MODULE
,
582 .write
= amdgpu_ras_debugfs_eeprom_write
,
583 .llseek
= default_llseek
587 * DOC: AMDGPU RAS sysfs Error Count Interface
589 * It allows the user to read the error count for each IP block on the gpu through
590 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
592 * It outputs the multiple lines which report the uncorrected (ue) and corrected
595 * The format of one line is below,
601 * .. code-block:: bash
607 static ssize_t
amdgpu_ras_sysfs_read(struct device
*dev
,
608 struct device_attribute
*attr
, char *buf
)
610 struct ras_manager
*obj
= container_of(attr
, struct ras_manager
, sysfs_attr
);
611 struct ras_query_if info
= {
615 if (!amdgpu_ras_get_error_query_ready(obj
->adev
))
616 return sysfs_emit(buf
, "Query currently inaccessible\n");
618 if (amdgpu_ras_query_error_status(obj
->adev
, &info
))
621 if (amdgpu_ip_version(obj
->adev
, MP0_HWIP
, 0) != IP_VERSION(11, 0, 2) &&
622 amdgpu_ip_version(obj
->adev
, MP0_HWIP
, 0) != IP_VERSION(11, 0, 4)) {
623 if (amdgpu_ras_reset_error_status(obj
->adev
, info
.head
.block
))
624 dev_warn(obj
->adev
->dev
, "Failed to reset error counter and error status");
627 return sysfs_emit(buf
, "%s: %lu\n%s: %lu\n", "ue", info
.ue_count
,
628 "ce", info
.ce_count
);
633 #define get_obj(obj) do { (obj)->use++; } while (0)
634 #define alive_obj(obj) ((obj)->use)
636 static inline void put_obj(struct ras_manager
*obj
)
638 if (obj
&& (--obj
->use
== 0)) {
639 list_del(&obj
->node
);
640 amdgpu_ras_error_data_fini(&obj
->err_data
);
643 if (obj
&& (obj
->use
< 0))
644 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj
->head
));
647 /* make one obj and return it. */
648 static struct ras_manager
*amdgpu_ras_create_obj(struct amdgpu_device
*adev
,
649 struct ras_common_if
*head
)
651 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
652 struct ras_manager
*obj
;
654 if (!adev
->ras_enabled
|| !con
)
657 if (head
->block
>= AMDGPU_RAS_BLOCK_COUNT
)
660 if (head
->block
== AMDGPU_RAS_BLOCK__MCA
) {
661 if (head
->sub_block_index
>= AMDGPU_RAS_MCA_BLOCK__LAST
)
664 obj
= &con
->objs
[AMDGPU_RAS_BLOCK__LAST
+ head
->sub_block_index
];
666 obj
= &con
->objs
[head
->block
];
668 /* already exist. return obj? */
672 if (amdgpu_ras_error_data_init(&obj
->err_data
))
677 list_add(&obj
->node
, &con
->head
);
683 /* return an obj equal to head, or the first when head is NULL */
684 struct ras_manager
*amdgpu_ras_find_obj(struct amdgpu_device
*adev
,
685 struct ras_common_if
*head
)
687 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
688 struct ras_manager
*obj
;
691 if (!adev
->ras_enabled
|| !con
)
695 if (head
->block
>= AMDGPU_RAS_BLOCK_COUNT
)
698 if (head
->block
== AMDGPU_RAS_BLOCK__MCA
) {
699 if (head
->sub_block_index
>= AMDGPU_RAS_MCA_BLOCK__LAST
)
702 obj
= &con
->objs
[AMDGPU_RAS_BLOCK__LAST
+ head
->sub_block_index
];
704 obj
= &con
->objs
[head
->block
];
709 for (i
= 0; i
< AMDGPU_RAS_BLOCK_COUNT
+ AMDGPU_RAS_MCA_BLOCK_COUNT
; i
++) {
720 /* feature ctl begin */
721 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device
*adev
,
722 struct ras_common_if
*head
)
724 return adev
->ras_hw_enabled
& BIT(head
->block
);
727 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device
*adev
,
728 struct ras_common_if
*head
)
730 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
732 return con
->features
& BIT(head
->block
);
736 * if obj is not created, then create one.
737 * set feature enable flag.
739 static int __amdgpu_ras_feature_enable(struct amdgpu_device
*adev
,
740 struct ras_common_if
*head
, int enable
)
742 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
743 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, head
);
745 /* If hardware does not support ras, then do not create obj.
746 * But if hardware support ras, we can create the obj.
747 * Ras framework checks con->hw_supported to see if it need do
748 * corresponding initialization.
749 * IP checks con->support to see if it need disable ras.
751 if (!amdgpu_ras_is_feature_allowed(adev
, head
))
756 obj
= amdgpu_ras_create_obj(adev
, head
);
760 /* In case we create obj somewhere else */
763 con
->features
|= BIT(head
->block
);
765 if (obj
&& amdgpu_ras_is_feature_enabled(adev
, head
)) {
766 con
->features
&= ~BIT(head
->block
);
774 /* wrapper of psp_ras_enable_features */
775 int amdgpu_ras_feature_enable(struct amdgpu_device
*adev
,
776 struct ras_common_if
*head
, bool enable
)
778 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
779 union ta_ras_cmd_input
*info
;
785 /* For non-gfx ip, do not enable ras feature if it is not allowed */
786 /* For gfx ip, regardless of feature support status, */
787 /* Force issue enable or disable ras feature commands */
788 if (head
->block
!= AMDGPU_RAS_BLOCK__GFX
&&
789 !amdgpu_ras_is_feature_allowed(adev
, head
))
792 /* Only enable gfx ras feature from host side */
793 if (head
->block
== AMDGPU_RAS_BLOCK__GFX
&&
794 !amdgpu_sriov_vf(adev
) &&
795 !amdgpu_ras_intr_triggered()) {
796 info
= kzalloc(sizeof(union ta_ras_cmd_input
), GFP_KERNEL
);
801 info
->disable_features
= (struct ta_ras_disable_features_input
) {
802 .block_id
= amdgpu_ras_block_to_ta(head
->block
),
803 .error_type
= amdgpu_ras_error_to_ta(head
->type
),
806 info
->enable_features
= (struct ta_ras_enable_features_input
) {
807 .block_id
= amdgpu_ras_block_to_ta(head
->block
),
808 .error_type
= amdgpu_ras_error_to_ta(head
->type
),
812 ret
= psp_ras_enable_features(&adev
->psp
, info
, enable
);
814 dev_err(adev
->dev
, "ras %s %s failed poison:%d ret:%d\n",
815 enable
? "enable":"disable",
816 get_ras_block_str(head
),
817 amdgpu_ras_is_poison_mode_supported(adev
), ret
);
826 __amdgpu_ras_feature_enable(adev
, head
, enable
);
831 /* Only used in device probe stage and called only once. */
832 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device
*adev
,
833 struct ras_common_if
*head
, bool enable
)
835 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
841 if (con
->flags
& AMDGPU_RAS_FLAG_INIT_BY_VBIOS
) {
843 /* There is no harm to issue a ras TA cmd regardless of
844 * the currecnt ras state.
845 * If current state == target state, it will do nothing
846 * But sometimes it requests driver to reset and repost
847 * with error code -EAGAIN.
849 ret
= amdgpu_ras_feature_enable(adev
, head
, 1);
850 /* With old ras TA, we might fail to enable ras.
851 * Log it and just setup the object.
852 * TODO need remove this WA in the future.
854 if (ret
== -EINVAL
) {
855 ret
= __amdgpu_ras_feature_enable(adev
, head
, 1);
858 "RAS INFO: %s setup object\n",
859 get_ras_block_str(head
));
862 /* setup the object then issue a ras TA disable cmd.*/
863 ret
= __amdgpu_ras_feature_enable(adev
, head
, 1);
867 /* gfx block ras dsiable cmd must send to ras-ta */
868 if (head
->block
== AMDGPU_RAS_BLOCK__GFX
)
869 con
->features
|= BIT(head
->block
);
871 ret
= amdgpu_ras_feature_enable(adev
, head
, 0);
873 /* clean gfx block ras features flag */
874 if (adev
->ras_enabled
&& head
->block
== AMDGPU_RAS_BLOCK__GFX
)
875 con
->features
&= ~BIT(head
->block
);
878 ret
= amdgpu_ras_feature_enable(adev
, head
, enable
);
883 static int amdgpu_ras_disable_all_features(struct amdgpu_device
*adev
,
886 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
887 struct ras_manager
*obj
, *tmp
;
889 list_for_each_entry_safe(obj
, tmp
, &con
->head
, node
) {
891 * aka just release the obj and corresponding flags
894 if (__amdgpu_ras_feature_enable(adev
, &obj
->head
, 0))
897 if (amdgpu_ras_feature_enable(adev
, &obj
->head
, 0))
902 return con
->features
;
905 static int amdgpu_ras_enable_all_features(struct amdgpu_device
*adev
,
908 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
910 const enum amdgpu_ras_error_type default_ras_type
= AMDGPU_RAS_ERROR__NONE
;
912 for (i
= 0; i
< AMDGPU_RAS_BLOCK_COUNT
; i
++) {
913 struct ras_common_if head
= {
915 .type
= default_ras_type
,
916 .sub_block_index
= 0,
919 if (i
== AMDGPU_RAS_BLOCK__MCA
)
924 * bypass psp. vbios enable ras for us.
925 * so just create the obj
927 if (__amdgpu_ras_feature_enable(adev
, &head
, 1))
930 if (amdgpu_ras_feature_enable(adev
, &head
, 1))
935 for (i
= 0; i
< AMDGPU_RAS_MCA_BLOCK_COUNT
; i
++) {
936 struct ras_common_if head
= {
937 .block
= AMDGPU_RAS_BLOCK__MCA
,
938 .type
= default_ras_type
,
939 .sub_block_index
= i
,
944 * bypass psp. vbios enable ras for us.
945 * so just create the obj
947 if (__amdgpu_ras_feature_enable(adev
, &head
, 1))
950 if (amdgpu_ras_feature_enable(adev
, &head
, 1))
955 return con
->features
;
957 /* feature ctl end */
959 static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object
*block_obj
,
960 enum amdgpu_ras_block block
)
965 if (block_obj
->ras_comm
.block
== block
)
971 static struct amdgpu_ras_block_object
*amdgpu_ras_get_ras_block(struct amdgpu_device
*adev
,
972 enum amdgpu_ras_block block
, uint32_t sub_block_index
)
974 struct amdgpu_ras_block_list
*node
, *tmp
;
975 struct amdgpu_ras_block_object
*obj
;
977 if (block
>= AMDGPU_RAS_BLOCK__LAST
)
980 list_for_each_entry_safe(node
, tmp
, &adev
->ras_list
, node
) {
981 if (!node
->ras_obj
) {
982 dev_warn(adev
->dev
, "Warning: abnormal ras list node.\n");
987 if (obj
->ras_block_match
) {
988 if (obj
->ras_block_match(obj
, block
, sub_block_index
) == 0)
991 if (amdgpu_ras_block_match_default(obj
, block
) == 0)
999 static void amdgpu_ras_get_ecc_info(struct amdgpu_device
*adev
, struct ras_err_data
*err_data
)
1001 struct amdgpu_ras
*ras
= amdgpu_ras_get_context(adev
);
1005 * choosing right query method according to
1006 * whether smu support query error information
1008 ret
= amdgpu_dpm_get_ecc_info(adev
, (void *)&(ras
->umc_ecc
));
1009 if (ret
== -EOPNOTSUPP
) {
1010 if (adev
->umc
.ras
&& adev
->umc
.ras
->ras_block
.hw_ops
&&
1011 adev
->umc
.ras
->ras_block
.hw_ops
->query_ras_error_count
)
1012 adev
->umc
.ras
->ras_block
.hw_ops
->query_ras_error_count(adev
, err_data
);
1014 /* umc query_ras_error_address is also responsible for clearing
1017 if (adev
->umc
.ras
&& adev
->umc
.ras
->ras_block
.hw_ops
&&
1018 adev
->umc
.ras
->ras_block
.hw_ops
->query_ras_error_address
)
1019 adev
->umc
.ras
->ras_block
.hw_ops
->query_ras_error_address(adev
, err_data
);
1021 if (adev
->umc
.ras
&&
1022 adev
->umc
.ras
->ecc_info_query_ras_error_count
)
1023 adev
->umc
.ras
->ecc_info_query_ras_error_count(adev
, err_data
);
1025 if (adev
->umc
.ras
&&
1026 adev
->umc
.ras
->ecc_info_query_ras_error_address
)
1027 adev
->umc
.ras
->ecc_info_query_ras_error_address(adev
, err_data
);
1031 static void amdgpu_ras_error_print_error_data(struct amdgpu_device
*adev
,
1032 struct ras_manager
*ras_mgr
,
1033 struct ras_err_data
*err_data
,
1034 const char *blk_name
,
1037 struct amdgpu_smuio_mcm_config_info
*mcm_info
;
1038 struct ras_err_node
*err_node
;
1039 struct ras_err_info
*err_info
;
1042 for_each_ras_error(err_node
, err_data
) {
1043 err_info
= &err_node
->err_info
;
1044 mcm_info
= &err_info
->mcm_info
;
1045 if (err_info
->ue_count
) {
1046 dev_info(adev
->dev
, "socket: %d, die: %d, "
1047 "%lld new uncorrectable hardware errors detected in %s block\n",
1048 mcm_info
->socket_id
,
1055 for_each_ras_error(err_node
, &ras_mgr
->err_data
) {
1056 err_info
= &err_node
->err_info
;
1057 mcm_info
= &err_info
->mcm_info
;
1058 dev_info(adev
->dev
, "socket: %d, die: %d, "
1059 "%lld uncorrectable hardware errors detected in total in %s block\n",
1060 mcm_info
->socket_id
, mcm_info
->die_id
, err_info
->ue_count
, blk_name
);
1064 for_each_ras_error(err_node
, err_data
) {
1065 err_info
= &err_node
->err_info
;
1066 mcm_info
= &err_info
->mcm_info
;
1067 if (err_info
->ce_count
) {
1068 dev_info(adev
->dev
, "socket: %d, die: %d, "
1069 "%lld new correctable hardware errors detected in %s block, "
1070 "no user action is needed\n",
1071 mcm_info
->socket_id
,
1078 for_each_ras_error(err_node
, &ras_mgr
->err_data
) {
1079 err_info
= &err_node
->err_info
;
1080 mcm_info
= &err_info
->mcm_info
;
1081 dev_info(adev
->dev
, "socket: %d, die: %d, "
1082 "%lld correctable hardware errors detected in total in %s block, "
1083 "no user action is needed\n",
1084 mcm_info
->socket_id
, mcm_info
->die_id
, err_info
->ce_count
, blk_name
);
1089 static inline bool err_data_has_source_info(struct ras_err_data
*data
)
1091 return !list_empty(&data
->err_node_list
);
1094 static void amdgpu_ras_error_generate_report(struct amdgpu_device
*adev
,
1095 struct ras_query_if
*query_if
,
1096 struct ras_err_data
*err_data
)
1098 struct ras_manager
*ras_mgr
= amdgpu_ras_find_obj(adev
, &query_if
->head
);
1099 const char *blk_name
= get_ras_block_str(&query_if
->head
);
1101 if (err_data
->ce_count
) {
1102 if (err_data_has_source_info(err_data
)) {
1103 amdgpu_ras_error_print_error_data(adev
, ras_mgr
, err_data
, blk_name
, false);
1104 } else if (!adev
->aid_mask
&&
1105 adev
->smuio
.funcs
&&
1106 adev
->smuio
.funcs
->get_socket_id
&&
1107 adev
->smuio
.funcs
->get_die_id
) {
1108 dev_info(adev
->dev
, "socket: %d, die: %d "
1109 "%ld correctable hardware errors "
1110 "detected in %s block, no user "
1111 "action is needed.\n",
1112 adev
->smuio
.funcs
->get_socket_id(adev
),
1113 adev
->smuio
.funcs
->get_die_id(adev
),
1114 ras_mgr
->err_data
.ce_count
,
1117 dev_info(adev
->dev
, "%ld correctable hardware errors "
1118 "detected in %s block, no user "
1119 "action is needed.\n",
1120 ras_mgr
->err_data
.ce_count
,
1125 if (err_data
->ue_count
) {
1126 if (err_data_has_source_info(err_data
)) {
1127 amdgpu_ras_error_print_error_data(adev
, ras_mgr
, err_data
, blk_name
, true);
1128 } else if (!adev
->aid_mask
&&
1129 adev
->smuio
.funcs
&&
1130 adev
->smuio
.funcs
->get_socket_id
&&
1131 adev
->smuio
.funcs
->get_die_id
) {
1132 dev_info(adev
->dev
, "socket: %d, die: %d "
1133 "%ld uncorrectable hardware errors "
1134 "detected in %s block\n",
1135 adev
->smuio
.funcs
->get_socket_id(adev
),
1136 adev
->smuio
.funcs
->get_die_id(adev
),
1137 ras_mgr
->err_data
.ue_count
,
1140 dev_info(adev
->dev
, "%ld uncorrectable hardware errors "
1141 "detected in %s block\n",
1142 ras_mgr
->err_data
.ue_count
,
1149 static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager
*obj
, struct ras_err_data
*err_data
)
1151 struct ras_err_node
*err_node
;
1152 struct ras_err_info
*err_info
;
1154 if (err_data_has_source_info(err_data
)) {
1155 for_each_ras_error(err_node
, err_data
) {
1156 err_info
= &err_node
->err_info
;
1158 amdgpu_ras_error_statistic_ce_count(&obj
->err_data
, &err_info
->mcm_info
, err_info
->ce_count
);
1159 amdgpu_ras_error_statistic_ue_count(&obj
->err_data
, &err_info
->mcm_info
, err_info
->ue_count
);
1162 /* for legacy asic path which doesn't has error source info */
1163 obj
->err_data
.ue_count
+= err_data
->ue_count
;
1164 obj
->err_data
.ce_count
+= err_data
->ce_count
;
1168 /* query/inject/cure begin */
1169 int amdgpu_ras_query_error_status(struct amdgpu_device
*adev
,
1170 struct ras_query_if
*info
)
1172 struct amdgpu_ras_block_object
*block_obj
= NULL
;
1173 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, &info
->head
);
1174 struct ras_err_data err_data
;
1180 ret
= amdgpu_ras_error_data_init(&err_data
);
1184 if (info
->head
.block
== AMDGPU_RAS_BLOCK__UMC
) {
1185 amdgpu_ras_get_ecc_info(adev
, &err_data
);
1187 block_obj
= amdgpu_ras_get_ras_block(adev
, info
->head
.block
, 0);
1188 if (!block_obj
|| !block_obj
->hw_ops
) {
1189 dev_dbg_once(adev
->dev
, "%s doesn't config RAS function\n",
1190 get_ras_block_str(&info
->head
));
1192 goto out_fini_err_data
;
1195 if (block_obj
->hw_ops
->query_ras_error_count
)
1196 block_obj
->hw_ops
->query_ras_error_count(adev
, &err_data
);
1198 if ((info
->head
.block
== AMDGPU_RAS_BLOCK__SDMA
) ||
1199 (info
->head
.block
== AMDGPU_RAS_BLOCK__GFX
) ||
1200 (info
->head
.block
== AMDGPU_RAS_BLOCK__MMHUB
)) {
1201 if (block_obj
->hw_ops
->query_ras_error_status
)
1202 block_obj
->hw_ops
->query_ras_error_status(adev
);
1206 amdgpu_rasmgr_error_data_statistic_update(obj
, &err_data
);
1208 info
->ue_count
= obj
->err_data
.ue_count
;
1209 info
->ce_count
= obj
->err_data
.ce_count
;
1211 amdgpu_ras_error_generate_report(adev
, info
, &err_data
);
1214 amdgpu_ras_error_data_fini(&err_data
);
1219 int amdgpu_ras_reset_error_count(struct amdgpu_device
*adev
,
1220 enum amdgpu_ras_block block
)
1222 struct amdgpu_ras_block_object
*block_obj
= amdgpu_ras_get_ras_block(adev
, block
, 0);
1223 struct amdgpu_ras
*ras
= amdgpu_ras_get_context(adev
);
1224 const struct amdgpu_mca_smu_funcs
*mca_funcs
= adev
->mca
.mca_funcs
;
1225 struct amdgpu_hive_info
*hive
;
1226 int hive_ras_recovery
= 0;
1228 if (!block_obj
|| !block_obj
->hw_ops
) {
1229 dev_dbg_once(adev
->dev
, "%s doesn't config RAS function\n",
1230 ras_block_str(block
));
1234 if (!amdgpu_ras_is_supported(adev
, block
) ||
1235 !amdgpu_ras_get_mca_debug_mode(adev
))
1238 hive
= amdgpu_get_xgmi_hive(adev
);
1240 hive_ras_recovery
= atomic_read(&hive
->ras_recovery
);
1241 amdgpu_put_xgmi_hive(hive
);
1244 /* skip ras error reset in gpu reset */
1245 if ((amdgpu_in_reset(adev
) || atomic_read(&ras
->in_recovery
) ||
1246 hive_ras_recovery
) &&
1247 mca_funcs
&& mca_funcs
->mca_set_debug_mode
)
1250 if (block_obj
->hw_ops
->reset_ras_error_count
)
1251 block_obj
->hw_ops
->reset_ras_error_count(adev
);
1256 int amdgpu_ras_reset_error_status(struct amdgpu_device
*adev
,
1257 enum amdgpu_ras_block block
)
1259 struct amdgpu_ras_block_object
*block_obj
= amdgpu_ras_get_ras_block(adev
, block
, 0);
1261 if (amdgpu_ras_reset_error_count(adev
, block
) == -EOPNOTSUPP
)
1264 if ((block
== AMDGPU_RAS_BLOCK__GFX
) ||
1265 (block
== AMDGPU_RAS_BLOCK__MMHUB
)) {
1266 if (block_obj
->hw_ops
->reset_ras_error_status
)
1267 block_obj
->hw_ops
->reset_ras_error_status(adev
);
1273 /* wrapper of psp_ras_trigger_error */
1274 int amdgpu_ras_error_inject(struct amdgpu_device
*adev
,
1275 struct ras_inject_if
*info
)
1277 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, &info
->head
);
1278 struct ta_ras_trigger_error_input block_info
= {
1279 .block_id
= amdgpu_ras_block_to_ta(info
->head
.block
),
1280 .inject_error_type
= amdgpu_ras_error_to_ta(info
->head
.type
),
1281 .sub_block_index
= info
->head
.sub_block_index
,
1282 .address
= info
->address
,
1283 .value
= info
->value
,
1286 struct amdgpu_ras_block_object
*block_obj
= amdgpu_ras_get_ras_block(adev
,
1288 info
->head
.sub_block_index
);
1290 /* inject on guest isn't allowed, return success directly */
1291 if (amdgpu_sriov_vf(adev
))
1297 if (!block_obj
|| !block_obj
->hw_ops
) {
1298 dev_dbg_once(adev
->dev
, "%s doesn't config RAS function\n",
1299 get_ras_block_str(&info
->head
));
1303 /* Calculate XGMI relative offset */
1304 if (adev
->gmc
.xgmi
.num_physical_nodes
> 1 &&
1305 info
->head
.block
!= AMDGPU_RAS_BLOCK__GFX
) {
1306 block_info
.address
=
1307 amdgpu_xgmi_get_relative_phy_addr(adev
,
1308 block_info
.address
);
1311 if (block_obj
->hw_ops
->ras_error_inject
) {
1312 if (info
->head
.block
== AMDGPU_RAS_BLOCK__GFX
)
1313 ret
= block_obj
->hw_ops
->ras_error_inject(adev
, info
, info
->instance_mask
);
1314 else /* Special ras_error_inject is defined (e.g: xgmi) */
1315 ret
= block_obj
->hw_ops
->ras_error_inject(adev
, &block_info
,
1316 info
->instance_mask
);
1319 ret
= psp_ras_trigger_error(&adev
->psp
, &block_info
, info
->instance_mask
);
1323 dev_err(adev
->dev
, "ras inject %s failed %d\n",
1324 get_ras_block_str(&info
->head
), ret
);
1330 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
1331 * @adev: pointer to AMD GPU device
1332 * @ce_count: pointer to an integer to be set to the count of correctible errors.
1333 * @ue_count: pointer to an integer to be set to the count of uncorrectible errors.
1334 * @query_info: pointer to ras_query_if
1336 * Return 0 for query success or do nothing, otherwise return an error
1339 static int amdgpu_ras_query_error_count_helper(struct amdgpu_device
*adev
,
1340 unsigned long *ce_count
,
1341 unsigned long *ue_count
,
1342 struct ras_query_if
*query_info
)
1347 /* do nothing if query_info is not specified */
1350 ret
= amdgpu_ras_query_error_status(adev
, query_info
);
1354 *ce_count
+= query_info
->ce_count
;
1355 *ue_count
+= query_info
->ue_count
;
1357 /* some hardware/IP supports read to clear
1358 * no need to explictly reset the err status after the query call */
1359 if (amdgpu_ip_version(adev
, MP0_HWIP
, 0) != IP_VERSION(11, 0, 2) &&
1360 amdgpu_ip_version(adev
, MP0_HWIP
, 0) != IP_VERSION(11, 0, 4)) {
1361 if (amdgpu_ras_reset_error_status(adev
, query_info
->head
.block
))
1363 "Failed to reset error counter and error status\n");
1370 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1371 * @adev: pointer to AMD GPU device
1372 * @ce_count: pointer to an integer to be set to the count of correctible errors.
1373 * @ue_count: pointer to an integer to be set to the count of uncorrectible
1375 * @query_info: pointer to ras_query_if if the query request is only for
1376 * specific ip block; if info is NULL, then the qurey request is for
1377 * all the ip blocks that support query ras error counters/status
1379 * If set, @ce_count or @ue_count, count and return the corresponding
1380 * error counts in those integer pointers. Return 0 if the device
1381 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
1383 int amdgpu_ras_query_error_count(struct amdgpu_device
*adev
,
1384 unsigned long *ce_count
,
1385 unsigned long *ue_count
,
1386 struct ras_query_if
*query_info
)
1388 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1389 struct ras_manager
*obj
;
1390 unsigned long ce
, ue
;
1393 if (!adev
->ras_enabled
|| !con
)
1396 /* Don't count since no reporting.
1398 if (!ce_count
&& !ue_count
)
1404 /* query all the ip blocks that support ras query interface */
1405 list_for_each_entry(obj
, &con
->head
, node
) {
1406 struct ras_query_if info
= {
1410 ret
= amdgpu_ras_query_error_count_helper(adev
, &ce
, &ue
, &info
);
1413 /* query specific ip block */
1414 ret
= amdgpu_ras_query_error_count_helper(adev
, &ce
, &ue
, query_info
);
1428 /* query/inject/cure end */
1433 static int amdgpu_ras_badpages_read(struct amdgpu_device
*adev
,
1434 struct ras_badpage
**bps
, unsigned int *count
);
1436 static char *amdgpu_ras_badpage_flags_str(unsigned int flags
)
1439 case AMDGPU_RAS_RETIRE_PAGE_RESERVED
:
1441 case AMDGPU_RAS_RETIRE_PAGE_PENDING
:
1443 case AMDGPU_RAS_RETIRE_PAGE_FAULT
:
1450 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
1452 * It allows user to read the bad pages of vram on the gpu through
1453 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1455 * It outputs multiple lines, and each line stands for one gpu page.
1457 * The format of one line is below,
1458 * gpu pfn : gpu page size : flags
1460 * gpu pfn and gpu page size are printed in hex format.
1461 * flags can be one of below character,
1463 * R: reserved, this gpu page is reserved and not able to use.
1465 * P: pending for reserve, this gpu page is marked as bad, will be reserved
1466 * in next window of page_reserve.
1468 * F: unable to reserve. this gpu page can't be reserved due to some reasons.
1472 * .. code-block:: bash
1474 * 0x00000001 : 0x00001000 : R
1475 * 0x00000002 : 0x00001000 : P
1479 static ssize_t
amdgpu_ras_sysfs_badpages_read(struct file
*f
,
1480 struct kobject
*kobj
, struct bin_attribute
*attr
,
1481 char *buf
, loff_t ppos
, size_t count
)
1483 struct amdgpu_ras
*con
=
1484 container_of(attr
, struct amdgpu_ras
, badpages_attr
);
1485 struct amdgpu_device
*adev
= con
->adev
;
1486 const unsigned int element_size
=
1487 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1488 unsigned int start
= div64_ul(ppos
+ element_size
- 1, element_size
);
1489 unsigned int end
= div64_ul(ppos
+ count
- 1, element_size
);
1491 struct ras_badpage
*bps
= NULL
;
1492 unsigned int bps_count
= 0;
1494 memset(buf
, 0, count
);
1496 if (amdgpu_ras_badpages_read(adev
, &bps
, &bps_count
))
1499 for (; start
< end
&& start
< bps_count
; start
++)
1500 s
+= scnprintf(&buf
[s
], element_size
+ 1,
1501 "0x%08x : 0x%08x : %1s\n",
1504 amdgpu_ras_badpage_flags_str(bps
[start
].flags
));
1511 static ssize_t
amdgpu_ras_sysfs_features_read(struct device
*dev
,
1512 struct device_attribute
*attr
, char *buf
)
1514 struct amdgpu_ras
*con
=
1515 container_of(attr
, struct amdgpu_ras
, features_attr
);
1517 return sysfs_emit(buf
, "feature mask: 0x%x\n", con
->features
);
1520 static ssize_t
amdgpu_ras_sysfs_version_show(struct device
*dev
,
1521 struct device_attribute
*attr
, char *buf
)
1523 struct amdgpu_ras
*con
=
1524 container_of(attr
, struct amdgpu_ras
, version_attr
);
1525 return sysfs_emit(buf
, "table version: 0x%x\n", con
->eeprom_control
.tbl_hdr
.version
);
1528 static ssize_t
amdgpu_ras_sysfs_schema_show(struct device
*dev
,
1529 struct device_attribute
*attr
, char *buf
)
1531 struct amdgpu_ras
*con
=
1532 container_of(attr
, struct amdgpu_ras
, schema_attr
);
1533 return sysfs_emit(buf
, "schema: 0x%x\n", con
->schema
);
1536 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device
*adev
)
1538 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1540 sysfs_remove_file_from_group(&adev
->dev
->kobj
,
1541 &con
->badpages_attr
.attr
,
1545 static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device
*adev
)
1547 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1548 struct attribute
*attrs
[] = {
1549 &con
->features_attr
.attr
,
1550 &con
->version_attr
.attr
,
1551 &con
->schema_attr
.attr
,
1554 struct attribute_group group
= {
1555 .name
= RAS_FS_NAME
,
1559 sysfs_remove_group(&adev
->dev
->kobj
, &group
);
1564 int amdgpu_ras_sysfs_create(struct amdgpu_device
*adev
,
1565 struct ras_common_if
*head
)
1567 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, head
);
1569 if (!obj
|| obj
->attr_inuse
)
1574 snprintf(obj
->fs_data
.sysfs_name
, sizeof(obj
->fs_data
.sysfs_name
),
1575 "%s_err_count", head
->name
);
1577 obj
->sysfs_attr
= (struct device_attribute
){
1579 .name
= obj
->fs_data
.sysfs_name
,
1582 .show
= amdgpu_ras_sysfs_read
,
1584 sysfs_attr_init(&obj
->sysfs_attr
.attr
);
1586 if (sysfs_add_file_to_group(&adev
->dev
->kobj
,
1587 &obj
->sysfs_attr
.attr
,
1593 obj
->attr_inuse
= 1;
1598 int amdgpu_ras_sysfs_remove(struct amdgpu_device
*adev
,
1599 struct ras_common_if
*head
)
1601 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, head
);
1603 if (!obj
|| !obj
->attr_inuse
)
1606 sysfs_remove_file_from_group(&adev
->dev
->kobj
,
1607 &obj
->sysfs_attr
.attr
,
1609 obj
->attr_inuse
= 0;
1615 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device
*adev
)
1617 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1618 struct ras_manager
*obj
, *tmp
;
1620 list_for_each_entry_safe(obj
, tmp
, &con
->head
, node
) {
1621 amdgpu_ras_sysfs_remove(adev
, &obj
->head
);
1624 if (amdgpu_bad_page_threshold
!= 0)
1625 amdgpu_ras_sysfs_remove_bad_page_node(adev
);
1627 amdgpu_ras_sysfs_remove_dev_attr_node(adev
);
1634 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
1636 * Normally when there is an uncorrectable error, the driver will reset
1637 * the GPU to recover. However, in the event of an unrecoverable error,
1638 * the driver provides an interface to reboot the system automatically
1641 * The following file in debugfs provides that interface:
1642 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1646 * .. code-block:: bash
1648 * echo true > .../ras/auto_reboot
1652 static struct dentry
*amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device
*adev
)
1654 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1655 struct amdgpu_ras_eeprom_control
*eeprom
= &con
->eeprom_control
;
1656 struct drm_minor
*minor
= adev_to_drm(adev
)->primary
;
1659 dir
= debugfs_create_dir(RAS_FS_NAME
, minor
->debugfs_root
);
1660 debugfs_create_file("ras_ctrl", S_IWUGO
| S_IRUGO
, dir
, adev
,
1661 &amdgpu_ras_debugfs_ctrl_ops
);
1662 debugfs_create_file("ras_eeprom_reset", S_IWUGO
| S_IRUGO
, dir
, adev
,
1663 &amdgpu_ras_debugfs_eeprom_ops
);
1664 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir
,
1665 &con
->bad_page_cnt_threshold
);
1666 debugfs_create_u32("ras_num_recs", 0444, dir
, &eeprom
->ras_num_recs
);
1667 debugfs_create_x32("ras_hw_enabled", 0444, dir
, &adev
->ras_hw_enabled
);
1668 debugfs_create_x32("ras_enabled", 0444, dir
, &adev
->ras_enabled
);
1669 debugfs_create_file("ras_eeprom_size", S_IRUGO
, dir
, adev
,
1670 &amdgpu_ras_debugfs_eeprom_size_ops
);
1671 con
->de_ras_eeprom_table
= debugfs_create_file("ras_eeprom_table",
1673 &amdgpu_ras_debugfs_eeprom_table_ops
);
1674 amdgpu_ras_debugfs_set_ret_size(&con
->eeprom_control
);
1677 * After one uncorrectable error happens, usually GPU recovery will
1678 * be scheduled. But due to the known problem in GPU recovery failing
1679 * to bring GPU back, below interface provides one direct way to
1680 * user to reboot system automatically in such case within
1681 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
1682 * will never be called.
1684 debugfs_create_bool("auto_reboot", S_IWUGO
| S_IRUGO
, dir
, &con
->reboot
);
1687 * User could set this not to clean up hardware's error count register
1688 * of RAS IPs during ras recovery.
1690 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir
,
1691 &con
->disable_ras_err_cnt_harvest
);
1695 static void amdgpu_ras_debugfs_create(struct amdgpu_device
*adev
,
1696 struct ras_fs_if
*head
,
1699 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, &head
->head
);
1706 memcpy(obj
->fs_data
.debugfs_name
,
1708 sizeof(obj
->fs_data
.debugfs_name
));
1710 debugfs_create_file(obj
->fs_data
.debugfs_name
, S_IWUGO
| S_IRUGO
, dir
,
1711 obj
, &amdgpu_ras_debugfs_ops
);
1714 void amdgpu_ras_debugfs_create_all(struct amdgpu_device
*adev
)
1716 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1718 struct ras_manager
*obj
;
1719 struct ras_fs_if fs_info
;
1722 * it won't be called in resume path, no need to check
1723 * suspend and gpu reset status
1725 if (!IS_ENABLED(CONFIG_DEBUG_FS
) || !con
)
1728 dir
= amdgpu_ras_debugfs_create_ctrl_node(adev
);
1730 list_for_each_entry(obj
, &con
->head
, node
) {
1731 if (amdgpu_ras_is_supported(adev
, obj
->head
.block
) &&
1732 (obj
->attr_inuse
== 1)) {
1733 sprintf(fs_info
.debugfs_name
, "%s_err_inject",
1734 get_ras_block_str(&obj
->head
));
1735 fs_info
.head
= obj
->head
;
1736 amdgpu_ras_debugfs_create(adev
, &fs_info
, dir
);
1740 amdgpu_mca_smu_debugfs_init(adev
, dir
);
1746 static BIN_ATTR(gpu_vram_bad_pages
, S_IRUGO
,
1747 amdgpu_ras_sysfs_badpages_read
, NULL
, 0);
1748 static DEVICE_ATTR(features
, S_IRUGO
,
1749 amdgpu_ras_sysfs_features_read
, NULL
);
1750 static DEVICE_ATTR(version
, 0444,
1751 amdgpu_ras_sysfs_version_show
, NULL
);
1752 static DEVICE_ATTR(schema
, 0444,
1753 amdgpu_ras_sysfs_schema_show
, NULL
);
1754 static int amdgpu_ras_fs_init(struct amdgpu_device
*adev
)
1756 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1757 struct attribute_group group
= {
1758 .name
= RAS_FS_NAME
,
1760 struct attribute
*attrs
[] = {
1761 &con
->features_attr
.attr
,
1762 &con
->version_attr
.attr
,
1763 &con
->schema_attr
.attr
,
1766 struct bin_attribute
*bin_attrs
[] = {
1772 group
.attrs
= attrs
;
1774 /* add features entry */
1775 con
->features_attr
= dev_attr_features
;
1776 sysfs_attr_init(attrs
[0]);
1778 /* add version entry */
1779 con
->version_attr
= dev_attr_version
;
1780 sysfs_attr_init(attrs
[1]);
1782 /* add schema entry */
1783 con
->schema_attr
= dev_attr_schema
;
1784 sysfs_attr_init(attrs
[2]);
1786 if (amdgpu_bad_page_threshold
!= 0) {
1787 /* add bad_page_features entry */
1788 bin_attr_gpu_vram_bad_pages
.private = NULL
;
1789 con
->badpages_attr
= bin_attr_gpu_vram_bad_pages
;
1790 bin_attrs
[0] = &con
->badpages_attr
;
1791 group
.bin_attrs
= bin_attrs
;
1792 sysfs_bin_attr_init(bin_attrs
[0]);
1795 r
= sysfs_create_group(&adev
->dev
->kobj
, &group
);
1797 dev_err(adev
->dev
, "Failed to create RAS sysfs group!");
1802 static int amdgpu_ras_fs_fini(struct amdgpu_device
*adev
)
1804 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
1805 struct ras_manager
*con_obj
, *ip_obj
, *tmp
;
1807 if (IS_ENABLED(CONFIG_DEBUG_FS
)) {
1808 list_for_each_entry_safe(con_obj
, tmp
, &con
->head
, node
) {
1809 ip_obj
= amdgpu_ras_find_obj(adev
, &con_obj
->head
);
1815 amdgpu_ras_sysfs_remove_all(adev
);
1822 /* For the hardware that cannot enable bif ring for both ras_controller_irq
1823 * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
1824 * register to check whether the interrupt is triggered or not, and properly
1825 * ack the interrupt if it is there
1827 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device
*adev
)
1829 /* Fatal error events are handled on host side */
1830 if (amdgpu_sriov_vf(adev
))
1833 if (adev
->nbio
.ras
&&
1834 adev
->nbio
.ras
->handle_ras_controller_intr_no_bifring
)
1835 adev
->nbio
.ras
->handle_ras_controller_intr_no_bifring(adev
);
1837 if (adev
->nbio
.ras
&&
1838 adev
->nbio
.ras
->handle_ras_err_event_athub_intr_no_bifring
)
1839 adev
->nbio
.ras
->handle_ras_err_event_athub_intr_no_bifring(adev
);
1842 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager
*obj
,
1843 struct amdgpu_iv_entry
*entry
)
1845 bool poison_stat
= false;
1846 struct amdgpu_device
*adev
= obj
->adev
;
1847 struct amdgpu_ras_block_object
*block_obj
=
1848 amdgpu_ras_get_ras_block(adev
, obj
->head
.block
, 0);
1853 /* both query_poison_status and handle_poison_consumption are optional,
1854 * but at least one of them should be implemented if we need poison
1855 * consumption handler
1857 if (block_obj
->hw_ops
&& block_obj
->hw_ops
->query_poison_status
) {
1858 poison_stat
= block_obj
->hw_ops
->query_poison_status(adev
);
1860 /* Not poison consumption interrupt, no need to handle it */
1861 dev_info(adev
->dev
, "No RAS poison status in %s poison IH.\n",
1862 block_obj
->ras_comm
.name
);
1868 amdgpu_umc_poison_handler(adev
, false);
1870 if (block_obj
->hw_ops
&& block_obj
->hw_ops
->handle_poison_consumption
)
1871 poison_stat
= block_obj
->hw_ops
->handle_poison_consumption(adev
);
1873 /* gpu reset is fallback for failed and default cases */
1875 dev_info(adev
->dev
, "GPU reset for %s RAS poison consumption is issued!\n",
1876 block_obj
->ras_comm
.name
);
1877 amdgpu_ras_reset_gpu(adev
);
1879 amdgpu_gfx_poison_consumption_handler(adev
, entry
);
1883 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager
*obj
,
1884 struct amdgpu_iv_entry
*entry
)
1886 dev_info(obj
->adev
->dev
,
1887 "Poison is created, no user action is needed.\n");
1890 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager
*obj
,
1891 struct amdgpu_iv_entry
*entry
)
1893 struct ras_ih_data
*data
= &obj
->ih_data
;
1894 struct ras_err_data err_data
;
1900 ret
= amdgpu_ras_error_data_init(&err_data
);
1904 /* Let IP handle its data, maybe we need get the output
1905 * from the callback to update the error type/count, etc
1907 ret
= data
->cb(obj
->adev
, &err_data
, entry
);
1908 /* ue will trigger an interrupt, and in that case
1909 * we need do a reset to recovery the whole system.
1910 * But leave IP do that recovery, here we just dispatch
1913 if (ret
== AMDGPU_RAS_SUCCESS
) {
1914 /* these counts could be left as 0 if
1915 * some blocks do not count error number
1917 obj
->err_data
.ue_count
+= err_data
.ue_count
;
1918 obj
->err_data
.ce_count
+= err_data
.ce_count
;
1921 amdgpu_ras_error_data_fini(&err_data
);
1924 static void amdgpu_ras_interrupt_handler(struct ras_manager
*obj
)
1926 struct ras_ih_data
*data
= &obj
->ih_data
;
1927 struct amdgpu_iv_entry entry
;
1929 while (data
->rptr
!= data
->wptr
) {
1931 memcpy(&entry
, &data
->ring
[data
->rptr
],
1932 data
->element_size
);
1935 data
->rptr
= (data
->aligned_element_size
+
1936 data
->rptr
) % data
->ring_size
;
1938 if (amdgpu_ras_is_poison_mode_supported(obj
->adev
)) {
1939 if (obj
->head
.block
== AMDGPU_RAS_BLOCK__UMC
)
1940 amdgpu_ras_interrupt_poison_creation_handler(obj
, &entry
);
1942 amdgpu_ras_interrupt_poison_consumption_handler(obj
, &entry
);
1944 if (obj
->head
.block
== AMDGPU_RAS_BLOCK__UMC
)
1945 amdgpu_ras_interrupt_umc_handler(obj
, &entry
);
1947 dev_warn(obj
->adev
->dev
,
1948 "No RAS interrupt handler for non-UMC block with poison disabled.\n");
1953 static void amdgpu_ras_interrupt_process_handler(struct work_struct
*work
)
1955 struct ras_ih_data
*data
=
1956 container_of(work
, struct ras_ih_data
, ih_work
);
1957 struct ras_manager
*obj
=
1958 container_of(data
, struct ras_manager
, ih_data
);
1960 amdgpu_ras_interrupt_handler(obj
);
1963 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device
*adev
,
1964 struct ras_dispatch_if
*info
)
1966 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, &info
->head
);
1967 struct ras_ih_data
*data
= &obj
->ih_data
;
1972 if (data
->inuse
== 0)
1975 /* Might be overflow... */
1976 memcpy(&data
->ring
[data
->wptr
], info
->entry
,
1977 data
->element_size
);
1980 data
->wptr
= (data
->aligned_element_size
+
1981 data
->wptr
) % data
->ring_size
;
1983 schedule_work(&data
->ih_work
);
1988 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device
*adev
,
1989 struct ras_common_if
*head
)
1991 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, head
);
1992 struct ras_ih_data
*data
;
1997 data
= &obj
->ih_data
;
1998 if (data
->inuse
== 0)
2001 cancel_work_sync(&data
->ih_work
);
2004 memset(data
, 0, sizeof(*data
));
2010 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device
*adev
,
2011 struct ras_common_if
*head
)
2013 struct ras_manager
*obj
= amdgpu_ras_find_obj(adev
, head
);
2014 struct ras_ih_data
*data
;
2015 struct amdgpu_ras_block_object
*ras_obj
;
2018 /* in case we registe the IH before enable ras feature */
2019 obj
= amdgpu_ras_create_obj(adev
, head
);
2025 ras_obj
= container_of(head
, struct amdgpu_ras_block_object
, ras_comm
);
2027 data
= &obj
->ih_data
;
2028 /* add the callback.etc */
2029 *data
= (struct ras_ih_data
) {
2031 .cb
= ras_obj
->ras_cb
,
2032 .element_size
= sizeof(struct amdgpu_iv_entry
),
2037 INIT_WORK(&data
->ih_work
, amdgpu_ras_interrupt_process_handler
);
2039 data
->aligned_element_size
= ALIGN(data
->element_size
, 8);
2040 /* the ring can store 64 iv entries. */
2041 data
->ring_size
= 64 * data
->aligned_element_size
;
2042 data
->ring
= kmalloc(data
->ring_size
, GFP_KERNEL
);
2054 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device
*adev
)
2056 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2057 struct ras_manager
*obj
, *tmp
;
2059 list_for_each_entry_safe(obj
, tmp
, &con
->head
, node
) {
2060 amdgpu_ras_interrupt_remove_handler(adev
, &obj
->head
);
2067 /* traversal all IPs except NBIO to query error counter */
2068 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device
*adev
)
2070 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2071 struct ras_manager
*obj
;
2073 if (!adev
->ras_enabled
|| !con
)
2076 list_for_each_entry(obj
, &con
->head
, node
) {
2077 struct ras_query_if info
= {
2082 * PCIE_BIF IP has one different isr by ras controller
2083 * interrupt, the specific ras counter query will be
2084 * done in that isr. So skip such block from common
2085 * sync flood interrupt isr calling.
2087 if (info
.head
.block
== AMDGPU_RAS_BLOCK__PCIE_BIF
)
2091 * this is a workaround for aldebaran, skip send msg to
2092 * smu to get ecc_info table due to smu handle get ecc
2093 * info table failed temporarily.
2094 * should be removed until smu fix handle ecc_info table.
2096 if ((info
.head
.block
== AMDGPU_RAS_BLOCK__UMC
) &&
2097 (amdgpu_ip_version(adev
, MP1_HWIP
, 0) ==
2098 IP_VERSION(13, 0, 2)))
2101 amdgpu_ras_query_error_status(adev
, &info
);
2103 if (amdgpu_ip_version(adev
, MP0_HWIP
, 0) !=
2104 IP_VERSION(11, 0, 2) &&
2105 amdgpu_ip_version(adev
, MP0_HWIP
, 0) !=
2106 IP_VERSION(11, 0, 4) &&
2107 amdgpu_ip_version(adev
, MP0_HWIP
, 0) !=
2108 IP_VERSION(13, 0, 0)) {
2109 if (amdgpu_ras_reset_error_status(adev
, info
.head
.block
))
2110 dev_warn(adev
->dev
, "Failed to reset error counter and error status");
2115 /* Parse RdRspStatus and WrRspStatus */
2116 static void amdgpu_ras_error_status_query(struct amdgpu_device
*adev
,
2117 struct ras_query_if
*info
)
2119 struct amdgpu_ras_block_object
*block_obj
;
2121 * Only two block need to query read/write
2122 * RspStatus at current state
2124 if ((info
->head
.block
!= AMDGPU_RAS_BLOCK__GFX
) &&
2125 (info
->head
.block
!= AMDGPU_RAS_BLOCK__MMHUB
))
2128 block_obj
= amdgpu_ras_get_ras_block(adev
,
2130 info
->head
.sub_block_index
);
2132 if (!block_obj
|| !block_obj
->hw_ops
) {
2133 dev_dbg_once(adev
->dev
, "%s doesn't config RAS function\n",
2134 get_ras_block_str(&info
->head
));
2138 if (block_obj
->hw_ops
->query_ras_error_status
)
2139 block_obj
->hw_ops
->query_ras_error_status(adev
);
2143 static void amdgpu_ras_query_err_status(struct amdgpu_device
*adev
)
2145 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2146 struct ras_manager
*obj
;
2148 if (!adev
->ras_enabled
|| !con
)
2151 list_for_each_entry(obj
, &con
->head
, node
) {
2152 struct ras_query_if info
= {
2156 amdgpu_ras_error_status_query(adev
, &info
);
2160 /* recovery begin */
2162 /* return 0 on success.
2163 * caller need free bps.
2165 static int amdgpu_ras_badpages_read(struct amdgpu_device
*adev
,
2166 struct ras_badpage
**bps
, unsigned int *count
)
2168 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2169 struct ras_err_handler_data
*data
;
2171 int ret
= 0, status
;
2173 if (!con
|| !con
->eh_data
|| !bps
|| !count
)
2176 mutex_lock(&con
->recovery_lock
);
2177 data
= con
->eh_data
;
2178 if (!data
|| data
->count
== 0) {
2184 *bps
= kmalloc(sizeof(struct ras_badpage
) * data
->count
, GFP_KERNEL
);
2190 for (; i
< data
->count
; i
++) {
2191 (*bps
)[i
] = (struct ras_badpage
){
2192 .bp
= data
->bps
[i
].retired_page
,
2193 .size
= AMDGPU_GPU_PAGE_SIZE
,
2194 .flags
= AMDGPU_RAS_RETIRE_PAGE_RESERVED
,
2196 status
= amdgpu_vram_mgr_query_page_status(&adev
->mman
.vram_mgr
,
2197 data
->bps
[i
].retired_page
);
2198 if (status
== -EBUSY
)
2199 (*bps
)[i
].flags
= AMDGPU_RAS_RETIRE_PAGE_PENDING
;
2200 else if (status
== -ENOENT
)
2201 (*bps
)[i
].flags
= AMDGPU_RAS_RETIRE_PAGE_FAULT
;
2204 *count
= data
->count
;
2206 mutex_unlock(&con
->recovery_lock
);
2210 static void amdgpu_ras_do_recovery(struct work_struct
*work
)
2212 struct amdgpu_ras
*ras
=
2213 container_of(work
, struct amdgpu_ras
, recovery_work
);
2214 struct amdgpu_device
*remote_adev
= NULL
;
2215 struct amdgpu_device
*adev
= ras
->adev
;
2216 struct list_head device_list
, *device_list_handle
= NULL
;
2217 struct amdgpu_hive_info
*hive
= amdgpu_get_xgmi_hive(adev
);
2220 atomic_set(&hive
->ras_recovery
, 1);
2221 if (!ras
->disable_ras_err_cnt_harvest
) {
2223 /* Build list of devices to query RAS related errors */
2224 if (hive
&& adev
->gmc
.xgmi
.num_physical_nodes
> 1) {
2225 device_list_handle
= &hive
->device_list
;
2227 INIT_LIST_HEAD(&device_list
);
2228 list_add_tail(&adev
->gmc
.xgmi
.head
, &device_list
);
2229 device_list_handle
= &device_list
;
2232 list_for_each_entry(remote_adev
,
2233 device_list_handle
, gmc
.xgmi
.head
) {
2234 amdgpu_ras_query_err_status(remote_adev
);
2235 amdgpu_ras_log_on_err_counter(remote_adev
);
2240 if (amdgpu_device_should_recover_gpu(ras
->adev
)) {
2241 struct amdgpu_reset_context reset_context
;
2242 memset(&reset_context
, 0, sizeof(reset_context
));
2244 reset_context
.method
= AMD_RESET_METHOD_NONE
;
2245 reset_context
.reset_req_dev
= adev
;
2247 /* Perform full reset in fatal error mode */
2248 if (!amdgpu_ras_is_poison_mode_supported(ras
->adev
))
2249 set_bit(AMDGPU_NEED_FULL_RESET
, &reset_context
.flags
);
2251 clear_bit(AMDGPU_NEED_FULL_RESET
, &reset_context
.flags
);
2253 if (ras
->gpu_reset_flags
& AMDGPU_RAS_GPU_RESET_MODE2_RESET
) {
2254 ras
->gpu_reset_flags
&= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET
;
2255 reset_context
.method
= AMD_RESET_METHOD_MODE2
;
2258 /* Fatal error occurs in poison mode, mode1 reset is used to
2261 if (ras
->gpu_reset_flags
& AMDGPU_RAS_GPU_RESET_MODE1_RESET
) {
2262 ras
->gpu_reset_flags
&= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET
;
2263 set_bit(AMDGPU_NEED_FULL_RESET
, &reset_context
.flags
);
2265 psp_fatal_error_recovery_quirk(&adev
->psp
);
2269 amdgpu_device_gpu_recover(ras
->adev
, NULL
, &reset_context
);
2271 atomic_set(&ras
->in_recovery
, 0);
2273 atomic_set(&hive
->ras_recovery
, 0);
2274 amdgpu_put_xgmi_hive(hive
);
2278 /* alloc/realloc bps array */
2279 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device
*adev
,
2280 struct ras_err_handler_data
*data
, int pages
)
2282 unsigned int old_space
= data
->count
+ data
->space_left
;
2283 unsigned int new_space
= old_space
+ pages
;
2284 unsigned int align_space
= ALIGN(new_space
, 512);
2285 void *bps
= kmalloc(align_space
* sizeof(*data
->bps
), GFP_KERNEL
);
2292 memcpy(bps
, data
->bps
,
2293 data
->count
* sizeof(*data
->bps
));
2298 data
->space_left
+= align_space
- old_space
;
2302 /* it deal with vram only. */
2303 int amdgpu_ras_add_bad_pages(struct amdgpu_device
*adev
,
2304 struct eeprom_table_record
*bps
, int pages
)
2306 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2307 struct ras_err_handler_data
*data
;
2311 if (!con
|| !con
->eh_data
|| !bps
|| pages
<= 0)
2314 mutex_lock(&con
->recovery_lock
);
2315 data
= con
->eh_data
;
2319 for (i
= 0; i
< pages
; i
++) {
2320 if (amdgpu_ras_check_bad_page_unlock(con
,
2321 bps
[i
].retired_page
<< AMDGPU_GPU_PAGE_SHIFT
))
2324 if (!data
->space_left
&&
2325 amdgpu_ras_realloc_eh_data_space(adev
, data
, 256)) {
2330 amdgpu_vram_mgr_reserve_range(&adev
->mman
.vram_mgr
,
2331 bps
[i
].retired_page
<< AMDGPU_GPU_PAGE_SHIFT
,
2332 AMDGPU_GPU_PAGE_SIZE
);
2334 memcpy(&data
->bps
[data
->count
], &bps
[i
], sizeof(*data
->bps
));
2339 mutex_unlock(&con
->recovery_lock
);
2345 * write error record array to eeprom, the function should be
2346 * protected by recovery_lock
2347 * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
2349 int amdgpu_ras_save_bad_pages(struct amdgpu_device
*adev
,
2350 unsigned long *new_cnt
)
2352 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2353 struct ras_err_handler_data
*data
;
2354 struct amdgpu_ras_eeprom_control
*control
;
2357 if (!con
|| !con
->eh_data
) {
2364 mutex_lock(&con
->recovery_lock
);
2365 control
= &con
->eeprom_control
;
2366 data
= con
->eh_data
;
2367 save_count
= data
->count
- control
->ras_num_recs
;
2368 mutex_unlock(&con
->recovery_lock
);
2371 *new_cnt
= save_count
/ adev
->umc
.retire_unit
;
2373 /* only new entries are saved */
2374 if (save_count
> 0) {
2375 if (amdgpu_ras_eeprom_append(control
,
2376 &data
->bps
[control
->ras_num_recs
],
2378 dev_err(adev
->dev
, "Failed to save EEPROM table data!");
2382 dev_info(adev
->dev
, "Saved %d pages to EEPROM table.\n", save_count
);
2389 * read error record array in eeprom and reserve enough space for
2390 * storing new bad pages
2392 static int amdgpu_ras_load_bad_pages(struct amdgpu_device
*adev
)
2394 struct amdgpu_ras_eeprom_control
*control
=
2395 &adev
->psp
.ras_context
.ras
->eeprom_control
;
2396 struct eeprom_table_record
*bps
;
2399 /* no bad page record, skip eeprom access */
2400 if (control
->ras_num_recs
== 0 || amdgpu_bad_page_threshold
== 0)
2403 bps
= kcalloc(control
->ras_num_recs
, sizeof(*bps
), GFP_KERNEL
);
2407 ret
= amdgpu_ras_eeprom_read(control
, bps
, control
->ras_num_recs
);
2409 dev_err(adev
->dev
, "Failed to load EEPROM table records!");
2411 ret
= amdgpu_ras_add_bad_pages(adev
, bps
, control
->ras_num_recs
);
2417 static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras
*con
,
2420 struct ras_err_handler_data
*data
= con
->eh_data
;
2423 addr
>>= AMDGPU_GPU_PAGE_SHIFT
;
2424 for (i
= 0; i
< data
->count
; i
++)
2425 if (addr
== data
->bps
[i
].retired_page
)
2432 * check if an address belongs to bad page
2434 * Note: this check is only for umc block
2436 static bool amdgpu_ras_check_bad_page(struct amdgpu_device
*adev
,
2439 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2442 if (!con
|| !con
->eh_data
)
2445 mutex_lock(&con
->recovery_lock
);
2446 ret
= amdgpu_ras_check_bad_page_unlock(con
, addr
);
2447 mutex_unlock(&con
->recovery_lock
);
2451 static void amdgpu_ras_validate_threshold(struct amdgpu_device
*adev
,
2454 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2457 * Justification of value bad_page_cnt_threshold in ras structure
2459 * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
2460 * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
2461 * scenarios accordingly.
2463 * Bad page retirement enablement:
2464 * - If amdgpu_bad_page_threshold = -2,
2465 * bad_page_cnt_threshold = typical value by formula.
2467 * - When the value from user is 0 < amdgpu_bad_page_threshold <
2468 * max record length in eeprom, use it directly.
2470 * Bad page retirement disablement:
2471 * - If amdgpu_bad_page_threshold = 0, bad page retirement
2472 * functionality is disabled, and bad_page_cnt_threshold will
2476 if (amdgpu_bad_page_threshold
< 0) {
2477 u64 val
= adev
->gmc
.mc_vram_size
;
2479 do_div(val
, RAS_BAD_PAGE_COVER
);
2480 con
->bad_page_cnt_threshold
= min(lower_32_bits(val
),
2483 con
->bad_page_cnt_threshold
= min_t(int, max_count
,
2484 amdgpu_bad_page_threshold
);
2488 int amdgpu_ras_recovery_init(struct amdgpu_device
*adev
)
2490 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2491 struct ras_err_handler_data
**data
;
2492 u32 max_eeprom_records_count
= 0;
2493 bool exc_err_limit
= false;
2496 if (!con
|| amdgpu_sriov_vf(adev
))
2499 /* Allow access to RAS EEPROM via debugfs, when the ASIC
2500 * supports RAS and debugfs is enabled, but when
2501 * adev->ras_enabled is unset, i.e. when "ras_enable"
2502 * module parameter is set to 0.
2506 if (!adev
->ras_enabled
)
2509 data
= &con
->eh_data
;
2510 *data
= kmalloc(sizeof(**data
), GFP_KERNEL
| __GFP_ZERO
);
2516 mutex_init(&con
->recovery_lock
);
2517 INIT_WORK(&con
->recovery_work
, amdgpu_ras_do_recovery
);
2518 atomic_set(&con
->in_recovery
, 0);
2519 con
->eeprom_control
.bad_channel_bitmap
= 0;
2521 max_eeprom_records_count
= amdgpu_ras_eeprom_max_record_count(&con
->eeprom_control
);
2522 amdgpu_ras_validate_threshold(adev
, max_eeprom_records_count
);
2524 /* Todo: During test the SMU might fail to read the eeprom through I2C
2525 * when the GPU is pending on XGMI reset during probe time
2526 * (Mostly after second bus reset), skip it now
2528 if (adev
->gmc
.xgmi
.pending_reset
)
2530 ret
= amdgpu_ras_eeprom_init(&con
->eeprom_control
, &exc_err_limit
);
2532 * This calling fails when exc_err_limit is true or
2535 if (exc_err_limit
|| ret
)
2538 if (con
->eeprom_control
.ras_num_recs
) {
2539 ret
= amdgpu_ras_load_bad_pages(adev
);
2543 amdgpu_dpm_send_hbm_bad_pages_num(adev
, con
->eeprom_control
.ras_num_recs
);
2545 if (con
->update_channel_flag
== true) {
2546 amdgpu_dpm_send_hbm_bad_channel_flag(adev
, con
->eeprom_control
.bad_channel_bitmap
);
2547 con
->update_channel_flag
= false;
2551 #ifdef CONFIG_X86_MCE_AMD
2552 if ((adev
->asic_type
== CHIP_ALDEBARAN
) &&
2553 (adev
->gmc
.xgmi
.connected_to_cpu
))
2554 amdgpu_register_bad_pages_mca_notifier(adev
);
2559 kfree((*data
)->bps
);
2561 con
->eh_data
= NULL
;
2563 dev_warn(adev
->dev
, "Failed to initialize ras recovery! (%d)\n", ret
);
2566 * Except error threshold exceeding case, other failure cases in this
2567 * function would not fail amdgpu driver init.
2577 static int amdgpu_ras_recovery_fini(struct amdgpu_device
*adev
)
2579 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2580 struct ras_err_handler_data
*data
= con
->eh_data
;
2582 /* recovery_init failed to init it, fini is useless */
2586 cancel_work_sync(&con
->recovery_work
);
2588 mutex_lock(&con
->recovery_lock
);
2589 con
->eh_data
= NULL
;
2592 mutex_unlock(&con
->recovery_lock
);
2598 static bool amdgpu_ras_asic_supported(struct amdgpu_device
*adev
)
2600 if (amdgpu_sriov_vf(adev
)) {
2601 switch (amdgpu_ip_version(adev
, MP0_HWIP
, 0)) {
2602 case IP_VERSION(13, 0, 2):
2603 case IP_VERSION(13, 0, 6):
2610 if (adev
->asic_type
== CHIP_IP_DISCOVERY
) {
2611 switch (amdgpu_ip_version(adev
, MP0_HWIP
, 0)) {
2612 case IP_VERSION(13, 0, 0):
2613 case IP_VERSION(13, 0, 6):
2614 case IP_VERSION(13, 0, 10):
2621 return adev
->asic_type
== CHIP_VEGA10
||
2622 adev
->asic_type
== CHIP_VEGA20
||
2623 adev
->asic_type
== CHIP_ARCTURUS
||
2624 adev
->asic_type
== CHIP_ALDEBARAN
||
2625 adev
->asic_type
== CHIP_SIENNA_CICHLID
;
2629 * this is workaround for vega20 workstation sku,
2630 * force enable gfx ras, ignore vbios gfx ras flag
2631 * due to GC EDC can not write
2633 static void amdgpu_ras_get_quirks(struct amdgpu_device
*adev
)
2635 struct atom_context
*ctx
= adev
->mode_info
.atom_context
;
2640 if (strnstr(ctx
->vbios_pn
, "D16406",
2641 sizeof(ctx
->vbios_pn
)) ||
2642 strnstr(ctx
->vbios_pn
, "D36002",
2643 sizeof(ctx
->vbios_pn
)))
2644 adev
->ras_hw_enabled
|= (1 << AMDGPU_RAS_BLOCK__GFX
);
2648 * check hardware's ras ability which will be saved in hw_supported.
2649 * if hardware does not support ras, we can skip some ras initializtion and
2650 * forbid some ras operations from IP.
2651 * if software itself, say boot parameter, limit the ras ability. We still
2652 * need allow IP do some limited operations, like disable. In such case,
2653 * we have to initialize ras as normal. but need check if operation is
2654 * allowed or not in each function.
2656 static void amdgpu_ras_check_supported(struct amdgpu_device
*adev
)
2658 adev
->ras_hw_enabled
= adev
->ras_enabled
= 0;
2660 if (!amdgpu_ras_asic_supported(adev
))
2663 if (!adev
->gmc
.xgmi
.connected_to_cpu
&& !adev
->gmc
.is_app_apu
) {
2664 if (amdgpu_atomfirmware_mem_ecc_supported(adev
)) {
2665 dev_info(adev
->dev
, "MEM ECC is active.\n");
2666 adev
->ras_hw_enabled
|= (1 << AMDGPU_RAS_BLOCK__UMC
|
2667 1 << AMDGPU_RAS_BLOCK__DF
);
2669 dev_info(adev
->dev
, "MEM ECC is not presented.\n");
2672 if (amdgpu_atomfirmware_sram_ecc_supported(adev
)) {
2673 dev_info(adev
->dev
, "SRAM ECC is active.\n");
2674 if (!amdgpu_sriov_vf(adev
))
2675 adev
->ras_hw_enabled
|= ~(1 << AMDGPU_RAS_BLOCK__UMC
|
2676 1 << AMDGPU_RAS_BLOCK__DF
);
2678 adev
->ras_hw_enabled
|= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF
|
2679 1 << AMDGPU_RAS_BLOCK__SDMA
|
2680 1 << AMDGPU_RAS_BLOCK__GFX
);
2682 /* VCN/JPEG RAS can be supported on both bare metal and
2685 if (amdgpu_ip_version(adev
, VCN_HWIP
, 0) ==
2686 IP_VERSION(2, 6, 0) ||
2687 amdgpu_ip_version(adev
, VCN_HWIP
, 0) ==
2688 IP_VERSION(4, 0, 0) ||
2689 amdgpu_ip_version(adev
, VCN_HWIP
, 0) ==
2690 IP_VERSION(4, 0, 3))
2691 adev
->ras_hw_enabled
|= (1 << AMDGPU_RAS_BLOCK__VCN
|
2692 1 << AMDGPU_RAS_BLOCK__JPEG
);
2694 adev
->ras_hw_enabled
&= ~(1 << AMDGPU_RAS_BLOCK__VCN
|
2695 1 << AMDGPU_RAS_BLOCK__JPEG
);
2698 * XGMI RAS is not supported if xgmi num physical nodes
2701 if (!adev
->gmc
.xgmi
.num_physical_nodes
)
2702 adev
->ras_hw_enabled
&= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL
);
2704 dev_info(adev
->dev
, "SRAM ECC is not presented.\n");
2707 /* driver only manages a few IP blocks RAS feature
2708 * when GPU is connected cpu through XGMI */
2709 adev
->ras_hw_enabled
|= (1 << AMDGPU_RAS_BLOCK__GFX
|
2710 1 << AMDGPU_RAS_BLOCK__SDMA
|
2711 1 << AMDGPU_RAS_BLOCK__MMHUB
);
2714 amdgpu_ras_get_quirks(adev
);
2716 /* hw_supported needs to be aligned with RAS block mask. */
2717 adev
->ras_hw_enabled
&= AMDGPU_RAS_BLOCK_MASK
;
2719 adev
->ras_enabled
= amdgpu_ras_enable
== 0 ? 0 :
2720 adev
->ras_hw_enabled
& amdgpu_ras_mask
;
2723 static void amdgpu_ras_counte_dw(struct work_struct
*work
)
2725 struct amdgpu_ras
*con
= container_of(work
, struct amdgpu_ras
,
2726 ras_counte_delay_work
.work
);
2727 struct amdgpu_device
*adev
= con
->adev
;
2728 struct drm_device
*dev
= adev_to_drm(adev
);
2729 unsigned long ce_count
, ue_count
;
2732 res
= pm_runtime_get_sync(dev
->dev
);
2736 /* Cache new values.
2738 if (amdgpu_ras_query_error_count(adev
, &ce_count
, &ue_count
, NULL
) == 0) {
2739 atomic_set(&con
->ras_ce_count
, ce_count
);
2740 atomic_set(&con
->ras_ue_count
, ue_count
);
2743 pm_runtime_mark_last_busy(dev
->dev
);
2745 pm_runtime_put_autosuspend(dev
->dev
);
2748 static void amdgpu_ras_query_poison_mode(struct amdgpu_device
*adev
)
2750 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2751 bool df_poison
, umc_poison
;
2753 /* poison setting is useless on SRIOV guest */
2754 if (amdgpu_sriov_vf(adev
) || !con
)
2757 /* Init poison supported flag, the default value is false */
2758 if (adev
->gmc
.xgmi
.connected_to_cpu
||
2759 adev
->gmc
.is_app_apu
) {
2760 /* enabled by default when GPU is connected to CPU */
2761 con
->poison_supported
= true;
2762 } else if (adev
->df
.funcs
&&
2763 adev
->df
.funcs
->query_ras_poison_mode
&&
2765 adev
->umc
.ras
->query_ras_poison_mode
) {
2767 adev
->df
.funcs
->query_ras_poison_mode(adev
);
2769 adev
->umc
.ras
->query_ras_poison_mode(adev
);
2771 /* Only poison is set in both DF and UMC, we can support it */
2772 if (df_poison
&& umc_poison
)
2773 con
->poison_supported
= true;
2774 else if (df_poison
!= umc_poison
)
2776 "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
2777 df_poison
, umc_poison
);
2781 static int amdgpu_get_ras_schema(struct amdgpu_device
*adev
)
2783 return amdgpu_ras_is_poison_mode_supported(adev
) ? AMDGPU_RAS_ERROR__POISON
: 0 |
2784 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE
|
2785 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE
|
2786 AMDGPU_RAS_ERROR__PARITY
;
2789 int amdgpu_ras_init(struct amdgpu_device
*adev
)
2791 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2797 con
= kmalloc(sizeof(struct amdgpu_ras
) +
2798 sizeof(struct ras_manager
) * AMDGPU_RAS_BLOCK_COUNT
+
2799 sizeof(struct ras_manager
) * AMDGPU_RAS_MCA_BLOCK_COUNT
,
2800 GFP_KERNEL
|__GFP_ZERO
);
2805 INIT_DELAYED_WORK(&con
->ras_counte_delay_work
, amdgpu_ras_counte_dw
);
2806 atomic_set(&con
->ras_ce_count
, 0);
2807 atomic_set(&con
->ras_ue_count
, 0);
2809 con
->objs
= (struct ras_manager
*)(con
+ 1);
2811 amdgpu_ras_set_context(adev
, con
);
2813 amdgpu_ras_check_supported(adev
);
2815 if (!adev
->ras_enabled
|| adev
->asic_type
== CHIP_VEGA10
) {
2816 /* set gfx block ras context feature for VEGA20 Gaming
2817 * send ras disable cmd to ras ta during ras late init.
2819 if (!adev
->ras_enabled
&& adev
->asic_type
== CHIP_VEGA20
) {
2820 con
->features
|= BIT(AMDGPU_RAS_BLOCK__GFX
);
2829 con
->update_channel_flag
= false;
2832 INIT_LIST_HEAD(&con
->head
);
2833 /* Might need get this flag from vbios. */
2834 con
->flags
= RAS_DEFAULT_FLAGS
;
2836 /* initialize nbio ras function ahead of any other
2837 * ras functions so hardware fatal error interrupt
2838 * can be enabled as early as possible */
2839 switch (amdgpu_ip_version(adev
, NBIO_HWIP
, 0)) {
2840 case IP_VERSION(7, 4, 0):
2841 case IP_VERSION(7, 4, 1):
2842 case IP_VERSION(7, 4, 4):
2843 if (!adev
->gmc
.xgmi
.connected_to_cpu
)
2844 adev
->nbio
.ras
= &nbio_v7_4_ras
;
2846 case IP_VERSION(4, 3, 0):
2847 if (adev
->ras_hw_enabled
& (1 << AMDGPU_RAS_BLOCK__DF
))
2848 /* unlike other generation of nbio ras,
2849 * nbio v4_3 only support fatal error interrupt
2850 * to inform software that DF is freezed due to
2851 * system fatal error event. driver should not
2852 * enable nbio ras in such case. Instead,
2854 adev
->nbio
.ras
= &nbio_v4_3_ras
;
2856 case IP_VERSION(7, 9, 0):
2857 if (!adev
->gmc
.is_app_apu
)
2858 adev
->nbio
.ras
= &nbio_v7_9_ras
;
2861 /* nbio ras is not available */
2865 /* nbio ras block needs to be enabled ahead of other ras blocks
2866 * to handle fatal error */
2867 r
= amdgpu_nbio_ras_sw_init(adev
);
2871 if (adev
->nbio
.ras
&&
2872 adev
->nbio
.ras
->init_ras_controller_interrupt
) {
2873 r
= adev
->nbio
.ras
->init_ras_controller_interrupt(adev
);
2878 if (adev
->nbio
.ras
&&
2879 adev
->nbio
.ras
->init_ras_err_event_athub_interrupt
) {
2880 r
= adev
->nbio
.ras
->init_ras_err_event_athub_interrupt(adev
);
2885 amdgpu_ras_query_poison_mode(adev
);
2887 /* Get RAS schema for particular SOC */
2888 con
->schema
= amdgpu_get_ras_schema(adev
);
2890 if (amdgpu_ras_fs_init(adev
)) {
2895 dev_info(adev
->dev
, "RAS INFO: ras initialized successfully, "
2896 "hardware ability[%x] ras_mask[%x]\n",
2897 adev
->ras_hw_enabled
, adev
->ras_enabled
);
2901 amdgpu_ras_set_context(adev
, NULL
);
2907 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device
*adev
)
2909 if (adev
->gmc
.xgmi
.connected_to_cpu
||
2910 adev
->gmc
.is_app_apu
)
2915 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device
*adev
,
2916 struct ras_common_if
*ras_block
)
2918 struct ras_query_if info
= {
2922 if (!amdgpu_persistent_edc_harvesting_supported(adev
))
2925 if (amdgpu_ras_query_error_status(adev
, &info
) != 0)
2926 DRM_WARN("RAS init harvest failure");
2928 if (amdgpu_ras_reset_error_status(adev
, ras_block
->block
) != 0)
2929 DRM_WARN("RAS init harvest reset failure");
2934 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device
*adev
)
2936 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2941 return con
->poison_supported
;
2944 /* helper function to handle common stuff in ip late init phase */
2945 int amdgpu_ras_block_late_init(struct amdgpu_device
*adev
,
2946 struct ras_common_if
*ras_block
)
2948 struct amdgpu_ras_block_object
*ras_obj
= NULL
;
2949 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
2950 struct ras_query_if
*query_info
;
2951 unsigned long ue_count
, ce_count
;
2954 /* disable RAS feature per IP block if it is not supported */
2955 if (!amdgpu_ras_is_supported(adev
, ras_block
->block
)) {
2956 amdgpu_ras_feature_enable_on_boot(adev
, ras_block
, 0);
2960 r
= amdgpu_ras_feature_enable_on_boot(adev
, ras_block
, 1);
2962 if (adev
->in_suspend
|| amdgpu_in_reset(adev
)) {
2963 /* in resume phase, if fail to enable ras,
2964 * clean up all ras fs nodes, and disable ras */
2970 /* check for errors on warm reset edc persisant supported ASIC */
2971 amdgpu_persistent_edc_harvesting(adev
, ras_block
);
2973 /* in resume phase, no need to create ras fs node */
2974 if (adev
->in_suspend
|| amdgpu_in_reset(adev
))
2977 ras_obj
= container_of(ras_block
, struct amdgpu_ras_block_object
, ras_comm
);
2978 if (ras_obj
->ras_cb
|| (ras_obj
->hw_ops
&&
2979 (ras_obj
->hw_ops
->query_poison_status
||
2980 ras_obj
->hw_ops
->handle_poison_consumption
))) {
2981 r
= amdgpu_ras_interrupt_add_handler(adev
, ras_block
);
2986 if (ras_obj
->hw_ops
&&
2987 (ras_obj
->hw_ops
->query_ras_error_count
||
2988 ras_obj
->hw_ops
->query_ras_error_status
)) {
2989 r
= amdgpu_ras_sysfs_create(adev
, ras_block
);
2993 /* Those are the cached values at init.
2995 query_info
= kzalloc(sizeof(*query_info
), GFP_KERNEL
);
2998 memcpy(&query_info
->head
, ras_block
, sizeof(struct ras_common_if
));
3000 if (amdgpu_ras_query_error_count(adev
, &ce_count
, &ue_count
, query_info
) == 0) {
3001 atomic_set(&con
->ras_ce_count
, ce_count
);
3002 atomic_set(&con
->ras_ue_count
, ue_count
);
3011 if (ras_obj
->ras_cb
)
3012 amdgpu_ras_interrupt_remove_handler(adev
, ras_block
);
3014 amdgpu_ras_feature_enable(adev
, ras_block
, 0);
/* Default .ras_late_init hook: forward to the common helper. */
static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_init(adev, ras_block);
}
3024 /* helper function to remove ras fs node and interrupt handler */
3025 void amdgpu_ras_block_late_fini(struct amdgpu_device
*adev
,
3026 struct ras_common_if
*ras_block
)
3028 struct amdgpu_ras_block_object
*ras_obj
;
3032 amdgpu_ras_sysfs_remove(adev
, ras_block
);
3034 ras_obj
= container_of(ras_block
, struct amdgpu_ras_block_object
, ras_comm
);
3035 if (ras_obj
->ras_cb
)
3036 amdgpu_ras_interrupt_remove_handler(adev
, ras_block
);
/* Default .ras_fini hook: forward to the common helper. */
static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
{
	amdgpu_ras_block_late_fini(adev, ras_block);
}
3045 /* do some init work after IP late init as dependence.
3046 * and it runs in resume/gpu reset/booting up cases.
3048 void amdgpu_ras_resume(struct amdgpu_device
*adev
)
3050 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3051 struct ras_manager
*obj
, *tmp
;
3053 if (!adev
->ras_enabled
|| !con
) {
3054 /* clean ras context for VEGA20 Gaming after send ras disable cmd */
3055 amdgpu_release_ras_context(adev
);
3060 if (con
->flags
& AMDGPU_RAS_FLAG_INIT_BY_VBIOS
) {
3061 /* Set up all other IPs which are not implemented. There is a
3062 * tricky thing that IP's actual ras error type should be
3063 * MULTI_UNCORRECTABLE, but as driver does not handle it, so
3064 * ERROR_NONE make sense anyway.
3066 amdgpu_ras_enable_all_features(adev
, 1);
3068 /* We enable ras on all hw_supported block, but as boot
3069 * parameter might disable some of them and one or more IP has
3070 * not implemented yet. So we disable them on behalf.
3072 list_for_each_entry_safe(obj
, tmp
, &con
->head
, node
) {
3073 if (!amdgpu_ras_is_supported(adev
, obj
->head
.block
)) {
3074 amdgpu_ras_feature_enable(adev
, &obj
->head
, 0);
3075 /* there should be no any reference. */
3076 WARN_ON(alive_obj(obj
));
3082 void amdgpu_ras_suspend(struct amdgpu_device
*adev
)
3084 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3086 if (!adev
->ras_enabled
|| !con
)
3089 amdgpu_ras_disable_all_features(adev
, 0);
3090 /* Make sure all ras objects are disabled. */
3092 amdgpu_ras_disable_all_features(adev
, 1);
3095 int amdgpu_ras_late_init(struct amdgpu_device
*adev
)
3097 struct amdgpu_ras_block_list
*node
, *tmp
;
3098 struct amdgpu_ras_block_object
*obj
;
3101 /* Guest side doesn't need init ras feature */
3102 if (amdgpu_sriov_vf(adev
))
3105 list_for_each_entry_safe(node
, tmp
, &adev
->ras_list
, node
) {
3106 if (!node
->ras_obj
) {
3107 dev_warn(adev
->dev
, "Warning: abnormal ras list node.\n");
3111 obj
= node
->ras_obj
;
3112 if (obj
->ras_late_init
) {
3113 r
= obj
->ras_late_init(adev
, &obj
->ras_comm
);
3115 dev_err(adev
->dev
, "%s failed to execute ras_late_init! ret:%d\n",
3116 obj
->ras_comm
.name
, r
);
3120 amdgpu_ras_block_late_init_default(adev
, &obj
->ras_comm
);
3126 /* do some fini work before IP fini as dependence */
3127 int amdgpu_ras_pre_fini(struct amdgpu_device
*adev
)
3129 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3131 if (!adev
->ras_enabled
|| !con
)
3135 /* Need disable ras on all IPs here before ip [hw/sw]fini */
3137 amdgpu_ras_disable_all_features(adev
, 0);
3138 amdgpu_ras_recovery_fini(adev
);
3142 int amdgpu_ras_fini(struct amdgpu_device
*adev
)
3144 struct amdgpu_ras_block_list
*ras_node
, *tmp
;
3145 struct amdgpu_ras_block_object
*obj
= NULL
;
3146 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3148 if (!adev
->ras_enabled
|| !con
)
3151 list_for_each_entry_safe(ras_node
, tmp
, &adev
->ras_list
, node
) {
3152 if (ras_node
->ras_obj
) {
3153 obj
= ras_node
->ras_obj
;
3154 if (amdgpu_ras_is_supported(adev
, obj
->ras_comm
.block
) &&
3156 obj
->ras_fini(adev
, &obj
->ras_comm
);
3158 amdgpu_ras_block_late_fini_default(adev
, &obj
->ras_comm
);
3161 /* Clear ras blocks from ras_list and free ras block list node */
3162 list_del(&ras_node
->node
);
3166 amdgpu_ras_fs_fini(adev
);
3167 amdgpu_ras_interrupt_remove_all(adev
);
3169 WARN(con
->features
, "Feature mask is not cleared");
3172 amdgpu_ras_disable_all_features(adev
, 1);
3174 cancel_delayed_work_sync(&con
->ras_counte_delay_work
);
3176 amdgpu_ras_set_context(adev
, NULL
);
3182 void amdgpu_ras_global_ras_isr(struct amdgpu_device
*adev
)
3184 if (atomic_cmpxchg(&amdgpu_ras_in_intr
, 0, 1) == 0) {
3185 struct amdgpu_ras
*ras
= amdgpu_ras_get_context(adev
);
3187 dev_info(adev
->dev
, "uncorrectable hardware error"
3188 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
3190 ras
->gpu_reset_flags
|= AMDGPU_RAS_GPU_RESET_MODE1_RESET
;
3191 amdgpu_ras_reset_gpu(adev
);
3195 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device
*adev
)
3197 if (adev
->asic_type
== CHIP_VEGA20
&&
3198 adev
->pm
.fw_version
<= 0x283400) {
3199 return !(amdgpu_asic_reset_method(adev
) == AMD_RESET_METHOD_BACO
) &&
3200 amdgpu_ras_intr_triggered();
3206 void amdgpu_release_ras_context(struct amdgpu_device
*adev
)
3208 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3213 if (!adev
->ras_enabled
&& con
->features
& BIT(AMDGPU_RAS_BLOCK__GFX
)) {
3214 con
->features
&= ~BIT(AMDGPU_RAS_BLOCK__GFX
);
3215 amdgpu_ras_set_context(adev
, NULL
);
3220 #ifdef CONFIG_X86_MCE_AMD
3221 static struct amdgpu_device
*find_adev(uint32_t node_id
)
3224 struct amdgpu_device
*adev
= NULL
;
3226 for (i
= 0; i
< mce_adev_list
.num_gpu
; i
++) {
3227 adev
= mce_adev_list
.devs
[i
];
3229 if (adev
&& adev
->gmc
.xgmi
.connected_to_cpu
&&
3230 adev
->gmc
.xgmi
.physical_node_id
== node_id
)
3238 #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF)
3239 #define GET_UMC_INST(m) (((m) >> 21) & 0x7)
3240 #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
3241 #define GPU_ID_OFFSET 8
3243 static int amdgpu_bad_page_notifier(struct notifier_block
*nb
,
3244 unsigned long val
, void *data
)
3246 struct mce
*m
= (struct mce
*)data
;
3247 struct amdgpu_device
*adev
= NULL
;
3248 uint32_t gpu_id
= 0;
3249 uint32_t umc_inst
= 0, ch_inst
= 0;
3252 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
3253 * and error occurred in DramECC (Extended error code = 0) then only
3254 * process the error, else bail out.
3256 if (!m
|| !((smca_get_bank_type(m
->extcpu
, m
->bank
) == SMCA_UMC_V2
) &&
3257 (XEC(m
->status
, 0x3f) == 0x0)))
3261 * If it is correctable error, return.
3263 if (mce_is_correctable(m
))
3267 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
3269 gpu_id
= GET_MCA_IPID_GPUID(m
->ipid
) - GPU_ID_OFFSET
;
3271 adev
= find_adev(gpu_id
);
3273 DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__
,
3279 * If it is uncorrectable error, then find out UMC instance and
3282 umc_inst
= GET_UMC_INST(m
->ipid
);
3283 ch_inst
= GET_CHAN_INDEX(m
->ipid
);
3285 dev_info(adev
->dev
, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
3288 if (!amdgpu_umc_page_retirement_mca(adev
, m
->addr
, ch_inst
, umc_inst
))
3294 static struct notifier_block amdgpu_bad_page_nb
= {
3295 .notifier_call
= amdgpu_bad_page_notifier
,
3296 .priority
= MCE_PRIO_UC
,
3299 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device
*adev
)
3302 * Add the adev to the mce_adev_list.
3303 * During mode2 reset, amdgpu device is temporarily
3304 * removed from the mgpu_info list which can cause
3305 * page retirement to fail.
3306 * Use this list instead of mgpu_info to find the amdgpu
3307 * device on which the UMC error was reported.
3309 mce_adev_list
.devs
[mce_adev_list
.num_gpu
++] = adev
;
3312 * Register the x86 notifier only once
3313 * with MCE subsystem.
3315 if (notifier_registered
== false) {
3316 mce_register_decode_chain(&amdgpu_bad_page_nb
);
3317 notifier_registered
= true;
3322 struct amdgpu_ras
*amdgpu_ras_get_context(struct amdgpu_device
*adev
)
3327 return adev
->psp
.ras_context
.ras
;
3330 int amdgpu_ras_set_context(struct amdgpu_device
*adev
, struct amdgpu_ras
*ras_con
)
3335 adev
->psp
.ras_context
.ras
= ras_con
;
3339 /* check if ras is supported on block, say, sdma, gfx */
3340 int amdgpu_ras_is_supported(struct amdgpu_device
*adev
,
3344 struct amdgpu_ras
*ras
= amdgpu_ras_get_context(adev
);
3346 if (block
>= AMDGPU_RAS_BLOCK_COUNT
)
3349 ret
= ras
&& (adev
->ras_enabled
& (1 << block
));
3351 /* For the special asic with mem ecc enabled but sram ecc
3352 * not enabled, even if the ras block is not supported on
3353 * .ras_enabled, if the asic supports poison mode and the
3354 * ras block has ras configuration, it can be considered
3355 * that the ras block supports ras function.
3358 (block
== AMDGPU_RAS_BLOCK__GFX
||
3359 block
== AMDGPU_RAS_BLOCK__SDMA
||
3360 block
== AMDGPU_RAS_BLOCK__VCN
||
3361 block
== AMDGPU_RAS_BLOCK__JPEG
) &&
3362 amdgpu_ras_is_poison_mode_supported(adev
) &&
3363 amdgpu_ras_get_ras_block(adev
, block
, 0))
3369 int amdgpu_ras_reset_gpu(struct amdgpu_device
*adev
)
3371 struct amdgpu_ras
*ras
= amdgpu_ras_get_context(adev
);
3373 if (atomic_cmpxchg(&ras
->in_recovery
, 0, 1) == 0)
3374 amdgpu_reset_domain_schedule(ras
->adev
->reset_domain
, &ras
->recovery_work
);
3378 void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device
*adev
, bool enable
)
3380 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3383 con
->is_mca_debug_mode
= enable
;
3386 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device
*adev
)
3388 struct amdgpu_ras
*con
= amdgpu_ras_get_context(adev
);
3389 const struct amdgpu_mca_smu_funcs
*mca_funcs
= adev
->mca
.mca_funcs
;
3394 if (mca_funcs
&& mca_funcs
->mca_set_debug_mode
)
3395 return con
->is_mca_debug_mode
;
3400 /* Register each ip ras block into amdgpu ras */
3401 int amdgpu_ras_register_ras_block(struct amdgpu_device
*adev
,
3402 struct amdgpu_ras_block_object
*ras_block_obj
)
3404 struct amdgpu_ras_block_list
*ras_node
;
3405 if (!adev
|| !ras_block_obj
)
3408 ras_node
= kzalloc(sizeof(*ras_node
), GFP_KERNEL
);
3412 INIT_LIST_HEAD(&ras_node
->node
);
3413 ras_node
->ras_obj
= ras_block_obj
;
3414 list_add_tail(&ras_node
->node
, &adev
->ras_list
);
3419 void amdgpu_ras_get_error_type_name(uint32_t err_type
, char *err_type_name
)
3425 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE
:
3426 sprintf(err_type_name
, "correctable");
3428 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE
:
3429 sprintf(err_type_name
, "uncorrectable");
3432 sprintf(err_type_name
, "unknown");
3437 bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device
*adev
,
3438 const struct amdgpu_ras_err_status_reg_entry
*reg_entry
,
3440 uint32_t *memory_id
)
3442 uint32_t err_status_lo_data
, err_status_lo_offset
;
3447 err_status_lo_offset
=
3448 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry
->hwip
, instance
,
3449 reg_entry
->seg_lo
, reg_entry
->reg_lo
);
3450 err_status_lo_data
= RREG32(err_status_lo_offset
);
3452 if ((reg_entry
->flags
& AMDGPU_RAS_ERR_STATUS_VALID
) &&
3453 !REG_GET_FIELD(err_status_lo_data
, ERR_STATUS_LO
, ERR_STATUS_VALID_FLAG
))
3456 *memory_id
= REG_GET_FIELD(err_status_lo_data
, ERR_STATUS_LO
, MEMORY_ID
);
3461 bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device
*adev
,
3462 const struct amdgpu_ras_err_status_reg_entry
*reg_entry
,
3464 unsigned long *err_cnt
)
3466 uint32_t err_status_hi_data
, err_status_hi_offset
;
3471 err_status_hi_offset
=
3472 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry
->hwip
, instance
,
3473 reg_entry
->seg_hi
, reg_entry
->reg_hi
);
3474 err_status_hi_data
= RREG32(err_status_hi_offset
);
3476 if ((reg_entry
->flags
& AMDGPU_RAS_ERR_INFO_VALID
) &&
3477 !REG_GET_FIELD(err_status_hi_data
, ERR_STATUS_HI
, ERR_INFO_VALID_FLAG
))
3478 /* keep the check here in case we need to refer to the result later */
3479 dev_dbg(adev
->dev
, "Invalid err_info field\n");
3481 /* read err count */
3482 *err_cnt
= REG_GET_FIELD(err_status_hi_data
, ERR_STATUS
, ERR_CNT
);
3487 void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device
*adev
,
3488 const struct amdgpu_ras_err_status_reg_entry
*reg_list
,
3489 uint32_t reg_list_size
,
3490 const struct amdgpu_ras_memory_id_entry
*mem_list
,
3491 uint32_t mem_list_size
,
3494 unsigned long *err_count
)
3497 unsigned long err_cnt
;
3498 char err_type_name
[16];
3501 for (i
= 0; i
< reg_list_size
; i
++) {
3502 /* query memory_id from err_status_lo */
3503 if (!amdgpu_ras_inst_get_memory_id_field(adev
, ®_list
[i
],
3504 instance
, &memory_id
))
3507 /* query err_cnt from err_status_hi */
3508 if (!amdgpu_ras_inst_get_err_cnt_field(adev
, ®_list
[i
],
3509 instance
, &err_cnt
) ||
3513 *err_count
+= err_cnt
;
3515 /* log the errors */
3516 amdgpu_ras_get_error_type_name(err_type
, err_type_name
);
3518 /* memory_list is not supported */
3520 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
3521 err_cnt
, err_type_name
,
3522 reg_list
[i
].block_name
,
3523 instance
, memory_id
);
3525 for (j
= 0; j
< mem_list_size
; j
++) {
3526 if (memory_id
== mem_list
[j
].memory_id
) {
3528 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
3529 err_cnt
, err_type_name
,
3530 reg_list
[i
].block_name
,
3531 instance
, mem_list
[j
].name
);
3539 void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device
*adev
,
3540 const struct amdgpu_ras_err_status_reg_entry
*reg_list
,
3541 uint32_t reg_list_size
,
3544 uint32_t err_status_lo_offset
, err_status_hi_offset
;
3547 for (i
= 0; i
< reg_list_size
; i
++) {
3548 err_status_lo_offset
=
3549 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list
[i
].hwip
, instance
,
3550 reg_list
[i
].seg_lo
, reg_list
[i
].reg_lo
);
3551 err_status_hi_offset
=
3552 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list
[i
].hwip
, instance
,
3553 reg_list
[i
].seg_hi
, reg_list
[i
].reg_hi
);
3554 WREG32(err_status_lo_offset
, 0);
3555 WREG32(err_status_hi_offset
, 0);
3559 int amdgpu_ras_error_data_init(struct ras_err_data
*err_data
)
3561 memset(err_data
, 0, sizeof(*err_data
));
3563 INIT_LIST_HEAD(&err_data
->err_node_list
);
3568 static void amdgpu_ras_error_node_release(struct ras_err_node
*err_node
)
3573 list_del(&err_node
->node
);
3577 void amdgpu_ras_error_data_fini(struct ras_err_data
*err_data
)
3579 struct ras_err_node
*err_node
, *tmp
;
3581 list_for_each_entry_safe(err_node
, tmp
, &err_data
->err_node_list
, node
)
3582 amdgpu_ras_error_node_release(err_node
);
3585 static struct ras_err_node
*amdgpu_ras_error_find_node_by_id(struct ras_err_data
*err_data
,
3586 struct amdgpu_smuio_mcm_config_info
*mcm_info
)
3588 struct ras_err_node
*err_node
;
3589 struct amdgpu_smuio_mcm_config_info
*ref_id
;
3591 if (!err_data
|| !mcm_info
)
3594 for_each_ras_error(err_node
, err_data
) {
3595 ref_id
= &err_node
->err_info
.mcm_info
;
3597 if (mcm_info
->socket_id
== ref_id
->socket_id
&&
3598 mcm_info
->die_id
== ref_id
->die_id
)
3605 static struct ras_err_node
*amdgpu_ras_error_node_new(void)
3607 struct ras_err_node
*err_node
;
3609 err_node
= kvzalloc(sizeof(*err_node
), GFP_KERNEL
);
3613 INIT_LIST_HEAD(&err_node
->node
);
3618 static struct ras_err_info
*amdgpu_ras_error_get_info(struct ras_err_data
*err_data
,
3619 struct amdgpu_smuio_mcm_config_info
*mcm_info
)
3621 struct ras_err_node
*err_node
;
3623 err_node
= amdgpu_ras_error_find_node_by_id(err_data
, mcm_info
);
3625 return &err_node
->err_info
;
3627 err_node
= amdgpu_ras_error_node_new();
3631 memcpy(&err_node
->err_info
.mcm_info
, mcm_info
, sizeof(*mcm_info
));
3633 err_data
->err_list_count
++;
3634 list_add_tail(&err_node
->node
, &err_data
->err_node_list
);
3636 return &err_node
->err_info
;
3639 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data
*err_data
,
3640 struct amdgpu_smuio_mcm_config_info
*mcm_info
, u64 count
)
3642 struct ras_err_info
*err_info
;
3644 if (!err_data
|| !mcm_info
)
3650 err_info
= amdgpu_ras_error_get_info(err_data
, mcm_info
);
3654 err_info
->ue_count
+= count
;
3655 err_data
->ue_count
+= count
;
3660 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data
*err_data
,
3661 struct amdgpu_smuio_mcm_config_info
*mcm_info
, u64 count
)
3663 struct ras_err_info
*err_info
;
3665 if (!err_data
|| !mcm_info
)
3671 err_info
= amdgpu_ras_error_get_info(err_data
, mcm_info
);
3675 err_info
->ce_count
+= count
;
3676 err_data
->ce_count
+= count
;