/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "nbio_v4_3.h"
#include "nbio_v7_9.h"
#include "atom.h"
#include "amdgpu_reset.h"
#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>

static bool notifier_registered;
#endif

static const char *RAS_FS_NAME = "ras";
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
	"mca",
	"vcn",
	"jpeg",
};

const char *ras_mca_block_string[] = {
	"mca_mp0",
	"mca_mp1",
	"mca_mpio",
	"mca_iohc",
};
struct amdgpu_ras_block_list {
	/* ras block link */
	struct list_head node;

	struct amdgpu_ras_block_object *ras_obj;
};
const char *get_ras_block_str(struct ras_common_if *ras_block)
{
	if (!ras_block)
		return "NULL";

	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
		return "OUT OF RANGE";

	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
		return ras_mca_block_string[ras_block->sub_block_index];

	return ras_block_string[ras_block->block];
}
#define ras_block_str(_BLOCK_) \
	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")

#define ras_err_str(i) (ras_error_string[ffs(i)])
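/* Note: ffs() is 1-based, so assuming the single-bit error-type encoding in
 * amdgpu_ras.h (PARITY = 0x1, SINGLE_CORRECTABLE = 0x2, ...), a flag of 0x2
 * maps to ras_error_string[2], i.e. "single_correctable".
 */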
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER		(100 * 1024 * 1024ULL)
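/* Illustrative arithmetic (not code from this file): at one bad page per
 * RAS_BAD_PAGE_COVER, a 16 GB VRAM board corresponds to roughly
 * 16 GB / 100 MB = 163 bad pages as the default tolerance budget.
 */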
enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
	int num_gpu;
};
static struct mce_notifier_adev_list mce_adev_list;
#endif
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
	if (adev && amdgpu_ras_get_context(adev))
		amdgpu_ras_get_context(adev)->error_query_ready = ready;
}

static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{
	if (adev && amdgpu_ras_get_context(adev))
		return amdgpu_ras_get_context(adev)->error_query_ready;

	return false;
}
static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
	struct ras_err_data err_data;
	struct eeprom_table_record err_rec;
	int ret;

	if ((address >= adev->gmc.mc_vram_size) ||
	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
		dev_warn(adev->dev,
			 "RAS WARN: input address 0x%llx is invalid.\n",
			 address);
		return -EINVAL;
	}

	if (amdgpu_ras_check_bad_page(adev, address)) {
		dev_warn(adev->dev,
			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
			 address);
		return 0;
	}

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
	err_data.err_addr = &err_rec;
	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

	amdgpu_ras_error_data_fini(&err_data);

	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
	dev_warn(adev->dev, "Clear EEPROM:\n");
	dev_warn(adev->dev, "		echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");

	return 0;
}
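/*
 * This path is reached through the "retire_page" command handled by
 * amdgpu_ras_debugfs_ctrl_parse_data() below. A minimal sketch of the shell
 * usage (card/minor index and address are illustrative):
 *
 * .. code-block:: bash
 *
 *	echo retire_page 0x1000 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 */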
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);

	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}
static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};
static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_string[i]) == 0)
			return 0;
	}
	return -EINVAL;
}
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;
	/* default value is 0 if the mask is not set by user */
	u32 instance_mask = 0;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (strstr(str, "retire_page") != NULL)
		op = 3;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (op == 3) {
			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
			    sscanf(str, "%*s %llu", &address) != 1)
				return -EINVAL;

			data->op = op;
			data->inject.address = address;

			return 0;
		}

		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu %u",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
				   &sub_block, &address, &value) != 3 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu",
				   &sub_block, &address, &value) != 3)
				return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
			data->inject.instance_mask = instance_mask;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
					   struct ras_debug_if *data)
{
	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
	uint32_t mask, inst_mask = data->inject.instance_mask;

	/* no need to set instance mask if there is only one instance */
	if (num_xcc <= 1 && inst_mask) {
		data->inject.instance_mask = 0;
		dev_dbg(adev->dev,
			"RAS inject mask(0x%x) isn't supported and force it to 0.\n",
			inst_mask);

		return;
	}

	switch (data->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		mask = GENMASK(num_xcc - 1, 0);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		mask = GENMASK(adev->sdma.num_instances - 1, 0);
		break;
	case AMDGPU_RAS_BLOCK__VCN:
	case AMDGPU_RAS_BLOCK__JPEG:
		mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
		break;
	default:
		mask = inst_mask;
		break;
	}

	/* remove invalid bits in instance mask */
	data->inject.instance_mask &= mask;
	if (inst_mask != data->inject.instance_mask)
		dev_dbg(adev->dev,
			"Adjust RAS inject mask 0x%x to 0x%x\n",
			inst_mask, data->inject.instance_mask);
}
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control interface accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents. say, GFX, sDMA.
 * name: the name of the IP.
 *
 * inject has three more members than head, they are address, value and mask.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * In a program:
 *
 * Copy the struct ras_debug_if in your code and initialize it.
 * Write the struct to the control interface.
 *
 * From a shell:
 *
 * .. code-block:: bash
 *
 *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *
 * Where N is the card which you want to affect.
 *
 * "disable" requires only the block.
 * "enable" requires the block and error type.
 * "inject" requires the block, error type, address, and value.
 *
 * The block is one of: umc, sdma, gfx, etc.
 *	see ras_block_string[] for details
 *
 * The error type is one of: ue, ce, where,
 *	ue is multi-uncorrectable
 *	ce is single-correctable
 *
 * The sub-block is the sub-block index, pass 0 if there is no sub-block.
 * The address and value are hexadecimal numbers, leading 0x is optional.
 * The mask means instance mask, is optional, default value is 0x1.
 *
 * For instance,
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result of the operation?
 *
 * To check disable/enable, see "ras" features at,
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * To check inject, see the corresponding error count at,
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
 *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
 *
 */
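/*
 * A minimal end-to-end sketch of the paths documented above (card and
 * minor indices depend on the system):
 *
 * .. code-block:: bash
 *
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	cat /sys/class/drm/card0/device/ras/features
 */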
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
					     const char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	if (!amdgpu_ras_get_error_query_ready(adev)) {
		dev_warn(adev->dev, "RAS WARN: error injection "
				"currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return ret;

	if (data.op == 3) {
		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
		if (!ret)
			return size;
		else
			return ret;
	}

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size &&
		    adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			dev_warn(adev->dev, "RAS WARN: input address "
					"0x%llx is invalid.",
					data.inject.address);
			ret = -EINVAL;
			break;
		}

		/* umc ce/ue error injection for a bad page is not allowed */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
					"already been marked as bad!\n",
					data.inject.address);
			break;
		}

		amdgpu_ras_instance_mask_check(adev, &data);

		/* data.inject.address is offset instead of absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return ret;

	return size;
}
/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in vram. This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset EEPROM table to 0 entries.
 *
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
					       const char __user *buf,
					       size_t size, loff_t *pos)
{
	struct amdgpu_device *adev =
		(struct amdgpu_device *)file_inode(f)->i_private;
	int ret;

	ret = amdgpu_ras_eeprom_reset_table(
		&(amdgpu_ras_get_context(adev)->eeprom_control));

	if (!ret) {
		/* Something was written to EEPROM.
		 */
		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
		return size;
	} else {
		return ret;
	}
}
static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};
/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs multiple lines which report the uncorrected (ue) and corrected
 * (ce) errors.
 *
 * The format of one line is below,
 *
 * [ce|ue]: count
 *
 * Example:
 *
 * .. code-block:: bash
 *
 *	ue: 0
 *	ce: 1
 *
 */
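/*
 * For example, reading the UMC counter (the card index is system
 * dependent):
 *
 * .. code-block:: bash
 *
 *	cat /sys/class/drm/card0/device/ras/umc_err_count
 */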
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return sysfs_emit(buf, "Query currently inaccessible\n");

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
			  "ce", info.ce_count);
}
/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && (--obj->use == 0)) {
		list_del(&obj->node);
		amdgpu_ras_error_data_fini(&obj->err_data);
	}

	if (obj && (obj->use < 0))
		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
}
/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	if (head->block == AMDGPU_RAS_BLOCK__MCA) {
		if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
			return NULL;

		obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
	} else
		obj = &con->objs[head->block];

	/* already exist. return obj? */
	if (alive_obj(obj))
		return NULL;

	if (amdgpu_ras_error_data_init(&obj->err_data))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}
/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!adev->ras_enabled || !con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		if (head->block == AMDGPU_RAS_BLOCK__MCA) {
			if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
				return NULL;

			obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
		} else
			obj = &con->objs[head->block];

		if (alive_obj(obj))
			return obj;
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj))
				return obj;
		}
	}

	return NULL;
}
/* obj end */
/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
					 struct ras_common_if *head)
{
	return adev->ras_hw_enabled & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}
/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If hardware does not support ras, then do not create obj.
	 * But if hardware supports ras, we can create the obj.
	 * The RAS framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * Each IP checks con->support to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}
/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input *info;
	int ret;

	if (!con)
		return -EINVAL;

	/* For non-gfx ip, do not enable ras feature if it is not allowed */
	/* For gfx ip, regardless of feature support status, */
	/* Force issue enable or disable ras feature commands */
	if (head->block != AMDGPU_RAS_BLOCK__GFX &&
	    !amdgpu_ras_is_feature_allowed(adev, head))
		goto out;

	/* Only enable gfx ras feature from host side */
	if (head->block == AMDGPU_RAS_BLOCK__GFX &&
	    !amdgpu_sriov_vf(adev) &&
	    !amdgpu_ras_intr_triggered()) {
		info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
		if (!info)
			return -ENOMEM;

		if (!enable) {
			info->disable_features = (struct ta_ras_disable_features_input) {
				.block_id = amdgpu_ras_block_to_ta(head->block),
				.error_type = amdgpu_ras_error_to_ta(head->type),
			};
		} else {
			info->enable_features = (struct ta_ras_enable_features_input) {
				.block_id = amdgpu_ras_block_to_ta(head->block),
				.error_type = amdgpu_ras_error_to_ta(head->type),
			};
		}

		ret = psp_ras_enable_features(&adev->psp, info, enable);
		if (ret) {
			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
				enable ? "enable":"disable",
				get_ras_block_str(head),
				amdgpu_ras_is_poison_mode_supported(adev), ret);
			kfree(info);
			return ret;
		}

		kfree(info);
	}

out:
	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}
/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm to issue a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing
			 * But sometimes it requests driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO need remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					dev_info(adev->dev,
						"RAS INFO: %s setup object\n",
						get_ras_block_str(head));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd.*/
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			/* gfx block ras disable cmd must send to ras-ta */
			if (head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features |= BIT(head->block);

			ret = amdgpu_ras_feature_enable(adev, head, 0);

			/* clean gfx block ras features flag */
			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features &= ~BIT(head->block);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}
static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}
static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int i;
	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};

		if (i == AMDGPU_RAS_BLOCK__MCA)
			continue;

		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = AMDGPU_RAS_BLOCK__MCA,
			.type = default_ras_type,
			.sub_block_index = i,
		};

		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */
static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
		enum amdgpu_ras_block block)
{
	if (!block_obj)
		return -EINVAL;

	if (block_obj->ras_comm.block == block)
		return 0;

	return -EINVAL;
}

static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
					enum amdgpu_ras_block block, uint32_t sub_block_index)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;

	if (block >= AMDGPU_RAS_BLOCK__LAST)
		return NULL;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		if (!node->ras_obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		obj = node->ras_obj;
		if (obj->ras_block_match) {
			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
				return obj;
		} else {
			if (amdgpu_ras_block_match_default(obj, block) == 0)
				return obj;
		}
	}

	return NULL;
}
static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	/*
	 * choosing right query method according to
	 * whether smu support query error information
	 */
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
	if (ret == -EOPNOTSUPP) {
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
	} else if (!ret) {
		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);

		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_address)
			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
	}
}
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
					      struct ras_manager *ras_mgr,
					      struct ras_err_data *err_data,
					      const char *blk_name,
					      bool is_ue)
{
	struct amdgpu_smuio_mcm_config_info *mcm_info;
	struct ras_err_node *err_node;
	struct ras_err_info *err_info;

	if (is_ue) {
		for_each_ras_error(err_node, err_data) {
			err_info = &err_node->err_info;
			mcm_info = &err_info->mcm_info;
			if (err_info->ue_count) {
				dev_info(adev->dev, "socket: %d, die: %d, "
					 "%lld new uncorrectable hardware errors detected in %s block\n",
					 mcm_info->socket_id,
					 mcm_info->die_id,
					 err_info->ue_count,
					 blk_name);
			}
		}

		for_each_ras_error(err_node, &ras_mgr->err_data) {
			err_info = &err_node->err_info;
			mcm_info = &err_info->mcm_info;
			dev_info(adev->dev, "socket: %d, die: %d, "
				 "%lld uncorrectable hardware errors detected in total in %s block\n",
				 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
		}
	} else {
		for_each_ras_error(err_node, err_data) {
			err_info = &err_node->err_info;
			mcm_info = &err_info->mcm_info;
			if (err_info->ce_count) {
				dev_info(adev->dev, "socket: %d, die: %d, "
					 "%lld new correctable hardware errors detected in %s block, "
					 "no user action is needed\n",
					 mcm_info->socket_id,
					 mcm_info->die_id,
					 err_info->ce_count,
					 blk_name);
			}
		}

		for_each_ras_error(err_node, &ras_mgr->err_data) {
			err_info = &err_node->err_info;
			mcm_info = &err_info->mcm_info;
			dev_info(adev->dev, "socket: %d, die: %d, "
				 "%lld correctable hardware errors detected in total in %s block, "
				 "no user action is needed\n",
				 mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
		}
	}
}
static inline bool err_data_has_source_info(struct ras_err_data *data)
{
	return !list_empty(&data->err_node_list);
}
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
					     struct ras_query_if *query_if,
					     struct ras_err_data *err_data)
{
	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
	const char *blk_name = get_ras_block_str(&query_if->head);

	if (err_data->ce_count) {
		if (err_data_has_source_info(err_data)) {
			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
			dev_info(adev->dev, "socket: %d, die: %d "
				 "%ld correctable hardware errors "
				 "detected in %s block, no user "
				 "action is needed.\n",
				 adev->smuio.funcs->get_socket_id(adev),
				 adev->smuio.funcs->get_die_id(adev),
				 ras_mgr->err_data.ce_count,
				 blk_name);
		} else {
			dev_info(adev->dev, "%ld correctable hardware errors "
				 "detected in %s block, no user "
				 "action is needed.\n",
				 ras_mgr->err_data.ce_count,
				 blk_name);
		}
	}

	if (err_data->ue_count) {
		if (err_data_has_source_info(err_data)) {
			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
			dev_info(adev->dev, "socket: %d, die: %d "
				 "%ld uncorrectable hardware errors "
				 "detected in %s block\n",
				 adev->smuio.funcs->get_socket_id(adev),
				 adev->smuio.funcs->get_die_id(adev),
				 ras_mgr->err_data.ue_count,
				 blk_name);
		} else {
			dev_info(adev->dev, "%ld uncorrectable hardware errors "
				 "detected in %s block\n",
				 ras_mgr->err_data.ue_count,
				 blk_name);
		}
	}
}
static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
{
	struct ras_err_node *err_node;
	struct ras_err_info *err_info;

	if (err_data_has_source_info(err_data)) {
		for_each_ras_error(err_node, err_data) {
			err_info = &err_node->err_info;

			amdgpu_ras_error_statistic_ce_count(&obj->err_data,
					&err_info->mcm_info, err_info->ce_count);
			amdgpu_ras_error_statistic_ue_count(&obj->err_data,
					&err_info->mcm_info, err_info->ue_count);
		}
	} else {
		/* for the legacy asic path which doesn't have error source info */
		obj->err_data.ue_count += err_data->ue_count;
		obj->err_data.ce_count += err_data->ce_count;
	}
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
						struct ras_query_if *info,
						struct ras_err_data *err_data,
						unsigned int error_query_mode)
{
	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
	struct amdgpu_ras_block_object *block_obj = NULL;

	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
		return -EINVAL;

	if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
			amdgpu_ras_get_ecc_info(adev, err_data);
		} else {
			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
			if (!block_obj || !block_obj->hw_ops) {
				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
					     get_ras_block_str(&info->head));
				return -EINVAL;
			}

			if (block_obj->hw_ops->query_ras_error_count)
				block_obj->hw_ops->query_ras_error_count(adev, &err_data);

			if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
			    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
			    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
				if (block_obj->hw_ops->query_ras_error_status)
					block_obj->hw_ops->query_ras_error_status(adev);
			}
		}
	} else {
		/* FIXME: add code to check return value later */
		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
	}

	return 0;
}
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data;
	unsigned int error_query_mode;
	int ret;

	if (!obj)
		return -EINVAL;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
		return -EINVAL;

	ret = amdgpu_ras_query_error_status_helper(adev, info,
						   &err_data,
						   error_query_mode);
	if (ret)
		goto out_fini_err_data;

	amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	amdgpu_ras_error_generate_report(adev, info, &err_data);

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}
int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
	struct amdgpu_hive_info *hive;
	int hive_ras_recovery = 0;

	if (!block_obj || !block_obj->hw_ops) {
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     ras_block_str(block));
		return -EOPNOTSUPP;
	}

	if (!amdgpu_ras_is_supported(adev, block) ||
	    !amdgpu_ras_get_mca_debug_mode(adev))
		return -EOPNOTSUPP;

	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		hive_ras_recovery = atomic_read(&hive->ras_recovery);
		amdgpu_put_xgmi_hive(hive);
	}

	/* skip ras error reset in gpu reset */
	if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
	    hive_ras_recovery) &&
	    mca_funcs && mca_funcs->mca_set_debug_mode)
		return -EOPNOTSUPP;

	if (block_obj->hw_ops->reset_ras_error_count)
		block_obj->hw_ops->reset_ras_error_count(adev);

	return 0;
}
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);

	if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
		return 0;

	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
		if (block_obj->hw_ops->reset_ras_error_status)
			block_obj->hw_ops->reset_ras_error_status(adev);
	}

	return 0;
}
/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = -EINVAL;
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
							info->head.block,
							info->head.sub_block_index);

	/* inject on guest isn't allowed, return success directly */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!obj)
		return -EINVAL;

	if (!block_obj || !block_obj->hw_ops) {
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
		return -EINVAL;
	}

	/* Calculate XGMI relative offset */
	if (adev->gmc.xgmi.num_physical_nodes > 1 &&
	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {
		block_info.address =
			amdgpu_xgmi_get_relative_phy_addr(adev,
							  block_info.address);
	}

	if (block_obj->hw_ops->ras_error_inject) {
		if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
			ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
		else /* Special ras_error_inject is defined (e.g: xgmi) */
			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
						info->instance_mask);
	} else {
		/* default path */
		ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
	}

	if (ret)
		dev_err(adev->dev, "ras inject %s failed %d\n",
			get_ras_block_str(&info->head), ret);

	return ret;
}
/**
 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable errors.
 * @query_info: pointer to ras_query_if
 *
 * Return 0 for query success or do nothing, otherwise return an error
 * on failures
 */
static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
					       unsigned long *ce_count,
					       unsigned long *ue_count,
					       struct ras_query_if *query_info)
{
	int ret;

	if (!query_info)
		/* do nothing if query_info is not specified */
		return 0;

	ret = amdgpu_ras_query_error_status(adev, query_info);
	if (ret)
		return ret;

	*ce_count += query_info->ce_count;
	*ue_count += query_info->ue_count;

	/* some hardware/IP supports read to clear
	 * no need to explicitly reset the err status after the query call */
	if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
			dev_warn(adev->dev,
				 "Failed to reset error counter and error status\n");
	}

	return 0;
}
/**
 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable
 * errors.
 * @query_info: pointer to ras_query_if if the query request is only for
 * specific ip block; if info is NULL, then the query request is for
 * all the ip blocks that support query ras error counters/status
 *
 * If set, @ce_count or @ue_count, count and return the corresponding
 * error counts in those integer pointers. Return 0 if the device
 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
 */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				 unsigned long *ce_count,
				 unsigned long *ue_count,
				 struct ras_query_if *query_info)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	unsigned long ce, ue;
	int ret;

	if (!adev->ras_enabled || !con)
		return -EOPNOTSUPP;

	/* Don't count since no reporting.
	 */
	if (!ce_count && !ue_count)
		return 0;

	ce = 0;
	ue = 0;
	if (!query_info) {
		/* query all the ip blocks that support ras query interface */
		list_for_each_entry(obj, &con->head, node) {
			struct ras_query_if info = {
				.head = obj->head,
			};

			ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
		}
	} else {
		/* query specific ip block */
		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
	}

	if (ret)
		return ret;

	if (ce_count)
		*ce_count = ce;

	if (ue_count)
		*ue_count = ue;

	return 0;
}
/* query/inject/cure end */
/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
		return "R";
	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
		return "P";
	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
	default:
		return "F";
	}
}
/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * It allows user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of below character,
 *
 * R: reserved, this gpu page is reserved and not able to use.
 *
 * P: pending for reserve, this gpu page is marked as bad, will be reserved
 * in next window of page_reserve.
 *
 * F: unable to reserve. this gpu page can't be reserved due to some reasons.
 *
 * Examples:
 *
 * .. code-block:: bash
 *
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 *
 */
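/*
 * For example, reading the list (illustrative card index):
 *
 * .. code-block:: bash
 *
 *	cat /sys/class/drm/card0/device/ras/gpu_vram_bad_pages
 */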
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
			       "0x%08x : 0x%08x : %1s\n",
			       bps[start].bp,
			       bps[start].size,
			       amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}
static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
}

static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, version_attr);
	return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version);
}

static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, schema_attr);
	return sysfs_emit(buf, "schema: 0x%x\n", con->schema);
}
static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&con->badpages_attr.attr,
				RAS_FS_NAME);
}

static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		&con->version_attr.attr,
		&con->schema_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = RAS_FS_NAME,
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
		"%s_err_count", head->name);

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				RAS_FS_NAME)) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}
int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				RAS_FS_NAME);
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}
static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	if (amdgpu_bad_page_threshold != 0)
		amdgpu_ras_sysfs_remove_bad_page_node(adev);

	amdgpu_ras_sysfs_remove_dev_attr_node(adev);

	return 0;
}
/* sysfs end */
/**
 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
 *
 * Normally when there is an uncorrectable error, the driver will reset
 * the GPU to recover. However, in the event of an unrecoverable error,
 * the driver provides an interface to reboot the system automatically
 * in that event.
 *
 * The following file in debugfs provides that interface:
 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo true > .../ras/auto_reboot
 *
 */
static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control;
	struct drm_minor  *minor = adev_to_drm(adev)->primary;
	struct dentry     *dir;

	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_ctrl_ops);
	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_ops);
	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
			   &con->bad_page_cnt_threshold);
	debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs);
	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_size_ops);
	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
						       S_IRUGO, dir, adev,
						       &amdgpu_ras_debugfs_eeprom_table_ops);
	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);

	/*
	 * After one uncorrectable error happens, usually GPU recovery will
	 * be scheduled. But due to the known problem in GPU recovery failing
	 * to bring GPU back, below interface provides one direct way to
	 * user to reboot system automatically in such case within
	 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
	 * will never be called.
	 */
	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);

	/*
	 * User could set this not to clean up hardware's error count register
	 * of RAS IPs during ras recovery.
	 */
	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
			    &con->disable_ras_err_cnt_harvest);
	return dir;
}
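/*
 * A sketch of toggling the two knobs created above (the minor number is
 * system dependent):
 *
 * .. code-block:: bash
 *
 *	echo 1 > /sys/kernel/debug/dri/0/ras/auto_reboot
 *	echo 1 > /sys/kernel/debug/dri/0/ras/disable_ras_err_cnt_harvest
 */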
static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
				      struct ras_fs_if *head,
				      struct dentry *dir)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || !dir)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
			    obj, &amdgpu_ras_debugfs_ops);
}
void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct dentry *dir;
	struct ras_manager *obj;
	struct ras_fs_if fs_info;

	/*
	 * it won't be called in resume path, no need to check
	 * suspend and gpu reset status
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
		return;

	dir = amdgpu_ras_debugfs_create_ctrl_node(adev);

	list_for_each_entry(obj, &con->head, node) {
		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
			(obj->attr_inuse == 1)) {
			sprintf(fs_info.debugfs_name, "%s_err_inject",
				get_ras_block_str(&obj->head));
			fs_info.head = obj->head;
			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
		}
	}

	amdgpu_mca_smu_debugfs_init(adev, dir);
}
/* debugfs end */

/* ras fs */
static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
		amdgpu_ras_sysfs_badpages_read, NULL, 0);
static DEVICE_ATTR(features, S_IRUGO,
		amdgpu_ras_sysfs_features_read, NULL);
static DEVICE_ATTR(version, 0444,
		amdgpu_ras_sysfs_version_show, NULL);
static DEVICE_ATTR(schema, 0444,
		amdgpu_ras_sysfs_schema_show, NULL);
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute_group group = {
		.name = RAS_FS_NAME,
	};
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		&con->version_attr.attr,
		&con->schema_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		NULL,
		NULL,
	};
	int r;

	group.attrs = attrs;

	/* add features entry */
	con->features_attr = dev_attr_features;
	sysfs_attr_init(attrs[0]);

	/* add version entry */
	con->version_attr = dev_attr_version;
	sysfs_attr_init(attrs[1]);

	/* add schema entry */
	con->schema_attr = dev_attr_schema;
	sysfs_attr_init(attrs[2]);

	if (amdgpu_bad_page_threshold != 0) {
		/* add bad_page_features entry */
		bin_attr_gpu_vram_bad_pages.private = NULL;
		con->badpages_attr = bin_attr_gpu_vram_bad_pages;
		bin_attrs[0] = &con->badpages_attr;
		group.bin_attrs = bin_attrs;
		sysfs_bin_attr_init(bin_attrs[0]);
	}

	r = sysfs_create_group(&adev->dev->kobj, &group);
	if (r)
		dev_err(adev->dev, "Failed to create RAS sysfs group!");

	return 0;
}
static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *con_obj, *ip_obj, *tmp;

	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
		list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
			ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
			if (ip_obj)
				put_obj(ip_obj);
		}
	}

	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */
/* ih begin */

/* For the hardware that cannot enable bif ring for both ras_controller_irq
 * and ras_err_event_athub_irq ih cookies, the driver has to poll the status
 * register to check whether the interrupt is triggered or not, and properly
 * ack the interrupt if it is there
 */
void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
{
	/* Fatal error events are handled on host side */
	if (amdgpu_sriov_vf(adev))
		return;

	if (adev->nbio.ras &&
	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);

	if (adev->nbio.ras &&
	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
}
static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{
	bool poison_stat = false;
	struct amdgpu_device *adev = obj->adev;
	struct amdgpu_ras_block_object *block_obj =
		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);

	if (!block_obj)
		return;

	/* both query_poison_status and handle_poison_consumption are optional,
	 * but at least one of them should be implemented if we need poison
	 * consumption handler
	 */
	if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
		poison_stat = block_obj->hw_ops->query_poison_status(adev);
		if (!poison_stat) {
			/* Not poison consumption interrupt, no need to handle it */
			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
					block_obj->ras_comm.name);

			return;
		}
	}

	amdgpu_umc_poison_handler(adev, false);

	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);

	/* gpu reset is fallback for failed and default cases */
	if (poison_stat) {
		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
				block_obj->ras_comm.name);
		amdgpu_ras_reset_gpu(adev);
	} else {
		amdgpu_gfx_poison_consumption_handler(adev, entry);
	}
}
static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{
	dev_info(obj->adev->dev,
		"Poison is created, no user action is needed.\n");
}
static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct ras_err_data err_data;
	int ret;

	if (!data->cb)
		return;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return;

	/* Let IP handle its data, maybe we need get the output
	 * from the callback to update the error type/count, etc
	 */
	ret = data->cb(obj->adev, &err_data, entry);
	/* ue will trigger an interrupt, and in that case
	 * we need do a reset to recovery the whole system.
	 * But leave IP do that recovery, here we just dispatch
	 * the error.
	 */
	if (ret == AMDGPU_RAS_SUCCESS) {
		/* these counts could be left as 0 if
		 * some blocks do not count error number
		 */
		obj->err_data.ue_count += err_data.ue_count;
		obj->err_data.ce_count += err_data.ce_count;
	}

	amdgpu_ras_error_data_fini(&err_data);
}
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
			else
				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
		} else {
			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
				amdgpu_ras_interrupt_umc_handler(obj, &entry);
			else
				dev_warn(obj->adev->dev,
					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
		}
	}
}
static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data = &obj->ih_data;

	if (!obj)
		return -EINVAL;

	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;
;
2010 schedule_work(&data
->ih_work
);
int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
	struct ras_ih_data *data;
	struct amdgpu_ras_block_object *ras_obj;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);

	data = &obj->ih_data;
	/* add the callback.etc */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = ras_obj->ras_cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
	}

	return 0;
}
/* ih end */
/* traverse all IPs except NBIO to query the error counter */
static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		/*
		 * PCIE_BIF IP has one different isr by ras controller
		 * interrupt, the specific ras counter query will be
		 * done in that isr. So skip such block from common
		 * sync flood interrupt isr calling.
		 */
		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
			continue;

		/*
		 * this is a workaround for aldebaran, skip send msg to
		 * smu to get ecc_info table due to smu handle get ecc
		 * info table failed temporarily.
		 * should be removed until smu fix handle ecc_info table.
		 */
		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    (amdgpu_ip_version(adev, MP1_HWIP, 0) ==
		     IP_VERSION(13, 0, 2)))
			continue;

		amdgpu_ras_query_error_status(adev, &info);

		if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=
		    IP_VERSION(11, 0, 2) &&
		    amdgpu_ip_version(adev, MP0_HWIP, 0) !=
		    IP_VERSION(11, 0, 4) &&
		    amdgpu_ip_version(adev, MP0_HWIP, 0) !=
		    IP_VERSION(13, 0, 0)) {
			if (amdgpu_ras_reset_error_status(adev, info.head.block))
				dev_warn(adev->dev, "Failed to reset error counter and error status");
		}
	}
}
/* Parse RdRspStatus and WrRspStatus */
static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
					  struct ras_query_if *info)
{
	struct amdgpu_ras_block_object *block_obj;
	/*
	 * Only two blocks need to query read/write
	 * RspStatus at current state
	 */
	if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
	    (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
		return;

	block_obj = amdgpu_ras_get_ras_block(adev,
					     info->head.block,
					     info->head.sub_block_index);

	if (!block_obj || !block_obj->hw_ops) {
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
		return;
	}

	if (block_obj->hw_ops->query_ras_error_status)
		block_obj->hw_ops->query_ras_error_status(adev);
}
static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		amdgpu_ras_error_status_query(adev, &info);
	}
}
/* recovery begin */

/* return 0 on success.
 * caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0, status;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*count = 0;
		ret = -EINVAL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].retired_page,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
		};
		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
				data->bps[i].retired_page);
		if (status == -EBUSY)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (status == -ENOENT)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}
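/*
 * The per-page flags filled in above encode retirement state for
 * consumers of this table: RESERVED means the VRAM manager already
 * holds the page back, PENDING (-EBUSY from the page-status query)
 * means the reservation has not completed yet, and FAULT (-ENOENT)
 * means the page could not be reserved.
 */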
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);
	struct amdgpu_device *remote_adev = NULL;
	struct amdgpu_device *adev = ras->adev;
	struct list_head device_list, *device_list_handle = NULL;
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	if (hive)
		atomic_set(&hive->ras_recovery, 1);
	if (!ras->disable_ras_err_cnt_harvest) {

		/* Build list of devices to query RAS related errors */
		if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
			device_list_handle = &hive->device_list;
		} else {
			INIT_LIST_HEAD(&device_list);
			list_add_tail(&adev->gmc.xgmi.head, &device_list);
			device_list_handle = &device_list;
		}

		list_for_each_entry(remote_adev,
				device_list_handle, gmc.xgmi.head) {
			amdgpu_ras_query_err_status(remote_adev);
			amdgpu_ras_log_on_err_counter(remote_adev);
		}
	}

	if (amdgpu_device_should_recover_gpu(ras->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;

		/* Perform full reset in fatal error mode */
		if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		else {
			clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
				reset_context.method = AMD_RESET_METHOD_MODE2;
			}

			/* Fatal error occurs in poison mode, mode1 reset is used to
			 * recover gpu.
			 */
			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

				psp_fatal_error_recovery_quirk(&adev->psp);
			}
		}

		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
	}
	atomic_set(&ras->in_recovery, 0);
	if (hive) {
		atomic_set(&hive->ras_recovery, 0);
		amdgpu_put_xgmi_hive(hive);
	}
}
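/*
 * Reset selection above: without poison-mode support every fatal
 * error forces a full (mode1) reset. With poison mode, a mode2 reset
 * is used when requested through gpu_reset_flags, and mode1 with
 * AMDGPU_NEED_FULL_RESET is reserved for fatal errors that still
 * occur in poison mode.
 */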
/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 512);
	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!bps) {
		return -ENOMEM;
	}

	if (data->bps) {
		memcpy(bps, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = bps;
	data->space_left += align_space - old_space;
	return 0;
}
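/*
 * Growth arithmetic above: capacity is rounded up to a multiple of
 * 512 records. E.g. growing an empty array by 256 records allocates
 * 512 slots, so space_left becomes 512 and the following 256-record
 * request is satisfied without another allocation.
 */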
/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int ret = 0;
	uint32_t i;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = 0; i < pages; i++) {
		if (amdgpu_ras_check_bad_page_unlock(con,
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
			continue;

		if (!data->space_left &&
			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
			ret = -ENOMEM;
			goto out;
		}

		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
			AMDGPU_GPU_PAGE_SIZE);

		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
		data->count++;
		data->space_left--;
	}
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}
/*
 * write error record array to eeprom, the function should be
 * protected by recovery_lock
 * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
 */
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
		unsigned long *new_cnt)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data) {
		if (new_cnt)
			*new_cnt = 0;

		return 0;
	}

	mutex_lock(&con->recovery_lock);
	control = &con->eeprom_control;
	data = con->eh_data;
	save_count = data->count - control->ras_num_recs;
	mutex_unlock(&con->recovery_lock);

	if (new_cnt)
		*new_cnt = save_count / adev->umc.retire_unit;

	/* only new entries are saved */
	if (save_count > 0) {
		if (amdgpu_ras_eeprom_append(control,
					     &data->bps[control->ras_num_recs],
					     save_count)) {
			dev_err(adev->dev, "Failed to save EEPROM table data!");
			return -EIO;
		}

		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
	}

	return 0;
}
/*
 * read error record array in eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *control =
		&adev->psp.ras_context.ras->eeprom_control;
	struct eeprom_table_record *bps;
	int ret;

	/* no bad page record, skip eeprom access */
	if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
		return 0;

	bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return -ENOMEM;

	ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
	if (ret)
		dev_err(adev->dev, "Failed to load EEPROM table records!");
	else
		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);

	kfree(bps);
	return ret;
}
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr)
{
	struct ras_err_handler_data *data = con->eh_data;
	int i;

	addr >>= AMDGPU_GPU_PAGE_SHIFT;
	for (i = 0; i < data->count; i++)
		if (addr == data->bps[i].retired_page)
			return true;

	return false;
}
/*
 * check if an address belongs to bad page
 *
 * Note: this check is only for umc block
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	bool ret = false;

	if (!con || !con->eh_data)
		return ret;

	mutex_lock(&con->recovery_lock);
	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
	mutex_unlock(&con->recovery_lock);
	return ret;
}
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
					  uint32_t max_count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/*
	 * Justification of value bad_page_cnt_threshold in ras structure
	 *
	 * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
	 * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
	 * scenarios accordingly.
	 *
	 * Bad page retirement enablement:
	 *    - If amdgpu_bad_page_threshold = -2,
	 *      bad_page_cnt_threshold = typical value by formula.
	 *
	 *    - When the value from user is 0 < amdgpu_bad_page_threshold <
	 *      max record length in eeprom, use it directly.
	 *
	 * Bad page retirement disablement:
	 *    - If amdgpu_bad_page_threshold = 0, bad page retirement
	 *      functionality is disabled, and bad_page_cnt_threshold will
	 *      take no effect.
	 */

	if (amdgpu_bad_page_threshold < 0) {
		u64 val = adev->gmc.mc_vram_size;

		do_div(val, RAS_BAD_PAGE_COVER);
		con->bad_page_cnt_threshold = min(lower_32_bits(val),
						  max_count);
	} else {
		con->bad_page_cnt_threshold = min_t(int, max_count,
						    amdgpu_bad_page_threshold);
	}
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	u32 max_eeprom_records_count = 0;
	bool exc_err_limit = false;
	int ret;

	if (!con || amdgpu_sriov_vf(adev))
		return 0;

	/* Allow access to RAS EEPROM via debugfs, when the ASIC
	 * supports RAS and debugfs is enabled, but when
	 * adev->ras_enabled is unset, i.e. when "ras_enable"
	 * module parameter is set to 0.
	 */
	con->adev = adev;

	if (!adev->ras_enabled)
		return 0;

	data = &con->eh_data;
	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->eeprom_control.bad_channel_bitmap = 0;

	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);

	/* Todo: During test the SMU might fail to read the eeprom through I2C
	 * when the GPU is pending on XGMI reset during probe time
	 * (Mostly after second bus reset), skip it now
	 */
	if (adev->gmc.xgmi.pending_reset)
		return 0;
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
	/*
	 * This calling fails when exc_err_limit is true or
	 * ret != 0.
	 */
	if (exc_err_limit || ret)
		goto free;

	if (con->eeprom_control.ras_num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			goto free;

		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

		if (con->update_channel_flag == true) {
			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
			con->update_channel_flag = false;
		}
	}

#ifdef CONFIG_X86_MCE_AMD
	if ((adev->asic_type == CHIP_ALDEBARAN) &&
	    (adev->gmc.xgmi.connected_to_cpu))
		amdgpu_register_bad_pages_mca_notifier(adev);
#endif
	return 0;

free:
	kfree((*data)->bps);
	kfree(*data);
	con->eh_data = NULL;
out:
	dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);

	/*
	 * Except error threshold exceeding case, other failure cases in this
	 * function would not fail amdgpu driver init.
	 */
	if (!exc_err_limit)
		ret = 0;
	else
		ret = -EINVAL;

	return ret;
}
static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	/* recovery_init failed to init it, fini is useless */
	if (!data)
		return 0;

	cancel_work_sync(&con->recovery_work);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
		case IP_VERSION(13, 0, 2):
		case IP_VERSION(13, 0, 6):
			return true;
		default:
			return false;
		}
	}

	if (adev->asic_type == CHIP_IP_DISCOVERY) {
		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
		case IP_VERSION(13, 0, 0):
		case IP_VERSION(13, 0, 6):
		case IP_VERSION(13, 0, 10):
			return true;
		default:
			return false;
		}
	}

	return adev->asic_type == CHIP_VEGA10 ||
		adev->asic_type == CHIP_VEGA20 ||
		adev->asic_type == CHIP_ARCTURUS ||
		adev->asic_type == CHIP_ALDEBARAN ||
		adev->asic_type == CHIP_SIENNA_CICHLID;
}
/*
 * this is a workaround for the vega20 workstation sku:
 * force enable gfx ras and ignore the vbios gfx ras flag,
 * since GC EDC cannot be written
 */
static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
{
	struct atom_context *ctx = adev->mode_info.atom_context;

	if (!ctx)
		return;

	if (strnstr(ctx->vbios_pn, "D16406",
		    sizeof(ctx->vbios_pn)) ||
	    strnstr(ctx->vbios_pn, "D36002",
		    sizeof(ctx->vbios_pn)))
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
}
/*
 * check hardware's ras ability which will be saved in hw_supported.
 * if hardware does not support ras, we can skip some ras initialization and
 * forbid some ras operations from IP.
 * if software itself, say boot parameter, limit the ras ability. We still
 * need allow IP do some limited operations, like disable. In such case,
 * we have to initialize ras as normal. but need check if operation is
 * allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	adev->ras_hw_enabled = adev->ras_enabled = 0;

	if (!amdgpu_ras_asic_supported(adev))
		return;

	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
			dev_info(adev->dev, "MEM ECC is active.\n");
			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
						 1 << AMDGPU_RAS_BLOCK__DF);
		} else {
			dev_info(adev->dev, "MEM ECC is not present.\n");
		}

		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
			dev_info(adev->dev, "SRAM ECC is active.\n");
			if (!amdgpu_sriov_vf(adev))
				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
							  1 << AMDGPU_RAS_BLOCK__DF);
			else
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
							 1 << AMDGPU_RAS_BLOCK__SDMA |
							 1 << AMDGPU_RAS_BLOCK__GFX);

			/* VCN/JPEG RAS can be supported on both bare metal and
			 * SRIOV environment
			 */
			if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
				    IP_VERSION(2, 6, 0) ||
			    amdgpu_ip_version(adev, VCN_HWIP, 0) ==
				    IP_VERSION(4, 0, 0) ||
			    amdgpu_ip_version(adev, VCN_HWIP, 0) ==
				    IP_VERSION(4, 0, 3))
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
							 1 << AMDGPU_RAS_BLOCK__JPEG);
			else
				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
							  1 << AMDGPU_RAS_BLOCK__JPEG);

			/*
			 * XGMI RAS is not supported if xgmi num physical nodes
			 * is zero
			 */
			if (!adev->gmc.xgmi.num_physical_nodes)
				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
		} else {
			dev_info(adev->dev, "SRAM ECC is not present.\n");
		}
	} else {
		/* driver only manages a few IP blocks RAS feature
		 * when GPU is connected cpu through XGMI */
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
					 1 << AMDGPU_RAS_BLOCK__SDMA |
					 1 << AMDGPU_RAS_BLOCK__MMHUB);
	}

	amdgpu_ras_get_quirks(adev);

	/* hw_supported needs to be aligned with RAS block mask. */
	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;

	adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
		adev->ras_hw_enabled & amdgpu_ras_mask;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
					      ras_counte_delay_work.work);
	struct amdgpu_device *adev = con->adev;
	struct drm_device *dev = adev_to_drm(adev);
	unsigned long ce_count, ue_count;
	int res;

	res = pm_runtime_get_sync(dev->dev);
	if (res < 0)
		goto Out;

	/* Cache new values.
	 */
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

	pm_runtime_mark_last_busy(dev->dev);
Out:
	pm_runtime_put_autosuspend(dev->dev);
}
static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	bool df_poison, umc_poison;

	/* poison setting is useless on SRIOV guest */
	if (amdgpu_sriov_vf(adev) || !con)
		return;

	/* Init poison supported flag, the default value is false */
	if (adev->gmc.xgmi.connected_to_cpu ||
	    adev->gmc.is_app_apu) {
		/* enabled by default when GPU is connected to CPU */
		con->poison_supported = true;
	} else if (adev->df.funcs &&
	    adev->df.funcs->query_ras_poison_mode &&
	    adev->umc.ras &&
	    adev->umc.ras->query_ras_poison_mode) {
		df_poison =
			adev->df.funcs->query_ras_poison_mode(adev);
		umc_poison =
			adev->umc.ras->query_ras_poison_mode(adev);

		/* Only poison is set in both DF and UMC, we can support it */
		if (df_poison && umc_poison)
			con->poison_supported = true;
		else if (df_poison != umc_poison)
			dev_warn(adev->dev,
				 "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
				 df_poison, umc_poison);
	}
}
static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
{
	return (amdgpu_ras_is_poison_mode_supported(adev) ? AMDGPU_RAS_ERROR__POISON : 0) |
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE |
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE |
			AMDGPU_RAS_ERROR__PARITY;
}
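/*
 * The parentheses around the conditional above are load-bearing: in
 * C, "|" binds tighter than "?:", so without them a poison-capable
 * device would report a schema of only AMDGPU_RAS_ERROR__POISON and
 * drop the correctable/uncorrectable/parity bits.
 */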
int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->adev = adev;
	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
	atomic_set(&con->ras_ce_count, 0);
	atomic_set(&con->ras_ue_count, 0);

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev);

	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
		/* set gfx block ras context feature for VEGA20 Gaming
		 * send ras disable cmd to ras ta during ras late init.
		 */
		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);

			return 0;
		}

		r = 0;
		goto release_con;
	}

	con->update_channel_flag = false;
	con->features = 0;
	con->schema = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	/* initialize nbio ras function ahead of any other
	 * ras functions so hardware fatal error interrupt
	 * can be enabled as early as possible */
	switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
	case IP_VERSION(7, 4, 0):
	case IP_VERSION(7, 4, 1):
	case IP_VERSION(7, 4, 4):
		if (!adev->gmc.xgmi.connected_to_cpu)
			adev->nbio.ras = &nbio_v7_4_ras;
		break;
	case IP_VERSION(4, 3, 0):
		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
			/* unlike other generation of nbio ras,
			 * nbio v4_3 only support fatal error interrupt
			 * to inform software that DF is frozen due to
			 * system fatal error event. driver should not
			 * enable nbio ras in such case. Instead,
			 * check DF RAS */
			adev->nbio.ras = &nbio_v4_3_ras;
		break;
	case IP_VERSION(7, 9, 0):
		if (!adev->gmc.is_app_apu)
			adev->nbio.ras = &nbio_v7_9_ras;
		break;
	default:
		/* nbio ras is not available */
		break;
	}

	/* nbio ras block needs to be enabled ahead of other ras blocks
	 * to handle fatal error */
	r = amdgpu_nbio_ras_sw_init(adev);
	if (r)
		return r;

	if (adev->nbio.ras &&
	    adev->nbio.ras->init_ras_controller_interrupt) {
		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
		if (r)
			goto release_con;
	}

	if (adev->nbio.ras &&
	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
		if (r)
			goto release_con;
	}

	amdgpu_ras_query_poison_mode(adev);

	/* Get RAS schema for particular SOC */
	con->schema = amdgpu_get_ras_schema(adev);

	if (amdgpu_ras_fs_init(adev)) {
		r = -EINVAL;
		goto release_con;
	}

	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
		 "hardware ability[%x] ras_mask[%x]\n",
		 adev->ras_hw_enabled, adev->ras_enabled);

	return 0;
release_con:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return r;
}
int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
	if (adev->gmc.xgmi.connected_to_cpu ||
	    adev->gmc.is_app_apu)
		return 1;

	return 0;
}
static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
					struct ras_common_if *ras_block)
{
	struct ras_query_if info = {
		.head = *ras_block,
	};

	if (!amdgpu_persistent_edc_harvesting_supported(adev))
		return 0;

	if (amdgpu_ras_query_error_status(adev, &info) != 0)
		DRM_WARN("RAS init harvest failure");

	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
		DRM_WARN("RAS init harvest reset failure");

	return 0;
}
bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return false;

	return con->poison_supported;
}
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
{
	struct amdgpu_ras_block_object *ras_obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_query_if *query_info;
	unsigned long ue_count, ce_count;
	int r;

	/* disable RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (adev->in_suspend || amdgpu_in_reset(adev)) {
			/* in resume phase, if fail to enable ras,
			 * clean up all ras fs nodes, and disable ras */
			goto cleanup;
		} else
			return r;
	}

	/* check for errors on warm reset edc persistent supported ASIC */
	amdgpu_persistent_edc_harvesting(adev, ras_block);

	/* in resume phase, no need to create ras fs node */
	if (adev->in_suspend || amdgpu_in_reset(adev))
		return 0;

	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
	    (ras_obj->hw_ops->query_poison_status ||
	    ras_obj->hw_ops->handle_poison_consumption))) {
		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
		if (r)
			goto cleanup;
	}

	if (ras_obj->hw_ops &&
	    (ras_obj->hw_ops->query_ras_error_count ||
	     ras_obj->hw_ops->query_ras_error_status)) {
		r = amdgpu_ras_sysfs_create(adev, ras_block);
		if (r)
			goto interrupt;

		/* Those are the cached values at init.
		 */
		query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
		if (!query_info)
			return -ENOMEM;
		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));

		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
			atomic_set(&con->ras_ce_count, ce_count);
			atomic_set(&con->ras_ue_count, ue_count);
		}

		kfree(query_info);
	}

	return 0;

interrupt:
	if (ras_obj->ras_cb)
		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
cleanup:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}
static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_init(adev, ras_block);
}
/* helper function to remove ras fs node and interrupt handler */
void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
{
	struct amdgpu_ras_block_object *ras_obj;

	if (!ras_block)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);

	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
	if (ras_obj->ras_cb)
		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
}
static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
					  struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_fini(adev, ras_block);
}
/* do some init work after IP late init as dependence.
 * and it runs in resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!adev->ras_enabled || !con) {
		/* clean ras context for VEGA20 Gaming after send ras disable cmd */
		amdgpu_release_ras_context(adev);

		return;
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as driver does not handle it, so
		 * ERROR_NONE make sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported block, but as boot
		 * parameter might disable some of them and one or more IP has
		 * not implemented yet. So we disable them on behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no any reference. */
				WARN_ON(alive_obj(obj));
			}
		}
	}
}
void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}
int amdgpu_ras_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;
	int r;

	/* Guest side doesn't need init ras feature */
	if (amdgpu_sriov_vf(adev))
		return 0;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		if (!node->ras_obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		obj = node->ras_obj;
		if (obj->ras_late_init) {
			r = obj->ras_late_init(adev, &obj->ras_comm);
			if (r) {
				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
					obj->ras_comm.name, r);
				return r;
			}
		} else
			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
	}

	return 0;
}
/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	/* Need disable ras on all IPs here before ip [hw/sw]fini */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *ras_node, *tmp;
	struct amdgpu_ras_block_object *obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
			    obj->ras_fini)
				obj->ras_fini(adev, &obj->ras_comm);
			else
				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
		}

		/* Clear ras blocks from ras_list and free ras block list node */
		list_del(&ras_node->node);
		kfree(ras_node);
	}

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

		dev_info(adev->dev, "uncorrectable hardware error "
			 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");

		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
		amdgpu_ras_reset_gpu(adev);
	}
}
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
{
	if (adev->asic_type == CHIP_VEGA20 &&
	    adev->pm.fw_version <= 0x283400) {
		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
			amdgpu_ras_intr_triggered();
	}

	return false;
}
void amdgpu_release_ras_context(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
	}
}
#ifdef CONFIG_X86_MCE_AMD
static struct amdgpu_device *find_adev(uint32_t node_id)
{
	int i;
	struct amdgpu_device *adev = NULL;

	for (i = 0; i < mce_adev_list.num_gpu; i++) {
		adev = mce_adev_list.devs[i];

		if (adev && adev->gmc.xgmi.connected_to_cpu &&
		    adev->gmc.xgmi.physical_node_id == node_id)
			break;
		adev = NULL;
	}

	return adev;
}

#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
#define GPU_ID_OFFSET		8
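/*
 * Decode sketch for the macros above, following the field layout
 * they imply for MCA_IPID_UMC: the GPU id sits in bits [47:44]
 * (biased by GPU_ID_OFFSET), the UMC instance in bits [23:21], and
 * the channel index is stitched from bits [13:12] plus bit 20 used
 * as bit 2.
 */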
static int amdgpu_bad_page_notifier(struct notifier_block *nb,
				    unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct amdgpu_device *adev = NULL;
	uint32_t gpu_id = 0;
	uint32_t umc_inst = 0, ch_inst = 0;

	/*
	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
	 * and error occurred in DramECC (Extended error code = 0) then only
	 * process the error, else bail out.
	 */
	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
		    (XEC(m->status, 0x3f) == 0x0)))
		return NOTIFY_DONE;

	/*
	 * If it is correctable error, return.
	 */
	if (mce_is_correctable(m))
		return NOTIFY_OK;

	/*
	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
	 */
	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;

	adev = find_adev(gpu_id);
	if (!adev) {
		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
			 gpu_id);
		return NOTIFY_DONE;
	}

	/*
	 * If it is uncorrectable error, then find out UMC instance and
	 * channel index.
	 */
	umc_inst = GET_UMC_INST(m->ipid);
	ch_inst = GET_CHAN_INDEX(m->ipid);

	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
		 umc_inst, ch_inst);

	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
		return NOTIFY_OK;
	else
		return NOTIFY_DONE;
}
static struct notifier_block amdgpu_bad_page_nb = {
	.notifier_call	= amdgpu_bad_page_notifier,
	.priority	= MCE_PRIO_UC,
};
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{
	/*
	 * Add the adev to the mce_adev_list.
	 * During mode2 reset, amdgpu device is temporarily
	 * removed from the mgpu_info list which can cause
	 * page retirement to fail.
	 * Use this list instead of mgpu_info to find the amdgpu
	 * device on which the UMC error was reported.
	 */
	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;

	/*
	 * Register the x86 notifier only once
	 * with MCE subsystem.
	 */
	if (notifier_registered == false) {
		mce_register_decode_chain(&amdgpu_bad_page_nb);
		notifier_registered = true;
	}
}
#endif
struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
{
	if (!adev)
		return NULL;

	return adev->psp.ras_context.ras;
}
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
{
	if (!adev)
		return -EINVAL;

	adev->psp.ras_context.ras = ras_con;
	return 0;
}
/* check if ras is supported on block, say, sdma, gfx */
int amdgpu_ras_is_supported(struct amdgpu_device *adev,
		unsigned int block)
{
	int ret = 0;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (block >= AMDGPU_RAS_BLOCK_COUNT)
		return 0;

	ret = ras && (adev->ras_enabled & (1 << block));

	/* For the special asic with mem ecc enabled but sram ecc
	 * not enabled, even if the ras block is not supported on
	 * .ras_enabled, if the asic supports poison mode and the
	 * ras block has ras configuration, it can be considered
	 * that the ras block supports ras function.
	 */
	if (!ret &&
	    (block == AMDGPU_RAS_BLOCK__GFX ||
	     block == AMDGPU_RAS_BLOCK__SDMA ||
	     block == AMDGPU_RAS_BLOCK__VCN ||
	     block == AMDGPU_RAS_BLOCK__JPEG) &&
	    amdgpu_ras_is_poison_mode_supported(adev) &&
	    amdgpu_ras_get_ras_block(adev, block, 0))
		ret = 1;

	return ret;
}
int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
	return 0;
}
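/*
 * The cmpxchg above makes reset scheduling idempotent: only the
 * caller that flips in_recovery from 0 to 1 queues recovery_work;
 * amdgpu_ras_do_recovery() sets in_recovery back to 0 when it
 * finishes, re-arming this entry point.
 */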
void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		con->is_mca_debug_mode = enable;
}
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;

	if (!con)
		return false;

	if (mca_funcs && mca_funcs->mca_set_debug_mode)
		return con->is_mca_debug_mode;
	else
		return true;
}
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
				     unsigned int *error_query_mode)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;

	if (!con) {
		*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
		return false;
	}

	if (mca_funcs && mca_funcs->mca_set_debug_mode)
		*error_query_mode =
			(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
	else
		*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;

	return true;
}
/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
		struct amdgpu_ras_block_object *ras_block_obj)
{
	struct amdgpu_ras_block_list *ras_node;

	if (!adev || !ras_block_obj)
		return -EINVAL;

	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
	if (!ras_node)
		return -ENOMEM;

	INIT_LIST_HEAD(&ras_node->node);
	ras_node->ras_obj = ras_block_obj;
	list_add_tail(&ras_node->node, &adev->ras_list);

	return 0;
}
void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
{
	if (!err_type_name)
		return;

	switch (err_type) {
	case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
		sprintf(err_type_name, "correctable");
		break;
	case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
		sprintf(err_type_name, "uncorrectable");
		break;
	default:
		sprintf(err_type_name, "unknown");
		break;
	}
}
bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
					 const struct amdgpu_ras_err_status_reg_entry *reg_entry,
					 uint32_t instance,
					 uint32_t *memory_id)
{
	uint32_t err_status_lo_data, err_status_lo_offset;

	if (!reg_entry)
		return false;

	err_status_lo_offset =
		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
					    reg_entry->seg_lo, reg_entry->reg_lo);
	err_status_lo_data = RREG32(err_status_lo_offset);

	if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) &&
	    !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG))
		return false;

	*memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID);

	return true;
}
bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
				       const struct amdgpu_ras_err_status_reg_entry *reg_entry,
				       uint32_t instance,
				       unsigned long *err_cnt)
{
	uint32_t err_status_hi_data, err_status_hi_offset;

	if (!reg_entry)
		return false;

	err_status_hi_offset =
		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
					    reg_entry->seg_hi, reg_entry->reg_hi);
	err_status_hi_data = RREG32(err_status_hi_offset);

	if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
	    !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
		/* keep the check here in case we need to refer to the result later */
		dev_dbg(adev->dev, "Invalid err_info field\n");

	/* read err count */
	*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);

	return true;
}
void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   const struct amdgpu_ras_memory_id_entry *mem_list,
					   uint32_t mem_list_size,
					   uint32_t instance,
					   uint32_t err_type,
					   unsigned long *err_count)
{
	uint32_t memory_id;
	unsigned long err_cnt;
	char err_type_name[16];
	uint32_t i, j;

	for (i = 0; i < reg_list_size; i++) {
		/* query memory_id from err_status_lo */
		if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
							 instance, &memory_id))
			continue;

		/* query err_cnt from err_status_hi */
		if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
						       instance, &err_cnt) ||
		    !err_cnt)
			continue;

		*err_count += err_cnt;

		/* log the errors */
		amdgpu_ras_get_error_type_name(err_type, err_type_name);
		if (!mem_list) {
			/* memory_list is not supported */
			dev_info(adev->dev,
				 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
				 err_cnt, err_type_name,
				 reg_list[i].block_name,
				 instance, memory_id);
		} else {
			for (j = 0; j < mem_list_size; j++) {
				if (memory_id == mem_list[j].memory_id) {
					dev_info(adev->dev,
						 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
						 err_cnt, err_type_name,
						 reg_list[i].block_name,
						 instance, mem_list[j].name);
					break;
				}
			}
		}
	}
}
void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   uint32_t instance)
{
	uint32_t err_status_lo_offset, err_status_hi_offset;
	uint32_t i;

	for (i = 0; i < reg_list_size; i++) {
		err_status_lo_offset =
			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
						    reg_list[i].seg_lo, reg_list[i].reg_lo);
		err_status_hi_offset =
			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
						    reg_list[i].seg_hi, reg_list[i].reg_hi);
		WREG32(err_status_lo_offset, 0);
		WREG32(err_status_hi_offset, 0);
	}
}
int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
{
	memset(err_data, 0, sizeof(*err_data));

	INIT_LIST_HEAD(&err_data->err_node_list);

	return 0;
}
static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
{
	if (!err_node)
		return;

	list_del(&err_node->node);
	kvfree(err_node);
}
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
{
	struct ras_err_node *err_node, *tmp;

	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
		amdgpu_ras_error_node_release(err_node);
}
static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
				struct amdgpu_smuio_mcm_config_info *mcm_info)
{
	struct ras_err_node *err_node;
	struct amdgpu_smuio_mcm_config_info *ref_id;

	if (!err_data || !mcm_info)
		return NULL;

	for_each_ras_error(err_node, err_data) {
		ref_id = &err_node->err_info.mcm_info;

		if (mcm_info->socket_id == ref_id->socket_id &&
		    mcm_info->die_id == ref_id->die_id)
			return err_node;
	}

	return NULL;
}
static struct ras_err_node *amdgpu_ras_error_node_new(void)
{
	struct ras_err_node *err_node;

	err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
	if (!err_node)
		return NULL;

	INIT_LIST_HEAD(&err_node->node);

	return err_node;
}
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
				struct amdgpu_smuio_mcm_config_info *mcm_info)
{
	struct ras_err_node *err_node;

	err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info);
	if (err_node)
		return &err_node->err_info;

	err_node = amdgpu_ras_error_node_new();
	if (!err_node)
		return NULL;

	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));

	err_data->err_list_count++;
	list_add_tail(&err_node->node, &err_data->err_node_list);

	return &err_node->err_info;
}
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
		struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
{
	struct ras_err_info *err_info;

	if (!err_data || !mcm_info)
		return -EINVAL;

	if (!count)
		return 0;

	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
	if (!err_info)
		return -EINVAL;

	err_info->ue_count += count;
	err_data->ue_count += count;

	return 0;
}
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
		struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
{
	struct ras_err_info *err_info;

	if (!err_data || !mcm_info)
		return -EINVAL;

	if (!count)
		return 0;

	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
	if (!err_info)
		return -EINVAL;

	err_info->ce_count += count;
	err_data->ce_count += count;

	return 0;
}