]>
Commit | Line | Data |
---|---|---|
335f7cc0 SL |
1 | From 14d176eb96bb46d45988b6187bf58eabd712c606 Mon Sep 17 00:00:00 2001 |
2 | From: Sasha Levin <sashal@kernel.org> | |
3 | Date: Wed, 10 Jan 2024 16:13:50 +0800 | |
4 | Subject: drm/amdgpu: Skip do PCI error slot reset during RAS recovery | |
5 | ||
6 | From: Stanley.Yang <Stanley.Yang@amd.com> | |
7 | ||
8 | [ Upstream commit 601429cca96b4af3be44172c3b64e4228515dbe1 ] | |
9 | ||
10 | Why: | |
11 | The PCI error slot reset may be triggered after injecting UEs to UMC multiple times, which | |
12 | caused a system hang. | |
13 | [ 557.371857] amdgpu 0000:af:00.0: amdgpu: GPU reset succeeded, trying to resume | |
14 | [ 557.373718] [drm] PCIE GART of 512M enabled. | |
15 | [ 557.373722] [drm] PTB located at 0x0000031FED700000 | |
16 | [ 557.373788] [drm] VRAM is lost due to GPU reset! | |
17 | [ 557.373789] [drm] PSP is resuming... | |
18 | [ 557.547012] mlx5_core 0000:55:00.0: mlx5_pci_err_detected Device state = 1 pci_status: 0. Exit, result = 3, need reset | |
19 | [ 557.547067] [drm] PCI error: detected callback, state(1)!! | |
20 | [ 557.547069] [drm] No support for XGMI hive yet... | |
21 | [ 557.548125] mlx5_core 0000:55:00.0: mlx5_pci_slot_reset Device state = 1 pci_status: 0. Enter | |
22 | [ 557.607763] mlx5_core 0000:55:00.0: wait vital counter value 0x16b5b after 1 iterations | |
23 | [ 557.607777] mlx5_core 0000:55:00.0: mlx5_pci_slot_reset Device state = 1 pci_status: 1. Exit, err = 0, result = 5, recovered | |
24 | [ 557.610492] [drm] PCI error: slot reset callback!! | |
25 | ... | |
26 | [ 560.689382] amdgpu 0000:3f:00.0: amdgpu: GPU reset(2) succeeded! | |
27 | [ 560.689546] amdgpu 0000:5a:00.0: amdgpu: GPU reset(2) succeeded! | |
28 | [ 560.689562] general protection fault, probably for non-canonical address 0x5f080b54534f611f: 0000 [#1] SMP NOPTI | |
29 | [ 560.701008] CPU: 16 PID: 2361 Comm: kworker/u448:9 Tainted: G OE 5.15.0-91-generic #101-Ubuntu | |
30 | [ 560.712057] Hardware name: Microsoft C278A/C278A, BIOS C2789.5.BS.1C11.AG.1 11/08/2023 | |
31 | [ 560.720959] Workqueue: amdgpu-reset-hive amdgpu_ras_do_recovery [amdgpu] | |
32 | [ 560.728887] RIP: 0010:amdgpu_device_gpu_recover.cold+0xbf1/0xcf5 [amdgpu] | |
33 | [ 560.736891] Code: ff 41 89 c6 e9 1b ff ff ff 44 0f b6 45 b0 e9 4f ff ff ff be 01 00 00 00 4c 89 e7 e8 76 c9 8b ff 44 0f b6 45 b0 e9 3c fd ff ff <48> 83 ba 18 02 00 00 00 0f 84 6a f8 ff ff 48 8d 7a 78 be 01 00 00 | |
34 | [ 560.757967] RSP: 0018:ffa0000032e53d80 EFLAGS: 00010202 | |
35 | [ 560.763848] RAX: ffa00000001dfd10 RBX: ffa0000000197090 RCX: ffa0000032e53db0 | |
36 | [ 560.771856] RDX: 5f080b54534f5f07 RSI: 0000000000000000 RDI: ff11000128100010 | |
37 | [ 560.779867] RBP: ffa0000032e53df0 R08: 0000000000000000 R09: ffffffffffe77f08 | |
38 | [ 560.787879] R10: 0000000000ffff0a R11: 0000000000000001 R12: 0000000000000000 | |
39 | [ 560.795889] R13: ffa0000032e53e00 R14: 0000000000000000 R15: 0000000000000000 | |
40 | [ 560.803889] FS: 0000000000000000(0000) GS:ff11007e7e800000(0000) knlGS:0000000000000000 | |
41 | [ 560.812973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 | |
42 | [ 560.819422] CR2: 000055a04c118e68 CR3: 0000000007410005 CR4: 0000000000771ee0 | |
43 | [ 560.827433] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 | |
44 | [ 560.835433] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 | |
45 | [ 560.843444] PKRU: 55555554 | |
46 | [ 560.846480] Call Trace: | |
47 | [ 560.849225] <TASK> | |
48 | [ 560.851580] ? show_trace_log_lvl+0x1d6/0x2ea | |
49 | [ 560.856488] ? show_trace_log_lvl+0x1d6/0x2ea | |
50 | [ 560.861379] ? amdgpu_ras_do_recovery+0x1b2/0x210 [amdgpu] | |
51 | [ 560.867778] ? show_regs.part.0+0x23/0x29 | |
52 | [ 560.872293] ? __die_body.cold+0x8/0xd | |
53 | [ 560.876502] ? die_addr+0x3e/0x60 | |
54 | [ 560.880238] ? exc_general_protection+0x1c5/0x410 | |
55 | [ 560.885532] ? asm_exc_general_protection+0x27/0x30 | |
56 | [ 560.891025] ? amdgpu_device_gpu_recover.cold+0xbf1/0xcf5 [amdgpu] | |
57 | [ 560.898323] amdgpu_ras_do_recovery+0x1b2/0x210 [amdgpu] | |
58 | [ 560.904520] process_one_work+0x228/0x3d0 | |
59 | How: | |
60 | In RAS recovery, a mode-1 reset is issued from RAS fatal error handling and is expected | |
61 | to reset all the nodes in a hive, so there is no need to issue another mode-1 reset during this procedure. | |
62 | ||
63 | Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> | |
64 | Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> | |
65 | Signed-off-by: Alex Deucher <alexander.deucher@amd.com> | |
66 | Signed-off-by: Sasha Levin <sashal@kernel.org> | |
67 | --- | |
68 | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ | |
69 | 1 file changed, 14 insertions(+) | |
70 | ||
71 | diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
72 | index 1fbaf7b81d69a..d0afb9ba3789c 100644 | |
73 | --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
74 | +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
75 | @@ -6109,6 +6109,20 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) | |
76 | struct amdgpu_reset_context reset_context; | |
77 | u32 memsize; | |
78 | struct list_head device_list; | |
79 | + struct amdgpu_hive_info *hive; | |
80 | + int hive_ras_recovery = 0; | |
81 | + struct amdgpu_ras *ras; | |
82 | + | |
83 | + /* PCI error slot reset should be skipped during RAS recovery */ | |
84 | + hive = amdgpu_get_xgmi_hive(adev); | |
85 | + if (hive) { | |
86 | + hive_ras_recovery = atomic_read(&hive->ras_recovery); | |
87 | + amdgpu_put_xgmi_hive(hive); | |
88 | + } | |
89 | + ras = amdgpu_ras_get_context(adev); | |
90 | + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) && | |
91 | + ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) | |
92 | + return PCI_ERS_RESULT_RECOVERED; | |
93 | ||
94 | DRM_INFO("PCI error: slot reset callback!!\n"); | |
95 | ||
96 | -- | |
97 | 2.43.0 | |
98 |