]>
Commit | Line | Data |
---|---|---|
fb30fc59 SL |
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/list.h> | |
25 | #include "amdgpu.h" | |
5183411b | 26 | #include "amdgpu_xgmi.h" |
029fbd43 | 27 | #include "amdgpu_ras.h" |
18f36157 | 28 | #include "soc15.h" |
24f9aacf | 29 | #include "df/df_3_6_offset.h" |
18f36157 HZ |
30 | #include "xgmi/xgmi_4_0_0_smn.h" |
31 | #include "xgmi/xgmi_4_0_0_sh_mask.h" | |
442d61af | 32 | #include "xgmi/xgmi_6_1_0_sh_mask.h" |
18f36157 HZ |
33 | #include "wafl/wafl2_4_0_0_smn.h" |
34 | #include "wafl/wafl2_4_0_0_sh_mask.h" | |
fb30fc59 | 35 | |
cfbb6b00 AG |
36 | #include "amdgpu_reset.h" |
37 | ||
/*
 * Addresses of the first XGMI3X16 and GOPX1 PCS error registers; the
 * per-instance tables in this file are built from these bases.
 */
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS		0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS			0x12200210
#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK	0x11a00218
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK	0x12200218

/* Initial hi_req_count given to every newly created hive. */
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

/* Serializes lookups in and additions to xgmi_hive_list. */
static DEFINE_MUTEX(xgmi_mutex);

/* All known hives, linked through amdgpu_hive_info.node. */
static LIST_HEAD(xgmi_hive_list);
fb30fc59 | 48 | |
18f36157 HZ |
49 | static const int xgmi_pcs_err_status_reg_vg20[] = { |
50 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, | |
51 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, | |
52 | }; | |
53 | ||
54 | static const int wafl_pcs_err_status_reg_vg20[] = { | |
55 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, | |
56 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, | |
57 | }; | |
58 | ||
a61f41b1 HZ |
59 | static const int xgmi_pcs_err_status_reg_arct[] = { |
60 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, | |
61 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, | |
62 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000, | |
63 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000, | |
64 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000, | |
65 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000, | |
66 | }; | |
67 | ||
68 | /* same as vg20*/ | |
69 | static const int wafl_pcs_err_status_reg_arct[] = { | |
70 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, | |
71 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, | |
72 | }; | |
73 | ||
3c4ff2dc JC |
74 | static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { |
75 | smnPCS_XGMI3X16_PCS_ERROR_STATUS, | |
76 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, | |
77 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, | |
78 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, | |
79 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, | |
80 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, | |
81 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, | |
82 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 | |
83 | }; | |
84 | ||
828fc79d SY |
85 | static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = { |
86 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, | |
87 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000, | |
88 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000, | |
89 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000, | |
90 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000, | |
91 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000, | |
92 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000, | |
93 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000 | |
94 | }; | |
95 | ||
3c4ff2dc JC |
96 | static const int walf_pcs_err_status_reg_aldebaran[] = { |
97 | smnPCS_GOPX1_PCS_ERROR_STATUS, | |
98 | smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 | |
99 | }; | |
100 | ||
828fc79d SY |
101 | static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { |
102 | smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK, | |
103 | smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 | |
104 | }; | |
105 | ||
20238a2c TZ |
106 | static const int xgmi3x16_pcs_err_status_reg_v6_4[] = { |
107 | smnPCS_XGMI3X16_PCS_ERROR_STATUS, | |
108 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000 | |
109 | }; | |
110 | ||
111 | static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = { | |
112 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, | |
113 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 | |
114 | }; | |
115 | ||
27d80f7d YW |
116 | static const u64 xgmi_v6_4_0_mca_base_array[] = { |
117 | 0x11a09200, | |
118 | 0x11b09200, | |
119 | }; | |
120 | ||
121 | static const char *xgmi_v6_4_0_ras_error_code_ext[32] = { | |
122 | [0x00] = "XGMI PCS DataLossErr", | |
123 | [0x01] = "XGMI PCS TrainingErr", | |
124 | [0x02] = "XGMI PCS FlowCtrlAckErr", | |
125 | [0x03] = "XGMI PCS RxFifoUnderflowErr", | |
126 | [0x04] = "XGMI PCS RxFifoOverflowErr", | |
127 | [0x05] = "XGMI PCS CRCErr", | |
128 | [0x06] = "XGMI PCS BERExceededErr", | |
129 | [0x07] = "XGMI PCS TxMetaDataErr", | |
130 | [0x08] = "XGMI PCS ReplayBufParityErr", | |
131 | [0x09] = "XGMI PCS DataParityErr", | |
132 | [0x0a] = "XGMI PCS ReplayFifoOverflowErr", | |
133 | [0x0b] = "XGMI PCS ReplayFifoUnderflowErr", | |
134 | [0x0c] = "XGMI PCS ElasticFifoOverflowErr", | |
135 | [0x0d] = "XGMI PCS DeskewErr", | |
136 | [0x0e] = "XGMI PCS FlowCtrlCRCErr", | |
137 | [0x0f] = "XGMI PCS DataStartupLimitErr", | |
138 | [0x10] = "XGMI PCS FCInitTimeoutErr", | |
139 | [0x11] = "XGMI PCS RecoveryTimeoutErr", | |
140 | [0x12] = "XGMI PCS ReadySerialTimeoutErr", | |
141 | [0x13] = "XGMI PCS ReadySerialAttemptErr", | |
142 | [0x14] = "XGMI PCS RecoveryAttemptErr", | |
143 | [0x15] = "XGMI PCS RecoveryRelockAttemptErr", | |
144 | [0x16] = "XGMI PCS ReplayAttemptErr", | |
145 | [0x17] = "XGMI PCS SyncHdrErr", | |
146 | [0x18] = "XGMI PCS TxReplayTimeoutErr", | |
147 | [0x19] = "XGMI PCS RxReplayTimeoutErr", | |
148 | [0x1a] = "XGMI PCS LinkSubTxTimeoutErr", | |
149 | [0x1b] = "XGMI PCS LinkSubRxTimeoutErr", | |
150 | [0x1c] = "XGMI PCS RxCMDPktErr", | |
151 | }; | |
152 | ||
18f36157 HZ |
153 | static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { |
154 | {"XGMI PCS DataLossErr", | |
155 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, | |
156 | {"XGMI PCS TrainingErr", | |
157 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)}, | |
158 | {"XGMI PCS CRCErr", | |
159 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)}, | |
160 | {"XGMI PCS BERExceededErr", | |
161 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)}, | |
162 | {"XGMI PCS TxMetaDataErr", | |
163 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)}, | |
164 | {"XGMI PCS ReplayBufParityErr", | |
165 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)}, | |
166 | {"XGMI PCS DataParityErr", | |
167 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)}, | |
168 | {"XGMI PCS ReplayFifoOverflowErr", | |
169 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, | |
170 | {"XGMI PCS ReplayFifoUnderflowErr", | |
171 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, | |
172 | {"XGMI PCS ElasticFifoOverflowErr", | |
173 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, | |
174 | {"XGMI PCS DeskewErr", | |
175 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)}, | |
176 | {"XGMI PCS DataStartupLimitErr", | |
177 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)}, | |
178 | {"XGMI PCS FCInitTimeoutErr", | |
179 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, | |
180 | {"XGMI PCS RecoveryTimeoutErr", | |
181 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, | |
182 | {"XGMI PCS ReadySerialTimeoutErr", | |
183 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, | |
184 | {"XGMI PCS ReadySerialAttemptErr", | |
185 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, | |
186 | {"XGMI PCS RecoveryAttemptErr", | |
187 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, | |
188 | {"XGMI PCS RecoveryRelockAttemptErr", | |
189 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, | |
190 | }; | |
191 | ||
192 | static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { | |
193 | {"WAFL PCS DataLossErr", | |
194 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)}, | |
195 | {"WAFL PCS TrainingErr", | |
196 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)}, | |
197 | {"WAFL PCS CRCErr", | |
198 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)}, | |
199 | {"WAFL PCS BERExceededErr", | |
200 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)}, | |
201 | {"WAFL PCS TxMetaDataErr", | |
202 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)}, | |
203 | {"WAFL PCS ReplayBufParityErr", | |
204 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)}, | |
205 | {"WAFL PCS DataParityErr", | |
206 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)}, | |
207 | {"WAFL PCS ReplayFifoOverflowErr", | |
208 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, | |
209 | {"WAFL PCS ReplayFifoUnderflowErr", | |
210 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, | |
211 | {"WAFL PCS ElasticFifoOverflowErr", | |
212 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, | |
213 | {"WAFL PCS DeskewErr", | |
214 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)}, | |
215 | {"WAFL PCS DataStartupLimitErr", | |
216 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)}, | |
217 | {"WAFL PCS FCInitTimeoutErr", | |
218 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)}, | |
219 | {"WAFL PCS RecoveryTimeoutErr", | |
220 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, | |
221 | {"WAFL PCS ReadySerialTimeoutErr", | |
222 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, | |
223 | {"WAFL PCS ReadySerialAttemptErr", | |
224 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, | |
225 | {"WAFL PCS RecoveryAttemptErr", | |
226 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)}, | |
227 | {"WAFL PCS RecoveryRelockAttemptErr", | |
228 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, | |
229 | }; | |
230 | ||
442d61af SY |
231 | static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = { |
232 | {"XGMI3X16 PCS DataLossErr", | |
233 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)}, | |
234 | {"XGMI3X16 PCS TrainingErr", | |
235 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)}, | |
236 | {"XGMI3X16 PCS FlowCtrlAckErr", | |
237 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)}, | |
238 | {"XGMI3X16 PCS RxFifoUnderflowErr", | |
239 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)}, | |
240 | {"XGMI3X16 PCS RxFifoOverflowErr", | |
241 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)}, | |
242 | {"XGMI3X16 PCS CRCErr", | |
243 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)}, | |
244 | {"XGMI3X16 PCS BERExceededErr", | |
245 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)}, | |
246 | {"XGMI3X16 PCS TxVcidDataErr", | |
247 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)}, | |
248 | {"XGMI3X16 PCS ReplayBufParityErr", | |
249 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)}, | |
250 | {"XGMI3X16 PCS DataParityErr", | |
251 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)}, | |
252 | {"XGMI3X16 PCS ReplayFifoOverflowErr", | |
253 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, | |
254 | {"XGMI3X16 PCS ReplayFifoUnderflowErr", | |
255 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, | |
256 | {"XGMI3X16 PCS ElasticFifoOverflowErr", | |
257 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, | |
258 | {"XGMI3X16 PCS DeskewErr", | |
259 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)}, | |
260 | {"XGMI3X16 PCS FlowCtrlCRCErr", | |
261 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)}, | |
262 | {"XGMI3X16 PCS DataStartupLimitErr", | |
263 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)}, | |
264 | {"XGMI3X16 PCS FCInitTimeoutErr", | |
265 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, | |
266 | {"XGMI3X16 PCS RecoveryTimeoutErr", | |
267 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, | |
268 | {"XGMI3X16 PCS ReadySerialTimeoutErr", | |
269 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, | |
270 | {"XGMI3X16 PCS ReadySerialAttemptErr", | |
271 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, | |
272 | {"XGMI3X16 PCS RecoveryAttemptErr", | |
273 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, | |
274 | {"XGMI3X16 PCS RecoveryRelockAttemptErr", | |
275 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, | |
276 | {"XGMI3X16 PCS ReplayAttemptErr", | |
277 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)}, | |
278 | {"XGMI3X16 PCS SyncHdrErr", | |
279 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)}, | |
280 | {"XGMI3X16 PCS TxReplayTimeoutErr", | |
281 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)}, | |
282 | {"XGMI3X16 PCS RxReplayTimeoutErr", | |
283 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)}, | |
284 | {"XGMI3X16 PCS LinkSubTxTimeoutErr", | |
285 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)}, | |
286 | {"XGMI3X16 PCS LinkSubRxTimeoutErr", | |
287 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)}, | |
288 | {"XGMI3X16 PCS RxCMDPktErr", | |
289 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)}, | |
290 | }; | |
291 | ||
1c1e53f7 TSD |
292 | /** |
293 | * DOC: AMDGPU XGMI Support | |
294 | * | |
295 | * XGMI is a high speed interconnect that joins multiple GPU cards | |
296 | * into a homogeneous memory space that is organized by a collective | |
297 | * hive ID and individual node IDs, both of which are 64-bit numbers. | |
298 | * | |
299 | * The file xgmi_device_id contains the unique per GPU device ID and | |
300 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. | |
301 | * | |
302 | * Inside the device directory a sub-directory 'xgmi_hive_info' is | |
303 | * created which contains the hive ID and the list of nodes. | |
304 | * | |
305 | * The hive ID is stored in: | |
306 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id | |
307 | * | |
308 | * The node information is stored in numbered directories: | |
309 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id | |
310 | * | |
311 | * Each device has its own xgmi_hive_info directory with a mirror | |
312 | * set of node sub-directories. | |
313 | * | |
314 | * The XGMI memory space is built by contiguously adding the power of | |
315 | * two padded VRAM space from each node to each other. | |
316 | * | |
317 | */ | |
318 | ||
d95e8e97 DL |
319 | static struct attribute amdgpu_xgmi_hive_id = { |
320 | .name = "xgmi_hive_id", | |
321 | .mode = S_IRUGO | |
322 | }; | |
1c1e53f7 | 323 | |
d95e8e97 DL |
324 | static struct attribute *amdgpu_xgmi_hive_attrs[] = { |
325 | &amdgpu_xgmi_hive_id, | |
326 | NULL | |
327 | }; | |
7ff61cdc | 328 | ATTRIBUTE_GROUPS(amdgpu_xgmi_hive); |
b1fa8c89 | 329 | |
d95e8e97 DL |
330 | static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, |
331 | struct attribute *attr, char *buf) | |
b1fa8c89 | 332 | { |
d95e8e97 DL |
333 | struct amdgpu_hive_info *hive = container_of( |
334 | kobj, struct amdgpu_hive_info, kobj); | |
b1fa8c89 | 335 | |
d95e8e97 DL |
336 | if (attr == &amdgpu_xgmi_hive_id) |
337 | return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); | |
b1fa8c89 | 338 | |
d95e8e97 | 339 | return 0; |
b1fa8c89 AG |
340 | } |
341 | ||
d95e8e97 | 342 | static void amdgpu_xgmi_hive_release(struct kobject *kobj) |
b1fa8c89 | 343 | { |
d95e8e97 DL |
344 | struct amdgpu_hive_info *hive = container_of( |
345 | kobj, struct amdgpu_hive_info, kobj); | |
346 | ||
cfbb6b00 AG |
347 | amdgpu_reset_put_reset_domain(hive->reset_domain); |
348 | hive->reset_domain = NULL; | |
349 | ||
d95e8e97 DL |
350 | mutex_destroy(&hive->hive_lock); |
351 | kfree(hive); | |
b1fa8c89 AG |
352 | } |
353 | ||
d95e8e97 DL |
354 | static const struct sysfs_ops amdgpu_xgmi_hive_ops = { |
355 | .show = amdgpu_xgmi_show_attrs, | |
356 | }; | |
357 | ||
b2daaa93 | 358 | static const struct kobj_type amdgpu_xgmi_hive_type = { |
d95e8e97 DL |
359 | .release = amdgpu_xgmi_hive_release, |
360 | .sysfs_ops = &amdgpu_xgmi_hive_ops, | |
7ff61cdc | 361 | .default_groups = amdgpu_xgmi_hive_groups, |
d95e8e97 DL |
362 | }; |
363 | ||
b1fa8c89 AG |
364 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, |
365 | struct device_attribute *attr, | |
366 | char *buf) | |
367 | { | |
368 | struct drm_device *ddev = dev_get_drvdata(dev); | |
1348969a | 369 | struct amdgpu_device *adev = drm_to_adev(ddev); |
b1fa8c89 | 370 | |
36000c7a | 371 | return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id); |
b1fa8c89 AG |
372 | |
373 | } | |
374 | ||
cd956e75 MG |
375 | static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev, |
376 | struct device_attribute *attr, | |
377 | char *buf) | |
378 | { | |
379 | struct drm_device *ddev = dev_get_drvdata(dev); | |
380 | struct amdgpu_device *adev = drm_to_adev(ddev); | |
381 | ||
382 | return sysfs_emit(buf, "%u\n", adev->gmc.xgmi.physical_node_id); | |
383 | ||
384 | } | |
385 | ||
8d8ffe37 SZ |
386 | static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev, |
387 | struct device_attribute *attr, | |
388 | char *buf) | |
389 | { | |
390 | struct drm_device *ddev = dev_get_drvdata(dev); | |
391 | struct amdgpu_device *adev = drm_to_adev(ddev); | |
392 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
393 | int i; | |
394 | ||
395 | for (i = 0; i < top->num_nodes; i++) | |
396 | sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_hops); | |
397 | ||
398 | return sysfs_emit(buf, "%s\n", buf); | |
399 | } | |
400 | ||
401 | static ssize_t amdgpu_xgmi_show_num_links(struct device *dev, | |
402 | struct device_attribute *attr, | |
403 | char *buf) | |
404 | { | |
405 | struct drm_device *ddev = dev_get_drvdata(dev); | |
406 | struct amdgpu_device *adev = drm_to_adev(ddev); | |
407 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
408 | int i; | |
409 | ||
410 | for (i = 0; i < top->num_nodes; i++) | |
411 | sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_links); | |
412 | ||
413 | return sysfs_emit(buf, "%s\n", buf); | |
414 | } | |
415 | ||
24f9aacf JK |
416 | #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) |
417 | static ssize_t amdgpu_xgmi_show_error(struct device *dev, | |
418 | struct device_attribute *attr, | |
419 | char *buf) | |
420 | { | |
421 | struct drm_device *ddev = dev_get_drvdata(dev); | |
1348969a | 422 | struct amdgpu_device *adev = drm_to_adev(ddev); |
24f9aacf JK |
423 | uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; |
424 | uint64_t fica_out; | |
425 | unsigned int error_count = 0; | |
426 | ||
427 | ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); | |
428 | ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); | |
b1fa8c89 | 429 | |
cace4bff HZ |
430 | if ((!adev->df.funcs) || |
431 | (!adev->df.funcs->get_fica) || | |
432 | (!adev->df.funcs->set_fica)) | |
433 | return -EINVAL; | |
434 | ||
bdf84a80 | 435 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); |
24f9aacf JK |
436 | if (fica_out != 0x1f) |
437 | pr_err("xGMI error counters not enabled!\n"); | |
438 | ||
bdf84a80 | 439 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); |
24f9aacf JK |
440 | |
441 | if ((fica_out & 0xffff) == 2) | |
442 | error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); | |
b1fa8c89 | 443 | |
bdf84a80 | 444 | adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); |
24f9aacf | 445 | |
36000c7a | 446 | return sysfs_emit(buf, "%u\n", error_count); |
24f9aacf JK |
447 | } |
448 | ||
449 | ||
450 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); | |
cd956e75 | 451 | static DEVICE_ATTR(xgmi_physical_id, 0444, amdgpu_xgmi_show_physical_id, NULL); |
24f9aacf | 452 | static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); |
8d8ffe37 SZ |
453 | static DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL); |
454 | static DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL); | |
b1fa8c89 AG |
455 | |
456 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | |
457 | struct amdgpu_hive_info *hive) | |
458 | { | |
459 | int ret = 0; | |
460 | char node[10] = { 0 }; | |
461 | ||
462 | /* Create xgmi device id file */ | |
463 | ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); | |
464 | if (ret) { | |
465 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); | |
466 | return ret; | |
467 | } | |
468 | ||
cd956e75 MG |
469 | ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id); |
470 | if (ret) { | |
471 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_physical_id\n"); | |
472 | return ret; | |
473 | } | |
474 | ||
24f9aacf JK |
475 | /* Create xgmi error file */ |
476 | ret = device_create_file(adev->dev, &dev_attr_xgmi_error); | |
477 | if (ret) | |
478 | pr_err("failed to create xgmi_error\n"); | |
479 | ||
8d8ffe37 SZ |
480 | /* Create xgmi num hops file */ |
481 | ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops); | |
482 | if (ret) | |
483 | pr_err("failed to create xgmi_num_hops\n"); | |
484 | ||
485 | /* Create xgmi num links file */ | |
486 | ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links); | |
487 | if (ret) | |
488 | pr_err("failed to create xgmi_num_links\n"); | |
24f9aacf | 489 | |
b1fa8c89 | 490 | /* Create sysfs link to hive info folder on the first device */ |
d95e8e97 DL |
491 | if (hive->kobj.parent != (&adev->dev->kobj)) { |
492 | ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj, | |
b1fa8c89 AG |
493 | "xgmi_hive_info"); |
494 | if (ret) { | |
495 | dev_err(adev->dev, "XGMI: Failed to create link to hive info"); | |
496 | goto remove_file; | |
497 | } | |
498 | } | |
499 | ||
d95e8e97 | 500 | sprintf(node, "node%d", atomic_read(&hive->number_devices)); |
b1fa8c89 | 501 | /* Create sysfs link form the hive folder to yourself */ |
d95e8e97 | 502 | ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node); |
b1fa8c89 AG |
503 | if (ret) { |
504 | dev_err(adev->dev, "XGMI: Failed to create link from hive info"); | |
505 | goto remove_link; | |
506 | } | |
507 | ||
508 | goto success; | |
509 | ||
510 | ||
511 | remove_link: | |
4a580877 | 512 | sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); |
b1fa8c89 AG |
513 | |
514 | remove_file: | |
515 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
cd956e75 | 516 | device_remove_file(adev->dev, &dev_attr_xgmi_physical_id); |
8d8ffe37 SZ |
517 | device_remove_file(adev->dev, &dev_attr_xgmi_error); |
518 | device_remove_file(adev->dev, &dev_attr_xgmi_num_hops); | |
519 | device_remove_file(adev->dev, &dev_attr_xgmi_num_links); | |
b1fa8c89 AG |
520 | |
521 | success: | |
522 | return ret; | |
523 | } | |
524 | ||
525 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, | |
526 | struct amdgpu_hive_info *hive) | |
527 | { | |
a89b5dae JZ |
528 | char node[10]; |
529 | memset(node, 0, sizeof(node)); | |
530 | ||
b1fa8c89 | 531 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); |
cd956e75 | 532 | device_remove_file(adev->dev, &dev_attr_xgmi_physical_id); |
a89b5dae | 533 | device_remove_file(adev->dev, &dev_attr_xgmi_error); |
8d8ffe37 SZ |
534 | device_remove_file(adev->dev, &dev_attr_xgmi_num_hops); |
535 | device_remove_file(adev->dev, &dev_attr_xgmi_num_links); | |
a89b5dae | 536 | |
d95e8e97 | 537 | if (hive->kobj.parent != (&adev->dev->kobj)) |
a89b5dae JZ |
538 | sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); |
539 | ||
d95e8e97 DL |
540 | sprintf(node, "node%d", atomic_read(&hive->number_devices)); |
541 | sysfs_remove_link(&hive->kobj, node); | |
a89b5dae | 542 | |
b1fa8c89 AG |
543 | } |
544 | ||
545 | ||
546 | ||
d95e8e97 | 547 | struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) |
fb30fc59 | 548 | { |
be8901c2 | 549 | struct amdgpu_hive_info *hive = NULL; |
d95e8e97 | 550 | int ret; |
fb30fc59 SL |
551 | |
552 | if (!adev->gmc.xgmi.hive_id) | |
553 | return NULL; | |
22d6575b | 554 | |
d95e8e97 DL |
555 | if (adev->hive) { |
556 | kobject_get(&adev->hive->kobj); | |
557 | return adev->hive; | |
558 | } | |
559 | ||
22d6575b TSD |
560 | mutex_lock(&xgmi_mutex); |
561 | ||
be8901c2 KW |
562 | list_for_each_entry(hive, &xgmi_hive_list, node) { |
563 | if (hive->hive_id == adev->gmc.xgmi.hive_id) | |
564 | goto pro_end; | |
fb30fc59 | 565 | } |
d95e8e97 DL |
566 | |
567 | hive = kzalloc(sizeof(*hive), GFP_KERNEL); | |
568 | if (!hive) { | |
569 | dev_err(adev->dev, "XGMI: allocation failed\n"); | |
50fbe0cc | 570 | ret = -ENOMEM; |
d95e8e97 DL |
571 | hive = NULL; |
572 | goto pro_end; | |
22d6575b | 573 | } |
fb30fc59 SL |
574 | |
575 | /* initialize new hive if not exist */ | |
d95e8e97 DL |
576 | ret = kobject_init_and_add(&hive->kobj, |
577 | &amdgpu_xgmi_hive_type, | |
578 | &adev->dev->kobj, | |
579 | "%s", "xgmi_hive_info"); | |
580 | if (ret) { | |
581 | dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); | |
7b833d68 | 582 | kobject_put(&hive->kobj); |
d95e8e97 DL |
583 | hive = NULL; |
584 | goto pro_end; | |
b1fa8c89 AG |
585 | } |
586 | ||
46c67660 | 587 | /** |
588 | * Only init hive->reset_domain for none SRIOV configuration. For SRIOV, | |
589 | * Host driver decide how to reset the GPU either through FLR or chain reset. | |
590 | * Guest side will get individual notifications from the host for the FLR | |
591 | * if necessary. | |
592 | */ | |
593 | if (!amdgpu_sriov_vf(adev)) { | |
cfbb6b00 AG |
594 | /** |
595 | * Avoid recreating reset domain when hive is reconstructed for the case | |
46c67660 | 596 | * of reset the devices in the XGMI hive during probe for passthrough GPU |
cfbb6b00 AG |
597 | * See https://www.spinics.net/lists/amd-gfx/msg58836.html |
598 | */ | |
46c67660 | 599 | if (adev->reset_domain->type != XGMI_HIVE) { |
600 | hive->reset_domain = | |
601 | amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); | |
7f89f997 YL |
602 | if (!hive->reset_domain) { |
603 | dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); | |
604 | ret = -ENOMEM; | |
605 | kobject_put(&hive->kobj); | |
7f89f997 YL |
606 | hive = NULL; |
607 | goto pro_end; | |
608 | } | |
46c67660 | 609 | } else { |
610 | amdgpu_reset_get_reset_domain(adev->reset_domain); | |
611 | hive->reset_domain = adev->reset_domain; | |
612 | } | |
a4c63caf AG |
613 | } |
614 | ||
d95e8e97 DL |
615 | hive->hive_id = adev->gmc.xgmi.hive_id; |
616 | INIT_LIST_HEAD(&hive->device_list); | |
617 | INIT_LIST_HEAD(&hive->node); | |
618 | mutex_init(&hive->hive_lock); | |
d95e8e97 DL |
619 | atomic_set(&hive->number_devices, 0); |
620 | task_barrier_init(&hive->tb); | |
621 | hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; | |
622 | hive->hi_req_gpu = NULL; | |
a4c63caf | 623 | |
d84a430d JK |
624 | /* |
625 | * hive pstate on boot is high in vega20 so we have to go to low | |
626 | * pstate on after boot. | |
627 | */ | |
d95e8e97 DL |
628 | hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; |
629 | list_add_tail(&hive->node, &xgmi_hive_list); | |
630 | ||
631 | pro_end: | |
632 | if (hive) | |
633 | kobject_get(&hive->kobj); | |
22d6575b | 634 | mutex_unlock(&xgmi_mutex); |
d95e8e97 DL |
635 | return hive; |
636 | } | |
ed2bf522 | 637 | |
d95e8e97 DL |
638 | void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) |
639 | { | |
640 | if (hive) | |
641 | kobject_put(&hive->kobj); | |
fb30fc59 SL |
642 | } |
643 | ||
df399b06 | 644 | int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) |
645 | { | |
646 | int ret = 0; | |
a9f5f98f HZ |
647 | struct amdgpu_hive_info *hive; |
648 | struct amdgpu_device *request_adev; | |
d84a430d | 649 | bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; |
a9f5f98f | 650 | bool init_low; |
df399b06 | 651 | |
a9f5f98f HZ |
652 | hive = amdgpu_get_xgmi_hive(adev); |
653 | if (!hive) | |
654 | return 0; | |
655 | ||
656 | request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; | |
657 | init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; | |
d95e8e97 | 658 | amdgpu_put_xgmi_hive(hive); |
d84a430d | 659 | /* fw bug so temporarily disable pstate switching */ |
dfe31f25 JK |
660 | return 0; |
661 | ||
662 | if (!hive || adev->asic_type != CHIP_VEGA20) | |
df399b06 | 663 | return 0; |
664 | ||
f1403342 | 665 | mutex_lock(&hive->hive_lock); |
5c5b2ba0 | 666 | |
d84a430d JK |
667 | if (is_hi_req) |
668 | hive->hi_req_count++; | |
669 | else | |
670 | hive->hi_req_count--; | |
671 | ||
672 | /* | |
673 | * Vega20 only needs single peer to request pstate high for the hive to | |
674 | * go high but all peers must request pstate low for the hive to go low | |
675 | */ | |
676 | if (hive->pstate == pstate || | |
677 | (!is_hi_req && hive->hi_req_count && !init_low)) | |
cb5932f8 | 678 | goto out; |
93abb05f | 679 | |
d84a430d | 680 | dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate); |
93abb05f | 681 | |
d84a430d | 682 | ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); |
5c5b2ba0 | 683 | if (ret) { |
d84a430d | 684 | dev_err(request_adev->dev, |
93abb05f | 685 | "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", |
d84a430d JK |
686 | request_adev->gmc.xgmi.node_id, |
687 | request_adev->gmc.xgmi.hive_id, ret); | |
5c5b2ba0 EQ |
688 | goto out; |
689 | } | |
690 | ||
d84a430d JK |
691 | if (init_low) |
692 | hive->pstate = hive->hi_req_count ? | |
693 | hive->pstate : AMDGPU_XGMI_PSTATE_MIN; | |
694 | else { | |
5c5b2ba0 | 695 | hive->pstate = pstate; |
d84a430d JK |
696 | hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? |
697 | adev : NULL; | |
698 | } | |
5c5b2ba0 | 699 | out: |
f1403342 | 700 | mutex_unlock(&hive->hive_lock); |
df399b06 | 701 | return ret; |
702 | } | |
703 | ||
5183411b AG |
704 | int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) |
705 | { | |
29c1ec24 | 706 | int ret; |
5183411b | 707 | |
7c55b598 VC |
708 | if (amdgpu_sriov_vf(adev)) |
709 | return 0; | |
710 | ||
5183411b AG |
711 | /* Each psp need to set the latest topology */ |
712 | ret = psp_xgmi_set_topology_info(&adev->psp, | |
d95e8e97 | 713 | atomic_read(&hive->number_devices), |
da361dd1 | 714 | &adev->psp.xgmi_context.top_info); |
5183411b AG |
715 | if (ret) |
716 | dev_err(adev->dev, | |
717 | "XGMI: Set topology failure on device %llx, hive %llx, ret %d", | |
718 | adev->gmc.xgmi.node_id, | |
719 | adev->gmc.xgmi.hive_id, ret); | |
5183411b AG |
720 | |
721 | return ret; | |
722 | } | |
723 | ||
da361dd1 | 724 | |
4ac5617c JK |
725 | /* |
726 | * NOTE psp_xgmi_node_info.num_hops layout is as follows: | |
727 | * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved) | |
728 | * num_hops[5:3] = reserved | |
729 | * num_hops[2:0] = number of hops | |
730 | */ | |
da361dd1 | 731 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, |
732 | struct amdgpu_device *peer_adev) | |
733 | { | |
734 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
4ac5617c | 735 | uint8_t num_hops_mask = 0x7; |
da361dd1 | 736 | int i; |
737 | ||
738 | for (i = 0 ; i < top->num_nodes; ++i) | |
739 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
4ac5617c | 740 | return top->nodes[i].num_hops & num_hops_mask; |
da361dd1 | 741 | return -EINVAL; |
742 | } | |
743 | ||
3f46c4e9 JK |
744 | int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, |
745 | struct amdgpu_device *peer_adev) | |
746 | { | |
747 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
748 | int i; | |
749 | ||
750 | for (i = 0 ; i < top->num_nodes; ++i) | |
751 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
752 | return top->nodes[i].num_links; | |
753 | return -EINVAL; | |
754 | } | |
755 | ||
44357a1b JK |
756 | /* |
757 | * Devices that support extended data require the entire hive to initialize with | |
758 | * the shared memory buffer flag set. | |
759 | * | |
760 | * Hive locks and conditions apply - see amdgpu_xgmi_add_device | |
761 | */ | |
762 | static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive, | |
763 | bool set_extended_data) | |
764 | { | |
765 | struct amdgpu_device *tmp_adev; | |
766 | int ret; | |
767 | ||
768 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
769 | ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false); | |
770 | if (ret) { | |
771 | dev_err(tmp_adev->dev, | |
772 | "XGMI: Failed to initialize xgmi session for data partition %i\n", | |
773 | set_extended_data); | |
774 | return ret; | |
775 | } | |
776 | ||
777 | } | |
778 | ||
779 | return 0; | |
780 | } | |
781 | ||
fb30fc59 SL |
782 | int amdgpu_xgmi_add_device(struct amdgpu_device *adev) |
783 | { | |
da361dd1 | 784 | struct psp_xgmi_topology_info *top_info; |
fb30fc59 SL |
785 | struct amdgpu_hive_info *hive; |
786 | struct amdgpu_xgmi *entry; | |
5183411b | 787 | struct amdgpu_device *tmp_adev = NULL; |
fb30fc59 | 788 | |
75b2fce2 | 789 | int count = 0, ret = 0; |
fb30fc59 | 790 | |
47622ba0 | 791 | if (!adev->gmc.xgmi.supported) |
fb30fc59 | 792 | return 0; |
47622ba0 | 793 | |
e3c1b071 | 794 | if (!adev->gmc.xgmi.pending_reset && |
795 | amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { | |
44357a1b | 796 | ret = psp_xgmi_initialize(&adev->psp, false, true); |
0b9d3760 HZ |
797 | if (ret) { |
798 | dev_err(adev->dev, | |
799 | "XGMI: Failed to initialize xgmi session\n"); | |
800 | return ret; | |
801 | } | |
802 | ||
2f2eab3a OZ |
803 | ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); |
804 | if (ret) { | |
805 | dev_err(adev->dev, | |
806 | "XGMI: Failed to get hive id\n"); | |
807 | return ret; | |
808 | } | |
379c237e | 809 | |
2f2eab3a OZ |
810 | ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); |
811 | if (ret) { | |
812 | dev_err(adev->dev, | |
813 | "XGMI: Failed to get node id\n"); | |
814 | return ret; | |
815 | } | |
816 | } else { | |
817 | adev->gmc.xgmi.hive_id = 16; | |
818 | adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; | |
379c237e | 819 | } |
fb30fc59 | 820 | |
d95e8e97 | 821 | hive = amdgpu_get_xgmi_hive(adev); |
36ca09a0 | 822 | if (!hive) { |
823 | ret = -EINVAL; | |
824 | dev_err(adev->dev, | |
c1219b94 | 825 | "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n", |
36ca09a0 | 826 | adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); |
fb30fc59 | 827 | goto exit; |
36ca09a0 | 828 | } |
d95e8e97 | 829 | mutex_lock(&hive->hive_lock); |
fb30fc59 | 830 | |
da361dd1 | 831 | top_info = &adev->psp.xgmi_context.top_info; |
5183411b | 832 | |
fb30fc59 SL |
833 | list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); |
834 | list_for_each_entry(entry, &hive->device_list, head) | |
da361dd1 | 835 | top_info->nodes[count++].node_id = entry->node_id; |
e008299e | 836 | top_info->num_nodes = count; |
d95e8e97 | 837 | atomic_set(&hive->number_devices, count); |
fb30fc59 | 838 | |
f33a8770 AG |
839 | task_barrier_add_task(&hive->tb); |
840 | ||
e3c1b071 | 841 | if (!adev->gmc.xgmi.pending_reset && |
842 | amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { | |
75b2fce2 LM |
843 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { |
844 | /* update node list for other device in the hive */ | |
845 | if (tmp_adev != adev) { | |
846 | top_info = &tmp_adev->psp.xgmi_context.top_info; | |
847 | top_info->nodes[count - 1].node_id = | |
848 | adev->gmc.xgmi.node_id; | |
849 | top_info->num_nodes = count; | |
850 | } | |
851 | ret = amdgpu_xgmi_update_topology(hive, tmp_adev); | |
852 | if (ret) | |
94561899 | 853 | goto exit_unlock; |
e008299e | 854 | } |
e008299e | 855 | |
75b2fce2 LM |
856 | /* get latest topology info for each device from psp */ |
857 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
858 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, | |
44357a1b | 859 | &tmp_adev->psp.xgmi_context.top_info, false); |
75b2fce2 LM |
860 | if (ret) { |
861 | dev_err(tmp_adev->dev, | |
862 | "XGMI: Get topology failure on device %llx, hive %llx, ret %d", | |
863 | tmp_adev->gmc.xgmi.node_id, | |
864 | tmp_adev->gmc.xgmi.hive_id, ret); | |
865 | /* To do : continue with some node failed or disable the whole hive */ | |
94561899 | 866 | goto exit_unlock; |
75b2fce2 | 867 | } |
a82c1566 | 868 | } |
44357a1b JK |
869 | |
870 | /* get topology again for hives that support extended data */ | |
871 | if (adev->psp.xgmi_context.supports_extended_data) { | |
872 | ||
873 | /* initialize the hive to get extended data. */ | |
874 | ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true); | |
875 | if (ret) | |
876 | goto exit_unlock; | |
877 | ||
878 | /* get the extended data. */ | |
879 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
880 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, | |
881 | &tmp_adev->psp.xgmi_context.top_info, true); | |
882 | if (ret) { | |
883 | dev_err(tmp_adev->dev, | |
884 | "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d", | |
885 | tmp_adev->gmc.xgmi.node_id, | |
886 | tmp_adev->gmc.xgmi.hive_id, ret); | |
887 | goto exit_unlock; | |
888 | } | |
889 | } | |
890 | ||
891 | /* initialize the hive to get non-extended data for the next round. */ | |
892 | ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false); | |
893 | if (ret) | |
894 | goto exit_unlock; | |
895 | ||
896 | } | |
fb30fc59 | 897 | } |
a82c1566 | 898 | |
e3c1b071 | 899 | if (!ret && !adev->gmc.xgmi.pending_reset) |
b1fa8c89 AG |
900 | ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); |
901 | ||
94561899 | 902 | exit_unlock: |
e008299e | 903 | mutex_unlock(&hive->hive_lock); |
904 | exit: | |
d95e8e97 DL |
905 | if (!ret) { |
906 | adev->hive = hive; | |
b1fa8c89 AG |
907 | dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", |
908 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); | |
d95e8e97 DL |
909 | } else { |
910 | amdgpu_put_xgmi_hive(hive); | |
b1fa8c89 AG |
911 | dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", |
912 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, | |
913 | ret); | |
d95e8e97 | 914 | } |
b1fa8c89 | 915 | |
fb30fc59 SL |
916 | return ret; |
917 | } | |
a82400b5 | 918 | |
0b9d3760 | 919 | int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) |
a82400b5 | 920 | { |
d95e8e97 | 921 | struct amdgpu_hive_info *hive = adev->hive; |
a82400b5 AG |
922 | |
923 | if (!adev->gmc.xgmi.supported) | |
0b9d3760 | 924 | return -EINVAL; |
a82400b5 | 925 | |
a82400b5 | 926 | if (!hive) |
0b9d3760 | 927 | return -EINVAL; |
a82400b5 | 928 | |
d95e8e97 | 929 | mutex_lock(&hive->hive_lock); |
a89b5dae JZ |
930 | task_barrier_rem_task(&hive->tb); |
931 | amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); | |
d95e8e97 DL |
932 | if (hive->hi_req_gpu == adev) |
933 | hive->hi_req_gpu = NULL; | |
934 | list_del(&adev->gmc.xgmi.head); | |
a89b5dae JZ |
935 | mutex_unlock(&hive->hive_lock); |
936 | ||
d95e8e97 DL |
937 | amdgpu_put_xgmi_hive(hive); |
938 | adev->hive = NULL; | |
939 | ||
940 | if (atomic_dec_return(&hive->number_devices) == 0) { | |
941 | /* Remove the hive from global hive list */ | |
942 | mutex_lock(&xgmi_mutex); | |
943 | list_del(&hive->node); | |
944 | mutex_unlock(&xgmi_mutex); | |
945 | ||
946 | amdgpu_put_xgmi_hive(hive); | |
22d6575b | 947 | } |
0b9d3760 | 948 | |
d8adafc7 | 949 | return 0; |
a82400b5 | 950 | } |
029fbd43 | 951 | |
4e9b1fa5 | 952 | static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) |
029fbd43 | 953 | { |
029fbd43 HZ |
954 | if (!adev->gmc.xgmi.supported || |
955 | adev->gmc.xgmi.num_physical_nodes == 0) | |
956 | return 0; | |
957 | ||
21226f02 | 958 | amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); |
66399248 | 959 | |
caae42f0 | 960 | return amdgpu_ras_block_late_init(adev, ras_block); |
029fbd43 | 961 | } |
be5b39d8 | 962 | |
19744f5f HZ |
963 | uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, |
964 | uint64_t addr) | |
965 | { | |
890900fe HZ |
966 | struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi; |
967 | return (addr + xgmi->physical_node_id * xgmi->node_segment_size); | |
19744f5f | 968 | } |
18f36157 | 969 | |
66399248 JC |
970 | static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) |
971 | { | |
972 | WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); | |
973 | WREG32_PCIE(pcs_status_reg, 0); | |
974 | } | |
975 | ||
27d80f7d | 976 | static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev) |
66399248 JC |
977 | { |
978 | uint32_t i; | |
979 | ||
980 | switch (adev->asic_type) { | |
981 | case CHIP_ARCTURUS: | |
982 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) | |
983 | pcs_clear_status(adev, | |
984 | xgmi_pcs_err_status_reg_arct[i]); | |
985 | break; | |
986 | case CHIP_VEGA20: | |
987 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) | |
988 | pcs_clear_status(adev, | |
989 | xgmi_pcs_err_status_reg_vg20[i]); | |
990 | break; | |
3c4ff2dc | 991 | case CHIP_ALDEBARAN: |
7513c9ff | 992 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) |
3c4ff2dc | 993 | pcs_clear_status(adev, |
7513c9ff | 994 | xgmi3x16_pcs_err_status_reg_aldebaran[i]); |
3c4ff2dc JC |
995 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) |
996 | pcs_clear_status(adev, | |
997 | walf_pcs_err_status_reg_aldebaran[i]); | |
998 | break; | |
66399248 JC |
999 | default: |
1000 | break; | |
1001 | } | |
20238a2c TZ |
1002 | |
1003 | switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { | |
1004 | case IP_VERSION(6, 4, 0): | |
1005 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) | |
1006 | pcs_clear_status(adev, | |
1007 | xgmi3x16_pcs_err_status_reg_v6_4[i]); | |
1008 | break; | |
1009 | default: | |
1010 | break; | |
1011 | } | |
66399248 JC |
1012 | } |
1013 | ||
27d80f7d YW |
1014 | static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base) |
1015 | { | |
1016 | WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL); | |
1017 | } | |
1018 | ||
1019 | static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst) | |
1020 | { | |
1021 | int i; | |
1022 | ||
1023 | for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++) | |
1024 | __xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]); | |
1025 | } | |
1026 | ||
1027 | static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev) | |
1028 | { | |
1029 | int i; | |
1030 | ||
1031 | for_each_inst(i, adev->aid_mask) | |
1032 | xgmi_v6_4_0_reset_error_count(adev, i); | |
1033 | } | |
1034 | ||
1035 | static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) | |
1036 | { | |
1037 | switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { | |
1038 | case IP_VERSION(6, 4, 0): | |
1039 | xgmi_v6_4_0_reset_ras_error_count(adev); | |
1040 | break; | |
1041 | default: | |
1042 | amdgpu_xgmi_legacy_reset_ras_error_count(adev); | |
1043 | break; | |
1044 | } | |
1045 | } | |
1046 | ||
18f36157 HZ |
1047 | static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, |
1048 | uint32_t value, | |
828fc79d | 1049 | uint32_t mask_value, |
18f36157 HZ |
1050 | uint32_t *ue_count, |
1051 | uint32_t *ce_count, | |
828fc79d SY |
1052 | bool is_xgmi_pcs, |
1053 | bool check_mask) | |
18f36157 HZ |
1054 | { |
1055 | int i; | |
828fc79d | 1056 | int ue_cnt = 0; |
828fc79d SY |
1057 | const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL; |
1058 | uint32_t field_array_size = 0; | |
18f36157 HZ |
1059 | |
1060 | if (is_xgmi_pcs) { | |
4e8303cf | 1061 | if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == |
20238a2c TZ |
1062 | IP_VERSION(6, 1, 0) || |
1063 | amdgpu_ip_version(adev, XGMI_HWIP, 0) == | |
1064 | IP_VERSION(6, 4, 0)) { | |
442d61af SY |
1065 | pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0]; |
1066 | field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields); | |
1067 | } else { | |
1068 | pcs_ras_fields = &xgmi_pcs_ras_fields[0]; | |
1069 | field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields); | |
1070 | } | |
18f36157 | 1071 | } else { |
828fc79d SY |
1072 | pcs_ras_fields = &wafl_pcs_ras_fields[0]; |
1073 | field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields); | |
1074 | } | |
1075 | ||
1076 | if (check_mask) | |
1077 | value = value & ~mask_value; | |
1078 | ||
1079 | /* query xgmi/walf pcs error status, | |
1080 | * only ue is supported */ | |
1081 | for (i = 0; value && i < field_array_size; i++) { | |
1082 | ue_cnt = (value & | |
1083 | pcs_ras_fields[i].pcs_err_mask) >> | |
1084 | pcs_ras_fields[i].pcs_err_shift; | |
1085 | if (ue_cnt) { | |
1086 | dev_info(adev->dev, "%s detected\n", | |
1087 | pcs_ras_fields[i].err_name); | |
1088 | *ue_count += ue_cnt; | |
18f36157 | 1089 | } |
828fc79d SY |
1090 | |
1091 | /* reset bit value if the bit is checked */ | |
1092 | value &= ~(pcs_ras_fields[i].pcs_err_mask); | |
18f36157 HZ |
1093 | } |
1094 | ||
1095 | return 0; | |
1096 | } | |
1097 | ||
27d80f7d YW |
1098 | static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev, |
1099 | void *ras_error_status) | |
18f36157 HZ |
1100 | { |
1101 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; | |
20238a2c | 1102 | int i, supported = 1; |
828fc79d | 1103 | uint32_t data, mask_data = 0; |
18f36157 HZ |
1104 | uint32_t ue_cnt = 0, ce_cnt = 0; |
1105 | ||
1106 | if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) | |
6c245386 | 1107 | return ; |
18f36157 HZ |
1108 | |
1109 | err_data->ue_count = 0; | |
1110 | err_data->ce_count = 0; | |
1111 | ||
1112 | switch (adev->asic_type) { | |
a61f41b1 HZ |
1113 | case CHIP_ARCTURUS: |
1114 | /* check xgmi pcs error */ | |
1115 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { | |
1116 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); | |
1117 | if (data) | |
828fc79d SY |
1118 | amdgpu_xgmi_query_pcs_error_status(adev, data, |
1119 | mask_data, &ue_cnt, &ce_cnt, true, false); | |
a61f41b1 HZ |
1120 | } |
1121 | /* check wafl pcs error */ | |
1122 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { | |
1123 | data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); | |
1124 | if (data) | |
828fc79d SY |
1125 | amdgpu_xgmi_query_pcs_error_status(adev, data, |
1126 | mask_data, &ue_cnt, &ce_cnt, false, false); | |
a61f41b1 HZ |
1127 | } |
1128 | break; | |
18f36157 | 1129 | case CHIP_VEGA20: |
18f36157 HZ |
1130 | /* check xgmi pcs error */ |
1131 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { | |
1132 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); | |
1133 | if (data) | |
828fc79d SY |
1134 | amdgpu_xgmi_query_pcs_error_status(adev, data, |
1135 | mask_data, &ue_cnt, &ce_cnt, true, false); | |
18f36157 HZ |
1136 | } |
1137 | /* check wafl pcs error */ | |
1138 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { | |
1139 | data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); | |
1140 | if (data) | |
828fc79d SY |
1141 | amdgpu_xgmi_query_pcs_error_status(adev, data, |
1142 | mask_data, &ue_cnt, &ce_cnt, false, false); | |
18f36157 HZ |
1143 | } |
1144 | break; | |
3c4ff2dc | 1145 | case CHIP_ALDEBARAN: |
3c4ff2dc JC |
1146 | /* check xgmi3x16 pcs error */ |
1147 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { | |
1148 | data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); | |
828fc79d SY |
1149 | mask_data = |
1150 | RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]); | |
3c4ff2dc | 1151 | if (data) |
828fc79d SY |
1152 | amdgpu_xgmi_query_pcs_error_status(adev, data, |
1153 | mask_data, &ue_cnt, &ce_cnt, true, true); | |
3c4ff2dc JC |
1154 | } |
1155 | /* check wafl pcs error */ | |
1156 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { | |
1157 | data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); | |
828fc79d SY |
1158 | mask_data = |
1159 | RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]); | |
3c4ff2dc | 1160 | if (data) |
828fc79d SY |
1161 | amdgpu_xgmi_query_pcs_error_status(adev, data, |
1162 | mask_data, &ue_cnt, &ce_cnt, false, true); | |
3c4ff2dc JC |
1163 | } |
1164 | break; | |
f24d991b | 1165 | default: |
20238a2c TZ |
1166 | supported = 0; |
1167 | break; | |
1168 | } | |
1169 | ||
1170 | switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { | |
1171 | case IP_VERSION(6, 4, 0): | |
1172 | /* check xgmi3x16 pcs error */ | |
1173 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) { | |
1174 | data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]); | |
1175 | mask_data = | |
1176 | RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]); | |
1177 | if (data) | |
1178 | amdgpu_xgmi_query_pcs_error_status(adev, data, | |
1179 | mask_data, &ue_cnt, &ce_cnt, true, true); | |
1180 | } | |
1181 | break; | |
1182 | default: | |
1183 | if (!supported) | |
1184 | dev_warn(adev->dev, "XGMI RAS error query not supported"); | |
f24d991b | 1185 | break; |
18f36157 HZ |
1186 | } |
1187 | ||
21226f02 | 1188 | amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); |
66399248 | 1189 | |
18f36157 HZ |
1190 | err_data->ue_count += ue_cnt; |
1191 | err_data->ce_count += ce_cnt; | |
18f36157 | 1192 | } |
52137ca8 | 1193 | |
27d80f7d YW |
1194 | static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status) |
1195 | { | |
1196 | const char *error_str; | |
1197 | int ext_error_code; | |
1198 | ||
1199 | ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status); | |
1200 | ||
1201 | error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ? | |
1202 | xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL; | |
1203 | if (error_str) | |
1204 | dev_info(adev->dev, "%s detected\n", error_str); | |
1205 | ||
1206 | switch (ext_error_code) { | |
1207 | case 0: | |
1208 | return AMDGPU_MCA_ERROR_TYPE_UE; | |
1209 | case 6: | |
1210 | return AMDGPU_MCA_ERROR_TYPE_CE; | |
1211 | default: | |
1212 | return -EINVAL; | |
1213 | } | |
1214 | ||
1215 | return -EINVAL; | |
1216 | } | |
1217 | ||
1218 | static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info, | |
1219 | u64 mca_base, struct ras_err_data *err_data) | |
1220 | { | |
1221 | int xgmi_inst = mcm_info->die_id; | |
1222 | u64 status = 0; | |
1223 | ||
1224 | status = RREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS); | |
1225 | if (!MCA_REG__STATUS__VAL(status)) | |
1226 | return; | |
1227 | ||
1228 | switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) { | |
1229 | case AMDGPU_MCA_ERROR_TYPE_UE: | |
1230 | amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL); | |
1231 | break; | |
1232 | case AMDGPU_MCA_ERROR_TYPE_CE: | |
1233 | amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL); | |
1234 | break; | |
1235 | default: | |
1236 | break; | |
1237 | } | |
1238 | ||
1239 | WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL); | |
1240 | } | |
1241 | ||
1242 | static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data) | |
1243 | { | |
1244 | struct amdgpu_smuio_mcm_config_info mcm_info = { | |
1245 | .socket_id = adev->smuio.funcs->get_socket_id(adev), | |
1246 | .die_id = xgmi_inst, | |
1247 | }; | |
1248 | int i; | |
1249 | ||
1250 | for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++) | |
1251 | __xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data); | |
1252 | } | |
1253 | ||
1254 | static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) | |
1255 | { | |
1256 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; | |
1257 | int i; | |
1258 | ||
1259 | for_each_inst(i, adev->aid_mask) | |
1260 | xgmi_v6_4_0_query_error_count(adev, i, err_data); | |
1261 | } | |
1262 | ||
1263 | static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, | |
1264 | void *ras_error_status) | |
1265 | { | |
1266 | switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { | |
1267 | case IP_VERSION(6, 4, 0): | |
1268 | xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status); | |
1269 | break; | |
1270 | default: | |
1271 | amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status); | |
1272 | break; | |
1273 | } | |
1274 | } | |
1275 | ||
22d4ba53 | 1276 | /* Trigger XGMI/WAFL error */ |
2c22ed0b TZ |
1277 | static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, |
1278 | void *inject_if, uint32_t instance_mask) | |
22d4ba53 | 1279 | { |
61d70522 | 1280 | int ret1, ret2; |
71b6c4a2 | 1281 | struct ta_ras_trigger_error_input *block_info = |
1282 | (struct ta_ras_trigger_error_input *)inject_if; | |
22d4ba53 | 1283 | |
1284 | if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) | |
1285 | dev_warn(adev->dev, "Failed to disallow df cstate"); | |
1286 | ||
61d70522 TZ |
1287 | ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW); |
1288 | if (ret1 && ret1 != -EOPNOTSUPP) | |
22d4ba53 | 1289 | dev_warn(adev->dev, "Failed to disallow XGMI power down"); |
1290 | ||
61d70522 | 1291 | ret2 = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); |
22d4ba53 | 1292 | |
1293 | if (amdgpu_ras_intr_triggered()) | |
61d70522 | 1294 | return ret2; |
22d4ba53 | 1295 | |
61d70522 TZ |
1296 | ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT); |
1297 | if (ret1 && ret1 != -EOPNOTSUPP) | |
22d4ba53 | 1298 | dev_warn(adev->dev, "Failed to allow XGMI power down"); |
1299 | ||
1300 | if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) | |
1301 | dev_warn(adev->dev, "Failed to allow df cstate"); | |
1302 | ||
61d70522 | 1303 | return ret2; |
22d4ba53 | 1304 | } |
1305 | ||
6c245386 | 1306 | struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { |
52137ca8 HZ |
1307 | .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, |
1308 | .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, | |
22d4ba53 | 1309 | .ras_error_inject = amdgpu_ras_error_inject_xgmi, |
52137ca8 | 1310 | }; |
6c245386 | 1311 | |
1312 | struct amdgpu_xgmi_ras xgmi_ras = { | |
1313 | .ras_block = { | |
6c245386 | 1314 | .hw_ops = &xgmi_ras_hw_ops, |
1315 | .ras_late_init = amdgpu_xgmi_ras_late_init, | |
6c245386 | 1316 | }, |
1317 | }; | |
da9d669e HZ |
1318 | |
1319 | int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) | |
1320 | { | |
1321 | int err; | |
1322 | struct amdgpu_xgmi_ras *ras; | |
1323 | ||
1324 | if (!adev->gmc.xgmi.ras) | |
1325 | return 0; | |
1326 | ||
1327 | ras = adev->gmc.xgmi.ras; | |
1328 | err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); | |
1329 | if (err) { | |
1330 | dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n"); | |
1331 | return err; | |
1332 | } | |
1333 | ||
9c224e05 | 1334 | strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl"); |
da9d669e HZ |
1335 | ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL; |
1336 | ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; | |
1337 | adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm; | |
1338 | ||
1339 | return 0; | |
1340 | } |