]>
Commit | Line | Data |
---|---|---|
c030f2e4 | 1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/debugfs.h> | |
25 | #include <linux/list.h> | |
26 | #include <linux/module.h> | |
f867723b SR |
27 | #include <linux/uaccess.h> |
28 | ||
c030f2e4 | 29 | #include "amdgpu.h" |
30 | #include "amdgpu_ras.h" | |
b404ae82 | 31 | #include "amdgpu_atomfirmware.h" |
c030f2e4 | 32 | |
33 | struct ras_ih_data { | |
34 | /* interrupt bottom half */ | |
35 | struct work_struct ih_work; | |
36 | int inuse; | |
37 | /* IP callback */ | |
38 | ras_ih_cb cb; | |
39 | /* full of entries */ | |
40 | unsigned char *ring; | |
41 | unsigned int ring_size; | |
42 | unsigned int element_size; | |
43 | unsigned int aligned_element_size; | |
44 | unsigned int rptr; | |
45 | unsigned int wptr; | |
46 | }; | |
47 | ||
/* Names of the sysfs and debugfs nodes that belong to one RAS object. */
struct ras_fs_data {
	char sysfs_name[32];	/* name of the sysfs attribute */
	char debugfs_name[32];	/* name of the debugfs file */
};
52 | ||
/* Error counters: "ue" and "ce" as reported by the sysfs/debugfs nodes. */
struct ras_err_data {
	unsigned long ue_count;	/* reported as "ue" */
	unsigned long ce_count;	/* reported as "ce" */
};
57 | ||
/* Bookkeeping for the bad pages array. */
struct ras_err_handler_data {
	/* array of bad page entries */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* number of entries currently stored */
	int count;
	/* how many more entries can still be placed */
	int space_left;
	/* index of the last reserved entry, plus one */
	int last_reserved;
};
71 | ||
72 | struct ras_manager { | |
73 | struct ras_common_if head; | |
74 | /* reference count */ | |
75 | int use; | |
76 | /* ras block link */ | |
77 | struct list_head node; | |
78 | /* the device */ | |
79 | struct amdgpu_device *adev; | |
80 | /* debugfs */ | |
81 | struct dentry *ent; | |
82 | /* sysfs */ | |
83 | struct device_attribute sysfs_attr; | |
84 | int attr_inuse; | |
85 | ||
86 | /* fs node name */ | |
87 | struct ras_fs_data fs_data; | |
88 | ||
89 | /* IH data */ | |
90 | struct ras_ih_data ih_data; | |
91 | ||
92 | struct ras_err_data err_data; | |
93 | }; | |
94 | ||
/* One bad page record as printed by the gpu_vram_bad_pages sysfs node. */
struct ras_badpage {
	unsigned int bp;	/* printed as the first hex column */
	unsigned int size;	/* printed as the second hex column */
	unsigned int flags;	/* mapped to a letter by amdgpu_ras_badpage_flags_str() */
};
100 | ||
/* Error type names; ras_err_str() indexes this table with ffs(type). */
const char *ras_error_string[] = {
	"none",			/* 0 */
	"parity",		/* 1 */
	"single_correctable",	/* 2 */
	"multi_uncorrectable",	/* 3 */
	"poison",		/* 4 */
};
108 | ||
/* Block names, indexed by block id (see ras_block_str()). */
const char *ras_block_string[] = {
	"umc",		/* 0 */
	"sdma",		/* 1 */
	"gfx",		/* 2 */
	"mmhub",	/* 3 */
	"athub",	/* 4 */
	"pcie_bif",	/* 5 */
	"hdp",		/* 6 */
	"xgmi_wafl",	/* 7 */
	"df",		/* 8 */
	"smn",		/* 9 */
	"sem",		/* 10 */
	"mp0",		/* 11 */
	"mp1",		/* 12 */
	"fuse",		/* 13 */
};
125 | ||
/* Name lookups: error type via ffs() of its value, block via its id. */
#define ras_err_str(i)		(ras_error_string[ffs(i)])
#define ras_block_str(i)	(ras_block_string[i])
128 | ||
/* Values tested against amdgpu_ras::flags. */
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
/* Default flag set: only INIT_BY_VBIOS. */
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
132 | ||
efb426d5 | 133 | static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, |
134 | uint64_t offset, uint64_t size, | |
135 | struct amdgpu_bo **bo_ptr); | |
136 | static int amdgpu_ras_release_vram(struct amdgpu_device *adev, | |
137 | struct amdgpu_bo **bo_ptr); | |
138 | ||
c030f2e4 | 139 | static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, |
140 | size_t size, loff_t *pos) | |
141 | { | |
142 | struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; | |
143 | struct ras_query_if info = { | |
144 | .head = obj->head, | |
145 | }; | |
146 | ssize_t s; | |
147 | char val[128]; | |
148 | ||
149 | if (amdgpu_ras_error_query(obj->adev, &info)) | |
150 | return -EINVAL; | |
151 | ||
152 | s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", | |
153 | "ue", info.ue_count, | |
154 | "ce", info.ce_count); | |
155 | if (*pos >= s) | |
156 | return 0; | |
157 | ||
158 | s -= *pos; | |
159 | s = min_t(u64, s, size); | |
160 | ||
161 | ||
162 | if (copy_to_user(buf, &val[*pos], s)) | |
163 | return -EINVAL; | |
164 | ||
165 | *pos += s; | |
166 | ||
167 | return s; | |
168 | } | |
169 | ||
c030f2e4 | 170 | static const struct file_operations amdgpu_ras_debugfs_ops = { |
171 | .owner = THIS_MODULE, | |
172 | .read = amdgpu_ras_debugfs_read, | |
190211ab | 173 | .write = NULL, |
c030f2e4 | 174 | .llseek = default_llseek |
175 | }; | |
176 | ||
96ebb307 | 177 | static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) |
178 | { | |
179 | int i; | |
180 | ||
181 | for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { | |
182 | *block_id = i; | |
183 | if (strcmp(name, ras_block_str(i)) == 0) | |
184 | return 0; | |
185 | } | |
186 | return -EINVAL; | |
187 | } | |
188 | ||
189 | static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, | |
190 | const char __user *buf, size_t size, | |
191 | loff_t *pos, struct ras_debug_if *data) | |
192 | { | |
193 | ssize_t s = min_t(u64, 64, size); | |
194 | char str[65]; | |
195 | char block_name[33]; | |
196 | char err[9] = "ue"; | |
197 | int op = -1; | |
198 | int block_id; | |
199 | u64 address, value; | |
200 | ||
201 | if (*pos) | |
202 | return -EINVAL; | |
203 | *pos = size; | |
204 | ||
205 | memset(str, 0, sizeof(str)); | |
206 | memset(data, 0, sizeof(*data)); | |
207 | ||
208 | if (copy_from_user(str, buf, s)) | |
209 | return -EINVAL; | |
210 | ||
211 | if (sscanf(str, "disable %32s", block_name) == 1) | |
212 | op = 0; | |
213 | else if (sscanf(str, "enable %32s %8s", block_name, err) == 2) | |
214 | op = 1; | |
215 | else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) | |
216 | op = 2; | |
b076296b | 217 | else if (str[0] && str[1] && str[2] && str[3]) |
96ebb307 | 218 | /* ascii string, but commands are not matched. */ |
219 | return -EINVAL; | |
220 | ||
221 | if (op != -1) { | |
222 | if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) | |
223 | return -EINVAL; | |
224 | ||
225 | data->head.block = block_id; | |
226 | data->head.type = memcmp("ue", err, 2) == 0 ? | |
227 | AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE : | |
228 | AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; | |
229 | data->op = op; | |
230 | ||
231 | if (op == 2) { | |
232 | if (sscanf(str, "%*s %*s %*s %llu %llu", | |
233 | &address, &value) != 2) | |
234 | if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx", | |
235 | &address, &value) != 2) | |
236 | return -EINVAL; | |
237 | data->inject.address = address; | |
238 | data->inject.value = value; | |
239 | } | |
240 | } else { | |
73aa8e1a | 241 | if (size < sizeof(*data)) |
96ebb307 | 242 | return -EINVAL; |
243 | ||
244 | if (copy_from_user(data, buf, sizeof(*data))) | |
245 | return -EINVAL; | |
246 | } | |
247 | ||
248 | return 0; | |
249 | } | |
74abc221 TSD |
250 | /** |
251 | * DOC: AMDGPU RAS debugfs control interface | |
36ea1bd2 | 252 | * |
253 | * It accepts struct ras_debug_if, which has two members. | |
254 | * | |
255 | * First member: ras_debug_if::head or ras_debug_if::inject. | |
96ebb307 | 256 | * |
257 | * head is used to indicate which IP block will be under control. | |
36ea1bd2 | 258 | * |
259 | * head has four members, they are block, type, sub_block_index, name. | |
260 | * block: which IP will be under control. | |
261 | * type: what kind of error will be enabled/disabled/injected. | |
262 | * sub_block_index: some IPs have subcomponents, e.g. GFX, SDMA. | |
263 | * name: the name of IP. | |
264 | * | |
265 | * inject has two more members than head, they are address, value. | |
266 | * As their names indicate, inject operation will write the | |
267 | * value to the address. | |
268 | * | |
269 | * Second member: struct ras_debug_if::op. | |
270 | * It has three kinds of operations. | |
271 | * 0: disable RAS on the block. Take ::head as its data. | |
272 | * 1: enable RAS on the block. Take ::head as its data. | |
273 | * 2: inject errors on the block. Take ::inject as its data. | |
274 | * | |
96ebb307 | 275 | * How to use the interface? |
276 | * programs: | |
277 | * copy the struct ras_debug_if in your codes and initialize it. | |
278 | * write the struct to the control node. | |
279 | * | |
280 | * bash: | |
281 | * echo op block [error [address value]] > .../ras/ras_ctrl | |
282 | * op: disable, enable, inject | |
283 | * disable: only block is needed | |
284 | * enable: block and error are needed | |
285 | * inject: block, error, address, value are needed | |
286 | * block: umc, sdma, gfx, ... | |
287 | * see ras_block_string[] for details | |
288 | * error: ue, ce | |
289 | * ue: multi_uncorrectable | |
290 | * ce: single_correctable | |
291 | * | |
292 | * here are some examples for bash commands, | |
293 | * echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl | |
294 | * echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl | |
295 | * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl | |
296 | * | |
36ea1bd2 | 297 | * How to check the result? |
298 | * | |
299 | * For disable/enable, please check ras features at | |
300 | * /sys/class/drm/card[0/1/2...]/device/ras/features | |
301 | * | |
302 | * For inject, please check corresponding err count at | |
303 | * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count | |
304 | * | |
305 | * NOTE: operation is only allowed on blocks which are supported. | |
306 | * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask | |
307 | */ | |
308 | static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, | |
309 | size_t size, loff_t *pos) | |
310 | { | |
311 | struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; | |
312 | struct ras_debug_if data; | |
efb426d5 | 313 | struct amdgpu_bo *bo; |
36ea1bd2 | 314 | int ret = 0; |
315 | ||
96ebb307 | 316 | ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); |
317 | if (ret) | |
36ea1bd2 | 318 | return -EINVAL; |
319 | ||
36ea1bd2 | 320 | if (!amdgpu_ras_is_supported(adev, data.head.block)) |
321 | return -EINVAL; | |
322 | ||
323 | switch (data.op) { | |
324 | case 0: | |
325 | ret = amdgpu_ras_feature_enable(adev, &data.head, 0); | |
326 | break; | |
327 | case 1: | |
328 | ret = amdgpu_ras_feature_enable(adev, &data.head, 1); | |
329 | break; | |
330 | case 2: | |
efb426d5 | 331 | ret = amdgpu_ras_reserve_vram(adev, |
332 | data.inject.address, PAGE_SIZE, &bo); | |
acb05f0a | 333 | if (ret) { |
334 | /* address was offset, now it is absolute.*/ | |
335 | data.inject.address += adev->gmc.vram_start; | |
336 | if (data.inject.address > adev->gmc.vram_end) | |
337 | break; | |
338 | } else | |
339 | data.inject.address = amdgpu_bo_gpu_offset(bo); | |
36ea1bd2 | 340 | ret = amdgpu_ras_error_inject(adev, &data.inject); |
efb426d5 | 341 | amdgpu_ras_release_vram(adev, &bo); |
36ea1bd2 | 342 | break; |
96ebb307 | 343 | default: |
344 | ret = -EINVAL; | |
345 | break; | |
36ea1bd2 | 346 | }; |
347 | ||
348 | if (ret) | |
349 | return -EINVAL; | |
350 | ||
351 | return size; | |
352 | } | |
353 | ||
354 | static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { | |
355 | .owner = THIS_MODULE, | |
356 | .read = NULL, | |
357 | .write = amdgpu_ras_debugfs_ctrl_write, | |
358 | .llseek = default_llseek | |
359 | }; | |
360 | ||
c030f2e4 | 361 | static ssize_t amdgpu_ras_sysfs_read(struct device *dev, |
362 | struct device_attribute *attr, char *buf) | |
363 | { | |
364 | struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); | |
365 | struct ras_query_if info = { | |
366 | .head = obj->head, | |
367 | }; | |
368 | ||
369 | if (amdgpu_ras_error_query(obj->adev, &info)) | |
370 | return -EINVAL; | |
371 | ||
372 | return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", | |
373 | "ue", info.ue_count, | |
374 | "ce", info.ce_count); | |
375 | } | |
376 | ||
377 | /* obj begin */ | |
378 | ||
/* take one reference on obj */
#define get_obj(obj)	do { ++(obj)->use; } while (0)
/* non-zero while obj has at least one reference */
#define alive_obj(obj)	((obj)->use)
381 | ||
382 | static inline void put_obj(struct ras_manager *obj) | |
383 | { | |
384 | if (obj && --obj->use == 0) | |
385 | list_del(&obj->node); | |
386 | if (obj && obj->use < 0) { | |
387 | DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); | |
388 | } | |
389 | } | |
390 | ||
391 | /* make one obj and return it. */ | |
392 | static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, | |
393 | struct ras_common_if *head) | |
394 | { | |
395 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
396 | struct ras_manager *obj; | |
397 | ||
398 | if (!con) | |
399 | return NULL; | |
400 | ||
401 | if (head->block >= AMDGPU_RAS_BLOCK_COUNT) | |
402 | return NULL; | |
403 | ||
404 | obj = &con->objs[head->block]; | |
405 | /* already exist. return obj? */ | |
406 | if (alive_obj(obj)) | |
407 | return NULL; | |
408 | ||
409 | obj->head = *head; | |
410 | obj->adev = adev; | |
411 | list_add(&obj->node, &con->head); | |
412 | get_obj(obj); | |
413 | ||
414 | return obj; | |
415 | } | |
416 | ||
417 | /* return an obj equal to head, or the first when head is NULL */ | |
418 | static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, | |
419 | struct ras_common_if *head) | |
420 | { | |
421 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
422 | struct ras_manager *obj; | |
423 | int i; | |
424 | ||
425 | if (!con) | |
426 | return NULL; | |
427 | ||
428 | if (head) { | |
429 | if (head->block >= AMDGPU_RAS_BLOCK_COUNT) | |
430 | return NULL; | |
431 | ||
432 | obj = &con->objs[head->block]; | |
433 | ||
434 | if (alive_obj(obj)) { | |
435 | WARN_ON(head->block != obj->head.block); | |
436 | return obj; | |
437 | } | |
438 | } else { | |
439 | for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { | |
440 | obj = &con->objs[i]; | |
441 | if (alive_obj(obj)) { | |
442 | WARN_ON(i != obj->head.block); | |
443 | return obj; | |
444 | } | |
445 | } | |
446 | } | |
447 | ||
448 | return NULL; | |
449 | } | |
450 | /* obj end */ | |
451 | ||
452 | /* feature ctl begin */ | |
453 | static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, | |
454 | struct ras_common_if *head) | |
455 | { | |
5caf466a | 456 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
457 | ||
458 | return con->hw_supported & BIT(head->block); | |
c030f2e4 | 459 | } |
460 | ||
461 | static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, | |
462 | struct ras_common_if *head) | |
463 | { | |
464 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
465 | ||
466 | return con->features & BIT(head->block); | |
467 | } | |
468 | ||
469 | /* | |
470 | * if obj is not created, then create one. | |
471 | * set feature enable flag. | |
472 | */ | |
473 | static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, | |
474 | struct ras_common_if *head, int enable) | |
475 | { | |
476 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
477 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); | |
478 | ||
5caf466a | 479 | /* If hardware does not support ras, then do not create obj. |
480 | * But if hardware support ras, we can create the obj. | |
481 | * Ras framework checks con->hw_supported to see if it need do | |
482 | * corresponding initialization. | |
483 | * IP checks con->support to see if it need disable ras. | |
484 | */ | |
c030f2e4 | 485 | if (!amdgpu_ras_is_feature_allowed(adev, head)) |
486 | return 0; | |
487 | if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) | |
488 | return 0; | |
489 | ||
490 | if (enable) { | |
491 | if (!obj) { | |
492 | obj = amdgpu_ras_create_obj(adev, head); | |
493 | if (!obj) | |
494 | return -EINVAL; | |
495 | } else { | |
496 | /* In case we create obj somewhere else */ | |
497 | get_obj(obj); | |
498 | } | |
499 | con->features |= BIT(head->block); | |
500 | } else { | |
501 | if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { | |
502 | con->features &= ~BIT(head->block); | |
503 | put_obj(obj); | |
504 | } | |
505 | } | |
506 | ||
507 | return 0; | |
508 | } | |
509 | ||
510 | /* wrapper of psp_ras_enable_features */ | |
511 | int amdgpu_ras_feature_enable(struct amdgpu_device *adev, | |
512 | struct ras_common_if *head, bool enable) | |
513 | { | |
514 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
515 | union ta_ras_cmd_input info; | |
516 | int ret; | |
517 | ||
518 | if (!con) | |
519 | return -EINVAL; | |
520 | ||
521 | if (!enable) { | |
522 | info.disable_features = (struct ta_ras_disable_features_input) { | |
828cfa29 | 523 | .block_id = amdgpu_ras_block_to_ta(head->block), |
524 | .error_type = amdgpu_ras_error_to_ta(head->type), | |
c030f2e4 | 525 | }; |
526 | } else { | |
527 | info.enable_features = (struct ta_ras_enable_features_input) { | |
828cfa29 | 528 | .block_id = amdgpu_ras_block_to_ta(head->block), |
529 | .error_type = amdgpu_ras_error_to_ta(head->type), | |
c030f2e4 | 530 | }; |
531 | } | |
532 | ||
533 | /* Do not enable if it is not allowed. */ | |
534 | WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); | |
535 | /* Are we alerady in that state we are going to set? */ | |
536 | if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) | |
537 | return 0; | |
538 | ||
539 | ret = psp_ras_enable_features(&adev->psp, &info, enable); | |
540 | if (ret) { | |
541 | DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", | |
542 | enable ? "enable":"disable", | |
543 | ras_block_str(head->block), | |
544 | ret); | |
7af23ebe | 545 | if (ret == TA_RAS_STATUS__RESET_NEEDED) |
546 | return -EAGAIN; | |
c030f2e4 | 547 | return -EINVAL; |
548 | } | |
549 | ||
550 | /* setup the obj */ | |
551 | __amdgpu_ras_feature_enable(adev, head, enable); | |
552 | ||
553 | return 0; | |
554 | } | |
555 | ||
77de502b | 556 | /* Only used in device probe stage and called only once. */ |
557 | int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, | |
558 | struct ras_common_if *head, bool enable) | |
559 | { | |
560 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
561 | int ret; | |
562 | ||
563 | if (!con) | |
564 | return -EINVAL; | |
565 | ||
566 | if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { | |
7af23ebe | 567 | if (enable) { |
568 | /* There is no harm to issue a ras TA cmd regardless of | |
569 | * the currecnt ras state. | |
570 | * If current state == target state, it will do nothing | |
571 | * But sometimes it requests driver to reset and repost | |
572 | * with error code -EAGAIN. | |
573 | */ | |
574 | ret = amdgpu_ras_feature_enable(adev, head, 1); | |
575 | /* With old ras TA, we might fail to enable ras. | |
576 | * Log it and just setup the object. | |
577 | * TODO need remove this WA in the future. | |
578 | */ | |
579 | if (ret == -EINVAL) { | |
580 | ret = __amdgpu_ras_feature_enable(adev, head, 1); | |
581 | if (!ret) | |
582 | DRM_INFO("RAS INFO: %s setup object\n", | |
583 | ras_block_str(head->block)); | |
584 | } | |
585 | } else { | |
586 | /* setup the object then issue a ras TA disable cmd.*/ | |
587 | ret = __amdgpu_ras_feature_enable(adev, head, 1); | |
588 | if (ret) | |
589 | return ret; | |
77de502b | 590 | |
77de502b | 591 | ret = amdgpu_ras_feature_enable(adev, head, 0); |
7af23ebe | 592 | } |
77de502b | 593 | } else |
594 | ret = amdgpu_ras_feature_enable(adev, head, enable); | |
595 | ||
596 | return ret; | |
597 | } | |
598 | ||
c030f2e4 | 599 | static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, |
600 | bool bypass) | |
601 | { | |
602 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
603 | struct ras_manager *obj, *tmp; | |
604 | ||
605 | list_for_each_entry_safe(obj, tmp, &con->head, node) { | |
606 | /* bypass psp. | |
607 | * aka just release the obj and corresponding flags | |
608 | */ | |
609 | if (bypass) { | |
610 | if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) | |
611 | break; | |
612 | } else { | |
613 | if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) | |
614 | break; | |
615 | } | |
289d513b | 616 | } |
c030f2e4 | 617 | |
618 | return con->features; | |
619 | } | |
620 | ||
621 | static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, | |
622 | bool bypass) | |
623 | { | |
624 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
625 | int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; | |
626 | int i; | |
191051a1 | 627 | const enum amdgpu_ras_error_type default_ras_type = |
628 | AMDGPU_RAS_ERROR__NONE; | |
c030f2e4 | 629 | |
630 | for (i = 0; i < ras_block_count; i++) { | |
631 | struct ras_common_if head = { | |
632 | .block = i, | |
191051a1 | 633 | .type = default_ras_type, |
c030f2e4 | 634 | .sub_block_index = 0, |
635 | }; | |
636 | strcpy(head.name, ras_block_str(i)); | |
637 | if (bypass) { | |
638 | /* | |
639 | * bypass psp. vbios enable ras for us. | |
640 | * so just create the obj | |
641 | */ | |
642 | if (__amdgpu_ras_feature_enable(adev, &head, 1)) | |
643 | break; | |
644 | } else { | |
645 | if (amdgpu_ras_feature_enable(adev, &head, 1)) | |
646 | break; | |
647 | } | |
289d513b | 648 | } |
c030f2e4 | 649 | |
650 | return con->features; | |
651 | } | |
652 | /* feature ctl end */ | |
653 | ||
654 | /* query/inject/cure begin */ | |
655 | int amdgpu_ras_error_query(struct amdgpu_device *adev, | |
656 | struct ras_query_if *info) | |
657 | { | |
658 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); | |
659 | ||
660 | if (!obj) | |
661 | return -EINVAL; | |
662 | /* TODO might read the register to read the count */ | |
663 | ||
664 | info->ue_count = obj->err_data.ue_count; | |
665 | info->ce_count = obj->err_data.ce_count; | |
666 | ||
667 | return 0; | |
668 | } | |
669 | ||
670 | /* wrapper of psp_ras_trigger_error */ | |
671 | int amdgpu_ras_error_inject(struct amdgpu_device *adev, | |
672 | struct ras_inject_if *info) | |
673 | { | |
674 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); | |
675 | struct ta_ras_trigger_error_input block_info = { | |
828cfa29 | 676 | .block_id = amdgpu_ras_block_to_ta(info->head.block), |
677 | .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), | |
c030f2e4 | 678 | .sub_block_index = info->head.sub_block_index, |
679 | .address = info->address, | |
680 | .value = info->value, | |
681 | }; | |
682 | int ret = 0; | |
683 | ||
684 | if (!obj) | |
685 | return -EINVAL; | |
686 | ||
29bd6508 HZ |
687 | if (block_info.block_id != TA_RAS_BLOCK__UMC) { |
688 | DRM_INFO("%s error injection is not supported yet\n", | |
689 | ras_block_str(info->head.block)); | |
690 | return -EINVAL; | |
691 | } | |
692 | ||
c030f2e4 | 693 | ret = psp_ras_trigger_error(&adev->psp, &block_info); |
694 | if (ret) | |
695 | DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", | |
696 | ras_block_str(info->head.block), | |
697 | ret); | |
698 | ||
699 | return ret; | |
700 | } | |
701 | ||
/* Placeholder that always returns 0: psp fw has no cure interface for now. */
int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	return 0;
}
708 | ||
709 | /* get the total error counts on all IPs */ | |
710 | int amdgpu_ras_query_error_count(struct amdgpu_device *adev, | |
711 | bool is_ce) | |
712 | { | |
713 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
714 | struct ras_manager *obj; | |
715 | struct ras_err_data data = {0, 0}; | |
716 | ||
717 | if (!con) | |
718 | return -EINVAL; | |
719 | ||
720 | list_for_each_entry(obj, &con->head, node) { | |
721 | struct ras_query_if info = { | |
722 | .head = obj->head, | |
723 | }; | |
724 | ||
725 | if (amdgpu_ras_error_query(adev, &info)) | |
726 | return -EINVAL; | |
727 | ||
728 | data.ce_count += info.ce_count; | |
729 | data.ue_count += info.ue_count; | |
730 | } | |
731 | ||
732 | return is_ce ? data.ce_count : data.ue_count; | |
733 | } | |
734 | /* query/inject/cure end */ | |
735 | ||
736 | ||
737 | /* sysfs begin */ | |
738 | ||
466b1793 | 739 | static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, |
740 | struct ras_badpage **bps, unsigned int *count); | |
741 | ||
/* Map bad page flags to a letter: 0 -> "R", 1 -> "P", anything else -> "F". */
static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	if (flags == 0)
		return "R";
	if (flags == 1)
		return "P";

	return "F";
}
754 | ||
755 | /** | |
756 | * DOC: ras sysfs gpu_vram_bad_pages interface | |
757 | * | |
758 | * It allows user to read the bad pages of vram on the gpu through | |
759 | * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages | |
760 | * | |
761 | * It outputs multiple lines, and each line stands for one gpu page. | |
762 | * | |
763 | * The format of one line is below, | |
764 | * gpu pfn : gpu page size : flags | |
765 | * | |
766 | * gpu pfn and gpu page size are printed in hex format. | |
767 | * flags can be one of below character, | |
768 | * R: reserved, this gpu page is reserved and not able to use. | |
769 | * P: pending for reserve, this gpu page is marked as bad, will be reserved | |
770 | * in next window of page_reserve. | |
771 | * F: unable to reserve. this gpu page can't be reserved due to some reasons. | |
772 | * | |
773 | * examples: | |
774 | * 0x00000001 : 0x00001000 : R | |
775 | * 0x00000002 : 0x00001000 : P | |
776 | */ | |
777 | ||
778 | static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, | |
779 | struct kobject *kobj, struct bin_attribute *attr, | |
780 | char *buf, loff_t ppos, size_t count) | |
781 | { | |
782 | struct amdgpu_ras *con = | |
783 | container_of(attr, struct amdgpu_ras, badpages_attr); | |
784 | struct amdgpu_device *adev = con->adev; | |
785 | const unsigned int element_size = | |
786 | sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; | |
d6ee400e SA |
787 | unsigned int start = div64_ul(ppos + element_size - 1, element_size); |
788 | unsigned int end = div64_ul(ppos + count - 1, element_size); | |
466b1793 | 789 | ssize_t s = 0; |
790 | struct ras_badpage *bps = NULL; | |
791 | unsigned int bps_count = 0; | |
792 | ||
793 | memset(buf, 0, count); | |
794 | ||
795 | if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) | |
796 | return 0; | |
797 | ||
798 | for (; start < end && start < bps_count; start++) | |
799 | s += scnprintf(&buf[s], element_size + 1, | |
800 | "0x%08x : 0x%08x : %1s\n", | |
801 | bps[start].bp, | |
802 | bps[start].size, | |
803 | amdgpu_ras_badpage_flags_str(bps[start].flags)); | |
804 | ||
805 | kfree(bps); | |
806 | ||
807 | return s; | |
808 | } | |
809 | ||
c030f2e4 | 810 | static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, |
811 | struct device_attribute *attr, char *buf) | |
812 | { | |
813 | struct amdgpu_ras *con = | |
814 | container_of(attr, struct amdgpu_ras, features_attr); | |
815 | struct drm_device *ddev = dev_get_drvdata(dev); | |
816 | struct amdgpu_device *adev = ddev->dev_private; | |
817 | struct ras_common_if head; | |
818 | int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; | |
819 | int i; | |
820 | ssize_t s; | |
821 | struct ras_manager *obj; | |
822 | ||
823 | s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); | |
824 | ||
825 | for (i = 0; i < ras_block_count; i++) { | |
826 | head.block = i; | |
827 | ||
828 | if (amdgpu_ras_is_feature_enabled(adev, &head)) { | |
829 | obj = amdgpu_ras_find_obj(adev, &head); | |
830 | s += scnprintf(&buf[s], PAGE_SIZE - s, | |
831 | "%s: %s\n", | |
832 | ras_block_str(i), | |
833 | ras_err_str(obj->head.type)); | |
834 | } else | |
835 | s += scnprintf(&buf[s], PAGE_SIZE - s, | |
836 | "%s: disabled\n", | |
837 | ras_block_str(i)); | |
838 | } | |
839 | ||
840 | return s; | |
841 | } | |
842 | ||
843 | static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) | |
844 | { | |
845 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
846 | struct attribute *attrs[] = { | |
847 | &con->features_attr.attr, | |
848 | NULL | |
849 | }; | |
466b1793 | 850 | struct bin_attribute *bin_attrs[] = { |
851 | &con->badpages_attr, | |
852 | NULL | |
853 | }; | |
c030f2e4 | 854 | struct attribute_group group = { |
855 | .name = "ras", | |
856 | .attrs = attrs, | |
466b1793 | 857 | .bin_attrs = bin_attrs, |
c030f2e4 | 858 | }; |
859 | ||
860 | con->features_attr = (struct device_attribute) { | |
861 | .attr = { | |
862 | .name = "features", | |
863 | .mode = S_IRUGO, | |
864 | }, | |
865 | .show = amdgpu_ras_sysfs_features_read, | |
866 | }; | |
466b1793 | 867 | |
868 | con->badpages_attr = (struct bin_attribute) { | |
869 | .attr = { | |
870 | .name = "gpu_vram_bad_pages", | |
871 | .mode = S_IRUGO, | |
872 | }, | |
873 | .size = 0, | |
874 | .private = NULL, | |
875 | .read = amdgpu_ras_sysfs_badpages_read, | |
876 | }; | |
877 | ||
163def43 | 878 | sysfs_attr_init(attrs[0]); |
466b1793 | 879 | sysfs_bin_attr_init(bin_attrs[0]); |
c030f2e4 | 880 | |
881 | return sysfs_create_group(&adev->dev->kobj, &group); | |
882 | } | |
883 | ||
884 | static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) | |
885 | { | |
886 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
887 | struct attribute *attrs[] = { | |
888 | &con->features_attr.attr, | |
889 | NULL | |
890 | }; | |
466b1793 | 891 | struct bin_attribute *bin_attrs[] = { |
892 | &con->badpages_attr, | |
893 | NULL | |
894 | }; | |
c030f2e4 | 895 | struct attribute_group group = { |
896 | .name = "ras", | |
897 | .attrs = attrs, | |
466b1793 | 898 | .bin_attrs = bin_attrs, |
c030f2e4 | 899 | }; |
900 | ||
901 | sysfs_remove_group(&adev->dev->kobj, &group); | |
902 | ||
903 | return 0; | |
904 | } | |
905 | ||
906 | int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, | |
907 | struct ras_fs_if *head) | |
908 | { | |
909 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); | |
910 | ||
911 | if (!obj || obj->attr_inuse) | |
912 | return -EINVAL; | |
913 | ||
914 | get_obj(obj); | |
915 | ||
916 | memcpy(obj->fs_data.sysfs_name, | |
917 | head->sysfs_name, | |
918 | sizeof(obj->fs_data.sysfs_name)); | |
919 | ||
920 | obj->sysfs_attr = (struct device_attribute){ | |
921 | .attr = { | |
922 | .name = obj->fs_data.sysfs_name, | |
923 | .mode = S_IRUGO, | |
924 | }, | |
925 | .show = amdgpu_ras_sysfs_read, | |
926 | }; | |
163def43 | 927 | sysfs_attr_init(&obj->sysfs_attr.attr); |
c030f2e4 | 928 | |
929 | if (sysfs_add_file_to_group(&adev->dev->kobj, | |
930 | &obj->sysfs_attr.attr, | |
931 | "ras")) { | |
932 | put_obj(obj); | |
933 | return -EINVAL; | |
934 | } | |
935 | ||
936 | obj->attr_inuse = 1; | |
937 | ||
938 | return 0; | |
939 | } | |
940 | ||
941 | int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, | |
942 | struct ras_common_if *head) | |
943 | { | |
944 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); | |
945 | ||
946 | if (!obj || !obj->attr_inuse) | |
947 | return -EINVAL; | |
948 | ||
949 | sysfs_remove_file_from_group(&adev->dev->kobj, | |
950 | &obj->sysfs_attr.attr, | |
951 | "ras"); | |
952 | obj->attr_inuse = 0; | |
953 | put_obj(obj); | |
954 | ||
955 | return 0; | |
956 | } | |
957 | ||
958 | static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) | |
959 | { | |
960 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
961 | struct ras_manager *obj, *tmp; | |
962 | ||
963 | list_for_each_entry_safe(obj, tmp, &con->head, node) { | |
964 | amdgpu_ras_sysfs_remove(adev, &obj->head); | |
965 | } | |
966 | ||
967 | amdgpu_ras_sysfs_remove_feature_node(adev); | |
968 | ||
969 | return 0; | |
970 | } | |
971 | /* sysfs end */ | |
972 | ||
973 | /* debugfs begin */ | |
450f30ea | 974 | static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) |
36ea1bd2 | 975 | { |
976 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
977 | struct drm_minor *minor = adev->ddev->primary; | |
36ea1bd2 | 978 | |
450f30ea GKH |
979 | con->dir = debugfs_create_dir("ras", minor->debugfs_root); |
980 | con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, | |
981 | adev, &amdgpu_ras_debugfs_ctrl_ops); | |
36ea1bd2 | 982 | } |
983 | ||
450f30ea | 984 | void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, |
c030f2e4 | 985 | struct ras_fs_if *head) |
986 | { | |
987 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
988 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); | |
c030f2e4 | 989 | |
990 | if (!obj || obj->ent) | |
450f30ea | 991 | return; |
c030f2e4 | 992 | |
993 | get_obj(obj); | |
994 | ||
995 | memcpy(obj->fs_data.debugfs_name, | |
996 | head->debugfs_name, | |
997 | sizeof(obj->fs_data.debugfs_name)); | |
998 | ||
450f30ea GKH |
999 | obj->ent = debugfs_create_file(obj->fs_data.debugfs_name, |
1000 | S_IWUGO | S_IRUGO, con->dir, obj, | |
1001 | &amdgpu_ras_debugfs_ops); | |
c030f2e4 | 1002 | } |
1003 | ||
450f30ea | 1004 | void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, |
c030f2e4 | 1005 | struct ras_common_if *head) |
1006 | { | |
1007 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); | |
1008 | ||
1009 | if (!obj || !obj->ent) | |
450f30ea | 1010 | return; |
c030f2e4 | 1011 | |
1012 | debugfs_remove(obj->ent); | |
1013 | obj->ent = NULL; | |
1014 | put_obj(obj); | |
c030f2e4 | 1015 | } |
1016 | ||
450f30ea | 1017 | static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) |
c030f2e4 | 1018 | { |
1019 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1020 | struct ras_manager *obj, *tmp; | |
1021 | ||
1022 | list_for_each_entry_safe(obj, tmp, &con->head, node) { | |
1023 | amdgpu_ras_debugfs_remove(adev, &obj->head); | |
1024 | } | |
1025 | ||
36ea1bd2 | 1026 | debugfs_remove(con->ent); |
c030f2e4 | 1027 | debugfs_remove(con->dir); |
1028 | con->dir = NULL; | |
36ea1bd2 | 1029 | con->ent = NULL; |
c030f2e4 | 1030 | } |
1031 | /* debugfs end */ | |
1032 | ||
1033 | /* ras fs */ | |
1034 | ||
/*
 * Create the RAS sysfs feature node and the debugfs control node.
 *
 * The return value of the sysfs helper is not checked here; this
 * function always returns 0.
 */
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}
1042 | ||
/*
 * Remove all RAS debugfs entries, then all RAS sysfs entries.
 * Always returns 0.
 */
static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);

	return 0;
}
1049 | /* ras fs end */ | |
1050 | ||
1051 | /* ih begin */ | |
1052 | static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) | |
1053 | { | |
1054 | struct ras_ih_data *data = &obj->ih_data; | |
1055 | struct amdgpu_iv_entry entry; | |
1056 | int ret; | |
1057 | ||
1058 | while (data->rptr != data->wptr) { | |
1059 | rmb(); | |
1060 | memcpy(&entry, &data->ring[data->rptr], | |
1061 | data->element_size); | |
1062 | ||
1063 | wmb(); | |
1064 | data->rptr = (data->aligned_element_size + | |
1065 | data->rptr) % data->ring_size; | |
1066 | ||
1067 | /* Let IP handle its data, maybe we need get the output | |
1068 | * from the callback to udpate the error type/count, etc | |
1069 | */ | |
1070 | if (data->cb) { | |
1071 | ret = data->cb(obj->adev, &entry); | |
1072 | /* ue will trigger an interrupt, and in that case | |
1073 | * we need do a reset to recovery the whole system. | |
1074 | * But leave IP do that recovery, here we just dispatch | |
1075 | * the error. | |
1076 | */ | |
1077 | if (ret == AMDGPU_RAS_UE) { | |
1078 | obj->err_data.ue_count++; | |
1079 | } | |
1080 | /* Might need get ce count by register, but not all IP | |
1081 | * saves ce count, some IP just use one bit or two bits | |
1082 | * to indicate ce happened. | |
1083 | */ | |
1084 | } | |
1085 | } | |
1086 | } | |
1087 | ||
1088 | static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) | |
1089 | { | |
1090 | struct ras_ih_data *data = | |
1091 | container_of(work, struct ras_ih_data, ih_work); | |
1092 | struct ras_manager *obj = | |
1093 | container_of(data, struct ras_manager, ih_data); | |
1094 | ||
1095 | amdgpu_ras_interrupt_handler(obj); | |
1096 | } | |
1097 | ||
1098 | int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, | |
1099 | struct ras_dispatch_if *info) | |
1100 | { | |
1101 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); | |
1102 | struct ras_ih_data *data = &obj->ih_data; | |
1103 | ||
1104 | if (!obj) | |
1105 | return -EINVAL; | |
1106 | ||
1107 | if (data->inuse == 0) | |
1108 | return 0; | |
1109 | ||
1110 | /* Might be overflow... */ | |
1111 | memcpy(&data->ring[data->wptr], info->entry, | |
1112 | data->element_size); | |
1113 | ||
1114 | wmb(); | |
1115 | data->wptr = (data->aligned_element_size + | |
1116 | data->wptr) % data->ring_size; | |
1117 | ||
1118 | schedule_work(&data->ih_work); | |
1119 | ||
1120 | return 0; | |
1121 | } | |
1122 | ||
1123 | int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, | |
1124 | struct ras_ih_if *info) | |
1125 | { | |
1126 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); | |
1127 | struct ras_ih_data *data; | |
1128 | ||
1129 | if (!obj) | |
1130 | return -EINVAL; | |
1131 | ||
1132 | data = &obj->ih_data; | |
1133 | if (data->inuse == 0) | |
1134 | return 0; | |
1135 | ||
1136 | cancel_work_sync(&data->ih_work); | |
1137 | ||
1138 | kfree(data->ring); | |
1139 | memset(data, 0, sizeof(*data)); | |
1140 | put_obj(obj); | |
1141 | ||
1142 | return 0; | |
1143 | } | |
1144 | ||
1145 | int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, | |
1146 | struct ras_ih_if *info) | |
1147 | { | |
1148 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); | |
1149 | struct ras_ih_data *data; | |
1150 | ||
1151 | if (!obj) { | |
1152 | /* in case we registe the IH before enable ras feature */ | |
1153 | obj = amdgpu_ras_create_obj(adev, &info->head); | |
1154 | if (!obj) | |
1155 | return -EINVAL; | |
1156 | } else | |
1157 | get_obj(obj); | |
1158 | ||
1159 | data = &obj->ih_data; | |
1160 | /* add the callback.etc */ | |
1161 | *data = (struct ras_ih_data) { | |
1162 | .inuse = 0, | |
1163 | .cb = info->cb, | |
1164 | .element_size = sizeof(struct amdgpu_iv_entry), | |
1165 | .rptr = 0, | |
1166 | .wptr = 0, | |
1167 | }; | |
1168 | ||
1169 | INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); | |
1170 | ||
1171 | data->aligned_element_size = ALIGN(data->element_size, 8); | |
1172 | /* the ring can store 64 iv entries. */ | |
1173 | data->ring_size = 64 * data->aligned_element_size; | |
1174 | data->ring = kmalloc(data->ring_size, GFP_KERNEL); | |
1175 | if (!data->ring) { | |
1176 | put_obj(obj); | |
1177 | return -ENOMEM; | |
1178 | } | |
1179 | ||
1180 | /* IH is ready */ | |
1181 | data->inuse = 1; | |
1182 | ||
1183 | return 0; | |
1184 | } | |
1185 | ||
1186 | static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) | |
1187 | { | |
1188 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1189 | struct ras_manager *obj, *tmp; | |
1190 | ||
1191 | list_for_each_entry_safe(obj, tmp, &con->head, node) { | |
1192 | struct ras_ih_if info = { | |
1193 | .head = obj->head, | |
1194 | }; | |
1195 | amdgpu_ras_interrupt_remove_handler(adev, &info); | |
1196 | } | |
1197 | ||
1198 | return 0; | |
1199 | } | |
1200 | /* ih end */ | |
1201 | ||
1202 | /* recovery begin */ | |
466b1793 | 1203 | |
1204 | /* return 0 on success. | |
1205 | * caller need free bps. | |
1206 | */ | |
1207 | static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, | |
1208 | struct ras_badpage **bps, unsigned int *count) | |
1209 | { | |
1210 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1211 | struct ras_err_handler_data *data; | |
1212 | int i = 0; | |
1213 | int ret = 0; | |
1214 | ||
1215 | if (!con || !con->eh_data || !bps || !count) | |
1216 | return -EINVAL; | |
1217 | ||
1218 | mutex_lock(&con->recovery_lock); | |
1219 | data = con->eh_data; | |
1220 | if (!data || data->count == 0) { | |
1221 | *bps = NULL; | |
1222 | goto out; | |
1223 | } | |
1224 | ||
1225 | *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); | |
1226 | if (!*bps) { | |
1227 | ret = -ENOMEM; | |
1228 | goto out; | |
1229 | } | |
1230 | ||
1231 | for (; i < data->count; i++) { | |
1232 | (*bps)[i] = (struct ras_badpage){ | |
1233 | .bp = data->bps[i].bp, | |
1234 | .size = AMDGPU_GPU_PAGE_SIZE, | |
1235 | .flags = 0, | |
1236 | }; | |
1237 | ||
1238 | if (data->last_reserved <= i) | |
1239 | (*bps)[i].flags = 1; | |
1240 | else if (data->bps[i].bo == NULL) | |
1241 | (*bps)[i].flags = 2; | |
1242 | } | |
1243 | ||
1244 | *count = data->count; | |
1245 | out: | |
1246 | mutex_unlock(&con->recovery_lock); | |
1247 | return ret; | |
1248 | } | |
1249 | ||
c030f2e4 | 1250 | static void amdgpu_ras_do_recovery(struct work_struct *work) |
1251 | { | |
1252 | struct amdgpu_ras *ras = | |
1253 | container_of(work, struct amdgpu_ras, recovery_work); | |
1254 | ||
1255 | amdgpu_device_gpu_recover(ras->adev, 0); | |
1256 | atomic_set(&ras->in_recovery, 0); | |
1257 | } | |
1258 | ||
1259 | static int amdgpu_ras_release_vram(struct amdgpu_device *adev, | |
1260 | struct amdgpu_bo **bo_ptr) | |
1261 | { | |
1262 | /* no need to free it actually. */ | |
1263 | amdgpu_bo_free_kernel(bo_ptr, NULL, NULL); | |
1264 | return 0; | |
1265 | } | |
1266 | ||
1267 | /* reserve vram with size@offset */ | |
1268 | static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, | |
1269 | uint64_t offset, uint64_t size, | |
1270 | struct amdgpu_bo **bo_ptr) | |
1271 | { | |
1272 | struct ttm_operation_ctx ctx = { false, false }; | |
1273 | struct amdgpu_bo_param bp; | |
1274 | int r = 0; | |
1275 | int i; | |
1276 | struct amdgpu_bo *bo; | |
1277 | ||
1278 | if (bo_ptr) | |
1279 | *bo_ptr = NULL; | |
1280 | memset(&bp, 0, sizeof(bp)); | |
1281 | bp.size = size; | |
1282 | bp.byte_align = PAGE_SIZE; | |
1283 | bp.domain = AMDGPU_GEM_DOMAIN_VRAM; | |
1284 | bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | | |
1285 | AMDGPU_GEM_CREATE_NO_CPU_ACCESS; | |
1286 | bp.type = ttm_bo_type_kernel; | |
1287 | bp.resv = NULL; | |
1288 | ||
1289 | r = amdgpu_bo_create(adev, &bp, &bo); | |
1290 | if (r) | |
1291 | return -EINVAL; | |
1292 | ||
1293 | r = amdgpu_bo_reserve(bo, false); | |
1294 | if (r) | |
1295 | goto error_reserve; | |
1296 | ||
1297 | offset = ALIGN(offset, PAGE_SIZE); | |
1298 | for (i = 0; i < bo->placement.num_placement; ++i) { | |
1299 | bo->placements[i].fpfn = offset >> PAGE_SHIFT; | |
1300 | bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; | |
1301 | } | |
1302 | ||
1303 | ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); | |
1304 | r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx); | |
1305 | if (r) | |
1306 | goto error_pin; | |
1307 | ||
1308 | r = amdgpu_bo_pin_restricted(bo, | |
1309 | AMDGPU_GEM_DOMAIN_VRAM, | |
1310 | offset, | |
1311 | offset + size); | |
1312 | if (r) | |
1313 | goto error_pin; | |
1314 | ||
1315 | if (bo_ptr) | |
1316 | *bo_ptr = bo; | |
1317 | ||
1318 | amdgpu_bo_unreserve(bo); | |
1319 | return r; | |
1320 | ||
1321 | error_pin: | |
1322 | amdgpu_bo_unreserve(bo); | |
1323 | error_reserve: | |
1324 | amdgpu_bo_unref(&bo); | |
1325 | return r; | |
1326 | } | |
1327 | ||
1328 | /* alloc/realloc bps array */ | |
1329 | static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, | |
1330 | struct ras_err_handler_data *data, int pages) | |
1331 | { | |
1332 | unsigned int old_space = data->count + data->space_left; | |
1333 | unsigned int new_space = old_space + pages; | |
1334 | unsigned int align_space = ALIGN(new_space, 1024); | |
1335 | void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); | |
1336 | ||
1337 | if (!tmp) | |
1338 | return -ENOMEM; | |
1339 | ||
1340 | if (data->bps) { | |
1341 | memcpy(tmp, data->bps, | |
1342 | data->count * sizeof(*data->bps)); | |
1343 | kfree(data->bps); | |
1344 | } | |
1345 | ||
1346 | data->bps = tmp; | |
1347 | data->space_left += align_space - old_space; | |
1348 | return 0; | |
1349 | } | |
1350 | ||
1351 | /* it deal with vram only. */ | |
1352 | int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, | |
1353 | unsigned long *bps, int pages) | |
1354 | { | |
1355 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
73aa8e1a | 1356 | struct ras_err_handler_data *data; |
c030f2e4 | 1357 | int i = pages; |
1358 | int ret = 0; | |
1359 | ||
73aa8e1a | 1360 | if (!con || !con->eh_data || !bps || pages <= 0) |
c030f2e4 | 1361 | return 0; |
1362 | ||
1363 | mutex_lock(&con->recovery_lock); | |
73aa8e1a | 1364 | data = con->eh_data; |
c030f2e4 | 1365 | if (!data) |
1366 | goto out; | |
1367 | ||
1368 | if (data->space_left <= pages) | |
1369 | if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) { | |
1370 | ret = -ENOMEM; | |
1371 | goto out; | |
1372 | } | |
1373 | ||
1374 | while (i--) | |
1375 | data->bps[data->count++].bp = bps[i]; | |
1376 | ||
1377 | data->space_left -= pages; | |
1378 | out: | |
1379 | mutex_unlock(&con->recovery_lock); | |
1380 | ||
1381 | return ret; | |
1382 | } | |
1383 | ||
1384 | /* called in gpu recovery/init */ | |
1385 | int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) | |
1386 | { | |
1387 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
73aa8e1a | 1388 | struct ras_err_handler_data *data; |
c030f2e4 | 1389 | uint64_t bp; |
1390 | struct amdgpu_bo *bo; | |
1391 | int i; | |
1392 | ||
73aa8e1a | 1393 | if (!con || !con->eh_data) |
c030f2e4 | 1394 | return 0; |
1395 | ||
1396 | mutex_lock(&con->recovery_lock); | |
73aa8e1a | 1397 | data = con->eh_data; |
1398 | if (!data) | |
1399 | goto out; | |
c030f2e4 | 1400 | /* reserve vram at driver post stage. */ |
1401 | for (i = data->last_reserved; i < data->count; i++) { | |
1402 | bp = data->bps[i].bp; | |
1403 | ||
1404 | if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, | |
1405 | PAGE_SIZE, &bo)) | |
1406 | DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); | |
1407 | ||
1408 | data->bps[i].bo = bo; | |
1409 | data->last_reserved = i + 1; | |
1410 | } | |
73aa8e1a | 1411 | out: |
c030f2e4 | 1412 | mutex_unlock(&con->recovery_lock); |
1413 | return 0; | |
1414 | } | |
1415 | ||
1416 | /* called when driver unload */ | |
1417 | static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) | |
1418 | { | |
1419 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
73aa8e1a | 1420 | struct ras_err_handler_data *data; |
c030f2e4 | 1421 | struct amdgpu_bo *bo; |
1422 | int i; | |
1423 | ||
73aa8e1a | 1424 | if (!con || !con->eh_data) |
c030f2e4 | 1425 | return 0; |
1426 | ||
1427 | mutex_lock(&con->recovery_lock); | |
73aa8e1a | 1428 | data = con->eh_data; |
1429 | if (!data) | |
1430 | goto out; | |
1431 | ||
c030f2e4 | 1432 | for (i = data->last_reserved - 1; i >= 0; i--) { |
1433 | bo = data->bps[i].bo; | |
1434 | ||
1435 | amdgpu_ras_release_vram(adev, &bo); | |
1436 | ||
1437 | data->bps[i].bo = bo; | |
1438 | data->last_reserved = i; | |
1439 | } | |
73aa8e1a | 1440 | out: |
c030f2e4 | 1441 | mutex_unlock(&con->recovery_lock); |
1442 | return 0; | |
1443 | } | |
1444 | ||
/*
 * Placeholder for persisting the bad-page array.
 *
 * TODO: write the array to EEPROM when the SMU is disabled.
 * Currently does nothing and returns 0.
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	return 0;
}
1452 | ||
/*
 * Placeholder for restoring the bad-page array.
 *
 * TODO: read the array from EEPROM when the SMU is disabled.
 * Currently does nothing and returns 0.
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	return 0;
}
1460 | ||
1461 | static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) | |
1462 | { | |
1463 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1464 | struct ras_err_handler_data **data = &con->eh_data; | |
1465 | ||
1466 | *data = kmalloc(sizeof(**data), | |
1467 | GFP_KERNEL|__GFP_ZERO); | |
1468 | if (!*data) | |
1469 | return -ENOMEM; | |
1470 | ||
1471 | mutex_init(&con->recovery_lock); | |
1472 | INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); | |
1473 | atomic_set(&con->in_recovery, 0); | |
1474 | con->adev = adev; | |
1475 | ||
1476 | amdgpu_ras_load_bad_pages(adev); | |
1477 | amdgpu_ras_reserve_bad_pages(adev); | |
1478 | ||
1479 | return 0; | |
1480 | } | |
1481 | ||
1482 | static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) | |
1483 | { | |
1484 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1485 | struct ras_err_handler_data *data = con->eh_data; | |
1486 | ||
1487 | cancel_work_sync(&con->recovery_work); | |
1488 | amdgpu_ras_save_bad_pages(adev); | |
1489 | amdgpu_ras_release_bad_pages(adev); | |
1490 | ||
1491 | mutex_lock(&con->recovery_lock); | |
1492 | con->eh_data = NULL; | |
1493 | kfree(data->bps); | |
1494 | kfree(data); | |
1495 | mutex_unlock(&con->recovery_lock); | |
1496 | ||
1497 | return 0; | |
1498 | } | |
1499 | /* recovery end */ | |
1500 | ||
a564808e | 1501 | /* return 0 if ras will reset gpu and repost.*/ |
1502 | int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, | |
1503 | unsigned int block) | |
1504 | { | |
1505 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); | |
1506 | ||
1507 | if (!ras) | |
1508 | return -EINVAL; | |
1509 | ||
1510 | ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; | |
1511 | return 0; | |
1512 | } | |
1513 | ||
5caf466a | 1514 | /* |
1515 | * check hardware's ras ability which will be saved in hw_supported. | |
1516 | * if hardware does not support ras, we can skip some ras initializtion and | |
1517 | * forbid some ras operations from IP. | |
1518 | * if software itself, say boot parameter, limit the ras ability. We still | |
1519 | * need allow IP do some limited operations, like disable. In such case, | |
1520 | * we have to initialize ras as normal. but need check if operation is | |
1521 | * allowed or not in each function. | |
1522 | */ | |
1523 | static void amdgpu_ras_check_supported(struct amdgpu_device *adev, | |
1524 | uint32_t *hw_supported, uint32_t *supported) | |
c030f2e4 | 1525 | { |
5caf466a | 1526 | *hw_supported = 0; |
1527 | *supported = 0; | |
c030f2e4 | 1528 | |
5caf466a | 1529 | if (amdgpu_sriov_vf(adev) || |
b404ae82 | 1530 | adev->asic_type != CHIP_VEGA20) |
5caf466a | 1531 | return; |
b404ae82 | 1532 | |
5d0f903f | 1533 | if (adev->is_atom_fw && |
1534 | (amdgpu_atomfirmware_mem_ecc_supported(adev) || | |
1535 | amdgpu_atomfirmware_sram_ecc_supported(adev))) | |
5caf466a | 1536 | *hw_supported = AMDGPU_RAS_BLOCK_MASK; |
b404ae82 | 1537 | |
5caf466a | 1538 | *supported = amdgpu_ras_enable == 0 ? |
1539 | 0 : *hw_supported & amdgpu_ras_mask; | |
c030f2e4 | 1540 | } |
1541 | ||
1542 | int amdgpu_ras_init(struct amdgpu_device *adev) | |
1543 | { | |
1544 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
c030f2e4 | 1545 | |
b404ae82 | 1546 | if (con) |
c030f2e4 | 1547 | return 0; |
1548 | ||
1549 | con = kmalloc(sizeof(struct amdgpu_ras) + | |
1550 | sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, | |
1551 | GFP_KERNEL|__GFP_ZERO); | |
1552 | if (!con) | |
1553 | return -ENOMEM; | |
1554 | ||
1555 | con->objs = (struct ras_manager *)(con + 1); | |
1556 | ||
1557 | amdgpu_ras_set_context(adev, con); | |
1558 | ||
5caf466a | 1559 | amdgpu_ras_check_supported(adev, &con->hw_supported, |
1560 | &con->supported); | |
5f872b72 HZ |
1561 | if (!con->hw_supported) { |
1562 | amdgpu_ras_set_context(adev, NULL); | |
1563 | kfree(con); | |
1564 | return 0; | |
1565 | } | |
1566 | ||
c030f2e4 | 1567 | con->features = 0; |
1568 | INIT_LIST_HEAD(&con->head); | |
108c6a63 | 1569 | /* Might need get this flag from vbios. */ |
1570 | con->flags = RAS_DEFAULT_FLAGS; | |
c030f2e4 | 1571 | |
1572 | if (amdgpu_ras_recovery_init(adev)) | |
1573 | goto recovery_out; | |
1574 | ||
1575 | amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; | |
1576 | ||
c030f2e4 | 1577 | if (amdgpu_ras_fs_init(adev)) |
1578 | goto fs_out; | |
1579 | ||
5d0f903f | 1580 | DRM_INFO("RAS INFO: ras initialized successfully, " |
1581 | "hardware ability[%x] ras_mask[%x]\n", | |
1582 | con->hw_supported, con->supported); | |
c030f2e4 | 1583 | return 0; |
1584 | fs_out: | |
1585 | amdgpu_ras_recovery_fini(adev); | |
1586 | recovery_out: | |
1587 | amdgpu_ras_set_context(adev, NULL); | |
1588 | kfree(con); | |
1589 | ||
1590 | return -EINVAL; | |
1591 | } | |
1592 | ||
a564808e | 1593 | /* do some init work after IP late init as dependence. |
511fdbc3 | 1594 | * and it runs in resume/gpu reset/booting up cases. |
a564808e | 1595 | */ |
511fdbc3 | 1596 | void amdgpu_ras_resume(struct amdgpu_device *adev) |
108c6a63 | 1597 | { |
1598 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1599 | struct ras_manager *obj, *tmp; | |
1600 | ||
1601 | if (!con) | |
1602 | return; | |
1603 | ||
108c6a63 | 1604 | if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { |
191051a1 | 1605 | /* Set up all other IPs which are not implemented. There is a |
1606 | * tricky thing that IP's actual ras error type should be | |
1607 | * MULTI_UNCORRECTABLE, but as driver does not handle it, so | |
1608 | * ERROR_NONE make sense anyway. | |
1609 | */ | |
1610 | amdgpu_ras_enable_all_features(adev, 1); | |
1611 | ||
1612 | /* We enable ras on all hw_supported block, but as boot | |
1613 | * parameter might disable some of them and one or more IP has | |
1614 | * not implemented yet. So we disable them on behalf. | |
1615 | */ | |
108c6a63 | 1616 | list_for_each_entry_safe(obj, tmp, &con->head, node) { |
1617 | if (!amdgpu_ras_is_supported(adev, obj->head.block)) { | |
1618 | amdgpu_ras_feature_enable(adev, &obj->head, 0); | |
1619 | /* there should be no any reference. */ | |
1620 | WARN_ON(alive_obj(obj)); | |
1621 | } | |
191051a1 | 1622 | } |
108c6a63 | 1623 | } |
a564808e | 1624 | |
1625 | if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { | |
1626 | con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; | |
1627 | /* setup ras obj state as disabled. | |
1628 | * for init_by_vbios case. | |
1629 | * if we want to enable ras, just enable it in a normal way. | |
1630 | * If we want do disable it, need setup ras obj as enabled, | |
1631 | * then issue another TA disable cmd. | |
1632 | * See feature_enable_on_boot | |
1633 | */ | |
1634 | amdgpu_ras_disable_all_features(adev, 1); | |
1635 | amdgpu_ras_reset_gpu(adev, 0); | |
1636 | } | |
108c6a63 | 1637 | } |
1638 | ||
511fdbc3 | 1639 | void amdgpu_ras_suspend(struct amdgpu_device *adev) |
1640 | { | |
1641 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1642 | ||
1643 | if (!con) | |
1644 | return; | |
1645 | ||
1646 | amdgpu_ras_disable_all_features(adev, 0); | |
1647 | /* Make sure all ras objects are disabled. */ | |
1648 | if (con->features) | |
1649 | amdgpu_ras_disable_all_features(adev, 1); | |
1650 | } | |
1651 | ||
/*
 * Fini work that must run before the IPs are torn down: disable RAS on
 * all IPs, then tear down the recovery state. Always returns 0; does
 * nothing without a RAS context.
 */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con) {
		/* RAS must be disabled on all IPs before ip [hw/sw]fini. */
		amdgpu_ras_disable_all_features(adev, 0);
		amdgpu_ras_recovery_fini(adev);
	}

	return 0;
}
1665 | ||
1666 | int amdgpu_ras_fini(struct amdgpu_device *adev) | |
1667 | { | |
1668 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | |
1669 | ||
1670 | if (!con) | |
1671 | return 0; | |
1672 | ||
1673 | amdgpu_ras_fs_fini(adev); | |
1674 | amdgpu_ras_interrupt_remove_all(adev); | |
1675 | ||
1676 | WARN(con->features, "Feature mask is not cleared"); | |
1677 | ||
1678 | if (con->features) | |
1679 | amdgpu_ras_disable_all_features(adev, 1); | |
1680 | ||
1681 | amdgpu_ras_set_context(adev, NULL); | |
1682 | kfree(con); | |
1683 | ||
1684 | return 0; | |
1685 | } |