1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_fb_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/vgaarb.h>
46 #include <linux/vga_switcheroo.h>
47 #include <linux/efi.h>
48 #include "amdgpu.h"
49 #include "amdgpu_trace.h"
50 #include "amdgpu_i2c.h"
51 #include "atom.h"
52 #include "amdgpu_atombios.h"
53 #include "amdgpu_atomfirmware.h"
54 #include "amd_pcie.h"
55 #ifdef CONFIG_DRM_AMDGPU_SI
56 #include "si.h"
57 #endif
58 #ifdef CONFIG_DRM_AMDGPU_CIK
59 #include "cik.h"
60 #endif
61 #include "vi.h"
62 #include "soc15.h"
63 #include "nv.h"
64 #include "bif/bif_4_1_d.h"
65 #include <linux/firmware.h>
66 #include "amdgpu_vf_error.h"
67
68 #include "amdgpu_amdkfd.h"
69 #include "amdgpu_pm.h"
70
71 #include "amdgpu_xgmi.h"
72 #include "amdgpu_ras.h"
73 #include "amdgpu_pmu.h"
74 #include "amdgpu_fru_eeprom.h"
75 #include "amdgpu_reset.h"
76
77 #include <linux/suspend.h>
78 #include <drm/task_barrier.h>
79 #include <linux/pm_runtime.h>
80
81 #include <drm/drm_drv.h>
82
83 #if IS_ENABLED(CONFIG_X86)
84 #include <asm/intel-family.h>
85 #endif
86
87 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
88 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
89 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
94
95 #define AMDGPU_RESUME_MS 2000
96 #define AMDGPU_MAX_RETRY_LIMIT 2
97 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
98
99 static const struct drm_driver amdgpu_kms_driver;
100
101 const char *amdgpu_asic_name[] = {
102 "TAHITI",
103 "PITCAIRN",
104 "VERDE",
105 "OLAND",
106 "HAINAN",
107 "BONAIRE",
108 "KAVERI",
109 "KABINI",
110 "HAWAII",
111 "MULLINS",
112 "TOPAZ",
113 "TONGA",
114 "FIJI",
115 "CARRIZO",
116 "STONEY",
117 "POLARIS10",
118 "POLARIS11",
119 "POLARIS12",
120 "VEGAM",
121 "VEGA10",
122 "VEGA12",
123 "VEGA20",
124 "RAVEN",
125 "ARCTURUS",
126 "RENOIR",
127 "ALDEBARAN",
128 "NAVI10",
129 "CYAN_SKILLFISH",
130 "NAVI14",
131 "NAVI12",
132 "SIENNA_CICHLID",
133 "NAVY_FLOUNDER",
134 "VANGOGH",
135 "DIMGREY_CAVEFISH",
136 "BEIGE_GOBY",
137 "YELLOW_CARP",
138 "IP DISCOVERY",
139 "LAST",
140 };
141
142 /**
143 * DOC: pcie_replay_count
144 *
145 * The amdgpu driver provides a sysfs API for reporting the total number
146  * of PCIe replays (NAKs).
147  * The file pcie_replay_count is used for this and returns the total
148  * number of replays as the sum of NAKs generated and NAKs received.
149 */
150
151 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
152 struct device_attribute *attr, char *buf)
153 {
154 struct drm_device *ddev = dev_get_drvdata(dev);
155 struct amdgpu_device *adev = drm_to_adev(ddev);
156 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
157
158 return sysfs_emit(buf, "%llu\n", cnt);
159 }
160
161 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
162 amdgpu_device_get_pcie_replay_count, NULL);
163
164 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
165
166 /**
167 * DOC: product_name
168 *
169 * The amdgpu driver provides a sysfs API for reporting the product name
170  * for the device.
171  * The file product_name is used for this and returns the product name
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards.
174 */
175
176 static ssize_t amdgpu_device_get_product_name(struct device *dev,
177 struct device_attribute *attr, char *buf)
178 {
179 struct drm_device *ddev = dev_get_drvdata(dev);
180 struct amdgpu_device *adev = drm_to_adev(ddev);
181
182 return sysfs_emit(buf, "%s\n", adev->product_name);
183 }
184
185 static DEVICE_ATTR(product_name, S_IRUGO,
186 amdgpu_device_get_product_name, NULL);
187
188 /**
189 * DOC: product_number
190 *
191 * The amdgpu driver provides a sysfs API for reporting the part number
192  * for the device.
193  * The file product_number is used for this and returns the part number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards.
196 */
197
198 static ssize_t amdgpu_device_get_product_number(struct device *dev,
199 struct device_attribute *attr, char *buf)
200 {
201 struct drm_device *ddev = dev_get_drvdata(dev);
202 struct amdgpu_device *adev = drm_to_adev(ddev);
203
204 return sysfs_emit(buf, "%s\n", adev->product_number);
205 }
206
207 static DEVICE_ATTR(product_number, S_IRUGO,
208 amdgpu_device_get_product_number, NULL);
209
210 /**
211 * DOC: serial_number
212 *
213 * The amdgpu driver provides a sysfs API for reporting the serial number
214  * for the device.
215  * The file serial_number is used for this and returns the serial number
216  * as returned from the FRU.
217  * NOTE: This is only available for certain server cards.
218 */
219
220 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
221 struct device_attribute *attr, char *buf)
222 {
223 struct drm_device *ddev = dev_get_drvdata(dev);
224 struct amdgpu_device *adev = drm_to_adev(ddev);
225
226 return sysfs_emit(buf, "%s\n", adev->serial);
227 }
228
229 static DEVICE_ATTR(serial_number, S_IRUGO,
230 amdgpu_device_get_serial_number, NULL);
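/*
 * Illustrative usage sketch (assumed sysfs path, may vary per system, not
 * part of the original file): the attributes above are exposed through sysfs
 * on the GPU's PCI device, e.g.:
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *   cat /sys/class/drm/card0/device/serial_number
 */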
231
232 /**
233 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
234 *
235 * @dev: drm_device pointer
236 *
237 * Returns true if the device is a dGPU with ATPX power control,
238 * otherwise return false.
239 */
240 bool amdgpu_device_supports_px(struct drm_device *dev)
241 {
242 struct amdgpu_device *adev = drm_to_adev(dev);
243
244 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
245 return true;
246 return false;
247 }
248
249 /**
250 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
251 *
252 * @dev: drm_device pointer
253 *
254 * Returns true if the device is a dGPU with ACPI power control,
255 * otherwise return false.
256 */
257 bool amdgpu_device_supports_boco(struct drm_device *dev)
258 {
259 struct amdgpu_device *adev = drm_to_adev(dev);
260
261 if (adev->has_pr3 ||
262 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
263 return true;
264 return false;
265 }
266
267 /**
268 * amdgpu_device_supports_baco - Does the device support BACO
269 *
270 * @dev: drm_device pointer
271 *
272  * Returns true if the device supports BACO,
273 * otherwise return false.
274 */
275 bool amdgpu_device_supports_baco(struct drm_device *dev)
276 {
277 struct amdgpu_device *adev = drm_to_adev(dev);
278
279 return amdgpu_asic_supports_baco(adev);
280 }
281
282 /**
283 * amdgpu_device_supports_smart_shift - Is the device dGPU with
284 * smart shift support
285 *
286 * @dev: drm_device pointer
287 *
288 * Returns true if the device is a dGPU with Smart Shift support,
289 * otherwise returns false.
290 */
291 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
292 {
293 return (amdgpu_device_supports_boco(dev) &&
294 amdgpu_acpi_is_power_shift_control_supported());
295 }
296
297 /*
298 * VRAM access helper functions
299 */
300
301 /**
302 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
303 *
304 * @adev: amdgpu_device pointer
305 * @pos: offset of the buffer in vram
306 * @buf: virtual address of the buffer in system memory
307  * @size: read/write size, the buffer at @buf must be at least @size bytes
308 * @write: true - write to vram, otherwise - read from vram
309 */
310 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
311 void *buf, size_t size, bool write)
312 {
313 unsigned long flags;
314 uint32_t hi = ~0, tmp = 0;
315 uint32_t *data = buf;
316 uint64_t last;
317 int idx;
318
319 if (!drm_dev_enter(adev_to_drm(adev), &idx))
320 return;
321
322 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
323
324 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
325 for (last = pos + size; pos < last; pos += 4) {
326 tmp = pos >> 31;
327
328 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
329 if (tmp != hi) {
330 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
331 hi = tmp;
332 }
333 if (write)
334 WREG32_NO_KIQ(mmMM_DATA, *data++);
335 else
336 *data++ = RREG32_NO_KIQ(mmMM_DATA);
337 }
338
339 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
340 drm_dev_exit(idx);
341 }
342
343 /**
344  * amdgpu_device_aper_access - access vram via the vram aperture
345 *
346 * @adev: amdgpu_device pointer
347 * @pos: offset of the buffer in vram
348 * @buf: virtual address of the buffer in system memory
349  * @size: read/write size, the buffer at @buf must be at least @size bytes
350 * @write: true - write to vram, otherwise - read from vram
351 *
352  * Returns the number of bytes transferred.
353 */
354 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
355 void *buf, size_t size, bool write)
356 {
357 #ifdef CONFIG_64BIT
358 void __iomem *addr;
359 size_t count = 0;
360 uint64_t last;
361
362 if (!adev->mman.aper_base_kaddr)
363 return 0;
364
365 last = min(pos + size, adev->gmc.visible_vram_size);
366 if (last > pos) {
367 addr = adev->mman.aper_base_kaddr + pos;
368 count = last - pos;
369
370 if (write) {
371 memcpy_toio(addr, buf, count);
372 mb();
373 amdgpu_device_flush_hdp(adev, NULL);
374 } else {
375 amdgpu_device_invalidate_hdp(adev, NULL);
376 mb();
377 memcpy_fromio(buf, addr, count);
378 }
379
380 }
381
382 return count;
383 #else
384 return 0;
385 #endif
386 }
387
388 /**
389 * amdgpu_device_vram_access - read/write a buffer in vram
390 *
391 * @adev: amdgpu_device pointer
392 * @pos: offset of the buffer in vram
393 * @buf: virtual address of the buffer in system memory
394  * @size: read/write size, the buffer at @buf must be at least @size bytes
395 * @write: true - write to vram, otherwise - read from vram
396 */
397 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
398 void *buf, size_t size, bool write)
399 {
400 size_t count;
401
402 	/* try using the vram aperture to access vram first */
403 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
404 size -= count;
405 if (size) {
406 		/* use MM to access the rest of vram */
407 pos += count;
408 buf += count;
409 amdgpu_device_mm_access(adev, pos, buf, size, write);
410 }
411 }
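/*
 * Illustrative caller sketch (assumed helper, not from the original file):
 * read a few dwords from VRAM and write one back. Offsets and sizes must be
 * dword aligned, as amdgpu_device_mm_access() requires.
 */
static void amdgpu_device_vram_access_example(struct amdgpu_device *adev)
{
	u32 data[4] = {};

	/* read 16 bytes starting at VRAM offset 0x1000 */
	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);

	/* patch the first dword and write it back */
	data[0] |= 0x1;
	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data[0]), true);
}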
412
413 /*
414 * register access helper functions.
415 */
416
417 /* Check if hw access should be skipped because of hotplug or device error */
418 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
419 {
420 if (adev->no_hw_access)
421 return true;
422
423 #ifdef CONFIG_LOCKDEP
424 /*
425 * This is a bit complicated to understand, so worth a comment. What we assert
426 * here is that the GPU reset is not running on another thread in parallel.
427 *
428 * For this we trylock the read side of the reset semaphore, if that succeeds
429 	 * we know that the reset is not running in parallel.
430 *
431 * If the trylock fails we assert that we are either already holding the read
432 * side of the lock or are the reset thread itself and hold the write side of
433 * the lock.
434 */
435 if (in_task()) {
436 if (down_read_trylock(&adev->reset_domain->sem))
437 up_read(&adev->reset_domain->sem);
438 else
439 lockdep_assert_held(&adev->reset_domain->sem);
440 }
441 #endif
442 return false;
443 }
444
445 /**
446 * amdgpu_device_rreg - read a memory mapped IO or indirect register
447 *
448 * @adev: amdgpu_device pointer
449 * @reg: dword aligned register offset
450 * @acc_flags: access flags which require special behavior
451 *
452 * Returns the 32 bit value from the offset specified.
453 */
454 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
455 uint32_t reg, uint32_t acc_flags)
456 {
457 uint32_t ret;
458
459 if (amdgpu_device_skip_hw_access(adev))
460 return 0;
461
462 if ((reg * 4) < adev->rmmio_size) {
463 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
464 amdgpu_sriov_runtime(adev) &&
465 down_read_trylock(&adev->reset_domain->sem)) {
466 ret = amdgpu_kiq_rreg(adev, reg);
467 up_read(&adev->reset_domain->sem);
468 } else {
469 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
470 }
471 } else {
472 ret = adev->pcie_rreg(adev, reg * 4);
473 }
474
475 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
476
477 return ret;
478 }
479
480 /*
481  * MMIO register byte read helper function
482  * @offset: byte offset from MMIO start
483 *
484 */
485
486 /**
487 * amdgpu_mm_rreg8 - read a memory mapped IO register
488 *
489 * @adev: amdgpu_device pointer
490 * @offset: byte aligned register offset
491 *
492 * Returns the 8 bit value from the offset specified.
493 */
494 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
495 {
496 if (amdgpu_device_skip_hw_access(adev))
497 return 0;
498
499 if (offset < adev->rmmio_size)
500 return (readb(adev->rmmio + offset));
501 BUG();
502 }
503
504 /*
505  * MMIO register byte write helper function
506  * @offset: byte offset from MMIO start
507  * @value: the value to be written to the register
508 *
509 */
510 /**
511  * amdgpu_mm_wreg8 - write a memory mapped IO register
512 *
513 * @adev: amdgpu_device pointer
514 * @offset: byte aligned register offset
515 * @value: 8 bit value to write
516 *
517 * Writes the value specified to the offset specified.
518 */
519 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
520 {
521 if (amdgpu_device_skip_hw_access(adev))
522 return;
523
524 if (offset < adev->rmmio_size)
525 writeb(value, adev->rmmio + offset);
526 else
527 BUG();
528 }
529
530 /**
531 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
532 *
533 * @adev: amdgpu_device pointer
534 * @reg: dword aligned register offset
535 * @v: 32 bit value to write to the register
536 * @acc_flags: access flags which require special behavior
537 *
538 * Writes the value specified to the offset specified.
539 */
540 void amdgpu_device_wreg(struct amdgpu_device *adev,
541 uint32_t reg, uint32_t v,
542 uint32_t acc_flags)
543 {
544 if (amdgpu_device_skip_hw_access(adev))
545 return;
546
547 if ((reg * 4) < adev->rmmio_size) {
548 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
549 amdgpu_sriov_runtime(adev) &&
550 down_read_trylock(&adev->reset_domain->sem)) {
551 amdgpu_kiq_wreg(adev, reg, v);
552 up_read(&adev->reset_domain->sem);
553 } else {
554 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
555 }
556 } else {
557 adev->pcie_wreg(adev, reg * 4, v);
558 }
559
560 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
561 }
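/*
 * Illustrative sketch (assumed example with a placeholder register offset,
 * not from the original file): most driver code reaches amdgpu_device_rreg()
 * and amdgpu_device_wreg() through the RREG32()/WREG32() style macros, e.g.
 * for a simple read-modify-write.
 */
static void amdgpu_device_reg_rmw_example(struct amdgpu_device *adev)
{
	u32 tmp;

	tmp = RREG32(0x315c);	/* dword register offset, placeholder */
	tmp |= 0x1;
	WREG32(0x315c, tmp);
}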
562
563 /**
564 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
565 *
566 * @adev: amdgpu_device pointer
567 * @reg: mmio/rlc register
568 * @v: value to write
569 *
570  * This function is invoked only for debugfs register access.
571 */
572 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
573 uint32_t reg, uint32_t v)
574 {
575 if (amdgpu_device_skip_hw_access(adev))
576 return;
577
578 if (amdgpu_sriov_fullaccess(adev) &&
579 adev->gfx.rlc.funcs &&
580 adev->gfx.rlc.funcs->is_rlcg_access_range) {
581 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
582 return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
583 } else if ((reg * 4) >= adev->rmmio_size) {
584 adev->pcie_wreg(adev, reg * 4, v);
585 } else {
586 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
587 }
588 }
589
590 /**
591 * amdgpu_mm_rdoorbell - read a doorbell dword
592 *
593 * @adev: amdgpu_device pointer
594 * @index: doorbell index
595 *
596 * Returns the value in the doorbell aperture at the
597 * requested doorbell index (CIK).
598 */
599 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
600 {
601 if (amdgpu_device_skip_hw_access(adev))
602 return 0;
603
604 if (index < adev->doorbell.num_doorbells) {
605 return readl(adev->doorbell.ptr + index);
606 } else {
607 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
608 return 0;
609 }
610 }
611
612 /**
613 * amdgpu_mm_wdoorbell - write a doorbell dword
614 *
615 * @adev: amdgpu_device pointer
616 * @index: doorbell index
617 * @v: value to write
618 *
619 * Writes @v to the doorbell aperture at the
620 * requested doorbell index (CIK).
621 */
622 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
623 {
624 if (amdgpu_device_skip_hw_access(adev))
625 return;
626
627 if (index < adev->doorbell.num_doorbells) {
628 writel(v, adev->doorbell.ptr + index);
629 } else {
630 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
631 }
632 }
633
634 /**
635 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
636 *
637 * @adev: amdgpu_device pointer
638 * @index: doorbell index
639 *
640 * Returns the value in the doorbell aperture at the
641 * requested doorbell index (VEGA10+).
642 */
643 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
644 {
645 if (amdgpu_device_skip_hw_access(adev))
646 return 0;
647
648 if (index < adev->doorbell.num_doorbells) {
649 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
650 } else {
651 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
652 return 0;
653 }
654 }
655
656 /**
657 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
658 *
659 * @adev: amdgpu_device pointer
660 * @index: doorbell index
661 * @v: value to write
662 *
663 * Writes @v to the doorbell aperture at the
664 * requested doorbell index (VEGA10+).
665 */
666 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
667 {
668 if (amdgpu_device_skip_hw_access(adev))
669 return;
670
671 if (index < adev->doorbell.num_doorbells) {
672 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
673 } else {
674 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
675 }
676 }
677
678 /**
679 * amdgpu_device_indirect_rreg - read an indirect register
680 *
681 * @adev: amdgpu_device pointer
682 * @pcie_index: mmio register offset
683 * @pcie_data: mmio register offset
684 * @reg_addr: indirect register address to read from
685 *
686 * Returns the value of indirect register @reg_addr
687 */
688 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
689 u32 pcie_index, u32 pcie_data,
690 u32 reg_addr)
691 {
692 unsigned long flags;
693 u32 r;
694 void __iomem *pcie_index_offset;
695 void __iomem *pcie_data_offset;
696
697 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
698 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
699 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
700
701 writel(reg_addr, pcie_index_offset);
702 readl(pcie_index_offset);
703 r = readl(pcie_data_offset);
704 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705
706 return r;
707 }
708
709 /**
710 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
711 *
712 * @adev: amdgpu_device pointer
713 * @pcie_index: mmio register offset
714 * @pcie_data: mmio register offset
715 * @reg_addr: indirect register address to read from
716 *
717 * Returns the value of indirect register @reg_addr
718 */
719 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
720 u32 pcie_index, u32 pcie_data,
721 u32 reg_addr)
722 {
723 unsigned long flags;
724 u64 r;
725 void __iomem *pcie_index_offset;
726 void __iomem *pcie_data_offset;
727
728 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
731
732 /* read low 32 bits */
733 writel(reg_addr, pcie_index_offset);
734 readl(pcie_index_offset);
735 r = readl(pcie_data_offset);
736 /* read high 32 bits */
737 writel(reg_addr + 4, pcie_index_offset);
738 readl(pcie_index_offset);
739 r |= ((u64)readl(pcie_data_offset) << 32);
740 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
741
742 return r;
743 }
744
745 /**
746 * amdgpu_device_indirect_wreg - write an indirect register address
747 *
748 * @adev: amdgpu_device pointer
749 * @pcie_index: mmio register offset
750 * @pcie_data: mmio register offset
751 * @reg_addr: indirect register offset
752 * @reg_data: indirect register data
753 *
754 */
755 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
756 u32 pcie_index, u32 pcie_data,
757 u32 reg_addr, u32 reg_data)
758 {
759 unsigned long flags;
760 void __iomem *pcie_index_offset;
761 void __iomem *pcie_data_offset;
762
763 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
764 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
765 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
766
767 writel(reg_addr, pcie_index_offset);
768 readl(pcie_index_offset);
769 writel(reg_data, pcie_data_offset);
770 readl(pcie_data_offset);
771 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
772 }
773
774 /**
775 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
776 *
777 * @adev: amdgpu_device pointer
778 * @pcie_index: mmio register offset
779 * @pcie_data: mmio register offset
780 * @reg_addr: indirect register offset
781 * @reg_data: indirect register data
782 *
783 */
784 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
785 u32 pcie_index, u32 pcie_data,
786 u32 reg_addr, u64 reg_data)
787 {
788 unsigned long flags;
789 void __iomem *pcie_index_offset;
790 void __iomem *pcie_data_offset;
791
792 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
793 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
794 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
795
796 /* write low 32 bits */
797 writel(reg_addr, pcie_index_offset);
798 readl(pcie_index_offset);
799 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
800 readl(pcie_data_offset);
801 /* write high 32 bits */
802 writel(reg_addr + 4, pcie_index_offset);
803 readl(pcie_index_offset);
804 writel((u32)(reg_data >> 32), pcie_data_offset);
805 readl(pcie_data_offset);
806 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
807 }
808
809 /**
810 * amdgpu_invalid_rreg - dummy reg read function
811 *
812 * @adev: amdgpu_device pointer
813 * @reg: offset of register
814 *
815 * Dummy register read function. Used for register blocks
816 * that certain asics don't have (all asics).
817 * Returns the value in the register.
818 */
819 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
820 {
821 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
822 BUG();
823 return 0;
824 }
825
826 /**
827 * amdgpu_invalid_wreg - dummy reg write function
828 *
829 * @adev: amdgpu_device pointer
830 * @reg: offset of register
831 * @v: value to write to the register
832 *
833  * Dummy register write function. Used for register blocks
834 * that certain asics don't have (all asics).
835 */
836 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
837 {
838 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
839 reg, v);
840 BUG();
841 }
842
843 /**
844 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
845 *
846 * @adev: amdgpu_device pointer
847 * @reg: offset of register
848 *
849 * Dummy register read function. Used for register blocks
850 * that certain asics don't have (all asics).
851 * Returns the value in the register.
852 */
853 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
854 {
855 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
856 BUG();
857 return 0;
858 }
859
860 /**
861  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
862 *
863 * @adev: amdgpu_device pointer
864 * @reg: offset of register
865 * @v: value to write to the register
866 *
867  * Dummy register write function. Used for register blocks
868 * that certain asics don't have (all asics).
869 */
870 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
871 {
872 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
873 reg, v);
874 BUG();
875 }
876
877 /**
878 * amdgpu_block_invalid_rreg - dummy reg read function
879 *
880 * @adev: amdgpu_device pointer
881 * @block: offset of instance
882 * @reg: offset of register
883 *
884 * Dummy register read function. Used for register blocks
885 * that certain asics don't have (all asics).
886 * Returns the value in the register.
887 */
888 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
889 uint32_t block, uint32_t reg)
890 {
891 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
892 reg, block);
893 BUG();
894 return 0;
895 }
896
897 /**
898 * amdgpu_block_invalid_wreg - dummy reg write function
899 *
900 * @adev: amdgpu_device pointer
901 * @block: offset of instance
902 * @reg: offset of register
903 * @v: value to write to the register
904 *
905  * Dummy register write function. Used for register blocks
906 * that certain asics don't have (all asics).
907 */
908 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
909 uint32_t block,
910 uint32_t reg, uint32_t v)
911 {
912 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
913 reg, block, v);
914 BUG();
915 }
916
917 /**
918 * amdgpu_device_asic_init - Wrapper for atom asic_init
919 *
920 * @adev: amdgpu_device pointer
921 *
922 * Does any asic specific work and then calls atom asic init.
923 */
924 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
925 {
926 amdgpu_asic_pre_asic_init(adev);
927
928 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
929 return amdgpu_atomfirmware_asic_init(adev, true);
930 else
931 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
932 }
933
934 /**
935 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
936 *
937 * @adev: amdgpu_device pointer
938 *
939 * Allocates a scratch page of VRAM for use by various things in the
940 * driver.
941 */
942 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
943 {
944 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
945 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
946 &adev->vram_scratch.robj,
947 &adev->vram_scratch.gpu_addr,
948 (void **)&adev->vram_scratch.ptr);
949 }
950
951 /**
952 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
953 *
954 * @adev: amdgpu_device pointer
955 *
956 * Frees the VRAM scratch page.
957 */
958 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
959 {
960 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
961 }
962
963 /**
964 * amdgpu_device_program_register_sequence - program an array of registers.
965 *
966 * @adev: amdgpu_device pointer
967 * @registers: pointer to the register array
968 * @array_size: size of the register array
969 *
970  * Programs an array of registers with AND and OR masks.
971 * This is a helper for setting golden registers.
972 */
973 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
974 const u32 *registers,
975 const u32 array_size)
976 {
977 u32 tmp, reg, and_mask, or_mask;
978 int i;
979
980 if (array_size % 3)
981 return;
982
983 for (i = 0; i < array_size; i +=3) {
984 reg = registers[i + 0];
985 and_mask = registers[i + 1];
986 or_mask = registers[i + 2];
987
988 if (and_mask == 0xffffffff) {
989 tmp = or_mask;
990 } else {
991 tmp = RREG32(reg);
992 tmp &= ~and_mask;
993 if (adev->family >= AMDGPU_FAMILY_AI)
994 tmp |= (or_mask & and_mask);
995 else
996 tmp |= or_mask;
997 }
998 WREG32(reg, tmp);
999 }
1000 }
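/*
 * Illustrative sketch (placeholder register offsets, not from the original
 * file): a golden register list is a flat array of {offset, and_mask, or_mask}
 * triplets consumed by amdgpu_device_program_register_sequence().
 */
static const u32 example_golden_settings[] = {
	/* offset       and_mask    or_mask */
	0x0000315c, 0xffffffff, 0x00000042,	/* write 0x42 unconditionally */
	0x00009508, 0x0000000f, 0x00000002,	/* RMW: clear bits 3:0, set 0x2 */
};

static void amdgpu_device_golden_example(struct amdgpu_device *adev)
{
	amdgpu_device_program_register_sequence(adev, example_golden_settings,
						ARRAY_SIZE(example_golden_settings));
}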
1001
1002 /**
1003 * amdgpu_device_pci_config_reset - reset the GPU
1004 *
1005 * @adev: amdgpu_device pointer
1006 *
1007 * Resets the GPU using the pci config reset sequence.
1008 * Only applicable to asics prior to vega10.
1009 */
1010 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1011 {
1012 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1013 }
1014
1015 /**
1016 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1017 *
1018 * @adev: amdgpu_device pointer
1019 *
1020 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1021 */
1022 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1023 {
1024 return pci_reset_function(adev->pdev);
1025 }
1026
1027 /*
1028  * GPU doorbell aperture helper functions.
1029 */
1030 /**
1031 * amdgpu_device_doorbell_init - Init doorbell driver information.
1032 *
1033 * @adev: amdgpu_device pointer
1034 *
1035 * Init doorbell driver information (CIK)
1036 * Returns 0 on success, error on failure.
1037 */
1038 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1039 {
1040
1041 /* No doorbell on SI hardware generation */
1042 if (adev->asic_type < CHIP_BONAIRE) {
1043 adev->doorbell.base = 0;
1044 adev->doorbell.size = 0;
1045 adev->doorbell.num_doorbells = 0;
1046 adev->doorbell.ptr = NULL;
1047 return 0;
1048 }
1049
1050 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1051 return -EINVAL;
1052
1053 amdgpu_asic_init_doorbell_index(adev);
1054
1055 /* doorbell bar mapping */
1056 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1057 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1058
1059 if (adev->enable_mes) {
1060 adev->doorbell.num_doorbells =
1061 adev->doorbell.size / sizeof(u32);
1062 } else {
1063 adev->doorbell.num_doorbells =
1064 min_t(u32, adev->doorbell.size / sizeof(u32),
1065 adev->doorbell_index.max_assignment+1);
1066 if (adev->doorbell.num_doorbells == 0)
1067 return -EINVAL;
1068
1069 		/* For Vega, reserve and map two pages on the doorbell BAR since the
1070 		 * SDMA paging queue doorbell uses the second page. The
1071 		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1072 		 * doorbells are in the first page. So with the paging queue enabled,
1073 		 * max num_doorbells must grow by one extra page (0x400 in dwords).
1074 */
1075 if (adev->asic_type >= CHIP_VEGA10)
1076 adev->doorbell.num_doorbells += 0x400;
1077 }
1078
1079 adev->doorbell.ptr = ioremap(adev->doorbell.base,
1080 adev->doorbell.num_doorbells *
1081 sizeof(u32));
1082 if (adev->doorbell.ptr == NULL)
1083 return -ENOMEM;
1084
1085 return 0;
1086 }
1087
1088 /**
1089 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1090 *
1091 * @adev: amdgpu_device pointer
1092 *
1093 * Tear down doorbell driver information (CIK)
1094 */
1095 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1096 {
1097 iounmap(adev->doorbell.ptr);
1098 adev->doorbell.ptr = NULL;
1099 }
1100
1101
1102
1103 /*
1104 * amdgpu_device_wb_*()
1105 * Writeback is the method by which the GPU updates special pages in memory
1106  * with the status of certain GPU events (fences, ring pointers, etc.).
1107 */
1108
1109 /**
1110 * amdgpu_device_wb_fini - Disable Writeback and free memory
1111 *
1112 * @adev: amdgpu_device pointer
1113 *
1114 * Disables Writeback and frees the Writeback memory (all asics).
1115 * Used at driver shutdown.
1116 */
1117 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1118 {
1119 if (adev->wb.wb_obj) {
1120 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1121 &adev->wb.gpu_addr,
1122 (void **)&adev->wb.wb);
1123 adev->wb.wb_obj = NULL;
1124 }
1125 }
1126
1127 /**
1128 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1129 *
1130 * @adev: amdgpu_device pointer
1131 *
1132 * Initializes writeback and allocates writeback memory (all asics).
1133 * Used at driver startup.
1134  * Returns 0 on success or a negative error code on failure.
1135 */
1136 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1137 {
1138 int r;
1139
1140 if (adev->wb.wb_obj == NULL) {
1141 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1142 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1143 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1144 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1145 (void **)&adev->wb.wb);
1146 if (r) {
1147 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1148 return r;
1149 }
1150
1151 adev->wb.num_wb = AMDGPU_MAX_WB;
1152 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1153
1154 /* clear wb memory */
1155 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1156 }
1157
1158 return 0;
1159 }
1160
1161 /**
1162 * amdgpu_device_wb_get - Allocate a wb entry
1163 *
1164 * @adev: amdgpu_device pointer
1165 * @wb: wb index
1166 *
1167 * Allocate a wb slot for use by the driver (all asics).
1168 * Returns 0 on success or -EINVAL on failure.
1169 */
1170 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1171 {
1172 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1173
1174 if (offset < adev->wb.num_wb) {
1175 __set_bit(offset, adev->wb.used);
1176 *wb = offset << 3; /* convert to dw offset */
1177 return 0;
1178 } else {
1179 return -EINVAL;
1180 }
1181 }
1182
1183 /**
1184 * amdgpu_device_wb_free - Free a wb entry
1185 *
1186 * @adev: amdgpu_device pointer
1187 * @wb: wb index
1188 *
1189 * Free a wb slot allocated for use by the driver (all asics)
1190 */
1191 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1192 {
1193 wb >>= 3;
1194 if (wb < adev->wb.num_wb)
1195 __clear_bit(wb, adev->wb.used);
1196 }
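/*
 * Illustrative sketch (assumed example, not from the original file): typical
 * writeback usage. A slot is handed out as a dword offset; the GPU writes to
 * adev->wb.gpu_addr + offset * 4 and the CPU reads the same dword through
 * adev->wb.wb[offset].
 */
static int amdgpu_device_wb_example(struct amdgpu_device *adev)
{
	u32 offset;
	u64 gpu_addr;
	int r;

	r = amdgpu_device_wb_get(adev, &offset);
	if (r)
		return r;

	gpu_addr = adev->wb.gpu_addr + (u64)offset * 4;	/* give this to the GPU */
	dev_info(adev->dev, "wb slot %u at gpu addr 0x%llx, value 0x%08x\n",
		 offset, gpu_addr, adev->wb.wb[offset]);

	amdgpu_device_wb_free(adev, offset);
	return 0;
}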
1197
1198 /**
1199 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1200 *
1201 * @adev: amdgpu_device pointer
1202 *
1203 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1204  * to fail, but if any of the BARs is not accessible after the resize we abort
1205 * driver loading by returning -ENODEV.
1206 */
1207 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1208 {
1209 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1210 struct pci_bus *root;
1211 struct resource *res;
1212 unsigned i;
1213 u16 cmd;
1214 int r;
1215
1216 /* Bypass for VF */
1217 if (amdgpu_sriov_vf(adev))
1218 return 0;
1219
1220 /* skip if the bios has already enabled large BAR */
1221 if (adev->gmc.real_vram_size &&
1222 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1223 return 0;
1224
1225 /* Check if the root BUS has 64bit memory resources */
1226 root = adev->pdev->bus;
1227 while (root->parent)
1228 root = root->parent;
1229
1230 pci_bus_for_each_resource(root, res, i) {
1231 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1232 res->start > 0x100000000ull)
1233 break;
1234 }
1235
1236 /* Trying to resize is pointless without a root hub window above 4GB */
1237 if (!res)
1238 return 0;
1239
1240 /* Limit the BAR size to what is available */
1241 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1242 rbar_size);
1243
1244 /* Disable memory decoding while we change the BAR addresses and size */
1245 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1246 pci_write_config_word(adev->pdev, PCI_COMMAND,
1247 cmd & ~PCI_COMMAND_MEMORY);
1248
1249 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1250 amdgpu_device_doorbell_fini(adev);
1251 if (adev->asic_type >= CHIP_BONAIRE)
1252 pci_release_resource(adev->pdev, 2);
1253
1254 pci_release_resource(adev->pdev, 0);
1255
1256 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1257 if (r == -ENOSPC)
1258 DRM_INFO("Not enough PCI address space for a large BAR.");
1259 else if (r && r != -ENOTSUPP)
1260 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1261
1262 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1263
1264 /* When the doorbell or fb BAR isn't available we have no chance of
1265 * using the device.
1266 */
1267 r = amdgpu_device_doorbell_init(adev);
1268 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1269 return -ENODEV;
1270
1271 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1272
1273 return 0;
1274 }
1275
1276 /*
1277  * GPU helper functions.
1278 */
1279 /**
1280  * amdgpu_device_need_post - check if the hw needs post or not
1281 *
1282 * @adev: amdgpu_device pointer
1283 *
1284 * Check if the asic has been initialized (all asics) at driver startup
1285  * or if post is needed because a hw reset was performed.
1286  * Returns true if post is needed or false if not.
1287 */
1288 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1289 {
1290 uint32_t reg;
1291
1292 if (amdgpu_sriov_vf(adev))
1293 return false;
1294
1295 if (amdgpu_passthrough(adev)) {
1296 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1297 		 * reboot some old SMC firmware still needs the driver to do a vPost,
1298 		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
1299 		 * this flaw, so force vPost for SMC versions below 22.15.
1300 */
1301 if (adev->asic_type == CHIP_FIJI) {
1302 int err;
1303 uint32_t fw_ver;
1304 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1305 			/* force vPost if an error occurred */
1306 if (err)
1307 return true;
1308
1309 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1310 if (fw_ver < 0x00160e00)
1311 return true;
1312 }
1313 }
1314
1315 /* Don't post if we need to reset whole hive on init */
1316 if (adev->gmc.xgmi.pending_reset)
1317 return false;
1318
1319 if (adev->has_hw_reset) {
1320 adev->has_hw_reset = false;
1321 return true;
1322 }
1323
1324 /* bios scratch used on CIK+ */
1325 if (adev->asic_type >= CHIP_BONAIRE)
1326 return amdgpu_atombios_scratch_need_asic_init(adev);
1327
1328 /* check MEM_SIZE for older asics */
1329 reg = amdgpu_asic_get_config_memsize(adev);
1330
1331 if ((reg != 0) && (reg != 0xffffffff))
1332 return false;
1333
1334 return true;
1335 }
1336
1337 /**
1338 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1339 *
1340 * @adev: amdgpu_device pointer
1341 *
1342 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1343 * be set for this device.
1344 *
1345 * Returns true if it should be used or false if not.
1346 */
1347 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1348 {
1349 switch (amdgpu_aspm) {
1350 case -1:
1351 break;
1352 case 0:
1353 return false;
1354 case 1:
1355 return true;
1356 default:
1357 return false;
1358 }
1359 return pcie_aspm_enabled(adev->pdev);
1360 }
1361
1362 bool amdgpu_device_aspm_support_quirk(void)
1363 {
1364 #if IS_ENABLED(CONFIG_X86)
1365 struct cpuinfo_x86 *c = &cpu_data(0);
1366
1367 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1368 #else
1369 return true;
1370 #endif
1371 }
1372
1373 /* if we get transitioned to only one device, take VGA back */
1374 /**
1375 * amdgpu_device_vga_set_decode - enable/disable vga decode
1376 *
1377 * @pdev: PCI device pointer
1378 * @state: enable/disable vga decode
1379 *
1380 * Enable/disable vga decode (all asics).
1381 * Returns VGA resource flags.
1382 */
1383 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1384 bool state)
1385 {
1386 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1387 amdgpu_asic_set_vga_state(adev, state);
1388 if (state)
1389 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1390 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1391 else
1392 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1393 }
1394
1395 /**
1396 * amdgpu_device_check_block_size - validate the vm block size
1397 *
1398 * @adev: amdgpu_device pointer
1399 *
1400 * Validates the vm block size specified via module parameter.
1401  * The vm block size defines the number of bits in the page table versus the page directory,
1402 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1403 * page table and the remaining bits are in the page directory.
1404 */
1405 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1406 {
1407 /* defines number of bits in page table versus page directory,
1408 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1409 * page table and the remaining bits are in the page directory */
1410 if (amdgpu_vm_block_size == -1)
1411 return;
1412
1413 if (amdgpu_vm_block_size < 9) {
1414 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1415 amdgpu_vm_block_size);
1416 amdgpu_vm_block_size = -1;
1417 }
1418 }
1419
1420 /**
1421 * amdgpu_device_check_vm_size - validate the vm size
1422 *
1423 * @adev: amdgpu_device pointer
1424 *
1425 * Validates the vm size in GB specified via module parameter.
1426 * The VM size is the size of the GPU virtual memory space in GB.
1427 */
1428 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1429 {
1430 /* no need to check the default value */
1431 if (amdgpu_vm_size == -1)
1432 return;
1433
1434 if (amdgpu_vm_size < 1) {
1435 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1436 amdgpu_vm_size);
1437 amdgpu_vm_size = -1;
1438 }
1439 }
1440
1441 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1442 {
1443 struct sysinfo si;
1444 bool is_os_64 = (sizeof(void *) == 8);
1445 uint64_t total_memory;
1446 uint64_t dram_size_seven_GB = 0x1B8000000;
1447 uint64_t dram_size_three_GB = 0xB8000000;
1448
1449 if (amdgpu_smu_memory_pool_size == 0)
1450 return;
1451
1452 if (!is_os_64) {
1453 DRM_WARN("Not 64-bit OS, feature not supported\n");
1454 goto def_value;
1455 }
1456 si_meminfo(&si);
1457 total_memory = (uint64_t)si.totalram * si.mem_unit;
1458
1459 if ((amdgpu_smu_memory_pool_size == 1) ||
1460 (amdgpu_smu_memory_pool_size == 2)) {
1461 if (total_memory < dram_size_three_GB)
1462 goto def_value1;
1463 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1464 (amdgpu_smu_memory_pool_size == 8)) {
1465 if (total_memory < dram_size_seven_GB)
1466 goto def_value1;
1467 } else {
1468 DRM_WARN("Smu memory pool size not supported\n");
1469 goto def_value;
1470 }
1471 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1472
1473 return;
1474
1475 def_value1:
1476 DRM_WARN("No enough system memory\n");
1477 def_value:
1478 adev->pm.smu_prv_buffer_size = 0;
1479 }
1480
1481 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1482 {
1483 if (!(adev->flags & AMD_IS_APU) ||
1484 adev->asic_type < CHIP_RAVEN)
1485 return 0;
1486
1487 switch (adev->asic_type) {
1488 case CHIP_RAVEN:
1489 if (adev->pdev->device == 0x15dd)
1490 adev->apu_flags |= AMD_APU_IS_RAVEN;
1491 if (adev->pdev->device == 0x15d8)
1492 adev->apu_flags |= AMD_APU_IS_PICASSO;
1493 break;
1494 case CHIP_RENOIR:
1495 if ((adev->pdev->device == 0x1636) ||
1496 (adev->pdev->device == 0x164c))
1497 adev->apu_flags |= AMD_APU_IS_RENOIR;
1498 else
1499 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1500 break;
1501 case CHIP_VANGOGH:
1502 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1503 break;
1504 case CHIP_YELLOW_CARP:
1505 break;
1506 case CHIP_CYAN_SKILLFISH:
1507 if ((adev->pdev->device == 0x13FE) ||
1508 (adev->pdev->device == 0x143F))
1509 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1510 break;
1511 default:
1512 break;
1513 }
1514
1515 return 0;
1516 }
1517
1518 /**
1519 * amdgpu_device_check_arguments - validate module params
1520 *
1521 * @adev: amdgpu_device pointer
1522 *
1523 * Validates certain module parameters and updates
1524 * the associated values used by the driver (all asics).
1525 */
1526 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1527 {
1528 if (amdgpu_sched_jobs < 4) {
1529 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1530 amdgpu_sched_jobs);
1531 amdgpu_sched_jobs = 4;
1532 } else if (!is_power_of_2(amdgpu_sched_jobs)){
1533 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1534 amdgpu_sched_jobs);
1535 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1536 }
1537
1538 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1539 		/* gart size must be greater than or equal to 32M */
1540 dev_warn(adev->dev, "gart size (%d) too small\n",
1541 amdgpu_gart_size);
1542 amdgpu_gart_size = -1;
1543 }
1544
1545 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1546 		/* gtt size must be greater than or equal to 32M */
1547 dev_warn(adev->dev, "gtt size (%d) too small\n",
1548 amdgpu_gtt_size);
1549 amdgpu_gtt_size = -1;
1550 }
1551
1552 /* valid range is between 4 and 9 inclusive */
1553 if (amdgpu_vm_fragment_size != -1 &&
1554 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1555 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1556 amdgpu_vm_fragment_size = -1;
1557 }
1558
1559 if (amdgpu_sched_hw_submission < 2) {
1560 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1561 amdgpu_sched_hw_submission);
1562 amdgpu_sched_hw_submission = 2;
1563 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1564 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1565 amdgpu_sched_hw_submission);
1566 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1567 }
1568
1569 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1570 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1571 amdgpu_reset_method = -1;
1572 }
1573
1574 amdgpu_device_check_smu_prv_buffer_size(adev);
1575
1576 amdgpu_device_check_vm_size(adev);
1577
1578 amdgpu_device_check_block_size(adev);
1579
1580 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1581
1582 return 0;
1583 }
1584
1585 /**
1586 * amdgpu_switcheroo_set_state - set switcheroo state
1587 *
1588 * @pdev: pci dev pointer
1589 * @state: vga_switcheroo state
1590 *
1591 * Callback for the switcheroo driver. Suspends or resumes
1592  * the asic before or after it is powered up using ACPI methods.
1593 */
1594 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1595 enum vga_switcheroo_state state)
1596 {
1597 struct drm_device *dev = pci_get_drvdata(pdev);
1598 int r;
1599
1600 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1601 return;
1602
1603 if (state == VGA_SWITCHEROO_ON) {
1604 pr_info("switched on\n");
1605 /* don't suspend or resume card normally */
1606 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1607
1608 pci_set_power_state(pdev, PCI_D0);
1609 amdgpu_device_load_pci_state(pdev);
1610 r = pci_enable_device(pdev);
1611 if (r)
1612 DRM_WARN("pci_enable_device failed (%d)\n", r);
1613 amdgpu_device_resume(dev, true);
1614
1615 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1616 } else {
1617 pr_info("switched off\n");
1618 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1619 amdgpu_device_suspend(dev, true);
1620 amdgpu_device_cache_pci_state(pdev);
1621 /* Shut down the device */
1622 pci_disable_device(pdev);
1623 pci_set_power_state(pdev, PCI_D3cold);
1624 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1625 }
1626 }
1627
1628 /**
1629 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1630 *
1631 * @pdev: pci dev pointer
1632 *
1633  * Callback for the switcheroo driver. Checks if the switcheroo
1634  * state can be changed.
1635 * Returns true if the state can be changed, false if not.
1636 */
1637 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1638 {
1639 struct drm_device *dev = pci_get_drvdata(pdev);
1640
1641 /*
1642 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1643 * locking inversion with the driver load path. And the access here is
1644 * completely racy anyway. So don't bother with locking for now.
1645 */
1646 return atomic_read(&dev->open_count) == 0;
1647 }
1648
1649 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1650 .set_gpu_state = amdgpu_switcheroo_set_state,
1651 .reprobe = NULL,
1652 .can_switch = amdgpu_switcheroo_can_switch,
1653 };
1654
1655 /**
1656 * amdgpu_device_ip_set_clockgating_state - set the CG state
1657 *
1658 * @dev: amdgpu_device pointer
1659 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1660 * @state: clockgating state (gate or ungate)
1661 *
1662 * Sets the requested clockgating state for all instances of
1663 * the hardware IP specified.
1664 * Returns the error code from the last instance.
1665 */
1666 int amdgpu_device_ip_set_clockgating_state(void *dev,
1667 enum amd_ip_block_type block_type,
1668 enum amd_clockgating_state state)
1669 {
1670 struct amdgpu_device *adev = dev;
1671 int i, r = 0;
1672
1673 for (i = 0; i < adev->num_ip_blocks; i++) {
1674 if (!adev->ip_blocks[i].status.valid)
1675 continue;
1676 if (adev->ip_blocks[i].version->type != block_type)
1677 continue;
1678 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1679 continue;
1680 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1681 (void *)adev, state);
1682 if (r)
1683 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1684 adev->ip_blocks[i].version->funcs->name, r);
1685 }
1686 return r;
1687 }
1688
1689 /**
1690 * amdgpu_device_ip_set_powergating_state - set the PG state
1691 *
1692 * @dev: amdgpu_device pointer
1693 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1694 * @state: powergating state (gate or ungate)
1695 *
1696 * Sets the requested powergating state for all instances of
1697 * the hardware IP specified.
1698 * Returns the error code from the last instance.
1699 */
1700 int amdgpu_device_ip_set_powergating_state(void *dev,
1701 enum amd_ip_block_type block_type,
1702 enum amd_powergating_state state)
1703 {
1704 struct amdgpu_device *adev = dev;
1705 int i, r = 0;
1706
1707 for (i = 0; i < adev->num_ip_blocks; i++) {
1708 if (!adev->ip_blocks[i].status.valid)
1709 continue;
1710 if (adev->ip_blocks[i].version->type != block_type)
1711 continue;
1712 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1713 continue;
1714 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1715 (void *)adev, state);
1716 if (r)
1717 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1718 adev->ip_blocks[i].version->funcs->name, r);
1719 }
1720 return r;
1721 }
1722
1723 /**
1724 * amdgpu_device_ip_get_clockgating_state - get the CG state
1725 *
1726 * @adev: amdgpu_device pointer
1727 * @flags: clockgating feature flags
1728 *
1729 * Walks the list of IPs on the device and updates the clockgating
1730 * flags for each IP.
1731 * Updates @flags with the feature flags for each hardware IP where
1732 * clockgating is enabled.
1733 */
1734 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1735 u64 *flags)
1736 {
1737 int i;
1738
1739 for (i = 0; i < adev->num_ip_blocks; i++) {
1740 if (!adev->ip_blocks[i].status.valid)
1741 continue;
1742 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1743 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1744 }
1745 }
1746
1747 /**
1748 * amdgpu_device_ip_wait_for_idle - wait for idle
1749 *
1750 * @adev: amdgpu_device pointer
1751 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1752 *
1753  * Waits for the requested hardware IP to be idle.
1754 * Returns 0 for success or a negative error code on failure.
1755 */
1756 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1757 enum amd_ip_block_type block_type)
1758 {
1759 int i, r;
1760
1761 for (i = 0; i < adev->num_ip_blocks; i++) {
1762 if (!adev->ip_blocks[i].status.valid)
1763 continue;
1764 if (adev->ip_blocks[i].version->type == block_type) {
1765 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1766 if (r)
1767 return r;
1768 break;
1769 }
1770 }
1771 return 0;
1772
1773 }
1774
1775 /**
1776 * amdgpu_device_ip_is_idle - is the hardware IP idle
1777 *
1778 * @adev: amdgpu_device pointer
1779 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1780 *
1781 * Check if the hardware IP is idle or not.
1782  * Returns true if the IP is idle, false if not.
1783 */
1784 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1785 enum amd_ip_block_type block_type)
1786 {
1787 int i;
1788
1789 for (i = 0; i < adev->num_ip_blocks; i++) {
1790 if (!adev->ip_blocks[i].status.valid)
1791 continue;
1792 if (adev->ip_blocks[i].version->type == block_type)
1793 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1794 }
1795 return true;
1796
1797 }
1798
1799 /**
1800 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1801 *
1802 * @adev: amdgpu_device pointer
1803 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1804 *
1805 * Returns a pointer to the hardware IP block structure
1806 * if it exists for the asic, otherwise NULL.
1807 */
1808 struct amdgpu_ip_block *
1809 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1810 enum amd_ip_block_type type)
1811 {
1812 int i;
1813
1814 for (i = 0; i < adev->num_ip_blocks; i++)
1815 if (adev->ip_blocks[i].version->type == type)
1816 return &adev->ip_blocks[i];
1817
1818 return NULL;
1819 }
1820
1821 /**
1822 * amdgpu_device_ip_block_version_cmp
1823 *
1824 * @adev: amdgpu_device pointer
1825 * @type: enum amd_ip_block_type
1826 * @major: major version
1827 * @minor: minor version
1828 *
1829  * Returns 0 if the IP block's version is equal to or greater than the
1830  * requested version, 1 if it is smaller or the ip_block doesn't exist.
1831 */
1832 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1833 enum amd_ip_block_type type,
1834 u32 major, u32 minor)
1835 {
1836 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1837
1838 if (ip_block && ((ip_block->version->major > major) ||
1839 ((ip_block->version->major == major) &&
1840 (ip_block->version->minor >= minor))))
1841 return 0;
1842
1843 return 1;
1844 }
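/*
 * Illustrative sketch (assumed example, not from the original file): gate a
 * code path on a minimum IP block version, here GFX 8.1 or newer.
 */
static bool amdgpu_device_example_gfx_is_8_1_plus(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
						  8, 1) == 0;
}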
1845
1846 /**
1847 * amdgpu_device_ip_block_add
1848 *
1849 * @adev: amdgpu_device pointer
1850 * @ip_block_version: pointer to the IP to add
1851 *
1852 * Adds the IP block driver information to the collection of IPs
1853 * on the asic.
1854 */
1855 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1856 const struct amdgpu_ip_block_version *ip_block_version)
1857 {
1858 if (!ip_block_version)
1859 return -EINVAL;
1860
1861 switch (ip_block_version->type) {
1862 case AMD_IP_BLOCK_TYPE_VCN:
1863 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1864 return 0;
1865 break;
1866 case AMD_IP_BLOCK_TYPE_JPEG:
1867 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1868 return 0;
1869 break;
1870 default:
1871 break;
1872 }
1873
1874 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1875 ip_block_version->funcs->name);
1876
1877 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1878
1879 return 0;
1880 }
1881
1882 /**
1883 * amdgpu_device_enable_virtual_display - enable virtual display feature
1884 *
1885 * @adev: amdgpu_device pointer
1886 *
1887  * Enables the virtual display feature if the user has enabled it via
1888  * the module parameter virtual_display. This feature provides virtual
1889  * display hardware on headless boards or in virtualized environments.
1890  * This function parses and validates the configuration string specified by
1891  * the user and configures the virtual display configuration (number of
1892  * virtual connectors, crtcs, etc.) specified.
1893 */
1894 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1895 {
1896 adev->enable_virtual_display = false;
1897
1898 if (amdgpu_virtual_display) {
1899 const char *pci_address_name = pci_name(adev->pdev);
1900 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1901
1902 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1903 pciaddstr_tmp = pciaddstr;
1904 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1905 pciaddname = strsep(&pciaddname_tmp, ",");
1906 if (!strcmp("all", pciaddname)
1907 || !strcmp(pci_address_name, pciaddname)) {
1908 long num_crtc;
1909 int res = -1;
1910
1911 adev->enable_virtual_display = true;
1912
1913 if (pciaddname_tmp)
1914 res = kstrtol(pciaddname_tmp, 10,
1915 &num_crtc);
1916
1917 if (!res) {
1918 if (num_crtc < 1)
1919 num_crtc = 1;
1920 if (num_crtc > 6)
1921 num_crtc = 6;
1922 adev->mode_info.num_crtc = num_crtc;
1923 } else {
1924 adev->mode_info.num_crtc = 1;
1925 }
1926 break;
1927 }
1928 }
1929
1930 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1931 amdgpu_virtual_display, pci_address_name,
1932 adev->enable_virtual_display, adev->mode_info.num_crtc);
1933
1934 kfree(pciaddstr);
1935 }
1936 }
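/*
 * Illustrative virtual_display strings accepted by the parser above:
 *
 *	amdgpu.virtual_display=0000:23:00.0,2   two virtual crtcs on that device
 *	amdgpu.virtual_display=all,1            one virtual crtc on every device
 *
 * Entries are separated by ';' and the crtc count after ',' is clamped to
 * the 1..6 range (the PCI address shown is just an example).
 */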
1937
1938 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1939 {
1940 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1941 adev->mode_info.num_crtc = 1;
1942 adev->enable_virtual_display = true;
1943 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1944 adev->enable_virtual_display, adev->mode_info.num_crtc);
1945 }
1946 }
1947
1948 /**
1949 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1950 *
1951 * @adev: amdgpu_device pointer
1952 *
1953 * Parses the asic configuration parameters specified in the gpu info
1954 * firmware and makes them available to the driver for use in configuring
1955 * the asic.
1956 * Returns 0 on success, -EINVAL on failure.
1957 */
1958 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1959 {
1960 const char *chip_name;
1961 char fw_name[40];
1962 int err;
1963 const struct gpu_info_firmware_header_v1_0 *hdr;
1964
1965 adev->firmware.gpu_info_fw = NULL;
1966
1967 if (adev->mman.discovery_bin) {
1968 /*
1969 * FIXME: The bounding box is still needed by Navi12, so
1970 * temporarily read it from gpu_info firmware. Should be dropped
1971 * when DAL no longer needs it.
1972 */
1973 if (adev->asic_type != CHIP_NAVI12)
1974 return 0;
1975 }
1976
1977 switch (adev->asic_type) {
1978 default:
1979 return 0;
1980 case CHIP_VEGA10:
1981 chip_name = "vega10";
1982 break;
1983 case CHIP_VEGA12:
1984 chip_name = "vega12";
1985 break;
1986 case CHIP_RAVEN:
1987 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1988 chip_name = "raven2";
1989 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1990 chip_name = "picasso";
1991 else
1992 chip_name = "raven";
1993 break;
1994 case CHIP_ARCTURUS:
1995 chip_name = "arcturus";
1996 break;
1997 case CHIP_NAVI12:
1998 chip_name = "navi12";
1999 break;
2000 }
2001
2002 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
2003 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
2004 if (err) {
2005 dev_err(adev->dev,
2006 "Failed to load gpu_info firmware \"%s\"\n",
2007 fw_name);
2008 goto out;
2009 }
2010 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
2011 if (err) {
2012 dev_err(adev->dev,
2013 "Failed to validate gpu_info firmware \"%s\"\n",
2014 fw_name);
2015 goto out;
2016 }
2017
2018 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2019 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2020
2021 switch (hdr->version_major) {
2022 case 1:
2023 {
2024 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2025 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2026 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2027
2028 /*
2029 * Should be dropped when DAL no longer needs it.
2030 */
2031 if (adev->asic_type == CHIP_NAVI12)
2032 goto parse_soc_bounding_box;
2033
2034 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2035 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2036 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2037 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2038 adev->gfx.config.max_texture_channel_caches =
2039 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2040 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2041 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2042 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2043 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2044 adev->gfx.config.double_offchip_lds_buf =
2045 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2046 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2047 adev->gfx.cu_info.max_waves_per_simd =
2048 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2049 adev->gfx.cu_info.max_scratch_slots_per_cu =
2050 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2051 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2052 if (hdr->version_minor >= 1) {
2053 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2054 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2055 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2056 adev->gfx.config.num_sc_per_sh =
2057 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2058 adev->gfx.config.num_packer_per_sc =
2059 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2060 }
2061
2062 parse_soc_bounding_box:
2063 /*
2064 * soc bounding box info is not integrated in the discovery table,
2065 * so we always need to parse it from the gpu info firmware when needed.
2066 */
2067 if (hdr->version_minor == 2) {
2068 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2069 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2070 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2071 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2072 }
2073 break;
2074 }
2075 default:
2076 dev_err(adev->dev,
2077 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2078 err = -EINVAL;
2079 goto out;
2080 }
2081 out:
2082 return err;
2083 }
2084
2085 /**
2086 * amdgpu_device_ip_early_init - run early init for hardware IPs
2087 *
2088 * @adev: amdgpu_device pointer
2089 *
2090 * Early initialization pass for hardware IPs. The hardware IPs that make
2091 * up each asic are discovered and each IP's early_init callback is run. This
2092 * is the first stage in initializing the asic.
2093 * Returns 0 on success, negative error code on failure.
2094 */
2095 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2096 {
2097 struct drm_device *dev = adev_to_drm(adev);
2098 struct pci_dev *parent;
2099 int i, r;
2100
2101 amdgpu_device_enable_virtual_display(adev);
2102
2103 if (amdgpu_sriov_vf(adev)) {
2104 r = amdgpu_virt_request_full_gpu(adev, true);
2105 if (r)
2106 return r;
2107 }
2108
2109 switch (adev->asic_type) {
2110 #ifdef CONFIG_DRM_AMDGPU_SI
2111 case CHIP_VERDE:
2112 case CHIP_TAHITI:
2113 case CHIP_PITCAIRN:
2114 case CHIP_OLAND:
2115 case CHIP_HAINAN:
2116 adev->family = AMDGPU_FAMILY_SI;
2117 r = si_set_ip_blocks(adev);
2118 if (r)
2119 return r;
2120 break;
2121 #endif
2122 #ifdef CONFIG_DRM_AMDGPU_CIK
2123 case CHIP_BONAIRE:
2124 case CHIP_HAWAII:
2125 case CHIP_KAVERI:
2126 case CHIP_KABINI:
2127 case CHIP_MULLINS:
2128 if (adev->flags & AMD_IS_APU)
2129 adev->family = AMDGPU_FAMILY_KV;
2130 else
2131 adev->family = AMDGPU_FAMILY_CI;
2132
2133 r = cik_set_ip_blocks(adev);
2134 if (r)
2135 return r;
2136 break;
2137 #endif
2138 case CHIP_TOPAZ:
2139 case CHIP_TONGA:
2140 case CHIP_FIJI:
2141 case CHIP_POLARIS10:
2142 case CHIP_POLARIS11:
2143 case CHIP_POLARIS12:
2144 case CHIP_VEGAM:
2145 case CHIP_CARRIZO:
2146 case CHIP_STONEY:
2147 if (adev->flags & AMD_IS_APU)
2148 adev->family = AMDGPU_FAMILY_CZ;
2149 else
2150 adev->family = AMDGPU_FAMILY_VI;
2151
2152 r = vi_set_ip_blocks(adev);
2153 if (r)
2154 return r;
2155 break;
2156 default:
2157 r = amdgpu_discovery_set_ip_blocks(adev);
2158 if (r)
2159 return r;
2160 break;
2161 }
2162
2163 if (amdgpu_has_atpx() &&
2164 (amdgpu_is_atpx_hybrid() ||
2165 amdgpu_has_atpx_dgpu_power_cntl()) &&
2166 ((adev->flags & AMD_IS_APU) == 0) &&
2167 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2168 adev->flags |= AMD_IS_PX;
2169
2170 if (!(adev->flags & AMD_IS_APU)) {
2171 parent = pci_upstream_bridge(adev->pdev);
2172 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2173 }
2174
2175 amdgpu_amdkfd_device_probe(adev);
2176
2177 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2178 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2179 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2180 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2181 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2182
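	/*
	 * amdgpu_ip_block_mask is a debug module parameter (all bits set by
	 * default); clearing bit i marks IP block i invalid here so its
	 * early_init and all later init stages are skipped.
	 */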
2183 for (i = 0; i < adev->num_ip_blocks; i++) {
2184 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2185 DRM_ERROR("disabled ip block: %d <%s>\n",
2186 i, adev->ip_blocks[i].version->funcs->name);
2187 adev->ip_blocks[i].status.valid = false;
2188 } else {
2189 if (adev->ip_blocks[i].version->funcs->early_init) {
2190 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2191 if (r == -ENOENT) {
2192 adev->ip_blocks[i].status.valid = false;
2193 } else if (r) {
2194 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2195 adev->ip_blocks[i].version->funcs->name, r);
2196 return r;
2197 } else {
2198 adev->ip_blocks[i].status.valid = true;
2199 }
2200 } else {
2201 adev->ip_blocks[i].status.valid = true;
2202 }
2203 }
2204 /* get the vbios after the asic_funcs are set up */
2205 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2206 r = amdgpu_device_parse_gpu_info_fw(adev);
2207 if (r)
2208 return r;
2209
2210 /* Read BIOS */
2211 if (!amdgpu_get_bios(adev))
2212 return -EINVAL;
2213
2214 r = amdgpu_atombios_init(adev);
2215 if (r) {
2216 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2217 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2218 return r;
2219 }
2220
2221 /* get pf2vf msg info at its earliest time */
2222 if (amdgpu_sriov_vf(adev))
2223 amdgpu_virt_init_data_exchange(adev);
2224
2225 }
2226 }
2227
2228 adev->cg_flags &= amdgpu_cg_mask;
2229 adev->pg_flags &= amdgpu_pg_mask;
2230
2231 return 0;
2232 }
2233
2234 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2235 {
2236 int i, r;
2237
2238 for (i = 0; i < adev->num_ip_blocks; i++) {
2239 if (!adev->ip_blocks[i].status.sw)
2240 continue;
2241 if (adev->ip_blocks[i].status.hw)
2242 continue;
2243 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2244 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2245 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2246 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2247 if (r) {
2248 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2249 adev->ip_blocks[i].version->funcs->name, r);
2250 return r;
2251 }
2252 adev->ip_blocks[i].status.hw = true;
2253 }
2254 }
2255
2256 return 0;
2257 }
2258
2259 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2260 {
2261 int i, r;
2262
2263 for (i = 0; i < adev->num_ip_blocks; i++) {
2264 if (!adev->ip_blocks[i].status.sw)
2265 continue;
2266 if (adev->ip_blocks[i].status.hw)
2267 continue;
2268 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2269 if (r) {
2270 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2271 adev->ip_blocks[i].version->funcs->name, r);
2272 return r;
2273 }
2274 adev->ip_blocks[i].status.hw = true;
2275 }
2276
2277 return 0;
2278 }
2279
2280 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2281 {
2282 int r = 0;
2283 int i;
2284 uint32_t smu_version;
2285
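	/*
	 * On VEGA10 and newer parts, microcode is loaded through the PSP
	 * ("front door" loading), so the PSP block has to be brought up, or
	 * resumed when coming back from reset/suspend, before the other IPs
	 * load their firmware; the loop below only touches that one PSP block.
	 */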
2286 if (adev->asic_type >= CHIP_VEGA10) {
2287 for (i = 0; i < adev->num_ip_blocks; i++) {
2288 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2289 continue;
2290
2291 if (!adev->ip_blocks[i].status.sw)
2292 continue;
2293
2294 /* no need to do the fw loading again if already done */
2295 if (adev->ip_blocks[i].status.hw == true)
2296 break;
2297
2298 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2299 r = adev->ip_blocks[i].version->funcs->resume(adev);
2300 if (r) {
2301 DRM_ERROR("resume of IP block <%s> failed %d\n",
2302 adev->ip_blocks[i].version->funcs->name, r);
2303 return r;
2304 }
2305 } else {
2306 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2307 if (r) {
2308 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2309 adev->ip_blocks[i].version->funcs->name, r);
2310 return r;
2311 }
2312 }
2313
2314 adev->ip_blocks[i].status.hw = true;
2315 break;
2316 }
2317 }
2318
2319 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2320 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2321
2322 return r;
2323 }
2324
2325 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2326 {
2327 long timeout;
2328 int r, i;
2329
2330 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2331 struct amdgpu_ring *ring = adev->rings[i];
2332
2333 /* No need to setup the GPU scheduler for rings that don't need it */
2334 if (!ring || ring->no_scheduler)
2335 continue;
2336
2337 switch (ring->funcs->type) {
2338 case AMDGPU_RING_TYPE_GFX:
2339 timeout = adev->gfx_timeout;
2340 break;
2341 case AMDGPU_RING_TYPE_COMPUTE:
2342 timeout = adev->compute_timeout;
2343 break;
2344 case AMDGPU_RING_TYPE_SDMA:
2345 timeout = adev->sdma_timeout;
2346 break;
2347 default:
2348 timeout = adev->video_timeout;
2349 break;
2350 }
2351
2352 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2353 ring->num_hw_submission, amdgpu_job_hang_limit,
2354 timeout, adev->reset_domain->wq,
2355 ring->sched_score, ring->name,
2356 adev->dev);
2357 if (r) {
2358 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2359 ring->name);
2360 return r;
2361 }
2362 }
2363
2364 return 0;
2365 }
2366
2367
2368 /**
2369 * amdgpu_device_ip_init - run init for hardware IPs
2370 *
2371 * @adev: amdgpu_device pointer
2372 *
2373 * Main initialization pass for hardware IPs. The list of all the hardware
2374 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2375 * are run. sw_init initializes the software state associated with each IP
2376 * and hw_init initializes the hardware associated with each IP.
2377 * Returns 0 on success, negative error code on failure.
2378 */
2379 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2380 {
2381 int i, r;
2382
2383 r = amdgpu_ras_init(adev);
2384 if (r)
2385 return r;
2386
2387 for (i = 0; i < adev->num_ip_blocks; i++) {
2388 if (!adev->ip_blocks[i].status.valid)
2389 continue;
2390 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2391 if (r) {
2392 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2393 adev->ip_blocks[i].version->funcs->name, r);
2394 goto init_failed;
2395 }
2396 adev->ip_blocks[i].status.sw = true;
2397
2398 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2399 /* need to do common hw init early so everything is set up for gmc */
2400 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2401 if (r) {
2402 DRM_ERROR("hw_init %d failed %d\n", i, r);
2403 goto init_failed;
2404 }
2405 adev->ip_blocks[i].status.hw = true;
2406 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2407 /* need to do gmc hw init early so we can allocate gpu mem */
2408 /* Try to reserve bad pages early */
2409 if (amdgpu_sriov_vf(adev))
2410 amdgpu_virt_exchange_data(adev);
2411
2412 r = amdgpu_device_vram_scratch_init(adev);
2413 if (r) {
2414 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2415 goto init_failed;
2416 }
2417 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2418 if (r) {
2419 DRM_ERROR("hw_init %d failed %d\n", i, r);
2420 goto init_failed;
2421 }
2422 r = amdgpu_device_wb_init(adev);
2423 if (r) {
2424 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2425 goto init_failed;
2426 }
2427 adev->ip_blocks[i].status.hw = true;
2428
2429 /* right after GMC hw init, we create CSA */
2430 if (amdgpu_mcbp) {
2431 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2432 AMDGPU_GEM_DOMAIN_VRAM,
2433 AMDGPU_CSA_SIZE);
2434 if (r) {
2435 DRM_ERROR("allocate CSA failed %d\n", r);
2436 goto init_failed;
2437 }
2438 }
2439 }
2440 }
2441
2442 if (amdgpu_sriov_vf(adev))
2443 amdgpu_virt_init_data_exchange(adev);
2444
2445 r = amdgpu_ib_pool_init(adev);
2446 if (r) {
2447 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2448 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2449 goto init_failed;
2450 }
2451
2452 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2453 if (r)
2454 goto init_failed;
2455
2456 r = amdgpu_device_ip_hw_init_phase1(adev);
2457 if (r)
2458 goto init_failed;
2459
2460 r = amdgpu_device_fw_loading(adev);
2461 if (r)
2462 goto init_failed;
2463
2464 r = amdgpu_device_ip_hw_init_phase2(adev);
2465 if (r)
2466 goto init_failed;
2467
2468 /*
2469 * retired pages will be loaded from eeprom and reserved here;
2470 * this should be called after amdgpu_device_ip_hw_init_phase2 since
2471 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2472 * functional for I2C communication, which is only true at this point.
2473 *
2474 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2475 * failures caused by a bad gpu state and stops the amdgpu init process
2476 * accordingly. For other failures it still releases all the
2477 * resources and prints an error message, rather than returning a
2478 * negative value to the upper level.
2479 *
2480 * Note: theoretically, this should be called before all vram allocations
2481 * to protect retired pages from being reused
2482 */
2483 r = amdgpu_ras_recovery_init(adev);
2484 if (r)
2485 goto init_failed;
2486
2487 /*
2488 * In case of XGMI, grab an extra reference on the reset domain for this device
2489 */
2490 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2491 if (amdgpu_xgmi_add_device(adev) == 0) {
2492 if (!amdgpu_sriov_vf(adev)) {
2493 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2494
2495 if (WARN_ON(!hive)) {
2496 r = -ENOENT;
2497 goto init_failed;
2498 }
2499
2500 if (!hive->reset_domain ||
2501 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2502 r = -ENOENT;
2503 amdgpu_put_xgmi_hive(hive);
2504 goto init_failed;
2505 }
2506
2507 /* Drop the early temporary reset domain we created for device */
2508 amdgpu_reset_put_reset_domain(adev->reset_domain);
2509 adev->reset_domain = hive->reset_domain;
2510 amdgpu_put_xgmi_hive(hive);
2511 }
2512 }
2513 }
2514
2515 r = amdgpu_device_init_schedulers(adev);
2516 if (r)
2517 goto init_failed;
2518
2519 /* Don't init kfd if whole hive need to be reset during init */
2520 if (!adev->gmc.xgmi.pending_reset)
2521 amdgpu_amdkfd_device_init(adev);
2522
2523 amdgpu_fru_get_product_info(adev);
2524
2525 init_failed:
2526 if (amdgpu_sriov_vf(adev))
2527 amdgpu_virt_release_full_gpu(adev, true);
2528
2529 return r;
2530 }
2531
2532 /**
2533 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2534 *
2535 * @adev: amdgpu_device pointer
2536 *
2537 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2538 * this function before a GPU reset. If the value is retained after a
2539 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2540 */
2541 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2542 {
2543 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2544 }
2545
2546 /**
2547 * amdgpu_device_check_vram_lost - check if vram is valid
2548 *
2549 * @adev: amdgpu_device pointer
2550 *
2551 * Checks the reset magic value written to the gart pointer in VRAM.
2552 * The driver calls this after a GPU reset to see if the contents of
2553 * VRAM have been lost or not.
2554 * Returns true if vram is lost, false if not.
2555 */
2556 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2557 {
2558 if (memcmp(adev->gart.ptr, adev->reset_magic,
2559 AMDGPU_RESET_MAGIC_NUM))
2560 return true;
2561
2562 if (!amdgpu_in_reset(adev))
2563 return false;
2564
2565 /*
2566 * For all ASICs with baco/mode1 reset, the VRAM is
2567 * always assumed to be lost.
2568 */
2569 switch (amdgpu_asic_reset_method(adev)) {
2570 case AMD_RESET_METHOD_BACO:
2571 case AMD_RESET_METHOD_MODE1:
2572 return true;
2573 default:
2574 return false;
2575 }
2576 }
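/*
 * Note: amdgpu_device_fill_reset_magic() and amdgpu_device_check_vram_lost()
 * work as a pair: if the magic no longer matches after a reset, the recovery
 * path treats VRAM as lost and restores buffer contents (e.g. from shadow BOs)
 * instead of assuming they survived.
 */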
2577
2578 /**
2579 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2580 *
2581 * @adev: amdgpu_device pointer
2582 * @state: clockgating state (gate or ungate)
2583 *
2584 * The list of all the hardware IPs that make up the asic is walked and the
2585 * set_clockgating_state callbacks are run.
2586 * During late init this pass enables clockgating for the hardware IPs;
2587 * during fini or suspend it disables clockgating again.
2588 * Returns 0 on success, negative error code on failure.
2589 */
2590
2591 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2592 enum amd_clockgating_state state)
2593 {
2594 int i, j, r;
2595
2596 if (amdgpu_emu_mode == 1)
2597 return 0;
2598
2599 for (j = 0; j < adev->num_ip_blocks; j++) {
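		/*
		 * Gate in IP-block order during late init; ungate in reverse
		 * order on fini/suspend, mirroring the init/teardown sequence.
		 */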
2600 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2601 if (!adev->ip_blocks[i].status.late_initialized)
2602 continue;
2603 /* skip CG for GFX on S0ix */
2604 if (adev->in_s0ix &&
2605 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2606 continue;
2607 /* skip CG for VCE/UVD, it's handled specially */
2608 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2609 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2610 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2611 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2612 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2613 /* enable clockgating to save power */
2614 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2615 state);
2616 if (r) {
2617 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2618 adev->ip_blocks[i].version->funcs->name, r);
2619 return r;
2620 }
2621 }
2622 }
2623
2624 return 0;
2625 }
2626
2627 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2628 enum amd_powergating_state state)
2629 {
2630 int i, j, r;
2631
2632 if (amdgpu_emu_mode == 1)
2633 return 0;
2634
2635 for (j = 0; j < adev->num_ip_blocks; j++) {
2636 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2637 if (!adev->ip_blocks[i].status.late_initialized)
2638 continue;
2639 /* skip PG for GFX on S0ix */
2640 if (adev->in_s0ix &&
2641 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2642 continue;
2643 /* skip PG for VCE/UVD, it's handled specially */
2644 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2645 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2646 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2647 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2648 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2649 /* enable powergating to save power */
2650 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2651 state);
2652 if (r) {
2653 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2654 adev->ip_blocks[i].version->funcs->name, r);
2655 return r;
2656 }
2657 }
2658 }
2659 return 0;
2660 }
2661
2662 static int amdgpu_device_enable_mgpu_fan_boost(void)
2663 {
2664 struct amdgpu_gpu_instance *gpu_ins;
2665 struct amdgpu_device *adev;
2666 int i, ret = 0;
2667
2668 mutex_lock(&mgpu_info.mutex);
2669
2670 /*
2671 * MGPU fan boost feature should be enabled
2672 * only when there are two or more dGPUs in
2673 * the system
2674 */
2675 if (mgpu_info.num_dgpu < 2)
2676 goto out;
2677
2678 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2679 gpu_ins = &(mgpu_info.gpu_ins[i]);
2680 adev = gpu_ins->adev;
2681 if (!(adev->flags & AMD_IS_APU) &&
2682 !gpu_ins->mgpu_fan_enabled) {
2683 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2684 if (ret)
2685 break;
2686
2687 gpu_ins->mgpu_fan_enabled = 1;
2688 }
2689 }
2690
2691 out:
2692 mutex_unlock(&mgpu_info.mutex);
2693
2694 return ret;
2695 }
2696
2697 /**
2698 * amdgpu_device_ip_late_init - run late init for hardware IPs
2699 *
2700 * @adev: amdgpu_device pointer
2701 *
2702 * Late initialization pass for hardware IPs. The list of all the hardware
2703 * IPs that make up the asic is walked and the late_init callbacks are run.
2704 * late_init covers any special initialization that an IP requires
2705 * after all of the IPs have been initialized or something that needs to happen
2706 * late in the init process.
2707 * Returns 0 on success, negative error code on failure.
2708 */
2709 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2710 {
2711 struct amdgpu_gpu_instance *gpu_instance;
2712 int i = 0, r;
2713
2714 for (i = 0; i < adev->num_ip_blocks; i++) {
2715 if (!adev->ip_blocks[i].status.hw)
2716 continue;
2717 if (adev->ip_blocks[i].version->funcs->late_init) {
2718 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2719 if (r) {
2720 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2721 adev->ip_blocks[i].version->funcs->name, r);
2722 return r;
2723 }
2724 }
2725 adev->ip_blocks[i].status.late_initialized = true;
2726 }
2727
2728 r = amdgpu_ras_late_init(adev);
2729 if (r) {
2730 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2731 return r;
2732 }
2733
2734 amdgpu_ras_set_error_query_ready(adev, true);
2735
2736 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2737 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2738
2739 amdgpu_device_fill_reset_magic(adev);
2740
2741 r = amdgpu_device_enable_mgpu_fan_boost();
2742 if (r)
2743 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2744
2745 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */
2746 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2747 adev->asic_type == CHIP_ALDEBARAN))
2748 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2749
2750 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2751 mutex_lock(&mgpu_info.mutex);
2752
2753 /*
2754 * Reset device p-state to low as this was booted with high.
2755 *
2756 * This should be performed only after all devices from the same
2757 * hive get initialized.
2758 *
2759 * However, the number of devices in the hive is not known in advance;
2760 * it is counted one by one as each device initializes.
2761 *
2762 * So, we wait until all XGMI interlinked devices are initialized.
2763 * This may bring some delays as those devices may come from
2764 * different hives. But that should be OK.
2765 */
2766 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2767 for (i = 0; i < mgpu_info.num_gpu; i++) {
2768 gpu_instance = &(mgpu_info.gpu_ins[i]);
2769 if (gpu_instance->adev->flags & AMD_IS_APU)
2770 continue;
2771
2772 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2773 AMDGPU_XGMI_PSTATE_MIN);
2774 if (r) {
2775 DRM_ERROR("pstate setting failed (%d).\n", r);
2776 break;
2777 }
2778 }
2779 }
2780
2781 mutex_unlock(&mgpu_info.mutex);
2782 }
2783
2784 return 0;
2785 }
2786
2787 /**
2788 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2789 *
2790 * @adev: amdgpu_device pointer
2791 *
2792 * For ASICs that need to disable the SMC first
2793 */
2794 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2795 {
2796 int i, r;
2797
2798 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2799 return;
2800
2801 for (i = 0; i < adev->num_ip_blocks; i++) {
2802 if (!adev->ip_blocks[i].status.hw)
2803 continue;
2804 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2805 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 /* XXX handle errors */
2807 if (r) {
2808 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 }
2811 adev->ip_blocks[i].status.hw = false;
2812 break;
2813 }
2814 }
2815 }
2816
2817 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2818 {
2819 int i, r;
2820
2821 for (i = 0; i < adev->num_ip_blocks; i++) {
2822 if (!adev->ip_blocks[i].version->funcs->early_fini)
2823 continue;
2824
2825 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2826 if (r) {
2827 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2828 adev->ip_blocks[i].version->funcs->name, r);
2829 }
2830 }
2831
2832 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2833 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2834
2835 amdgpu_amdkfd_suspend(adev, false);
2836
2837 /* Workaround for ASICs that need to disable the SMC first */
2838 amdgpu_device_smu_fini_early(adev);
2839
2840 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2841 if (!adev->ip_blocks[i].status.hw)
2842 continue;
2843
2844 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2845 /* XXX handle errors */
2846 if (r) {
2847 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2848 adev->ip_blocks[i].version->funcs->name, r);
2849 }
2850
2851 adev->ip_blocks[i].status.hw = false;
2852 }
2853
2854 if (amdgpu_sriov_vf(adev)) {
2855 if (amdgpu_virt_release_full_gpu(adev, false))
2856 DRM_ERROR("failed to release exclusive mode on fini\n");
2857 }
2858
2859 return 0;
2860 }
2861
2862 /**
2863 * amdgpu_device_ip_fini - run fini for hardware IPs
2864 *
2865 * @adev: amdgpu_device pointer
2866 *
2867 * Main teardown pass for hardware IPs. The list of all the hardware
2868 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2869 * are run. hw_fini tears down the hardware associated with each IP
2870 * and sw_fini tears down any software state associated with each IP.
2871 * Returns 0 on success, negative error code on failure.
2872 */
2873 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2874 {
2875 int i, r;
2876
2877 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2878 amdgpu_virt_release_ras_err_handler_data(adev);
2879
2880 if (adev->gmc.xgmi.num_physical_nodes > 1)
2881 amdgpu_xgmi_remove_device(adev);
2882
2883 amdgpu_amdkfd_device_fini_sw(adev);
2884
2885 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2886 if (!adev->ip_blocks[i].status.sw)
2887 continue;
2888
2889 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2890 amdgpu_ucode_free_bo(adev);
2891 amdgpu_free_static_csa(&adev->virt.csa_obj);
2892 amdgpu_device_wb_fini(adev);
2893 amdgpu_device_vram_scratch_fini(adev);
2894 amdgpu_ib_pool_fini(adev);
2895 }
2896
2897 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2898 /* XXX handle errors */
2899 if (r) {
2900 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2901 adev->ip_blocks[i].version->funcs->name, r);
2902 }
2903 adev->ip_blocks[i].status.sw = false;
2904 adev->ip_blocks[i].status.valid = false;
2905 }
2906
2907 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2908 if (!adev->ip_blocks[i].status.late_initialized)
2909 continue;
2910 if (adev->ip_blocks[i].version->funcs->late_fini)
2911 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2912 adev->ip_blocks[i].status.late_initialized = false;
2913 }
2914
2915 amdgpu_ras_fini(adev);
2916
2917 return 0;
2918 }
2919
2920 /**
2921 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2922 *
2923 * @work: work_struct.
2924 */
2925 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2926 {
2927 struct amdgpu_device *adev =
2928 container_of(work, struct amdgpu_device, delayed_init_work.work);
2929 int r;
2930
2931 r = amdgpu_ib_ring_tests(adev);
2932 if (r)
2933 DRM_ERROR("ib ring test failed (%d).\n", r);
2934 }
2935
2936 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2937 {
2938 struct amdgpu_device *adev =
2939 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2940
2941 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2942 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2943
2944 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2945 adev->gfx.gfx_off_state = true;
2946 }
2947
2948 /**
2949 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2950 *
2951 * @adev: amdgpu_device pointer
2952 *
2953 * Main suspend function for hardware IPs. The list of all the hardware
2954 * IPs that make up the asic is walked, clockgating is disabled and the
2955 * suspend callbacks are run. suspend puts the hardware and software state
2956 * in each IP into a state suitable for suspend.
2957 * Returns 0 on success, negative error code on failure.
2958 */
2959 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2960 {
2961 int i, r;
2962
2963 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2964 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2965
2966 /*
2967 * Per the PMFW team's suggestion, the driver needs to handle disabling
2968 * the gfxoff and df cstate features for the gpu reset (e.g. Mode1Reset)
2969 * scenario. Add the missing df cstate disablement here.
2970 */
2971 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2972 dev_warn(adev->dev, "Failed to disallow df cstate");
2973
2974 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2975 if (!adev->ip_blocks[i].status.valid)
2976 continue;
2977
2978 /* displays are handled separately */
2979 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2980 continue;
2981
2982 /* XXX handle errors */
2983 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2984 /* XXX handle errors */
2985 if (r) {
2986 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2987 adev->ip_blocks[i].version->funcs->name, r);
2988 return r;
2989 }
2990
2991 adev->ip_blocks[i].status.hw = false;
2992 }
2993
2994 return 0;
2995 }
2996
2997 /**
2998 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2999 *
3000 * @adev: amdgpu_device pointer
3001 *
3002 * Main suspend function for hardware IPs. The list of all the hardware
3003 * IPs that make up the asic is walked, clockgating is disabled and the
3004 * suspend callbacks are run. suspend puts the hardware and software state
3005 * in each IP into a state suitable for suspend.
3006 * Returns 0 on success, negative error code on failure.
3007 */
3008 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3009 {
3010 int i, r;
3011
3012 if (adev->in_s0ix)
3013 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3014
3015 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3016 if (!adev->ip_blocks[i].status.valid)
3017 continue;
3018 /* displays are handled in phase1 */
3019 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3020 continue;
3021 /* PSP lost connection when err_event_athub occurs */
3022 if (amdgpu_ras_intr_triggered() &&
3023 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3024 adev->ip_blocks[i].status.hw = false;
3025 continue;
3026 }
3027
3028 /* skip unnecessary suspend if we have not initialized them yet */
3029 if (adev->gmc.xgmi.pending_reset &&
3030 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3031 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3032 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3033 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3034 adev->ip_blocks[i].status.hw = false;
3035 continue;
3036 }
3037
3038 /* skip suspend of gfx/mes and psp for S0ix
3039 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3040 * like at runtime. PSP is also part of the always on hardware
3041 * so no need to suspend it.
3042 */
3043 if (adev->in_s0ix &&
3044 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3045 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3046 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3047 continue;
3048
3049 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3050 if (adev->in_s0ix &&
3051 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3052 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3053 continue;
3054
3055 /* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
3056 * These live in the TMR and are expected to be reused by the PSP-TOS to
3057 * reload from that location; RLC autoload is also loaded from there based
3058 * on the PMFW -> PSP message during the re-init sequence.
3059 * Therefore, psp suspend & resume should be skipped to avoid destroying
3060 * the TMR and reloading the FWs again for IMU-enabled APU ASICs.
3061 */
3062 if (amdgpu_in_reset(adev) &&
3063 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3064 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3065 continue;
3066
3067 /* XXX handle errors */
3068 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3069 /* XXX handle errors */
3070 if (r) {
3071 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3072 adev->ip_blocks[i].version->funcs->name, r);
3073 }
3074 adev->ip_blocks[i].status.hw = false;
3075 /* handle putting the SMC in the appropriate state */
3076 if (!amdgpu_sriov_vf(adev)) {
3077 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3078 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3079 if (r) {
3080 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3081 adev->mp1_state, r);
3082 return r;
3083 }
3084 }
3085 }
3086 }
3087
3088 return 0;
3089 }
3090
3091 /**
3092 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3093 *
3094 * @adev: amdgpu_device pointer
3095 *
3096 * Main suspend function for hardware IPs. The list of all the hardware
3097 * IPs that make up the asic is walked, clockgating is disabled and the
3098 * suspend callbacks are run. suspend puts the hardware and software state
3099 * in each IP into a state suitable for suspend.
3100 * Returns 0 on success, negative error code on failure.
3101 */
3102 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3103 {
3104 int r;
3105
3106 if (amdgpu_sriov_vf(adev)) {
3107 amdgpu_virt_fini_data_exchange(adev);
3108 amdgpu_virt_request_full_gpu(adev, false);
3109 }
3110
3111 r = amdgpu_device_ip_suspend_phase1(adev);
3112 if (r)
3113 return r;
3114 r = amdgpu_device_ip_suspend_phase2(adev);
3115
3116 if (amdgpu_sriov_vf(adev))
3117 amdgpu_virt_release_full_gpu(adev, false);
3118
3119 return r;
3120 }
3121
3122 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3123 {
3124 int i, r;
3125
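	/*
	 * Minimal bring-up order when re-initializing a VF after reset: the
	 * common, GMC, PSP and IH blocks must be back up before anything else
	 * touches the hardware; the remaining blocks are handled later by
	 * amdgpu_device_ip_reinit_late_sriov().
	 */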
3126 static enum amd_ip_block_type ip_order[] = {
3127 AMD_IP_BLOCK_TYPE_COMMON,
3128 AMD_IP_BLOCK_TYPE_GMC,
3129 AMD_IP_BLOCK_TYPE_PSP,
3130 AMD_IP_BLOCK_TYPE_IH,
3131 };
3132
3133 for (i = 0; i < adev->num_ip_blocks; i++) {
3134 int j;
3135 struct amdgpu_ip_block *block;
3136
3137 block = &adev->ip_blocks[i];
3138 block->status.hw = false;
3139
3140 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3141
3142 if (block->version->type != ip_order[j] ||
3143 !block->status.valid)
3144 continue;
3145
3146 r = block->version->funcs->hw_init(adev);
3147 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3148 if (r)
3149 return r;
3150 block->status.hw = true;
3151 }
3152 }
3153
3154 return 0;
3155 }
3156
3157 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3158 {
3159 int i, r;
3160
3161 static enum amd_ip_block_type ip_order[] = {
3162 AMD_IP_BLOCK_TYPE_SMC,
3163 AMD_IP_BLOCK_TYPE_DCE,
3164 AMD_IP_BLOCK_TYPE_GFX,
3165 AMD_IP_BLOCK_TYPE_SDMA,
3166 AMD_IP_BLOCK_TYPE_UVD,
3167 AMD_IP_BLOCK_TYPE_VCE,
3168 AMD_IP_BLOCK_TYPE_VCN
3169 };
3170
3171 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3172 int j;
3173 struct amdgpu_ip_block *block;
3174
3175 for (j = 0; j < adev->num_ip_blocks; j++) {
3176 block = &adev->ip_blocks[j];
3177
3178 if (block->version->type != ip_order[i] ||
3179 !block->status.valid ||
3180 block->status.hw)
3181 continue;
3182
3183 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3184 r = block->version->funcs->resume(adev);
3185 else
3186 r = block->version->funcs->hw_init(adev);
3187
3188 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3189 if (r)
3190 return r;
3191 block->status.hw = true;
3192 }
3193 }
3194
3195 return 0;
3196 }
3197
3198 /**
3199 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3200 *
3201 * @adev: amdgpu_device pointer
3202 *
3203 * First resume function for hardware IPs. The list of all the hardware
3204 * IPs that make up the asic is walked and the resume callbacks are run for
3205 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3206 * after a suspend and updates the software state as necessary. This
3207 * function is also used for restoring the GPU after a GPU reset.
3208 * Returns 0 on success, negative error code on failure.
3209 */
3210 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3211 {
3212 int i, r;
3213
3214 for (i = 0; i < adev->num_ip_blocks; i++) {
3215 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3216 continue;
3217 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3218 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3220 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3221
3222 r = adev->ip_blocks[i].version->funcs->resume(adev);
3223 if (r) {
3224 DRM_ERROR("resume of IP block <%s> failed %d\n",
3225 adev->ip_blocks[i].version->funcs->name, r);
3226 return r;
3227 }
3228 adev->ip_blocks[i].status.hw = true;
3229 }
3230 }
3231
3232 return 0;
3233 }
3234
3235 /**
3236 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3237 *
3238 * @adev: amdgpu_device pointer
3239 *
3240 * Second resume function for hardware IPs. The list of all the hardware
3241 * IPs that make up the asic is walked and the resume callbacks are run for
3242 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3243 * functional state after a suspend and updates the software state as
3244 * necessary. This function is also used for restoring the GPU after a GPU
3245 * reset.
3246 * Returns 0 on success, negative error code on failure.
3247 */
3248 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3249 {
3250 int i, r;
3251
3252 for (i = 0; i < adev->num_ip_blocks; i++) {
3253 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3254 continue;
3255 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3256 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3257 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3258 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3259 continue;
3260 r = adev->ip_blocks[i].version->funcs->resume(adev);
3261 if (r) {
3262 DRM_ERROR("resume of IP block <%s> failed %d\n",
3263 adev->ip_blocks[i].version->funcs->name, r);
3264 return r;
3265 }
3266 adev->ip_blocks[i].status.hw = true;
3267
3268 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3269 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in
3270 * amdgpu_device_resume() after IP resume.
3271 */
3272 amdgpu_gfx_off_ctrl(adev, false);
3273 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
3274 }
3275
3276 }
3277
3278 return 0;
3279 }
3280
3281 /**
3282 * amdgpu_device_ip_resume - run resume for hardware IPs
3283 *
3284 * @adev: amdgpu_device pointer
3285 *
3286 * Main resume function for hardware IPs. The hardware IPs
3287 * are split into two resume functions because they
3288 * are also used in recovering from a GPU reset and some additional
3289 * steps need to be taken between them. In this case (S3/S4) they are
3290 * run sequentially.
3291 * Returns 0 on success, negative error code on failure.
3292 */
3293 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3294 {
3295 int r;
3296
3297 r = amdgpu_amdkfd_resume_iommu(adev);
3298 if (r)
3299 return r;
3300
3301 r = amdgpu_device_ip_resume_phase1(adev);
3302 if (r)
3303 return r;
3304
3305 r = amdgpu_device_fw_loading(adev);
3306 if (r)
3307 return r;
3308
3309 r = amdgpu_device_ip_resume_phase2(adev);
3310
3311 return r;
3312 }
3313
3314 /**
3315 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3316 *
3317 * @adev: amdgpu_device pointer
3318 *
3319 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3320 */
3321 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3322 {
3323 if (amdgpu_sriov_vf(adev)) {
3324 if (adev->is_atom_fw) {
3325 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3326 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3327 } else {
3328 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3329 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3330 }
3331
3332 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3333 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3334 }
3335 }
3336
3337 /**
3338 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3339 *
3340 * @asic_type: AMD asic type
3341 *
3342 * Check if there is DC (new modesetting infrastructure) support for an asic.
3343 * returns true if DC has support, false if not.
3344 */
3345 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3346 {
3347 switch (asic_type) {
3348 #ifdef CONFIG_DRM_AMDGPU_SI
3349 case CHIP_HAINAN:
3350 #endif
3351 case CHIP_TOPAZ:
3352 /* chips with no display hardware */
3353 return false;
3354 #if defined(CONFIG_DRM_AMD_DC)
3355 case CHIP_TAHITI:
3356 case CHIP_PITCAIRN:
3357 case CHIP_VERDE:
3358 case CHIP_OLAND:
3359 /*
3360 * We have systems in the wild with these ASICs that require
3361 * LVDS and VGA support which is not supported with DC.
3362 *
3363 * Fallback to the non-DC driver here by default so as not to
3364 * cause regressions.
3365 */
3366 #if defined(CONFIG_DRM_AMD_DC_SI)
3367 return amdgpu_dc > 0;
3368 #else
3369 return false;
3370 #endif
3371 case CHIP_BONAIRE:
3372 case CHIP_KAVERI:
3373 case CHIP_KABINI:
3374 case CHIP_MULLINS:
3375 /*
3376 * We have systems in the wild with these ASICs that require
3377 * VGA support which is not supported with DC.
3378 *
3379 * Fallback to the non-DC driver here by default so as not to
3380 * cause regressions.
3381 */
3382 return amdgpu_dc > 0;
3383 default:
3384 return amdgpu_dc != 0;
3385 #else
3386 default:
3387 if (amdgpu_dc > 0)
3388 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3389 "but isn't supported by ASIC, ignoring\n");
3390 return false;
3391 #endif
3392 }
3393 }
3394
3395 /**
3396 * amdgpu_device_has_dc_support - check if dc is supported
3397 *
3398 * @adev: amdgpu_device pointer
3399 *
3400 * Returns true for supported, false for not supported
3401 */
3402 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3403 {
3404 if (adev->enable_virtual_display ||
3405 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3406 return false;
3407
3408 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3409 }
3410
3411 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3412 {
3413 struct amdgpu_device *adev =
3414 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3415 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3416
3417 /* It's a bug to not have a hive within this function */
3418 if (WARN_ON(!hive))
3419 return;
3420
3421 /*
3422 * Use task barrier to synchronize all xgmi reset works across the
3423 * hive. task_barrier_enter and task_barrier_exit will block
3424 * until all the threads running the xgmi reset works reach
3425 * those points. task_barrier_full will do both blocks.
3426 */
3427 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3428
3429 task_barrier_enter(&hive->tb);
3430 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3431
3432 if (adev->asic_reset_res)
3433 goto fail;
3434
3435 task_barrier_exit(&hive->tb);
3436 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3437
3438 if (adev->asic_reset_res)
3439 goto fail;
3440
3441 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3442 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3443 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3444 } else {
3445
3446 task_barrier_full(&hive->tb);
3447 adev->asic_reset_res = amdgpu_asic_reset(adev);
3448 }
3449
3450 fail:
3451 if (adev->asic_reset_res)
3452 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3453 adev->asic_reset_res, adev_to_drm(adev)->unique);
3454 amdgpu_put_xgmi_hive(hive);
3455 }
3456
3457 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3458 {
3459 char *input = amdgpu_lockup_timeout;
3460 char *timeout_setting = NULL;
3461 int index = 0;
3462 long timeout;
3463 int ret = 0;
3464
3465 /*
3466 * By default the timeout for non-compute jobs is 10000
3467 * and 60000 for compute jobs.
3468 * In SR-IOV or passthrough mode, the timeout for compute
3469 * jobs is 60000 by default.
3470 */
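	/*
	 * Illustrative lockup_timeout values (in ms), parsed below in the
	 * order GFX, compute, SDMA, video:
	 *
	 *	amdgpu.lockup_timeout=10000,60000,10000,10000
	 *
	 * A single value applies to all non-compute jobs, 0 keeps the default
	 * and a negative value disables the timeout.
	 */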
3471 adev->gfx_timeout = msecs_to_jiffies(10000);
3472 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3473 if (amdgpu_sriov_vf(adev))
3474 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3475 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3476 else
3477 adev->compute_timeout = msecs_to_jiffies(60000);
3478
3479 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3480 while ((timeout_setting = strsep(&input, ",")) &&
3481 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3482 ret = kstrtol(timeout_setting, 0, &timeout);
3483 if (ret)
3484 return ret;
3485
3486 if (timeout == 0) {
3487 index++;
3488 continue;
3489 } else if (timeout < 0) {
3490 timeout = MAX_SCHEDULE_TIMEOUT;
3491 dev_warn(adev->dev, "lockup timeout disabled");
3492 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3493 } else {
3494 timeout = msecs_to_jiffies(timeout);
3495 }
3496
3497 switch (index++) {
3498 case 0:
3499 adev->gfx_timeout = timeout;
3500 break;
3501 case 1:
3502 adev->compute_timeout = timeout;
3503 break;
3504 case 2:
3505 adev->sdma_timeout = timeout;
3506 break;
3507 case 3:
3508 adev->video_timeout = timeout;
3509 break;
3510 default:
3511 break;
3512 }
3513 }
3514 /*
3515 * There is only one value specified and
3516 * it should apply to all non-compute jobs.
3517 */
3518 if (index == 1) {
3519 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3520 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3521 adev->compute_timeout = adev->gfx_timeout;
3522 }
3523 }
3524
3525 return ret;
3526 }
3527
3528 /**
3529 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3530 *
3531 * @adev: amdgpu_device pointer
3532 *
3533 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3534 */
3535 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3536 {
3537 struct iommu_domain *domain;
3538
3539 domain = iommu_get_domain_for_dev(adev->dev);
3540 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3541 adev->ram_is_direct_mapped = true;
3542 }
3543
3544 static const struct attribute *amdgpu_dev_attributes[] = {
3545 &dev_attr_product_name.attr,
3546 &dev_attr_product_number.attr,
3547 &dev_attr_serial_number.attr,
3548 &dev_attr_pcie_replay_count.attr,
3549 NULL
3550 };
3551
3552 /**
3553 * amdgpu_device_init - initialize the driver
3554 *
3555 * @adev: amdgpu_device pointer
3556 * @flags: driver flags
3557 *
3558 * Initializes the driver info and hw (all asics).
3559 * Returns 0 for success or an error on failure.
3560 * Called at driver startup.
3561 */
3562 int amdgpu_device_init(struct amdgpu_device *adev,
3563 uint32_t flags)
3564 {
3565 struct drm_device *ddev = adev_to_drm(adev);
3566 struct pci_dev *pdev = adev->pdev;
3567 int r, i;
3568 bool px = false;
3569 u32 max_MBps;
3570
3571 adev->shutdown = false;
3572 adev->flags = flags;
3573
3574 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3575 adev->asic_type = amdgpu_force_asic_type;
3576 else
3577 adev->asic_type = flags & AMD_ASIC_MASK;
3578
3579 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3580 if (amdgpu_emu_mode == 1)
3581 adev->usec_timeout *= 10;
3582 adev->gmc.gart_size = 512 * 1024 * 1024;
3583 adev->accel_working = false;
3584 adev->num_rings = 0;
3585 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3586 adev->mman.buffer_funcs = NULL;
3587 adev->mman.buffer_funcs_ring = NULL;
3588 adev->vm_manager.vm_pte_funcs = NULL;
3589 adev->vm_manager.vm_pte_num_scheds = 0;
3590 adev->gmc.gmc_funcs = NULL;
3591 adev->harvest_ip_mask = 0x0;
3592 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3593 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3594
3595 adev->smc_rreg = &amdgpu_invalid_rreg;
3596 adev->smc_wreg = &amdgpu_invalid_wreg;
3597 adev->pcie_rreg = &amdgpu_invalid_rreg;
3598 adev->pcie_wreg = &amdgpu_invalid_wreg;
3599 adev->pciep_rreg = &amdgpu_invalid_rreg;
3600 adev->pciep_wreg = &amdgpu_invalid_wreg;
3601 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3602 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3603 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3604 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3605 adev->didt_rreg = &amdgpu_invalid_rreg;
3606 adev->didt_wreg = &amdgpu_invalid_wreg;
3607 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3608 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3609 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3610 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3611
3612 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3613 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3614 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3615
3616 /* mutex initialization is all done here so we
3617 * can recall functions without having locking issues */
3618 mutex_init(&adev->firmware.mutex);
3619 mutex_init(&adev->pm.mutex);
3620 mutex_init(&adev->gfx.gpu_clock_mutex);
3621 mutex_init(&adev->srbm_mutex);
3622 mutex_init(&adev->gfx.pipe_reserve_mutex);
3623 mutex_init(&adev->gfx.gfx_off_mutex);
3624 mutex_init(&adev->grbm_idx_mutex);
3625 mutex_init(&adev->mn_lock);
3626 mutex_init(&adev->virt.vf_errors.lock);
3627 hash_init(adev->mn_hash);
3628 mutex_init(&adev->psp.mutex);
3629 mutex_init(&adev->notifier_lock);
3630 mutex_init(&adev->pm.stable_pstate_ctx_lock);
3631 mutex_init(&adev->benchmark_mutex);
3632
3633 amdgpu_device_init_apu_flags(adev);
3634
3635 r = amdgpu_device_check_arguments(adev);
3636 if (r)
3637 return r;
3638
3639 spin_lock_init(&adev->mmio_idx_lock);
3640 spin_lock_init(&adev->smc_idx_lock);
3641 spin_lock_init(&adev->pcie_idx_lock);
3642 spin_lock_init(&adev->uvd_ctx_idx_lock);
3643 spin_lock_init(&adev->didt_idx_lock);
3644 spin_lock_init(&adev->gc_cac_idx_lock);
3645 spin_lock_init(&adev->se_cac_idx_lock);
3646 spin_lock_init(&adev->audio_endpt_idx_lock);
3647 spin_lock_init(&adev->mm_stats.lock);
3648
3649 INIT_LIST_HEAD(&adev->shadow_list);
3650 mutex_init(&adev->shadow_list_lock);
3651
3652 INIT_LIST_HEAD(&adev->reset_list);
3653
3654 INIT_LIST_HEAD(&adev->ras_list);
3655
3656 INIT_DELAYED_WORK(&adev->delayed_init_work,
3657 amdgpu_device_delayed_init_work_handler);
3658 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3659 amdgpu_device_delay_enable_gfx_off);
3660
3661 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3662
3663 adev->gfx.gfx_off_req_count = 1;
3664 adev->gfx.gfx_off_residency = 0;
3665 adev->gfx.gfx_off_entrycount = 0;
3666 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3667
3668 atomic_set(&adev->throttling_logging_enabled, 1);
3669 /*
3670 * If throttling continues, logging will be performed every minute
3671 * to avoid log flooding. "-1" is subtracted since the thermal
3672 * throttling interrupt comes every second. Thus, the total logging
3673 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3674 * for throttling interrupt) = 60 seconds.
3675 */
3676 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3677 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3678
3679 /* Registers mapping */
3680 /* TODO: block userspace mapping of io register */
3681 if (adev->asic_type >= CHIP_BONAIRE) {
3682 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3683 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3684 } else {
3685 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3686 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3687 }
3688
3689 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3690 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3691
3692 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3693 if (adev->rmmio == NULL) {
3694 return -ENOMEM;
3695 }
3696 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3697 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3698
3699 amdgpu_device_get_pcie_info(adev);
3700
3701 if (amdgpu_mcbp)
3702 DRM_INFO("MCBP is enabled\n");
3703
3704 /*
3705 * The reset domain needs to be present early, before the XGMI hive is
3706 * discovered (if any) and initialized, so that the reset sem and in_gpu_reset
3707 * flag can be used early during init and before calling RREG32.
3708 */
3709 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3710 if (!adev->reset_domain)
3711 return -ENOMEM;
3712
3713 /* detect hw virtualization here */
3714 amdgpu_detect_virtualization(adev);
3715
3716 r = amdgpu_device_get_job_timeout_settings(adev);
3717 if (r) {
3718 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3719 return r;
3720 }
3721
3722 /* early init functions */
3723 r = amdgpu_device_ip_early_init(adev);
3724 if (r)
3725 return r;
3726
3727 /* Get rid of things like offb */
3728 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3729 if (r)
3730 return r;
3731
3732 /* Enable TMZ based on IP_VERSION */
3733 amdgpu_gmc_tmz_set(adev);
3734
3735 amdgpu_gmc_noretry_set(adev);
3736 /* Need to get xgmi info early to decide the reset behavior */
3737 if (adev->gmc.xgmi.supported) {
3738 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3739 if (r)
3740 return r;
3741 }
3742
3743 /* enable PCIE atomic ops */
3744 if (amdgpu_sriov_vf(adev))
3745 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3746 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3747 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3748 else
3749 adev->have_atomics_support =
3750 !pci_enable_atomic_ops_to_root(adev->pdev,
3751 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3752 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3753 if (!adev->have_atomics_support)
3754 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
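
/*
 * A minimal sketch of the bare-metal path above, with an arbitrary
 * pci_dev: pci_enable_atomic_ops_to_root() returns 0 only when the path
 * up to the root port can route (and the root port can complete) the
 * requested AtomicOp widths, so a nonzero return simply means the device
 * must not issue those atomics.
 *
 *	u32 caps = PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
 *		   PCI_EXP_DEVCAP2_ATOMIC_COMP64;
 *
 *	if (pci_enable_atomic_ops_to_root(pdev, caps))
 *		dev_info(&pdev->dev, "PCIe atomics not available\n");
 */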
3755
3756 /* doorbell bar mapping and doorbell index init */
3757 amdgpu_device_doorbell_init(adev);
3758
3759 if (amdgpu_emu_mode == 1) {
3760 /* post the asic in emulation mode */
3761 emu_soc_asic_init(adev);
3762 goto fence_driver_init;
3763 }
3764
3765 amdgpu_reset_init(adev);
3766
3767 /* detect whether we have an SRIOV vbios */
3768 amdgpu_device_detect_sriov_bios(adev);
3769
3770 /* check if we need to reset the asic
3771 * E.g., driver was not cleanly unloaded previously, etc.
3772 */
3773 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3774 if (adev->gmc.xgmi.num_physical_nodes) {
3775 dev_info(adev->dev, "Pending hive reset.\n");
3776 adev->gmc.xgmi.pending_reset = true;
3777 /* Only need to init the necessary blocks for SMU to handle the reset */
3778 for (i = 0; i < adev->num_ip_blocks; i++) {
3779 if (!adev->ip_blocks[i].status.valid)
3780 continue;
3781 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3782 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3783 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3785 DRM_DEBUG("IP %s disabled for hw_init.\n",
3786 adev->ip_blocks[i].version->funcs->name);
3787 adev->ip_blocks[i].status.hw = true;
3788 }
3789 }
3790 } else {
3791 r = amdgpu_asic_reset(adev);
3792 if (r) {
3793 dev_err(adev->dev, "asic reset on init failed\n");
3794 goto failed;
3795 }
3796 }
3797 }
3798
3799 pci_enable_pcie_error_reporting(adev->pdev);
3800
3801 /* Post card if necessary */
3802 if (amdgpu_device_need_post(adev)) {
3803 if (!adev->bios) {
3804 dev_err(adev->dev, "no vBIOS found\n");
3805 r = -EINVAL;
3806 goto failed;
3807 }
3808 DRM_INFO("GPU posting now...\n");
3809 r = amdgpu_device_asic_init(adev);
3810 if (r) {
3811 dev_err(adev->dev, "gpu post error!\n");
3812 goto failed;
3813 }
3814 }
3815
3816 if (adev->is_atom_fw) {
3817 /* Initialize clocks */
3818 r = amdgpu_atomfirmware_get_clock_info(adev);
3819 if (r) {
3820 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3821 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3822 goto failed;
3823 }
3824 } else {
3825 /* Initialize clocks */
3826 r = amdgpu_atombios_get_clock_info(adev);
3827 if (r) {
3828 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3829 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3830 goto failed;
3831 }
3832 /* init i2c buses */
3833 if (!amdgpu_device_has_dc_support(adev))
3834 amdgpu_atombios_i2c_init(adev);
3835 }
3836
3837 fence_driver_init:
3838 /* Fence driver */
3839 r = amdgpu_fence_driver_sw_init(adev);
3840 if (r) {
3841 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3842 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3843 goto failed;
3844 }
3845
3846 /* init the mode config */
3847 drm_mode_config_init(adev_to_drm(adev));
3848
3849 r = amdgpu_device_ip_init(adev);
3850 if (r) {
3851 /* failed in exclusive mode due to timeout */
3852 if (amdgpu_sriov_vf(adev) &&
3853 !amdgpu_sriov_runtime(adev) &&
3854 amdgpu_virt_mmio_blocked(adev) &&
3855 !amdgpu_virt_wait_reset(adev)) {
3856 dev_err(adev->dev, "VF exclusive mode timeout\n");
3857 /* Don't send request since VF is inactive. */
3858 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3859 adev->virt.ops = NULL;
3860 r = -EAGAIN;
3861 goto release_ras_con;
3862 }
3863 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3864 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3865 goto release_ras_con;
3866 }
3867
3868 amdgpu_fence_driver_hw_init(adev);
3869
3870 dev_info(adev->dev,
3871 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3872 adev->gfx.config.max_shader_engines,
3873 adev->gfx.config.max_sh_per_se,
3874 adev->gfx.config.max_cu_per_sh,
3875 adev->gfx.cu_info.number);
3876
3877 adev->accel_working = true;
3878
3879 amdgpu_vm_check_compute_bug(adev);
3880
3881 /* Initialize the buffer migration limit. */
3882 if (amdgpu_moverate >= 0)
3883 max_MBps = amdgpu_moverate;
3884 else
3885 max_MBps = 8; /* Allow 8 MB/s. */
3886 /* Get a log2 for easy divisions. */
3887 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
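
/*
 * Worked example of the log2 trick above, with the default
 * amdgpu_moverate of 8 MB/s: log2_max_MBps = ilog2(8) = 3, so later
 * accounting can replace a divide or multiply by the rate with a shift
 * (at the cost of rounding the rate down to a power of two), e.g.
 *
 *	approx_time  = bytes >> adev->mm_stats.log2_max_MBps;
 *	approx_bytes = time  << adev->mm_stats.log2_max_MBps;
 */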
3888
3889 r = amdgpu_pm_sysfs_init(adev);
3890 if (r) {
3891 adev->pm_sysfs_en = false;
3892 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3893 } else
3894 adev->pm_sysfs_en = true;
3895
3896 r = amdgpu_ucode_sysfs_init(adev);
3897 if (r) {
3898 adev->ucode_sysfs_en = false;
3899 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3900 } else
3901 adev->ucode_sysfs_en = true;
3902
3903 r = amdgpu_psp_sysfs_init(adev);
3904 if (r) {
3905 adev->psp_sysfs_en = false;
3906 if (!amdgpu_sriov_vf(adev))
3907 DRM_ERROR("Creating psp sysfs failed\n");
3908 } else
3909 adev->psp_sysfs_en = true;
3910
3911 /*
3912 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3913 * Otherwise the mgpu fan boost feature will be skipped because the
3914 * gpu instance count would be too low.
3915 */
3916 amdgpu_register_gpu_instance(adev);
3917
3918 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3919 * explicit gating rather than handling it automatically.
3920 */
3921 if (!adev->gmc.xgmi.pending_reset) {
3922 r = amdgpu_device_ip_late_init(adev);
3923 if (r) {
3924 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3925 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3926 goto release_ras_con;
3927 }
3928 /* must succeed. */
3929 amdgpu_ras_resume(adev);
3930 queue_delayed_work(system_wq, &adev->delayed_init_work,
3931 msecs_to_jiffies(AMDGPU_RESUME_MS));
3932 }
3933
3934 if (amdgpu_sriov_vf(adev))
3935 flush_delayed_work(&adev->delayed_init_work);
3936
3937 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3938 if (r)
3939 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3940
3941 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3942 r = amdgpu_pmu_init(adev);
3943 if (r)
3944 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3945 }

3946 /* Keep the stored PCI config space at hand for restore on sudden PCI error */
3947 if (amdgpu_device_cache_pci_state(adev->pdev))
3948 pci_restore_state(pdev);
3949
3950 /* if we have more than one VGA card, disable the amdgpu VGA resources */
3951 /* this will fail for cards that aren't VGA class devices, just
3952 * ignore it */
3953 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3954 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3955
3956 px = amdgpu_device_supports_px(ddev);
3957
3958 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3959 apple_gmux_detect(NULL, NULL)))
3960 vga_switcheroo_register_client(adev->pdev,
3961 &amdgpu_switcheroo_ops, px);
3962
3963 if (px)
3964 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3965
3966 if (adev->gmc.xgmi.pending_reset)
3967 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3968 msecs_to_jiffies(AMDGPU_RESUME_MS));
3969
3970 amdgpu_device_check_iommu_direct_map(adev);
3971
3972 return 0;
3973
3974 release_ras_con:
3975 amdgpu_release_ras_context(adev);
3976
3977 failed:
3978 amdgpu_vf_error_trans_all(adev);
3979
3980 return r;
3981 }
3982
3983 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3984 {
3985
3986 /* Clear all CPU mappings pointing to this device */
3987 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3988
3989 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3990 amdgpu_device_doorbell_fini(adev);
3991
3992 iounmap(adev->rmmio);
3993 adev->rmmio = NULL;
3994 if (adev->mman.aper_base_kaddr)
3995 iounmap(adev->mman.aper_base_kaddr);
3996 adev->mman.aper_base_kaddr = NULL;
3997
3998 /* Memory manager related */
3999 if (!adev->gmc.xgmi.connected_to_cpu) {
4000 arch_phys_wc_del(adev->gmc.vram_mtrr);
4001 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4002 }
4003 }
4004
4005 /**
4006 * amdgpu_device_fini_hw - tear down the driver
4007 *
4008 * @adev: amdgpu_device pointer
4009 *
4010 * Tear down the driver info (all asics).
4011 * Called at driver shutdown.
4012 */
4013 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4014 {
4015 dev_info(adev->dev, "amdgpu: finishing device.\n");
4016 flush_delayed_work(&adev->delayed_init_work);
4017 adev->shutdown = true;
4018
4019 /* make sure the IB test finished before entering exclusive mode
4020 * to avoid preemption on the IB test
4021 */
4022 if (amdgpu_sriov_vf(adev)) {
4023 amdgpu_virt_request_full_gpu(adev, false);
4024 amdgpu_virt_fini_data_exchange(adev);
4025 }
4026
4027 /* disable all interrupts */
4028 amdgpu_irq_disable_all(adev);
4029 if (adev->mode_info.mode_config_initialized) {
4030 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4031 drm_helper_force_disable_all(adev_to_drm(adev));
4032 else
4033 drm_atomic_helper_shutdown(adev_to_drm(adev));
4034 }
4035 amdgpu_fence_driver_hw_fini(adev);
4036
4037 if (adev->mman.initialized) {
4038 flush_delayed_work(&adev->mman.bdev.wq);
4039 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
4040 }
4041
4042 if (adev->pm_sysfs_en)
4043 amdgpu_pm_sysfs_fini(adev);
4044 if (adev->ucode_sysfs_en)
4045 amdgpu_ucode_sysfs_fini(adev);
4046 if (adev->psp_sysfs_en)
4047 amdgpu_psp_sysfs_fini(adev);
4048 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4049
4050 /* ras features must be disabled before hw fini */
4051 amdgpu_ras_pre_fini(adev);
4052
4053 amdgpu_device_ip_fini_early(adev);
4054
4055 amdgpu_irq_fini_hw(adev);
4056
4057 if (adev->mman.initialized)
4058 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4059
4060 amdgpu_gart_dummy_page_fini(adev);
4061
4062 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4063 amdgpu_device_unmap_mmio(adev);
4064
4065 }
4066
4067 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4068 {
4069 int idx;
4070 bool px;
4071
4072 amdgpu_fence_driver_sw_fini(adev);
4073 amdgpu_device_ip_fini(adev);
4074 release_firmware(adev->firmware.gpu_info_fw);
4075 adev->firmware.gpu_info_fw = NULL;
4076 adev->accel_working = false;
4077 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4078
4079 amdgpu_reset_fini(adev);
4080
4081 /* free i2c buses */
4082 if (!amdgpu_device_has_dc_support(adev))
4083 amdgpu_i2c_fini(adev);
4084
4085 if (amdgpu_emu_mode != 1)
4086 amdgpu_atombios_fini(adev);
4087
4088 kfree(adev->bios);
4089 adev->bios = NULL;
4090
4091 px = amdgpu_device_supports_px(adev_to_drm(adev));
4092
4093 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4094 apple_gmux_detect(NULL, NULL)))
4095 vga_switcheroo_unregister_client(adev->pdev);
4096
4097 if (px)
4098 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4099
4100 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4101 vga_client_unregister(adev->pdev);
4102
4103 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4104
4105 iounmap(adev->rmmio);
4106 adev->rmmio = NULL;
4107 amdgpu_device_doorbell_fini(adev);
4108 drm_dev_exit(idx);
4109 }
4110
4111 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4112 amdgpu_pmu_fini(adev);
4113 if (adev->mman.discovery_bin)
4114 amdgpu_discovery_fini(adev);
4115
4116 amdgpu_reset_put_reset_domain(adev->reset_domain);
4117 adev->reset_domain = NULL;
4118
4119 kfree(adev->pci_state);
4120
4121 }
4122
4123 /**
4124 * amdgpu_device_evict_resources - evict device resources
4125 * @adev: amdgpu device object
4126 *
4127 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4128 * of the vram memory type. Mainly used for evicting device resources
4129 * at suspend time.
4130 *
4131 */
4132 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4133 {
4134 int ret;
4135
4136 /* No need to evict vram on APUs for suspend to ram or s2idle */
4137 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4138 return 0;
4139
4140 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4141 if (ret)
4142 DRM_WARN("evicting device resources failed\n");
4143 return ret;
4144 }
4145
4146 /*
4147 * Suspend & resume.
4148 */
4149 /**
4150 * amdgpu_device_suspend - initiate device suspend
4151 *
4152 * @dev: drm dev pointer
4153 * @fbcon: notify the fbdev of suspend
4154 *
4155 * Puts the hw in the suspend state (all asics).
4156 * Returns 0 for success or an error on failure.
4157 * Called at driver suspend.
4158 */
4159 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4160 {
4161 struct amdgpu_device *adev = drm_to_adev(dev);
4162 int r = 0;
4163
4164 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4165 return 0;
4166
4167 adev->in_suspend = true;
4168
4169 /* Evict the majority of BOs before grabbing the full access */
4170 r = amdgpu_device_evict_resources(adev);
4171 if (r)
4172 return r;
4173
4174 if (amdgpu_sriov_vf(adev)) {
4175 amdgpu_virt_fini_data_exchange(adev);
4176 r = amdgpu_virt_request_full_gpu(adev, false);
4177 if (r)
4178 return r;
4179 }
4180
4181 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4182 DRM_WARN("smart shift update failed\n");
4183
4184 drm_kms_helper_poll_disable(dev);
4185
4186 if (fbcon)
4187 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4188
4189 cancel_delayed_work_sync(&adev->delayed_init_work);
4190
4191 amdgpu_ras_suspend(adev);
4192
4193 amdgpu_device_ip_suspend_phase1(adev);
4194
4195 if (!adev->in_s0ix)
4196 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4197
4198 r = amdgpu_device_evict_resources(adev);
4199 if (r)
4200 return r;
4201
4202 amdgpu_fence_driver_hw_fini(adev);
4203
4204 amdgpu_device_ip_suspend_phase2(adev);
4205
4206 if (amdgpu_sriov_vf(adev))
4207 amdgpu_virt_release_full_gpu(adev, false);
4208
4209 return 0;
4210 }
4211
4212 /**
4213 * amdgpu_device_resume - initiate device resume
4214 *
4215 * @dev: drm dev pointer
4216 * @fbcon: notify the fbdev of resume
4217 *
4218 * Bring the hw back to operating state (all asics).
4219 * Returns 0 for success or an error on failure.
4220 * Called at driver resume.
4221 */
4222 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4223 {
4224 struct amdgpu_device *adev = drm_to_adev(dev);
4225 int r = 0;
4226
4227 if (amdgpu_sriov_vf(adev)) {
4228 r = amdgpu_virt_request_full_gpu(adev, true);
4229 if (r)
4230 return r;
4231 }
4232
4233 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4234 return 0;
4235
4236 if (adev->in_s0ix)
4237 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4238
4239 /* post card */
4240 if (amdgpu_device_need_post(adev)) {
4241 r = amdgpu_device_asic_init(adev);
4242 if (r)
4243 dev_err(adev->dev, "amdgpu asic init failed\n");
4244 }
4245
4246 r = amdgpu_device_ip_resume(adev);
4247
4248 if (r) {
4249 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4250 goto exit;
4251 }
4252 amdgpu_fence_driver_hw_init(adev);
4253
4254 r = amdgpu_device_ip_late_init(adev);
4255 if (r)
4256 goto exit;
4257
4258 queue_delayed_work(system_wq, &adev->delayed_init_work,
4259 msecs_to_jiffies(AMDGPU_RESUME_MS));
4260
4261 if (!adev->in_s0ix) {
4262 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4263 if (r)
4264 goto exit;
4265 }
4266
4267 exit:
4268 if (amdgpu_sriov_vf(adev)) {
4269 amdgpu_virt_init_data_exchange(adev);
4270 amdgpu_virt_release_full_gpu(adev, true);
4271 }
4272
4273 if (r)
4274 return r;
4275
4276 /* Make sure IB tests flushed */
4277 flush_delayed_work(&adev->delayed_init_work);
4278
4279 if (adev->in_s0ix) {
4280 /* re-enable gfxoff after IP resume; it was disabled for IP resume
4281 * in amdgpu_device_ip_resume_phase2().
4282 */
4283 amdgpu_gfx_off_ctrl(adev, true);
4284 DRM_DEBUG("will enable gfxoff for the mission mode\n");
4285 }
4286 if (fbcon)
4287 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4288
4289 drm_kms_helper_poll_enable(dev);
4290
4291 amdgpu_ras_resume(adev);
4292
4293 if (adev->mode_info.num_crtc) {
4294 /*
4295 * Most of the connector probing functions try to acquire runtime pm
4296 * refs to ensure that the GPU is powered on when connector polling is
4297 * performed. Since we're calling this from a runtime PM callback,
4298 * trying to acquire rpm refs will cause us to deadlock.
4299 *
4300 * Since we're guaranteed to be holding the rpm lock, it's safe to
4301 * temporarily disable the rpm helpers so this doesn't deadlock us.
4302 */
4303 #ifdef CONFIG_PM
4304 dev->dev->power.disable_depth++;
4305 #endif
4306 if (!adev->dc_enabled)
4307 drm_helper_hpd_irq_event(dev);
4308 else
4309 drm_kms_helper_hotplug_event(dev);
4310 #ifdef CONFIG_PM
4311 dev->dev->power.disable_depth--;
4312 #endif
4313 }
4314 adev->in_suspend = false;
4315
4316 if (adev->enable_mes)
4317 amdgpu_mes_self_test(adev);
4318
4319 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4320 DRM_WARN("smart shift update failed\n");
4321
4322 return 0;
4323 }
4324
4325 /**
4326 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4327 *
4328 * @adev: amdgpu_device pointer
4329 *
4330 * The list of all the hardware IPs that make up the asic is walked and
4331 * the check_soft_reset callbacks are run. check_soft_reset determines
4332 * if the asic is still hung or not.
4333 * Returns true if any of the IPs are still in a hung state, false if not.
4334 */
4335 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4336 {
4337 int i;
4338 bool asic_hang = false;
4339
4340 if (amdgpu_sriov_vf(adev))
4341 return true;
4342
4343 if (amdgpu_asic_need_full_reset(adev))
4344 return true;
4345
4346 for (i = 0; i < adev->num_ip_blocks; i++) {
4347 if (!adev->ip_blocks[i].status.valid)
4348 continue;
4349 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4350 adev->ip_blocks[i].status.hang =
4351 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4352 if (adev->ip_blocks[i].status.hang) {
4353 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4354 asic_hang = true;
4355 }
4356 }
4357 return asic_hang;
4358 }
4359
4360 /**
4361 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4362 *
4363 * @adev: amdgpu_device pointer
4364 *
4365 * The list of all the hardware IPs that make up the asic is walked and the
4366 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4367 * handles any IP specific hardware or software state changes that are
4368 * necessary for a soft reset to succeed.
4369 * Returns 0 on success, negative error code on failure.
4370 */
4371 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4372 {
4373 int i, r = 0;
4374
4375 for (i = 0; i < adev->num_ip_blocks; i++) {
4376 if (!adev->ip_blocks[i].status.valid)
4377 continue;
4378 if (adev->ip_blocks[i].status.hang &&
4379 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4380 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4381 if (r)
4382 return r;
4383 }
4384 }
4385
4386 return 0;
4387 }
4388
4389 /**
4390 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4391 *
4392 * @adev: amdgpu_device pointer
4393 *
4394 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4395 * reset is necessary to recover.
4396 * Returns true if a full asic reset is required, false if not.
4397 */
4398 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4399 {
4400 int i;
4401
4402 if (amdgpu_asic_need_full_reset(adev))
4403 return true;
4404
4405 for (i = 0; i < adev->num_ip_blocks; i++) {
4406 if (!adev->ip_blocks[i].status.valid)
4407 continue;
4408 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4409 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4410 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4411 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4412 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4413 if (adev->ip_blocks[i].status.hang) {
4414 dev_info(adev->dev, "Some block needs a full reset!\n");
4415 return true;
4416 }
4417 }
4418 }
4419 return false;
4420 }
4421
4422 /**
4423 * amdgpu_device_ip_soft_reset - do a soft reset
4424 *
4425 * @adev: amdgpu_device pointer
4426 *
4427 * The list of all the hardware IPs that make up the asic is walked and the
4428 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4429 * IP specific hardware or software state changes that are necessary to soft
4430 * reset the IP.
4431 * Returns 0 on success, negative error code on failure.
4432 */
4433 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4434 {
4435 int i, r = 0;
4436
4437 for (i = 0; i < adev->num_ip_blocks; i++) {
4438 if (!adev->ip_blocks[i].status.valid)
4439 continue;
4440 if (adev->ip_blocks[i].status.hang &&
4441 adev->ip_blocks[i].version->funcs->soft_reset) {
4442 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4443 if (r)
4444 return r;
4445 }
4446 }
4447
4448 return 0;
4449 }
4450
4451 /**
4452 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4453 *
4454 * @adev: amdgpu_device pointer
4455 *
4456 * The list of all the hardware IPs that make up the asic is walked and the
4457 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4458 * handles any IP specific hardware or software state changes that are
4459 * necessary after the IP has been soft reset.
4460 * Returns 0 on success, negative error code on failure.
4461 */
4462 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4463 {
4464 int i, r = 0;
4465
4466 for (i = 0; i < adev->num_ip_blocks; i++) {
4467 if (!adev->ip_blocks[i].status.valid)
4468 continue;
4469 if (adev->ip_blocks[i].status.hang &&
4470 adev->ip_blocks[i].version->funcs->post_soft_reset)
4471 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4472 if (r)
4473 return r;
4474 }
4475
4476 return 0;
4477 }
4478
4479 /**
4480 * amdgpu_device_recover_vram - Recover some VRAM contents
4481 *
4482 * @adev: amdgpu_device pointer
4483 *
4484 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4485 * restore things like GPUVM page tables after a GPU reset where
4486 * the contents of VRAM might be lost.
4487 *
4488 * Returns:
4489 * 0 on success, negative error code on failure.
4490 */
4491 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4492 {
4493 struct dma_fence *fence = NULL, *next = NULL;
4494 struct amdgpu_bo *shadow;
4495 struct amdgpu_bo_vm *vmbo;
4496 long r = 1, tmo;
4497
4498 if (amdgpu_sriov_runtime(adev))
4499 tmo = msecs_to_jiffies(8000);
4500 else
4501 tmo = msecs_to_jiffies(100);
4502
4503 dev_info(adev->dev, "recover vram bo from shadow start\n");
4504 mutex_lock(&adev->shadow_list_lock);
4505 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4506 /* If vm is compute context or adev is APU, shadow will be NULL */
4507 if (!vmbo->shadow)
4508 continue;
4509 shadow = vmbo->shadow;
4510
4511 /* No need to recover an evicted BO */
4512 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4513 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4514 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4515 continue;
4516
4517 r = amdgpu_bo_restore_shadow(shadow, &next);
4518 if (r)
4519 break;
4520
4521 if (fence) {
4522 tmo = dma_fence_wait_timeout(fence, false, tmo);
4523 dma_fence_put(fence);
4524 fence = next;
4525 if (tmo == 0) {
4526 r = -ETIMEDOUT;
4527 break;
4528 } else if (tmo < 0) {
4529 r = tmo;
4530 break;
4531 }
4532 } else {
4533 fence = next;
4534 }
4535 }
4536 mutex_unlock(&adev->shadow_list_lock);
4537
4538 if (fence)
4539 tmo = dma_fence_wait_timeout(fence, false, tmo);
4540 dma_fence_put(fence);
4541
4542 if (r < 0 || tmo <= 0) {
4543 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4544 return -EIO;
4545 }
4546
4547 dev_info(adev->dev, "recover vram bo from shadow done\n");
4548 return 0;
4549 }
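
/*
 * A sketch of the fence pipelining used above, with illustrative helper
 * names: each iteration issues the next shadow-to-VRAM copy before
 * waiting on the previous one, so the wait for copy N overlaps with the
 * submission of copy N+1 and the remaining timeout budget is carried
 * from one wait to the next.
 *
 *	struct dma_fence *prev = NULL, *next = NULL;
 *
 *	for_each_shadow_bo(shadow) {		// hypothetical iterator
 *		issue_restore(shadow, &next);	// e.g. amdgpu_bo_restore_shadow()
 *		if (prev) {
 *			tmo = dma_fence_wait_timeout(prev, false, tmo);
 *			dma_fence_put(prev);
 *		}
 *		prev = next;
 *	}
 *	if (prev)
 *		tmo = dma_fence_wait_timeout(prev, false, tmo);
 *	dma_fence_put(prev);
 */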
4550
4551
4552 /**
4553 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4554 *
4555 * @adev: amdgpu_device pointer
4556 * @from_hypervisor: request from hypervisor
4557 *
4558 * Do a VF FLR and reinitialize the ASIC.
4559 * Returns 0 on success, negative error code on failure.
4560 */
4561 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4562 bool from_hypervisor)
4563 {
4564 int r;
4565 struct amdgpu_hive_info *hive = NULL;
4566 int retry_limit = 0;
4567
4568 retry:
4569 amdgpu_amdkfd_pre_reset(adev);
4570
4571 if (from_hypervisor)
4572 r = amdgpu_virt_request_full_gpu(adev, true);
4573 else
4574 r = amdgpu_virt_reset_gpu(adev);
4575 if (r)
4576 return r;
4577
4578 /* Resume IP prior to SMC */
4579 r = amdgpu_device_ip_reinit_early_sriov(adev);
4580 if (r)
4581 goto error;
4582
4583 amdgpu_virt_init_data_exchange(adev);
4584
4585 r = amdgpu_device_fw_loading(adev);
4586 if (r)
4587 return r;
4588
4589 /* now we are okay to resume SMC/CP/SDMA */
4590 r = amdgpu_device_ip_reinit_late_sriov(adev);
4591 if (r)
4592 goto error;
4593
4594 hive = amdgpu_get_xgmi_hive(adev);
4595 /* Update PSP FW topology after reset */
4596 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4597 r = amdgpu_xgmi_update_topology(hive, adev);
4598
4599 if (hive)
4600 amdgpu_put_xgmi_hive(hive);
4601
4602 if (!r) {
4603 amdgpu_irq_gpu_reset_resume_helper(adev);
4604 r = amdgpu_ib_ring_tests(adev);
4605
4606 amdgpu_amdkfd_post_reset(adev);
4607 }
4608
4609 error:
4610 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4611 amdgpu_inc_vram_lost(adev);
4612 r = amdgpu_device_recover_vram(adev);
4613 }
4614 amdgpu_virt_release_full_gpu(adev, true);
4615
4616 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4617 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4618 retry_limit++;
4619 goto retry;
4620 } else
4621 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4622 }
4623
4624 return r;
4625 }
4626
4627 /**
4628 * amdgpu_device_has_job_running - check if there is any job in the pending list
4629 *
4630 * @adev: amdgpu_device pointer
4631 *
4632 * check if there is any job in the scheduler pending list
4633 */
4634 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4635 {
4636 int i;
4637 struct drm_sched_job *job;
4638
4639 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4640 struct amdgpu_ring *ring = adev->rings[i];
4641
4642 if (!ring || !ring->sched.thread)
4643 continue;
4644
4645 spin_lock(&ring->sched.job_list_lock);
4646 job = list_first_entry_or_null(&ring->sched.pending_list,
4647 struct drm_sched_job, list);
4648 spin_unlock(&ring->sched.job_list_lock);
4649 if (job)
4650 return true;
4651 }
4652 return false;
4653 }
4654
4655 /**
4656 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4657 *
4658 * @adev: amdgpu_device pointer
4659 *
4660 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4661 * a hung GPU.
4662 */
4663 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4664 {
4665
4666 if (amdgpu_gpu_recovery == 0)
4667 goto disabled;
4668
4669 /* Skip soft reset check in fatal error mode */
4670 if (!amdgpu_ras_is_poison_mode_supported(adev))
4671 return true;
4672
4673 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4674 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4675 return false;
4676 }
4677
4678 if (amdgpu_sriov_vf(adev))
4679 return true;
4680
4681 if (amdgpu_gpu_recovery == -1) {
4682 switch (adev->asic_type) {
4683 #ifdef CONFIG_DRM_AMDGPU_SI
4684 case CHIP_VERDE:
4685 case CHIP_TAHITI:
4686 case CHIP_PITCAIRN:
4687 case CHIP_OLAND:
4688 case CHIP_HAINAN:
4689 #endif
4690 #ifdef CONFIG_DRM_AMDGPU_CIK
4691 case CHIP_KAVERI:
4692 case CHIP_KABINI:
4693 case CHIP_MULLINS:
4694 #endif
4695 case CHIP_CARRIZO:
4696 case CHIP_STONEY:
4697 case CHIP_CYAN_SKILLFISH:
4698 goto disabled;
4699 default:
4700 break;
4701 }
4702 }
4703
4704 return true;
4705
4706 disabled:
4707 dev_info(adev->dev, "GPU recovery disabled.\n");
4708 return false;
4709 }
4710
4711 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4712 {
4713 u32 i;
4714 int ret = 0;
4715
4716 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4717
4718 dev_info(adev->dev, "GPU mode1 reset\n");
4719
4720 /* disable BM */
4721 pci_clear_master(adev->pdev);
4722
4723 amdgpu_device_cache_pci_state(adev->pdev);
4724
4725 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4726 dev_info(adev->dev, "GPU smu mode1 reset\n");
4727 ret = amdgpu_dpm_mode1_reset(adev);
4728 } else {
4729 dev_info(adev->dev, "GPU psp mode1 reset\n");
4730 ret = psp_gpu_reset(adev);
4731 }
4732
4733 if (ret)
4734 dev_err(adev->dev, "GPU mode1 reset failed\n");
4735
4736 amdgpu_device_load_pci_state(adev->pdev);
4737
4738 /* wait for asic to come out of reset */
4739 for (i = 0; i < adev->usec_timeout; i++) {
4740 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4741
4742 if (memsize != 0xffffffff)
4743 break;
4744 udelay(1);
4745 }
4746
4747 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4748 return ret;
4749 }
4750
4751 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4752 struct amdgpu_reset_context *reset_context)
4753 {
4754 int i, r = 0;
4755 struct amdgpu_job *job = NULL;
4756 bool need_full_reset =
4757 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4758
4759 if (reset_context->reset_req_dev == adev)
4760 job = reset_context->job;
4761
4762 if (amdgpu_sriov_vf(adev)) {
4763 /* stop the data exchange thread */
4764 amdgpu_virt_fini_data_exchange(adev);
4765 }
4766
4767 amdgpu_fence_driver_isr_toggle(adev, true);
4768
4769 /* block all schedulers and reset given job's ring */
4770 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4771 struct amdgpu_ring *ring = adev->rings[i];
4772
4773 if (!ring || !ring->sched.thread)
4774 continue;
4775
4776 /* clear job fences from the fence drv to avoid force_completion
4777 * leaving NULL and vm flush fences in the fence drv */
4778 amdgpu_fence_driver_clear_job_fences(ring);
4779
4780 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4781 amdgpu_fence_driver_force_completion(ring);
4782 }
4783
4784 amdgpu_fence_driver_isr_toggle(adev, false);
4785
4786 if (job && job->vm)
4787 drm_sched_increase_karma(&job->base);
4788
4789 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4790 /* If reset handler not implemented, continue; otherwise return */
4791 if (r == -ENOSYS)
4792 r = 0;
4793 else
4794 return r;
4795
4796 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4797 if (!amdgpu_sriov_vf(adev)) {
4798
4799 if (!need_full_reset)
4800 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4801
4802 if (!need_full_reset && amdgpu_gpu_recovery) {
4803 amdgpu_device_ip_pre_soft_reset(adev);
4804 r = amdgpu_device_ip_soft_reset(adev);
4805 amdgpu_device_ip_post_soft_reset(adev);
4806 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4807 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4808 need_full_reset = true;
4809 }
4810 }
4811
4812 if (need_full_reset)
4813 r = amdgpu_device_ip_suspend(adev);
4814 if (need_full_reset)
4815 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4816 else
4817 clear_bit(AMDGPU_NEED_FULL_RESET,
4818 &reset_context->flags);
4819 }
4820
4821 return r;
4822 }
4823
4824 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4825 {
4826 int i;
4827
4828 lockdep_assert_held(&adev->reset_domain->sem);
4829
4830 for (i = 0; i < adev->num_regs; i++) {
4831 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4832 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4833 adev->reset_dump_reg_value[i]);
4834 }
4835
4836 return 0;
4837 }
4838
4839 #ifdef CONFIG_DEV_COREDUMP
4840 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4841 size_t count, void *data, size_t datalen)
4842 {
4843 struct drm_printer p;
4844 struct amdgpu_device *adev = data;
4845 struct drm_print_iterator iter;
4846 int i;
4847
4848 iter.data = buffer;
4849 iter.offset = 0;
4850 iter.start = offset;
4851 iter.remain = count;
4852
4853 p = drm_coredump_printer(&iter);
4854
4855 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4856 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4857 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4858 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4859 if (adev->reset_task_info.pid)
4860 drm_printf(&p, "process_name: %s PID: %d\n",
4861 adev->reset_task_info.process_name,
4862 adev->reset_task_info.pid);
4863
4864 if (adev->reset_vram_lost)
4865 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4866 if (adev->num_regs) {
4867 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4868
4869 for (i = 0; i < adev->num_regs; i++)
4870 drm_printf(&p, "0x%08x: 0x%08x\n",
4871 adev->reset_dump_reg_list[i],
4872 adev->reset_dump_reg_value[i]);
4873 }
4874
4875 return count - iter.remain;
4876 }
4877
4878 static void amdgpu_devcoredump_free(void *data)
4879 {
4880 }
4881
4882 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4883 {
4884 struct drm_device *dev = adev_to_drm(adev);
4885
4886 ktime_get_ts64(&adev->reset_time);
4887 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4888 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4889 }
4890 #endif
4891
4892 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4893 struct amdgpu_reset_context *reset_context)
4894 {
4895 struct amdgpu_device *tmp_adev = NULL;
4896 bool need_full_reset, skip_hw_reset, vram_lost = false;
4897 int r = 0;
4898 bool gpu_reset_for_dev_remove = 0;
4899
4900 /* Try reset handler method first */
4901 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4902 reset_list);
4903 amdgpu_reset_reg_dumps(tmp_adev);
4904
4905 reset_context->reset_device_list = device_list_handle;
4906 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4907 /* If reset handler not implemented, continue; otherwise return */
4908 if (r == -ENOSYS)
4909 r = 0;
4910 else
4911 return r;
4912
4913 /* Reset handler not implemented, use the default method */
4914 need_full_reset =
4915 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4916 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4917
4918 gpu_reset_for_dev_remove =
4919 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4920 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4921
4922 /*
4923 * ASIC reset has to be done on all XGMI hive nodes ASAP
4924 * to allow proper link negotiation in FW (within 1 sec)
4925 */
4926 if (!skip_hw_reset && need_full_reset) {
4927 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4928 /* For XGMI run all resets in parallel to speed up the process */
4929 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4930 tmp_adev->gmc.xgmi.pending_reset = false;
4931 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4932 r = -EALREADY;
4933 } else
4934 r = amdgpu_asic_reset(tmp_adev);
4935
4936 if (r) {
4937 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4938 r, adev_to_drm(tmp_adev)->unique);
4939 break;
4940 }
4941 }
4942
4943 /* For XGMI wait for all resets to complete before proceeding */
4944 if (!r) {
4945 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4946 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4947 flush_work(&tmp_adev->xgmi_reset_work);
4948 r = tmp_adev->asic_reset_res;
4949 if (r)
4950 break;
4951 }
4952 }
4953 }
4954 }
4955
4956 if (!r && amdgpu_ras_intr_triggered()) {
4957 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4958 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4959 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4960 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4961 }
4962
4963 amdgpu_ras_intr_cleared();
4964 }
4965
4966 /* Since the mode1 reset affects base ip blocks, the
4967 * phase1 ip blocks need to be resumed. Otherwise there
4968 * will be a BIOS signature error and the psp bootloader
4969 * can't load kdb on the next amdgpu install.
4970 */
4971 if (gpu_reset_for_dev_remove) {
4972 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4973 amdgpu_device_ip_resume_phase1(tmp_adev);
4974
4975 goto end;
4976 }
4977
4978 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4979 if (need_full_reset) {
4980 /* post card */
4981 r = amdgpu_device_asic_init(tmp_adev);
4982 if (r) {
4983 dev_warn(tmp_adev->dev, "asic atom init failed!");
4984 } else {
4985 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4986 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4987 if (r)
4988 goto out;
4989
4990 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4991 if (r)
4992 goto out;
4993
4994 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4995 #ifdef CONFIG_DEV_COREDUMP
4996 tmp_adev->reset_vram_lost = vram_lost;
4997 memset(&tmp_adev->reset_task_info, 0,
4998 sizeof(tmp_adev->reset_task_info));
4999 if (reset_context->job && reset_context->job->vm)
5000 tmp_adev->reset_task_info =
5001 reset_context->job->vm->task_info;
5002 amdgpu_reset_capture_coredumpm(tmp_adev);
5003 #endif
5004 if (vram_lost) {
5005 DRM_INFO("VRAM is lost due to GPU reset!\n");
5006 amdgpu_inc_vram_lost(tmp_adev);
5007 }
5008
5009 r = amdgpu_device_fw_loading(tmp_adev);
5010 if (r)
5011 return r;
5012
5013 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5014 if (r)
5015 goto out;
5016
5017 if (vram_lost)
5018 amdgpu_device_fill_reset_magic(tmp_adev);
5019
5020 /*
5021 * Add this ASIC back as tracked since the reset already
5022 * completed successfully.
5023 */
5024 amdgpu_register_gpu_instance(tmp_adev);
5025
5026 if (!reset_context->hive &&
5027 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5028 amdgpu_xgmi_add_device(tmp_adev);
5029
5030 r = amdgpu_device_ip_late_init(tmp_adev);
5031 if (r)
5032 goto out;
5033
5034 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5035
5036 /*
5037 * The GPU enters a bad state once the number of faulty pages
5038 * retired by ECC has reached the threshold, and ras
5039 * recovery is scheduled next. So add a check
5040 * here to break recovery if the bad page threshold is
5041 * indeed exceeded, and remind the user to
5042 * retire this GPU or set a bigger
5043 * bad_page_threshold value to fix this when
5044 * probing the driver again.
5045 */
5046 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5047 /* must succeed. */
5048 amdgpu_ras_resume(tmp_adev);
5049 } else {
5050 r = -EINVAL;
5051 goto out;
5052 }
5053
5054 /* Update PSP FW topology after reset */
5055 if (reset_context->hive &&
5056 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5057 r = amdgpu_xgmi_update_topology(
5058 reset_context->hive, tmp_adev);
5059 }
5060 }
5061
5062 out:
5063 if (!r) {
5064 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5065 r = amdgpu_ib_ring_tests(tmp_adev);
5066 if (r) {
5067 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5068 need_full_reset = true;
5069 r = -EAGAIN;
5070 goto end;
5071 }
5072 }
5073
5074 if (!r)
5075 r = amdgpu_device_recover_vram(tmp_adev);
5076 else
5077 tmp_adev->asic_reset_res = r;
5078 }
5079
5080 end:
5081 if (need_full_reset)
5082 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5083 else
5084 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5085 return r;
5086 }
5087
5088 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5089 {
5090
5091 switch (amdgpu_asic_reset_method(adev)) {
5092 case AMD_RESET_METHOD_MODE1:
5093 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5094 break;
5095 case AMD_RESET_METHOD_MODE2:
5096 adev->mp1_state = PP_MP1_STATE_RESET;
5097 break;
5098 default:
5099 adev->mp1_state = PP_MP1_STATE_NONE;
5100 break;
5101 }
5102 }
5103
5104 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5105 {
5106 amdgpu_vf_error_trans_all(adev);
5107 adev->mp1_state = PP_MP1_STATE_NONE;
5108 }
5109
5110 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5111 {
5112 struct pci_dev *p = NULL;
5113
5114 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5115 adev->pdev->bus->number, 1);
5116 if (p) {
5117 pm_runtime_enable(&(p->dev));
5118 pm_runtime_resume(&(p->dev));
5119 }
5120
5121 pci_dev_put(p);
5122 }
5123
5124 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5125 {
5126 enum amd_reset_method reset_method;
5127 struct pci_dev *p = NULL;
5128 u64 expires;
5129
5130 /*
5131 * For now, only BACO and mode1 reset are confirmed
5132 * to suffer from the audio issue if not properly suspended.
5133 */
5134 reset_method = amdgpu_asic_reset_method(adev);
5135 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5136 (reset_method != AMD_RESET_METHOD_MODE1))
5137 return -EINVAL;
5138
5139 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5140 adev->pdev->bus->number, 1);
5141 if (!p)
5142 return -ENODEV;
5143
5144 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5145 if (!expires)
5146 /*
5147 * If we cannot get the audio device autosuspend delay,
5148 * a fixed 4s interval will be used. Since 3s is the
5149 * audio controller's default autosuspend delay setting,
5150 * the 4s used here is guaranteed to cover that.
5151 */
5152 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5153
5154 while (!pm_runtime_status_suspended(&(p->dev))) {
5155 if (!pm_runtime_suspend(&(p->dev)))
5156 break;
5157
5158 if (expires < ktime_get_mono_fast_ns()) {
5159 dev_warn(adev->dev, "failed to suspend display audio\n");
5160 pci_dev_put(p);
5161 /* TODO: abort the succeeding gpu reset? */
5162 return -ETIMEDOUT;
5163 }
5164 }
5165
5166 pm_runtime_disable(&(p->dev));
5167
5168 pci_dev_put(p);
5169 return 0;
5170 }
5171
5172 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5173 {
5174 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5175
5176 #if defined(CONFIG_DEBUG_FS)
5177 if (!amdgpu_sriov_vf(adev))
5178 cancel_work(&adev->reset_work);
5179 #endif
5180
5181 if (adev->kfd.dev)
5182 cancel_work(&adev->kfd.reset_work);
5183
5184 if (amdgpu_sriov_vf(adev))
5185 cancel_work(&adev->virt.flr_work);
5186
5187 if (con && adev->ras_enabled)
5188 cancel_work(&con->recovery_work);
5189
5190 }
5191
5192 /**
5193 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5194 *
5195 * @adev: amdgpu_device pointer
5196 * @job: which job triggered the hang
5197 *
5198 * Attempt to reset the GPU if it has hung (all asics).
5199 * Attempt a soft reset or full reset and reinitialize the ASIC.
5200 * Returns 0 for success or an error on failure.
5201 */
5202
5203 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5204 struct amdgpu_job *job,
5205 struct amdgpu_reset_context *reset_context)
5206 {
5207 struct list_head device_list, *device_list_handle = NULL;
5208 bool job_signaled = false;
5209 struct amdgpu_hive_info *hive = NULL;
5210 struct amdgpu_device *tmp_adev = NULL;
5211 int i, r = 0;
5212 bool need_emergency_restart = false;
5213 bool audio_suspended = false;
5214 bool gpu_reset_for_dev_remove = false;
5215
5216 gpu_reset_for_dev_remove =
5217 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5218 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5219
5220 /*
5221 * Special case: RAS triggered and full reset isn't supported
5222 */
5223 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5224
5225 /*
5226 * Flush RAM to disk so that after reboot
5227 * the user can read the log and see why the system rebooted.
5228 */
5229 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5230 DRM_WARN("Emergency reboot.");
5231
5232 ksys_sync_helper();
5233 emergency_restart();
5234 }
5235
5236 dev_info(adev->dev, "GPU %s begin!\n",
5237 need_emergency_restart ? "jobs stop":"reset");
5238
5239 if (!amdgpu_sriov_vf(adev))
5240 hive = amdgpu_get_xgmi_hive(adev);
5241 if (hive)
5242 mutex_lock(&hive->hive_lock);
5243
5244 reset_context->job = job;
5245 reset_context->hive = hive;
5246 /*
5247 * Build list of devices to reset.
5248 * In case we are in XGMI hive mode, re-sort the device list
5249 * to put adev in the 1st position.
5250 */
5251 INIT_LIST_HEAD(&device_list);
5252 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5253 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5254 list_add_tail(&tmp_adev->reset_list, &device_list);
5255 if (gpu_reset_for_dev_remove && adev->shutdown)
5256 tmp_adev->shutdown = true;
5257 }
5258 if (!list_is_first(&adev->reset_list, &device_list))
5259 list_rotate_to_front(&adev->reset_list, &device_list);
5260 device_list_handle = &device_list;
5261 } else {
5262 list_add_tail(&adev->reset_list, &device_list);
5263 device_list_handle = &device_list;
5264 }
5265
5266 /* We need to lock the reset domain only once, for both XGMI and single device */
5267 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5268 reset_list);
5269 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5270
5271 /* block all schedulers and reset given job's ring */
5272 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5273
5274 amdgpu_device_set_mp1_state(tmp_adev);
5275
5276 /*
5277 * Try to put the audio codec into suspend state
5278 * before the gpu reset starts.
5279 *
5280 * The power domain of the graphics device is shared
5281 * with the AZ power domain. Without this, we may
5282 * change the audio hardware from behind the audio
5283 * driver's back, which will trigger some audio
5284 * codec errors.
5285 */
5286 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5287 audio_suspended = true;
5288
5289 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5290
5291 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5292
5293 if (!amdgpu_sriov_vf(tmp_adev))
5294 amdgpu_amdkfd_pre_reset(tmp_adev);
5295
5296 /*
5297 * Mark these ASICs to be reset as untracked first
5298 * and add them back after the reset completes
5299 */
5300 amdgpu_unregister_gpu_instance(tmp_adev);
5301
5302 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5303
5304 /* disable ras on ALL IPs */
5305 if (!need_emergency_restart &&
5306 amdgpu_device_ip_need_full_reset(tmp_adev))
5307 amdgpu_ras_suspend(tmp_adev);
5308
5309 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5310 struct amdgpu_ring *ring = tmp_adev->rings[i];
5311
5312 if (!ring || !ring->sched.thread)
5313 continue;
5314
5315 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5316
5317 if (need_emergency_restart)
5318 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5319 }
5320 atomic_inc(&tmp_adev->gpu_reset_counter);
5321 }
5322
5323 if (need_emergency_restart)
5324 goto skip_sched_resume;
5325
5326 /*
5327 * Must check guilty signal here since after this point all old
5328 * HW fences are force signaled.
5329 *
5330 * job->base holds a reference to parent fence
5331 */
5332 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5333 job_signaled = true;
5334 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5335 goto skip_hw_reset;
5336 }
5337
5338 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5339 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5340 if (gpu_reset_for_dev_remove) {
5341 /* Workaround for ASICs that need to disable SMC first */
5342 amdgpu_device_smu_fini_early(tmp_adev);
5343 }
5344 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5345 /* TODO: Should we stop? */
5346 if (r) {
5347 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5348 r, adev_to_drm(tmp_adev)->unique);
5349 tmp_adev->asic_reset_res = r;
5350 }
5351
5352 /*
5353 * Drop all pending non-scheduler resets. Scheduler resets
5354 * were already dropped during drm_sched_stop.
5355 */
5356 amdgpu_device_stop_pending_resets(tmp_adev);
5357 }
5358
5359 /* Actual ASIC resets if needed.*/
5360 /* Host driver will handle XGMI hive reset for SRIOV */
5361 if (amdgpu_sriov_vf(adev)) {
5362 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5363 if (r)
5364 adev->asic_reset_res = r;
5365
5366 /* Aldebaran supports ras in SRIOV, so we need to resume ras during reset */
5367 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5368 amdgpu_ras_resume(adev);
5369 } else {
5370 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5371 if (r && r == -EAGAIN)
5372 goto retry;
5373
5374 if (!r && gpu_reset_for_dev_remove)
5375 goto recover_end;
5376 }
5377
5378 skip_hw_reset:
5379
5380 /* Post ASIC reset for all devs. */
5381 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5382
5383 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5384 struct amdgpu_ring *ring = tmp_adev->rings[i];
5385
5386 if (!ring || !ring->sched.thread)
5387 continue;
5388
5389 drm_sched_start(&ring->sched, true);
5390 }
5391
5392 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5393 amdgpu_mes_self_test(tmp_adev);
5394
5395 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5396 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5397 }
5398
5399 if (tmp_adev->asic_reset_res)
5400 r = tmp_adev->asic_reset_res;
5401
5402 tmp_adev->asic_reset_res = 0;
5403
5404 if (r) {
5405 /* bad news, how to tell it to userspace? */
5406 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5407 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5408 } else {
5409 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5410 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5411 DRM_WARN("smart shift update failed\n");
5412 }
5413 }
5414
5415 skip_sched_resume:
5416 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5417 /* unlock kfd: SRIOV would do it separately */
5418 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5419 amdgpu_amdkfd_post_reset(tmp_adev);
5420
5421 /* kfd_post_reset will do nothing if kfd device is not initialized,
5422 * need to bring up kfd here if it wasn't initialized before
5423 */
5424 if (!adev->kfd.init_complete)
5425 amdgpu_amdkfd_device_init(adev);
5426
5427 if (audio_suspended)
5428 amdgpu_device_resume_display_audio(tmp_adev);
5429
5430 amdgpu_device_unset_mp1_state(tmp_adev);
5431
5432 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5433 }
5434
5435 recover_end:
5436 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5437 reset_list);
5438 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5439
5440 if (hive) {
5441 mutex_unlock(&hive->hive_lock);
5442 amdgpu_put_xgmi_hive(hive);
5443 }
5444
5445 if (r)
5446 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5447
5448 atomic_set(&adev->reset_domain->reset_res, r);
5449 return r;
5450 }
5451
5452 /**
5453 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5454 *
5455 * @adev: amdgpu_device pointer
5456 *
5457 * Fetches and stores in the driver the PCIE capabilities (gen speed
5458 * and lanes) of the slot the device is in. Handles APUs and
5459 * virtualized environments where PCIE config space may not be available.
5460 */
5461 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5462 {
5463 struct pci_dev *pdev;
5464 enum pci_bus_speed speed_cap, platform_speed_cap;
5465 enum pcie_link_width platform_link_width;
5466
5467 if (amdgpu_pcie_gen_cap)
5468 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5469
5470 if (amdgpu_pcie_lane_cap)
5471 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5472
5473 /* covers APUs as well */
5474 if (pci_is_root_bus(adev->pdev->bus)) {
5475 if (adev->pm.pcie_gen_mask == 0)
5476 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5477 if (adev->pm.pcie_mlw_mask == 0)
5478 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5479 return;
5480 }
5481
5482 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5483 return;
5484
5485 pcie_bandwidth_available(adev->pdev, NULL,
5486 &platform_speed_cap, &platform_link_width);
5487
5488 if (adev->pm.pcie_gen_mask == 0) {
5489 /* asic caps */
5490 pdev = adev->pdev;
5491 speed_cap = pcie_get_speed_cap(pdev);
5492 if (speed_cap == PCI_SPEED_UNKNOWN) {
5493 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5494 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5495 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5496 } else {
5497 if (speed_cap == PCIE_SPEED_32_0GT)
5498 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5499 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5500 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5501 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5502 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5503 else if (speed_cap == PCIE_SPEED_16_0GT)
5504 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5505 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5506 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5507 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5508 else if (speed_cap == PCIE_SPEED_8_0GT)
5509 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5510 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5511 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5512 else if (speed_cap == PCIE_SPEED_5_0GT)
5513 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5514 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5515 else
5516 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5517 }
5518 /* platform caps */
5519 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5520 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5521 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5522 } else {
5523 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5524 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5525 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5526 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5527 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5528 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5529 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5530 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5531 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5532 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5533 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5534 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5535 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5536 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5537 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5538 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5539 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5540 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5541 else
5542 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5543
5544 }
5545 }
5546 if (adev->pm.pcie_mlw_mask == 0) {
5547 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5548 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5549 } else {
5550 switch (platform_link_width) {
5551 case PCIE_LNK_X32:
5552 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5558 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5559 break;
5560 case PCIE_LNK_X16:
5561 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5565 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5567 break;
5568 case PCIE_LNK_X12:
5569 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5570 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5573 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5574 break;
5575 case PCIE_LNK_X8:
5576 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5577 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5578 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5579 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5580 break;
5581 case PCIE_LNK_X4:
5582 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5583 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5584 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5585 break;
5586 case PCIE_LNK_X2:
5587 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5588 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5589 break;
5590 case PCIE_LNK_X1:
5591 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5592 break;
5593 default:
5594 break;
5595 }
5596 }
5597 }
5598 }
5599
5600 /**
5601 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5602 *
5603 * @adev: amdgpu_device pointer
5604 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5605 *
5606 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5607 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5608 * @peer_adev.
5609 */
5610 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5611 struct amdgpu_device *peer_adev)
5612 {
5613 #ifdef CONFIG_HSA_AMD_P2P
5614 uint64_t address_mask = peer_adev->dev->dma_mask ?
5615 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5616 resource_size_t aper_limit =
5617 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5618 bool p2p_access =
5619 !adev->gmc.xgmi.connected_to_cpu &&
5620 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5621
5622 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5623 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5624 !(adev->gmc.aper_base & address_mask ||
5625 aper_limit & address_mask));
5626 #else
5627 return false;
5628 #endif
5629 }
5630
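/**
 * amdgpu_device_baco_enter - enter the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Disables the RAS doorbell interrupt when RAS is enabled and asks the DPM
 * code to put the device into BACO.
 *
 * Return: 0 on success, -ENOTSUPP if the device doesn't support BACO, or
 * the error code returned by amdgpu_dpm_baco_enter().
 */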
5631 int amdgpu_device_baco_enter(struct drm_device *dev)
5632 {
5633 struct amdgpu_device *adev = drm_to_adev(dev);
5634 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5635
5636 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5637 return -ENOTSUPP;
5638
5639 if (ras && adev->ras_enabled &&
5640 adev->nbio.funcs->enable_doorbell_interrupt)
5641 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5642
5643 return amdgpu_dpm_baco_enter(adev);
5644 }
5645
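/**
 * amdgpu_device_baco_exit - exit the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Asks the DPM code to bring the device out of BACO, re-enables the RAS
 * doorbell interrupt when RAS is enabled and, for passthrough devices,
 * clears any pending doorbell interrupt.
 *
 * Return: 0 on success, -ENOTSUPP if the device doesn't support BACO, or
 * the error code returned by amdgpu_dpm_baco_exit().
 */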
5646 int amdgpu_device_baco_exit(struct drm_device *dev)
5647 {
5648 struct amdgpu_device *adev = drm_to_adev(dev);
5649 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5650 int ret = 0;
5651
5652 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5653 return -ENOTSUPP;
5654
5655 ret = amdgpu_dpm_baco_exit(adev);
5656 if (ret)
5657 return ret;
5658
5659 if (ras && adev->ras_enabled &&
5660 adev->nbio.funcs->enable_doorbell_interrupt)
5661 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5662
5663 if (amdgpu_passthrough(adev) &&
5664 adev->nbio.funcs->clear_doorbell_interrupt)
5665 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5666
5667 return 0;
5668 }
5669
5670 /**
5671 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5672 * @pdev: PCI device struct
5673 * @state: PCI channel state
5674 *
5675 * Description: Called when a PCI error is detected.
5676 *
5677 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5678 */
5679 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5680 {
5681 struct drm_device *dev = pci_get_drvdata(pdev);
5682 struct amdgpu_device *adev = drm_to_adev(dev);
5683 int i;
5684
5685 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5686
5687 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5688 DRM_WARN("No support for XGMI hive yet...");
5689 return PCI_ERS_RESULT_DISCONNECT;
5690 }
5691
5692 adev->pci_channel_state = state;
5693
5694 switch (state) {
5695 case pci_channel_io_normal:
5696 return PCI_ERS_RESULT_CAN_RECOVER;
5697 /* Fatal error, prepare for slot reset */
5698 case pci_channel_io_frozen:
5699 /*
5700 * Locking adev->reset_domain->sem will prevent any external access
5701 * to GPU during PCI error recovery
5702 */
5703 amdgpu_device_lock_reset_domain(adev->reset_domain);
5704 amdgpu_device_set_mp1_state(adev);
5705
5706 /*
5707 * Block any work scheduling as we do for regular GPU reset
5708 * for the duration of the recovery
5709 */
5710 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5711 struct amdgpu_ring *ring = adev->rings[i];
5712
5713 if (!ring || !ring->sched.thread)
5714 continue;
5715
5716 drm_sched_stop(&ring->sched, NULL);
5717 }
5718 atomic_inc(&adev->gpu_reset_counter);
5719 return PCI_ERS_RESULT_NEED_RESET;
5720 case pci_channel_io_perm_failure:
5721 /* Permanent error, prepare for device removal */
5722 return PCI_ERS_RESULT_DISCONNECT;
5723 }
5724
5725 return PCI_ERS_RESULT_NEED_RESET;
5726 }
5727
5728 /**
5729 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5730 * @pdev: pointer to PCI device
5731 */
5732 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5733 {
5735 DRM_INFO("PCI error: mmio enabled callback!!\n");
5736
5737 /* TODO - dump whatever for debugging purposes */
5738
5739 /* This is called only if amdgpu_pci_error_detected returns
5740 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5741 * works, no need to reset slot.
5742 */
5743
5744 return PCI_ERS_RESULT_RECOVERED;
5745 }
5746
5747 /**
5748 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5749 * @pdev: PCI device struct
5750 *
5751 * Description: This routine is called by the PCI error recovery
5752 * code after the PCI slot has been reset, just before we
5753 * should resume normal operations.
5754 */
5755 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5756 {
5757 struct drm_device *dev = pci_get_drvdata(pdev);
5758 struct amdgpu_device *adev = drm_to_adev(dev);
5759 int r, i;
5760 struct amdgpu_reset_context reset_context;
5761 u32 memsize;
5762 struct list_head device_list;
5763
5764 DRM_INFO("PCI error: slot reset callback!!\n");
5765
5766 memset(&reset_context, 0, sizeof(reset_context));
5767
5768 INIT_LIST_HEAD(&device_list);
5769 list_add_tail(&adev->reset_list, &device_list);
5770
5771 /* wait for asic to come out of reset */
5772 msleep(500);
5773
5774 /* Restore PCI confspace */
5775 amdgpu_device_load_pci_state(pdev);
5776
5777 /* confirm ASIC came out of reset */
5778 for (i = 0; i < adev->usec_timeout; i++) {
5779 memsize = amdgpu_asic_get_config_memsize(adev);
5780
5781 if (memsize != 0xffffffff)
5782 break;
5783 udelay(1);
5784 }
5785 if (memsize == 0xffffffff) {
5786 r = -ETIME;
5787 goto out;
5788 }
5789
5790 reset_context.method = AMD_RESET_METHOD_NONE;
5791 reset_context.reset_req_dev = adev;
5792 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5793 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5794
5795 adev->no_hw_access = true;
5796 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5797 adev->no_hw_access = false;
5798 if (r)
5799 goto out;
5800
5801 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5802
5803 out:
5804 if (!r) {
5805 if (amdgpu_device_cache_pci_state(adev->pdev))
5806 pci_restore_state(adev->pdev);
5807
5808 DRM_INFO("PCIe error recovery succeeded\n");
5809 } else {
5810 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5811 amdgpu_device_unset_mp1_state(adev);
5812 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5813 }
5814
5815 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5816 }
5817
5818 /**
5819 * amdgpu_pci_resume() - resume normal ops after PCI reset
5820 * @pdev: pointer to PCI device
5821 *
5822 * Called when the error recovery driver tells us that it's
5823 * OK to resume normal operation.
5824 */
5825 void amdgpu_pci_resume(struct pci_dev *pdev)
5826 {
5827 struct drm_device *dev = pci_get_drvdata(pdev);
5828 struct amdgpu_device *adev = drm_to_adev(dev);
5829 int i;
5830
5832 DRM_INFO("PCI error: resume callback!!\n");
5833
5834 /* Only continue execution for the case of pci_channel_io_frozen */
5835 if (adev->pci_channel_state != pci_channel_io_frozen)
5836 return;
5837
5838 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5839 struct amdgpu_ring *ring = adev->rings[i];
5840
5841 if (!ring || !ring->sched.thread)
5842 continue;
5843
5844 drm_sched_start(&ring->sched, true);
5845 }
5846
5847 amdgpu_device_unset_mp1_state(adev);
5848 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5849 }
5850
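/**
 * amdgpu_device_cache_pci_state - save and cache the PCI config space
 *
 * @pdev: PCI device struct
 *
 * Saves the PCI configuration space and keeps a private copy in
 * adev->pci_state so it can be restored later, e.g. after a GPU reset.
 *
 * Return: true on success, false if the state couldn't be saved or stored.
 */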
5851 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5852 {
5853 struct drm_device *dev = pci_get_drvdata(pdev);
5854 struct amdgpu_device *adev = drm_to_adev(dev);
5855 int r;
5856
5857 r = pci_save_state(pdev);
5858 if (!r) {
5859 kfree(adev->pci_state);
5860
5861 adev->pci_state = pci_store_saved_state(pdev);
5862
5863 if (!adev->pci_state) {
5864 DRM_ERROR("Failed to store PCI saved state");
5865 return false;
5866 }
5867 } else {
5868 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5869 return false;
5870 }
5871
5872 return true;
5873 }
5874
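/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Loads the PCI configuration space previously cached by
 * amdgpu_device_cache_pci_state() and writes it back to the device.
 *
 * Return: true on success, false if there is no cached state or loading fails.
 */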
5875 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5876 {
5877 struct drm_device *dev = pci_get_drvdata(pdev);
5878 struct amdgpu_device *adev = drm_to_adev(dev);
5879 int r;
5880
5881 if (!adev->pci_state)
5882 return false;
5883
5884 r = pci_load_saved_state(pdev, adev->pci_state);
5885
5886 if (!r) {
5887 pci_restore_state(pdev);
5888 } else {
5889 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5890 return false;
5891 }
5892
5893 return true;
5894 }
5895
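/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring to emit the flush on
 *
 * Flushes the HDP write cache so that CPU writes routed through HDP reach
 * VRAM. The flush is skipped on bare-metal APUs and on devices whose XGMI
 * link is connected to the CPU. If @ring provides an HDP flush packet the
 * flush is emitted on the ring, otherwise the ASIC callback is used.
 */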
5896 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5897 struct amdgpu_ring *ring)
5898 {
5899 #ifdef CONFIG_X86_64
5900 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5901 return;
5902 #endif
5903 if (adev->gmc.xgmi.connected_to_cpu)
5904 return;
5905
5906 if (ring && ring->funcs->emit_hdp_flush)
5907 amdgpu_ring_emit_hdp_flush(ring);
5908 else
5909 amdgpu_asic_flush_hdp(adev, ring);
5910 }
5911
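/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring the invalidation is done for
 *
 * Invalidates the HDP cache so that stale data is not read through the host
 * data path. As with amdgpu_device_flush_hdp(), this is skipped on
 * bare-metal APUs and on devices whose XGMI link is connected to the CPU.
 */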
5912 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5913 struct amdgpu_ring *ring)
5914 {
5915 #ifdef CONFIG_X86_64
5916 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5917 return;
5918 #endif
5919 if (adev->gmc.xgmi.connected_to_cpu)
5920 return;
5921
5922 amdgpu_asic_invalidate_hdp(adev, ring);
5923 }
5924
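/**
 * amdgpu_in_reset - check whether a GPU reset is in progress
 *
 * @adev: amdgpu_device pointer
 *
 * Return: non-zero while the device's reset domain is in GPU reset,
 * 0 otherwise.
 */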
5925 int amdgpu_in_reset(struct amdgpu_device *adev)
5926 {
5927 return atomic_read(&adev->reset_domain->in_gpu_reset);
5928 }
5929
5930 /**
5931 * amdgpu_device_halt() - bring hardware to some kind of halt state
5932 *
5933 * @adev: amdgpu_device pointer
5934 *
5935 * Bring the hardware to some kind of halt state so that nothing can touch it
5936 * any more. This helps preserve the error context when an error occurs.
5937 * Compared to a simple hang, the system stays stable at least for SSH
5938 * access, so it should be trivial to inspect the hardware state and
5939 * see what's going on. Implemented as follows:
5940 *
5941 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
5942 * clears all CPU mappings to the device and disallows remappings through page faults
5943 * 2. amdgpu_irq_disable_all() disables all interrupts
5944 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5945 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5946 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5947 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5948 * flush any in flight DMA operations
5949 */
5950 void amdgpu_device_halt(struct amdgpu_device *adev)
5951 {
5952 struct pci_dev *pdev = adev->pdev;
5953 struct drm_device *ddev = adev_to_drm(adev);
5954
5955 drm_dev_unplug(ddev);
5956
5957 amdgpu_irq_disable_all(adev);
5958
5959 amdgpu_fence_driver_hw_fini(adev);
5960
5961 adev->no_hw_access = true;
5962
5963 amdgpu_device_unmap_mmio(adev);
5964
5965 pci_disable_device(pdev);
5966 pci_wait_for_pending_transaction(pdev);
5967 }
5968
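/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register
 *
 * Reads an indirect PCIe port register through the NBIO index/data pair
 * while holding the PCIe index lock.
 *
 * Return: the 32 bit register value.
 */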
5969 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5970 u32 reg)
5971 {
5972 unsigned long flags, address, data;
5973 u32 r;
5974
5975 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5976 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5977
5978 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5979 WREG32(address, reg * 4);
5980 (void)RREG32(address);
5981 r = RREG32(data);
5982 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5983 return r;
5984 }
5985
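/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register
 * @v: value to write
 *
 * Writes an indirect PCIe port register through the NBIO index/data pair
 * while holding the PCIe index lock; the data register is read back to
 * post the write.
 */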
5986 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5987 u32 reg, u32 v)
5988 {
5989 unsigned long flags, address, data;
5990
5991 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5992 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5993
5994 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5995 WREG32(address, reg * 4);
5996 (void)RREG32(address);
5997 WREG32(data, v);
5998 (void)RREG32(data);
5999 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6000 }
6001
6002 /**
6003 * amdgpu_device_switch_gang - switch to a new gang
6004 * @adev: amdgpu_device pointer
6005 * @gang: the gang to switch to
6006 *
6007 * Try to switch to a new gang.
6008 * Returns: NULL if we switched to the new gang or a reference to the current
6009 * gang leader.
6010 */
6011 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6012 struct dma_fence *gang)
6013 {
6014 struct dma_fence *old = NULL;
6015
6016 do {
6017 dma_fence_put(old);
6018 rcu_read_lock();
6019 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6020 rcu_read_unlock();
6021
6022 if (old == gang)
6023 break;
6024
6025 if (!dma_fence_is_signaled(old))
6026 return old;
6027
6028 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6029 old, gang) != old);
6030
6031 dma_fence_put(old);
6032 return NULL;
6033 }
6034
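/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has display IP
 *
 * @adev: amdgpu_device pointer
 *
 * Return: true if the ASIC has usable display hardware, false for
 * display-less chips such as Hainan and Topaz, when no DCE IP version was
 * discovered, or when the DMU IP block has been harvested.
 */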
6035 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6036 {
6037 switch (adev->asic_type) {
6038 #ifdef CONFIG_DRM_AMDGPU_SI
6039 case CHIP_HAINAN:
6040 #endif
6041 case CHIP_TOPAZ:
6042 /* chips with no display hardware */
6043 return false;
6044 #ifdef CONFIG_DRM_AMDGPU_SI
6045 case CHIP_TAHITI:
6046 case CHIP_PITCAIRN:
6047 case CHIP_VERDE:
6048 case CHIP_OLAND:
6049 #endif
6050 #ifdef CONFIG_DRM_AMDGPU_CIK
6051 case CHIP_BONAIRE:
6052 case CHIP_HAWAII:
6053 case CHIP_KAVERI:
6054 case CHIP_KABINI:
6055 case CHIP_MULLINS:
6056 #endif
6057 case CHIP_TONGA:
6058 case CHIP_FIJI:
6059 case CHIP_POLARIS10:
6060 case CHIP_POLARIS11:
6061 case CHIP_POLARIS12:
6062 case CHIP_VEGAM:
6063 case CHIP_CARRIZO:
6064 case CHIP_STONEY:
6065 /* chips with display hardware */
6066 return true;
6067 default:
6068 /* IP discovery */
6069 if (!adev->ip_versions[DCE_HWIP][0] ||
6070 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6071 return false;
6072 return true;
6073 }
6074 }