/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 */
#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>

#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)
static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)

/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}
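
/*
 * Illustrative sketch (not part of the original source): how the per-level
 * hw-init mask gates IP blocks.  With adev->init_lvl pointing at
 * amdgpu_init_minimal_xgmi, only the blocks named in that mask pass:
 *
 *	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
 *	amdgpu_ip_member_of_hwini(adev, AMD_IP_BLOCK_TYPE_PSP);   // true
 *	amdgpu_ip_member_of_hwini(adev, AMD_IP_BLOCK_TYPE_GFX);   // false
 *
 * because AMD_IP_BLOCK_TYPE_GFX is not part of the minimal XGMI mask above.
 */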
void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);
/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
						   struct device_attribute *attr,
						   char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);
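
/*
 * Illustrative sketch (not part of the original source): once the driver is
 * loaded, the attribute above can be read from user space with plain C.  The
 * exact sysfs path depends on which cardN the device was assigned, e.g.:
 *
 *	FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *	unsigned long long replays = 0;
 *	if (f) {
 *		fscanf(f, "%llu", &replays);
 *		fclose(f);
 *	}
 *
 * The value is the sum of NAKs generated and NAKs received, as described in
 * the DOC block above.
 */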
static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);
int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;

	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}
int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}
/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported),
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}
/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);

	drm_dev_exit(idx);
}
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}
	}

	return count;
}
/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
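
/*
 * Illustrative sketch (not part of the original source): reading a dword back
 * from VRAM with the helper above.  Offsets and sizes must be 4-byte aligned
 * for the MM_INDEX/MM_DATA fallback path (see the BUG_ON in
 * amdgpu_device_mm_access); the access transparently splits between the
 * CPU-visible aperture and the indirect path.  The offset below is
 * hypothetical:
 *
 *	uint32_t magic = 0xDEADBEEF, readback = 0;
 *	loff_t vram_pos = 0x1000;
 *
 *	amdgpu_device_vram_access(adev, vram_pos, &magic, sizeof(magic), true);
 *	amdgpu_device_vram_access(adev, vram_pos, &readback, sizeof(readback), false);
 *	// readback now holds 0xDEADBEEF
 */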
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (down_read_trylock(&adev->reset_domain->sem))
		up_read(&adev->reset_domain->sem);
	else
		lockdep_assert_held(&adev->reset_domain->sem);
#endif

	return false;
}
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}
/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));

	BUG();
}
/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}
/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
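
/*
 * Illustrative sketch (not part of the original source): a read-modify-write
 * through the two helpers above.  In the driver proper this is normally
 * hidden behind the RREG32/WREG32 style macros; the register offset and
 * bit-field below are hypothetical:
 *
 *	uint32_t v = amdgpu_device_rreg(adev, 0x1234, 0);
 *	v &= ~0xffu;	// clear a hypothetical field
 *	v |= 0x42;	// program the new value
 *	amdgpu_device_wreg(adev, 0x1234, v, 0);
 *
 * Passing AMDGPU_REGS_NO_KIQ as acc_flags skips the KIQ path that is
 * otherwise taken under SR-IOV at runtime.
 */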
/**
 * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}
/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}
/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
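
/*
 * Illustrative sketch (not part of the original source) of the index/data
 * pairing used by the indirect accessors above and below: the index register
 * selects the target offset, the dummy readl() flushes the posted write, and
 * only then is the data register touched.
 *
 *	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 *	writel(reg_addr, pcie_index_offset);	// select the register
 *	readl(pcie_index_offset);		// flush the posted write
 *	val = readl(pcie_data_offset);		// read the selected register
 *	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 *
 * The spinlock is what makes the two-step sequence appear atomic with
 * respect to other indirect register accesses.
 */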
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}
/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}
static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}
/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}
}
/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}
/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}
/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}
/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}
/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	}

	spin_unlock_irqrestore(&adev->wb.lock, flags);
	return -EINVAL;
}
/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
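
/*
 * Illustrative sketch (not part of the original source): a ring or IP block
 * that needs a writeback slot typically brackets it like this.  The returned
 * index is already a dword index, so it can be used directly with
 * adev->wb.wb and turned into a GPU address from the writeback buffer base:
 *
 *	u32 wb_idx;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb_idx);
 *	if (r)
 *		return r;	// no free slot
 *
 *	adev->wb.wb[wb_idx] = 0;	// CPU view of the slot
 *	// GPU address of the slot: adev->wb.gpu_addr + wb_idx * 4
 *
 *	amdgpu_device_wb_free(adev, wb_idx);
 */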
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(adev->dev,
			 "System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}
/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs.  SMC firmware versions above 22.15 don't have this flaw,
		 * so we force a vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			uint32_t fw_ver;
			int err;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}
/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}
/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86_vendor == X86_VENDOR_INTEL &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	}
#endif
	return false;
}
/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}
/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		dev_warn(adev->dev, "Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	dev_warn(adev->dev, "Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	for (i = 0; i < MAX_XCP; i++) {
		switch (amdgpu_enforce_isolation) {
		case 0:
		default:
			/* disable */
			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
			break;
		case 1:
			/* enable */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_ENABLE;
			break;
		case 2:
			/* enable legacy mode */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
			break;
		case 3:
			/* enable only process isolation without submitting cleaner shader */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
			break;
		}
	}

	return 0;
}
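
/*
 * Illustrative sketch (not part of the original source): the power-of-two
 * clamping above means a module load with, say, amdgpu.sched_jobs=6 does not
 * fail; the value is warned about and rounded up:
 *
 *	roundup_pow_of_two(6);	// -> 8, so amdgpu_sched_jobs becomes 8
 *	roundup_pow_of_two(4);	// -> 4, already a power of two
 *
 * while anything below 4 is first raised to the minimum of 4.
 */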
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver.  Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
	    state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
				 r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		dev_info(&pdev->dev, "switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}
2227 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2229 * @pdev: pci dev pointer
2231 * Callback for the switcheroo driver. Check of the switcheroo
2232 * state can be changed.
2233 * Returns true if the state can be changed, false if not.
2235 static bool amdgpu_switcheroo_can_switch(struct pci_dev
*pdev
)
2237 struct drm_device
*dev
= pci_get_drvdata(pdev
);
2240 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2241 * locking inversion with the driver load path. And the access here is
2242 * completely racy anyway. So don't bother with locking for now.
2244 return atomic_read(&dev
->open_count
) == 0;
2247 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops
= {
2248 .set_gpu_state
= amdgpu_switcheroo_set_state
,
2250 .can_switch
= amdgpu_switcheroo_can_switch
,
/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			&adev->ip_blocks[i], state);
		if (r)
			dev_err(adev->dev,
				"set_clockgating_state of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			&adev->ip_blocks[i], state);
		if (r)
			dev_err(adev->dev,
				"set_powergating_state of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state(
				&adev->ip_blocks[i], flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
					&adev->ip_blocks[i]);
				if (r)
					return r;
			}
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
			       enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.valid;
	}
	return false;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

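/*
 * Illustrative usage sketch (not driver code): this mirrors how callers
 * elsewhere in this file consume the lookup above - fetch the block for a
 * given IP type and bail out when the ASIC does not expose it. The GFX type
 * used here is just an example value.
 *
 *	struct amdgpu_ip_block *ip_block;
 *
 *	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (!ip_block || !ip_block->status.valid)
 *		return;		// IP not present or not enabled on this ASIC
 */
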
/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

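/*
 * Example reading of the comparison above (a sketch only; the version
 * numbers are arbitrary illustration values): a return of 0 means "the
 * ASIC's block of this type is at least major.minor".
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       8, 0) == 0) {
 *		// GFX 8.0 or newer is present; newer-path setup can run
 *	}
 */
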
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	dev_info(adev->dev, "detected ip block number %d <%s>\n",
		 adev->num_ip_blocks, ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks].adev = adev;

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		dev_info(adev->dev,
			 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display,
			 adev->mode_info.num_crtc);
	}
}

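/*
 * Usage note (an assumption about the typical command line, inferred only
 * from the parser above): amdgpu.virtual_display takes a ';'-separated list
 * of "<pci address>,<num_crtc>" entries, or the keyword "all", e.g.:
 *
 *	modprobe amdgpu virtual_display=0000:26:00.0,2
 *	modprobe amdgpu virtual_display=all,1
 */
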
2556 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2558 * @adev: amdgpu_device pointer
2560 * Parses the asic configuration parameters specified in the gpu info
2561 * firmware and makes them available to the driver for use in configuring
2563 * Returns 0 on success, -EINVAL on failure.
2565 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device
*adev
)
2567 const char *chip_name
;
2569 const struct gpu_info_firmware_header_v1_0
*hdr
;
2571 adev
->firmware
.gpu_info_fw
= NULL
;
2573 if (adev
->mman
.discovery_bin
)
2576 switch (adev
->asic_type
) {
2580 chip_name
= "vega10";
2583 chip_name
= "vega12";
2586 if (adev
->apu_flags
& AMD_APU_IS_RAVEN2
)
2587 chip_name
= "raven2";
2588 else if (adev
->apu_flags
& AMD_APU_IS_PICASSO
)
2589 chip_name
= "picasso";
2591 chip_name
= "raven";
2594 chip_name
= "arcturus";
2597 chip_name
= "navi12";
2601 err
= amdgpu_ucode_request(adev
, &adev
->firmware
.gpu_info_fw
,
2602 AMDGPU_UCODE_OPTIONAL
,
2603 "amdgpu/%s_gpu_info.bin", chip_name
);
2606 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2611 hdr
= (const struct gpu_info_firmware_header_v1_0
*)adev
->firmware
.gpu_info_fw
->data
;
2612 amdgpu_ucode_print_gpu_info_hdr(&hdr
->header
);
2614 switch (hdr
->version_major
) {
2617 const struct gpu_info_firmware_v1_0
*gpu_info_fw
=
2618 (const struct gpu_info_firmware_v1_0
*)(adev
->firmware
.gpu_info_fw
->data
+
2619 le32_to_cpu(hdr
->header
.ucode_array_offset_bytes
));
2622 * Should be dropped when DAL no longer needs it.
2624 if (adev
->asic_type
== CHIP_NAVI12
)
2625 goto parse_soc_bounding_box
;
2627 adev
->gfx
.config
.max_shader_engines
= le32_to_cpu(gpu_info_fw
->gc_num_se
);
2628 adev
->gfx
.config
.max_cu_per_sh
= le32_to_cpu(gpu_info_fw
->gc_num_cu_per_sh
);
2629 adev
->gfx
.config
.max_sh_per_se
= le32_to_cpu(gpu_info_fw
->gc_num_sh_per_se
);
2630 adev
->gfx
.config
.max_backends_per_se
= le32_to_cpu(gpu_info_fw
->gc_num_rb_per_se
);
2631 adev
->gfx
.config
.max_texture_channel_caches
=
2632 le32_to_cpu(gpu_info_fw
->gc_num_tccs
);
2633 adev
->gfx
.config
.max_gprs
= le32_to_cpu(gpu_info_fw
->gc_num_gprs
);
2634 adev
->gfx
.config
.max_gs_threads
= le32_to_cpu(gpu_info_fw
->gc_num_max_gs_thds
);
2635 adev
->gfx
.config
.gs_vgt_table_depth
= le32_to_cpu(gpu_info_fw
->gc_gs_table_depth
);
2636 adev
->gfx
.config
.gs_prim_buffer_depth
= le32_to_cpu(gpu_info_fw
->gc_gsprim_buff_depth
);
2637 adev
->gfx
.config
.double_offchip_lds_buf
=
2638 le32_to_cpu(gpu_info_fw
->gc_double_offchip_lds_buffer
);
2639 adev
->gfx
.cu_info
.wave_front_size
= le32_to_cpu(gpu_info_fw
->gc_wave_size
);
2640 adev
->gfx
.cu_info
.max_waves_per_simd
=
2641 le32_to_cpu(gpu_info_fw
->gc_max_waves_per_simd
);
2642 adev
->gfx
.cu_info
.max_scratch_slots_per_cu
=
2643 le32_to_cpu(gpu_info_fw
->gc_max_scratch_slots_per_cu
);
2644 adev
->gfx
.cu_info
.lds_size
= le32_to_cpu(gpu_info_fw
->gc_lds_size
);
2645 if (hdr
->version_minor
>= 1) {
2646 const struct gpu_info_firmware_v1_1
*gpu_info_fw
=
2647 (const struct gpu_info_firmware_v1_1
*)(adev
->firmware
.gpu_info_fw
->data
+
2648 le32_to_cpu(hdr
->header
.ucode_array_offset_bytes
));
2649 adev
->gfx
.config
.num_sc_per_sh
=
2650 le32_to_cpu(gpu_info_fw
->num_sc_per_sh
);
2651 adev
->gfx
.config
.num_packer_per_sc
=
2652 le32_to_cpu(gpu_info_fw
->num_packer_per_sc
);
2655 parse_soc_bounding_box
:
2657 * soc bounding box info is not integrated in disocovery table,
2658 * we always need to parse it from gpu info firmware if needed.
2660 if (hdr
->version_minor
== 2) {
2661 const struct gpu_info_firmware_v1_2
*gpu_info_fw
=
2662 (const struct gpu_info_firmware_v1_2
*)(adev
->firmware
.gpu_info_fw
->data
+
2663 le32_to_cpu(hdr
->header
.ucode_array_offset_bytes
));
2664 adev
->dm
.soc_bounding_box
= &gpu_info_fw
->soc_bounding_box
;
2670 "Unsupported gpu_info table %d\n", hdr
->header
.ucode_version
);
2679 * amdgpu_device_ip_early_init - run early init for hardware IPs
2681 * @adev: amdgpu_device pointer
2683 * Early initialization pass for hardware IPs. The hardware IPs that make
2684 * up each asic are discovered each IP's early_init callback is run. This
2685 * is the first stage in initializing the asic.
2686 * Returns 0 on success, negative error code on failure.
2688 static int amdgpu_device_ip_early_init(struct amdgpu_device
*adev
)
2690 struct amdgpu_ip_block
*ip_block
;
2691 struct pci_dev
*parent
;
2692 bool total
, skip_bios
;
2693 uint32_t bios_flags
;
2696 amdgpu_device_enable_virtual_display(adev
);
2698 if (amdgpu_sriov_vf(adev
)) {
2699 r
= amdgpu_virt_request_full_gpu(adev
, true);
2704 switch (adev
->asic_type
) {
2705 #ifdef CONFIG_DRM_AMDGPU_SI
2711 adev
->family
= AMDGPU_FAMILY_SI
;
2712 r
= si_set_ip_blocks(adev
);
2717 #ifdef CONFIG_DRM_AMDGPU_CIK
2723 if (adev
->flags
& AMD_IS_APU
)
2724 adev
->family
= AMDGPU_FAMILY_KV
;
2726 adev
->family
= AMDGPU_FAMILY_CI
;
2728 r
= cik_set_ip_blocks(adev
);
2736 case CHIP_POLARIS10
:
2737 case CHIP_POLARIS11
:
2738 case CHIP_POLARIS12
:
2742 if (adev
->flags
& AMD_IS_APU
)
2743 adev
->family
= AMDGPU_FAMILY_CZ
;
2745 adev
->family
= AMDGPU_FAMILY_VI
;
2747 r
= vi_set_ip_blocks(adev
);
2752 r
= amdgpu_discovery_set_ip_blocks(adev
);
2758 /* Check for IP version 9.4.3 with A0 hardware */
2759 if (amdgpu_ip_version(adev
, GC_HWIP
, 0) == IP_VERSION(9, 4, 3) &&
2760 !amdgpu_device_get_rev_id(adev
)) {
2761 dev_err(adev
->dev
, "Unsupported A0 hardware\n");
2762 return -ENODEV
; /* device unsupported - no device error */
2765 if (amdgpu_has_atpx() &&
2766 (amdgpu_is_atpx_hybrid() ||
2767 amdgpu_has_atpx_dgpu_power_cntl()) &&
2768 ((adev
->flags
& AMD_IS_APU
) == 0) &&
2769 !dev_is_removable(&adev
->pdev
->dev
))
2770 adev
->flags
|= AMD_IS_PX
;
2772 if (!(adev
->flags
& AMD_IS_APU
)) {
2773 parent
= pcie_find_root_port(adev
->pdev
);
2774 adev
->has_pr3
= parent
? pci_pr3_present(parent
) : false;
2777 adev
->pm
.pp_feature
= amdgpu_pp_feature_mask
;
2778 if (amdgpu_sriov_vf(adev
) || sched_policy
== KFD_SCHED_POLICY_NO_HWS
)
2779 adev
->pm
.pp_feature
&= ~PP_GFXOFF_MASK
;
2780 if (amdgpu_sriov_vf(adev
) && adev
->asic_type
== CHIP_SIENNA_CICHLID
)
2781 adev
->pm
.pp_feature
&= ~PP_OVERDRIVE_MASK
;
2782 if (!amdgpu_device_pcie_dynamic_switching_supported(adev
))
2783 adev
->pm
.pp_feature
&= ~PP_PCIE_DPM_MASK
;
2785 adev
->virt
.is_xgmi_node_migrate_enabled
= false;
2786 if (amdgpu_sriov_vf(adev
)) {
2787 adev
->virt
.is_xgmi_node_migrate_enabled
=
2788 amdgpu_ip_version((adev
), GC_HWIP
, 0) == IP_VERSION(9, 4, 4);
2792 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
2793 ip_block
= &adev
->ip_blocks
[i
];
2795 if ((amdgpu_ip_block_mask
& (1 << i
)) == 0) {
2796 dev_warn(adev
->dev
, "disabled ip block: %d <%s>\n", i
,
2797 adev
->ip_blocks
[i
].version
->funcs
->name
);
2798 adev
->ip_blocks
[i
].status
.valid
= false;
2799 } else if (ip_block
->version
->funcs
->early_init
) {
2800 r
= ip_block
->version
->funcs
->early_init(ip_block
);
2802 adev
->ip_blocks
[i
].status
.valid
= false;
2805 "early_init of IP block <%s> failed %d\n",
2806 adev
->ip_blocks
[i
].version
->funcs
->name
,
2810 adev
->ip_blocks
[i
].status
.valid
= true;
2813 adev
->ip_blocks
[i
].status
.valid
= true;
2815 /* get the vbios after the asic_funcs are set up */
2816 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_COMMON
) {
2817 r
= amdgpu_device_parse_gpu_info_fw(adev
);
2821 bios_flags
= amdgpu_device_get_vbios_flags(adev
);
2822 skip_bios
= !!(bios_flags
& AMDGPU_VBIOS_SKIP
);
2826 !!(bios_flags
& AMDGPU_VBIOS_OPTIONAL
);
2827 if (!amdgpu_get_bios(adev
) && !optional
)
2830 if (optional
&& !adev
->bios
)
2833 "VBIOS image optional, proceeding without VBIOS image");
2836 r
= amdgpu_atombios_init(adev
);
2839 "amdgpu_atombios_init failed\n");
2840 amdgpu_vf_error_put(
2842 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL
,
2849 /*get pf2vf msg info at it's earliest time*/
2850 if (amdgpu_sriov_vf(adev
))
2851 amdgpu_virt_init_data_exchange(adev
);
2858 if (adev
->gmc
.xgmi
.supported
)
2859 amdgpu_xgmi_early_init(adev
);
2861 ip_block
= amdgpu_device_ip_get_ip_block(adev
, AMD_IP_BLOCK_TYPE_GFX
);
2862 if (ip_block
->status
.valid
!= false)
2863 amdgpu_amdkfd_device_probe(adev
);
2865 adev
->cg_flags
&= amdgpu_cg_mask
;
2866 adev
->pg_flags
&= amdgpu_pg_mask
;
static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"hw_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
		if (r) {
			dev_err(adev->dev,
				"hw_init of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			if (!amdgpu_ip_member_of_hwini(adev,
						       AMD_IP_BLOCK_TYPE_PSP))
				break;

			if (!adev->ip_blocks[i].status.sw)
				continue;

			/* no need to do the fw loading again if already done*/
			if (adev->ip_blocks[i].status.hw == true)
				break;

			if (amdgpu_in_reset(adev) || adev->in_suspend) {
				r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
				if (r)
					return r;
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
				if (r) {
					dev_err(adev->dev,
						"hw_init of IP block <%s> failed %d\n",
						adev->ip_blocks[i]
							.version->funcs->name,
						r);
					return r;
				}
				adev->ip_blocks[i].status.hw = true;
			}
			break;
		}
	}

	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
{
	struct drm_sched_init_args args = {
		.ops = &amdgpu_sched_ops,
		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
		.timeout_wq = adev->reset_domain->wq,
		.dev = adev->dev,
	};
	long timeout;
	int i, r;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		/* No need to setup the GPU scheduler for rings that don't need it */
		if (!ring || ring->no_scheduler)
			continue;

		switch (ring->funcs->type) {
		case AMDGPU_RING_TYPE_GFX:
			timeout = adev->gfx_timeout;
			break;
		case AMDGPU_RING_TYPE_COMPUTE:
			timeout = adev->compute_timeout;
			break;
		case AMDGPU_RING_TYPE_SDMA:
			timeout = adev->sdma_timeout;
			break;
		default:
			timeout = adev->video_timeout;
			break;
		}

		args.timeout = timeout;
		args.credit_limit = ring->num_hw_submission;
		args.score = ring->sched_score;
		args.name = ring->name;

		r = drm_sched_init(&ring->sched, &args);
		if (r) {
			dev_err(adev->dev,
				"Failed to create scheduler on ring %s.\n",
				ring->name);
			return r;
		}
		r = amdgpu_uvd_entity_init(adev, ring);
		if (r) {
			dev_err(adev->dev,
				"Failed to create UVD scheduling entity on ring %s.\n",
				ring->name);
			return r;
		}
		r = amdgpu_vce_entity_init(adev, ring);
		if (r) {
			dev_err(adev->dev,
				"Failed to create VCE scheduling entity on ring %s.\n",
				ring->name);
			return r;
		}
	}

	amdgpu_xcp_update_partition_sched_list(adev);

	return 0;
}

3043 * amdgpu_device_ip_init - run init for hardware IPs
3045 * @adev: amdgpu_device pointer
3047 * Main initialization pass for hardware IPs. The list of all the hardware
3048 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
3049 * are run. sw_init initializes the software state associated with each IP
3050 * and hw_init initializes the hardware associated with each IP.
3051 * Returns 0 on success, negative error code on failure.
3053 static int amdgpu_device_ip_init(struct amdgpu_device
*adev
)
3058 r
= amdgpu_ras_init(adev
);
3062 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
3063 if (!adev
->ip_blocks
[i
].status
.valid
)
3065 if (adev
->ip_blocks
[i
].version
->funcs
->sw_init
) {
3066 r
= adev
->ip_blocks
[i
].version
->funcs
->sw_init(&adev
->ip_blocks
[i
]);
3069 "sw_init of IP block <%s> failed %d\n",
3070 adev
->ip_blocks
[i
].version
->funcs
->name
,
3075 adev
->ip_blocks
[i
].status
.sw
= true;
3077 if (!amdgpu_ip_member_of_hwini(
3078 adev
, adev
->ip_blocks
[i
].version
->type
))
3081 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_COMMON
) {
3082 /* need to do common hw init early so everything is set up for gmc */
3083 r
= adev
->ip_blocks
[i
].version
->funcs
->hw_init(&adev
->ip_blocks
[i
]);
3085 dev_err(adev
->dev
, "hw_init %d failed %d\n", i
,
3089 adev
->ip_blocks
[i
].status
.hw
= true;
3090 } else if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GMC
) {
3091 /* need to do gmc hw init early so we can allocate gpu mem */
3092 /* Try to reserve bad pages early */
3093 if (amdgpu_sriov_vf(adev
))
3094 amdgpu_virt_exchange_data(adev
);
3096 r
= amdgpu_device_mem_scratch_init(adev
);
3099 "amdgpu_mem_scratch_init failed %d\n",
3103 r
= adev
->ip_blocks
[i
].version
->funcs
->hw_init(&adev
->ip_blocks
[i
]);
3105 dev_err(adev
->dev
, "hw_init %d failed %d\n", i
,
3109 r
= amdgpu_device_wb_init(adev
);
3112 "amdgpu_device_wb_init failed %d\n", r
);
3115 adev
->ip_blocks
[i
].status
.hw
= true;
3117 /* right after GMC hw init, we create CSA */
3118 if (adev
->gfx
.mcbp
) {
3119 r
= amdgpu_allocate_static_csa(adev
, &adev
->virt
.csa_obj
,
3120 AMDGPU_GEM_DOMAIN_VRAM
|
3121 AMDGPU_GEM_DOMAIN_GTT
,
3125 "allocate CSA failed %d\n", r
);
3130 r
= amdgpu_seq64_init(adev
);
3132 dev_err(adev
->dev
, "allocate seq64 failed %d\n",
3139 if (amdgpu_sriov_vf(adev
))
3140 amdgpu_virt_init_data_exchange(adev
);
3142 r
= amdgpu_ib_pool_init(adev
);
3144 dev_err(adev
->dev
, "IB initialization failed (%d).\n", r
);
3145 amdgpu_vf_error_put(adev
, AMDGIM_ERROR_VF_IB_INIT_FAIL
, 0, r
);
3149 r
= amdgpu_ucode_create_bo(adev
); /* create ucode bo when sw_init complete*/
3153 r
= amdgpu_device_ip_hw_init_phase1(adev
);
3157 r
= amdgpu_device_fw_loading(adev
);
3161 r
= amdgpu_device_ip_hw_init_phase2(adev
);
3166 * retired pages will be loaded from eeprom and reserved here,
3167 * it should be called after amdgpu_device_ip_hw_init_phase2 since
3168 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
3169 * for I2C communication which only true at this point.
3171 * amdgpu_ras_recovery_init may fail, but the upper only cares the
3172 * failure from bad gpu situation and stop amdgpu init process
3173 * accordingly. For other failed cases, it will still release all
3174 * the resource and print error message, rather than returning one
3175 * negative value to upper level.
3177 * Note: theoretically, this should be called before all vram allocations
3178 * to protect retired page from abusing
3180 init_badpage
= (adev
->init_lvl
->level
!= AMDGPU_INIT_LEVEL_MINIMAL_XGMI
);
3181 r
= amdgpu_ras_recovery_init(adev
, init_badpage
);
3186 * In case of XGMI grab extra reference for reset domain for this device
3188 if (adev
->gmc
.xgmi
.num_physical_nodes
> 1) {
3189 if (amdgpu_xgmi_add_device(adev
) == 0) {
3190 if (!amdgpu_sriov_vf(adev
)) {
3191 struct amdgpu_hive_info
*hive
= amdgpu_get_xgmi_hive(adev
);
3193 if (WARN_ON(!hive
)) {
3198 if (!hive
->reset_domain
||
3199 !amdgpu_reset_get_reset_domain(hive
->reset_domain
)) {
3201 amdgpu_put_xgmi_hive(hive
);
3205 /* Drop the early temporary reset domain we created for device */
3206 amdgpu_reset_put_reset_domain(adev
->reset_domain
);
3207 adev
->reset_domain
= hive
->reset_domain
;
3208 amdgpu_put_xgmi_hive(hive
);
3213 r
= amdgpu_device_init_schedulers(adev
);
3217 if (adev
->mman
.buffer_funcs_ring
->sched
.ready
)
3218 amdgpu_ttm_set_buffer_funcs_status(adev
, true);
3220 /* Don't init kfd if whole hive need to be reset during init */
3221 if (adev
->init_lvl
->level
!= AMDGPU_INIT_LEVEL_MINIMAL_XGMI
) {
3222 kgd2kfd_init_zone_device(adev
);
3223 amdgpu_amdkfd_device_init(adev
);
3226 amdgpu_fru_get_product_info(adev
);
3228 if (!amdgpu_sriov_vf(adev
) || amdgpu_sriov_ras_cper_en(adev
))
3229 r
= amdgpu_cper_init(adev
);
/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_LINK:
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

3284 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3286 * @adev: amdgpu_device pointer
3287 * @state: clockgating state (gate or ungate)
3289 * The list of all the hardware IPs that make up the asic is walked and the
3290 * set_clockgating_state callbacks are run.
3291 * Late initialization pass enabling clockgating for hardware IPs.
3292 * Fini or suspend, pass disabling clockgating for hardware IPs.
3293 * Returns 0 on success, negative error code on failure.
3296 int amdgpu_device_set_cg_state(struct amdgpu_device
*adev
,
3297 enum amd_clockgating_state state
)
3301 if (amdgpu_emu_mode
== 1)
3304 for (j
= 0; j
< adev
->num_ip_blocks
; j
++) {
3305 i
= state
== AMD_CG_STATE_GATE
? j
: adev
->num_ip_blocks
- j
- 1;
3306 if (!adev
->ip_blocks
[i
].status
.late_initialized
)
3308 /* skip CG for GFX, SDMA on S0ix */
3309 if (adev
->in_s0ix
&&
3310 (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GFX
||
3311 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_SDMA
))
3313 /* skip CG for VCE/UVD, it's handled specially */
3314 if (adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_UVD
&&
3315 adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_VCE
&&
3316 adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_VCN
&&
3317 adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_JPEG
&&
3318 adev
->ip_blocks
[i
].version
->funcs
->set_clockgating_state
) {
3319 /* enable clockgating to save power */
3320 r
= adev
->ip_blocks
[i
].version
->funcs
->set_clockgating_state(&adev
->ip_blocks
[i
],
3324 "set_clockgating_state(gate) of IP block <%s> failed %d\n",
3325 adev
->ip_blocks
[i
].version
->funcs
->name
,
3335 int amdgpu_device_set_pg_state(struct amdgpu_device
*adev
,
3336 enum amd_powergating_state state
)
3340 if (amdgpu_emu_mode
== 1)
3343 for (j
= 0; j
< adev
->num_ip_blocks
; j
++) {
3344 i
= state
== AMD_PG_STATE_GATE
? j
: adev
->num_ip_blocks
- j
- 1;
3345 if (!adev
->ip_blocks
[i
].status
.late_initialized
)
3347 /* skip PG for GFX, SDMA on S0ix */
3348 if (adev
->in_s0ix
&&
3349 (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GFX
||
3350 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_SDMA
))
3352 /* skip CG for VCE/UVD, it's handled specially */
3353 if (adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_UVD
&&
3354 adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_VCE
&&
3355 adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_VCN
&&
3356 adev
->ip_blocks
[i
].version
->type
!= AMD_IP_BLOCK_TYPE_JPEG
&&
3357 adev
->ip_blocks
[i
].version
->funcs
->set_powergating_state
) {
3358 /* enable powergating to save power */
3359 r
= adev
->ip_blocks
[i
].version
->funcs
->set_powergating_state(&adev
->ip_blocks
[i
],
3363 "set_powergating_state(gate) of IP block <%s> failed %d\n",
3364 adev
->ip_blocks
[i
].version
->funcs
->name
,
static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

3409 * amdgpu_device_ip_late_init - run late init for hardware IPs
3411 * @adev: amdgpu_device pointer
3413 * Late initialization pass for hardware IPs. The list of all the hardware
3414 * IPs that make up the asic is walked and the late_init callbacks are run.
3415 * late_init covers any special initialization that an IP requires
3416 * after all of the have been initialized or something that needs to happen
3417 * late in the init process.
3418 * Returns 0 on success, negative error code on failure.
3420 static int amdgpu_device_ip_late_init(struct amdgpu_device
*adev
)
3422 struct amdgpu_gpu_instance
*gpu_instance
;
3425 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
3426 if (!adev
->ip_blocks
[i
].status
.hw
)
3428 if (adev
->ip_blocks
[i
].version
->funcs
->late_init
) {
3429 r
= adev
->ip_blocks
[i
].version
->funcs
->late_init(&adev
->ip_blocks
[i
]);
3432 "late_init of IP block <%s> failed %d\n",
3433 adev
->ip_blocks
[i
].version
->funcs
->name
,
3438 adev
->ip_blocks
[i
].status
.late_initialized
= true;
3441 r
= amdgpu_ras_late_init(adev
);
3443 dev_err(adev
->dev
, "amdgpu_ras_late_init failed %d", r
);
3447 if (!amdgpu_reset_in_recovery(adev
))
3448 amdgpu_ras_set_error_query_ready(adev
, true);
3450 amdgpu_device_set_cg_state(adev
, AMD_CG_STATE_GATE
);
3451 amdgpu_device_set_pg_state(adev
, AMD_PG_STATE_GATE
);
3453 amdgpu_device_fill_reset_magic(adev
);
3455 r
= amdgpu_device_enable_mgpu_fan_boost();
3457 dev_err(adev
->dev
, "enable mgpu fan boost failed (%d).\n", r
);
3459 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3460 if (amdgpu_passthrough(adev
) &&
3461 ((adev
->asic_type
== CHIP_ARCTURUS
&& adev
->gmc
.xgmi
.num_physical_nodes
> 1) ||
3462 adev
->asic_type
== CHIP_ALDEBARAN
))
3463 amdgpu_dpm_handle_passthrough_sbr(adev
, true);
3465 if (adev
->gmc
.xgmi
.num_physical_nodes
> 1) {
3466 mutex_lock(&mgpu_info
.mutex
);
3469 * Reset device p-state to low as this was booted with high.
3471 * This should be performed only after all devices from the same
3472 * hive get initialized.
3474 * However, it's unknown how many device in the hive in advance.
3475 * As this is counted one by one during devices initializations.
3477 * So, we wait for all XGMI interlinked devices initialized.
3478 * This may bring some delays as those devices may come from
3479 * different hives. But that should be OK.
3481 if (mgpu_info
.num_dgpu
== adev
->gmc
.xgmi
.num_physical_nodes
) {
3482 for (i
= 0; i
< mgpu_info
.num_gpu
; i
++) {
3483 gpu_instance
= &(mgpu_info
.gpu_ins
[i
]);
3484 if (gpu_instance
->adev
->flags
& AMD_IS_APU
)
3487 r
= amdgpu_xgmi_set_pstate(gpu_instance
->adev
,
3488 AMDGPU_XGMI_PSTATE_MIN
);
3491 "pstate setting failed (%d).\n",
3498 mutex_unlock(&mgpu_info
.mutex
);
static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;
	int r;

	if (!ip_block->version->funcs->hw_fini) {
		dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n",
			ip_block->version->funcs->name);
	} else {
		r = ip_block->version->funcs->hw_fini(ip_block);
		/* XXX handle errors */
		if (r)
			dev_dbg(adev->dev,
				"hw_fini of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
	}

	ip_block->status.hw = false;
}

/**
 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
 *
 * @adev: amdgpu_device pointer
 *
 * For ASICs need to disable SMC first
 */
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
		return;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
			break;
		}
	}
}

static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].version->funcs->early_fini)
			continue;

		r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
		if (r)
			dev_dbg(adev->dev,
				"early_fini of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_amdkfd_suspend(adev, true);
	amdgpu_userq_suspend(adev);

	/* Workaround for ASICs need to disable SMC first */
	amdgpu_device_smu_fini_early(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
	}

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_release_full_gpu(adev, false))
			dev_err(adev->dev,
				"failed to release exclusive mode on fini\n");
	}

	return 0;
}

3591 * amdgpu_device_ip_fini - run fini for hardware IPs
3593 * @adev: amdgpu_device pointer
3595 * Main teardown pass for hardware IPs. The list of all the hardware
3596 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3597 * are run. hw_fini tears down the hardware associated with each IP
3598 * and sw_fini tears down any software state associated with each IP.
3599 * Returns 0 on success, negative error code on failure.
3601 static int amdgpu_device_ip_fini(struct amdgpu_device
*adev
)
3605 amdgpu_cper_fini(adev
);
3607 if (amdgpu_sriov_vf(adev
) && adev
->virt
.ras_init_done
)
3608 amdgpu_virt_release_ras_err_handler_data(adev
);
3610 if (adev
->gmc
.xgmi
.num_physical_nodes
> 1)
3611 amdgpu_xgmi_remove_device(adev
);
3613 amdgpu_amdkfd_device_fini_sw(adev
);
3615 for (i
= adev
->num_ip_blocks
- 1; i
>= 0; i
--) {
3616 if (!adev
->ip_blocks
[i
].status
.sw
)
3619 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GMC
) {
3620 amdgpu_ucode_free_bo(adev
);
3621 amdgpu_free_static_csa(&adev
->virt
.csa_obj
);
3622 amdgpu_device_wb_fini(adev
);
3623 amdgpu_device_mem_scratch_fini(adev
);
3624 amdgpu_ib_pool_fini(adev
);
3625 amdgpu_seq64_fini(adev
);
3626 amdgpu_doorbell_fini(adev
);
3628 if (adev
->ip_blocks
[i
].version
->funcs
->sw_fini
) {
3629 r
= adev
->ip_blocks
[i
].version
->funcs
->sw_fini(&adev
->ip_blocks
[i
]);
3630 /* XXX handle errors */
3633 "sw_fini of IP block <%s> failed %d\n",
3634 adev
->ip_blocks
[i
].version
->funcs
->name
,
3638 adev
->ip_blocks
[i
].status
.sw
= false;
3639 adev
->ip_blocks
[i
].status
.valid
= false;
3642 for (i
= adev
->num_ip_blocks
- 1; i
>= 0; i
--) {
3643 if (!adev
->ip_blocks
[i
].status
.late_initialized
)
3645 if (adev
->ip_blocks
[i
].version
->funcs
->late_fini
)
3646 adev
->ip_blocks
[i
].version
->funcs
->late_fini(&adev
->ip_blocks
[i
]);
3647 adev
->ip_blocks
[i
].status
.late_initialized
= false;
3650 amdgpu_ras_fini(adev
);
/**
 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
 *
 * @work: work_struct.
 */
static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, delayed_init_work.work);
	int r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		dev_err(adev->dev, "ib ring test failed (%d).\n", r);
}

static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

	WARN_ON_ONCE(adev->gfx.gfx_off_state);
	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);

	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
		adev->gfx.gfx_off_state = true;
}

/**
 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/*
	 * Per PMFW team's suggestion, driver needs to handle gfxoff
	 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
	 * scenario. Add the missing df cstate disablement here.
	 */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;

		/* displays are handled separately */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		/* XXX handle errors */
		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

3727 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3729 * @adev: amdgpu_device pointer
3731 * Main suspend function for hardware IPs. The list of all the hardware
3732 * IPs that make up the asic is walked, clockgating is disabled and the
3733 * suspend callbacks are run. suspend puts the hardware and software state
3734 * in each IP into a state suitable for suspend.
3735 * Returns 0 on success, negative error code on failure.
3737 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device
*adev
)
3742 amdgpu_dpm_gfx_state_change(adev
, sGpuChangeState_D3Entry
);
3744 for (i
= adev
->num_ip_blocks
- 1; i
>= 0; i
--) {
3745 if (!adev
->ip_blocks
[i
].status
.valid
)
3747 /* displays are handled in phase1 */
3748 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_DCE
)
3750 /* PSP lost connection when err_event_athub occurs */
3751 if (amdgpu_ras_intr_triggered() &&
3752 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_PSP
) {
3753 adev
->ip_blocks
[i
].status
.hw
= false;
3757 /* skip unnecessary suspend if we do not initialize them yet */
3758 if (!amdgpu_ip_member_of_hwini(
3759 adev
, adev
->ip_blocks
[i
].version
->type
))
3762 /* Since we skip suspend for S0i3, we need to cancel the delayed
3763 * idle work here as the suspend callback never gets called.
3765 if (adev
->in_s0ix
&&
3766 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GFX
&&
3767 amdgpu_ip_version(adev
, GC_HWIP
, 0) >= IP_VERSION(10, 0, 0))
3768 cancel_delayed_work_sync(&adev
->gfx
.idle_work
);
3769 /* skip suspend of gfx/mes and psp for S0ix
3770 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3771 * like at runtime. PSP is also part of the always on hardware
3772 * so no need to suspend it.
3774 if (adev
->in_s0ix
&&
3775 (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_PSP
||
3776 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GFX
||
3777 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_MES
))
3780 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3781 if (adev
->in_s0ix
&&
3782 (amdgpu_ip_version(adev
, SDMA0_HWIP
, 0) >=
3783 IP_VERSION(5, 0, 0)) &&
3784 (adev
->ip_blocks
[i
].version
->type
==
3785 AMD_IP_BLOCK_TYPE_SDMA
))
3788 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3789 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3790 * from this location and RLC Autoload automatically also gets loaded
3791 * from here based on PMFW -> PSP message during re-init sequence.
3792 * Therefore, the psp suspend & resume should be skipped to avoid destroy
3793 * the TMR and reload FWs again for IMU enabled APU ASICs.
3795 if (amdgpu_in_reset(adev
) &&
3796 (adev
->flags
& AMD_IS_APU
) && adev
->gfx
.imu
.funcs
&&
3797 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_PSP
)
3800 /* XXX handle errors */
3801 r
= amdgpu_ip_block_suspend(&adev
->ip_blocks
[i
]);
3802 adev
->ip_blocks
[i
].status
.hw
= false;
3804 /* handle putting the SMC in the appropriate state */
3805 if (!amdgpu_sriov_vf(adev
)) {
3806 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_SMC
) {
3807 r
= amdgpu_dpm_set_mp1_state(adev
, adev
->mp1_state
);
3810 "SMC failed to set mp1 state %d, %d\n",
3811 adev
->mp1_state
, r
);
/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		amdgpu_virt_request_full_gpu(adev, false);
	}

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
	r = amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}

3854 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device
*adev
)
3858 static enum amd_ip_block_type ip_order
[] = {
3859 AMD_IP_BLOCK_TYPE_COMMON
,
3860 AMD_IP_BLOCK_TYPE_GMC
,
3861 AMD_IP_BLOCK_TYPE_PSP
,
3862 AMD_IP_BLOCK_TYPE_IH
,
3865 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
3867 struct amdgpu_ip_block
*block
;
3869 block
= &adev
->ip_blocks
[i
];
3870 block
->status
.hw
= false;
3872 for (j
= 0; j
< ARRAY_SIZE(ip_order
); j
++) {
3874 if (block
->version
->type
!= ip_order
[j
] ||
3875 !block
->status
.valid
)
3878 r
= block
->version
->funcs
->hw_init(&adev
->ip_blocks
[i
]);
3880 dev_err(adev
->dev
, "RE-INIT-early: %s failed\n",
3881 block
->version
->funcs
->name
);
3884 block
->status
.hw
= true;
3891 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device
*adev
)
3893 struct amdgpu_ip_block
*block
;
3896 static enum amd_ip_block_type ip_order
[] = {
3897 AMD_IP_BLOCK_TYPE_SMC
,
3898 AMD_IP_BLOCK_TYPE_DCE
,
3899 AMD_IP_BLOCK_TYPE_GFX
,
3900 AMD_IP_BLOCK_TYPE_SDMA
,
3901 AMD_IP_BLOCK_TYPE_MES
,
3902 AMD_IP_BLOCK_TYPE_UVD
,
3903 AMD_IP_BLOCK_TYPE_VCE
,
3904 AMD_IP_BLOCK_TYPE_VCN
,
3905 AMD_IP_BLOCK_TYPE_JPEG
3908 for (i
= 0; i
< ARRAY_SIZE(ip_order
); i
++) {
3909 block
= amdgpu_device_ip_get_ip_block(adev
, ip_order
[i
]);
3914 if (block
->status
.valid
&& !block
->status
.hw
) {
3915 if (block
->version
->type
== AMD_IP_BLOCK_TYPE_SMC
) {
3916 r
= amdgpu_ip_block_resume(block
);
3918 r
= block
->version
->funcs
->hw_init(block
);
3922 dev_err(adev
->dev
, "RE-INIT-late: %s failed\n",
3923 block
->version
->funcs
->name
);
3926 block
->status
.hw
= true;
3934 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3936 * @adev: amdgpu_device pointer
3938 * First resume function for hardware IPs. The list of all the hardware
3939 * IPs that make up the asic is walked and the resume callbacks are run for
3940 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3941 * after a suspend and updates the software state as necessary. This
3942 * function is also used for restoring the GPU after a GPU reset.
3943 * Returns 0 on success, negative error code on failure.
3945 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device
*adev
)
3949 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
3950 if (!adev
->ip_blocks
[i
].status
.valid
|| adev
->ip_blocks
[i
].status
.hw
)
3952 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_COMMON
||
3953 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GMC
||
3954 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_IH
||
3955 (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_PSP
&& amdgpu_sriov_vf(adev
))) {
3957 r
= amdgpu_ip_block_resume(&adev
->ip_blocks
[i
]);
3967 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3969 * @adev: amdgpu_device pointer
3971 * Second resume function for hardware IPs. The list of all the hardware
3972 * IPs that make up the asic is walked and the resume callbacks are run for
3973 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3974 * functional state after a suspend and updates the software state as
3975 * necessary. This function is also used for restoring the GPU after a GPU
3977 * Returns 0 on success, negative error code on failure.
3979 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device
*adev
)
3983 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
3984 if (!adev
->ip_blocks
[i
].status
.valid
|| adev
->ip_blocks
[i
].status
.hw
)
3986 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_COMMON
||
3987 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_GMC
||
3988 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_IH
||
3989 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_DCE
||
3990 adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_PSP
)
3992 r
= amdgpu_ip_block_resume(&adev
->ip_blocks
[i
]);
4001 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
4003 * @adev: amdgpu_device pointer
4005 * Third resume function for hardware IPs. The list of all the hardware
4006 * IPs that make up the asic is walked and the resume callbacks are run for
4007 * all DCE. resume puts the hardware into a functional state after a suspend
4008 * and updates the software state as necessary. This function is also used
4009 * for restoring the GPU after a GPU reset.
4011 * Returns 0 on success, negative error code on failure.
4013 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device
*adev
)
4017 for (i
= 0; i
< adev
->num_ip_blocks
; i
++) {
4018 if (!adev
->ip_blocks
[i
].status
.valid
|| adev
->ip_blocks
[i
].status
.hw
)
4020 if (adev
->ip_blocks
[i
].version
->type
== AMD_IP_BLOCK_TYPE_DCE
) {
4021 r
= amdgpu_ip_block_resume(&adev
->ip_blocks
[i
]);
4031 * amdgpu_device_ip_resume - run resume for hardware IPs
4033 * @adev: amdgpu_device pointer
4035 * Main resume function for hardware IPs. The hardware IPs
4036 * are split into two resume functions because they are
4037 * also used in recovering from a GPU reset and some additional
4038 * steps need to be take between them. In this case (S3/S4) they are
4040 * Returns 0 on success, negative error code on failure.
4042 static int amdgpu_device_ip_resume(struct amdgpu_device
*adev
)
4046 r
= amdgpu_device_ip_resume_phase1(adev
);
4050 r
= amdgpu_device_fw_loading(adev
);
4054 r
= amdgpu_device_ip_resume_phase2(adev
);
4056 if (adev
->mman
.buffer_funcs_ring
->sched
.ready
)
4057 amdgpu_ttm_set_buffer_funcs_status(adev
, true);
4062 amdgpu_fence_driver_hw_init(adev
);
4064 r
= amdgpu_device_ip_resume_phase3(adev
);
/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

4093 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4095 * @pdev : pci device context
4096 * @asic_type: AMD asic type
4098 * Check if there is DC (new modesetting infrastructre) support for an asic.
4099 * returns true if DC has support, false if not.
4101 bool amdgpu_device_asic_has_dc_support(struct pci_dev
*pdev
,
4102 enum amd_asic_type asic_type
)
4104 switch (asic_type
) {
4105 #ifdef CONFIG_DRM_AMDGPU_SI
4109 /* chips with no display hardware */
4111 #if defined(CONFIG_DRM_AMD_DC)
4117 * We have systems in the wild with these ASICs that require
4118 * LVDS and VGA support which is not supported with DC.
4120 * Fallback to the non-DC driver here by default so as not to
4121 * cause regressions.
4123 #if defined(CONFIG_DRM_AMD_DC_SI)
4124 return amdgpu_dc
> 0;
4133 * We have systems in the wild with these ASICs that require
4134 * VGA support which is not supported with DC.
4136 * Fallback to the non-DC driver here by default so as not to
4137 * cause regressions.
4139 return amdgpu_dc
> 0;
4141 return amdgpu_dc
!= 0;
4147 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4154 * amdgpu_device_has_dc_support - check if dc is supported
4156 * @adev: amdgpu_device pointer
4158 * Returns true for supported, false for not supported
4160 bool amdgpu_device_has_dc_support(struct amdgpu_device
*adev
)
4162 if (adev
->enable_virtual_display
||
4163 (adev
->harvest_ip_mask
& AMD_HARVEST_IP_DMU_MASK
))
4166 return amdgpu_device_asic_has_dc_support(adev
->pdev
, adev
->asic_type
);
4169 static void amdgpu_device_xgmi_reset_func(struct work_struct
*__work
)
4171 struct amdgpu_device
*adev
=
4172 container_of(__work
, struct amdgpu_device
, xgmi_reset_work
);
4173 struct amdgpu_hive_info
*hive
= amdgpu_get_xgmi_hive(adev
);
4175 /* It's a bug to not have a hive within this function */
4180 * Use task barrier to synchronize all xgmi reset works across the
4181 * hive. task_barrier_enter and task_barrier_exit will block
4182 * until all the threads running the xgmi reset works reach
4183 * those points. task_barrier_full will do both blocks.
4185 if (amdgpu_asic_reset_method(adev
) == AMD_RESET_METHOD_BACO
) {
4187 task_barrier_enter(&hive
->tb
);
4188 adev
->asic_reset_res
= amdgpu_device_baco_enter(adev
);
4190 if (adev
->asic_reset_res
)
4193 task_barrier_exit(&hive
->tb
);
4194 adev
->asic_reset_res
= amdgpu_device_baco_exit(adev
);
4196 if (adev
->asic_reset_res
)
4199 amdgpu_ras_reset_error_count(adev
, AMDGPU_RAS_BLOCK__MMHUB
);
4202 task_barrier_full(&hive
->tb
);
4203 adev
->asic_reset_res
= amdgpu_asic_reset(adev
);
4207 if (adev
->asic_reset_res
)
4209 "ASIC reset failed with error, %d for drm dev, %s",
4210 adev
->asic_reset_res
, adev_to_drm(adev
)->unique
);
4211 amdgpu_put_xgmi_hive(hive
);
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default timeout for jobs is 10 sec
	 */
	adev->compute_timeout = adev->gfx_timeout = msecs_to_jiffies(10000);
	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		while ((timeout_setting = strsep(&input, ",")) &&
		       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			if (timeout == 0) {
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
				dev_warn(adev->dev, "lockup timeout disabled");
				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * There is only one value specified and
		 * it should apply to all non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}

/**
 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
 *
 * @adev: amdgpu_device pointer
 *
 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
 */
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		adev->ram_is_direct_mapped = true;
}

#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * return if IOMMU remapping bar address
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
		domain->type == IOMMU_DOMAIN_DMA_FQ))
		return true;

	return false;
}
#endif

static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		dev_info(adev->dev, "MCBP is enabled\n");
}


/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       uint32_t flags)
{
	struct pci_dev *pdev = adev->pdev;
	int i, r, tmp;
	bool px = false;
	u32 max_MBps;

	adev->shutdown = false;
	adev->flags = flags;

	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->harvest_ip_mask = 0x0;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	dev_info(adev->dev,
		 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* Mutex initialization is all done here so we
	 * can recall functions without running into locking issues.
	 */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->gfx.partition_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->pm.stable_pstate_ctx_lock);
	mutex_init(&adev->benchmark_mutex);
	mutex_init(&adev->gfx.reset_sem_mutex);
	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
	mutex_init(&adev->enforce_isolation_mutex);
	for (i = 0; i < MAX_XCP; ++i) {
		adev->isolation[i].spearhead = dma_fence_get_stub();
		amdgpu_sync_create(&adev->isolation[i].active);
		amdgpu_sync_create(&adev->isolation[i].prev);
	}
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);
	mutex_init(&adev->userq_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_LIST_HEAD(&adev->userq_mgr_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
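	/*
	 * Illustrative note (not from the original source): with an interval
	 * of (60 - 1) * HZ and a burst of 1, the ratelimit state above allows
	 * at most one throttling message every 59 seconds; combined with the
	 * roughly one-second cadence of the thermal throttling interrupt this
	 * works out to about one log line per minute while throttling persists.
	 */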

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	dev_info(adev->dev, "register mmio base: 0x%08X\n",
		 (uint32_t)adev->rmmio_base);
	dev_info(adev->dev, "register mmio size: %u\n",
		 (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before XGMI hive discovered
	 * (if any) and initialized to use reset sem and in_gpu reset flag
	 * early on during init and before calling to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidentally.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior*/
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 onwards don't rely on PCIe atomics; their internal
	 * path natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 *  E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		if (adev->gmc.xgmi.num_physical_nodes) {
			dev_info(adev->dev, "Pending hive reset.\n");
			amdgpu_set_init_level(adev,
					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
			   !amdgpu_device_has_display_hardware(adev)) {
			r = psp_gpu_reset(adev);
		} else {
			tmp = amdgpu_reset_method;
			/* It should do a default reset when loading or reloading the driver,
			 * regardless of the module parameter reset_method.
			 */
			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
			r = amdgpu_asic_reset(adev);
			amdgpu_reset_method = tmp;
		}

		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		dev_info(adev->dev, "GPU posting now...\n");
		r = amdgpu_device_asic_init(adev);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->is_atom_fw) {
		/* Initialize clocks */
		r = amdgpu_atomfirmware_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
	} else {
		/* Initialize clocks */
		r = amdgpu_atombios_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
		/* init i2c buses */
		amdgpu_i2c_init(adev);
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev_to_drm(adev));

	r = amdgpu_device_ip_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto release_ras_con;
	}

	amdgpu_fence_driver_hw_init(adev);

	dev_info(adev->dev,
		 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
		 adev->gfx.config.max_shader_engines,
		 adev->gfx.config.max_sh_per_se,
		 adev->gfx.config.max_cu_per_sh,
		 adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped because the
	 * gpu instance count would be too low.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/*
	 * Register these sysfs interfaces after `late_init`, since some of the
	 * operations performed in `late_init` might affect their creation.
	 */
	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = amdgpu_device_attr_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_sysfs_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Have stored pci confspace at hand for restore in sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(adev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	amdgpu_device_check_iommu_direct_map(adev);

	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive. */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{
	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "amdgpu: finishing device.\n");
	flush_delayed_work(&adev->delayed_init_work);

	if (adev->mman.initialized)
		drain_workqueue(adev->mman.bdev.wq);
	adev->shutdown = true;

	unregister_pm_notifier(&adev->pm_nb);

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_hw_fini(adev);

	if (adev->pm.sysfs_initialized)
		amdgpu_pm_sysfs_fini(adev);
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	amdgpu_device_attr_sysfs_fini(adev);
	amdgpu_fru_sysfs_fini(adev);

	amdgpu_reg_state_sysfs_fini(adev);
	amdgpu_xcp_sysfs_fini(adev);

	/* RAS features must be disabled before hw fini */
	amdgpu_ras_pre_fini(adev);

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_device_ip_fini_early(adev);

	amdgpu_irq_fini_hw(adev);

	if (adev->mman.initialized)
		ttm_device_clear_dma_mappings(&adev->mman.bdev);

	amdgpu_gart_dummy_page_fini(adev);

	if (drm_dev_is_unplugged(adev_to_drm(adev)))
		amdgpu_device_unmap_mmio(adev);
}

void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
	int i, idx;
	bool px;

	amdgpu_device_ip_fini(adev);
	amdgpu_fence_driver_sw_fini(adev);
	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
	adev->accel_working = false;
	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
	for (i = 0; i < MAX_XCP; ++i) {
		dma_fence_put(adev->isolation[i].spearhead);
		amdgpu_sync_free(&adev->isolation[i].active);
		amdgpu_sync_free(&adev->isolation[i].prev);
	}

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	amdgpu_i2c_fini(adev);

	if (amdgpu_emu_mode != 1)
		amdgpu_atombios_fini(adev);
	amdgpu_bios_release(adev);

	kfree(adev->fru_info);
	adev->fru_info = NULL;

	kfree(adev->xcp_mgr);
	adev->xcp_mgr = NULL;

	px = amdgpu_device_supports_px(adev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_unregister_client(adev->pdev);

	if (px)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_unregister(adev->pdev);

	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		drm_dev_exit(idx);
	}

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->mman.discovery_bin)
		amdgpu_discovery_fini(adev);

	amdgpu_reset_put_reset_domain(adev->reset_domain);
	adev->reset_domain = NULL;

	kfree(adev->pci_state);
	adev->pci_state = NULL;
}

/**
 * amdgpu_device_evict_resources - evict device resources
 * @adev: amdgpu device object
 *
 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
 * of the vram memory type. Mainly used for evicting device resources
 * at suspend time.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
{
	int ret;

	/* No need to evict vram on APUs unless going to S4 */
	if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
		return 0;

	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
	if (ret) {
		dev_warn(adev->dev, "evicting device resources failed\n");
		return ret;
	}

	if (adev->in_s4) {
		ret = ttm_device_prepare_hibernation(&adev->mman.bdev);
		if (ret)
			dev_err(adev->dev, "prepare hibernation failed, %d\n", ret);
	}

	return ret;
}

/**
 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
 * @nb: notifier block
 * @mode: suspend mode
 * @data: unused
 *
 * This function is called when the system is about to suspend or hibernate.
 * It is used to set the appropriate flags so that eviction can be optimized
 * in the pm prepare callback.
 */
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data)
{
	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);

	switch (mode) {
	case PM_HIBERNATION_PREPARE:
		adev->in_s4 = true;
		break;
	case PM_POST_HIBERNATION:
		adev->in_s4 = false;
		break;
	}

	return NOTIFY_DONE;
}
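/*
 * Illustrative note (not from the original source): the notifier only tracks
 * hibernation state. PM_HIBERNATION_PREPARE sets adev->in_s4 before the
 * driver's prepare callback runs, which makes amdgpu_device_evict_resources()
 * evict VRAM even on APUs and prepare the TTM device for hibernation;
 * PM_POST_HIBERNATION clears the flag again.
 */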

/**
 * amdgpu_device_prepare - prepare for device suspend
 *
 * @dev: drm dev pointer
 *
 * Prepare to put the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_prepare(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i, r;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* Evict the majority of BOs before starting suspend sequence */
	r = amdgpu_device_evict_resources(adev);
	if (r)
		return r;

	flush_delayed_work(&adev->gfx.gfx_off_delay_work);

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
			continue;
		r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_complete - complete power state transition
 *
 * @dev: drm dev pointer
 *
 * Undo the changes from amdgpu_device_prepare. This will be
 * called on all resume transitions, including those that failed.
 */
void amdgpu_device_complete(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].version->funcs->complete)
			continue;
		adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]);
	}
}

/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;

	if (amdgpu_sriov_vf(adev)) {
		if (!adev->in_s0ix && !adev->in_runpm)
			amdgpu_amdkfd_suspend_process(adev);
		amdgpu_virt_fini_data_exchange(adev);
		r = amdgpu_virt_request_full_gpu(adev, false);
		if (r)
			return r;
	}

	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
		dev_warn(adev->dev, "smart shift update failed\n");

	if (notify_clients)
		drm_client_dev_suspend(adev_to_drm(adev), false);

	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	amdgpu_device_ip_suspend_phase1(adev);

	if (!adev->in_s0ix) {
		amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
		amdgpu_userq_suspend(adev);
	}

	r = amdgpu_device_evict_resources(adev);
	if (r)
		return r;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_fence_driver_hw_fini(adev);

	amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	r = amdgpu_dpm_notify_rlc_state(adev, false);
	if (r)
		return r;

	return 0;
}

static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
{
	int r;
	unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;

	/* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
	 * may not work. The access could be blocked by nBIF protection as VF isn't in
	 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
	 * so that QEMU reprograms MSIX table.
	 */
	amdgpu_restore_msix(adev);

	r = adev->gfxhub.funcs->get_xgmi_info(adev);
	if (r)
		return r;

	dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
		 prev_physical_node_id, adev->gmc.xgmi.physical_node_id);

	adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
	adev->vm_manager.vram_base_offset +=
		adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
		r = amdgpu_virt_resume(adev);
		if (r)
			goto exit;
	}

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}

	if (!adev->in_s0ix) {
		r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
		if (r)
			goto exit;

		r = amdgpu_userq_resume(adev);
		if (r)
			goto exit;
	}

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));

exit:
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);

		if (!adev->in_s0ix && !r && !adev->in_runpm)
			r = amdgpu_amdkfd_resume_process(adev);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev), false);

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
		dev->dev->power.disable_depth++;
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
		dev->dev->power.disable_depth--;
	}

	amdgpu_vram_mgr_clear_reset_blocks(adev);
	adev->in_suspend = false;

	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
		dev_warn(adev->dev, "smart shift update failed\n");

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run. check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
 */
static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
{
	int i;
	bool asic_hang = false;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
			adev->ip_blocks[i].status.hang =
				adev->ip_blocks[i].version->funcs->check_soft_reset(
					&adev->ip_blocks[i]);
		if (adev->ip_blocks[i].status.hang) {
			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
			asic_hang = true;
		}
	}
	return asic_hang;
}

/**
 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary for a soft reset to succeed.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
 *
 * @adev: amdgpu_device pointer
 *
 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
 * reset is necessary to recover.
 * Returns true if a full asic reset is required, false if not.
 */
static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			if (adev->ip_blocks[i].status.hang) {
				dev_info(adev->dev, "Some block need full reset!\n");
				return true;
			}
		}
	}
	return false;
}

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 if it succeeded, otherwise an error code.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     struct amdgpu_reset_context *reset_context)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		if (!amdgpu_ras_get_fed_status(adev))
			amdgpu_virt_ready_to_reset(adev);
		amdgpu_virt_wait_reset(adev);
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);
	} else {
		r = amdgpu_virt_reset_gpu(adev);
	}
	if (r)
		return r;

	amdgpu_ras_clear_err_state(adev);
	amdgpu_irq_gpu_reset_resume_helper(adev);

	/* some sw clean up VF needs to do before recover */
	amdgpu_virt_post_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		return r;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		return r;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);
	if (hive)
		amdgpu_put_xgmi_hive(hive);
	if (r)
		return r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		return r;

	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
		amdgpu_inc_vram_lost(adev);

	/* need to be called during full access so we can't do it later like
	 * bare-metal does.
	 */
	amdgpu_amdkfd_post_reset(adev);
	amdgpu_virt_release_full_gpu(adev, true);

	/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
		amdgpu_ras_resume(adev);

	amdgpu_virt_ras_telemetry_post_reset(adev);

	return 0;
}

/**
 * amdgpu_device_has_job_running - check if there is any unfinished job
 *
 * @adev: amdgpu_device pointer
 *
 * Check if there is any job running on the device when the guest driver
 * receives an FLR notification from the host driver. If there are still jobs
 * running, the guest driver will not respond to the FLR reset. Instead, it
 * lets the job hit the timeout and then issues the reset request.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		if (amdgpu_fence_count_emitted(ring))
			return true;
	}
	return false;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* Skip soft reset check in fatal error mode */
	if (!amdgpu_ras_is_poison_mode_supported(adev))
		return true;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
		case CHIP_VERDE:
		case CHIP_TAHITI:
		case CHIP_PITCAIRN:
		case CHIP_OLAND:
		case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
		case CHIP_KAVERI:
		case CHIP_KABINI:
		case CHIP_MULLINS:
#endif
		case CHIP_CYAN_SKILLFISH:
			goto disabled;
		default:
			break;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* Cache the state before bus master disable. The saved config space
	 * values are used in other cases like restore after mode-2 reset.
	 */
	amdgpu_device_cache_pci_state(adev->pdev);

	/* disable BM */
	pci_clear_master(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		goto mode1_reset_failed;

	amdgpu_device_load_pci_state(adev->pdev);
	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto mode1_reset_failed;

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout) {
		ret = -ETIMEDOUT;
		goto mode1_reset_failed;
	}

	amdgpu_atombios_scratch_regs_engine_hung(adev, false);

	return 0;

mode1_reset_failed:
	dev_err(adev->dev, "GPU mode1 reset failed\n");
	return ret;
}

int amdgpu_device_link_reset(struct amdgpu_device *adev)
{
	int ret = 0;

	dev_info(adev->dev, "GPU link reset\n");

	if (!adev->pcie_reset_ctx.occurs_dpc)
		ret = amdgpu_dpm_link_reset(adev);

	if (ret)
		goto link_reset_failed;

	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto link_reset_failed;

	return 0;

link_reset_failed:
	dev_err(adev->dev, "GPU link reset failed\n");
	return ret;
}

int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_pre_reset(adev);

	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* Clear job fence from fence drv to avoid force_completion
		 * leaving NULL and vm flush fences in fence drv
		 */
		amdgpu_fence_driver_clear_job_fences(ring);

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	if (job)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
			dev_info(tmp_adev->dev, "Dumping IP State\n");
			/* Trigger ip dump before we reset the asic */
			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
					tmp_adev->ip_blocks[i].version->funcs
						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}

int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
{
	struct list_head *device_list_handle;
	bool full_reset, vram_lost = false;
	struct amdgpu_device *tmp_adev;
	int r = 0, init_level;

	device_list_handle = reset_context->reset_device_list;

	if (!device_list_handle)
		return -EINVAL;

	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * If it's reset on init, it's default init level, otherwise keep level
	 * as recovery level.
	 */
	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
	else
		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		amdgpu_set_init_level(tmp_adev, init_level);
		if (full_reset) {
			/* post card */
			amdgpu_ras_clear_err_state(tmp_adev);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

				if (vram_lost) {
					dev_info(tmp_adev->dev,
						 "VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_xcp_restore_partition_mode(
					tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				r = amdgpu_device_ip_resume_phase3(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked, as the reset was already
				 * completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_client_dev_resume(adev_to_drm(tmp_adev), false);

				/*
				 * The GPU enters a bad state once the number of faulty
				 * pages retired by ECC reaches the threshold, and RAS
				 * recovery is scheduled next. Add one check here to
				 * break recovery if it indeed exceeds the bad page
				 * threshold, and remind the user to retire this GPU
				 * or set a bigger bad_page_threshold value to work
				 * around it when probing the driver again.
				 */
				if (!amdgpu_ras_is_rma(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			/* IP init is complete now, set level as default */
			amdgpu_set_init_level(tmp_adev,
					      AMDGPU_INIT_LEVEL_DEFAULT);
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r)
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
		}

		if (r)
			tmp_adev->asic_reset_res = r;
	}

	return r;
}

int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);

	reset_context->reset_device_list = device_list_handle;
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq,
						&tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r)
				dev_err(tmp_adev->dev,
					"ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev,
						     AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	r = amdgpu_device_reinit_after_reset(reset_context);
	if (r == -EAGAIN)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	return r;
}

static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
	case AMD_RESET_METHOD_LINK:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if the audio device
	 * is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Considering 3S is
		 * the audio controller default autosuspend delay setting.
		 * 4S used here is guaranteed to cover that.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);
}

static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		ret |= amdgpu_device_bus_status_check(tmp_adev);
	}

	return ret;
}

static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
					  struct list_head *device_list,
					  struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;
	int r;

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
			if (adev->pcie_reset_ctx.occurs_dpc)
				tmp_adev->pcie_reset_ctx.in_link_reset = true;
		}
		if (!list_is_first(&adev->reset_list, device_list))
			list_rotate_to_front(&adev->reset_list, device_list);
	} else {
		list_add_tail(&adev->reset_list, device_list);
	}

	if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
		r = amdgpu_device_health_check(device_list);
		if (r)
			return r;
	}

	return 0;
}

static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (list_empty(device_list))
		return;
	tmp_adev =
		list_first_entry(device_list, struct amdgpu_device, reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
}

static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (list_empty(device_list))
		return;
	tmp_adev =
		list_first_entry(device_list, struct amdgpu_device, reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
}

static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
					  struct amdgpu_job *job,
					  struct amdgpu_reset_context *reset_context,
					  struct list_head *device_list,
					  struct amdgpu_hive_info *hive,
					  bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i;

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * The power domain of the graphics device is shared with
		 * the AZ power domain. Without this, we may change the
		 * audio hardware from behind the audio driver's back.
		 * That will trigger some audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			tmp_adev->pcie_reset_ctx.audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first
		 * And add them back after reset completed
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_client_dev_suspend(adev_to_drm(tmp_adev), false);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    (!adev->pcie_reset_ctx.occurs_dpc) &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}
}

static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
				    struct list_head *device_list,
				    struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
	int r = 0;

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		if (adev->pcie_reset_ctx.occurs_dpc)
			tmp_adev->no_hw_access = true;
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		if (adev->pcie_reset_ctx.occurs_dpc)
			tmp_adev->no_hw_access = false;
		/*TODO Should we stop ?*/
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed.*/
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {

		/* Bail out of reset early */
		if (amdgpu_ras_is_rma(adev))
			return -ENODEV;

		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
			amdgpu_ras_set_fed(adev, true);
			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		}

		r = amdgpu_device_reset_sriov(adev, reset_context);
		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
			amdgpu_virt_release_full_gpu(adev, true);
			goto retry;
		}
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

	list_for_each_entry(tmp_adev, device_list, reset_list) {
		/*
		 * Drop any pending non scheduler resets queued before reset is done.
		 * Any reset scheduled after this point would be valid. Scheduler resets
		 * were already dropped during drm_sched_stop and no new ones can come
		 * in before drm_sched_start.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	return r;
}

static int amdgpu_device_sched_resume(struct list_head *device_list,
				      struct amdgpu_reset_context *reset_context,
				      bool job_signaled)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, 0);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace?
			 * for a ras error, we should report GPU bad status instead of
			 * reset failure
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					 atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(tmp_adev,
							   AMDGPU_SS_DEV_D0))
				dev_warn(tmp_adev->dev,
					 "smart shift update failed\n");
		}
	}

	return r;
}

static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
				     struct list_head *device_list,
				     bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;

	list_for_each_entry(tmp_adev, device_list, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if kfd device is not initialized,
		 * need to bring up kfd here if it was not initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (tmp_adev->pcie_reset_ctx.audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}
}
6401 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6403 * @adev: amdgpu_device pointer
6404 * @job: which job trigger hang
6405 * @reset_context: amdgpu reset context pointer
6407 * Attempt to reset the GPU if it has hung (all asics).
6408 * Attempt to do soft-reset or full-reset and reinitialize Asic
6409 * Returns 0 for success or an error on failure.
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	int r = 0;
	bool need_emergency_restart = false;

	/*
	 * If it reaches here because of hang/timeout and a RAS error is
	 * detected at the same time, let RAS recovery take care of it.
	 */
	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
	    !amdgpu_sriov_vf(adev) &&
	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
		dev_dbg(adev->dev,
			"Gpu recovery from source: %d yielding to RAS error recovery handling",
			reset_context->src);
		return 0;
	}

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
	    amdgpu_ras_get_context(adev)->reboot) {
		dev_warn(adev->dev, "Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		 need_emergency_restart ? "jobs stop" : "reset");

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	INIT_LIST_HEAD(&device_list);

	if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
		goto end_reset;

	/* We need to lock reset domain only once both for XGMI and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

	amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
				      hive, need_emergency_restart);

	if (need_emergency_restart)
		goto skip_sched_resume;
	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
	if (r)
		goto recover_end;

skip_hw_reset:
	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
	if (r)
		goto recover_end;

skip_sched_resume:
	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);

recover_end:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);

	if (!r) {
		struct amdgpu_task_info *ti = NULL;

		if (job)
			ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);

		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
				     ti ? &ti->task : NULL);

		amdgpu_vm_put_task_info(ti);
	}

	return r;
}
/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}
/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}
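/*
 * Illustrative sketch: querying the two bandwidth helpers above the way
 * amdgpu_device_get_pcie_info() does further down; the log message exists
 * only for the example.
 */
static void __maybe_unused
amdgpu_example_log_link_caps(struct amdgpu_device *adev)
{
	enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
	enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;

	amdgpu_device_gpu_bandwidth(adev, &speed, &width);
	dev_info(adev->dev, "example: GPU link caps: speed enum %d, width x%d\n",
		 speed, width);
}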
/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width, link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);
	amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		/* asic caps */
		if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
		} else {
			switch (link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
		/* platform caps */
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
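/*
 * Illustrative sketch: once amdgpu_device_get_pcie_info() has populated the
 * cached masks, callers test bits instead of re-walking the PCI hierarchy;
 * the helper name here is hypothetical.
 */
static bool __maybe_unused
amdgpu_example_platform_supports_pcie_gen4(struct amdgpu_device *adev)
{
	return !!(adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
}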
/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * the peer device.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
	if (!p2p_access)
		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
			 pci_name(peer_adev->pdev));

	bool is_large_bar = adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);

	if (!p2p_addressable) {
		uint64_t address_mask = peer_adev->dev->dma_mask ?
			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
		resource_size_t aper_limit =
			adev->gmc.aper_base + adev->gmc.aper_size - 1;

		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
				    aper_limit & address_mask);
	}
	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
#else
	return false;
#endif
}
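/*
 * Illustrative sketch: P2P DMA is normally only set up when the check above
 * passes in both directions; the wrapper name is hypothetical.
 */
static bool __maybe_unused
amdgpu_example_can_use_p2p(struct amdgpu_device *a, struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}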
int amdgpu_device_baco_enter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}
int amdgpu_device_baco_exit(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
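/*
 * Illustrative sketch: BACO enter/exit are used as a pair, e.g. around a
 * runtime-suspend cycle; error handling is reduced to the minimum here and
 * the wrapper name is hypothetical.
 */
static int __maybe_unused
amdgpu_example_baco_cycle(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_baco_enter(adev);
	if (r)
		return r;

	/* ... the device sits in BACO until a wake event ... */

	return amdgpu_device_baco_exit(adev);
}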
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
	struct amdgpu_reset_context reset_context;
	struct list_head device_list;

	dev_info(adev->dev, "PCI error: detected callback!!\n");

	if (!amdgpu_dpm_is_link_reset_supported(adev)) {
		dev_warn(adev->dev, "No support for XGMI hive yet...\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		/* Fatal error, prepare for slot reset */
		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);

		if (hive)
			mutex_lock(&hive->hive_lock);
		adev->pcie_reset_ctx.occurs_dpc = true;
		memset(&reset_context, 0, sizeof(reset_context));
		INIT_LIST_HEAD(&device_list);

		amdgpu_device_recovery_prepare(adev, &device_list, hive);
		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
		amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
					      hive, false);
		if (hive) {
			mutex_unlock(&hive->hive_lock);
			amdgpu_put_xgmi_hive(hive);
		}
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}
/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);

	dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */
	return PCI_ERS_RESULT_RECOVERED;
}
/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_reset_context reset_context;
	struct amdgpu_device *tmp_adev;
	struct amdgpu_hive_info *hive = NULL;
	struct list_head device_list;
	int r = 0, i;
	u32 memsize;

	/* PCI error slot reset should be skipped During RAS recovery */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return PCI_ERS_RESULT_RECOVERED;

	dev_info(adev->dev, "PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	/* wait for asic to come out of reset */
	msleep(700);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
	INIT_LIST_HEAD(&device_list);

	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		mutex_lock(&hive->hive_lock);
		reset_context.hive = hive;
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			tmp_adev->pcie_reset_ctx.in_link_reset = true;
			list_add_tail(&tmp_adev->reset_list, &device_list);
		}
	} else {
		set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
		list_add_tail(&adev->reset_list, &device_list);
	}

	r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);
		dev_info(adev->dev, "PCIe error recovery succeeded\n");
	} else {
		dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
		list_for_each_entry(tmp_adev, &device_list, reset_list)
			amdgpu_device_unset_mp1_state(tmp_adev);
		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	}

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct list_head device_list;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;

	dev_info(adev->dev, "PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	INIT_LIST_HEAD(&device_list);

	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		mutex_lock(&hive->hive_lock);
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			tmp_adev->pcie_reset_ctx.in_link_reset = false;
			list_add_tail(&tmp_adev->reset_list, &device_list);
		}
	} else {
		list_add_tail(&adev->reset_list, &device_list);
	}

	amdgpu_device_sched_resume(&device_list, NULL, NULL);
	amdgpu_device_gpu_resume(adev, &device_list, false);
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	adev->pcie_reset_ctx.occurs_dpc = false;

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}
}
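/*
 * Illustrative sketch: how the four callbacks above are typically tied
 * together through a struct pci_error_handlers in the driver registration
 * code (amdgpu_drv.c in this driver); the variable name here is only for
 * the example.
 */
static const struct pci_error_handlers __maybe_unused
amdgpu_example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};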
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			dev_err(adev->dev, "Failed to store PCI saved state");
			return false;
		}
	} else {
		dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
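/*
 * Illustrative sketch: the cache/load helpers above are meant to be used as
 * a pair around an ASIC reset so the PCI config space can be restored
 * afterwards; the wrapper is hypothetical.
 */
static void __maybe_unused
amdgpu_example_pci_state_roundtrip(struct amdgpu_device *adev)
{
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		return;

	/* ... reset or power transition happens here ... */

	amdgpu_device_load_pci_state(adev->pdev);
}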
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
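/*
 * Illustrative sketch: a read-modify-write of a PCIe port register using the
 * two accessors above; the register offset and mask are made up.
 */
static void __maybe_unused
amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev, u32 reg, u32 set_mask)
{
	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);

	tmp |= set_mask;
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}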
/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}
/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}
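/*
 * Illustrative sketch: a non-NULL return from amdgpu_device_switch_gang()
 * is a reference to the still-running previous gang leader which the caller
 * must wait on (or schedule behind) and then release.  A blocking wait is
 * used here only to keep the example short.
 */
static void __maybe_unused
amdgpu_example_switch_gang_sync(struct amdgpu_device *adev,
				struct dma_fence *gang)
{
	struct dma_fence *old = amdgpu_device_switch_gang(adev, gang);

	if (old) {
		dma_fence_wait(old, false);
		dma_fence_put(old);
	}
}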
/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * it from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * over clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);

out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}
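/*
 * Illustrative sketch: the job preparation path keeps asking for isolation
 * dependencies until NULL is returned.  The real code hands each fence back
 * to the scheduler instead of blocking; the loop below only illustrates the
 * contract and the function name is hypothetical.
 */
static void __maybe_unused
amdgpu_example_wait_isolation(struct amdgpu_device *adev,
			      struct amdgpu_ring *ring,
			      struct amdgpu_job *job)
{
	struct dma_fence *dep;

	while ((dep = amdgpu_device_enforce_isolation(adev, ring, job))) {
		dma_fence_wait(dep, false);
		dma_fence_put(dep);
	}
}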
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(adev->dev,
				 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
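/*
 * Illustrative sketch: polling for a status bit with the helper above; the
 * instance, register offset, name and bit mask are made up for the example.
 */
static uint32_t __maybe_unused
amdgpu_example_poll_status(struct amdgpu_device *adev, uint32_t reg_addr)
{
	/* wait until bit 0 of the (hypothetical) status register reads 1 */
	return amdgpu_device_wait_on_rreg(adev, 0, reg_addr, "EXAMPLE_STATUS",
					  0x1, 0x1);
}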
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}
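/*
 * Illustrative sketch: a ring init path can record the reset types computed
 * above so that a sysfs attribute can later report them; the
 * gfx_supported_reset field used here is an assumption for the example.
 */
static void __maybe_unused
amdgpu_example_record_ring_resets(struct amdgpu_ring *ring)
{
	ring->adev->gfx.gfx_supported_reset =
		amdgpu_get_soft_full_reset_mask(ring);
}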
ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");