git.ipfire.org Git - thirdparty/kernel/stable.git/blob

1 /*

5 *

6 * Permission is hereby granted, free of charge, to any person obtaining a

7 * copy of this software and associated documentation files (the "Software"),

8 * to deal in the Software without restriction, including without limitation

9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,

10 * and/or sell copies of the Software, and to permit persons to whom the

11 * Software is furnished to do so, subject to the following conditions:

12 *

13 * The above copyright notice and this permission notice shall be included in

14 * all copies or substantial portions of the Software.

15 *

16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL

19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR

20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,

21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR

22 * OTHER DEALINGS IN THE SOFTWARE.

23 *

24 * Authors: Dave Airlie

25 * Alex Deucher

26 * Jerome Glisse

27 */

28 #include <linux/power_supply.h>

29 #include <linux/kthread.h>

30 #include <linux/module.h>

31 #include <linux/console.h>

32 #include <linux/slab.h>

33 #include <linux/iommu.h>

34 #include <linux/pci.h>

36 #include <drm/drm_atomic_helper.h>

37 #include <drm/drm_probe_helper.h>

38 #include <drm/amdgpu_drm.h>

39 #include <linux/vgaarb.h>

40 #include <linux/vga_switcheroo.h>

41 #include <linux/efi.h>

42 #include "amdgpu.h"

43 #include "amdgpu_trace.h"

44 #include "amdgpu_i2c.h"

45 #include "atom.h"

46 #include "amdgpu_atombios.h"

47 #include "amdgpu_atomfirmware.h"

48 #include "amd_pcie.h"

49 #ifdef CONFIG_DRM_AMDGPU_SI

50 #include "si.h"

51 #endif

52 #ifdef CONFIG_DRM_AMDGPU_CIK

53 #include "cik.h"

54 #endif

55 #include "vi.h"

56 #include "soc15.h"

57 #include "nv.h"

58 #include "bif/bif_4_1_d.h"

59 #include <linux/pci.h>

60 #include <linux/firmware.h>

61 #include "amdgpu_vf_error.h"

63 #include "amdgpu_amdkfd.h"

64 #include "amdgpu_pm.h"

66 #include "amdgpu_xgmi.h"

67 #include "amdgpu_ras.h"

68 #include "amdgpu_pmu.h"

69 #include "amdgpu_fru_eeprom.h"

70 #include "amdgpu_reset.h"

72 #include <linux/suspend.h>

73 #include <drm/task_barrier.h>

74 #include <linux/pm_runtime.h>

76 #include <drm/drm_drv.h>

78 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");

79 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");

80 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");

81 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");

82 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");

83 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");

84 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");

85 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");

86 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");

87 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

88 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");

89 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

91 #define AMDGPU_RESUME_MS 2000

/*
 * Printable ASIC names.
 * NOTE(review): presumably indexed by the ASIC type enum, so the order of
 * the entries must not change - confirm against the enum definition.
 */
const char *amdgpu_asic_name[] = {
	"TAHITI", "PITCAIRN", "VERDE", "OLAND", "HAINAN",
	"BONAIRE", "KAVERI", "KABINI", "HAWAII", "MULLINS",
	"TOPAZ", "TONGA", "FIJI", "CARRIZO", "STONEY",
	"POLARIS10", "POLARIS11", "POLARIS12", "VEGAM",
	"VEGA10", "VEGA12", "VEGA20", "RAVEN", "ARCTURUS", "RENOIR",
	"ALDEBARAN",
	"NAVI10", "CYAN_SKILLFISH", "NAVI14", "NAVI12",
	"SIENNA_CICHLID", "NAVY_FLOUNDER", "VANGOGH",
	"DIMGREY_CAVEFISH", "BEIGE_GOBY", "YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

133

134 /**

135 * DOC: pcie_replay_count

136 *

137 * The amdgpu driver provides a sysfs API for reporting the total number

138 * of PCIe replays (NAKs)

139 * The file pcie_replay_count is used for this and returns the total

140 * number of replays as a sum of the NAKs generated and NAKs received

141 */

142

143 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,

144 struct device_attribute *attr, char *buf)

145 {

146 struct drm_device *ddev = dev_get_drvdata(dev);

147 struct amdgpu_device *adev = drm_to_adev(ddev);

148 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

149

150 return sysfs_emit(buf, "%llu\n", cnt);

151 }

152

153 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,

154 amdgpu_device_get_pcie_replay_count, NULL);

155

156 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

157

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

167

168 static ssize_t amdgpu_device_get_product_name(struct device *dev,

169 struct device_attribute *attr, char *buf)

170 {

171 struct drm_device *ddev = dev_get_drvdata(dev);

172 struct amdgpu_device *adev = drm_to_adev(ddev);

173

174 return sysfs_emit(buf, "%s\n", adev->product_name);

175 }

176

177 static DEVICE_ATTR(product_name, S_IRUGO,

178 amdgpu_device_get_product_name, NULL);

179

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

189

190 static ssize_t amdgpu_device_get_product_number(struct device *dev,

191 struct device_attribute *attr, char *buf)

192 {

193 struct drm_device *ddev = dev_get_drvdata(dev);

194 struct amdgpu_device *adev = drm_to_adev(ddev);

195

196 return sysfs_emit(buf, "%s\n", adev->product_number);

197 }

198

199 static DEVICE_ATTR(product_number, S_IRUGO,

200 amdgpu_device_get_product_number, NULL);

201

202 /**

203 * DOC: serial_number

204 *

205 * The amdgpu driver provides a sysfs API for reporting the serial number

206 * for the device

207 * The file serial_number is used for this and returns the serial number

208 * as returned from the FRU.

209 * NOTE: This is only available for certain server cards

210 */

211

212 static ssize_t amdgpu_device_get_serial_number(struct device *dev,

213 struct device_attribute *attr, char *buf)

214 {

215 struct drm_device *ddev = dev_get_drvdata(dev);

216 struct amdgpu_device *adev = drm_to_adev(ddev);

217

218 return sysfs_emit(buf, "%s\n", adev->serial);

219 }

220

221 static DEVICE_ATTR(serial_number, S_IRUGO,

222 amdgpu_device_get_serial_number, NULL);

223

224 /**

225 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control

226 *

227 * @dev: drm_device pointer

228 *

229 * Returns true if the device is a dGPU with ATPX power control,

230 * otherwise return false.

231 */

232 bool amdgpu_device_supports_px(struct drm_device *dev)

233 {

234 struct amdgpu_device *adev = drm_to_adev(dev);

235

236 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())

237 return true;

238 return false;

239 }

240

241 /**

242 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources

243 *

244 * @dev: drm_device pointer

245 *

246 * Returns true if the device is a dGPU with ACPI power control,

247 * otherwise return false.

248 */

249 bool amdgpu_device_supports_boco(struct drm_device *dev)

250 {

251 struct amdgpu_device *adev = drm_to_adev(dev);

252

253 if (adev->has_pr3 ||

254 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))

255 return true;

256 return false;

257 }

258

259 /**

260 * amdgpu_device_supports_baco - Does the device support BACO

261 *

262 * @dev: drm_device pointer

263 *

264 * Returns true if the device supporte BACO,

265 * otherwise return false.

266 */

267 bool amdgpu_device_supports_baco(struct drm_device *dev)

268 {

269 struct amdgpu_device *adev = drm_to_adev(dev);

270

271 return amdgpu_asic_supports_baco(adev);

272 }

273

274 /**

275 * amdgpu_device_supports_smart_shift - Is the device dGPU with

276 * smart shift support

277 *

278 * @dev: drm_device pointer

279 *

280 * Returns true if the device is a dGPU with Smart Shift support,

281 * otherwise returns false.

282 */

283 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)

284 {

285 return (amdgpu_device_supports_boco(dev) &&

286 amdgpu_acpi_is_power_shift_control_supported());

287 }

288

289 /*

290 * VRAM access helper functions

291 */

292

293 /**

294 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA

295 *

296 * @adev: amdgpu_device pointer

297 * @pos: offset of the buffer in vram

298 * @buf: virtual address of the buffer in system memory

299 * @size: read/write size, sizeof(@buf) must > @size

300 * @write: true - write to vram, otherwise - read from vram

301 */

302 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,

303 void *buf, size_t size, bool write)

304 {

305 unsigned long flags;

306 uint32_t hi = ~0, tmp = 0;

307 uint32_t *data = buf;

308 uint64_t last;

309 int idx;

310

311 if (!drm_dev_enter(adev_to_drm(adev), &idx))

312 return;

313

314 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

315

316 spin_lock_irqsave(&adev->mmio_idx_lock, flags);

317 for (last = pos + size; pos < last; pos += 4) {

318 tmp = pos >> 31;

319

320 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);

321 if (tmp != hi) {

322 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);

323 hi = tmp;

324 }

325 if (write)

326 WREG32_NO_KIQ(mmMM_DATA, *data++);

327 else

328 *data++ = RREG32_NO_KIQ(mmMM_DATA);

329 }

330

331 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);

332 drm_dev_exit(idx);

333 }

334

335 /**

336 * amdgpu_device_aper_access - access vram by vram aperature

337 *

338 * @adev: amdgpu_device pointer

339 * @pos: offset of the buffer in vram

340 * @buf: virtual address of the buffer in system memory

341 * @size: read/write size, sizeof(@buf) must > @size

342 * @write: true - write to vram, otherwise - read from vram

343 *

344 * The return value means how many bytes have been transferred.

345 */

346 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,

347 void *buf, size_t size, bool write)

348 {

349 #ifdef CONFIG_64BIT

350 void __iomem *addr;

351 size_t count = 0;

352 uint64_t last;

353

354 if (!adev->mman.aper_base_kaddr)

355 return 0;

356

357 last = min(pos + size, adev->gmc.visible_vram_size);

358 if (last > pos) {

359 addr = adev->mman.aper_base_kaddr + pos;

360 count = last - pos;

361

362 if (write) {

363 memcpy_toio(addr, buf, count);

364 mb();

365 amdgpu_device_flush_hdp(adev, NULL);

366 } else {

367 amdgpu_device_invalidate_hdp(adev, NULL);

368 mb();

369 memcpy_fromio(buf, addr, count);

370 }

371

372 }

373

374 return count;

375 #else

376 return 0;

377 #endif

378 }

379

380 /**

381 * amdgpu_device_vram_access - read/write a buffer in vram

382 *

383 * @adev: amdgpu_device pointer

384 * @pos: offset of the buffer in vram

385 * @buf: virtual address of the buffer in system memory

386 * @size: read/write size, sizeof(@buf) must > @size

387 * @write: true - write to vram, otherwise - read from vram

388 */

389 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,

390 void *buf, size_t size, bool write)

391 {

392 size_t count;

393

394 /* try to using vram apreature to access vram first */

395 count = amdgpu_device_aper_access(adev, pos, buf, size, write);

396 size -= count;

397 if (size) {

398 /* using MM to access rest vram */

399 pos += count;

400 buf += count;

401 amdgpu_device_mm_access(adev, pos, buf, size, write);

402 }

403 }

404

405 /*

406 * register access helper functions.

407 */

408

409 /* Check if hw access should be skipped because of hotplug or device error */

410 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)

411 {

412 if (adev->no_hw_access)

413 return true;

414

415 #ifdef CONFIG_LOCKDEP

416 /*

417 * This is a bit complicated to understand, so worth a comment. What we assert

418 * here is that the GPU reset is not running on another thread in parallel.

419 *

420 * For this we trylock the read side of the reset semaphore, if that succeeds

421 * we know that the reset is not running in paralell.

422 *

423 * If the trylock fails we assert that we are either already holding the read

424 * side of the lock or are the reset thread itself and hold the write side of

425 * the lock.

426 */

427 if (in_task()) {

428 if (down_read_trylock(&adev->reset_sem))

429 up_read(&adev->reset_sem);

430 else

431 lockdep_assert_held(&adev->reset_sem);

432 }

433 #endif

434 return false;

435 }

436

437 /**

438 * amdgpu_device_rreg - read a memory mapped IO or indirect register

439 *

440 * @adev: amdgpu_device pointer

441 * @reg: dword aligned register offset

442 * @acc_flags: access flags which require special behavior

443 *

444 * Returns the 32 bit value from the offset specified.

445 */

446 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,

447 uint32_t reg, uint32_t acc_flags)

448 {

449 uint32_t ret;

450

451 if (amdgpu_device_skip_hw_access(adev))

452 return 0;

453

454 if ((reg * 4) < adev->rmmio_size) {

455 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&

456 amdgpu_sriov_runtime(adev) &&

457 down_read_trylock(&adev->reset_sem)) {

458 ret = amdgpu_kiq_rreg(adev, reg);

459 up_read(&adev->reset_sem);

460 } else {

461 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));

462 }

463 } else {

464 ret = adev->pcie_rreg(adev, reg * 4);

465 }

466

467 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

468

469 return ret;

470 }

471

472 /*

473 * MMIO register read with bytes helper functions

474 * @offset:bytes offset from MMIO start

475 *

476 */

477

478 /**

479 * amdgpu_mm_rreg8 - read a memory mapped IO register

480 *

481 * @adev: amdgpu_device pointer

482 * @offset: byte aligned register offset

483 *

484 * Returns the 8 bit value from the offset specified.

485 */

486 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)

487 {

488 if (amdgpu_device_skip_hw_access(adev))

489 return 0;

490

491 if (offset < adev->rmmio_size)

492 return (readb(adev->rmmio + offset));

493 BUG();

494 }

495

496 /*

497 * MMIO register write with bytes helper functions

498 * @offset:bytes offset from MMIO start

499 * @value: the value want to be written to the register

500 *

501 */

502 /**

503 * amdgpu_mm_wreg8 - read a memory mapped IO register

504 *

505 * @adev: amdgpu_device pointer

506 * @offset: byte aligned register offset

507 * @value: 8 bit value to write

508 *

509 * Writes the value specified to the offset specified.

510 */

511 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)

512 {

513 if (amdgpu_device_skip_hw_access(adev))

514 return;

515

516 if (offset < adev->rmmio_size)

517 writeb(value, adev->rmmio + offset);

518 else

519 BUG();

520 }

521

522 /**

523 * amdgpu_device_wreg - write to a memory mapped IO or indirect register

524 *

525 * @adev: amdgpu_device pointer

526 * @reg: dword aligned register offset

527 * @v: 32 bit value to write to the register

528 * @acc_flags: access flags which require special behavior

529 *

530 * Writes the value specified to the offset specified.

531 */

532 void amdgpu_device_wreg(struct amdgpu_device *adev,

533 uint32_t reg, uint32_t v,

534 uint32_t acc_flags)

535 {

536 if (amdgpu_device_skip_hw_access(adev))

537 return;

538

539 if ((reg * 4) < adev->rmmio_size) {

540 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&

541 amdgpu_sriov_runtime(adev) &&

542 down_read_trylock(&adev->reset_sem)) {

543 amdgpu_kiq_wreg(adev, reg, v);

544 up_read(&adev->reset_sem);

545 } else {

546 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));

547 }

548 } else {

549 adev->pcie_wreg(adev, reg * 4, v);

550 }

551

552 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);

553 }

554

555 /**

556 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range

557 *

558 * this function is invoked only the debugfs register access

559 */

560 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,

561 uint32_t reg, uint32_t v)

562 {

563 if (amdgpu_device_skip_hw_access(adev))

564 return;

565

566 if (amdgpu_sriov_fullaccess(adev) &&

567 adev->gfx.rlc.funcs &&

568 adev->gfx.rlc.funcs->is_rlcg_access_range) {

569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))

570 return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);

571 } else if ((reg * 4) >= adev->rmmio_size) {

572 adev->pcie_wreg(adev, reg * 4, v);

573 } else {

574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));

575 }

576 }

577

578 /**

579 * amdgpu_mm_rdoorbell - read a doorbell dword

580 *

581 * @adev: amdgpu_device pointer

582 * @index: doorbell index

583 *

584 * Returns the value in the doorbell aperture at the

585 * requested doorbell index (CIK).

586 */

587 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)

588 {

589 if (amdgpu_device_skip_hw_access(adev))

590 return 0;

591

592 if (index < adev->doorbell.num_doorbells) {

593 return readl(adev->doorbell.ptr + index);

594 } else {

595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);

596 return 0;

597 }

598 }

599

600 /**

601 * amdgpu_mm_wdoorbell - write a doorbell dword

602 *

603 * @adev: amdgpu_device pointer

604 * @index: doorbell index

605 * @v: value to write

606 *

607 * Writes @v to the doorbell aperture at the

608 * requested doorbell index (CIK).

609 */

610 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)

611 {

612 if (amdgpu_device_skip_hw_access(adev))

613 return;

614

615 if (index < adev->doorbell.num_doorbells) {

616 writel(v, adev->doorbell.ptr + index);

617 } else {

618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);

619 }

620 }

621

622 /**

623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword

624 *

625 * @adev: amdgpu_device pointer

626 * @index: doorbell index

627 *

628 * Returns the value in the doorbell aperture at the

629 * requested doorbell index (VEGA10+).

630 */

631 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)

632 {

633 if (amdgpu_device_skip_hw_access(adev))

634 return 0;

635

636 if (index < adev->doorbell.num_doorbells) {

637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));

638 } else {

639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);

640 return 0;

641 }

642 }

643

644 /**

645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword

646 *

647 * @adev: amdgpu_device pointer

648 * @index: doorbell index

649 * @v: value to write

650 *

651 * Writes @v to the doorbell aperture at the

652 * requested doorbell index (VEGA10+).

653 */

654 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)

655 {

656 if (amdgpu_device_skip_hw_access(adev))

657 return;

658

659 if (index < adev->doorbell.num_doorbells) {

660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);

661 } else {

662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);

663 }

664 }

665

666 /**

667 * amdgpu_device_indirect_rreg - read an indirect register

668 *

669 * @adev: amdgpu_device pointer

670 * @pcie_index: mmio register offset

671 * @pcie_data: mmio register offset

672 * @reg_addr: indirect register address to read from

673 *

674 * Returns the value of indirect register @reg_addr

675 */

676 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,

677 u32 pcie_index, u32 pcie_data,

678 u32 reg_addr)

679 {

680 unsigned long flags;

681 u32 r;

682 void __iomem *pcie_index_offset;

683 void __iomem *pcie_data_offset;

684

685 spin_lock_irqsave(&adev->pcie_idx_lock, flags);

686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;

687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

688

689 writel(reg_addr, pcie_index_offset);

690 readl(pcie_index_offset);

691 r = readl(pcie_data_offset);

692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

693

694 return r;

695 }

696

697 /**

698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register

699 *

700 * @adev: amdgpu_device pointer

701 * @pcie_index: mmio register offset

702 * @pcie_data: mmio register offset

703 * @reg_addr: indirect register address to read from

704 *

705 * Returns the value of indirect register @reg_addr

706 */

707 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,

708 u32 pcie_index, u32 pcie_data,

709 u32 reg_addr)

710 {

711 unsigned long flags;

712 u64 r;

713 void __iomem *pcie_index_offset;

714 void __iomem *pcie_data_offset;

715

716 spin_lock_irqsave(&adev->pcie_idx_lock, flags);

717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;

718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

719

720 /* read low 32 bits */

721 writel(reg_addr, pcie_index_offset);

722 readl(pcie_index_offset);

723 r = readl(pcie_data_offset);

724 /* read high 32 bits */

725 writel(reg_addr + 4, pcie_index_offset);

726 readl(pcie_index_offset);

727 r |= ((u64)readl(pcie_data_offset) << 32);

728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

729

730 return r;

731 }

732

733 /**

734 * amdgpu_device_indirect_wreg - write an indirect register address

735 *

736 * @adev: amdgpu_device pointer

737 * @pcie_index: mmio register offset

738 * @pcie_data: mmio register offset

739 * @reg_addr: indirect register offset

740 * @reg_data: indirect register data

741 *

742 */

743 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,

744 u32 pcie_index, u32 pcie_data,

745 u32 reg_addr, u32 reg_data)

746 {

747 unsigned long flags;

748 void __iomem *pcie_index_offset;

749 void __iomem *pcie_data_offset;

750

751 spin_lock_irqsave(&adev->pcie_idx_lock, flags);

752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;

753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

754

755 writel(reg_addr, pcie_index_offset);

756 readl(pcie_index_offset);

757 writel(reg_data, pcie_data_offset);

758 readl(pcie_data_offset);

759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

760 }

761

762 /**

763 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address

764 *

765 * @adev: amdgpu_device pointer

766 * @pcie_index: mmio register offset

767 * @pcie_data: mmio register offset

768 * @reg_addr: indirect register offset

769 * @reg_data: indirect register data

770 *

771 */

772 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,

773 u32 pcie_index, u32 pcie_data,

774 u32 reg_addr, u64 reg_data)

775 {

776 unsigned long flags;

777 void __iomem *pcie_index_offset;

778 void __iomem *pcie_data_offset;

779

780 spin_lock_irqsave(&adev->pcie_idx_lock, flags);

781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;

782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

783

784 /* write low 32 bits */

785 writel(reg_addr, pcie_index_offset);

786 readl(pcie_index_offset);

787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);

788 readl(pcie_data_offset);

789 /* write high 32 bits */

790 writel(reg_addr + 4, pcie_index_offset);

791 readl(pcie_index_offset);

792 writel((u32)(reg_data >> 32), pcie_data_offset);

793 readl(pcie_data_offset);

794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

795 }

796

797 /**

798 * amdgpu_invalid_rreg - dummy reg read function

799 *

800 * @adev: amdgpu_device pointer

801 * @reg: offset of register

802 *

803 * Dummy register read function. Used for register blocks

804 * that certain asics don't have (all asics).

805 * Returns the value in the register.

806 */

807 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)

808 {

809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);

810 BUG();

811 return 0;

812 }

813

814 /**

815 * amdgpu_invalid_wreg - dummy reg write function

816 *

817 * @adev: amdgpu_device pointer

818 * @reg: offset of register

819 * @v: value to write to the register

820 *

821 * Dummy register read function. Used for register blocks

822 * that certain asics don't have (all asics).

823 */

824 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)

825 {

826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",

827 reg, v);

828 BUG();

829 }

830

831 /**

832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function

833 *

834 * @adev: amdgpu_device pointer

835 * @reg: offset of register

836 *

837 * Dummy register read function. Used for register blocks

838 * that certain asics don't have (all asics).

839 * Returns the value in the register.

840 */

841 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)

842 {

843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);

844 BUG();

845 return 0;

846 }

847

848 /**

849 * amdgpu_invalid_wreg64 - dummy reg write function

850 *

851 * @adev: amdgpu_device pointer

852 * @reg: offset of register

853 * @v: value to write to the register

854 *

855 * Dummy register read function. Used for register blocks

856 * that certain asics don't have (all asics).

857 */

858 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)

859 {

860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",

861 reg, v);

862 BUG();

863 }

864

865 /**

866 * amdgpu_block_invalid_rreg - dummy reg read function

867 *

868 * @adev: amdgpu_device pointer

869 * @block: offset of instance

870 * @reg: offset of register

871 *

872 * Dummy register read function. Used for register blocks

873 * that certain asics don't have (all asics).

874 * Returns the value in the register.

875 */

876 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,

877 uint32_t block, uint32_t reg)

878 {

879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",

880 reg, block);

881 BUG();

882 return 0;

883 }

884

885 /**

886 * amdgpu_block_invalid_wreg - dummy reg write function

887 *

888 * @adev: amdgpu_device pointer

889 * @block: offset of instance

890 * @reg: offset of register

891 * @v: value to write to the register

892 *

893 * Dummy register read function. Used for register blocks

894 * that certain asics don't have (all asics).

895 */

896 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,

897 uint32_t block,

898 uint32_t reg, uint32_t v)

899 {

900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",

901 reg, block, v);

902 BUG();

903 }

904

905 /**

906 * amdgpu_device_asic_init - Wrapper for atom asic_init

907 *

908 * @adev: amdgpu_device pointer

909 *

910 * Does any asic specific work and then calls atom asic init.

911 */

912 static int amdgpu_device_asic_init(struct amdgpu_device *adev)

913 {

914 amdgpu_asic_pre_asic_init(adev);

915

916 return amdgpu_atom_asic_init(adev->mode_info.atom_context);

917 }

918

919 /**

920 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page

921 *

922 * @adev: amdgpu_device pointer

923 *

924 * Allocates a scratch page of VRAM for use by various things in the

925 * driver.

926 */

927 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)

928 {

929 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,

930 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,

931 &adev->vram_scratch.robj,

932 &adev->vram_scratch.gpu_addr,

933 (void **)&adev->vram_scratch.ptr);

934 }

935

936 /**

937 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page

938 *

939 * @adev: amdgpu_device pointer

940 *

941 * Frees the VRAM scratch page.

942 */

943 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)

944 {

945 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);

946 }

947

948 /**

949 * amdgpu_device_program_register_sequence - program an array of registers.

950 *

951 * @adev: amdgpu_device pointer

952 * @registers: pointer to the register array

953 * @array_size: size of the register array

954 *

955 * Programs an array or registers with and and or masks.

956 * This is a helper for setting golden registers.

957 */

958 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,

959 const u32 *registers,

960 const u32 array_size)

961 {

962 u32 tmp, reg, and_mask, or_mask;

963 int i;

964

965 if (array_size % 3)

966 return;

967

968 for (i = 0; i < array_size; i +=3) {

969 reg = registers[i + 0];

970 and_mask = registers[i + 1];

971 or_mask = registers[i + 2];

972

973 if (and_mask == 0xffffffff) {

974 tmp = or_mask;

975 } else {

976 tmp = RREG32(reg);

977 tmp &= ~and_mask;

978 if (adev->family >= AMDGPU_FAMILY_AI)

979 tmp |= (or_mask & and_mask);

980 else

981 tmp |= or_mask;

982 }

983 WREG32(reg, tmp);

984 }

985 }

986

987 /**

988 * amdgpu_device_pci_config_reset - reset the GPU

989 *

990 * @adev: amdgpu_device pointer

991 *

992 * Resets the GPU using the pci config reset sequence.

993 * Only applicable to asics prior to vega10.

994 */

995 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)

996 {

997 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);

998 }

999

1000 /**

1001 * amdgpu_device_pci_reset - reset the GPU using generic PCI means

1002 *

1003 * @adev: amdgpu_device pointer

1004 *

1005 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).

1006 */

1007 int amdgpu_device_pci_reset(struct amdgpu_device *adev)

1008 {

1009 return pci_reset_function(adev->pdev);

1010 }

1011

1012 /*

1013 * GPU doorbell aperture helpers function.

1014 */

1015 /**

1016 * amdgpu_device_doorbell_init - Init doorbell driver information.

1017 *

1018 * @adev: amdgpu_device pointer

1019 *

1020 * Init doorbell driver information (CIK)

1021 * Returns 0 on success, error on failure.

1022 */

1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)

1024 {

1025

1026 /* No doorbell on SI hardware generation */

1027 if (adev->asic_type < CHIP_BONAIRE) {

1028 adev->doorbell.base = 0;

1029 adev->doorbell.size = 0;

1030 adev->doorbell.num_doorbells = 0;

1031 adev->doorbell.ptr = NULL;

1032 return 0;

1033 }

1034

1035 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)

1036 return -EINVAL;

1037

1038 amdgpu_asic_init_doorbell_index(adev);

1039

1040 /* doorbell bar mapping */

1041 adev->doorbell.base = pci_resource_start(adev->pdev, 2);

1042 adev->doorbell.size = pci_resource_len(adev->pdev, 2);

1043

1044 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),

1045 adev->doorbell_index.max_assignment+1);

1046 if (adev->doorbell.num_doorbells == 0)

1047 return -EINVAL;

1048

1049 /* For Vega, reserve and map two pages on doorbell BAR since SDMA

1050 * paging queue doorbell use the second page. The

1051 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the

1052 * doorbells are in the first page. So with paging queue enabled,

1053 * the max num_doorbells should + 1 page (0x400 in dword)

1054 */

1055 if (adev->asic_type >= CHIP_VEGA10)

1056 adev->doorbell.num_doorbells += 0x400;

1057

1058 adev->doorbell.ptr = ioremap(adev->doorbell.base,

1059 adev->doorbell.num_doorbells *

1060 sizeof(u32));

1061 if (adev->doorbell.ptr == NULL)

1062 return -ENOMEM;

1063

1064 return 0;

1065 }

1066

1067 /**

1068 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.

1069 *

1070 * @adev: amdgpu_device pointer

1071 *

1072 * Tear down doorbell driver information (CIK)

1073 */

1074 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)

1075 {

1076 iounmap(adev->doorbell.ptr);

1077 adev->doorbell.ptr = NULL;

1078 }

1079

1080

1081

1082 /*

1083 * amdgpu_device_wb_*()

1084 * Writeback is the method by which the GPU updates special pages in memory

1085 * with the status of certain GPU events (fences, ring pointers,etc.).

1086 */

1087

1088 /**

1089 * amdgpu_device_wb_fini - Disable Writeback and free memory

1090 *

1091 * @adev: amdgpu_device pointer

1092 *

1093 * Disables Writeback and frees the Writeback memory (all asics).

1094 * Used at driver shutdown.

1095 */

1096 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)

1097 {

1098 if (adev->wb.wb_obj) {

1099 amdgpu_bo_free_kernel(&adev->wb.wb_obj,

1100 &adev->wb.gpu_addr,

1101 (void **)&adev->wb.wb);

1102 adev->wb.wb_obj = NULL;

1103 }

1104 }

1105

1106 /**

1107 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory

1108 *

1109 * @adev: amdgpu_device pointer

1110 *

1111 * Initializes writeback and allocates writeback memory (all asics).

1112 * Used at driver startup.

1113 * Returns 0 on success or an -error on failure.

1114 */

1115 static int amdgpu_device_wb_init(struct amdgpu_device *adev)

1116 {

1117 int r;

1118

1119 if (adev->wb.wb_obj == NULL) {

1120 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */

1121 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,

1122 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,

1123 &adev->wb.wb_obj, &adev->wb.gpu_addr,

1124 (void **)&adev->wb.wb);

1125 if (r) {

1126 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);

1127 return r;

1128 }

1129

1130 adev->wb.num_wb = AMDGPU_MAX_WB;

1131 memset(&adev->wb.used, 0, sizeof(adev->wb.used));

1132

1133 /* clear wb memory */

1134 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);

1135 }

1136

1137 return 0;

1138 }

1139

1140 /**

1141 * amdgpu_device_wb_get - Allocate a wb entry

1142 *

1143 * @adev: amdgpu_device pointer

1144 * @wb: wb index

1145 *

1146 * Allocate a wb slot for use by the driver (all asics).

1147 * Returns 0 on success or -EINVAL on failure.

1148 */

1149 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)

1150 {

1151 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

1152

1153 if (offset < adev->wb.num_wb) {

1154 __set_bit(offset, adev->wb.used);

1155 *wb = offset << 3; /* convert to dw offset */

1156 return 0;

1157 } else {

1158 return -EINVAL;

1159 }

1160 }

1161

1162 /**

1163 * amdgpu_device_wb_free - Free a wb entry

1164 *

1165 * @adev: amdgpu_device pointer

1166 * @wb: wb index

1167 *

1168 * Free a wb slot allocated for use by the driver (all asics)

1169 */

1170 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)

1171 {

1172 wb >>= 3;

1173 if (wb < adev->wb.num_wb)

1174 __clear_bit(wb, adev->wb.used);

1175 }

1176

1177 /**

1178 * amdgpu_device_resize_fb_bar - try to resize FB BAR

1179 *

1180 * @adev: amdgpu_device pointer

1181 *

1182 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not

1183 * to fail, but if any of the BARs is not accessible after the size we abort

1184 * driver loading by returning -ENODEV.

1185 */

1186 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)

1187 {

1188 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);

1189 struct pci_bus *root;

1190 struct resource *res;

1191 unsigned i;

1192 u16 cmd;

1193 int r;

1194

1195 /* Bypass for VF */

1196 if (amdgpu_sriov_vf(adev))

1197 return 0;

1198

1199 /* skip if the bios has already enabled large BAR */

1200 if (adev->gmc.real_vram_size &&

1201 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))

1202 return 0;

1203

1204 /* Check if the root BUS has 64bit memory resources */

1205 root = adev->pdev->bus;

1206 while (root->parent)

1207 root = root->parent;

1208

1209 pci_bus_for_each_resource(root, res, i) {

1210 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&

1211 res->start > 0x100000000ull)

1212 break;

1213 }

1214

1215 /* Trying to resize is pointless without a root hub window above 4GB */

1216 if (!res)

1217 return 0;

1218

1219 /* Limit the BAR size to what is available */

1220 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,

1221 rbar_size);

1222

1223 /* Disable memory decoding while we change the BAR addresses and size */

1224 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);

1225 pci_write_config_word(adev->pdev, PCI_COMMAND,

1226 cmd & ~PCI_COMMAND_MEMORY);

1227

1228 /* Free the VRAM and doorbell BAR, we most likely need to move both. */

1229 amdgpu_device_doorbell_fini(adev);

1230 if (adev->asic_type >= CHIP_BONAIRE)

1231 pci_release_resource(adev->pdev, 2);

1232

1233 pci_release_resource(adev->pdev, 0);

1234

1235 r = pci_resize_resource(adev->pdev, 0, rbar_size);

1236 if (r == -ENOSPC)

1237 DRM_INFO("Not enough PCI address space for a large BAR.");

1238 else if (r && r != -ENOTSUPP)

1239 DRM_ERROR("Problem resizing BAR0 (%d).", r);

1240

1241 pci_assign_unassigned_bus_resources(adev->pdev->bus);

1242

1243 /* When the doorbell or fb BAR isn't available we have no chance of

1244 * using the device.

1245 */

1246 r = amdgpu_device_doorbell_init(adev);

1247 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))

1248 return -ENODEV;

1249

1250 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

1251

1252 return 0;

1253 }

1254

1255 /*

1256 * GPU helpers function.

1257 */

1258 /**

1259 * amdgpu_device_need_post - check if the hw need post or not

1260 *

1261 * @adev: amdgpu_device pointer

1262 *

1263 * Check if the asic has been initialized (all asics) at driver startup

1264 * or post is needed if hw reset is performed.

1265 * Returns true if need or false if not.

1266 */

1267 bool amdgpu_device_need_post(struct amdgpu_device *adev)

1268 {

1269 uint32_t reg;

1270

1271 if (amdgpu_sriov_vf(adev))

1272 return false;

1273

1274 if (amdgpu_passthrough(adev)) {

1275 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot

1276 * some old smc fw still need driver do vPost otherwise gpu hang, while

1277 * those smc fw version above 22.15 doesn't have this flaw, so we force

1278 * vpost executed for smc version below 22.15

1279 */

1280 if (adev->asic_type == CHIP_FIJI) {

1281 int err;

1282 uint32_t fw_ver;

1283 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);

1284 /* force vPost if error occured */

1285 if (err)

1286 return true;

1287

1288 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);

1289 if (fw_ver < 0x00160e00)

1290 return true;

1291 }

1292 }

1293

1294 /* Don't post if we need to reset whole hive on init */

1295 if (adev->gmc.xgmi.pending_reset)

1296 return false;

1297

1298 if (adev->has_hw_reset) {

1299 adev->has_hw_reset = false;

1300 return true;

1301 }

1302

1303 /* bios scratch used on CIK+ */

1304 if (adev->asic_type >= CHIP_BONAIRE)

1305 return amdgpu_atombios_scratch_need_asic_init(adev);

1306

1307 /* check MEM_SIZE for older asics */

1308 reg = amdgpu_asic_get_config_memsize(adev);

1309

1310 if ((reg != 0) && (reg != 0xffffffff))

1311 return false;

1312

1313 return true;

1314 }

1315

1316 /* if we get transitioned to only one device, take VGA back */

1317 /**

1318 * amdgpu_device_vga_set_decode - enable/disable vga decode

1319 *

1320 * @pdev: PCI device pointer

1321 * @state: enable/disable vga decode

1322 *

1323 * Enable/disable vga decode (all asics).

1324 * Returns VGA resource flags.

1325 */

1326 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,

1327 bool state)

1328 {

1329 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

1330 amdgpu_asic_set_vga_state(adev, state);

1331 if (state)

1332 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |

1333 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

1334 else

1335 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

1336 }

1337

1338 /**

1339 * amdgpu_device_check_block_size - validate the vm block size

1340 *

1341 * @adev: amdgpu_device pointer

1342 *

1343 * Validates the vm block size specified via module parameter.

1344 * The vm block size defines number of bits in page table versus page directory,

1345 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the

1346 * page table and the remaining bits are in the page directory.

1347 */

1348 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)

1349 {

1350 /* defines number of bits in page table versus page directory,

1351 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the

1352 * page table and the remaining bits are in the page directory */

1353 if (amdgpu_vm_block_size == -1)

1354 return;

1355

1356 if (amdgpu_vm_block_size < 9) {

1357 dev_warn(adev->dev, "VM page table size (%d) too small\n",

1358 amdgpu_vm_block_size);

1359 amdgpu_vm_block_size = -1;

1360 }

1361 }

1362

1363 /**

1364 * amdgpu_device_check_vm_size - validate the vm size

1365 *

1366 * @adev: amdgpu_device pointer

1367 *

1368 * Validates the vm size in GB specified via module parameter.

1369 * The VM size is the size of the GPU virtual memory space in GB.

1370 */

1371 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)

1372 {

1373 /* no need to check the default value */

1374 if (amdgpu_vm_size == -1)

1375 return;

1376

1377 if (amdgpu_vm_size < 1) {

1378 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",

1379 amdgpu_vm_size);

1380 amdgpu_vm_size = -1;

1381 }

1382 }

1383

1384 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)

1385 {

1386 struct sysinfo si;

1387 bool is_os_64 = (sizeof(void *) == 8);

1388 uint64_t total_memory;

1389 uint64_t dram_size_seven_GB = 0x1B8000000;

1390 uint64_t dram_size_three_GB = 0xB8000000;

1391

1392 if (amdgpu_smu_memory_pool_size == 0)

1393 return;

1394

1395 if (!is_os_64) {

1396 DRM_WARN("Not 64-bit OS, feature not supported\n");

1397 goto def_value;

1398 }

1399 si_meminfo(&si);

1400 total_memory = (uint64_t)si.totalram * si.mem_unit;

1401

1402 if ((amdgpu_smu_memory_pool_size == 1) ||

1403 (amdgpu_smu_memory_pool_size == 2)) {

1404 if (total_memory < dram_size_three_GB)

1405 goto def_value1;

1406 } else if ((amdgpu_smu_memory_pool_size == 4) ||

1407 (amdgpu_smu_memory_pool_size == 8)) {

1408 if (total_memory < dram_size_seven_GB)

1409 goto def_value1;

1410 } else {

1411 DRM_WARN("Smu memory pool size not supported\n");

1412 goto def_value;

1413 }

1414 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

1415

1416 return;

1417

1418 def_value1:

1419 DRM_WARN("No enough system memory\n");

1420 def_value:

1421 adev->pm.smu_prv_buffer_size = 0;

1422 }

1423

1424 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)

1425 {

1426 if (!(adev->flags & AMD_IS_APU) ||

1427 adev->asic_type < CHIP_RAVEN)

1428 return 0;

1429

1430 switch (adev->asic_type) {

1431 case CHIP_RAVEN:

1432 if (adev->pdev->device == 0x15dd)

1433 adev->apu_flags |= AMD_APU_IS_RAVEN;

1434 if (adev->pdev->device == 0x15d8)

1435 adev->apu_flags |= AMD_APU_IS_PICASSO;

1436 break;

1437 case CHIP_RENOIR:

1438 if ((adev->pdev->device == 0x1636) ||

1439 (adev->pdev->device == 0x164c))

1440 adev->apu_flags |= AMD_APU_IS_RENOIR;

1441 else

1442 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;

1443 break;

1444 case CHIP_VANGOGH:

1445 adev->apu_flags |= AMD_APU_IS_VANGOGH;

1446 break;

1447 case CHIP_YELLOW_CARP:

1448 break;

1449 case CHIP_CYAN_SKILLFISH:

1450 if (adev->pdev->device == 0x13FE)

1451 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;

1452 break;

1453 default:

1454 break;

1455 }

1456

1457 return 0;

1458 }

1459

1460 /**

1461 * amdgpu_device_check_arguments - validate module params

1462 *

1463 * @adev: amdgpu_device pointer

1464 *

1465 * Validates certain module parameters and updates

1466 * the associated values used by the driver (all asics).

1467 */

1468 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)

1469 {

1470 if (amdgpu_sched_jobs < 4) {

1471 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",

1472 amdgpu_sched_jobs);

1473 amdgpu_sched_jobs = 4;

1474 } else if (!is_power_of_2(amdgpu_sched_jobs)){

1475 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",

1476 amdgpu_sched_jobs);

1477 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);

1478 }

1479

1480 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {

1481 /* gart size must be greater or equal to 32M */

1482 dev_warn(adev->dev, "gart size (%d) too small\n",

1483 amdgpu_gart_size);

1484 amdgpu_gart_size = -1;

1485 }

1486

1487 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {

1488 /* gtt size must be greater or equal to 32M */

1489 dev_warn(adev->dev, "gtt size (%d) too small\n",

1490 amdgpu_gtt_size);

1491 amdgpu_gtt_size = -1;

1492 }

1493

1494 /* valid range is between 4 and 9 inclusive */

1495 if (amdgpu_vm_fragment_size != -1 &&

1496 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {

1497 dev_warn(adev->dev, "valid range is between 4 and 9\n");

1498 amdgpu_vm_fragment_size = -1;

1499 }

1500

1501 if (amdgpu_sched_hw_submission < 2) {

1502 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",

1503 amdgpu_sched_hw_submission);

1504 amdgpu_sched_hw_submission = 2;

1505 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {

1506 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",

1507 amdgpu_sched_hw_submission);

1508 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);

1509 }

1510

1511 amdgpu_device_check_smu_prv_buffer_size(adev);

1512

1513 amdgpu_device_check_vm_size(adev);

1514

1515 amdgpu_device_check_block_size(adev);

1516

1517 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

1518

1519 amdgpu_gmc_tmz_set(adev);

1520

1521 amdgpu_gmc_noretry_set(adev);

1522

1523 return 0;

1524 }

1525

1526 /**

1527 * amdgpu_switcheroo_set_state - set switcheroo state

1528 *

1529 * @pdev: pci dev pointer

1530 * @state: vga_switcheroo state

1531 *

1532 * Callback for the switcheroo driver. Suspends or resumes the

1533 * the asics before or after it is powered up using ACPI methods.

1534 */

1535 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,

1536 enum vga_switcheroo_state state)

1537 {

1538 struct drm_device *dev = pci_get_drvdata(pdev);

1539 int r;

1540

1541 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)

1542 return;

1543

1544 if (state == VGA_SWITCHEROO_ON) {

1545 pr_info("switched on\n");

1546 /* don't suspend or resume card normally */

1547 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

1548

1549 pci_set_power_state(pdev, PCI_D0);

1550 amdgpu_device_load_pci_state(pdev);

1551 r = pci_enable_device(pdev);

1552 if (r)

1553 DRM_WARN("pci_enable_device failed (%d)\n", r);

1554 amdgpu_device_resume(dev, true);

1555

1556 dev->switch_power_state = DRM_SWITCH_POWER_ON;

1557 } else {

1558 pr_info("switched off\n");

1559 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

1560 amdgpu_device_suspend(dev, true);

1561 amdgpu_device_cache_pci_state(pdev);

1562 /* Shut down the device */

1563 pci_disable_device(pdev);

1564 pci_set_power_state(pdev, PCI_D3cold);

1565 dev->switch_power_state = DRM_SWITCH_POWER_OFF;

1566 }

1567 }

1568

1569 /**

1570 * amdgpu_switcheroo_can_switch - see if switcheroo state can change

1571 *

1572 * @pdev: pci dev pointer

1573 *

1574 * Callback for the switcheroo driver. Check of the switcheroo

1575 * state can be changed.

1576 * Returns true if the state can be changed, false if not.

1577 */

1578 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)

1579 {

1580 struct drm_device *dev = pci_get_drvdata(pdev);

1581

1582 /*

1583 * FIXME: open_count is protected by drm_global_mutex but that would lead to

1584 * locking inversion with the driver load path. And the access here is

1585 * completely racy anyway. So don't bother with locking for now.

1586 */

1587 return atomic_read(&dev->open_count) == 0;

1588 }

1589

1590 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {

1591 .set_gpu_state = amdgpu_switcheroo_set_state,

1592 .reprobe = NULL,

1593 .can_switch = amdgpu_switcheroo_can_switch,

1594 };

1595

1596 /**

1597 * amdgpu_device_ip_set_clockgating_state - set the CG state

1598 *

1599 * @dev: amdgpu_device pointer

1600 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)

1601 * @state: clockgating state (gate or ungate)

1602 *

1603 * Sets the requested clockgating state for all instances of

1604 * the hardware IP specified.

1605 * Returns the error code from the last instance.

1606 */

1607 int amdgpu_device_ip_set_clockgating_state(void *dev,

1608 enum amd_ip_block_type block_type,

1609 enum amd_clockgating_state state)

1610 {

1611 struct amdgpu_device *adev = dev;

1612 int i, r = 0;

1613

1614 for (i = 0; i < adev->num_ip_blocks; i++) {

1615 if (!adev->ip_blocks[i].status.valid)

1616 continue;

1617 if (adev->ip_blocks[i].version->type != block_type)

1618 continue;

1619 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)

1620 continue;

1621 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(

1622 (void *)adev, state);

1623 if (r)

1624 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",

1625 adev->ip_blocks[i].version->funcs->name, r);

1626 }

1627 return r;

1628 }

1629

1630 /**

1631 * amdgpu_device_ip_set_powergating_state - set the PG state

1632 *

1633 * @dev: amdgpu_device pointer

1634 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)

1635 * @state: powergating state (gate or ungate)

1636 *

1637 * Sets the requested powergating state for all instances of

1638 * the hardware IP specified.

1639 * Returns the error code from the last instance.

1640 */

1641 int amdgpu_device_ip_set_powergating_state(void *dev,

1642 enum amd_ip_block_type block_type,

1643 enum amd_powergating_state state)

1644 {

1645 struct amdgpu_device *adev = dev;

1646 int i, r = 0;

1647

1648 for (i = 0; i < adev->num_ip_blocks; i++) {

1649 if (!adev->ip_blocks[i].status.valid)

1650 continue;

1651 if (adev->ip_blocks[i].version->type != block_type)

1652 continue;

1653 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)

1654 continue;

1655 r = adev->ip_blocks[i].version->funcs->set_powergating_state(

1656 (void *)adev, state);

1657 if (r)

1658 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",

1659 adev->ip_blocks[i].version->funcs->name, r);

1660 }

1661 return r;

1662 }

1663

1664 /**

1665 * amdgpu_device_ip_get_clockgating_state - get the CG state

1666 *

1667 * @adev: amdgpu_device pointer

1668 * @flags: clockgating feature flags

1669 *

1670 * Walks the list of IPs on the device and updates the clockgating

1671 * flags for each IP.

1672 * Updates @flags with the feature flags for each hardware IP where

1673 * clockgating is enabled.

1674 */

1675 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,

1676 u32 *flags)

1677 {

1678 int i;

1679

1680 for (i = 0; i < adev->num_ip_blocks; i++) {

1681 if (!adev->ip_blocks[i].status.valid)

1682 continue;

1683 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)

1684 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);

1685 }

1686 }

1687

1688 /**

1689 * amdgpu_device_ip_wait_for_idle - wait for idle

1690 *

1691 * @adev: amdgpu_device pointer

1692 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)

1693 *

1694 * Waits for the request hardware IP to be idle.

1695 * Returns 0 for success or a negative error code on failure.

1696 */

1697 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,

1698 enum amd_ip_block_type block_type)

1699 {

1700 int i, r;

1701

1702 for (i = 0; i < adev->num_ip_blocks; i++) {

1703 if (!adev->ip_blocks[i].status.valid)

1704 continue;

1705 if (adev->ip_blocks[i].version->type == block_type) {

1706 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);

1707 if (r)

1708 return r;

1709 break;

1710 }

1711 }

1712 return 0;

1713

1714 }

1715

1716 /**

1717 * amdgpu_device_ip_is_idle - is the hardware IP idle

1718 *

1719 * @adev: amdgpu_device pointer

1720 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)

1721 *

1722 * Check if the hardware IP is idle or not.

1723 * Returns true if it the IP is idle, false if not.

1724 */

1725 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,

1726 enum amd_ip_block_type block_type)

1727 {

1728 int i;

1729

1730 for (i = 0; i < adev->num_ip_blocks; i++) {

1731 if (!adev->ip_blocks[i].status.valid)

1732 continue;

1733 if (adev->ip_blocks[i].version->type == block_type)

1734 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);

1735 }

1736 return true;

1737

1738 }

1739

1740 /**

1741 * amdgpu_device_ip_get_ip_block - get a hw IP pointer

1742 *

1743 * @adev: amdgpu_device pointer

1744 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)

1745 *

1746 * Returns a pointer to the hardware IP block structure

1747 * if it exists for the asic, otherwise NULL.

1748 */

1749 struct amdgpu_ip_block *

1750 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,

1751 enum amd_ip_block_type type)

1752 {

1753 int i;

1754

1755 for (i = 0; i < adev->num_ip_blocks; i++)

1756 if (adev->ip_blocks[i].version->type == type)

1757 return &adev->ip_blocks[i];

1758

1759 return NULL;

1760 }

1761

1762 /**

1763 * amdgpu_device_ip_block_version_cmp

1764 *

1765 * @adev: amdgpu_device pointer

1766 * @type: enum amd_ip_block_type

1767 * @major: major version

1768 * @minor: minor version

1769 *

1770 * return 0 if equal or greater

1771 * return 1 if smaller or the ip_block doesn't exist

1772 */

1773 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,

1774 enum amd_ip_block_type type,

1775 u32 major, u32 minor)

1776 {

1777 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

1778

1779 if (ip_block && ((ip_block->version->major > major) ||

1780 ((ip_block->version->major == major) &&

1781 (ip_block->version->minor >= minor))))

1782 return 0;

1783

1784 return 1;

1785 }

1786

1787 /**

1788 * amdgpu_device_ip_block_add

1789 *

1790 * @adev: amdgpu_device pointer

1791 * @ip_block_version: pointer to the IP to add

1792 *

1793 * Adds the IP block driver information to the collection of IPs

1794 * on the asic.

1795 */

1796 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,

1797 const struct amdgpu_ip_block_version *ip_block_version)

1798 {

1799 if (!ip_block_version)

1800 return -EINVAL;

1801

1802 switch (ip_block_version->type) {

1803 case AMD_IP_BLOCK_TYPE_VCN:

1804 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)

1805 return 0;

1806 break;

1807 case AMD_IP_BLOCK_TYPE_JPEG:

1808 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)

1809 return 0;

1810 break;

1811 default:

1812 break;

1813 }

1814

1815 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,

1816 ip_block_version->funcs->name);

1817

1818 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

1819

1820 return 0;

1821 }

1822

1823 /**

1824 * amdgpu_device_enable_virtual_display - enable virtual display feature

1825 *

1826 * @adev: amdgpu_device pointer

1827 *

1828 * Enabled the virtual display feature if the user has enabled it via

1829 * the module parameter virtual_display. This feature provides a virtual

1830 * display hardware on headless boards or in virtualized environments.

1831 * This function parses and validates the configuration string specified by

1832 * the user and configues the virtual display configuration (number of

1833 * virtual connectors, crtcs, etc.) specified.

1834 */

1835 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)

1836 {

1837 adev->enable_virtual_display = false;

1838

1839 if (amdgpu_virtual_display) {

1840 const char *pci_address_name = pci_name(adev->pdev);

1841 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

1842

1843 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);

1844 pciaddstr_tmp = pciaddstr;

1845 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {

1846 pciaddname = strsep(&pciaddname_tmp, ",");

1847 if (!strcmp("all", pciaddname)

1848 || !strcmp(pci_address_name, pciaddname)) {

1849 long num_crtc;

1850 int res = -1;

1851

1852 adev->enable_virtual_display = true;

1853

1854 if (pciaddname_tmp)

1855 res = kstrtol(pciaddname_tmp, 10,

1856 &num_crtc);

1857

1858 if (!res) {

1859 if (num_crtc < 1)

1860 num_crtc = 1;

1861 if (num_crtc > 6)

1862 num_crtc = 6;

1863 adev->mode_info.num_crtc = num_crtc;

1864 } else {

1865 adev->mode_info.num_crtc = 1;

1866 }

1867 break;

1868 }

1869 }

1870

1871 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",

1872 amdgpu_virtual_display, pci_address_name,

1873 adev->enable_virtual_display, adev->mode_info.num_crtc);

1874

1875 kfree(pciaddstr);

1876 }

1877 }

1878

1879 /**

1880 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware

1881 *

1882 * @adev: amdgpu_device pointer

1883 *

1884 * Parses the asic configuration parameters specified in the gpu info

1885 * firmware and makes them availale to the driver for use in configuring

1886 * the asic.

1887 * Returns 0 on success, -EINVAL on failure.

1888 */

1889 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)

1890 {

1891 const char *chip_name;

1892 char fw_name[40];

1893 int err;

1894 const struct gpu_info_firmware_header_v1_0 *hdr;

1895

1896 adev->firmware.gpu_info_fw = NULL;

1897

1898 if (adev->mman.discovery_bin) {

1899 amdgpu_discovery_get_gfx_info(adev);

1900

1901 /*

1902 * FIXME: The bounding box is still needed by Navi12, so

1903 * temporarily read it from gpu_info firmware. Should be droped

1904 * when DAL no longer needs it.

1905 */

1906 if (adev->asic_type != CHIP_NAVI12)

1907 return 0;

1908 }

1909

1910 switch (adev->asic_type) {

1911 #ifdef CONFIG_DRM_AMDGPU_SI

1912 case CHIP_VERDE:

1913 case CHIP_TAHITI:

1914 case CHIP_PITCAIRN:

1915 case CHIP_OLAND:

1916 case CHIP_HAINAN:

1917 #endif

1918 #ifdef CONFIG_DRM_AMDGPU_CIK

1919 case CHIP_BONAIRE:

1920 case CHIP_HAWAII:

1921 case CHIP_KAVERI:

1922 case CHIP_KABINI:

1923 case CHIP_MULLINS:

1924 #endif

1925 case CHIP_TOPAZ:

1926 case CHIP_TONGA:

1927 case CHIP_FIJI:

1928 case CHIP_POLARIS10:

1929 case CHIP_POLARIS11:

1930 case CHIP_POLARIS12:

1931 case CHIP_VEGAM:

1932 case CHIP_CARRIZO:

1933 case CHIP_STONEY:

1934 case CHIP_VEGA20:

1935 case CHIP_ALDEBARAN:

1936 case CHIP_SIENNA_CICHLID:

1937 case CHIP_NAVY_FLOUNDER:

1938 case CHIP_DIMGREY_CAVEFISH:

1939 case CHIP_BEIGE_GOBY:

1940 default:

1941 return 0;

1942 case CHIP_VEGA10:

1943 chip_name = "vega10";

1944 break;

1945 case CHIP_VEGA12:

1946 chip_name = "vega12";

1947 break;

1948 case CHIP_RAVEN:

1949 if (adev->apu_flags & AMD_APU_IS_RAVEN2)

1950 chip_name = "raven2";

1951 else if (adev->apu_flags & AMD_APU_IS_PICASSO)

1952 chip_name = "picasso";

1953 else

1954 chip_name = "raven";

1955 break;

1956 case CHIP_ARCTURUS:

1957 chip_name = "arcturus";

1958 break;

1959 case CHIP_RENOIR:

1960 if (adev->apu_flags & AMD_APU_IS_RENOIR)

1961 chip_name = "renoir";

1962 else

1963 chip_name = "green_sardine";

1964 break;

1965 case CHIP_NAVI10:

1966 chip_name = "navi10";

1967 break;

1968 case CHIP_NAVI14:

1969 chip_name = "navi14";

1970 break;

1971 case CHIP_NAVI12:

1972 chip_name = "navi12";

1973 break;

1974 case CHIP_VANGOGH:

1975 chip_name = "vangogh";

1976 break;

1977 case CHIP_YELLOW_CARP:

1978 chip_name = "yellow_carp";

1979 break;

1980 }

1981

1982 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);

1983 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);

1984 if (err) {

1985 dev_err(adev->dev,

1986 "Failed to load gpu_info firmware \"%s\"\n",

1987 fw_name);

1988 goto out;

1989 }

1990 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);

1991 if (err) {

1992 dev_err(adev->dev,

1993 "Failed to validate gpu_info firmware \"%s\"\n",

1994 fw_name);

1995 goto out;

1996 }

1997

1998 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;

1999 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

2000

2001 switch (hdr->version_major) {

2002 case 1:

2003 {

2004 const struct gpu_info_firmware_v1_0 *gpu_info_fw =

2005 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +

2006 le32_to_cpu(hdr->header.ucode_array_offset_bytes));

2007

2008 /*

2009 * Should be droped when DAL no longer needs it.

2010 */

2011 if (adev->asic_type == CHIP_NAVI12)

2012 goto parse_soc_bounding_box;

2013

2014 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);

2015 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);

2016 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);

2017 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);

2018 adev->gfx.config.max_texture_channel_caches =

2019 le32_to_cpu(gpu_info_fw->gc_num_tccs);

2020 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);

2021 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);

2022 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);

2023 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);

2024 adev->gfx.config.double_offchip_lds_buf =

2025 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);

2026 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);

2027 adev->gfx.cu_info.max_waves_per_simd =

2028 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);

2029 adev->gfx.cu_info.max_scratch_slots_per_cu =

2030 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);

2031 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);

2032 if (hdr->version_minor >= 1) {

2033 const struct gpu_info_firmware_v1_1 *gpu_info_fw =

2034 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +

2035 le32_to_cpu(hdr->header.ucode_array_offset_bytes));

2036 adev->gfx.config.num_sc_per_sh =

2037 le32_to_cpu(gpu_info_fw->num_sc_per_sh);

2038 adev->gfx.config.num_packer_per_sc =

2039 le32_to_cpu(gpu_info_fw->num_packer_per_sc);

2040 }

2041

2042 parse_soc_bounding_box:

2043 /*

2044 * soc bounding box info is not integrated in disocovery table,

2045 * we always need to parse it from gpu info firmware if needed.

2046 */

2047 if (hdr->version_minor == 2) {

2048 const struct gpu_info_firmware_v1_2 *gpu_info_fw =

2049 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +

2050 le32_to_cpu(hdr->header.ucode_array_offset_bytes));

2051 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;

2052 }

2053 break;

2054 }

2055 default:

2056 dev_err(adev->dev,

2057 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);

2058 err = -EINVAL;

2059 goto out;

2060 }

2061 out:

2062 return err;

2063 }

2064

2065 /**

2066 * amdgpu_device_ip_early_init - run early init for hardware IPs

2067 *

2068 * @adev: amdgpu_device pointer

2069 *

2070 * Early initialization pass for hardware IPs. The hardware IPs that make

2071 * up each asic are discovered each IP's early_init callback is run. This

2072 * is the first stage in initializing the asic.

2073 * Returns 0 on success, negative error code on failure.

2074 */

2075 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)

2076 {

2077 struct drm_device *dev = adev_to_drm(adev);

2078 struct pci_dev *parent;

2079 int i, r;

2080

2081 amdgpu_device_enable_virtual_display(adev);

2082

2083 if (amdgpu_sriov_vf(adev)) {

2084 r = amdgpu_virt_request_full_gpu(adev, true);

2085 if (r)

2086 return r;

2087 }

2088

2089 switch (adev->asic_type) {

2090 #ifdef CONFIG_DRM_AMDGPU_SI

2091 case CHIP_VERDE:

2092 case CHIP_TAHITI:

2093 case CHIP_PITCAIRN:

2094 case CHIP_OLAND:

2095 case CHIP_HAINAN:

2096 adev->family = AMDGPU_FAMILY_SI;

2097 r = si_set_ip_blocks(adev);

2098 if (r)

2099 return r;

2100 break;

2101 #endif

2102 #ifdef CONFIG_DRM_AMDGPU_CIK

2103 case CHIP_BONAIRE:

2104 case CHIP_HAWAII:

2105 case CHIP_KAVERI:

2106 case CHIP_KABINI:

2107 case CHIP_MULLINS:

2108 if (adev->flags & AMD_IS_APU)

2109 adev->family = AMDGPU_FAMILY_KV;

2110 else

2111 adev->family = AMDGPU_FAMILY_CI;

2112

2113 r = cik_set_ip_blocks(adev);

2114 if (r)

2115 return r;

2116 break;

2117 #endif

2118 case CHIP_TOPAZ:

2119 case CHIP_TONGA:

2120 case CHIP_FIJI:

2121 case CHIP_POLARIS10:

2122 case CHIP_POLARIS11:

2123 case CHIP_POLARIS12:

2124 case CHIP_VEGAM:

2125 case CHIP_CARRIZO:

2126 case CHIP_STONEY:

2127 if (adev->flags & AMD_IS_APU)

2128 adev->family = AMDGPU_FAMILY_CZ;

2129 else

2130 adev->family = AMDGPU_FAMILY_VI;

2131

2132 r = vi_set_ip_blocks(adev);

2133 if (r)

2134 return r;

2135 break;

2136 default:

2137 r = amdgpu_discovery_set_ip_blocks(adev);

2138 if (r)

2139 return r;

2140 break;

2141 }

2142

2143 if (amdgpu_has_atpx() &&

2144 (amdgpu_is_atpx_hybrid() ||

2145 amdgpu_has_atpx_dgpu_power_cntl()) &&

2146 ((adev->flags & AMD_IS_APU) == 0) &&

2147 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))

2148 adev->flags |= AMD_IS_PX;

2149

2150 parent = pci_upstream_bridge(adev->pdev);

2151 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;

2152

2153 amdgpu_amdkfd_device_probe(adev);

2154

2155 adev->pm.pp_feature = amdgpu_pp_feature_mask;

2156 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)

2157 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

2158 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)

2159 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;

2160

2161 for (i = 0; i < adev->num_ip_blocks; i++) {

2162 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {

2163 DRM_ERROR("disabled ip block: %d <%s>\n",

2164 i, adev->ip_blocks[i].version->funcs->name);

2165 adev->ip_blocks[i].status.valid = false;

2166 } else {

2167 if (adev->ip_blocks[i].version->funcs->early_init) {

2168 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);

2169 if (r == -ENOENT) {

2170 adev->ip_blocks[i].status.valid = false;

2171 } else if (r) {

2172 DRM_ERROR("early_init of IP block <%s> failed %d\n",

2173 adev->ip_blocks[i].version->funcs->name, r);

2174 return r;

2175 } else {

2176 adev->ip_blocks[i].status.valid = true;

2177 }

2178 } else {

2179 adev->ip_blocks[i].status.valid = true;

2180 }

2181 }

2182 /* get the vbios after the asic_funcs are set up */

2183 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {

2184 r = amdgpu_device_parse_gpu_info_fw(adev);

2185 if (r)

2186 return r;

2187

2188 /* Read BIOS */

2189 if (!amdgpu_get_bios(adev))

2190 return -EINVAL;

2191

2192 r = amdgpu_atombios_init(adev);

2193 if (r) {

2194 dev_err(adev->dev, "amdgpu_atombios_init failed\n");

2195 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);

2196 return r;

2197 }

2198

2199 /*get pf2vf msg info at it's earliest time*/

2200 if (amdgpu_sriov_vf(adev))

2201 amdgpu_virt_init_data_exchange(adev);

2202

2203 }

2204 }

2205

2206 adev->cg_flags &= amdgpu_cg_mask;

2207 adev->pg_flags &= amdgpu_pg_mask;

2208

2209 return 0;

2210 }

2211

2212 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)

2213 {

2214 int i, r;

2215

2216 for (i = 0; i < adev->num_ip_blocks; i++) {

2217 if (!adev->ip_blocks[i].status.sw)

2218 continue;

2219 if (adev->ip_blocks[i].status.hw)

2220 continue;

2221 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||

2222 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||

2223 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {

2224 r = adev->ip_blocks[i].version->funcs->hw_init(adev);

2225 if (r) {

2226 DRM_ERROR("hw_init of IP block <%s> failed %d\n",

2227 adev->ip_blocks[i].version->funcs->name, r);

2228 return r;

2229 }

2230 adev->ip_blocks[i].status.hw = true;

2231 }

2232 }

2233

2234 return 0;

2235 }

2236

2237 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)

2238 {

2239 int i, r;

2240

2241 for (i = 0; i < adev->num_ip_blocks; i++) {

2242 if (!adev->ip_blocks[i].status.sw)

2243 continue;

2244 if (adev->ip_blocks[i].status.hw)

2245 continue;

2246 r = adev->ip_blocks[i].version->funcs->hw_init(adev);

2247 if (r) {

2248 DRM_ERROR("hw_init of IP block <%s> failed %d\n",

2249 adev->ip_blocks[i].version->funcs->name, r);

2250 return r;

2251 }

2252 adev->ip_blocks[i].status.hw = true;

2253 }

2254

2255 return 0;

2256 }

2257

2258 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)

2259 {

2260 int r = 0;

2261 int i;

2262 uint32_t smu_version;

2263

2264 if (adev->asic_type >= CHIP_VEGA10) {

2265 for (i = 0; i < adev->num_ip_blocks; i++) {

2266 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)

2267 continue;

2268

2269 if (!adev->ip_blocks[i].status.sw)

2270 continue;

2271

2272 /* no need to do the fw loading again if already done*/

2273 if (adev->ip_blocks[i].status.hw == true)

2274 break;

2275

2276 if (amdgpu_in_reset(adev) || adev->in_suspend) {

2277 r = adev->ip_blocks[i].version->funcs->resume(adev);

2278 if (r) {

2279 DRM_ERROR("resume of IP block <%s> failed %d\n",

2280 adev->ip_blocks[i].version->funcs->name, r);

2281 return r;

2282 }

2283 } else {

2284 r = adev->ip_blocks[i].version->funcs->hw_init(adev);

2285 if (r) {

2286 DRM_ERROR("hw_init of IP block <%s> failed %d\n",

2287 adev->ip_blocks[i].version->funcs->name, r);

2288 return r;

2289 }

2290 }

2291

2292 adev->ip_blocks[i].status.hw = true;

2293 break;

2294 }

2295 }

2296

2297 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)

2298 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

2299

2300 return r;

2301 }

2302

2303 /**

2304 * amdgpu_device_ip_init - run init for hardware IPs

2305 *

2306 * @adev: amdgpu_device pointer

2307 *

2308 * Main initialization pass for hardware IPs. The list of all the hardware

2309 * IPs that make up the asic is walked and the sw_init and hw_init callbacks

2310 * are run. sw_init initializes the software state associated with each IP

2311 * and hw_init initializes the hardware associated with each IP.

2312 * Returns 0 on success, negative error code on failure.

2313 */

2314 static int amdgpu_device_ip_init(struct amdgpu_device *adev)

2315 {

2316 int i, r;

2317

2318 r = amdgpu_ras_init(adev);

2319 if (r)

2320 return r;

2321

2322 for (i = 0; i < adev->num_ip_blocks; i++) {

2323 if (!adev->ip_blocks[i].status.valid)

2324 continue;

2325 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);

2326 if (r) {

2327 DRM_ERROR("sw_init of IP block <%s> failed %d\n",

2328 adev->ip_blocks[i].version->funcs->name, r);

2329 goto init_failed;

2330 }

2331 adev->ip_blocks[i].status.sw = true;

2332

2333 /* need to do gmc hw init early so we can allocate gpu mem */

2334 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {

2335 /* Try to reserve bad pages early */

2336 if (amdgpu_sriov_vf(adev))

2337 amdgpu_virt_exchange_data(adev);

2338

2339 r = amdgpu_device_vram_scratch_init(adev);

2340 if (r) {

2341 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);

2342 goto init_failed;

2343 }

2344 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);

2345 if (r) {

2346 DRM_ERROR("hw_init %d failed %d\n", i, r);

2347 goto init_failed;

2348 }

2349 r = amdgpu_device_wb_init(adev);

2350 if (r) {

2351 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);

2352 goto init_failed;

2353 }

2354 adev->ip_blocks[i].status.hw = true;

2355

2356 /* right after GMC hw init, we create CSA */

2357 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {

2358 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,

2359 AMDGPU_GEM_DOMAIN_VRAM,

2360 AMDGPU_CSA_SIZE);

2361 if (r) {

2362 DRM_ERROR("allocate CSA failed %d\n", r);

2363 goto init_failed;

2364 }

2365 }

2366 }

2367 }

2368

2369 if (amdgpu_sriov_vf(adev))

2370 amdgpu_virt_init_data_exchange(adev);

2371

2372 r = amdgpu_ib_pool_init(adev);

2373 if (r) {

2374 dev_err(adev->dev, "IB initialization failed (%d).\n", r);

2375 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);

2376 goto init_failed;

2377 }

2378

2379 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/

2380 if (r)

2381 goto init_failed;

2382

2383 r = amdgpu_device_ip_hw_init_phase1(adev);

2384 if (r)

2385 goto init_failed;

2386

2387 r = amdgpu_device_fw_loading(adev);

2388 if (r)

2389 goto init_failed;

2390

2391 r = amdgpu_device_ip_hw_init_phase2(adev);

2392 if (r)

2393 goto init_failed;

2394

2395 /*

2396 * retired pages will be loaded from eeprom and reserved here,

2397 * it should be called after amdgpu_device_ip_hw_init_phase2 since

2398 * for some ASICs the RAS EEPROM code relies on SMU fully functioning

2399 * for I2C communication which only true at this point.

2400 *

2401 * amdgpu_ras_recovery_init may fail, but the upper only cares the

2402 * failure from bad gpu situation and stop amdgpu init process

2403 * accordingly. For other failed cases, it will still release all

2404 * the resource and print error message, rather than returning one

2405 * negative value to upper level.

2406 *

2407 * Note: theoretically, this should be called before all vram allocations

2408 * to protect retired page from abusing

2409 */

2410 r = amdgpu_ras_recovery_init(adev);

2411 if (r)

2412 goto init_failed;

2413

2414 if (adev->gmc.xgmi.num_physical_nodes > 1)

2415 amdgpu_xgmi_add_device(adev);

2416

2417 /* Don't init kfd if whole hive need to be reset during init */

2418 if (!adev->gmc.xgmi.pending_reset)

2419 amdgpu_amdkfd_device_init(adev);

2420

2421 amdgpu_fru_get_product_info(adev);

2422

2423 init_failed:

2424 if (amdgpu_sriov_vf(adev))

2425 amdgpu_virt_release_full_gpu(adev, true);

2426

2427 return r;

2428 }

2429

2430 /**

2431 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer

2432 *

2433 * @adev: amdgpu_device pointer

2434 *

2435 * Writes a reset magic value to the gart pointer in VRAM. The driver calls

2436 * this function before a GPU reset. If the value is retained after a

2437 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents.

2438 */

2439 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)

2440 {

2441 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);

2442 }

2443

2444 /**

2445 * amdgpu_device_check_vram_lost - check if vram is valid

2446 *

2447 * @adev: amdgpu_device pointer

2448 *

2449 * Checks the reset magic value written to the gart pointer in VRAM.

2450 * The driver calls this after a GPU reset to see if the contents of

2451 * VRAM is lost or now.

2452 * returns true if vram is lost, false if not.

2453 */

2454 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)

2455 {

2456 if (memcmp(adev->gart.ptr, adev->reset_magic,

2457 AMDGPU_RESET_MAGIC_NUM))

2458 return true;

2459

2460 if (!amdgpu_in_reset(adev))

2461 return false;

2462

2463 /*

2464 * For all ASICs with baco/mode1 reset, the VRAM is

2465 * always assumed to be lost.

2466 */

2467 switch (amdgpu_asic_reset_method(adev)) {

2468 case AMD_RESET_METHOD_BACO:

2469 case AMD_RESET_METHOD_MODE1:

2470 return true;

2471 default:

2472 return false;

2473 }

2474 }

2475

2476 /**

2477 * amdgpu_device_set_cg_state - set clockgating for amdgpu device

2478 *

2479 * @adev: amdgpu_device pointer

2480 * @state: clockgating state (gate or ungate)

2481 *

2482 * The list of all the hardware IPs that make up the asic is walked and the

2483 * set_clockgating_state callbacks are run.

2484 * Late initialization pass enabling clockgating for hardware IPs.

2485 * Fini or suspend, pass disabling clockgating for hardware IPs.

2486 * Returns 0 on success, negative error code on failure.

2487 */

2488

2489 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,

2490 enum amd_clockgating_state state)

2491 {

2492 int i, j, r;

2493

2494 if (amdgpu_emu_mode == 1)

2495 return 0;

2496

2497 for (j = 0; j < adev->num_ip_blocks; j++) {

2498 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;

2499 if (!adev->ip_blocks[i].status.late_initialized)

2500 continue;

2501 /* skip CG for GFX on S0ix */

2502 if (adev->in_s0ix &&

2503 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)

2504 continue;

2505 /* skip CG for VCE/UVD, it's handled specially */

2506 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&

2507 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&

2508 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&

2509 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&

2510 adev->ip_blocks[i].version->funcs->set_clockgating_state) {

2511 /* enable clockgating to save power */

2512 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,

2513 state);

2514 if (r) {

2515 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",

2516 adev->ip_blocks[i].version->funcs->name, r);

2517 return r;

2518 }

2519 }

2520 }

2521

2522 return 0;

2523 }

2524

2525 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,

2526 enum amd_powergating_state state)

2527 {

2528 int i, j, r;

2529

2530 if (amdgpu_emu_mode == 1)

2531 return 0;

2532

2533 for (j = 0; j < adev->num_ip_blocks; j++) {

2534 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;

2535 if (!adev->ip_blocks[i].status.late_initialized)

2536 continue;

2537 /* skip PG for GFX on S0ix */

2538 if (adev->in_s0ix &&

2539 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)

2540 continue;

2541 /* skip CG for VCE/UVD, it's handled specially */

2542 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&

2543 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&

2544 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&

2545 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&

2546 adev->ip_blocks[i].version->funcs->set_powergating_state) {

2547 /* enable powergating to save power */

2548 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,

2549 state);

2550 if (r) {

2551 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",

2552 adev->ip_blocks[i].version->funcs->name, r);

2553 return r;

2554 }

2555 }

2556 }

2557 return 0;

2558 }

2559

2560 static int amdgpu_device_enable_mgpu_fan_boost(void)

2561 {

2562 struct amdgpu_gpu_instance *gpu_ins;

2563 struct amdgpu_device *adev;

2564 int i, ret = 0;

2565

2566 mutex_lock(&mgpu_info.mutex);

2567

2568 /*

2569 * MGPU fan boost feature should be enabled

2570 * only when there are two or more dGPUs in

2571 * the system

2572 */

2573 if (mgpu_info.num_dgpu < 2)

2574 goto out;

2575

2576 for (i = 0; i < mgpu_info.num_dgpu; i++) {

2577 gpu_ins = &(mgpu_info.gpu_ins[i]);

2578 adev = gpu_ins->adev;

2579 if (!(adev->flags & AMD_IS_APU) &&

2580 !gpu_ins->mgpu_fan_enabled) {

2581 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);

2582 if (ret)

2583 break;

2584

2585 gpu_ins->mgpu_fan_enabled = 1;

2586 }

2587 }

2588

2589 out:

2590 mutex_unlock(&mgpu_info.mutex);

2591

2592 return ret;

2593 }

2594

2595 /**

2596 * amdgpu_device_ip_late_init - run late init for hardware IPs

2597 *

2598 * @adev: amdgpu_device pointer

2599 *

2600 * Late initialization pass for hardware IPs. The list of all the hardware

2601 * IPs that make up the asic is walked and the late_init callbacks are run.

2602 * late_init covers any special initialization that an IP requires

2603 * after all of the have been initialized or something that needs to happen

2604 * late in the init process.

2605 * Returns 0 on success, negative error code on failure.

2606 */

2607 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)

2608 {

2609 struct amdgpu_gpu_instance *gpu_instance;

2610 int i = 0, r;

2611

2612 for (i = 0; i < adev->num_ip_blocks; i++) {

2613 if (!adev->ip_blocks[i].status.hw)

2614 continue;

2615 if (adev->ip_blocks[i].version->funcs->late_init) {

2616 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);

2617 if (r) {

2618 DRM_ERROR("late_init of IP block <%s> failed %d\n",

2619 adev->ip_blocks[i].version->funcs->name, r);

2620 return r;

2621 }

2622 }

2623 adev->ip_blocks[i].status.late_initialized = true;

2624 }

2625

2626 amdgpu_ras_set_error_query_ready(adev, true);

2627

2628 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);

2629 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

2630

2631 amdgpu_device_fill_reset_magic(adev);

2632

2633 r = amdgpu_device_enable_mgpu_fan_boost();

2634 if (r)

2635 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

2636

2637 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */

2638 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)||

2639 adev->asic_type == CHIP_ALDEBARAN ))

2640 smu_handle_passthrough_sbr(&adev->smu, true);

2641

2642 if (adev->gmc.xgmi.num_physical_nodes > 1) {

2643 mutex_lock(&mgpu_info.mutex);

2644

2645 /*

2646 * Reset device p-state to low as this was booted with high.

2647 *

2648 * This should be performed only after all devices from the same

2649 * hive get initialized.

2650 *

2651 * However, it's unknown how many device in the hive in advance.

2652 * As this is counted one by one during devices initializations.

2653 *

2654 * So, we wait for all XGMI interlinked devices initialized.

2655 * This may bring some delays as those devices may come from

2656 * different hives. But that should be OK.

2657 */

2658 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {

2659 for (i = 0; i < mgpu_info.num_gpu; i++) {

2660 gpu_instance = &(mgpu_info.gpu_ins[i]);

2661 if (gpu_instance->adev->flags & AMD_IS_APU)

2662 continue;

2663

2664 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,

2665 AMDGPU_XGMI_PSTATE_MIN);

2666 if (r) {

2667 DRM_ERROR("pstate setting failed (%d).\n", r);

2668 break;

2669 }

2670 }

2671 }

2672

2673 mutex_unlock(&mgpu_info.mutex);

2674 }

2675

2676 return 0;

2677 }

2678

2679 /**

2680 * amdgpu_device_smu_fini_early - smu hw_fini wrapper

2681 *

2682 * @adev: amdgpu_device pointer

2683 *

2684 * For ASICs need to disable SMC first

2685 */

2686 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)

2687 {

2688 int i, r;

2689

2690 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))

2691 return;

2692

2693 for (i = 0; i < adev->num_ip_blocks; i++) {

2694 if (!adev->ip_blocks[i].status.hw)

2695 continue;

2696 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {

2697 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);

2698 /* XXX handle errors */

2699 if (r) {

2700 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",

2701 adev->ip_blocks[i].version->funcs->name, r);

2702 }

2703 adev->ip_blocks[i].status.hw = false;

2704 break;

2705 }

2706 }

2707 }

2708

2709 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)

2710 {

2711 int i, r;

2712

2713 for (i = 0; i < adev->num_ip_blocks; i++) {

2714 if (!adev->ip_blocks[i].version->funcs->early_fini)

2715 continue;

2716

2717 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);

2718 if (r) {

2719 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",

2720 adev->ip_blocks[i].version->funcs->name, r);

2721 }

2722 }

2723

2724 amdgpu_amdkfd_suspend(adev, false);

2725

2726 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);

2727 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

2728

2729 /* Workaroud for ASICs need to disable SMC first */

2730 amdgpu_device_smu_fini_early(adev);

2731

2732 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {

2733 if (!adev->ip_blocks[i].status.hw)

2734 continue;

2735

2736 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);

2737 /* XXX handle errors */

2738 if (r) {

2739 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",

2740 adev->ip_blocks[i].version->funcs->name, r);

2741 }

2742

2743 adev->ip_blocks[i].status.hw = false;

2744 }

2745

2746 if (amdgpu_sriov_vf(adev)) {

2747 if (amdgpu_virt_release_full_gpu(adev, false))

2748 DRM_ERROR("failed to release exclusive mode on fini\n");

2749 }

2750

2751 return 0;

2752 }

2753

2754 /**

2755 * amdgpu_device_ip_fini - run fini for hardware IPs

2756 *

2757 * @adev: amdgpu_device pointer

2758 *

2759 * Main teardown pass for hardware IPs. The list of all the hardware

2760 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks

2761 * are run. hw_fini tears down the hardware associated with each IP

2762 * and sw_fini tears down any software state associated with each IP.

2763 * Returns 0 on success, negative error code on failure.

2764 */

2765 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)

2766 {

2767 int i, r;

2768

2769 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)

2770 amdgpu_virt_release_ras_err_handler_data(adev);

2771

2772 if (adev->gmc.xgmi.num_physical_nodes > 1)

2773 amdgpu_xgmi_remove_device(adev);

2774

2775 amdgpu_amdkfd_device_fini_sw(adev);

2776

2777 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {

2778 if (!adev->ip_blocks[i].status.sw)

2779 continue;

2780

2781 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {

2782 amdgpu_ucode_free_bo(adev);

2783 amdgpu_free_static_csa(&adev->virt.csa_obj);

2784 amdgpu_device_wb_fini(adev);

2785 amdgpu_device_vram_scratch_fini(adev);

2786 amdgpu_ib_pool_fini(adev);

2787 }

2788

2789 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);

2790 /* XXX handle errors */

2791 if (r) {

2792 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",

2793 adev->ip_blocks[i].version->funcs->name, r);

2794 }

2795 adev->ip_blocks[i].status.sw = false;

2796 adev->ip_blocks[i].status.valid = false;

2797 }

2798

2799 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {

2800 if (!adev->ip_blocks[i].status.late_initialized)

2801 continue;

2802 if (adev->ip_blocks[i].version->funcs->late_fini)

2803 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);

2804 adev->ip_blocks[i].status.late_initialized = false;

2805 }

2806

2807 amdgpu_ras_fini(adev);

2808

2809 return 0;

2810 }

2811

2812 /**

2813 * amdgpu_device_delayed_init_work_handler - work handler for IB tests

2814 *

2815 * @work: work_struct.

2816 */

2817 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)

2818 {

2819 struct amdgpu_device *adev =

2820 container_of(work, struct amdgpu_device, delayed_init_work.work);

2821 int r;

2822

2823 r = amdgpu_ib_ring_tests(adev);

2824 if (r)

2825 DRM_ERROR("ib ring test failed (%d).\n", r);

2826 }

2827

2828 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)

2829 {

2830 struct amdgpu_device *adev =

2831 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

2832

2833 WARN_ON_ONCE(adev->gfx.gfx_off_state);

2834 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);

2835

2836 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))

2837 adev->gfx.gfx_off_state = true;

2838 }

2839

2840 /**

2841 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)

2842 *

2843 * @adev: amdgpu_device pointer

2844 *

2845 * Main suspend function for hardware IPs. The list of all the hardware

2846 * IPs that make up the asic is walked, clockgating is disabled and the

2847 * suspend callbacks are run. suspend puts the hardware and software state

2848 * in each IP into a state suitable for suspend.

2849 * Returns 0 on success, negative error code on failure.

2850 */

2851 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)

2852 {

2853 int i, r;

2854

2855 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);

2856 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

2857

2858 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {

2859 if (!adev->ip_blocks[i].status.valid)

2860 continue;

2861

2862 /* displays are handled separately */

2863 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)

2864 continue;

2865

2866 /* XXX handle errors */

2867 r = adev->ip_blocks[i].version->funcs->suspend(adev);

2868 /* XXX handle errors */

2869 if (r) {

2870 DRM_ERROR("suspend of IP block <%s> failed %d\n",

2871 adev->ip_blocks[i].version->funcs->name, r);

2872 return r;

2873 }

2874

2875 adev->ip_blocks[i].status.hw = false;

2876 }

2877

2878 return 0;

2879 }

2880

2881 /**

2882 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)

2883 *

2884 * @adev: amdgpu_device pointer

2885 *

2886 * Main suspend function for hardware IPs. The list of all the hardware

2887 * IPs that make up the asic is walked, clockgating is disabled and the

2888 * suspend callbacks are run. suspend puts the hardware and software state

2889 * in each IP into a state suitable for suspend.

2890 * Returns 0 on success, negative error code on failure.

2891 */

2892 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)

2893 {

2894 int i, r;

2895

2896 if (adev->in_s0ix)

2897 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);

2898

2899 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {

2900 if (!adev->ip_blocks[i].status.valid)

2901 continue;

2902 /* displays are handled in phase1 */

2903 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)

2904 continue;

2905 /* PSP lost connection when err_event_athub occurs */

2906 if (amdgpu_ras_intr_triggered() &&

2907 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {

2908 adev->ip_blocks[i].status.hw = false;

2909 continue;

2910 }

2911

2912 /* skip unnecessary suspend if we do not initialize them yet */

2913 if (adev->gmc.xgmi.pending_reset &&

2914 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||

2915 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||

2916 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||

2917 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {

2918 adev->ip_blocks[i].status.hw = false;

2919 continue;

2920 }

2921

2922 /* skip suspend of gfx and psp for S0ix

2923 * gfx is in gfxoff state, so on resume it will exit gfxoff just

2924 * like at runtime. PSP is also part of the always on hardware

2925 * so no need to suspend it.

2926 */

2927 if (adev->in_s0ix &&

2928 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||

2929 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))

2930 continue;

2931

2932 /* XXX handle errors */

2933 r = adev->ip_blocks[i].version->funcs->suspend(adev);

2934 /* XXX handle errors */

2935 if (r) {

2936 DRM_ERROR("suspend of IP block <%s> failed %d\n",

2937 adev->ip_blocks[i].version->funcs->name, r);

2938 }

2939 adev->ip_blocks[i].status.hw = false;

2940 /* handle putting the SMC in the appropriate state */

2941 if(!amdgpu_sriov_vf(adev)){

2942 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {

2943 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);

2944 if (r) {

2945 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",

2946 adev->mp1_state, r);

2947 return r;

2948 }

2949 }

2950 }

2951 }

2952

2953 return 0;

2954 }

2955

2956 /**

2957 * amdgpu_device_ip_suspend - run suspend for hardware IPs

2958 *

2959 * @adev: amdgpu_device pointer

2960 *

2961 * Main suspend function for hardware IPs. The list of all the hardware

2962 * IPs that make up the asic is walked, clockgating is disabled and the

2963 * suspend callbacks are run. suspend puts the hardware and software state

2964 * in each IP into a state suitable for suspend.

2965 * Returns 0 on success, negative error code on failure.

2966 */

2967 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)

2968 {

2969 int r;

2970

2971 if (amdgpu_sriov_vf(adev)) {

2972 amdgpu_virt_fini_data_exchange(adev);

2973 amdgpu_virt_request_full_gpu(adev, false);

2974 }

2975

2976 r = amdgpu_device_ip_suspend_phase1(adev);

2977 if (r)

2978 return r;

2979 r = amdgpu_device_ip_suspend_phase2(adev);

2980

2981 if (amdgpu_sriov_vf(adev))

2982 amdgpu_virt_release_full_gpu(adev, false);

2983

2984 return r;

2985 }

2986

2987 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)

2988 {

2989 int i, r;

2990

2991 static enum amd_ip_block_type ip_order[] = {

2992 AMD_IP_BLOCK_TYPE_GMC,

2993 AMD_IP_BLOCK_TYPE_COMMON,

2994 AMD_IP_BLOCK_TYPE_PSP,

2995 AMD_IP_BLOCK_TYPE_IH,

2996 };

2997

2998 for (i = 0; i < adev->num_ip_blocks; i++) {

2999 int j;

3000 struct amdgpu_ip_block *block;

3001

3002 block = &adev->ip_blocks[i];

3003 block->status.hw = false;

3004

3005 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {

3006

3007 if (block->version->type != ip_order[j] ||

3008 !block->status.valid)

3009 continue;

3010

3011 r = block->version->funcs->hw_init(adev);

3012 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");

3013 if (r)

3014 return r;

3015 block->status.hw = true;

3016 }

3017 }

3018

3019 return 0;

3020 }

3021

3022 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)

3023 {

3024 int i, r;

3025

3026 static enum amd_ip_block_type ip_order[] = {

3027 AMD_IP_BLOCK_TYPE_SMC,

3028 AMD_IP_BLOCK_TYPE_DCE,

3029 AMD_IP_BLOCK_TYPE_GFX,

3030 AMD_IP_BLOCK_TYPE_SDMA,

3031 AMD_IP_BLOCK_TYPE_UVD,

3032 AMD_IP_BLOCK_TYPE_VCE,

3033 AMD_IP_BLOCK_TYPE_VCN

3034 };

3035

3036 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {

3037 int j;

3038 struct amdgpu_ip_block *block;

3039

3040 for (j = 0; j < adev->num_ip_blocks; j++) {

3041 block = &adev->ip_blocks[j];

3042

3043 if (block->version->type != ip_order[i] ||

3044 !block->status.valid ||

3045 block->status.hw)

3046 continue;

3047

3048 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)

3049 r = block->version->funcs->resume(adev);

3050 else

3051 r = block->version->funcs->hw_init(adev);

3052

3053 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");

3054 if (r)

3055 return r;

3056 block->status.hw = true;

3057 }

3058 }

3059

3060 return 0;

3061 }

3062

3063 /**

3064 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs

3065 *

3066 * @adev: amdgpu_device pointer

3067 *

3068 * First resume function for hardware IPs. The list of all the hardware

3069 * IPs that make up the asic is walked and the resume callbacks are run for

3070 * COMMON, GMC, and IH. resume puts the hardware into a functional state

3071 * after a suspend and updates the software state as necessary. This

3072 * function is also used for restoring the GPU after a GPU reset.

3073 * Returns 0 on success, negative error code on failure.

3074 */

3075 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)

3076 {

3077 int i, r;

3078

3079 for (i = 0; i < adev->num_ip_blocks; i++) {

3080 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)

3081 continue;

3082 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||

3083 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||

3084 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {

3085

3086 r = adev->ip_blocks[i].version->funcs->resume(adev);

3087 if (r) {

3088 DRM_ERROR("resume of IP block <%s> failed %d\n",

3089 adev->ip_blocks[i].version->funcs->name, r);

3090 return r;

3091 }

3092 adev->ip_blocks[i].status.hw = true;

3093 }

3094 }

3095

3096 return 0;

3097 }

3098

3099 /**

3100 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs

3101 *

3102 * @adev: amdgpu_device pointer

3103 *

3104 * First resume function for hardware IPs. The list of all the hardware

3105 * IPs that make up the asic is walked and the resume callbacks are run for

3106 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a

3107 * functional state after a suspend and updates the software state as

3108 * necessary. This function is also used for restoring the GPU after a GPU

3109 * reset.

3110 * Returns 0 on success, negative error code on failure.

3111 */

3112 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)

3113 {

3114 int i, r;

3115

3116 for (i = 0; i < adev->num_ip_blocks; i++) {

3117 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)

3118 continue;

3119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||

3120 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||

3121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||

3122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)

3123 continue;

3124 r = adev->ip_blocks[i].version->funcs->resume(adev);

3125 if (r) {

3126 DRM_ERROR("resume of IP block <%s> failed %d\n",

3127 adev->ip_blocks[i].version->funcs->name, r);

3128 return r;

3129 }

3130 adev->ip_blocks[i].status.hw = true;

3131 }

3132

3133 return 0;

3134 }

3135

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. Resume is split into two phases
 * because the phases are also used when recovering from a GPU reset, where
 * additional steps need to be taken between them. Here the steps simply
 * run back to back: KFD IOMMU resume, phase 1, firmware loading, phase 2.
 * The first step that fails ends the sequence.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int ret = amdgpu_amdkfd_resume_iommu(adev);

	if (!ret)
		ret = amdgpu_device_ip_resume_phase1(adev);
	if (!ret)
		ret = amdgpu_device_fw_loading(adev);
	if (!ret)
		ret = amdgpu_device_ip_resume_phase2(adev);

	return ret;
}

3168

3169 /**

3170 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV

3171 *

3172 * @adev: amdgpu_device pointer

3173 *

3174 * Query the VBIOS data tables to determine if the board supports SR-IOV.

3175 */

3176 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)

3177 {

3178 if (amdgpu_sriov_vf(adev)) {

3179 if (adev->is_atom_fw) {

3180 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))

3181 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;

3182 } else {

3183 if (amdgpu_atombios_has_gpu_virtualization_table(adev))

3184 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;

3185 }

3186

3187 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))

3188 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);

3189 }

3190 }

3191

3192 /**

3193 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic

3194 *

3195 * @asic_type: AMD asic type

3196 *

3197 * Check if there is DC (new modesetting infrastructre) support for an asic.

3198 * returns true if DC has support, false if not.

3199 */

3200 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)

3201 {

3202 switch (asic_type) {

3203 #ifdef CONFIG_DRM_AMDGPU_SI

3204 case CHIP_HAINAN:

3205 #endif

3206 case CHIP_TOPAZ:

3207 /* chips with no display hardware */

3208 return false;

3209 #if defined(CONFIG_DRM_AMD_DC)

3210 case CHIP_TAHITI:

3211 case CHIP_PITCAIRN:

3212 case CHIP_VERDE:

3213 case CHIP_OLAND:

3214 /*

3215 * We have systems in the wild with these ASICs that require

3216 * LVDS and VGA support which is not supported with DC.

3217 *

3218 * Fallback to the non-DC driver here by default so as not to

3219 * cause regressions.

3220 */

3221 #if defined(CONFIG_DRM_AMD_DC_SI)

3222 return amdgpu_dc > 0;

3223 #else

3224 return false;

3225 #endif

3226 case CHIP_BONAIRE:

3227 case CHIP_KAVERI:

3228 case CHIP_KABINI:

3229 case CHIP_MULLINS:

3230 /*

3231 * We have systems in the wild with these ASICs that require

3232 * LVDS and VGA support which is not supported with DC.

3233 *

3234 * Fallback to the non-DC driver here by default so as not to

3235 * cause regressions.

3236 */

3237 return amdgpu_dc > 0;

3238 case CHIP_HAWAII:

3239 case CHIP_CARRIZO:

3240 case CHIP_STONEY:

3241 case CHIP_POLARIS10:

3242 case CHIP_POLARIS11:

3243 case CHIP_POLARIS12:

3244 case CHIP_VEGAM:

3245 case CHIP_TONGA:

3246 case CHIP_FIJI:

3247 case CHIP_VEGA10:

3248 case CHIP_VEGA12:

3249 case CHIP_VEGA20:

3250 #if defined(CONFIG_DRM_AMD_DC_DCN)

3251 case CHIP_RAVEN:

3252 case CHIP_NAVI10:

3253 case CHIP_NAVI14:

3254 case CHIP_NAVI12:

3255 case CHIP_RENOIR:

3256 case CHIP_CYAN_SKILLFISH:

3257 case CHIP_SIENNA_CICHLID:

3258 case CHIP_NAVY_FLOUNDER:

3259 case CHIP_DIMGREY_CAVEFISH:

3260 case CHIP_BEIGE_GOBY:

3261 case CHIP_VANGOGH:

3262 case CHIP_YELLOW_CARP:

3263 #endif

3264 default:

3265 return amdgpu_dc != 0;

3266 #else

3267 default:

3268 if (amdgpu_dc > 0)

3269 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "

3270 "but isn't supported by ASIC, ignoring\n");

3271 return false;

3272 #endif

3273 }

3274 }

3275

3276 /**

3277 * amdgpu_device_has_dc_support - check if dc is supported

3278 *

3279 * @adev: amdgpu_device pointer

3280 *

3281 * Returns true for supported, false for not supported

3282 */

3283 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)

3284 {

3285 if (amdgpu_sriov_vf(adev) ||

3286 adev->enable_virtual_display ||

3287 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))

3288 return false;

3289

3290 return amdgpu_device_asic_has_dc_support(adev->asic_type);

3291 }

3292

3293 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

3294 {

3295 struct amdgpu_device *adev =

3296 container_of(__work, struct amdgpu_device, xgmi_reset_work);

3297 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

3298

3299 /* It's a bug to not have a hive within this function */

3300 if (WARN_ON(!hive))

3301 return;

3302

3303 /*

3304 * Use task barrier to synchronize all xgmi reset works across the

3305 * hive. task_barrier_enter and task_barrier_exit will block

3306 * until all the threads running the xgmi reset works reach

3307 * those points. task_barrier_full will do both blocks.

3308 */

3309 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

3310

3311 task_barrier_enter(&hive->tb);

3312 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));

3313

3314 if (adev->asic_reset_res)

3315 goto fail;

3316

3317 task_barrier_exit(&hive->tb);

3318 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));

3319

3320 if (adev->asic_reset_res)

3321 goto fail;

3322

3323 if (adev->mmhub.ras_funcs &&

3324 adev->mmhub.ras_funcs->reset_ras_error_count)

3325 adev->mmhub.ras_funcs->reset_ras_error_count(adev);

3326 } else {

3327

3328 task_barrier_full(&hive->tb);

3329 adev->asic_reset_res = amdgpu_asic_reset(adev);

3330 }

3331

3332 fail:

3333 if (adev->asic_reset_res)

3334 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

3335 adev->asic_reset_res, adev_to_drm(adev)->unique);

3336 amdgpu_put_xgmi_hive(hive);

3337 }

3338

3339 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)

3340 {

3341 char *input = amdgpu_lockup_timeout;

3342 char *timeout_setting = NULL;

3343 int index = 0;

3344 long timeout;

3345 int ret = 0;

3346

3347 /*

3348 * By default timeout for non compute jobs is 10000

3349 * and 60000 for compute jobs.

3350 * In SR-IOV or passthrough mode, timeout for compute

3351 * jobs are 60000 by default.

3352 */

3353 adev->gfx_timeout = msecs_to_jiffies(10000);

3354 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;

3355 if (amdgpu_sriov_vf(adev))

3356 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?

3357 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);

3358 else

3359 adev->compute_timeout = msecs_to_jiffies(60000);

3360

3361 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {

3362 while ((timeout_setting = strsep(&input, ",")) &&

3363 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {

3364 ret = kstrtol(timeout_setting, 0, &timeout);

3365 if (ret)

3366 return ret;

3367

3368 if (timeout == 0) {

3369 index++;

3370 continue;

3371 } else if (timeout < 0) {

3372 timeout = MAX_SCHEDULE_TIMEOUT;

3373 dev_warn(adev->dev, "lockup timeout disabled");

3374 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);

3375 } else {

3376 timeout = msecs_to_jiffies(timeout);

3377 }

3378

3379 switch (index++) {

3380 case 0:

3381 adev->gfx_timeout = timeout;

3382 break;

3383 case 1:

3384 adev->compute_timeout = timeout;

3385 break;

3386 case 2:

3387 adev->sdma_timeout = timeout;

3388 break;

3389 case 3:

3390 adev->video_timeout = timeout;

3391 break;

3392 default:

3393 break;

3394 }

3395 }

3396 /*

3397 * There is only one value specified and

3398 * it should apply to all non-compute jobs.

3399 */

3400 if (index == 1) {

3401 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;

3402 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))

3403 adev->compute_timeout = adev->gfx_timeout;

3404 }

3405 }

3406

3407 return ret;

3408 }

3409

3410 /**

3411 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU

3412 *

3413 * @adev: amdgpu_device pointer

3414 *

3415 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode

3416 */

3417 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)

3418 {

3419 struct iommu_domain *domain;

3420

3421 domain = iommu_get_domain_for_dev(adev->dev);

3422 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)

3423 adev->ram_is_direct_mapped = true;

3424 }

3425

3426 static const struct attribute *amdgpu_dev_attributes[] = {

3427 &dev_attr_product_name.attr,

3428 &dev_attr_product_number.attr,

3429 &dev_attr_serial_number.attr,

3430 &dev_attr_pcie_replay_count.attr,

3431 NULL

3432 };

3433

3434 /**

3435 * amdgpu_device_init - initialize the driver

3436 *

3437 * @adev: amdgpu_device pointer

3438 * @flags: driver flags

3439 *

3440 * Initializes the driver info and hw (all asics).

3441 * Returns 0 for success or an error on failure.

3442 * Called at driver startup.

3443 */

3444 int amdgpu_device_init(struct amdgpu_device *adev,

3445 uint32_t flags)

3446 {

3447 struct drm_device *ddev = adev_to_drm(adev);

3448 struct pci_dev *pdev = adev->pdev;

3449 int r, i;

3450 bool px = false;

3451 u32 max_MBps;

3452

3453 adev->shutdown = false;

3454 adev->flags = flags;

3455

3456 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)

3457 adev->asic_type = amdgpu_force_asic_type;

3458 else

3459 adev->asic_type = flags & AMD_ASIC_MASK;

3460

3461 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;

3462 if (amdgpu_emu_mode == 1)

3463 adev->usec_timeout *= 10;

3464 adev->gmc.gart_size = 512 * 1024 * 1024;

3465 adev->accel_working = false;

3466 adev->num_rings = 0;

3467 adev->mman.buffer_funcs = NULL;

3468 adev->mman.buffer_funcs_ring = NULL;

3469 adev->vm_manager.vm_pte_funcs = NULL;

3470 adev->vm_manager.vm_pte_num_scheds = 0;

3471 adev->gmc.gmc_funcs = NULL;

3472 adev->harvest_ip_mask = 0x0;

3473 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);

3474 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

3475

3476 adev->smc_rreg = &amdgpu_invalid_rreg;

3477 adev->smc_wreg = &amdgpu_invalid_wreg;

3478 adev->pcie_rreg = &amdgpu_invalid_rreg;

3479 adev->pcie_wreg = &amdgpu_invalid_wreg;

3480 adev->pciep_rreg = &amdgpu_invalid_rreg;

3481 adev->pciep_wreg = &amdgpu_invalid_wreg;

3482 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;

3483 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;

3484 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;

3485 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;

3486 adev->didt_rreg = &amdgpu_invalid_rreg;

3487 adev->didt_wreg = &amdgpu_invalid_wreg;

3488 adev->gc_cac_rreg = &amdgpu_invalid_rreg;

3489 adev->gc_cac_wreg = &amdgpu_invalid_wreg;

3490 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;

3491 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

3492

3493 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",

3494 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,

3495 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

3496

3497 /* mutex initialization are all done here so we

3498 * can recall function without having locking issues */

3499 mutex_init(&adev->firmware.mutex);

3500 mutex_init(&adev->pm.mutex);

3501 mutex_init(&adev->gfx.gpu_clock_mutex);

3502 mutex_init(&adev->srbm_mutex);

3503 mutex_init(&adev->gfx.pipe_reserve_mutex);

3504 mutex_init(&adev->gfx.gfx_off_mutex);

3505 mutex_init(&adev->grbm_idx_mutex);

3506 mutex_init(&adev->mn_lock);

3507 mutex_init(&adev->virt.vf_errors.lock);

3508 hash_init(adev->mn_hash);

3509 atomic_set(&adev->in_gpu_reset, 0);

3510 init_rwsem(&adev->reset_sem);

3511 mutex_init(&adev->psp.mutex);

3512 mutex_init(&adev->notifier_lock);

3513

3514 amdgpu_device_init_apu_flags(adev);

3515

3516 r = amdgpu_device_check_arguments(adev);

3517 if (r)

3518 return r;

3519

3520 spin_lock_init(&adev->mmio_idx_lock);

3521 spin_lock_init(&adev->smc_idx_lock);

3522 spin_lock_init(&adev->pcie_idx_lock);

3523 spin_lock_init(&adev->uvd_ctx_idx_lock);

3524 spin_lock_init(&adev->didt_idx_lock);

3525 spin_lock_init(&adev->gc_cac_idx_lock);

3526 spin_lock_init(&adev->se_cac_idx_lock);

3527 spin_lock_init(&adev->audio_endpt_idx_lock);

3528 spin_lock_init(&adev->mm_stats.lock);

3529

3530 INIT_LIST_HEAD(&adev->shadow_list);

3531 mutex_init(&adev->shadow_list_lock);

3532

3533 INIT_LIST_HEAD(&adev->reset_list);

3534

3535 INIT_DELAYED_WORK(&adev->delayed_init_work,

3536 amdgpu_device_delayed_init_work_handler);

3537 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,

3538 amdgpu_device_delay_enable_gfx_off);

3539

3540 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

3541

3542 adev->gfx.gfx_off_req_count = 1;

3543 adev->pm.ac_power = power_supply_is_system_supplied() > 0;

3544

3545 atomic_set(&adev->throttling_logging_enabled, 1);

3546 /*

3547 * If throttling continues, logging will be performed every minute

3548 * to avoid log flooding. "-1" is subtracted since the thermal

3549 * throttling interrupt comes every second. Thus, the total logging

3550 * interval is 59 seconds(retelimited printk interval) + 1(waiting

3551 * for throttling interrupt) = 60 seconds.

3552 */

3553 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

3554 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

3555

3556 /* Registers mapping */

3557 /* TODO: block userspace mapping of io register */

3558 if (adev->asic_type >= CHIP_BONAIRE) {

3559 adev->rmmio_base = pci_resource_start(adev->pdev, 5);

3560 adev->rmmio_size = pci_resource_len(adev->pdev, 5);

3561 } else {

3562 adev->rmmio_base = pci_resource_start(adev->pdev, 2);

3563 adev->rmmio_size = pci_resource_len(adev->pdev, 2);

3564 }

3565

3566 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)

3567 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

3568

3569 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);

3570 if (adev->rmmio == NULL) {

3571 return -ENOMEM;

3572 }

3573 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);

3574 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

3575

3576 amdgpu_device_get_pcie_info(adev);

3577

3578 if (amdgpu_mcbp)

3579 DRM_INFO("MCBP is enabled\n");

3580

3581 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)

3582 adev->enable_mes = true;

3583

3584 /* detect hw virtualization here */

3585 amdgpu_detect_virtualization(adev);

3586

3587 r = amdgpu_device_get_job_timeout_settings(adev);

3588 if (r) {

3589 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");

3590 return r;

3591 }

3592

3593 /* early init functions */

3594 r = amdgpu_device_ip_early_init(adev);

3595 if (r)

3596 return r;

3597

3598 /* Need to get xgmi info early to decide the reset behavior*/

3599 if (adev->gmc.xgmi.supported) {

3600 r = adev->gfxhub.funcs->get_xgmi_info(adev);

3601 if (r)

3602 return r;

3603 }

3604

3605 /* enable PCIE atomic ops */

3606 if (amdgpu_sriov_vf(adev))

3607 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)

3608 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags ==

3609 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);

3610 else

3611 adev->have_atomics_support =

3612 !pci_enable_atomic_ops_to_root(adev->pdev,

3613 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |

3614 PCI_EXP_DEVCAP2_ATOMIC_COMP64);

3615 if (!adev->have_atomics_support)

3616 dev_info(adev->dev, "PCIE atomic ops is not supported\n");

3617

3618 /* doorbell bar mapping and doorbell index init*/

3619 amdgpu_device_doorbell_init(adev);

3620

3621 if (amdgpu_emu_mode == 1) {

3622 /* post the asic on emulation mode */

3623 emu_soc_asic_init(adev);

3624 goto fence_driver_init;

3625 }

3626

3627 amdgpu_reset_init(adev);

3628

3629 /* detect if we are with an SRIOV vbios */

3630 amdgpu_device_detect_sriov_bios(adev);

3631

3632 /* check if we need to reset the asic

3633 * E.g., driver was not cleanly unloaded previously, etc.

3634 */

3635 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {

3636 if (adev->gmc.xgmi.num_physical_nodes) {

3637 dev_info(adev->dev, "Pending hive reset.\n");

3638 adev->gmc.xgmi.pending_reset = true;

3639 /* Only need to init necessary block for SMU to handle the reset */

3640 for (i = 0; i < adev->num_ip_blocks; i++) {

3641 if (!adev->ip_blocks[i].status.valid)

3642 continue;

3643 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||

3644 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||

3645 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||

3646 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {

3647 DRM_DEBUG("IP %s disabled for hw_init.\n",

3648 adev->ip_blocks[i].version->funcs->name);

3649 adev->ip_blocks[i].status.hw = true;

3650 }

3651 }

3652 } else {

3653 r = amdgpu_asic_reset(adev);

3654 if (r) {

3655 dev_err(adev->dev, "asic reset on init failed\n");

3656 goto failed;

3657 }

3658 }

3659 }

3660

3661 pci_enable_pcie_error_reporting(adev->pdev);

3662

3663 /* Post card if necessary */

3664 if (amdgpu_device_need_post(adev)) {

3665 if (!adev->bios) {

3666 dev_err(adev->dev, "no vBIOS found\n");

3667 r = -EINVAL;

3668 goto failed;

3669 }

3670 DRM_INFO("GPU posting now...\n");

3671 r = amdgpu_device_asic_init(adev);

3672 if (r) {

3673 dev_err(adev->dev, "gpu post error!\n");

3674 goto failed;

3675 }

3676 }

3677

3678 if (adev->is_atom_fw) {

3679 /* Initialize clocks */

3680 r = amdgpu_atomfirmware_get_clock_info(adev);

3681 if (r) {

3682 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");

3683 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);

3684 goto failed;

3685 }

3686 } else {

3687 /* Initialize clocks */

3688 r = amdgpu_atombios_get_clock_info(adev);

3689 if (r) {

3690 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");

3691 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);

3692 goto failed;

3693 }

3694 /* init i2c buses */

3695 if (!amdgpu_device_has_dc_support(adev))

3696 amdgpu_atombios_i2c_init(adev);

3697 }

3698

3699 fence_driver_init:

3700 /* Fence driver */

3701 r = amdgpu_fence_driver_sw_init(adev);

3702 if (r) {

3703 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");

3704 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);

3705 goto failed;

3706 }

3707

3708 /* init the mode config */

3709 drm_mode_config_init(adev_to_drm(adev));

3710

3711 r = amdgpu_device_ip_init(adev);

3712 if (r) {

3713 /* failed in exclusive mode due to timeout */

3714 if (amdgpu_sriov_vf(adev) &&

3715 !amdgpu_sriov_runtime(adev) &&

3716 amdgpu_virt_mmio_blocked(adev) &&

3717 !amdgpu_virt_wait_reset(adev)) {

3718 dev_err(adev->dev, "VF exclusive mode timeout\n");

3719 /* Don't send request since VF is inactive. */

3720 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;

3721 adev->virt.ops = NULL;

3722 r = -EAGAIN;

3723 goto release_ras_con;

3724 }

3725 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");

3726 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);

3727 goto release_ras_con;

3728 }

3729

3730 amdgpu_fence_driver_hw_init(adev);

3731

3732 dev_info(adev->dev,

3733 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",

3734 adev->gfx.config.max_shader_engines,

3735 adev->gfx.config.max_sh_per_se,

3736 adev->gfx.config.max_cu_per_sh,

3737 adev->gfx.cu_info.number);

3738

3739 adev->accel_working = true;

3740

3741 amdgpu_vm_check_compute_bug(adev);

3742

3743 /* Initialize the buffer migration limit. */

3744 if (amdgpu_moverate >= 0)

3745 max_MBps = amdgpu_moverate;

3746 else

3747 max_MBps = 8; /* Allow 8 MB/s. */

3748 /* Get a log2 for easy divisions. */

3749 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

3750

3751 r = amdgpu_pm_sysfs_init(adev);

3752 if (r) {

3753 adev->pm_sysfs_en = false;

3754 DRM_ERROR("registering pm debugfs failed (%d).\n", r);

3755 } else

3756 adev->pm_sysfs_en = true;

3757

3758 r = amdgpu_ucode_sysfs_init(adev);

3759 if (r) {

3760 adev->ucode_sysfs_en = false;

3761 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);

3762 } else

3763 adev->ucode_sysfs_en = true;

3764

3765 if ((amdgpu_testing & 1)) {

3766 if (adev->accel_working)

3767 amdgpu_test_moves(adev);

3768 else

3769 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");

3770 }

3771 if (amdgpu_benchmarking) {

3772 if (adev->accel_working)

3773 amdgpu_benchmark(adev, amdgpu_benchmarking);

3774 else

3775 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");

3776 }

3777

3778 /*

3779 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.

3780 * Otherwise the mgpu fan boost feature will be skipped due to the

3781 * gpu instance is counted less.

3782 */

3783 amdgpu_register_gpu_instance(adev);

3784

3785 /* enable clockgating, etc. after ib tests, etc. since some blocks require

3786 * explicit gating rather than handling it automatically.

3787 */

3788 if (!adev->gmc.xgmi.pending_reset) {

3789 r = amdgpu_device_ip_late_init(adev);

3790 if (r) {

3791 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");

3792 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);

3793 goto release_ras_con;

3794 }

3795 /* must succeed. */

3796 amdgpu_ras_resume(adev);

3797 queue_delayed_work(system_wq, &adev->delayed_init_work,

3798 msecs_to_jiffies(AMDGPU_RESUME_MS));

3799 }

3800

3801 if (amdgpu_sriov_vf(adev))

3802 flush_delayed_work(&adev->delayed_init_work);

3803

3804 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);

3805 if (r)

3806 dev_err(adev->dev, "Could not create amdgpu device attr\n");

3807

3808 if (IS_ENABLED(CONFIG_PERF_EVENTS))

3809 r = amdgpu_pmu_init(adev);

3810 if (r)

3811 dev_err(adev->dev, "amdgpu_pmu_init failed\n");

3812

3813 /* Have stored pci confspace at hand for restore in sudden PCI error */

3814 if (amdgpu_device_cache_pci_state(adev->pdev))

3815 pci_restore_state(pdev);

3816

3817 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */

3818 /* this will fail for cards that aren't VGA class devices, just

3819 * ignore it */

3820 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)

3821 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

3822

3823 if (amdgpu_device_supports_px(ddev)) {

3824 px = true;

3825 vga_switcheroo_register_client(adev->pdev,

3826 &amdgpu_switcheroo_ops, px);

3827 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

3828 }

3829

3830 if (adev->gmc.xgmi.pending_reset)

3831 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,

3832 msecs_to_jiffies(AMDGPU_RESUME_MS));

3833

3834 amdgpu_device_check_iommu_direct_map(adev);

3835

3836 return 0;

3837

3838 release_ras_con:

3839 amdgpu_release_ras_context(adev);

3840

3841 failed:

3842 amdgpu_vf_error_trans_all(adev);

3843

3844 return r;

3845 }

3846

3847 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)

3848 {

3849

3850 /* Clear all CPU mappings pointing to this device */

3851 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

3852

3853 /* Unmap all mapped bars - Doorbell, registers and VRAM */

3854 amdgpu_device_doorbell_fini(adev);

3855

3856 iounmap(adev->rmmio);

3857 adev->rmmio = NULL;

3858 if (adev->mman.aper_base_kaddr)

3859 iounmap(adev->mman.aper_base_kaddr);

3860 adev->mman.aper_base_kaddr = NULL;

3861

3862 /* Memory manager related */

3863 if (!adev->gmc.xgmi.connected_to_cpu) {

3864 arch_phys_wc_del(adev->gmc.vram_mtrr);

3865 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);

3866 }

3867 }

3868

3869 /**

3870 * amdgpu_device_fini_hw - tear down the driver

3871 *

3872 * @adev: amdgpu_device pointer

3873 *

3874 * Tear down the driver info (all asics).

3875 * Called at driver shutdown.

3876 */

3877 void amdgpu_device_fini_hw(struct amdgpu_device *adev)

3878 {

3879 dev_info(adev->dev, "amdgpu: finishing device.\n");

3880 flush_delayed_work(&adev->delayed_init_work);

3881 if (adev->mman.initialized) {

3882 flush_delayed_work(&adev->mman.bdev.wq);

3883 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);

3884 }

3885 adev->shutdown = true;

3886

3887 /* make sure IB test finished before entering exclusive mode

3888 * to avoid preemption on IB test

3889 * */

3890 if (amdgpu_sriov_vf(adev)) {

3891 amdgpu_virt_request_full_gpu(adev, false);

3892 amdgpu_virt_fini_data_exchange(adev);

3893 }

3894

3895 /* disable all interrupts */

3896 amdgpu_irq_disable_all(adev);

3897 if (adev->mode_info.mode_config_initialized){

3898 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))

3899 drm_helper_force_disable_all(adev_to_drm(adev));

3900 else

3901 drm_atomic_helper_shutdown(adev_to_drm(adev));

3902 }

3903 amdgpu_fence_driver_hw_fini(adev);

3904

3905 if (adev->pm_sysfs_en)

3906 amdgpu_pm_sysfs_fini(adev);

3907 if (adev->ucode_sysfs_en)

3908 amdgpu_ucode_sysfs_fini(adev);

3909 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);

3910

3911 /* disable ras feature must before hw fini */

3912 amdgpu_ras_pre_fini(adev);

3913

3914 amdgpu_device_ip_fini_early(adev);

3915

3916 amdgpu_irq_fini_hw(adev);

3917

3918 if (adev->mman.initialized)

3919 ttm_device_clear_dma_mappings(&adev->mman.bdev);

3920

3921 amdgpu_gart_dummy_page_fini(adev);

3922

3923 if (drm_dev_is_unplugged(adev_to_drm(adev)))

3924 amdgpu_device_unmap_mmio(adev);

3925

3926 }

3927

3928 void amdgpu_device_fini_sw(struct amdgpu_device *adev)

3929 {

3930 int idx;

3931

3932 amdgpu_fence_driver_sw_fini(adev);

3933 amdgpu_device_ip_fini(adev);

3934 release_firmware(adev->firmware.gpu_info_fw);

3935 adev->firmware.gpu_info_fw = NULL;

3936 adev->accel_working = false;

3937

3938 amdgpu_reset_fini(adev);

3939

3940 /* free i2c buses */

3941 if (!amdgpu_device_has_dc_support(adev))

3942 amdgpu_i2c_fini(adev);

3943

3944 if (amdgpu_emu_mode != 1)

3945 amdgpu_atombios_fini(adev);

3946

3947 kfree(adev->bios);

3948 adev->bios = NULL;

3949 if (amdgpu_device_supports_px(adev_to_drm(adev))) {

3950 vga_switcheroo_unregister_client(adev->pdev);

3951 vga_switcheroo_fini_domain_pm_ops(adev->dev);

3952 }

3953 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)

3954 vga_client_unregister(adev->pdev);

3955

3956 if (drm_dev_enter(adev_to_drm(adev), &idx)) {

3957

3958 iounmap(adev->rmmio);

3959 adev->rmmio = NULL;

3960 amdgpu_device_doorbell_fini(adev);

3961 drm_dev_exit(idx);

3962 }

3963

3964 if (IS_ENABLED(CONFIG_PERF_EVENTS))

3965 amdgpu_pmu_fini(adev);

3966 if (adev->mman.discovery_bin)

3967 amdgpu_discovery_fini(adev);

3968

3969 kfree(adev->pci_state);

3970

3971 }

3972

3973 /**

3974 * amdgpu_device_evict_resources - evict device resources

3975 * @adev: amdgpu device object

3976 *

3977 * Evicts all ttm device resources(vram BOs, gart table) from the lru list

3978 * of the vram memory type. Mainly used for evicting device resources

3979 * at suspend time.

3980 *

3981 */

3982 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)

3983 {

3984 /* No need to evict vram on APUs for suspend to ram or s2idle */

3985 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))

3986 return;

3987

3988 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))

3989 DRM_WARN("evicting device resources failed\n");

3990

3991 }

3992

3993 /*

3994 * Suspend & resume.

3995 */

3996 /**

3997 * amdgpu_device_suspend - initiate device suspend

3998 *

3999 * @dev: drm dev pointer

4000 * @fbcon : notify the fbdev of suspend

4001 *

4002 * Puts the hw in the suspend state (all asics).

4003 * Returns 0 for success or an error on failure.

4004 * Called at driver suspend.

4005 */

4006 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)

4007 {

4008 struct amdgpu_device *adev = drm_to_adev(dev);

4009

4010 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)

4011 return 0;

4012

4013 adev->in_suspend = true;

4014

4015 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))

4016 DRM_WARN("smart shift update failed\n");

4017

4018 drm_kms_helper_poll_disable(dev);

4019

4020 if (fbcon)

4021 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);

4022

4023 cancel_delayed_work_sync(&adev->delayed_init_work);

4024

4025 amdgpu_ras_suspend(adev);

4026

4027 amdgpu_device_ip_suspend_phase1(adev);

4028

4029 if (!adev->in_s0ix)

4030 amdgpu_amdkfd_suspend(adev, adev->in_runpm);

4031

4032 amdgpu_device_evict_resources(adev);

4033

4034 amdgpu_fence_driver_hw_fini(adev);

4035

4036 amdgpu_device_ip_suspend_phase2(adev);

4037

4038 return 0;

4039 }

4040

4041 /**

4042 * amdgpu_device_resume - initiate device resume

4043 *

4044 * @dev: drm dev pointer

4045 * @fbcon : notify the fbdev of resume

4046 *

4047 * Bring the hw back to operating state (all asics).

4048 * Returns 0 for success or an error on failure.

4049 * Called at driver resume.

4050 */

4051 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)

4052 {

4053 struct amdgpu_device *adev = drm_to_adev(dev);

4054 int r = 0;

4055

4056 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)

4057 return 0;

4058

4059 if (adev->in_s0ix)

4060 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);

4061

4062 /* post card */

4063 if (amdgpu_device_need_post(adev)) {

4064 r = amdgpu_device_asic_init(adev);

4065 if (r)

4066 dev_err(adev->dev, "amdgpu asic init failed\n");

4067 }

4068

4069 r = amdgpu_device_ip_resume(adev);

4070 if (r) {

4071 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);

4072 return r;

4073 }

4074 amdgpu_fence_driver_hw_init(adev);

4075

4076 r = amdgpu_device_ip_late_init(adev);

4077 if (r)

4078 return r;

4079

4080 queue_delayed_work(system_wq, &adev->delayed_init_work,

4081 msecs_to_jiffies(AMDGPU_RESUME_MS));

4082

4083 if (!adev->in_s0ix) {

4084 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);

4085 if (r)

4086 return r;

4087 }

4088

4089 /* Make sure IB tests flushed */

4090 flush_delayed_work(&adev->delayed_init_work);

4091

4092 if (fbcon)

4093 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);

4094

4095 drm_kms_helper_poll_enable(dev);

4096

4097 amdgpu_ras_resume(adev);

4098

4099 /*

4100 * Most of the connector probing functions try to acquire runtime pm

4101 * refs to ensure that the GPU is powered on when connector polling is

4102 * performed. Since we're calling this from a runtime PM callback,

4103 * trying to acquire rpm refs will cause us to deadlock.

4104 *

4105 * Since we're guaranteed to be holding the rpm lock, it's safe to

4106 * temporarily disable the rpm helpers so this doesn't deadlock us.

4107 */

4108 #ifdef CONFIG_PM

4109 dev->dev->power.disable_depth++;

4110 #endif

4111 if (!amdgpu_device_has_dc_support(adev))

4112 drm_helper_hpd_irq_event(dev);

4113 else

4114 drm_kms_helper_hotplug_event(dev);

4115 #ifdef CONFIG_PM

4116 dev->dev->power.disable_depth--;

4117 #endif

4118 adev->in_suspend = false;

4119

4120 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))

4121 DRM_WARN("smart shift update failed\n");

4122

4123 return 0;

4124 }

4125

4126 /**

4127 * amdgpu_device_ip_check_soft_reset - did soft reset succeed

4128 *

4129 * @adev: amdgpu_device pointer

4130 *

4131 * The list of all the hardware IPs that make up the asic is walked and

4132 * the check_soft_reset callbacks are run. check_soft_reset determines

4133 * if the asic is still hung or not.

4134 * Returns true if any of the IPs are still in a hung state, false if not.

4135 */

4136 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)

4137 {

4138 int i;

4139 bool asic_hang = false;

4140

4141 if (amdgpu_sriov_vf(adev))

4142 return true;

4143

4144 if (amdgpu_asic_need_full_reset(adev))

4145 return true;

4146

4147 for (i = 0; i < adev->num_ip_blocks; i++) {

4148 if (!adev->ip_blocks[i].status.valid)

4149 continue;

4150 if (adev->ip_blocks[i].version->funcs->check_soft_reset)

4151 adev->ip_blocks[i].status.hang =

4152 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);

4153 if (adev->ip_blocks[i].status.hang) {

4154 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);

4155 asic_hang = true;

4156 }

4157 }

4158 return asic_hang;

4159 }

4160

4161 /**

4162 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset

4163 *

4164 * @adev: amdgpu_device pointer

4165 *

4166 * The list of all the hardware IPs that make up the asic is walked and the

4167 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset

4168 * handles any IP specific hardware or software state changes that are

4169 * necessary for a soft reset to succeed.

4170 * Returns 0 on success, negative error code on failure.

4171 */

4172 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)

4173 {

4174 int i, r = 0;

4175

4176 for (i = 0; i < adev->num_ip_blocks; i++) {

4177 if (!adev->ip_blocks[i].status.valid)

4178 continue;

4179 if (adev->ip_blocks[i].status.hang &&

4180 adev->ip_blocks[i].version->funcs->pre_soft_reset) {

4181 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);

4182 if (r)

4183 return r;

4184 }

4185 }

4186

4187 return 0;

4188 }

4189

4190 /**

4191 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed

4192 *

4193 * @adev: amdgpu_device pointer

4194 *

4195 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu

4196 * reset is necessary to recover.

4197 * Returns true if a full asic reset is required, false if not.

4198 */

4199 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)

4200 {

4201 int i;

4202

4203 if (amdgpu_asic_need_full_reset(adev))

4204 return true;

4205

4206 for (i = 0; i < adev->num_ip_blocks; i++) {

4207 if (!adev->ip_blocks[i].status.valid)

4208 continue;

4209 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||

4210 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||

4211 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||

4212 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||

4213 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {

4214 if (adev->ip_blocks[i].status.hang) {

4215 dev_info(adev->dev, "Some block need full reset!\n");

4216 return true;

4217 }

4218 }

4219 }

4220 return false;

4221 }

4222

4223 /**

4224 * amdgpu_device_ip_soft_reset - do a soft reset

4225 *

4226 * @adev: amdgpu_device pointer

4227 *

4228 * The list of all the hardware IPs that make up the asic is walked and the

4229 * soft_reset callbacks are run if the block is hung. soft_reset handles any

4230 * IP specific hardware or software state changes that are necessary to soft

4231 * reset the IP.

4232 * Returns 0 on success, negative error code on failure.

4233 */

4234 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)

4235 {

4236 int i, r = 0;

4237

4238 for (i = 0; i < adev->num_ip_blocks; i++) {

4239 if (!adev->ip_blocks[i].status.valid)

4240 continue;

4241 if (adev->ip_blocks[i].status.hang &&

4242 adev->ip_blocks[i].version->funcs->soft_reset) {

4243 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);

4244 if (r)

4245 return r;

4246 }

4247 }

4248

4249 return 0;

4250 }

4251

4252 /**

4253 * amdgpu_device_ip_post_soft_reset - clean up from soft reset

4254 *

4255 * @adev: amdgpu_device pointer

4256 *

4257 * The list of all the hardware IPs that make up the asic is walked and the

4258 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset

4259 * handles any IP specific hardware or software state changes that are

4260 * necessary after the IP has been soft reset.

4261 * Returns 0 on success, negative error code on failure.

4262 */

4263 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)

4264 {

4265 int i, r = 0;

4266

4267 for (i = 0; i < adev->num_ip_blocks; i++) {

4268 if (!adev->ip_blocks[i].status.valid)

4269 continue;

4270 if (adev->ip_blocks[i].status.hang &&

4271 adev->ip_blocks[i].version->funcs->post_soft_reset)

4272 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);

4273 if (r)

4274 return r;

4275 }

4276

4277 return 0;

4278 }

4279

4280 /**

4281 * amdgpu_device_recover_vram - Recover some VRAM contents

4282 *

4283 * @adev: amdgpu_device pointer

4284 *

4285 * Restores the contents of VRAM buffers from the shadows in GTT. Used to

4286 * restore things like GPUVM page tables after a GPU reset where

4287 * the contents of VRAM might be lost.

4288 *

4289 * Returns:

4290 * 0 on success, negative error code on failure.

4291 */

4292 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)

4293 {

4294 struct dma_fence *fence = NULL, *next = NULL;

4295 struct amdgpu_bo *shadow;

4296 struct amdgpu_bo_vm *vmbo;

4297 long r = 1, tmo;

4298

4299 if (amdgpu_sriov_runtime(adev))

4300 tmo = msecs_to_jiffies(8000);

4301 else

4302 tmo = msecs_to_jiffies(100);

4303

4304 dev_info(adev->dev, "recover vram bo from shadow start\n");

4305 mutex_lock(&adev->shadow_list_lock);

4306 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {

4307 shadow = &vmbo->bo;

4308 /* No need to recover an evicted BO */

4309 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||

4310 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||

4311 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)

4312 continue;

4313

4314 r = amdgpu_bo_restore_shadow(shadow, &next);

4315 if (r)

4316 break;

4317

4318 if (fence) {

4319 tmo = dma_fence_wait_timeout(fence, false, tmo);

4320 dma_fence_put(fence);

4321 fence = next;

4322 if (tmo == 0) {

4323 r = -ETIMEDOUT;

4324 break;

4325 } else if (tmo < 0) {

4326 r = tmo;

4327 break;

4328 }

4329 } else {

4330 fence = next;

4331 }

4332 }

4333 mutex_unlock(&adev->shadow_list_lock);

4334

4335 if (fence)

4336 tmo = dma_fence_wait_timeout(fence, false, tmo);

4337 dma_fence_put(fence);

4338

4339 if (r < 0 || tmo <= 0) {

4340 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);

4341 return -EIO;

4342 }

4343

4344 dev_info(adev->dev, "recover vram bo from shadow done\n");

4345 return 0;

4346 }

4347

4348

4349 /**

4350 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf

4351 *

4352 * @adev: amdgpu_device pointer

4353 * @from_hypervisor: request from hypervisor

4354 *

4355 * do VF FLR and reinitialize Asic

4356 * return 0 means succeeded otherwise failed

4357 */

4358 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,

4359 bool from_hypervisor)

4360 {

4361 int r;

4362 struct amdgpu_hive_info *hive = NULL;

4363

4364 amdgpu_amdkfd_pre_reset(adev);

4365

4366 amdgpu_amdkfd_pre_reset(adev);

4367

4368 if (from_hypervisor)

4369 r = amdgpu_virt_request_full_gpu(adev, true);

4370 else

4371 r = amdgpu_virt_reset_gpu(adev);

4372 if (r)

4373 return r;

4374

4375 /* Resume IP prior to SMC */

4376 r = amdgpu_device_ip_reinit_early_sriov(adev);

4377 if (r)

4378 goto error;

4379

4380 amdgpu_virt_init_data_exchange(adev);

4381

4382 r = amdgpu_device_fw_loading(adev);

4383 if (r)

4384 return r;

4385

4386 /* now we are okay to resume SMC/CP/SDMA */

4387 r = amdgpu_device_ip_reinit_late_sriov(adev);

4388 if (r)

4389 goto error;

4390

4391 hive = amdgpu_get_xgmi_hive(adev);

4392 /* Update PSP FW topology after reset */

4393 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)

4394 r = amdgpu_xgmi_update_topology(hive, adev);

4395

4396 if (hive)

4397 amdgpu_put_xgmi_hive(hive);

4398

4399 if (!r) {

4400 amdgpu_irq_gpu_reset_resume_helper(adev);

4401 r = amdgpu_ib_ring_tests(adev);

4402 amdgpu_amdkfd_post_reset(adev);

4403 }

4404

4405 error:

4406 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {

4407 amdgpu_inc_vram_lost(adev);

4408 r = amdgpu_device_recover_vram(adev);

4409 }

4410 amdgpu_virt_release_full_gpu(adev, true);

4411

4412 return r;

4413 }

4414

4415 /**

4416 * amdgpu_device_has_job_running - check if there is any job in mirror list

4417 *

4418 * @adev: amdgpu_device pointer

4419 *

4420 * check if there is any job in mirror list

4421 */

4422 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)

4423 {

4424 int i;

4425 struct drm_sched_job *job;

4426

4427 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

4428 struct amdgpu_ring *ring = adev->rings[i];

4429

4430 if (!ring || !ring->sched.thread)

4431 continue;

4432

4433 spin_lock(&ring->sched.job_list_lock);

4434 job = list_first_entry_or_null(&ring->sched.pending_list,

4435 struct drm_sched_job, list);

4436 spin_unlock(&ring->sched.job_list_lock);

4437 if (job)

4438 return true;

4439 }

4440 return false;

4441 }

4442

4443 /**

4444 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery

4445 *

4446 * @adev: amdgpu_device pointer

4447 *

4448 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover

4449 * a hung GPU.

4450 */

4451 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)

4452 {

4453 if (!amdgpu_device_ip_check_soft_reset(adev)) {

4454 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");

4455 return false;

4456 }

4457

4458 if (amdgpu_gpu_recovery == 0)

4459 goto disabled;

4460

4461 if (amdgpu_sriov_vf(adev))

4462 return true;

4463

4464 if (amdgpu_gpu_recovery == -1) {

4465 switch (adev->asic_type) {

4466 #ifdef CONFIG_DRM_AMDGPU_SI

4467 case CHIP_VERDE:

4468 case CHIP_TAHITI:

4469 case CHIP_PITCAIRN:

4470 case CHIP_OLAND:

4471 case CHIP_HAINAN:

4472 #endif

4473 #ifdef CONFIG_DRM_AMDGPU_CIK

4474 case CHIP_KAVERI:

4475 case CHIP_KABINI:

4476 case CHIP_MULLINS:

4477 #endif

4478 case CHIP_CARRIZO:

4479 case CHIP_STONEY:

4480 case CHIP_CYAN_SKILLFISH:

4481 goto disabled;

4482 default:

4483 break;

4484 }

4485 }

4486

4487 return true;

4488

4489 disabled:

4490 dev_info(adev->dev, "GPU recovery disabled.\n");

4491 return false;

4492 }

4493

4494 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)

4495 {

4496 u32 i;

4497 int ret = 0;

4498

4499 amdgpu_atombios_scratch_regs_engine_hung(adev, true);

4500

4501 dev_info(adev->dev, "GPU mode1 reset\n");

4502

4503 /* disable BM */

4504 pci_clear_master(adev->pdev);

4505

4506 amdgpu_device_cache_pci_state(adev->pdev);

4507

4508 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {

4509 dev_info(adev->dev, "GPU smu mode1 reset\n");

4510 ret = amdgpu_dpm_mode1_reset(adev);

4511 } else {

4512 dev_info(adev->dev, "GPU psp mode1 reset\n");

4513 ret = psp_gpu_reset(adev);

4514 }

4515

4516 if (ret)

4517 dev_err(adev->dev, "GPU mode1 reset failed\n");

4518

4519 amdgpu_device_load_pci_state(adev->pdev);

4520

4521 /* wait for asic to come out of reset */

4522 for (i = 0; i < adev->usec_timeout; i++) {

4523 u32 memsize = adev->nbio.funcs->get_memsize(adev);

4524

4525 if (memsize != 0xffffffff)

4526 break;

4527 udelay(1);

4528 }

4529

4530 amdgpu_atombios_scratch_regs_engine_hung(adev, false);

4531 return ret;

4532 }

4533

4534 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,

4535 struct amdgpu_reset_context *reset_context)

4536 {

4537 int i, r = 0;

4538 struct amdgpu_job *job = NULL;

4539 bool need_full_reset =

4540 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

4541

4542 if (reset_context->reset_req_dev == adev)

4543 job = reset_context->job;

4544

4545 if (amdgpu_sriov_vf(adev)) {

4546 /* stop the data exchange thread */

4547 amdgpu_virt_fini_data_exchange(adev);

4548 }

4549

4550 /* block all schedulers and reset given job's ring */

4551 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

4552 struct amdgpu_ring *ring = adev->rings[i];

4553

4554 if (!ring || !ring->sched.thread)

4555 continue;

4556

4557 /*clear job fence from fence drv to avoid force_completion

4558 *leave NULL and vm flush fence in fence drv */

4559 amdgpu_fence_driver_clear_job_fences(ring);

4560

4561 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */

4562 amdgpu_fence_driver_force_completion(ring);

4563 }

4564

4565 if (job && job->vm)

4566 drm_sched_increase_karma(&job->base);

4567

4568 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);

4569 /* If reset handler not implemented, continue; otherwise return */

4570 if (r == -ENOSYS)

4571 r = 0;

4572 else

4573 return r;

4574

4575 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */

4576 if (!amdgpu_sriov_vf(adev)) {

4577

4578 if (!need_full_reset)

4579 need_full_reset = amdgpu_device_ip_need_full_reset(adev);

4580

4581 if (!need_full_reset) {

4582 amdgpu_device_ip_pre_soft_reset(adev);

4583 r = amdgpu_device_ip_soft_reset(adev);

4584 amdgpu_device_ip_post_soft_reset(adev);

4585 if (r || amdgpu_device_ip_check_soft_reset(adev)) {

4586 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");

4587 need_full_reset = true;

4588 }

4589 }

4590

4591 if (need_full_reset)

4592 r = amdgpu_device_ip_suspend(adev);

4593 if (need_full_reset)

4594 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

4595 else

4596 clear_bit(AMDGPU_NEED_FULL_RESET,

4597 &reset_context->flags);

4598 }

4599

4600 return r;

4601 }

4602

4603 int amdgpu_do_asic_reset(struct list_head *device_list_handle,

4604 struct amdgpu_reset_context *reset_context)

4605 {

4606 struct amdgpu_device *tmp_adev = NULL;

4607 bool need_full_reset, skip_hw_reset, vram_lost = false;

4608 int r = 0;

4609

4610 /* Try reset handler method first */

4611 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,

4612 reset_list);

4613 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);

4614 /* If reset handler not implemented, continue; otherwise return */

4615 if (r == -ENOSYS)

4616 r = 0;

4617 else

4618 return r;

4619

4620 /* Reset handler not implemented, use the default method */

4621 need_full_reset =

4622 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

4623 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

4624

4625 /*

4626 * ASIC reset has to be done on all XGMI hive nodes ASAP

4627 * to allow proper links negotiation in FW (within 1 sec)

4628 */

4629 if (!skip_hw_reset && need_full_reset) {

4630 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

4631 /* For XGMI run all resets in parallel to speed up the process */

4632 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

4633 tmp_adev->gmc.xgmi.pending_reset = false;

4634 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))

4635 r = -EALREADY;

4636 } else

4637 r = amdgpu_asic_reset(tmp_adev);

4638

4639 if (r) {

4640 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",

4641 r, adev_to_drm(tmp_adev)->unique);

4642 break;

4643 }

4644 }

4645

4646 /* For XGMI wait for all resets to complete before proceed */

4647 if (!r) {

4648 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

4649 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {

4650 flush_work(&tmp_adev->xgmi_reset_work);

4651 r = tmp_adev->asic_reset_res;

4652 if (r)

4653 break;

4654 }

4655 }

4656 }

4657 }

4658

4659 if (!r && amdgpu_ras_intr_triggered()) {

4660 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

4661 if (tmp_adev->mmhub.ras_funcs &&

4662 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)

4663 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);

4664 }

4665

4666 amdgpu_ras_intr_cleared();

4667 }

4668

4669 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

4670 if (need_full_reset) {

4671 /* post card */

4672 r = amdgpu_device_asic_init(tmp_adev);

4673 if (r) {

4674 dev_warn(tmp_adev->dev, "asic atom init failed!");

4675 } else {

4676 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

4677 r = amdgpu_amdkfd_resume_iommu(tmp_adev);

4678 if (r)

4679 goto out;

4680

4681 r = amdgpu_device_ip_resume_phase1(tmp_adev);

4682 if (r)

4683 goto out;

4684

4685 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

4686 if (vram_lost) {

4687 DRM_INFO("VRAM is lost due to GPU reset!\n");

4688 amdgpu_inc_vram_lost(tmp_adev);

4689 }

4690

4691 r = amdgpu_device_fw_loading(tmp_adev);

4692 if (r)

4693 return r;

4694

4695 r = amdgpu_device_ip_resume_phase2(tmp_adev);

4696 if (r)

4697 goto out;

4698

4699 if (vram_lost)

4700 amdgpu_device_fill_reset_magic(tmp_adev);

4701

4702 /*

4703 * Add this ASIC as tracked as reset was already

4704 * complete successfully.

4705 */

4706 amdgpu_register_gpu_instance(tmp_adev);

4707

4708 if (!reset_context->hive &&

4709 tmp_adev->gmc.xgmi.num_physical_nodes > 1)

4710 amdgpu_xgmi_add_device(tmp_adev);

4711

4712 r = amdgpu_device_ip_late_init(tmp_adev);

4713 if (r)

4714 goto out;

4715

4716 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);

4717

4718 /*

4719 * The GPU enters bad state once faulty pages

4720 * by ECC has reached the threshold, and ras

4721 * recovery is scheduled next. So add one check

4722 * here to break recovery if it indeed exceeds

4723 * bad page threshold, and remind user to

4724 * retire this GPU or setting one bigger

4725 * bad_page_threshold value to fix this once

4726 * probing driver again.

4727 */

4728 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {

4729 /* must succeed. */

4730 amdgpu_ras_resume(tmp_adev);

4731 } else {

4732 r = -EINVAL;

4733 goto out;

4734 }

4735

4736 /* Update PSP FW topology after reset */

4737 if (reset_context->hive &&

4738 tmp_adev->gmc.xgmi.num_physical_nodes > 1)

4739 r = amdgpu_xgmi_update_topology(

4740 reset_context->hive, tmp_adev);

4741 }

4742 }

4743

4744 out:

4745 if (!r) {

4746 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);

4747 r = amdgpu_ib_ring_tests(tmp_adev);

4748 if (r) {

4749 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);

4750 need_full_reset = true;

4751 r = -EAGAIN;

4752 goto end;

4753 }

4754 }

4755

4756 if (!r)

4757 r = amdgpu_device_recover_vram(tmp_adev);

4758 else

4759 tmp_adev->asic_reset_res = r;

4760 }

4761

4762 end:

4763 if (need_full_reset)

4764 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

4765 else

4766 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

4767 return r;

4768 }

4769

4770 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,

4771 struct amdgpu_hive_info *hive)

4772 {

4773 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)

4774 return false;

4775

4776 if (hive) {

4777 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);

4778 } else {

4779 down_write(&adev->reset_sem);

4780 }

4781

4782 switch (amdgpu_asic_reset_method(adev)) {

4783 case AMD_RESET_METHOD_MODE1:

4784 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;

4785 break;

4786 case AMD_RESET_METHOD_MODE2:

4787 adev->mp1_state = PP_MP1_STATE_RESET;

4788 break;

4789 default:

4790 adev->mp1_state = PP_MP1_STATE_NONE;

4791 break;

4792 }

4793

4794 return true;

4795 }

4796

4797 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)

4798 {

4799 amdgpu_vf_error_trans_all(adev);

4800 adev->mp1_state = PP_MP1_STATE_NONE;

4801 atomic_set(&adev->in_gpu_reset, 0);

4802 up_write(&adev->reset_sem);

4803 }

4804

4805 /*

4806 * to lockup a list of amdgpu devices in a hive safely, if not a hive

4807 * with multiple nodes, it will be similar as amdgpu_device_lock_adev.

4808 *

4809 * unlock won't require roll back.

4810 */

4811 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)

4812 {

4813 struct amdgpu_device *tmp_adev = NULL;

4814

4815 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {

4816 if (!hive) {

4817 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");

4818 return -ENODEV;

4819 }

4820 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {

4821 if (!amdgpu_device_lock_adev(tmp_adev, hive))

4822 goto roll_back;

4823 }

4824 } else if (!amdgpu_device_lock_adev(adev, hive))

4825 return -EAGAIN;

4826

4827 return 0;

4828 roll_back:

4829 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {

4830 /*

4831 * if the lockup iteration break in the middle of a hive,

4832 * it may means there may has a race issue,

4833 * or a hive device locked up independently.

4834 * we may be in trouble and may not, so will try to roll back

4835 * the lock and give out a warnning.

4836 */

4837 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");

4838 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {

4839 amdgpu_device_unlock_adev(tmp_adev);

4840 }

4841 }

4842 return -EAGAIN;

4843 }

4844

4845 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)

4846 {

4847 struct pci_dev *p = NULL;

4848

4849 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),

4850 adev->pdev->bus->number, 1);

4851 if (p) {

4852 pm_runtime_enable(&(p->dev));

4853 pm_runtime_resume(&(p->dev));

4854 }

4855 }

4856

4857 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)

4858 {

4859 enum amd_reset_method reset_method;

4860 struct pci_dev *p = NULL;

4861 u64 expires;

4862

4863 /*

4864 * For now, only BACO and mode1 reset are confirmed

4865 * to suffer the audio issue without proper suspended.

4866 */

4867 reset_method = amdgpu_asic_reset_method(adev);

4868 if ((reset_method != AMD_RESET_METHOD_BACO) &&

4869 (reset_method != AMD_RESET_METHOD_MODE1))

4870 return -EINVAL;

4871

4872 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),

4873 adev->pdev->bus->number, 1);

4874 if (!p)

4875 return -ENODEV;

4876

4877 expires = pm_runtime_autosuspend_expiration(&(p->dev));

4878 if (!expires)

4879 /*

4880 * If we cannot get the audio device autosuspend delay,

4881 * a fixed 4S interval will be used. Considering 3S is

4882 * the audio controller default autosuspend delay setting.

4883 * 4S used here is guaranteed to cover that.

4884 */

4885 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

4886

4887 while (!pm_runtime_status_suspended(&(p->dev))) {

4888 if (!pm_runtime_suspend(&(p->dev)))

4889 break;

4890

4891 if (expires < ktime_get_mono_fast_ns()) {

4892 dev_warn(adev->dev, "failed to suspend display audio\n");

4893 /* TODO: abort the succeeding gpu reset? */

4894 return -ETIMEDOUT;

4895 }

4896 }

4897

4898 pm_runtime_disable(&(p->dev));

4899

4900 return 0;

4901 }

4902

4903 static void amdgpu_device_recheck_guilty_jobs(

4904 struct amdgpu_device *adev, struct list_head *device_list_handle,

4905 struct amdgpu_reset_context *reset_context)

4906 {

4907 int i, r = 0;

4908

4909 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

4910 struct amdgpu_ring *ring = adev->rings[i];

4911 int ret = 0;

4912 struct drm_sched_job *s_job;

4913

4914 if (!ring || !ring->sched.thread)

4915 continue;

4916

4917 s_job = list_first_entry_or_null(&ring->sched.pending_list,

4918 struct drm_sched_job, list);

4919 if (s_job == NULL)

4920 continue;

4921

4922 /* clear job's guilty and depend the folowing step to decide the real one */

4923 drm_sched_reset_karma(s_job);

4924 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get

4925 * to make sure fence is balanced */

4926 dma_fence_get(s_job->s_fence->parent);

4927 drm_sched_resubmit_jobs_ext(&ring->sched, 1);

4928

4929 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);

4930 if (ret == 0) { /* timeout */

4931 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",

4932 ring->sched.name, s_job->id);

4933

4934 /* set guilty */

4935 drm_sched_increase_karma(s_job);

4936 retry:

4937 /* do hw reset */

4938 if (amdgpu_sriov_vf(adev)) {

4939 amdgpu_virt_fini_data_exchange(adev);

4940 r = amdgpu_device_reset_sriov(adev, false);

4941 if (r)

4942 adev->asic_reset_res = r;

4943 } else {

4944 clear_bit(AMDGPU_SKIP_HW_RESET,

4945 &reset_context->flags);

4946 r = amdgpu_do_asic_reset(device_list_handle,

4947 reset_context);

4948 if (r && r == -EAGAIN)

4949 goto retry;

4950 }

4951

4952 /*

4953 * add reset counter so that the following

4954 * resubmitted job could flush vmid

4955 */

4956 atomic_inc(&adev->gpu_reset_counter);

4957 continue;

4958 }

4959

4960 /* got the hw fence, signal finished fence */

4961 atomic_dec(ring->sched.score);

4962 dma_fence_put(s_job->s_fence->parent);

4963 dma_fence_get(&s_job->s_fence->finished);

4964 dma_fence_signal(&s_job->s_fence->finished);

4965 dma_fence_put(&s_job->s_fence->finished);

4966

4967 /* remove node from list and free the job */

4968 spin_lock(&ring->sched.job_list_lock);

4969 list_del_init(&s_job->list);

4970 spin_unlock(&ring->sched.job_list_lock);

4971 ring->sched.ops->free_job(s_job);

4972 }

4973 }

4974

4975 /**

4976 * amdgpu_device_gpu_recover - reset the asic and recover scheduler

4977 *

4978 * @adev: amdgpu_device pointer

4979 * @job: which job trigger hang

4980 *

4981 * Attempt to reset the GPU if it has hung (all asics).

4982 * Attempt to do soft-reset or full-reset and reinitialize Asic

4983 * Returns 0 for success or an error on failure.

4984 */

4985

4986 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

4987 struct amdgpu_job *job)

4988 {

4989 struct list_head device_list, *device_list_handle = NULL;

4990 bool job_signaled = false;

4991 struct amdgpu_hive_info *hive = NULL;

4992 struct amdgpu_device *tmp_adev = NULL;

4993 int i, r = 0;

4994 bool need_emergency_restart = false;

4995 bool audio_suspended = false;

4996 int tmp_vram_lost_counter;

4997 struct amdgpu_reset_context reset_context;

4998

4999 memset(&reset_context, 0, sizeof(reset_context));

5000

5001 /*

5002 * Special case: RAS triggered and full reset isn't supported

5003 */

5004 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

5005

5006 /*

5007 * Flush RAM to disk so that after reboot

5008 * the user can read log and see why the system rebooted.

5009 */

5010 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {

5011 DRM_WARN("Emergency reboot.");

5012

5013 ksys_sync_helper();

5014 emergency_restart();

5015 }

5016

5017 dev_info(adev->dev, "GPU %s begin!\n",

5018 need_emergency_restart ? "jobs stop":"reset");

5019

5020 /*

5021 * Here we trylock to avoid chain of resets executing from

5022 * either trigger by jobs on different adevs in XGMI hive or jobs on

5023 * different schedulers for same device while this TO handler is running.

5024 * We always reset all schedulers for device and all devices for XGMI

5025 * hive so that should take care of them too.

5026 */

5027 if (!amdgpu_sriov_vf(adev))

5028 hive = amdgpu_get_xgmi_hive(adev);

5029 if (hive) {

5030 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {

5031 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",

5032 job ? job->base.id : -1, hive->hive_id);

5033 amdgpu_put_xgmi_hive(hive);

5034 if (job && job->vm)

5035 drm_sched_increase_karma(&job->base);

5036 return 0;

5037 }

5038 mutex_lock(&hive->hive_lock);

5039 }

5040

5041 reset_context.method = AMD_RESET_METHOD_NONE;

5042 reset_context.reset_req_dev = adev;

5043 reset_context.job = job;

5044 reset_context.hive = hive;

5045 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

5046

5047 /*

5048 * lock the device before we try to operate the linked list

5049 * if didn't get the device lock, don't touch the linked list since

5050 * others may iterating it.

5051 */

5052 r = amdgpu_device_lock_hive_adev(adev, hive);

5053 if (r) {

5054 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",

5055 job ? job->base.id : -1);

5056

5057 /* even we skipped this reset, still need to set the job to guilty */

5058 if (job && job->vm)

5059 drm_sched_increase_karma(&job->base);

5060 goto skip_recovery;

5061 }

5062

5063 /*

5064 * Build list of devices to reset.

5065 * In case we are in XGMI hive mode, resort the device list

5066 * to put adev in the 1st position.

5067 */

5068 INIT_LIST_HEAD(&device_list);

5069 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {

5070 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)

5071 list_add_tail(&tmp_adev->reset_list, &device_list);

5072 if (!list_is_first(&adev->reset_list, &device_list))

5073 list_rotate_to_front(&adev->reset_list, &device_list);

5074 device_list_handle = &device_list;

5075 } else {

5076 list_add_tail(&adev->reset_list, &device_list);

5077 device_list_handle = &device_list;

5078 }

5079

5080 /* block all schedulers and reset given job's ring */

5081 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

5082 /*

5083 * Try to put the audio codec into suspend state

5084 * before gpu reset started.

5085 *

5086 * Due to the power domain of the graphics device

5087 * is shared with AZ power domain. Without this,

5088 * we may change the audio hardware from behind

5089 * the audio driver's back. That will trigger

5090 * some audio codec errors.

5091 */

5092 if (!amdgpu_device_suspend_display_audio(tmp_adev))

5093 audio_suspended = true;

5094

5095 amdgpu_ras_set_error_query_ready(tmp_adev, false);

5096

5097 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

5098

5099 if (!amdgpu_sriov_vf(tmp_adev))

5100 amdgpu_amdkfd_pre_reset(tmp_adev);

5101

5102 /*

5103 * Mark these ASICs to be reseted as untracked first

5104 * And add them back after reset completed

5105 */

5106 amdgpu_unregister_gpu_instance(tmp_adev);

5107

5108 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);

5109

5110 /* disable ras on ALL IPs */

5111 if (!need_emergency_restart &&

5112 amdgpu_device_ip_need_full_reset(tmp_adev))

5113 amdgpu_ras_suspend(tmp_adev);

5114

5115 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

5116 struct amdgpu_ring *ring = tmp_adev->rings[i];

5117

5118 if (!ring || !ring->sched.thread)

5119 continue;

5120

5121 drm_sched_stop(&ring->sched, job ? &job->base : NULL);

5122

5123 if (need_emergency_restart)

5124 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);

5125 }

5126 atomic_inc(&tmp_adev->gpu_reset_counter);

5127 }

5128

5129 if (need_emergency_restart)

5130 goto skip_sched_resume;

5131

5132 /*

5133 * Must check guilty signal here since after this point all old

5134 * HW fences are force signaled.

5135 *

5136 * job->base holds a reference to parent fence

5137 */

5138 if (job && job->base.s_fence->parent &&

5139 dma_fence_is_signaled(job->base.s_fence->parent)) {

5140 job_signaled = true;

5141 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");

5142 goto skip_hw_reset;

5143 }

5144

5145 retry: /* Rest of adevs pre asic reset from XGMI hive. */

5146 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

5147 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);

5148 /*TODO Should we stop ?*/

5149 if (r) {

5150 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",

5151 r, adev_to_drm(tmp_adev)->unique);

5152 tmp_adev->asic_reset_res = r;

5153 }

5154 }

5155

5156 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));

5157 /* Actual ASIC resets if needed.*/

5158 /* Host driver will handle XGMI hive reset for SRIOV */

5159 if (amdgpu_sriov_vf(adev)) {

5160 r = amdgpu_device_reset_sriov(adev, job ? false : true);

5161 if (r)

5162 adev->asic_reset_res = r;

5163 } else {

5164 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);

5165 if (r && r == -EAGAIN)

5166 goto retry;

5167 }

5168

5169 skip_hw_reset:

5170

5171 /* Post ASIC reset for all devs .*/

5172 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

5173

5174 /*

5175 * Sometimes a later bad compute job can block a good gfx job as gfx

5176 * and compute ring share internal GC HW mutually. We add an additional

5177 * guilty jobs recheck step to find the real guilty job, it synchronously

5178 * submits and pends for the first job being signaled. If it gets timeout,

5179 * we identify it as a real guilty job.

5180 */

5181 if (amdgpu_gpu_recovery == 2 &&

5182 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))

5183 amdgpu_device_recheck_guilty_jobs(

5184 tmp_adev, device_list_handle, &reset_context);

5185

5186 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

5187 struct amdgpu_ring *ring = tmp_adev->rings[i];

5188

5189 if (!ring || !ring->sched.thread)

5190 continue;

5191

5192 /* No point to resubmit jobs if we didn't HW reset*/

5193 if (!tmp_adev->asic_reset_res && !job_signaled)

5194 drm_sched_resubmit_jobs(&ring->sched);

5195

5196 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);

5197 }

5198

5199 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {

5200 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

5201 }

5202

5203 tmp_adev->asic_reset_res = 0;

5204

5205 if (r) {

5206 /* bad news, how to tell it to userspace ? */

5207 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));

5208 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);

5209 } else {

5210 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));

5211 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))

5212 DRM_WARN("smart shift update failed\n");

5213 }

5214 }

5215

5216 skip_sched_resume:

5217 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

5218 /* unlock kfd: SRIOV would do it separately */

5219 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))

5220 amdgpu_amdkfd_post_reset(tmp_adev);

5221

5222 /* kfd_post_reset will do nothing if kfd device is not initialized,

5223 * need to bring up kfd here if it's not be initialized before

5224 */

5225 if (!adev->kfd.init_complete)

5226 amdgpu_amdkfd_device_init(adev);

5227

5228 if (audio_suspended)

5229 amdgpu_device_resume_display_audio(tmp_adev);

5230 amdgpu_device_unlock_adev(tmp_adev);

5231 }

5232

5233 skip_recovery:

5234 if (hive) {

5235 atomic_set(&hive->in_reset, 0);

5236 mutex_unlock(&hive->hive_lock);

5237 amdgpu_put_xgmi_hive(hive);

5238 }

5239

5240 if (r && r != -EAGAIN)

5241 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

5242 return r;

5243 }

5244

5245 /**

5246 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot

5247 *

5248 * @adev: amdgpu_device pointer

5249 *

5250 * Fetchs and stores in the driver the PCIE capabilities (gen speed

5251 * and lanes) of the slot the device is in. Handles APUs and

5252 * virtualized environments where PCIE config space may not be available.

5253 */

5254 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)

5255 {

5256 struct pci_dev *pdev;

5257 enum pci_bus_speed speed_cap, platform_speed_cap;

5258 enum pcie_link_width platform_link_width;

5259

5260 if (amdgpu_pcie_gen_cap)

5261 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

5262

5263 if (amdgpu_pcie_lane_cap)

5264 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

5265

5266 /* covers APUs as well */

5267 if (pci_is_root_bus(adev->pdev->bus)) {

5268 if (adev->pm.pcie_gen_mask == 0)

5269 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;

5270 if (adev->pm.pcie_mlw_mask == 0)

5271 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;

5272 return;

5273 }

5274

5275 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)

5276 return;

5277

5278 pcie_bandwidth_available(adev->pdev, NULL,

5279 &platform_speed_cap, &platform_link_width);

5280

5281 if (adev->pm.pcie_gen_mask == 0) {

5282 /* asic caps */

5283 pdev = adev->pdev;

5284 speed_cap = pcie_get_speed_cap(pdev);

5285 if (speed_cap == PCI_SPEED_UNKNOWN) {

5286 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5287 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5288 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);

5289 } else {

5290 if (speed_cap == PCIE_SPEED_32_0GT)

5291 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5292 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5293 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |

5294 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |

5295 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);

5296 else if (speed_cap == PCIE_SPEED_16_0GT)

5297 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5298 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5299 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |

5300 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);

5301 else if (speed_cap == PCIE_SPEED_8_0GT)

5302 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5303 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5304 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);

5305 else if (speed_cap == PCIE_SPEED_5_0GT)

5306 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5307 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);

5308 else

5309 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;

5310 }

5311 /* platform caps */

5312 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {

5313 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5314 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);

5315 } else {

5316 if (platform_speed_cap == PCIE_SPEED_32_0GT)

5317 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5318 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5319 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |

5320 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |

5321 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);

5322 else if (platform_speed_cap == PCIE_SPEED_16_0GT)

5323 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5324 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5325 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |

5326 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);

5327 else if (platform_speed_cap == PCIE_SPEED_8_0GT)

5328 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5329 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |

5330 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);

5331 else if (platform_speed_cap == PCIE_SPEED_5_0GT)

5332 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |

5333 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);

5334 else

5335 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

5336

5337 }

5338 }

5339 if (adev->pm.pcie_mlw_mask == 0) {

5340 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {

5341 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;

5342 } else {

5343 switch (platform_link_width) {

5344 case PCIE_LNK_X32:

5345 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |

5346 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |

5347 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |

5348 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |

5349 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |

5350 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |

5351 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);

5352 break;

5353 case PCIE_LNK_X16:

5354 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |

5355 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |

5356 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |

5357 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |

5358 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |

5359 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);

5360 break;

5361 case PCIE_LNK_X12:

5362 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |

5363 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |

5364 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |

5365 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |

5366 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);

5367 break;

5368 case PCIE_LNK_X8:

5369 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |

5370 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |

5371 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |

5372 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);

5373 break;

5374 case PCIE_LNK_X4:

5375 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |

5376 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |

5377 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);

5378 break;

5379 case PCIE_LNK_X2:

5380 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |

5381 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);

5382 break;

5383 case PCIE_LNK_X1:

5384 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;

5385 break;

5386 default:

5387 break;

5388 }

5389 }

5390 }

5391 }

5392

5393 int amdgpu_device_baco_enter(struct drm_device *dev)

5394 {

5395 struct amdgpu_device *adev = drm_to_adev(dev);

5396 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

5397

5398 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))

5399 return -ENOTSUPP;

5400

5401 if (ras && adev->ras_enabled &&

5402 adev->nbio.funcs->enable_doorbell_interrupt)

5403 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

5404

5405 return amdgpu_dpm_baco_enter(adev);

5406 }

5407

5408 int amdgpu_device_baco_exit(struct drm_device *dev)

5409 {

5410 struct amdgpu_device *adev = drm_to_adev(dev);

5411 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

5412 int ret = 0;

5413

5414 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))

5415 return -ENOTSUPP;

5416

5417 ret = amdgpu_dpm_baco_exit(adev);

5418 if (ret)

5419 return ret;

5420

5421 if (ras && adev->ras_enabled &&

5422 adev->nbio.funcs->enable_doorbell_interrupt)

5423 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

5424

5425 if (amdgpu_passthrough(adev) &&

5426 adev->nbio.funcs->clear_doorbell_interrupt)

5427 adev->nbio.funcs->clear_doorbell_interrupt(adev);

5428

5429 return 0;

5430 }

5431

5432 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)

5433 {

5434 int i;

5435

5436 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

5437 struct amdgpu_ring *ring = adev->rings[i];

5438

5439 if (!ring || !ring->sched.thread)

5440 continue;

5441

5442 cancel_delayed_work_sync(&ring->sched.work_tdr);

5443 }

5444 }

5445

5446 /**

5447 * amdgpu_pci_error_detected - Called when a PCI error is detected.

5448 * @pdev: PCI device struct

5449 * @state: PCI channel state

5450 *

5451 * Description: Called when a PCI error is detected.

5452 *

5453 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.

5454 */

5455 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)

5456 {

5457 struct drm_device *dev = pci_get_drvdata(pdev);

5458 struct amdgpu_device *adev = drm_to_adev(dev);

5459 int i;

5460

5461 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

5462

5463 if (adev->gmc.xgmi.num_physical_nodes > 1) {

5464 DRM_WARN("No support for XGMI hive yet...");

5465 return PCI_ERS_RESULT_DISCONNECT;

5466 }

5467

5468 adev->pci_channel_state = state;

5469

5470 switch (state) {

5471 case pci_channel_io_normal:

5472 return PCI_ERS_RESULT_CAN_RECOVER;

5473 /* Fatal error, prepare for slot reset */

5474 case pci_channel_io_frozen:

5475 /*

5476 * Cancel and wait for all TDRs in progress if failing to

5477 * set adev->in_gpu_reset in amdgpu_device_lock_adev

5478 *

5479 * Locking adev->reset_sem will prevent any external access

5480 * to GPU during PCI error recovery

5481 */

5482 while (!amdgpu_device_lock_adev(adev, NULL))

5483 amdgpu_cancel_all_tdr(adev);

5484

5485 /*

5486 * Block any work scheduling as we do for regular GPU reset

5487 * for the duration of the recovery

5488 */

5489 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

5490 struct amdgpu_ring *ring = adev->rings[i];

5491

5492 if (!ring || !ring->sched.thread)

5493 continue;

5494

5495 drm_sched_stop(&ring->sched, NULL);

5496 }

5497 atomic_inc(&adev->gpu_reset_counter);

5498 return PCI_ERS_RESULT_NEED_RESET;

5499 case pci_channel_io_perm_failure:

5500 /* Permanent error, prepare for device removal */

5501 return PCI_ERS_RESULT_DISCONNECT;

5502 }

5503

5504 return PCI_ERS_RESULT_NEED_RESET;

5505 }

5506

5507 /**

5508 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers

5509 * @pdev: pointer to PCI device

5510 */

5511 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)

5512 {

5513

5514 DRM_INFO("PCI error: mmio enabled callback!!\n");

5515

5516 /* TODO - dump whatever for debugging purposes */

5517

5518 /* This called only if amdgpu_pci_error_detected returns

5519 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still

5520 * works, no need to reset slot.

5521 */

5522

5523 return PCI_ERS_RESULT_RECOVERED;

5524 }

5525

5526 /**

5527 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.

5528 * @pdev: PCI device struct

5529 *

5530 * Description: This routine is called by the pci error recovery

5531 * code after the PCI slot has been reset, just before we

5532 * should resume normal operations.

5533 */

5534 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)

5535 {

5536 struct drm_device *dev = pci_get_drvdata(pdev);

5537 struct amdgpu_device *adev = drm_to_adev(dev);

5538 int r, i;

5539 struct amdgpu_reset_context reset_context;

5540 u32 memsize;

5541 struct list_head device_list;

5542

5543 DRM_INFO("PCI error: slot reset callback!!\n");

5544

5545 memset(&reset_context, 0, sizeof(reset_context));

5546

5547 INIT_LIST_HEAD(&device_list);

5548 list_add_tail(&adev->reset_list, &device_list);

5549

5550 /* wait for asic to come out of reset */

5551 msleep(500);

5552

5553 /* Restore PCI confspace */

5554 amdgpu_device_load_pci_state(pdev);

5555

5556 /* confirm ASIC came out of reset */

5557 for (i = 0; i < adev->usec_timeout; i++) {

5558 memsize = amdgpu_asic_get_config_memsize(adev);

5559

5560 if (memsize != 0xffffffff)

5561 break;

5562 udelay(1);

5563 }

5564 if (memsize == 0xffffffff) {

5565 r = -ETIME;

5566 goto out;

5567 }

5568

5569 reset_context.method = AMD_RESET_METHOD_NONE;

5570 reset_context.reset_req_dev = adev;

5571 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

5572 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

5573

5574 adev->no_hw_access = true;

5575 r = amdgpu_device_pre_asic_reset(adev, &reset_context);

5576 adev->no_hw_access = false;

5577 if (r)

5578 goto out;

5579

5580 r = amdgpu_do_asic_reset(&device_list, &reset_context);

5581

5582 out:

5583 if (!r) {

5584 if (amdgpu_device_cache_pci_state(adev->pdev))

5585 pci_restore_state(adev->pdev);

5586

5587 DRM_INFO("PCIe error recovery succeeded\n");

5588 } else {

5589 DRM_ERROR("PCIe error recovery failed, err:%d", r);

5590 amdgpu_device_unlock_adev(adev);

5591 }

5592

5593 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;

5594 }

5595

5596 /**

5597 * amdgpu_pci_resume() - resume normal ops after PCI reset

5598 * @pdev: pointer to PCI device

5599 *

5600 * Called when the error recovery driver tells us that its

5601 * OK to resume normal operation.

5602 */

5603 void amdgpu_pci_resume(struct pci_dev *pdev)

5604 {

5605 struct drm_device *dev = pci_get_drvdata(pdev);

5606 struct amdgpu_device *adev = drm_to_adev(dev);

5607 int i;

5608

5609

5610 DRM_INFO("PCI error: resume callback!!\n");

5611

5612 /* Only continue execution for the case of pci_channel_io_frozen */

5613 if (adev->pci_channel_state != pci_channel_io_frozen)

5614 return;

5615

5616 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

5617 struct amdgpu_ring *ring = adev->rings[i];

5618

5619 if (!ring || !ring->sched.thread)

5620 continue;

5621

5622

5623 drm_sched_resubmit_jobs(&ring->sched);

5624 drm_sched_start(&ring->sched, true);

5625 }

5626

5627 amdgpu_device_unlock_adev(adev);

5628 }

5629

5630 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)

5631 {

5632 struct drm_device *dev = pci_get_drvdata(pdev);

5633 struct amdgpu_device *adev = drm_to_adev(dev);

5634 int r;

5635

5636 r = pci_save_state(pdev);

5637 if (!r) {

5638 kfree(adev->pci_state);

5639

5640 adev->pci_state = pci_store_saved_state(pdev);

5641

5642 if (!adev->pci_state) {

5643 DRM_ERROR("Failed to store PCI saved state");

5644 return false;

5645 }

5646 } else {

5647 DRM_WARN("Failed to save PCI state, err:%d\n", r);

5648 return false;

5649 }

5650

5651 return true;

5652 }

5653

5654 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)

5655 {

5656 struct drm_device *dev = pci_get_drvdata(pdev);

5657 struct amdgpu_device *adev = drm_to_adev(dev);

5658 int r;

5659

5660 if (!adev->pci_state)

5661 return false;

5662

5663 r = pci_load_saved_state(pdev, adev->pci_state);

5664

5665 if (!r) {

5666 pci_restore_state(pdev);

5667 } else {

5668 DRM_WARN("Failed to load PCI state, err:%d\n", r);

5669 return false;

5670 }

5671

5672 return true;

5673 }

5674

5675 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,

5676 struct amdgpu_ring *ring)

5677 {

5678 #ifdef CONFIG_X86_64

5679 if (adev->flags & AMD_IS_APU)

5680 return;

5681 #endif

5682 if (adev->gmc.xgmi.connected_to_cpu)

5683 return;

5684

5685 if (ring && ring->funcs->emit_hdp_flush)

5686 amdgpu_ring_emit_hdp_flush(ring);

5687 else

5688 amdgpu_asic_flush_hdp(adev, ring);

5689 }

5690

5691 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,

5692 struct amdgpu_ring *ring)

5693 {

5694 #ifdef CONFIG_X86_64

5695 if (adev->flags & AMD_IS_APU)

5696 return;

5697 #endif

5698 if (adev->gmc.xgmi.connected_to_cpu)

5699 return;

5700

5701 amdgpu_asic_invalidate_hdp(adev, ring);

5702 }

5703

5704 /**

5705 * amdgpu_device_halt() - bring hardware to some kind of halt state

5706 *

5707 * @adev: amdgpu_device pointer

5708 *

5709 * Bring hardware to some kind of halt state so that no one can touch it

5710 * any more. It will help to maintain error context when error occurred.

5711 * Compare to a simple hang, the system will keep stable at least for SSH

5712 * access. Then it should be trivial to inspect the hardware state and

5713 * see what's going on. Implemented as following:

5714 *

5715 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),

5716 * clears all CPU mappings to device, disallows remappings through page faults

5717 * 2. amdgpu_irq_disable_all() disables all interrupts

5718 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences

5719 * 4. set adev->no_hw_access to avoid potential crashes after setp 5

5720 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings

5721 * 6. pci_disable_device() and pci_wait_for_pending_transaction()

5722 * flush any in flight DMA operations

5723 */

5724 void amdgpu_device_halt(struct amdgpu_device *adev)

5725 {

5726 struct pci_dev *pdev = adev->pdev;

5727 struct drm_device *ddev = adev_to_drm(adev);

5728

5729 drm_dev_unplug(ddev);

5730

5731 amdgpu_irq_disable_all(adev);

5732

5733 amdgpu_fence_driver_hw_fini(adev);

5734

5735 adev->no_hw_access = true;

5736

5737 amdgpu_device_unmap_mmio(adev);

5738

5739 pci_disable_device(pdev);

5740 pci_wait_for_pending_transaction(pdev);

5741 }