From: Ce Sun Date: Fri, 21 Mar 2025 02:11:18 +0000 (+0800) Subject: drm/amdgpu: Multi-GPU DPC recovery support X-Git-Tag: v6.16-rc1~144^2~10^2~461 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8ba904f54148d2a093650b13a229d2ff18142f7d;p=thirdparty%2Flinux.git drm/amdgpu: Multi-GPU DPC recovery support Add support for DPC recover based on refactored code Signed-off-by: Ce Sun Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2db233a9f9721..9a9e5249c63f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -551,6 +551,7 @@ struct amdgpu_allowed_register_entry { * are reset depends on the ASIC. Notably doesn't reset IPs * shared with the CPU on APUs or the memory controllers (so * VRAM is not lost). Not available on all ASICs. + * @AMD_RESET_LINK: Triggers SW-UP link reset on other GPUs * @AMD_RESET_BACO: BACO (Bus Alive, Chip Off) method powers off and on the card * but without powering off the PCI bus. Suitable only for * discrete GPUs. @@ -568,6 +569,7 @@ enum amd_reset_method { AMD_RESET_METHOD_MODE0, AMD_RESET_METHOD_MODE1, AMD_RESET_METHOD_MODE2, + AMD_RESET_METHOD_LINK, AMD_RESET_METHOD_BACO, AMD_RESET_METHOD_PCI, AMD_RESET_METHOD_ON_INIT, @@ -830,6 +832,8 @@ struct amdgpu_mqd { }; struct amdgpu_pcie_reset_ctx { + bool in_link_reset; + bool occurs_dpc; bool audio_suspended; }; @@ -1469,6 +1473,7 @@ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, const u32 array_size); int amdgpu_device_mode1_reset(struct amdgpu_device *adev); +int amdgpu_device_link_reset(struct amdgpu_device *adev); bool amdgpu_device_supports_atpx(struct drm_device *dev); bool amdgpu_device_supports_px(struct drm_device *dev); bool amdgpu_device_supports_boco(struct drm_device *dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1866d07db1331..7525128f971f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3172,6 +3172,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * always assumed to be lost. */ switch (amdgpu_asic_reset_method(adev)) { + case AMD_RESET_METHOD_LINK: case AMD_RESET_METHOD_BACO: case AMD_RESET_METHOD_MODE1: return true; @@ -5510,6 +5511,29 @@ mode1_reset_failed: return ret; } +int amdgpu_device_link_reset(struct amdgpu_device *adev) +{ + int ret = 0; + + dev_info(adev->dev, "GPU link reset\n"); + + if (!adev->pcie_reset_ctx.occurs_dpc) + ret = amdgpu_dpm_link_reset(adev); + + if (ret) + goto link_reset_failed; + + ret = amdgpu_psp_wait_for_bootloader(adev); + if (ret) + goto link_reset_failed; + + return 0; + +link_reset_failed: + dev_err(adev->dev, "GPU link reset failed\n"); + return ret; +} + int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { @@ -5814,6 +5838,7 @@ static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: + case AMD_RESET_METHOD_LINK: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; break; case AMD_RESET_METHOD_MODE2: @@ -5951,6 +5976,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, list_add_tail(&tmp_adev->reset_list, device_list); if (adev->shutdown) tmp_adev->shutdown = true; + if (adev->pcie_reset_ctx.occurs_dpc) + tmp_adev->pcie_reset_ctx.in_link_reset = true; } if (!list_is_first(&adev->reset_list, device_list)) list_rotate_to_front(&adev->reset_list, device_list); @@ -5960,7 +5987,7 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, device_list_handle = device_list; } - if (!amdgpu_sriov_vf(adev)) { + if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { r = amdgpu_device_health_check(device_list_handle); if (r) return r; @@ -6005,6 +6032,7 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, /* disable ras on ALL IPs */ if (!need_emergency_restart && + (!adev->pcie_reset_ctx.occurs_dpc) && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); @@ -6035,7 +6063,11 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list, reset_list) { + if (adev->pcie_reset_ctx.occurs_dpc) + tmp_adev->no_hw_access = true; r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); + if (adev->pcie_reset_ctx.occurs_dpc) + tmp_adev->no_hw_access = false; /*TODO Should we stop ?*/ if (r) { dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -6634,12 +6666,15 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int i; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_reset_context reset_context; + struct list_head device_list; + int r = 0; - DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + dev_info(adev->dev, "PCI error: detected callback!!\n"); - if (adev->gmc.xgmi.num_physical_nodes > 1) { - DRM_WARN("No support for XGMI hive yet..."); + if (!amdgpu_dpm_is_link_reset_supported(adev)) { + dev_warn(adev->dev, "No support for XGMI hive yet...\n"); return PCI_ERS_RESULT_DISCONNECT; } @@ -6647,32 +6682,30 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta switch (state) { case pci_channel_io_normal: + dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); return PCI_ERS_RESULT_CAN_RECOVER; - /* Fatal error, prepare for slot reset */ case pci_channel_io_frozen: - /* - * Locking adev->reset_domain->sem will prevent any external access - * to GPU during PCI error recovery - */ - amdgpu_device_lock_reset_domain(adev->reset_domain); - amdgpu_device_set_mp1_state(adev); - - /* - * Block any work scheduling as we do for regular GPU reset - * for the duration of the recovery - */ - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; - - if (!amdgpu_ring_sched_ready(ring)) - continue; - - drm_sched_stop(&ring->sched, NULL); + /* Fatal error, prepare for slot reset */ + dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); + + if (hive) + mutex_lock(&hive->hive_lock); + adev->pcie_reset_ctx.occurs_dpc = true; + memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); + + r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, + hive, false); + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } - atomic_inc(&adev->gpu_reset_counter); + if (r) + return PCI_ERS_RESULT_DISCONNECT; return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ + dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); return PCI_ERS_RESULT_DISCONNECT; } @@ -6685,8 +6718,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta */ pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) { + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); - DRM_INFO("PCI error: mmio enabled callback!!\n"); + dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); /* TODO - dump whatever for debugging purposes */ @@ -6710,10 +6745,12 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int r, i; struct amdgpu_reset_context reset_context; - u32 memsize; + struct amdgpu_device *tmp_adev = NULL; + struct amdgpu_hive_info *hive = NULL; struct list_head device_list; + int r = 0, i; + u32 memsize; /* PCI error slot reset should be skipped During RAS recovery */ if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || @@ -6721,15 +6758,12 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) amdgpu_ras_in_recovery(adev)) return PCI_ERS_RESULT_RECOVERED; - DRM_INFO("PCI error: slot reset callback!!\n"); + dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); - INIT_LIST_HEAD(&device_list); - list_add_tail(&adev->reset_list, &device_list); - /* wait for asic to come out of reset */ - msleep(500); + msleep(700); /* Restore PCI confspace */ amdgpu_device_load_pci_state(pdev); @@ -6750,26 +6784,40 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); - - adev->no_hw_access = true; - r = amdgpu_device_pre_asic_reset(adev, &reset_context); - adev->no_hw_access = false; - if (r) - goto out; + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); + INIT_LIST_HEAD(&device_list); - r = amdgpu_do_asic_reset(&device_list, &reset_context); + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + reset_context.hive = hive; + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + tmp_adev->pcie_reset_ctx.in_link_reset = true; + list_add_tail(&tmp_adev->reset_list, &device_list); + } + } else { + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + list_add_tail(&adev->reset_list, &device_list); + } + r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); out: if (!r) { if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(adev->pdev); - - DRM_INFO("PCIe error recovery succeeded\n"); + dev_info(adev->dev, "PCIe error recovery succeeded\n"); } else { - DRM_ERROR("PCIe error recovery failed, err:%d", r); - amdgpu_device_unset_mp1_state(adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); + dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); + if (tmp_adev) { + list_for_each_entry(tmp_adev, &device_list, reset_list) + amdgpu_device_unset_mp1_state(tmp_adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); + } + } + + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; @@ -6786,26 +6834,36 @@ void amdgpu_pci_resume(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int i; - + struct list_head device_list; + struct amdgpu_hive_info *hive = NULL; + struct amdgpu_device *tmp_adev = NULL; - DRM_INFO("PCI error: resume callback!!\n"); + dev_info(adev->dev, "PCI error: resume callback!!\n"); /* Only continue execution for the case of pci_channel_io_frozen */ if (adev->pci_channel_state != pci_channel_io_frozen) return; - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; + INIT_LIST_HEAD(&device_list); - if (!amdgpu_ring_sched_ready(ring)) - continue; + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + tmp_adev->pcie_reset_ctx.in_link_reset = false; + list_add_tail(&tmp_adev->reset_list, &device_list); + } + } else + list_add_tail(&adev->reset_list, &device_list); - drm_sched_start(&ring->sched, 0); - } + amdgpu_device_sched_resume(&device_list, NULL, NULL); + amdgpu_device_gpu_resume(adev, &device_list, false); + adev->pcie_reset_ctx.occurs_dpc = false; - amdgpu_device_unset_mp1_state(adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); + } } bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 659eab9b90bec..c457be3a3c56f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -584,6 +584,8 @@ soc15_asic_reset_method(struct amdgpu_device *adev) * Enable triggering of GPU reset only if specified * by module parameter. */ + if (adev->pcie_reset_ctx.in_link_reset) + return AMD_RESET_METHOD_LINK; if (amdgpu_gpu_recovery == 4 || amdgpu_gpu_recovery == 5) return AMD_RESET_METHOD_MODE2; else if (!(adev->flags & AMD_IS_APU)) @@ -640,6 +642,9 @@ asic_reset: case AMD_RESET_METHOD_MODE2: dev_info(adev->dev, "MODE2 reset\n"); return amdgpu_dpm_mode2_reset(adev); + case AMD_RESET_METHOD_LINK: + dev_info(adev->dev, "Link reset\n"); + return amdgpu_device_link_reset(adev); default: dev_info(adev->dev, "MODE1 reset\n"); return amdgpu_device_mode1_reset(adev);