drm/amdgpu: Multi-GPU DPC recovery support

author Ce Sun <cesun102@amd.com>

Fri, 21 Mar 2025 02:11:18 +0000 (10:11 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Mon, 7 Apr 2025 19:18:31 +0000 (15:18 -0400)
author Ce Sun <cesun102@amd.com>
Fri, 21 Mar 2025 02:11:18 +0000 (10:11 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 7 Apr 2025 19:18:31 +0000 (15:18 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 2db233a9f9721eb6dbe5c971d794c989a7a72fc4..9a9e5249c63f94ab69c24694fa832f24b0274a90 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -551,6 +551,7 @@ struct amdgpu_allowed_register_entry {
   *                   are reset depends on the ASIC. Notably doesn't reset IPs
   *                   shared with the CPU on APUs or the memory controllers (so
   *                   VRAM is not lost). Not available on all ASICs.
+ * @AMD_RESET_LINK: Triggers SW-UP link reset on other GPUs
   * @AMD_RESET_BACO: BACO (Bus Alive, Chip Off) method powers off and on the card
   *                  but without powering off the PCI bus. Suitable only for
   *                  discrete GPUs.
@@ -568,6 +569,7 @@ enum amd_reset_method {
         AMD_RESET_METHOD_MODE0,
         AMD_RESET_METHOD_MODE1,
         AMD_RESET_METHOD_MODE2,
+       AMD_RESET_METHOD_LINK,
         AMD_RESET_METHOD_BACO,
         AMD_RESET_METHOD_PCI,
         AMD_RESET_METHOD_ON_INIT,
@@ -830,6 +832,8 @@ struct amdgpu_mqd {
  };
  
  struct amdgpu_pcie_reset_ctx {
+       bool in_link_reset;
+       bool occurs_dpc;
         bool audio_suspended;
  };
  
@@ -1469,6 +1473,7 @@ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                              const u32 array_size);
  
  int amdgpu_device_mode1_reset(struct amdgpu_device *adev);
+int amdgpu_device_link_reset(struct amdgpu_device *adev);
  bool amdgpu_device_supports_atpx(struct drm_device *dev);
  bool amdgpu_device_supports_px(struct drm_device *dev);
  bool amdgpu_device_supports_boco(struct drm_device *dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 1866d07db13313d482e223550094719d3baca0ac..7525128f971f47e425a1b6f177c307fa19506134 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3172,6 +3172,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
          * always assumed to be lost.
          */
         switch (amdgpu_asic_reset_method(adev)) {
+       case AMD_RESET_METHOD_LINK:
         case AMD_RESET_METHOD_BACO:
         case AMD_RESET_METHOD_MODE1:
                 return true;
@@ -5510,6 +5511,29 @@ mode1_reset_failed:
         return ret;
  }
  
+int amdgpu_device_link_reset(struct amdgpu_device *adev) /* link reset, then bootloader wait */
+{
+       int ret = 0; /* stays 0 when the DPM link reset below is skipped */
+
+       dev_info(adev->dev, "GPU link reset\n");
+
+       if (!adev->pcie_reset_ctx.occurs_dpc) /* DPM link reset is skipped when occurs_dpc is set */
+               ret = amdgpu_dpm_link_reset(adev);
+
+       if (ret)
+               goto link_reset_failed;
+
+       ret = amdgpu_psp_wait_for_bootloader(adev); /* runs whether or not the DPM call was made */
+       if (ret)
+               goto link_reset_failed;
+
+       return 0;
+
+link_reset_failed: /* reached when either step above returns non-zero */
+       dev_err(adev->dev, "GPU link reset failed\n");
+       return ret; /* hand the failing step's error code back to the caller */
+}
+
  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
                                  struct amdgpu_reset_context *reset_context)
  {
@@ -5814,6 +5838,7 @@ static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
  
         switch (amdgpu_asic_reset_method(adev)) {
         case AMD_RESET_METHOD_MODE1:
+       case AMD_RESET_METHOD_LINK:
                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
                 break;
         case AMD_RESET_METHOD_MODE2:
@@ -5951,6 +5976,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
                         list_add_tail(&tmp_adev->reset_list, device_list);
                         if (adev->shutdown)
                                 tmp_adev->shutdown = true;
+                       if (adev->pcie_reset_ctx.occurs_dpc)
+                               tmp_adev->pcie_reset_ctx.in_link_reset = true;
                 }
                 if (!list_is_first(&adev->reset_list, device_list))
                         list_rotate_to_front(&adev->reset_list, device_list);
@@ -5960,7 +5987,7 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
                 device_list_handle = device_list;
         }
  
-       if (!amdgpu_sriov_vf(adev)) {
+       if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
                 r = amdgpu_device_health_check(device_list_handle);
                 if (r)
                         return r;
@@ -6005,6 +6032,7 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
  
                 /* disable ras on ALL IPs */
                 if (!need_emergency_restart &&
+                     (!adev->pcie_reset_ctx.occurs_dpc) &&
                       amdgpu_device_ip_need_full_reset(tmp_adev))
                         amdgpu_ras_suspend(tmp_adev);
  
@@ -6035,7 +6063,11 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
  
  retry: /* Rest of adevs pre asic reset from XGMI hive. */
         list_for_each_entry(tmp_adev, device_list, reset_list) {
+               if (adev->pcie_reset_ctx.occurs_dpc)
+                       tmp_adev->no_hw_access = true;
                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
+               if (adev->pcie_reset_ctx.occurs_dpc)
+                       tmp_adev->no_hw_access = false;
                 /*TODO Should we stop ?*/
                 if (r) {
                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
@@ -6634,12 +6666,15 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
  {
         struct drm_device *dev = pci_get_drvdata(pdev);
         struct amdgpu_device *adev = drm_to_adev(dev);
-       int i;
+       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+       struct amdgpu_reset_context reset_context;
+       struct list_head device_list;
+       int r = 0;
  
-       DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
+       dev_info(adev->dev, "PCI error: detected callback!!\n");
  
-       if (adev->gmc.xgmi.num_physical_nodes > 1) {
-               DRM_WARN("No support for XGMI hive yet...");
+       if (!amdgpu_dpm_is_link_reset_supported(adev)) {
+               dev_warn(adev->dev, "No support for XGMI hive yet...\n");
                 return PCI_ERS_RESULT_DISCONNECT;
         }
  
@@ -6647,32 +6682,30 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
  
         switch (state) {
         case pci_channel_io_normal:
+               dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
                 return PCI_ERS_RESULT_CAN_RECOVER;
-       /* Fatal error, prepare for slot reset */
         case pci_channel_io_frozen:
-               /*
-                * Locking adev->reset_domain->sem will prevent any external access
-                * to GPU during PCI error recovery
-                */
-               amdgpu_device_lock_reset_domain(adev->reset_domain);
-               amdgpu_device_set_mp1_state(adev);
-
-               /*
-                * Block any work scheduling as we do for regular GPU reset
-                * for the duration of the recovery
-                */
-               for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-                       struct amdgpu_ring *ring = adev->rings[i];
-
-                       if (!amdgpu_ring_sched_ready(ring))
-                               continue;
-
-                       drm_sched_stop(&ring->sched, NULL);
+               /* Fatal error, prepare for slot reset */
+               dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
+
+               if (hive)
+                       mutex_lock(&hive->hive_lock);
+               adev->pcie_reset_ctx.occurs_dpc = true;
+               memset(&reset_context, 0, sizeof(reset_context));
+               INIT_LIST_HEAD(&device_list);
+
+               r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
+                                        hive, false);
+               if (hive) {
+                       mutex_unlock(&hive->hive_lock);
+                       amdgpu_put_xgmi_hive(hive);
                 }
-               atomic_inc(&adev->gpu_reset_counter);
+               if (r)
+                       return PCI_ERS_RESULT_DISCONNECT;
                 return PCI_ERS_RESULT_NEED_RESET;
         case pci_channel_io_perm_failure:
                 /* Permanent error, prepare for device removal */
+               dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
                 return PCI_ERS_RESULT_DISCONNECT;
         }
  
@@ -6685,8 +6718,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
   */
  pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
  {
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct amdgpu_device *adev = drm_to_adev(dev);
  
-       DRM_INFO("PCI error: mmio enabled callback!!\n");
+       dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");
  
         /* TODO - dump whatever for debugging purposes */
  
@@ -6710,10 +6745,12 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
  {
         struct drm_device *dev = pci_get_drvdata(pdev);
         struct amdgpu_device *adev = drm_to_adev(dev);
-       int r, i;
         struct amdgpu_reset_context reset_context;
-       u32 memsize;
+       struct amdgpu_device *tmp_adev = NULL;
+       struct amdgpu_hive_info *hive = NULL;
         struct list_head device_list;
+       int r = 0, i;
+       u32 memsize;
  
         /* PCI error slot reset should be skipped During RAS recovery */
         if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
@@ -6721,15 +6758,12 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
             amdgpu_ras_in_recovery(adev))
                 return PCI_ERS_RESULT_RECOVERED;
  
-       DRM_INFO("PCI error: slot reset callback!!\n");
+       dev_info(adev->dev, "PCI error: slot reset callback!!\n");
  
         memset(&reset_context, 0, sizeof(reset_context));
  
-       INIT_LIST_HEAD(&device_list);
-       list_add_tail(&adev->reset_list, &device_list);
-
         /* wait for asic to come out of reset */
-       msleep(500);
+       msleep(700);
  
         /* Restore PCI confspace */
         amdgpu_device_load_pci_state(pdev);
@@ -6750,26 +6784,40 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
         reset_context.method = AMD_RESET_METHOD_NONE;
         reset_context.reset_req_dev = adev;
         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-       set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
-
-       adev->no_hw_access = true;
-       r = amdgpu_device_pre_asic_reset(adev, &reset_context);
-       adev->no_hw_access = false;
-       if (r)
-               goto out;
+       set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+       INIT_LIST_HEAD(&device_list);
  
-       r = amdgpu_do_asic_reset(&device_list, &reset_context);
+       hive = amdgpu_get_xgmi_hive(adev);
+       if (hive) {
+               mutex_lock(&hive->hive_lock);
+               reset_context.hive = hive;
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       tmp_adev->pcie_reset_ctx.in_link_reset = true;
+                       list_add_tail(&tmp_adev->reset_list, &device_list);
+               }
+       } else {
+               set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+               list_add_tail(&adev->reset_list, &device_list);
+       }
  
+       r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
  out:
         if (!r) {
                 if (amdgpu_device_cache_pci_state(adev->pdev))
                         pci_restore_state(adev->pdev);
-
-               DRM_INFO("PCIe error recovery succeeded\n");
+               dev_info(adev->dev, "PCIe error recovery succeeded\n");
         } else {
-               DRM_ERROR("PCIe error recovery failed, err:%d", r);
-               amdgpu_device_unset_mp1_state(adev);
-               amdgpu_device_unlock_reset_domain(adev->reset_domain);
+               dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
+               if (tmp_adev) {
+                       list_for_each_entry(tmp_adev, &device_list, reset_list)
+                               amdgpu_device_unset_mp1_state(tmp_adev);
+                       amdgpu_device_unlock_reset_domain(adev->reset_domain);
+               }
+       }
+
+       if (hive) {
+               mutex_unlock(&hive->hive_lock);
+               amdgpu_put_xgmi_hive(hive);
         }
  
         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
@@ -6786,26 +6834,36 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
  {
         struct drm_device *dev = pci_get_drvdata(pdev);
         struct amdgpu_device *adev = drm_to_adev(dev);
-       int i;
-
+       struct list_head device_list;
+       struct amdgpu_hive_info *hive = NULL;
+       struct amdgpu_device *tmp_adev = NULL;
  
-       DRM_INFO("PCI error: resume callback!!\n");
+       dev_info(adev->dev, "PCI error: resume callback!!\n");
  
         /* Only continue execution for the case of pci_channel_io_frozen */
         if (adev->pci_channel_state != pci_channel_io_frozen)
                 return;
  
-       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-               struct amdgpu_ring *ring = adev->rings[i];
+       INIT_LIST_HEAD(&device_list);
  
-               if (!amdgpu_ring_sched_ready(ring))
-                       continue;
+       hive = amdgpu_get_xgmi_hive(adev);
+       if (hive) {
+               mutex_lock(&hive->hive_lock);
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       tmp_adev->pcie_reset_ctx.in_link_reset = false;
+                       list_add_tail(&tmp_adev->reset_list, &device_list);
+               }
+       } else
+               list_add_tail(&adev->reset_list, &device_list);
  
-               drm_sched_start(&ring->sched, 0);
-       }
+       amdgpu_device_sched_resume(&device_list, NULL, NULL);
+       amdgpu_device_gpu_resume(adev, &device_list, false);
+       adev->pcie_reset_ctx.occurs_dpc = false;
  
-       amdgpu_device_unset_mp1_state(adev);
-       amdgpu_device_unlock_reset_domain(adev->reset_domain);
+       if (hive) {
+               mutex_unlock(&hive->hive_lock);
+               amdgpu_put_xgmi_hive(hive);
+       }
  }
  
  bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c

index 659eab9b90bec626ef24cbe482063dc599928460..c457be3a3c56f5fea76d49e2c10c88d464d74a9a 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -584,6 +584,8 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
                  * Enable triggering of GPU reset only if specified
                  * by module parameter.
                  */
+               if (adev->pcie_reset_ctx.in_link_reset)
+                       return AMD_RESET_METHOD_LINK;
                 if (amdgpu_gpu_recovery == 4 || amdgpu_gpu_recovery == 5)
                         return AMD_RESET_METHOD_MODE2;
                 else if (!(adev->flags & AMD_IS_APU))
@@ -640,6 +642,9 @@ asic_reset:
         case AMD_RESET_METHOD_MODE2:
                 dev_info(adev->dev, "MODE2 reset\n");
                 return amdgpu_dpm_mode2_reset(adev);
+       case AMD_RESET_METHOD_LINK:
+               dev_info(adev->dev, "Link reset\n");
+               return amdgpu_device_link_reset(adev);
         default:
                 dev_info(adev->dev, "MODE1 reset\n");
                 return amdgpu_device_mode1_reset(adev);
author	Ce Sun <cesun102@amd.com>
	Fri, 21 Mar 2025 02:11:18 +0000 (10:11 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Mon, 7 Apr 2025 19:18:31 +0000 (15:18 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/soc15.c		patch \| blob \| blame \| history