drm/amdgpu: Avoid duplicate GPU resets caused by RMA

author Ce Sun <cesun102@amd.com>

Sun, 27 Jul 2025 04:06:55 +0000 (12:06 +0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Thu, 13 Nov 2025 20:34:09 +0000 (15:34 -0500)
author Ce Sun <cesun102@amd.com>
Sun, 27 Jul 2025 04:06:55 +0000 (12:06 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 13 Nov 2025 20:34:09 +0000 (15:34 -0500)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index f5148027107bc8028680d480993a75d2fb92d82d..d9cdc89d4cde18574c5b3aca887514d98fc8cae8 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2927,7 +2927,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
                                               page_retirement_dwork.work);
         struct amdgpu_device *adev = con->adev;
         struct ras_err_data err_data;
-       unsigned long err_cnt;
  
         /* If gpu reset is ongoing, delay retiring the bad pages */
         if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
@@ -2939,13 +2938,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
         amdgpu_ras_error_data_init(&err_data);
  
         amdgpu_umc_handle_bad_pages(adev, &err_data);
-       err_cnt = err_data.err_addr_cnt;
  
         amdgpu_ras_error_data_fini(&err_data);
  
-       if (err_cnt && amdgpu_ras_is_rma(adev))
-               amdgpu_ras_reset_gpu(adev);
-
         amdgpu_ras_schedule_retirement_dwork(con,
                         AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
  }
@@ -3008,6 +3003,9 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
         if (total_detect_count)
                 schedule_delayed_work(&ras->page_retirement_dwork, 0);
  
+       if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
+               amdgpu_ras_reset_gpu(adev);
+
         return 0;
  }
  
@@ -3043,6 +3041,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
                 reset_flags |= msg.reset;
         }
  
+       /*
+        * Try to ensure the poison creation handler has completed first,
+        * so that rma is set if the bad pages exceed the threshold.
+        */
+       flush_delayed_work(&con->page_retirement_dwork);
+
         /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
         if (reset_flags && !amdgpu_ras_is_rma(adev)) {
                 if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
@@ -3052,8 +3056,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
                 else
                         reset = reset_flags;
  
-               flush_delayed_work(&con->page_retirement_dwork);
-
                 con->gpu_reset_flags |= reset;
                 amdgpu_ras_reset_gpu(adev);
  
@@ -3174,6 +3176,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         mutex_init(&con->recovery_lock);
         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
         atomic_set(&con->in_recovery, 0);
+       atomic_set(&con->rma_in_recovery, 0);
         con->eeprom_control.bad_channel_bitmap = 0;
  
         max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 669720a9c60afd2767ff484e91458a2ec68cf84a..7e7521fedafc70e2204919fa6262001a87025881 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -510,6 +510,7 @@ struct amdgpu_ras {
         /* gpu recovery */
         struct work_struct recovery_work;
         atomic_t in_recovery;
+       atomic_t rma_in_recovery;
         struct amdgpu_device *adev;
         /* error handler data */
         struct ras_err_handler_data *eh_data;
author	Ce Sun <cesun102@amd.com>
	Sun, 27 Jul 2025 04:06:55 +0000 (12:06 +0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Thu, 13 Nov 2025 20:34:09 +0000 (15:34 -0500)
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h		patch \| blob \| blame \| history