drm/amdgpu: Update usage for bad page threshold

author Hawking Zhang <Hawking.Zhang@amd.com>

Wed, 22 Jan 2025 11:34:33 +0000 (19:34 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Thu, 13 Feb 2025 02:02:59 +0000 (21:02 -0500)
author Hawking Zhang <Hawking.Zhang@amd.com>
Wed, 22 Jan 2025 11:34:33 +0000 (19:34 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 13 Feb 2025 02:02:59 +0000 (21:02 -0500)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index e789f6790a1c63c57fd0cc68b8a7e977cb4ac058..f52f674477eb79510c291a74f47f9355bcfe0be7 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -964,7 +964,7 @@ module_param_named_unsafe(reset_method, amdgpu_reset_method, int, 0644);
   * result in the GPU entering bad status when the number of total
   * faulty pages by ECC exceeds the threshold value.
   */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = driver sets threshold)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = threshold determined by a formula, 0 < threshold < max records, user-defined threshold)");
  module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
  
  MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index f0924aa3f4e485ac60f05f5d71e7d65f5ed162c9..90394f89aba6bb0f9237c43cced65d75e3a43cc2 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3080,31 +3080,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
  
         /*
-        * Justification of value bad_page_cnt_threshold in ras structure
-        *
-        * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
-        * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
-        * scenarios accordingly.
-        *
-        * Bad page retirement enablement:
-        *    - If amdgpu_bad_page_threshold = -2,
-        *      bad_page_cnt_threshold = typical value by formula.
-        *
-        *    - When the value from user is 0 < amdgpu_bad_page_threshold <
-        *      max record length in eeprom, use it directly.
-        *
-        * Bad page retirement disablement:
-        *    - If amdgpu_bad_page_threshold = 0, bad page retirement
-        *      functionality is disabled, and bad_page_cnt_threshold will
-        *      take no effect.
+        * amdgpu_bad_page_threshold is used to configure
+        * the threshold for the number of bad pages.
+        * -1:  Threshold is set to default value
+        *      Driver will issue a warning message when threshold is reached
+        *      and continue runtime services.
+        * 0:   Disable bad page retirement
+        *      Driver will not retire bad pages
+        *      which is intended for debugging purpose.
+        * -2:  Threshold is determined by a formula
+        *      that assumes 1 bad page per 100M of local memory.
+        *      Driver will continue runtime services when threshold is reached.
+        * 0 < threshold < max number of bad page records in EEPROM,
+        *      A user-defined threshold is set
+        *      Driver will halt runtime services when this custom threshold is reached.
          */
-
-       if (amdgpu_bad_page_threshold < 0) {
+       if (amdgpu_bad_page_threshold == -2) {
                 u64 val = adev->gmc.mc_vram_size;
  
                 do_div(val, RAS_BAD_PAGE_COVER);
                 con->bad_page_cnt_threshold = min(lower_32_bits(val),
                                                   max_count);
+       } else if (amdgpu_bad_page_threshold == -1) {
+               con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
         } else {
                 con->bad_page_cnt_threshold = min_t(int, max_count,
                                                     amdgpu_bad_page_threshold);
@@ -3848,8 +3846,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
         case IP_VERSION(13, 0, 2):
         case IP_VERSION(13, 0, 6):
         case IP_VERSION(13, 0, 12):
+               con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
+               break;
         case IP_VERSION(13, 0, 14):
-               con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+               con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
                 break;
         default:
                 break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 82db986c36a0a83b53570e7a209521531431d1ab..cc4586581dba93614caf37b503d4b0338907360b 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -65,7 +65,7 @@ struct amdgpu_iv_entry;
  
  /* Reserve 8 physical dram row for possible retirement.
   * In worst cases, it will lose 8 * 2MB memory in vram domain */
-#define AMDGPU_RAS_RESERVED_VRAM_SIZE  (16ULL << 20)
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT  (16ULL << 20)
  /* The high three bits indicates socketid */
  #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index 52c16bfeccaad88330489283d2c7792e902554aa..723c655bb4d5cd9c5adf135773e610981bee4dc6 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -558,16 +558,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
                         return false;
  
         if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
-               if (amdgpu_bad_page_threshold == -1) {
+               if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
                         dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
-                               con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
+                                con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
+               if ((amdgpu_bad_page_threshold == -1) ||
+                   (amdgpu_bad_page_threshold == -2)) {
                         dev_warn(adev->dev,
-                               "But GPU can be operated due to bad_page_threshold = -1.\n");
+                                "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
                         return false;
                 } else {
-                       dev_warn(adev->dev, "This GPU is in BAD status.");
-                       dev_warn(adev->dev, "Please retire it or set a larger "
-                                "threshold value when reloading driver.\n");
+                       dev_warn(adev->dev,
+                                "Please consider adjusting the customized threshold.\n");
                         return true;
                 }
         }
@@ -758,7 +759,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                         control->tbl_rai.health_percent = 0;
                 }
  
-               if (amdgpu_bad_page_threshold != -1)
+               if ((amdgpu_bad_page_threshold != -1) &&
+                   (amdgpu_bad_page_threshold != -2))
                         ras->is_rma = true;
  
                 /* ignore the -ENOTSUPP return value */
@@ -1428,8 +1430,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
  
                 res = __verify_ras_table_checksum(control);
                 if (res)
-                       DRM_ERROR("RAS table incorrect checksum or error:%d\n",
-                                 res);
+                       dev_err(adev->dev,
+                               "RAS table incorrect checksum or error:%d\n",
+                               res);
  
                 /* Warn if we are at 90% of the threshold or above
                  */
@@ -1447,8 +1450,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
  
                 res = __verify_ras_table_checksum(control);
                 if (res) {
-                       dev_err(adev->dev, "RAS Table incorrect checksum or error:%d\n",
-                                 res);
+                       dev_err(adev->dev,
+                               "RAS Table incorrect checksum or error:%d\n",
+                               res);
                         return -EINVAL;
                 }
                 if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
@@ -1466,17 +1470,18 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                         res = amdgpu_ras_eeprom_correct_header_tag(control,
                                                                    RAS_TABLE_HDR_VAL);
                 } else {
-                       dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
+                       dev_warn(adev->dev,
+                               "RAS records:%d exceed threshold:%d\n",
                                 control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
-                       if (amdgpu_bad_page_threshold == -1) {
-                               dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
+                       if ((amdgpu_bad_page_threshold == -1) ||
+                           (amdgpu_bad_page_threshold == -2)) {
                                 res = 0;
+                               dev_warn(adev->dev,
+                                        "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
                         } else {
                                 ras->is_rma = true;
-                               dev_err(adev->dev,
-                                       "RAS records:%d exceed threshold:%d, "
-                                       "GPU will not be initialized. Replace this GPU or increase the threshold",
-                                       control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+                               dev_warn(adev->dev,
+                                        "User defined threshold is set, runtime service will be halt when threshold is reached\n");
                         }
                 }
         } else {
author	Hawking Zhang <Hawking.Zhang@amd.com>
	Wed, 22 Jan 2025 11:34:33 +0000 (19:34 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Thu, 13 Feb 2025 02:02:59 +0000 (21:02 -0500)
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c		patch \| blob \| blame \| history