git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amd/ras: cap pending_ecc_list size
author Stanley.Yang <Stanley.Yang@amd.com>
Mon, 11 May 2026 11:44:16 +0000 (19:44 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 27 May 2026 14:50:57 +0000 (10:50 -0400)
Drop new entries once pending_ecc_count hits RAS_UMC_PENDING_ECC_MAX
(8192) so an ECC storm or repeated UMC error injection cannot exhaust
kernel memory. Dropped events are counted and reported via a
rate-limited warning.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
drivers/gpu/drm/amd/ras/rascore/ras_umc.c
drivers/gpu/drm/amd/ras/rascore/ras_umc.h

index 8156531a7b63770556769b29e65d5e5357a595ed..f34dda7ce87b17fe9bd5fd52614bfed7dcd770bd 100644 (file)
                        printk(KERN_WARNING fmt, ##__VA_ARGS__);                           \
        } while (0)
 
+#define RAS_DEV_WARN_RATELIMITED(device, fmt, ...)                                   \
+       do {                                                                       \
+               if (device)                                                              \
+                       dev_warn_ratelimited(((struct amdgpu_device *)device)->dev,        \
+                               fmt, ##__VA_ARGS__);                                            \
+               else                                                                   \
+                       printk_ratelimited(KERN_WARNING fmt, ##__VA_ARGS__);               \
+       } while (0)
+
 #define RAS_DEV_INFO(device, fmt, ...)                                                 \
        do {                                                                         \
                if (device)                                                                \
index 91dd730de3cecb4605482213083f4b90c1491f5c..f32ee2fecf536fd1101c7a023cf1c9ead83c1f29 100644 (file)
@@ -193,12 +193,29 @@ static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core,
 }
 
 /* When gpu reset is ongoing, ecc logging operations will be pended.
+ *
+ * The pending list is bounded by RAS_UMC_PENDING_ECC_MAX so that an ECC
+ * storm or repeated UMC error injection cannot make this list (and the
+ * kernel allocations behind it) grow without bound. Once the limit is
+ * reached, additional events are dropped and counted in
+ * pending_ecc_dropped, with a rate-limited warning emitted.
  */
 int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
 {
        struct ras_umc *ras_umc = &ras_core->ras_umc;
        struct ras_bank_ecc_node *ecc_node;
 
+       mutex_lock(&ras_umc->pending_ecc_lock);
+       if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+               ras_umc->pending_ecc_dropped++;
+               mutex_unlock(&ras_umc->pending_ecc_lock);
+               RAS_DEV_WARN_RATELIMITED(ras_core->dev,
+                       "pending ECC list full (%u), dropping bad bank event (total dropped:%u)\n",
+                       RAS_UMC_PENDING_ECC_MAX, ras_umc->pending_ecc_dropped);
+               return -ENOSPC;
+       }
+       mutex_unlock(&ras_umc->pending_ecc_lock);
+
        ecc_node = kzalloc_obj(*ecc_node);
        if (!ecc_node)
                return -ENOMEM;
@@ -206,7 +223,15 @@ int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_b
        memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
 
        mutex_lock(&ras_umc->pending_ecc_lock);
+       /* re-check under the lock to honor the cap across concurrent callers */
+       if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+               ras_umc->pending_ecc_dropped++;
+               mutex_unlock(&ras_umc->pending_ecc_lock);
+               kfree(ecc_node);
+               return -ENOSPC;
+       }
        list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
+       ras_umc->pending_ecc_count++;
        mutex_unlock(&ras_umc->pending_ecc_lock);
 
        return 0;
@@ -225,8 +250,16 @@ int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core)
                if (!ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
                        list_del(&ecc_node->node);
                        kfree(ecc_node);
+                       if (ras_umc->pending_ecc_count)
+                               ras_umc->pending_ecc_count--;
                }
        }
+       if (ras_umc->pending_ecc_dropped) {
+               RAS_DEV_WARN(ras_core->dev,
+                       "%u pending ECC bad-bank events were dropped during GPU reset\n",
+                       ras_umc->pending_ecc_dropped);
+               ras_umc->pending_ecc_dropped = 0;
+       }
        mutex_unlock(&ras_umc->pending_ecc_lock);
 
        return 0;
@@ -609,6 +642,8 @@ int ras_umc_sw_fini(struct ras_core_context *ras_core)
                list_del(&ecc_node->node);
                kfree(ecc_node);
        }
+       ras_umc->pending_ecc_count = 0;
+       ras_umc->pending_ecc_dropped = 0;
        mutex_unlock(&ras_umc->pending_ecc_lock);
 
        mutex_destroy(&ras_umc->tree_lock);
index 1d3026be509b81a41b371821d50e6d6ab822f123..237525b46b9bbe19f09b0a8b892cdf184f41269e 100644 (file)
@@ -139,8 +139,20 @@ struct ras_umc {
        struct mutex  pending_ecc_lock;
        struct ras_umc_err_data umc_err_data;
        struct list_head pending_ecc_list;
+       /* number of entries currently queued on pending_ecc_list */
+       u32 pending_ecc_count;
+       /* number of entries dropped because pending_ecc_list was full */
+       u32 pending_ecc_dropped;
 };
 
+/*
+ * Upper bound on entries that can be queued on pending_ecc_list while a
+ * GPU reset is in progress. Beyond this, new ECC events are dropped to
+ * prevent unbounded kernel memory growth in case of an ECC storm or
+ * malicious/repeated UMC error injection.
+ *
+ * Enforced in ras_umc_log_bad_bank_pending(): once pending_ecc_count
+ * reaches this value the event is counted in pending_ecc_dropped and the
+ * call returns -ENOSPC.
+ */
+#define RAS_UMC_PENDING_ECC_MAX  8192
+
 int ras_umc_sw_init(struct ras_core_context *ras);
 int ras_umc_sw_fini(struct ras_core_context *ras);
 int ras_umc_hw_init(struct ras_core_context *ras);