}
/* When gpu reset is ongoing, ecc logging operations will be pended.
+ *
+ * The pending list is bounded by RAS_UMC_PENDING_ECC_MAX so that an ECC
+ * storm or repeated UMC error injection cannot make this list (and the
+ * kernel allocations behind it) grow without bound. Once the limit is
+ * reached, additional events are dropped and counted in
+ * pending_ecc_dropped; the early-check path also emits a rate-limited
+ * warning.
+ *
+ * Return: 0 if the event was queued, -ENOSPC if it was dropped because the
+ * list is full, -ENOMEM if the list node could not be allocated.
*/
int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct ras_bank_ecc_node *ecc_node;
+ /* early check so that a full list does not cost an allocation */
+ mutex_lock(&ras_umc->pending_ecc_lock);
+ if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+ ras_umc->pending_ecc_dropped++;
+ /*
+ * Log before unlocking: pending_ecc_dropped is modified by other
+ * callers under pending_ecc_lock, so reading it after the unlock
+ * would race with them.
+ */
+ RAS_DEV_WARN_RATELIMITED(ras_core->dev,
+ "pending ECC list full (%u), dropping bad bank event (total dropped:%u)\n",
+ RAS_UMC_PENDING_ECC_MAX, ras_umc->pending_ecc_dropped);
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+ return -ENOSPC;
+ }
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+
ecc_node = kzalloc_obj(*ecc_node);
if (!ecc_node)
return -ENOMEM;
memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
mutex_lock(&ras_umc->pending_ecc_lock);
+ /*
+ * Re-check under the lock to honor the cap across concurrent callers:
+ * the list may have filled up while the lock was released for the
+ * allocation. This path only counts the drop; it does not log.
+ */
+ if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+ ras_umc->pending_ecc_dropped++;
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+ kfree(ecc_node);
+ return -ENOSPC;
+ }
list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
+ ras_umc->pending_ecc_count++;
mutex_unlock(&ras_umc->pending_ecc_lock);
return 0;
if (!ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
list_del(&ecc_node->node);
kfree(ecc_node);
+ if (ras_umc->pending_ecc_count)
+ ras_umc->pending_ecc_count--;
}
}
+ if (ras_umc->pending_ecc_dropped) {
+ RAS_DEV_WARN(ras_core->dev,
+ "%u pending ECC bad-bank events were dropped during GPU reset\n",
+ ras_umc->pending_ecc_dropped);
+ ras_umc->pending_ecc_dropped = 0;
+ }
mutex_unlock(&ras_umc->pending_ecc_lock);
return 0;
list_del(&ecc_node->node);
kfree(ecc_node);
}
+ ras_umc->pending_ecc_count = 0;
+ ras_umc->pending_ecc_dropped = 0;
mutex_unlock(&ras_umc->pending_ecc_lock);
mutex_destroy(&ras_umc->tree_lock);
struct mutex pending_ecc_lock;
struct ras_umc_err_data umc_err_data;
struct list_head pending_ecc_list;
+ /* number of entries currently queued on pending_ecc_list */
+ u32 pending_ecc_count;
+ /* number of entries dropped because pending_ecc_list was full */
+ u32 pending_ecc_dropped;
};
+/*
+ * Upper bound on entries that can be queued on pending_ecc_list while a
+ * GPU reset is in progress. Beyond this, new ECC events are dropped to
+ * prevent unbounded kernel memory growth in case of an ECC storm or
+ * malicious/repeated UMC error injection.
+ *
+ * ras_umc_log_bad_bank_pending() compares pending_ecc_count against this
+ * limit and returns -ENOSPC for every event it drops.
+ */
+#define RAS_UMC_PENDING_ECC_MAX 8192
+
int ras_umc_sw_init(struct ras_core_context *ras);
int ras_umc_sw_fini(struct ras_core_context *ras);
int ras_umc_hw_init(struct ras_core_context *ras);