#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
u64 rootport_total_cor_errs;
u64 rootport_total_fatal_errs;
u64 rootport_total_nonfatal_errs;
+
+ /* Ratelimits for errors */
+ struct ratelimit_state correctable_ratelimit;
+ struct ratelimit_state nonfatal_ratelimit;
};
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
dev->aer_info = kzalloc(sizeof(*dev->aer_info), GFP_KERNEL);
+ ratelimit_state_init(&dev->aer_info->correctable_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+ ratelimit_state_init(&dev->aer_info->nonfatal_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+
/*
* We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
* PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
}
}
+static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
+{
+ switch (severity) {
+ case AER_NONFATAL:
+ return __ratelimit(&dev->aer_info->nonfatal_ratelimit);
+ case AER_CORRECTABLE:
+ return __ratelimit(&dev->aer_info->correctable_ratelimit);
+ default:
+ return 1; /* Don't ratelimit fatal errors */
+ }
+}
+
static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
const char **strings;
trace_aer_event(pci_name(dev), (info->status & ~info->mask),
info->severity, info->tlp_header_valid, &info->tlp);
+ if (!info->ratelimit_print[i])
+ return;
+
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
aer_error_severity_string[info->severity]);
trace_aer_event(pci_name(dev), (status & ~mask),
aer_severity, tlp_header_valid, &aer->header_log);
+ if (!aer_ratelimit(dev, info.severity))
+ return;
+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);
e_info->dev[i] = pci_dev_get(dev);
e_info->error_dev_num++;
+ /*
+ * Ratelimit AER log messages. "dev" is either the source
+ * identified by the root's Error Source ID or it has an unmasked
+ * error logged in its own AER Capability. Messages are emitted
+ * when "ratelimit_print[i]" is non-zero. If we will print detail
+ * for a downstream device, make sure we print the Error Source ID
+ * from the root as well.
+ */
+ if (aer_ratelimit(dev, e_info->severity)) {
+ e_info->ratelimit_print[i] = 1;
+ e_info->root_ratelimit_print = 1;
+ }
return 0;
}
* e_info->error_dev_num and e_info->dev[], based on the given information.
*/
static bool find_source_device(struct pci_dev *parent,
- struct aer_err_info *e_info)
+ struct aer_err_info *e_info)
{
struct pci_dev *dev = parent;
int result;
pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
entry.devfn);
if (!pdev) {
- pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
- entry.domain, entry.bus,
- PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
+ pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
+ entry.domain, entry.bus,
+ PCI_SLOT(entry.devfn),
+ PCI_FUNC(entry.devfn));
continue;
}
pci_print_aer(pdev, entry.severity, entry.regs);
bool found;
found = find_source_device(root, info);
- aer_print_source(root, info, found);
+
+ /*
+ * If we're going to log error messages, we've already set
+ * "info->root_ratelimit_print" and "info->ratelimit_print[i]" to
+ * non-zero (which enables printing) because this is either an
+ * ERR_FATAL or we found a device with an error logged in its AER
+ * Capability.
+ *
+ * If we didn't find the Error Source device, at least log the
+ * Requester ID from the ERR_* Message received by the Root Port or
+ * RCEC, ratelimited by the RP or RCEC.
+ */
+ if (info->root_ratelimit_print ||
+ (!found && aer_ratelimit(root, info->severity)))
+ aer_print_source(root, info, found);
+
if (found)
aer_process_err_devices(info);
}