]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
vmcoreinfo: track and log recoverable hardware errors
authorBreno Leitao <leitao@debian.org>
Fri, 10 Oct 2025 10:36:50 +0000 (03:36 -0700)
committerAndrew Morton <akpm@linux-foundation.org>
Thu, 27 Nov 2025 22:24:44 +0000 (14:24 -0800)
Introduce a generic infrastructure for tracking recoverable hardware
errors (HW errors that are visible to the OS but do not cause a panic)
and record them for vmcore consumption.  This aids post-mortem crash
analysis tools by preserving a count and timestamp for the last occurrence
of such errors.  On the other hand, correctable errors, which the OS
typically remains unaware of because the underlying hardware handles them
transparently, are less relevant for crash dumps and therefore are NOT
tracked in this infrastructure.

Add centralized logging for sources of recoverable hardware errors,
categorized by the subsystem that was notified of the error.

hwerr_data is write-only at kernel runtime, and it is meant to be read
from vmcore using tools like crash/drgn.  For example, this is what it
looks like when opening the crashdump from drgn.

>>> prog['hwerr_data']
(struct hwerr_info[1]){
{
.count = (int)844,
.timestamp = (time64_t)1752852018,
},
...

This helps fleet operators quickly triage whether a crash may be
influenced by recoverable hardware errors (which exercise an uncommon code
path in the kernel), especially when recoverable errors occurred shortly
before a panic, such as the bug fixed by commit ee62ce7a1d90 ("page_pool:
Track DMA-mapped pages and unmap them when destroying the pool").

This is not intended to replace full hardware diagnostics but provides a
fast way to correlate hardware events with kernel panics.

Rare machine check exceptions—like those indicated by mce_flags.p5 or
mce_flags.winchip—are not accounted for in this method, as they fall
outside the intended usage scope for this feature's user base.

[leitao@debian.org: add hw-recoverable-errors to toctree]
Link: https://lkml.kernel.org/r/20251127-vmcoreinfo_fix-v1-1-26f5b1c43da9@debian.org
Link: https://lkml.kernel.org/r/20251010-vmcore_hw_error-v5-1-636ede3efe44@debian.org
Signed-off-by: Breno Leitao <leitao@debian.org>
Suggested-by: Tony Luck <tony.luck@intel.com>
Suggested-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com> [APEI]
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Bob Moore <robert.moore@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morse <james.morse@arm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/driver-api/hw-recoverable-errors.rst [new file with mode: 0644]
Documentation/driver-api/index.rst
arch/x86/kernel/cpu/mce/core.c
drivers/acpi/apei/ghes.c
drivers/pci/pcie/aer.c
include/linux/vmcore_info.h
include/uapi/linux/vmcore.h
kernel/vmcore_info.c

diff --git a/Documentation/driver-api/hw-recoverable-errors.rst b/Documentation/driver-api/hw-recoverable-errors.rst
new file mode 100644 (file)
index 0000000..fc526c3
--- /dev/null
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+Recoverable Hardware Error Tracking in vmcoreinfo
+=================================================
+
+Overview
+--------
+
+This feature provides a generic infrastructure within the Linux kernel to track
+and log recoverable hardware errors. These are hardware errors that are visible
+to the OS and do not cause an immediate panic, but that may still influence
+system health, mainly because they make the kernel execute uncommon code paths.
+
+By recording counts and timestamps of recoverable errors into the vmcoreinfo
+crash dump notes, this infrastructure aids post-mortem crash analysis tools in
+correlating hardware events with kernel failures. This enables faster triage
+and better understanding of root causes, especially in large-scale cloud
+environments where hardware issues are common.
+
+Benefits
+--------
+
+- Facilitates correlation of hardware recoverable errors with kernel panics or
+  unusual code paths that lead to system crashes.
+- Provides operators and cloud providers quick insights, improving reliability
+  and reducing troubleshooting time.
+- Complements existing full hardware diagnostics without replacing them.
+
+Data Exposure and Consumption
+-----------------------------
+
+- The tracked error data consists of per-error-type counts and timestamps of
+  last occurrence.
+- This data is stored in the `hwerr_data` array, categorized by error source
+  types like CPU, memory, PCI, CXL, and others.
+- It is exposed via vmcoreinfo crash dump notes and can be read using tools
+  like `crash`, `drgn`, or other kernel crash analysis utilities.
+- There is no way to read this data other than from crash dumps.
+- These errors are divided by area, which includes CPU, Memory, PCI, CXL and
+  others.
+
+Typical usage example (in drgn REPL):
+
+.. code-block:: python
+
+    >>> prog['hwerr_data']
+    (struct hwerr_info[HWERR_RECOV_MAX]){
+        {
+            .count = (int)844,
+            .timestamp = (time64_t)1752852018,
+        },
+        ...
+    }
+
+Enabling
+--------
+
+- This feature is enabled when CONFIG_VMCORE_INFO is set.
+
index 3e2a270bd82826cd78ffc6f18214fdbde151a36a..a35705b44799624cc0c3399d350091824340879b 100644 (file)
@@ -96,6 +96,7 @@ Subsystem-specific APIs
    gpio/index
    hsi
    hte/index
+   hw-recoverable-errors
    i2c
    iio/index
    infiniband
index 460e90a1a0b172d2b8b88e3635922c637b6e684a..08adbf4cd6edc7a1c619df42134e78662999a2a0 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 #include <linux/kexec.h>
+#include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
 #include <asm/cpu_device_id.h>
@@ -1700,6 +1701,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
        }
 
 out:
+       /* Given it didn't panic, mark it as recoverable */
+       hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
        instrumentation_end();
 
 clear:
index 97ee19f2cae0607be65aacce4cd21be99ae0a7a3..92b0e3c391b2d199d599700e2c6bf80c48f15153 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/uuid.h>
 #include <linux/ras.h>
 #include <linux/task_work.h>
+#include <linux/vmcore_info.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -867,6 +868,40 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL");
 
+static void ghes_log_hwerr(int sev, guid_t *sec_type)
+{
+       if (sev != CPER_SEV_RECOVERABLE)
+               return;
+
+       if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) ||
+           guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) ||
+           guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
+               hwerr_log_error_type(HWERR_RECOV_CPU);
+               return;
+       }
+
+       if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) ||
+           guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) ||
+           guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) ||
+           guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) {
+               hwerr_log_error_type(HWERR_RECOV_CXL);
+               return;
+       }
+
+       if (guid_equal(sec_type, &CPER_SEC_PCIE) ||
+           guid_equal(sec_type, &CPER_SEC_PCI_X_BUS)) {
+               hwerr_log_error_type(HWERR_RECOV_PCI);
+               return;
+       }
+
+       if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
+               hwerr_log_error_type(HWERR_RECOV_MEMORY);
+               return;
+       }
+
+       hwerr_log_error_type(HWERR_RECOV_OTHERS);
+}
+
 static void ghes_do_proc(struct ghes *ghes,
                         const struct acpi_hest_generic_status *estatus)
 {
@@ -888,6 +923,7 @@ static void ghes_do_proc(struct ghes *ghes,
                if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
                        fru_text = gdata->fru_text;
 
+               ghes_log_hwerr(sev, sec_type);
                if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
                        struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
index 0b5ed4722ac3238362c98812be7179bf665d3ce6..e0bcaa896803c91e7451e57fed06a725c90c6114 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
+#include <linux/vmcore_info.h>
 #include <acpi/apei.h>
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
@@ -765,6 +766,7 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
                break;
        case AER_NONFATAL:
                aer_info->dev_total_nonfatal_errs++;
+               hwerr_log_error_type(HWERR_RECOV_PCI);
                counter = &aer_info->dev_nonfatal_errs[0];
                max = AER_MAX_TYPEOF_UNCOR_ERRS;
                break;
index 37e003ae52626ad83b89475c45f634c5ac6c61a1..e71518caacdfcf1a85e9ffaeb7926b55972c07dd 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
+#include <uapi/linux/vmcore.h>
 
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
 #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
@@ -77,4 +78,11 @@ extern u32 *vmcoreinfo_note;
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
                          void *data, size_t data_len);
 void final_note(Elf_Word *buf);
+
+#ifdef CONFIG_VMCORE_INFO
+void hwerr_log_error_type(enum hwerr_error_type src);
+#else
+static inline void hwerr_log_error_type(enum hwerr_error_type src) {};
+#endif
+
 #endif /* LINUX_VMCORE_INFO_H */
index 3e9da91866ffd38d94c658918193d9c0cd3cae2e..2ba89fafa518ae3ef158dd3384faef565b7db24c 100644 (file)
@@ -15,4 +15,13 @@ struct vmcoredd_header {
        __u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */
 };
 
+enum hwerr_error_type {
+       HWERR_RECOV_CPU,
+       HWERR_RECOV_MEMORY,
+       HWERR_RECOV_PCI,
+       HWERR_RECOV_CXL,
+       HWERR_RECOV_OTHERS,
+       HWERR_RECOV_MAX,
+};
+
 #endif /* _UAPI_VMCORE_H */
index e066d31d08f89542b78fabf48efd2685aba2293c..fe9bf8db1922e6a269e3d4034b238266751dcbcb 100644 (file)
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+struct hwerr_info {
+       atomic_t count;
+       time64_t timestamp;
+};
+
+static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
                          void *data, size_t data_len)
 {
@@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
+void hwerr_log_error_type(enum hwerr_error_type src)
+{
+       if (src < 0 || src >= HWERR_RECOV_MAX)
+               return;
+
+       atomic_inc(&hwerr_data[src].count);
+       WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());
+}
+EXPORT_SYMBOL_GPL(hwerr_log_error_type);
+
 static int __init crash_save_vmcoreinfo_init(void)
 {
        vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);