git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
cxl/ras: Fix CPER handler device confusion
author: Dan Williams <dan.j.williams@intel.com>
Thu, 12 Jun 2025 19:20:43 +0000 (12:20 -0700)
committer: Dave Jiang <dave.jiang@intel.com>
Fri, 13 Jun 2025 16:02:04 +0000 (09:02 -0700)
By inspection, cxl_cper_handle_prot_err() is making a series of fragile
assumptions that can lead to crashes:

1/ It assumes that endpoints identified in the record are CXL-type-3
   devices; nothing guarantees that.

2/ It assumes that the device is bound to the cxl_pci driver; nothing
   guarantees that.

3/ Minor, it holds the device lock over the switch-port tracing for no
   reason as the trace is 100% generated from data in the record.

Correct those by checking that the PCIe endpoint parents a cxl_memdev
before assuming the format of the driver data, and move the lock to where
it is required. Consequently this also makes the implementation ready for
CXL accelerators that are not bound to cxl_pci.

Fixes: 36f257e3b0ba ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
Cc: Terry Bowman <terry.bowman@amd.com>
Cc: Li Ming <ming.li@zohomail.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>
Link: https://patch.msgid.link/20250612192043.2254617-1-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
drivers/cxl/core/ras.c

index 485a831695c7052b3c5ef7302801b6fb75a6af71..2731ba3a07993c5a8fecb0e4d76081afff7d432e 100644 (file)
@@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
                                               ras_cap.header_log);
 }
 
-static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
-                                 struct cxl_ras_capability_regs ras_cap)
+static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
+                                        struct cxl_ras_capability_regs ras_cap)
 {
        u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
-       struct cxl_dev_state *cxlds;
 
-       cxlds = pci_get_drvdata(pdev);
-       if (!cxlds)
-               return;
-
-       trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+       trace_cxl_aer_correctable_error(cxlmd, status);
 }
 
-static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
-                                   struct cxl_ras_capability_regs ras_cap)
+static void
+cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
+                              struct cxl_ras_capability_regs ras_cap)
 {
        u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
-       struct cxl_dev_state *cxlds;
        u32 fe;
 
-       cxlds = pci_get_drvdata(pdev);
-       if (!cxlds)
-               return;
-
        if (hweight32(status) > 1)
                fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
                                   ras_cap.cap_control));
        else
                fe = status;
 
-       trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+       trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
                                          ras_cap.header_log);
 }
 
+static int match_memdev_by_parent(struct device *dev, const void *uport)
+{
+       if (is_cxl_memdev(dev) && dev->parent == uport)
+               return 1;
+       return 0;
+}
+
 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 {
        unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
@@ -73,13 +71,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
                pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
                                            data->prot_err.agent_addr.bus,
                                            devfn);
+       struct cxl_memdev *cxlmd;
        int port_type;
 
        if (!pdev)
                return;
 
-       guard(device)(&pdev->dev);
-
        port_type = pci_pcie_type(pdev);
        if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
            port_type == PCI_EXP_TYPE_DOWNSTREAM ||
@@ -92,10 +89,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
                return;
        }
 
+       guard(device)(&pdev->dev);
+       if (!pdev->dev.driver)
+               return;
+
+       struct device *mem_dev __free(put_device) = bus_find_device(
+               &cxl_bus_type, NULL, pdev, match_memdev_by_parent);
+       if (!mem_dev)
+               return;
+
+       cxlmd = to_cxl_memdev(mem_dev);
        if (data->severity == AER_CORRECTABLE)
-               cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+               cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
        else
-               cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+               cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
 }
 
 static void cxl_cper_prot_err_work_fn(struct work_struct *work)