git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
PCI/AER: Add optional logging callback for correctable error
author: Dave Jiang <dave.jiang@intel.com>
Wed, 30 Nov 2022 22:11:21 +0000 (15:11 -0700)
committer: Dan Williams <dan.j.williams@intel.com>
Sat, 3 Dec 2022 21:40:56 +0000 (13:40 -0800)
Some new devices, such as CXL devices, may want to record additional error
information on a corrected error. Add a callback to allow the PCI device
driver to do additional logging, such as providing additional stats for
user space RAS monitoring.

For CXL devices, this is actually a necessity, because the correctable
error status register in the CXL RAS capability structure must be written
in order to clear the unmasked correctable errors. See CXL spec rev3.0
8.2.4.16.

Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/166984619233.2804404.3966368388544312674.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Documentation/PCI/pci-error-recovery.rst
drivers/pci/pcie/aer.c
include/linux/pci.h

index 187f43a032006ce65041c9ba971bc31b9b69ed17..bdafeb4b66dcf786792d863bfa0804a429f7f605 100644 (file)
@@ -83,6 +83,7 @@ This structure has the form::
                int (*mmio_enabled)(struct pci_dev *dev);
                int (*slot_reset)(struct pci_dev *dev);
                void (*resume)(struct pci_dev *dev);
+               void (*cor_error_detected)(struct pci_dev *dev);
        };
 
 The possible channel states are::
@@ -422,5 +423,11 @@ That is, the recovery API only requires that:
    - drivers/net/cxgb3
    - drivers/net/s2io.c
 
+   The cor_error_detected() callback is invoked in handle_error_source() when
+   the error severity is "correctable". The callback is optional and allows
+   additional logging to be done if desired. See example:
+
+   - drivers/cxl/pci.c
+
 The End
 -------
index e2d8a74f83c341def915af240c45b50f161621d1..625f7b2cafe42000f60ff0f9c505aee206e4b65d 100644 (file)
@@ -961,8 +961,14 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
                if (aer)
                        pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
                                        info->status);
-               if (pcie_aer_is_native(dev))
+               if (pcie_aer_is_native(dev)) {
+                       struct pci_driver *pdrv = dev->driver;
+
+                       if (pdrv && pdrv->err_handler &&
+                           pdrv->err_handler->cor_error_detected)
+                               pdrv->err_handler->cor_error_detected(dev);
                        pcie_clear_device_status(dev);
+               }
        } else if (info->severity == AER_NONFATAL)
                pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
        else if (info->severity == AER_FATAL)
index 2bda4a4e47e815d30d68eb52da827c15e0417cde..2119a16ecb10b6194133f54b68621cb1f9df1bb3 100644 (file)
@@ -843,6 +843,9 @@ struct pci_error_handlers {
 
        /* Device driver may resume normal operations */
        void (*resume)(struct pci_dev *dev);
+
+       /* Allow device driver to record more details of a correctable error */
+       void (*cor_error_detected)(struct pci_dev *dev);
 };