]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/xe_hw_error: Integrate DRM RAS with hardware error handling
author: Riana Tauro <riana.tauro@intel.com>
Wed, 4 Mar 2026 07:44:10 +0000 (13:14 +0530)
committer: Rodrigo Vivi <rodrigo.vivi@intel.com>
Fri, 6 Mar 2026 00:38:56 +0000 (19:38 -0500)
Initialize DRM RAS during hardware error init. Map the hardware error
severities to the UAPI error severities and refactor the file accordingly.

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20260304074412.464435-10-riana.tauro@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
drivers/gpu/drm/xe/xe_drm_ras_types.h
drivers/gpu/drm/xe/xe_hw_error.c

index 7acc5e7377b2a2c408eb9fd6655054490628fa14..8d729ad6a264dd862ece562942e0201a2f6ed393 100644 (file)
 
 struct drm_ras_node;
 
+/* Error categories reported by hardware */
+enum hardware_error {
+       HARDWARE_ERROR_CORRECTABLE = 0,
+       HARDWARE_ERROR_NONFATAL,
+       HARDWARE_ERROR_FATAL,
+       HARDWARE_ERROR_MAX
+};
+
 /**
  * struct xe_drm_ras_counter - XE RAS counter
  *
index 8c65291f36fc006d7fbd06781f49149e4c1774e4..baae050163df247badd74448dffd1c718f1e7128 100644 (file)
 #include "regs/xe_irq_regs.h"
 
 #include "xe_device.h"
+#include "xe_drm_ras.h"
 #include "xe_hw_error.h"
 #include "xe_mmio.h"
 #include "xe_survivability_mode.h"
 
 #define  HEC_UNCORR_FW_ERR_BITS 4
+
 extern struct fault_attr inject_csc_hw_error;
 
-/* Error categories reported by hardware */
-enum hardware_error {
-       HARDWARE_ERROR_CORRECTABLE = 0,
-       HARDWARE_ERROR_NONFATAL = 1,
-       HARDWARE_ERROR_FATAL = 2,
-       HARDWARE_ERROR_MAX,
-};
+static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
 
 static const char * const hec_uncorrected_fw_errors[] = {
        "Fatal",
@@ -32,23 +28,18 @@ static const char * const hec_uncorrected_fw_errors[] = {
        "Data Corruption"
 };
 
-static const char *hw_error_to_str(const enum hardware_error hw_err)
+static bool fault_inject_csc_hw_error(void)
 {
-       switch (hw_err) {
-       case HARDWARE_ERROR_CORRECTABLE:
-               return "CORRECTABLE";
-       case HARDWARE_ERROR_NONFATAL:
-               return "NONFATAL";
-       case HARDWARE_ERROR_FATAL:
-               return "FATAL";
-       default:
-               return "UNKNOWN";
-       }
+       return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
 }
 
-static bool fault_inject_csc_hw_error(void)
+static enum drm_xe_ras_error_severity hw_err_to_severity(const enum hardware_error hw_err)
 {
-       return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
+       if (hw_err == HARDWARE_ERROR_CORRECTABLE)
+               return DRM_XE_RAS_ERR_SEV_CORRECTABLE;
+
+       /* Uncorrectable errors comprise both fatal and non-fatal errors */
+       return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE;
 }
 
 static void csc_hw_error_work(struct work_struct *work)
@@ -64,7 +55,8 @@ static void csc_hw_error_work(struct work_struct *work)
 
 static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
-       const char *hw_err_str = hw_error_to_str(hw_err);
+       const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
+       const char *severity_str = error_severity[severity];
        struct xe_device *xe = tile_to_xe(tile);
        struct xe_mmio *mmio = &tile->mmio;
        u32 base, err_bit, err_src;
@@ -77,8 +69,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
        lockdep_assert_held(&xe->irq.lock);
        err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base));
        if (!err_src) {
-               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n",
-                                   tile->id, hw_err_str);
+               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s HEC_ERR_STATUS register blank\n",
+                                   tile->id, severity_str);
                return;
        }
 
@@ -86,8 +78,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
                fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base));
                for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) {
                        drm_err_ratelimited(&xe->drm, HW_ERR
-                                           "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n",
-                                            hw_err_str, hec_uncorrected_fw_errors[err_bit],
+                                           "HEC FW %s %s reported, bit[%d] is set\n",
+                                            hec_uncorrected_fw_errors[err_bit], severity_str,
                                             err_bit);
 
                        schedule_work(&tile->csc_hw_error_work);
@@ -99,7 +91,8 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
 
 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
-       const char *hw_err_str = hw_error_to_str(hw_err);
+       const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
+       const char *severity_str = error_severity[severity];
        struct xe_device *xe = tile_to_xe(tile);
        unsigned long flags;
        u32 err_src;
@@ -110,8 +103,8 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
        spin_lock_irqsave(&xe->irq.lock, flags);
        err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
        if (!err_src) {
-               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n",
-                                   tile->id, hw_err_str);
+               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s DEV_ERR_STAT register blank!\n",
+                                   tile->id, severity_str);
                goto unlock;
        }
 
@@ -146,6 +139,14 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
                        hw_error_source_handler(tile, hw_err);
 }
 
+static int hw_error_info_init(struct xe_device *xe)
+{
+       if (xe->info.platform != XE_PVC)
+               return 0;
+
+       return xe_drm_ras_init(xe);
+}
+
 /*
  * Process hardware errors during boot
  */
@@ -172,11 +173,16 @@ static void process_hw_errors(struct xe_device *xe)
 void xe_hw_error_init(struct xe_device *xe)
 {
        struct xe_tile *tile = xe_device_get_root_tile(xe);
+       int ret;
 
        if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
                return;
 
        INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
 
+       ret = hw_error_info_init(xe);
+       if (ret)
+               drm_err(&xe->drm, "Failed to initialize XE DRM RAS (%pe)\n", ERR_PTR(ret));
+
        process_hw_errors(xe);
 }