#ifndef _XE_HW_ERROR_REGS_H_
#define _XE_HW_ERROR_REGS_H_
-#define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118)
-#define UNCORR_FW_REPORTED_ERR BIT(6)
+#define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118)
+#define UNCORR_FW_REPORTED_ERR REG_BIT(6)
-#define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124)
+#define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124)
-#define DEV_ERR_STAT_NONFATAL 0x100178
-#define DEV_ERR_STAT_CORRECTABLE 0x10017c
-#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
- DEV_ERR_STAT_CORRECTABLE, \
- DEV_ERR_STAT_NONFATAL))
-#define XE_CSC_ERROR BIT(17)
+#define ERR_STAT_GT_COR 0x100160
+#define EU_GRF_COR_ERR REG_BIT(15)
+#define EU_IC_COR_ERR REG_BIT(14)
+#define SLM_COR_ERR REG_BIT(13)
+#define GUC_COR_ERR REG_BIT(1)
+
+#define ERR_STAT_GT_NONFATAL 0x100164
+#define ERR_STAT_GT_FATAL 0x100168
+#define EU_GRF_FAT_ERR REG_BIT(15)
+#define SLM_FAT_ERR REG_BIT(13)
+#define GUC_FAT_ERR REG_BIT(6)
+#define FPU_FAT_ERR REG_BIT(3)
+
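+/*
+ * hw_err indexes the 4-byte-spaced GT status registers:
+ * 0 -> ERR_STAT_GT_COR, 1 -> ERR_STAT_GT_NONFATAL, 2 -> ERR_STAT_GT_FATAL.
+ */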
+#define ERR_STAT_GT_REG(x) XE_REG(_PICK_EVEN((x), \
+ ERR_STAT_GT_COR, \
+ ERR_STAT_GT_NONFATAL))
+
+#define PVC_COR_ERR_MASK (GUC_COR_ERR | SLM_COR_ERR | \
+ EU_IC_COR_ERR | EU_GRF_COR_ERR)
+
+#define PVC_FAT_ERR_MASK (FPU_FAT_ERR | GUC_FAT_ERR | \
+ EU_GRF_FAT_ERR | SLM_FAT_ERR)
+
+#define DEV_ERR_STAT_NONFATAL 0x100178
+#define DEV_ERR_STAT_CORRECTABLE 0x10017c
+#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
+ DEV_ERR_STAT_CORRECTABLE, \
+ DEV_ERR_STAT_NONFATAL))
+
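+/* Bit positions within DEV_ERR_STAT, consumed via REG_BIT() */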
+#define XE_CSC_ERROR 17
+#define XE_GT_ERROR 0
+
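+/*
+ * GT error vector registers, one per error source; _PICK_EVEN extends the
+ * 4-byte stride of each pair across the whole vector range.
+ */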
+#define ERR_STAT_GT_FATAL_VECTOR_0 0x100260
+#define ERR_STAT_GT_FATAL_VECTOR_1 0x100264
+
+#define ERR_STAT_GT_FATAL_VECTOR_REG(x) XE_REG(_PICK_EVEN((x), \
+ ERR_STAT_GT_FATAL_VECTOR_0, \
+ ERR_STAT_GT_FATAL_VECTOR_1))
+
+#define ERR_STAT_GT_COR_VECTOR_0 0x1002a0
+#define ERR_STAT_GT_COR_VECTOR_1 0x1002a4
+
+#define ERR_STAT_GT_COR_VECTOR_REG(x) XE_REG(_PICK_EVEN((x), \
+ ERR_STAT_GT_COR_VECTOR_0, \
+ ERR_STAT_GT_COR_VECTOR_1))
+
+#define ERR_STAT_GT_VECTOR_REG(hw_err, x)	(((hw_err) == HARDWARE_ERROR_CORRECTABLE) ? \
+						 ERR_STAT_GT_COR_VECTOR_REG(x) : \
+						 ERR_STAT_GT_FATAL_VECTOR_REG(x))
#endif
* Copyright © 2025 Intel Corporation
*/
+#include <linux/bitmap.h>
#include <linux/fault-inject.h>
#include "regs/xe_gsc_regs.h"
#include "xe_mmio.h"
#include "xe_survivability_mode.h"
-#define HEC_UNCORR_FW_ERR_BITS 4
+#define GT_HW_ERROR_MAX_ERR_BITS 16
+#define HEC_UNCORR_FW_ERR_BITS 4
+#define XE_RAS_REG_SIZE 32
+
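+/* Non-zero when err_bit names a PVC error of the given severity */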
+#define PVC_ERROR_MASK_SET(hw_err, err_bit)	(((hw_err) == HARDWARE_ERROR_CORRECTABLE) ? \
+						 (PVC_COR_ERR_MASK & REG_BIT(err_bit)) : \
+						 (PVC_FAT_ERR_MASK & REG_BIT(err_bit)))
extern struct fault_attr inject_csc_hw_error;
"Data Corruption"
};
-static bool fault_inject_csc_hw_error(void)
-{
- return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
-}
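+/* DEV_ERR_STAT bit position -> uAPI RAS component reported on that bit */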
+static const unsigned long xe_hw_error_map[] = {
+ [XE_GT_ERROR] = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE,
+};
+
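+/*
+ * Indices of the ERR_STAT_GT_VECTOR registers; each vector reports a
+ * distinct source (subslice, L3 bank, TLB, L3 fabric).
+ */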
+enum gt_vector_regs {
+ ERR_STAT_GT_VECTOR0 = 0,
+ ERR_STAT_GT_VECTOR1,
+ ERR_STAT_GT_VECTOR2,
+ ERR_STAT_GT_VECTOR3,
+ ERR_STAT_GT_VECTOR4,
+ ERR_STAT_GT_VECTOR5,
+ ERR_STAT_GT_VECTOR6,
+ ERR_STAT_GT_VECTOR7,
+ ERR_STAT_GT_VECTOR_MAX
+};
+
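+/* Correctable errors report through vectors 0-3 only; fatal errors use all eight */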
+#define PVC_GT_VECTOR_LEN(hw_err)	(((hw_err) == HARDWARE_ERROR_CORRECTABLE) ? \
+					 ERR_STAT_GT_VECTOR4 : ERR_STAT_GT_VECTOR_MAX)
static enum drm_xe_ras_error_severity hw_err_to_severity(const enum hardware_error hw_err)
{
return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE;
}
+static bool fault_inject_csc_hw_error(void)
+{
+ return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
+}
+
static void csc_hw_error_work(struct work_struct *work)
{
struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work);
xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
}
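+/* Correctable errors only warrant a warning; anything worse is rate-limited as an error */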
+static void log_hw_error(struct xe_tile *tile, const char *name,
+ const enum drm_xe_ras_error_severity severity)
+{
+ const char *severity_str = error_severity[severity];
+ struct xe_device *xe = tile_to_xe(tile);
+
+ if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
+ drm_warn(&xe->drm, "%s %s detected\n", name, severity_str);
+ else
+ drm_err_ratelimited(&xe->drm, "%s %s detected\n", name, severity_str);
+}
+
+static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
+ const enum drm_xe_ras_error_severity severity)
+{
+ const char *severity_str = error_severity[severity];
+ struct xe_device *xe = tile_to_xe(tile);
+
+ if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
+		drm_warn(&xe->drm, "%s %s detected, ERR_STAT_GT_VECTOR%d:0x%08x\n",
+ name, severity_str, i, err);
+ else
+		drm_err_ratelimited(&xe->drm, "%s %s detected, ERR_STAT_GT_VECTOR%d:0x%08x\n",
+ name, severity_str, i, err);
+}
+
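+/*
+ * Decode a GT error signalled through DEV_ERR_STAT: walk the per-source
+ * vector registers, bump the matching RAS counters and clear what was read.
+ * Only PVC is handled for now.
+ */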
+static void gt_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err,
+ u32 error_id)
+{
+ const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
+ struct xe_device *xe = tile_to_xe(tile);
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[severity];
+ struct xe_mmio *mmio = &tile->mmio;
+ unsigned long err_stat = 0;
+ int i;
+
+ if (xe->info.platform != XE_PVC)
+ return;
+
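+	/* Nonfatal GT errors carry no vector decode here; just count and log */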
+ if (hw_err == HARDWARE_ERROR_NONFATAL) {
+ atomic_inc(&info[error_id].counter);
+ log_hw_error(tile, info[error_id].name, severity);
+ return;
+ }
+
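+	/* Walk each populated vector register, count its errors and clear it */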
+ for (i = 0; i < PVC_GT_VECTOR_LEN(hw_err); i++) {
+ u32 vector, val;
+
+ vector = xe_mmio_read32(mmio, ERR_STAT_GT_VECTOR_REG(hw_err, i));
+ if (!vector)
+ continue;
+
+ switch (i) {
+ case ERR_STAT_GT_VECTOR0:
+ case ERR_STAT_GT_VECTOR1: {
+ u32 errbit;
+
+ val = hweight32(vector);
+ atomic_add(val, &info[error_id].counter);
+ log_gt_err(tile, "Subslice", i, vector, severity);
+
+			/*
+			 * The error status register is populated only once per
+			 * error, so read and clear it on the first populated
+			 * vector only.
+			 */
+ if (err_stat)
+ break;
+
+ err_stat = xe_mmio_read32(mmio, ERR_STAT_GT_REG(hw_err));
+ for_each_set_bit(errbit, &err_stat, GT_HW_ERROR_MAX_ERR_BITS) {
+ if (PVC_ERROR_MASK_SET(hw_err, errbit))
+ atomic_inc(&info[error_id].counter);
+ }
+ if (err_stat)
+ xe_mmio_write32(mmio, ERR_STAT_GT_REG(hw_err), err_stat);
+ break;
+ }
+ case ERR_STAT_GT_VECTOR2:
+ case ERR_STAT_GT_VECTOR3:
+ val = hweight32(vector);
+ atomic_add(val, &info[error_id].counter);
+ log_gt_err(tile, "L3 BANK", i, vector, severity);
+ break;
+ case ERR_STAT_GT_VECTOR6:
+ val = hweight32(vector);
+ atomic_add(val, &info[error_id].counter);
+ log_gt_err(tile, "TLB", i, vector, severity);
+ break;
+ case ERR_STAT_GT_VECTOR7:
+ val = hweight32(vector);
+ atomic_add(val, &info[error_id].counter);
+ log_gt_err(tile, "L3 Fabric", i, vector, severity);
+ break;
+ default:
+ log_gt_err(tile, "Undefined", i, vector, severity);
+ }
+
+ xe_mmio_write32(mmio, ERR_STAT_GT_VECTOR_REG(hw_err, i), vector);
+ }
+}
+
static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
{
const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
const char *severity_str = error_severity[severity];
struct xe_device *xe = tile_to_xe(tile);
- unsigned long flags;
- u32 err_src;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[severity];
+ unsigned long flags, err_src;
+ u32 err_bit;
- if (xe->info.platform != XE_BATTLEMAGE)
+ if (!IS_DGFX(xe))
return;
spin_lock_irqsave(&xe->irq.lock, flags);
goto unlock;
}
- if (err_src & XE_CSC_ERROR)
+	/*
+	 * A CSC firmware error leaves the graphics device unrecoverable, so
+	 * stop decoding as soon as one is seen and just clear the status. The
+	 * only way to recover is a firmware flash; the device enters Runtime
+	 * Survivability mode when such errors are detected.
+	 */
+ if (err_src & REG_BIT(XE_CSC_ERROR)) {
csc_hw_error_handler(tile, hw_err);
+ goto clear_reg;
+ }
- xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
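+	/* No RAS counters registered for this severity; just clear the status */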
+ if (!info)
+ goto clear_reg;
+
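+	/* Map each asserted status bit to its RAS component and count it */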
+ for_each_set_bit(err_bit, &err_src, XE_RAS_REG_SIZE) {
+ const char *name;
+ u32 error_id;
+
+		/* Bits beyond the map cannot name a known component */
+ if (err_bit >= ARRAY_SIZE(xe_hw_error_map))
+ break;
+
+ error_id = xe_hw_error_map[err_bit];
+
+		/* Skip unmapped components and anything outside the uAPI range */
+ if (!error_id || error_id >= DRM_XE_RAS_ERR_COMP_MAX)
+ continue;
+ name = info[error_id].name;
+ if (!name)
+ continue;
+
+ if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) {
+ drm_warn(&xe->drm, HW_ERR
+ "TILE%d reported %s %s, bit[%d] is set\n",
+ tile->id, name, severity_str, err_bit);
+ } else {
+ drm_err_ratelimited(&xe->drm, HW_ERR
+ "TILE%d reported %s %s, bit[%d] is set\n",
+ tile->id, name, severity_str, err_bit);
+ }
+ if (err_bit == XE_GT_ERROR)
+ gt_hw_error_handler(tile, hw_err, error_id);
+ }
+
+clear_reg:
+ xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
unlock:
spin_unlock_irqrestore(&xe->irq.lock, flags);
}
if (fault_inject_csc_hw_error())
schedule_work(&tile->csc_hw_error_work);
- for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
+ for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
if (master_ctl & ERROR_IRQ(hw_err))
hw_error_source_handler(tile, hw_err);
+ }
}
static int hw_error_info_init(struct xe_device *xe)