#ifndef _XE_HW_ERROR_REGS_H_
#define _XE_HW_ERROR_REGS_H_
+#define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118) /* status register at offset 0x118 from the given base */
+#define UNCORR_FW_REPORTED_ERR BIT(6) /* bit 6 of HEC_UNCORR_ERR_STATUS; when set, HEC_UNCORR_FW_ERR_DW0 is read for details */
+
+#define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124) /* firmware error dword 0, at offset 0x124 from the given base */
+
#define DEV_ERR_STAT_NONFATAL 0x100178
#define DEV_ERR_STAT_CORRECTABLE 0x10017c
#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
DEV_ERR_STAT_CORRECTABLE, \
DEV_ERR_STAT_NONFATAL))
-
+#define XE_CSC_ERROR BIT(17) /* bit 17 of the error-source value handled for DEV_ERR_STAT_REG(x); selects the CSC error handler */
#endif
* Copyright © 2025 Intel Corporation
*/
+#include "regs/xe_gsc_regs.h"
#include "regs/xe_hw_error_regs.h"
#include "regs/xe_irq_regs.h"
#include "xe_device.h"
#include "xe_hw_error.h"
#include "xe_mmio.h"
+#include "xe_survivability_mode.h"
+
+#define HEC_UNCORR_FW_ERR_BITS 4 /* number of low bits of HEC_UNCORR_FW_ERR_DW0 that are scanned; keep equal to the entry count of hec_uncorrected_fw_errors[] */
/* Error categories reported by hardware */
enum hardware_error {
HARDWARE_ERROR_MAX,
};
+static const char * const hec_uncorrected_fw_errors[] = { /* log names, indexed by the bit position set in HEC_UNCORR_FW_ERR_DW0 */
+ "Fatal", /* bit 0 */
+ "CSE Disabled", /* bit 1 */
+ "FD Corruption", /* bit 2 */
+ "Data Corruption" /* bit 3 */
+};
+
static const char *hw_error_to_str(const enum hardware_error hw_err)
{
switch (hw_err) {
}
}
+/*
+ * Work item callback queued by csc_hw_error_handler(). Resolves the tile
+ * that embeds @work, then asks the device to enter runtime survivability
+ * mode, logging an error if that request fails.
+ */
+static void csc_hw_error_work(struct work_struct *work)
+{
+	struct xe_tile *owner = container_of(work, struct xe_tile, csc_hw_error_work);
+	struct xe_device *xe = tile_to_xe(owner);
+
+	if (xe_survivability_mode_runtime_enable(xe))
+		drm_err(&xe->drm, "Failed to enable runtime survivability mode\n");
+}
+
+/*
+ * Handle a CSC error reported through the device error status register.
+ *
+ * @tile:   tile whose MMIO space is read and whose work item is scheduled
+ * @hw_err: error category, used only to label the log messages
+ *
+ * Only acts on Battlemage; returns immediately on any other platform.
+ * Must be called with xe->irq.lock held. Reads HEC_UNCORR_ERR_STATUS,
+ * logs every firmware-reported error bit found in HEC_UNCORR_FW_ERR_DW0,
+ * schedules the survivability work once if any such bit was set, and
+ * finally writes the status value that was read back to the register.
+ */
+static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
+{
+	const char *hw_err_str = hw_error_to_str(hw_err);
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_mmio *mmio = &tile->mmio;
+	u32 base, err_bit, err_src;
+	unsigned long fw_err;
+	bool fw_err_reported = false;
+
+	if (xe->info.platform != XE_BATTLEMAGE)
+		return;
+
+	base = BMG_GSC_HECI1_BASE;
+	lockdep_assert_held(&xe->irq.lock);
+	err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base));
+	if (!err_src) {
+		drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n",
+				    tile->id, hw_err_str);
+		return;
+	}
+
+	if (err_src & UNCORR_FW_REPORTED_ERR) {
+		fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base));
+		for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) {
+			drm_err_ratelimited(&xe->drm, HW_ERR
+					    "%s: HEC Uncorrected FW %s error reported, bit[%u] is set\n",
+					    hw_err_str, hec_uncorrected_fw_errors[err_bit],
+					    err_bit);
+			fw_err_reported = true;
+		}
+	}
+
+	/*
+	 * Queue the work once per interrupt rather than once per set bit, so
+	 * that several bits in one report cannot re-queue the work item after
+	 * it has already started executing.
+	 */
+	if (fw_err_reported)
+		schedule_work(&tile->csc_hw_error_work);
+
+	xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
+}
+
static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
{
const char *hw_err_str = hw_error_to_str(hw_err);
goto unlock;
}
- /* TODO: Process errrors per source */
+ if (err_src & XE_CSC_ERROR)
+ csc_hw_error_handler(tile, hw_err);
xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
*/
void xe_hw_error_init(struct xe_device *xe)
{
+ struct xe_tile *tile = xe_device_get_root_tile(xe); /* only the root tile's work item is initialized below */
+
if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
return;
+ INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); /* NOTE(review): presumably must precede process_hw_errors(), which may lead to this work being scheduled - confirm */
+
process_hw_errors(xe);
}