#include "regs/xe_irq_regs.h"
#include "xe_device.h"
+#include "xe_drm_ras.h"
#include "xe_hw_error.h"
#include "xe_mmio.h"
#include "xe_survivability_mode.h"
#define HEC_UNCORR_FW_ERR_BITS 4
+
extern struct fault_attr inject_csc_hw_error;
-/* Error categories reported by hardware */
-enum hardware_error {
- HARDWARE_ERROR_CORRECTABLE = 0,
- HARDWARE_ERROR_NONFATAL = 1,
- HARDWARE_ERROR_FATAL = 2,
- HARDWARE_ERROR_MAX,
-};
+static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
static const char * const hec_uncorrected_fw_errors[] = {
"Fatal",
"Data Corruption"
};
-static const char *hw_error_to_str(const enum hardware_error hw_err)
+static bool fault_inject_csc_hw_error(void)
{
- switch (hw_err) {
- case HARDWARE_ERROR_CORRECTABLE:
- return "CORRECTABLE";
- case HARDWARE_ERROR_NONFATAL:
- return "NONFATAL";
- case HARDWARE_ERROR_FATAL:
- return "FATAL";
- default:
- return "UNKNOWN";
- }
+ return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
}
-static bool fault_inject_csc_hw_error(void)
+static enum drm_xe_ras_error_severity hw_err_to_severity(const enum hardware_error hw_err)
{
- return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
+ if (hw_err == HARDWARE_ERROR_CORRECTABLE)
+ return DRM_XE_RAS_ERR_SEV_CORRECTABLE;
+
+ /* Uncorrectable errors comprise of both fatal and non-fatal errors */
+ return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE;
}
static void csc_hw_error_work(struct work_struct *work)
static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
{
- const char *hw_err_str = hw_error_to_str(hw_err);
+ const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
+ const char *severity_str = error_severity[severity];
struct xe_device *xe = tile_to_xe(tile);
struct xe_mmio *mmio = &tile->mmio;
u32 base, err_bit, err_src;
lockdep_assert_held(&xe->irq.lock);
err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base));
if (!err_src) {
- drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n",
- tile->id, hw_err_str);
+ drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s HEC_ERR_STATUS register blank\n",
+ tile->id, severity_str);
return;
}
fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base));
for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) {
drm_err_ratelimited(&xe->drm, HW_ERR
- "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n",
- hw_err_str, hec_uncorrected_fw_errors[err_bit],
+ "HEC FW %s %s reported, bit[%d] is set\n",
+ hec_uncorrected_fw_errors[err_bit], severity_str,
err_bit);
schedule_work(&tile->csc_hw_error_work);
static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
{
- const char *hw_err_str = hw_error_to_str(hw_err);
+ const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
+ const char *severity_str = error_severity[severity];
struct xe_device *xe = tile_to_xe(tile);
unsigned long flags;
u32 err_src;
spin_lock_irqsave(&xe->irq.lock, flags);
err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
if (!err_src) {
- drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n",
- tile->id, hw_err_str);
+ drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s DEV_ERR_STAT register blank!\n",
+ tile->id, severity_str);
goto unlock;
}
hw_error_source_handler(tile, hw_err);
}
+static int hw_error_info_init(struct xe_device *xe)
+{
+ if (xe->info.platform != XE_PVC)
+ return 0;
+
+ return xe_drm_ras_init(xe);
+}
+
/*
* Process hardware errors during boot
*/
void xe_hw_error_init(struct xe_device *xe)
{
struct xe_tile *tile = xe_device_get_root_tile(xe);
+ int ret;
if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
return;
INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
+ ret = hw_error_info_init(xe);
+ if (ret)
+ drm_err(&xe->drm, "Failed to initialize XE DRM RAS (%pe)\n", ERR_PTR(ret));
+
process_hw_errors(xe);
}