]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/ras: Introduce correctable error handling
author: Raag Jadav <raag.jadav@intel.com>
Tue, 28 Apr 2026 05:48:26 +0000 (11:18 +0530)
committer: Riana Tauro <riana.tauro@intel.com>
Thu, 30 Apr 2026 06:01:45 +0000 (11:31 +0530)
Add initial support for correctable error handling, which is serviced
using system controller events. Currently we only log the errors in
dmesg, but this serves as a foundation for the RAS infrastructure and
will be further extended to facilitate other RAS features.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Mallesh Koujalagi <mallesh.koujalagi@intel.com>
Reviewed-by: Riana Tauro <riana.tauro@intel.com>
Link: https://patch.msgid.link/20260428054826.1202076-4-raag.jadav@intel.com
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
drivers/gpu/drm/xe/Makefile
drivers/gpu/drm/xe/xe_ras.c [new file with mode: 0644]
drivers/gpu/drm/xe/xe_ras.h [new file with mode: 0644]
drivers/gpu/drm/xe/xe_ras_types.h [new file with mode: 0644]
drivers/gpu/drm/xe/xe_sysctrl_event.c

index f22297545ae684ef87f717e177699cc874e923f5..bd1697733335eb9a2e3eeb4f890028e363df0dc3 100644 (file)
@@ -113,6 +113,7 @@ xe-y += xe_bb.o \
        xe_pxp_submit.o \
        xe_query.o \
        xe_range_fence.o \
+       xe_ras.o \
        xe_reg_sr.o \
        xe_reg_whitelist.o \
        xe_ring_ops.o \
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
new file mode 100644 (file)
index 0000000..4cb16b4
--- /dev/null
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#include "xe_device.h"
+#include "xe_printk.h"
+#include "xe_ras.h"
+#include "xe_ras_types.h"
+#include "xe_sysctrl.h"
+#include "xe_sysctrl_event_types.h"
+
+/*
+ * Severity of detected errors. The values index xe_ras_severities[];
+ * XE_RAS_SEV_MAX is the entry count and the bound checked by sev_to_str(),
+ * which maps anything at or above it to XE_RAS_SEV_NOT_SUPPORTED.
+ */
+enum xe_ras_severity {
+       XE_RAS_SEV_NOT_SUPPORTED = 0,
+       XE_RAS_SEV_CORRECTABLE,
+       XE_RAS_SEV_UNCORRECTABLE,
+       XE_RAS_SEV_INFORMATIONAL,
+       XE_RAS_SEV_MAX
+};
+
+/*
+ * Major IP blocks/components where errors can originate. The values index
+ * xe_ras_components[]; XE_RAS_COMP_MAX is the entry count and the bound
+ * checked by comp_to_str(), which maps anything at or above it to
+ * XE_RAS_COMP_NOT_SUPPORTED.
+ */
+enum xe_ras_component {
+       XE_RAS_COMP_NOT_SUPPORTED = 0,
+       XE_RAS_COMP_DEVICE_MEMORY,
+       XE_RAS_COMP_CORE_COMPUTE,
+       XE_RAS_COMP_RESERVED,
+       XE_RAS_COMP_PCIE,
+       XE_RAS_COMP_FABRIC,
+       XE_RAS_COMP_SOC_INTERNAL,
+       XE_RAS_COMP_MAX
+};
+
+static const char *const xe_ras_severities[] = { /* log strings, indexed by enum xe_ras_severity */
+       [XE_RAS_SEV_NOT_SUPPORTED]              = "Not Supported",
+       [XE_RAS_SEV_CORRECTABLE]                = "Correctable Error",
+       [XE_RAS_SEV_UNCORRECTABLE]              = "Uncorrectable Error",
+       [XE_RAS_SEV_INFORMATIONAL]              = "Informational Error",
+};
+static_assert(ARRAY_SIZE(xe_ras_severities) == XE_RAS_SEV_MAX); /* table length must track the enum */
+
+static const char *const xe_ras_components[] = { /* log strings, indexed by enum xe_ras_component */
+       [XE_RAS_COMP_NOT_SUPPORTED]             = "Not Supported",
+       [XE_RAS_COMP_DEVICE_MEMORY]             = "Device Memory",
+       [XE_RAS_COMP_CORE_COMPUTE]              = "Core Compute",
+       [XE_RAS_COMP_RESERVED]                  = "Reserved",
+       [XE_RAS_COMP_PCIE]                      = "PCIe",
+       [XE_RAS_COMP_FABRIC]                    = "Fabric",
+       [XE_RAS_COMP_SOC_INTERNAL]              = "SoC Internal",
+};
+static_assert(ARRAY_SIZE(xe_ras_components) == XE_RAS_COMP_MAX); /* table length must track the enum */
+
+static inline const char *sev_to_str(u8 severity)
+{
+       /* Values outside the table are reported as "Not Supported" */
+       u8 idx = severity < XE_RAS_SEV_MAX ? severity : XE_RAS_SEV_NOT_SUPPORTED;
+
+       return xe_ras_severities[idx];
+}
+
+static inline const char *comp_to_str(u8 component)
+{
+       /* Values outside the table are reported as "Not Supported" */
+       u8 idx = component < XE_RAS_COMP_MAX ? component : XE_RAS_COMP_NOT_SUPPORTED;
+
+       return xe_ras_components[idx];
+}
+
+/**
+ * xe_ras_counter_threshold_crossed() - Log errors carried by a threshold crossed event
+ * @xe: xe device instance
+ * @response: sysctrl event response; its data is read as a
+ *            struct xe_ras_threshold_crossed
+ *
+ * Logs the reported counter count, then one line per reported error with its
+ * component and severity. A count of 0 or above XE_RAS_NUM_COUNTERS is logged
+ * as an error, and at most XE_RAS_NUM_COUNTERS entries are ever walked.
+ */
+void xe_ras_counter_threshold_crossed(struct xe_device *xe,
+                                     struct xe_sysctrl_event_response *response)
+{
+       struct xe_ras_threshold_crossed *event = (void *)&response->data;
+       u32 count = event->ncounters;
+       /* Never walk past the fixed-size counters[] array */
+       u32 limit = count < XE_RAS_NUM_COUNTERS ? count : XE_RAS_NUM_COUNTERS;
+       u32 i;
+
+       /* The event layout must fit inside the response payload */
+       BUILD_BUG_ON(sizeof(*event) > sizeof(response->data));
+       xe_device_assert_mem_access(xe);
+
+       if (count && count <= XE_RAS_NUM_COUNTERS)
+               xe_warn(xe, "[RAS]: counter threshold crossed, %u new errors\n", count);
+       else
+               xe_err(xe, "sysctrl: unexpected counter threshold crossed %u\n", count);
+
+       for (i = 0; i < limit; i++) {
+               const struct xe_ras_error_common *err = &event->counters[i].common;
+
+               xe_warn(xe, "[RAS]: %s %s detected\n",
+                       comp_to_str(err->component), sev_to_str(err->severity));
+       }
+}
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
new file mode 100644 (file)
index 0000000..ea90593
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_RAS_H_
+#define _XE_RAS_H_
+
+struct xe_device;
+struct xe_sysctrl_event_response;
+
+void xe_ras_counter_threshold_crossed(struct xe_device *xe,
+                                     struct xe_sysctrl_event_response *response);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h
new file mode 100644 (file)
index 0000000..4e63c67
--- /dev/null
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_RAS_TYPES_H_
+#define _XE_RAS_TYPES_H_
+
+#include <linux/types.h>
+
+#define XE_RAS_NUM_COUNTERS                    16
+
+/**
+ * struct xe_ras_error_common - Error fields that are common across all products
+ *
+ * Packed, so the two single-byte fields occupy exactly two bytes. These are
+ * the fields xe_ras_counter_threshold_crossed() reads when logging an error.
+ */
+struct xe_ras_error_common {
+       /** @severity: Error severity, turned into a log string by sev_to_str() */
+       u8 severity;
+       /** @component: IP block where error originated, turned into a log string by comp_to_str() */
+       u8 component;
+} __packed;
+
+/**
+ * struct xe_ras_error_unit - Error unit information
+ *
+ * Packed: @instance follows @tile directly, with no padding in between.
+ */
+struct xe_ras_error_unit {
+       /** @tile: Tile identifier */
+       u8 tile;
+       /** @instance: Instance identifier specific to IP */
+       u32 instance;
+} __packed;
+
+/**
+ * struct xe_ras_error_cause - Error cause information
+ *
+ * Packed: the struct ends right after @reserved, with no trailing padding.
+ */
+struct xe_ras_error_cause {
+       /** @cause: Cause/checker */
+       u32 cause;
+       /** @reserved: For future use */
+       u8 reserved;
+} __packed;
+
+/**
+ * struct xe_ras_error_product - Error fields that are specific to the product
+ *
+ * Holds the packed unit and cause records back to back.
+ */
+struct xe_ras_error_product {
+       /** @unit: Unit within IP block */
+       struct xe_ras_error_unit unit;
+       /** @cause: Cause/checker */
+       struct xe_ras_error_cause cause;
+} __packed;
+
+/**
+ * struct xe_ras_error_class - Combines common and product-specific parts
+ *
+ * This is the element type of &xe_ras_threshold_crossed.counters.
+ */
+struct xe_ras_error_class {
+       /** @common: Common error type and component */
+       struct xe_ras_error_common common;
+       /** @product: Product-specific unit and cause */
+       struct xe_ras_error_product product;
+} __packed;
+
+/**
+ * struct xe_ras_threshold_crossed - Data for threshold crossed event
+ *
+ * xe_ras_counter_threshold_crossed() overlays this packed layout on the data
+ * of a sysctrl event response and logs each reported counter.
+ */
+struct xe_ras_threshold_crossed {
+       /**
+        * @ncounters: Number of error counters that crossed thresholds.
+        * The handler logs 0 or more than XE_RAS_NUM_COUNTERS as unexpected.
+        */
+       u32 ncounters;
+       /** @counters: Array of error counters that crossed threshold */
+       struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS];
+} __packed;
+
+#endif
index 5a5721699ce33c906f28b45752365cd2b6fdc866..b4d17329af6c8dcce2305b744717c259a072c726 100644 (file)
@@ -6,6 +6,7 @@
 #include "xe_device.h"
 #include "xe_irq.h"
 #include "xe_printk.h"
+#include "xe_ras.h"
 #include "xe_sysctrl.h"
 #include "xe_sysctrl_event_types.h"
 #include "xe_sysctrl_mailbox.h"
@@ -35,7 +36,7 @@ static void get_pending_event(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_c
                }
 
                if (response->event == XE_SYSCTRL_EVENT_THRESHOLD_CROSSED)
-                       xe_warn(xe, "[RAS]: counter threshold crossed\n");
+                       xe_ras_counter_threshold_crossed(xe, response);
                else
                        xe_warn(xe, "sysctrl: unexpected event %#x\n", response->event);