git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/xe/xe_survivability: Redesign survivability mode
author: Riana Tauro <riana.tauro@intel.com>
Mon, 8 Dec 2025 08:45:41 +0000 (14:15 +0530)
committer: Rodrigo Vivi <rodrigo.vivi@intel.com>
Tue, 9 Dec 2025 22:19:42 +0000 (17:19 -0500)
Redesign the survivability mode sysfs interface to have only one value per file.

1) Retain the survivability_mode sysfs to indicate the type

cat /sys/bus/pci/devices/0000\:03\:00.0/survivability_mode
(Boot / Runtime)

2) Add survivability_info directory to expose boot breadcrumbs.
Entries in the survivability_info directory are only visible when
the corresponding boot breadcrumb registers are populated.

/sys/bus/pci/devices/0000:03:00.0/survivability_info
├── aux_info0
├── aux_info1
├── aux_info2
├── aux_info3
├── aux_info4
├── capability_info
├── postcode_trace
└── postcode_trace_overflow

Capability Info:

Provides data about boot status and has bits that
indicate the support for the other breadcrumbs

Postcode Trace / Postcode Trace Overflow :

Each postcode is represented as an 8-bit value and represents
a boot failure event. When a new failure event is logged by Pcode
the existing postcodes are shifted left. These entries provide a
history of 8 postcodes.

Auxiliary Info:

Some failures have additional debug information.

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patch.msgid.link/20251208084539.3652902-5-riana.tauro@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
drivers/gpu/drm/xe/xe_survivability_mode.c
drivers/gpu/drm/xe/xe_survivability_mode_types.h

index 1662bfddd4bc9c530644c185eb929f0613eb8e30..b6ff5da86a4dff5982719320a8777a307904d25a 100644 (file)
@@ -19,8 +19,6 @@
 #include "xe_pcode_api.h"
 #include "xe_vsec.h"
 
-#define MAX_SCRATCH_MMIO 8
-
 /**
  * DOC: Survivability Mode
  *
  *
  * Refer :ref:`xe_configfs` for more details on how to use configfs
  *
- * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
- * debug information::
+ * Survivability mode is indicated by the below admin-only readable sysfs entry. It
+ * provides information about the type of survivability mode (Boot/Runtime).
+ *
+ * .. code-block:: shell
+ *
+ *     # cat /sys/bus/pci/devices/<device>/survivability_mode
+ *       Boot
+ *
+ *
+ * Any additional debug information if present will be visible under the directory
+ * ``survivability_info``::
+ *
+ *     /sys/bus/pci/devices/<device>/survivability_info/
+ *     ├── aux_info0
+ *     ├── aux_info1
+ *     ├── aux_info2
+ *     ├── aux_info3
+ *     ├── aux_info4
+ *     ├── capability_info
+ *     ├── fdo_mode
+ *     ├── postcode_trace
+ *     └── postcode_trace_overflow
+ *
+ * This directory has the following attributes
  *
- *     /sys/bus/pci/devices/<device>/survivability_mode
+ * - ``capability_info`` : Indicates Boot status and support for additional information
  *
- * Capability Information:
- *     Provides boot status
- * Postcode Information:
- *     Provides information about the failure
- * Overflow Information
- *     Provides history of previous failures
- * Auxiliary Information
- *     Certain failures may have information in addition to postcode information
+ * - ``postcode_trace``, ``postcode_trace_overflow`` : Each postcode is an 8-bit value and
+ *   represents a boot failure event. When a new failure event is logged by PCODE the
+ *   existing postcodes are shifted left. These entries provide a history of 8 postcodes.
+ *
+ * - ``aux_info<n>`` : Some failures have additional debug information
  *
  * Runtime Survivability
  * =====================
  * Certain runtime firmware errors can cause the device to enter a wedged state
  * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
  * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
- * is indicated by the presence of survivability mode sysfs::
+ * is indicated by the presence of survivability mode sysfs.
+ * Survivability mode sysfs provides information about the type of survivability mode.
  *
- *     /sys/bus/pci/devices/<device>/survivability_mode
+ * .. code-block:: shell
  *
- * Survivability mode sysfs provides information about the type of survivability mode.
+ *     # cat /sys/bus/pci/devices/<device>/survivability_mode
+ *       Runtime
  *
  * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
  * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
  * to restore device to normal operation.
  */
 
+static const char * const reg_map[] = { /* log labels, indexed by enum scratch_reg */
+       [CAPABILITY_INFO]         = "Capability Info",
+       [POSTCODE_TRACE]          = "Postcode trace",
+       [POSTCODE_TRACE_OVERFLOW] = "Postcode trace overflow",
+       [AUX_INFO0]               = "Auxiliary Info 0",
+       [AUX_INFO1]               = "Auxiliary Info 1",
+       [AUX_INFO2]               = "Auxiliary Info 2",
+       [AUX_INFO3]               = "Auxiliary Info 3",
+       [AUX_INFO4]               = "Auxiliary Info 4",
+};
+
+struct xe_survivability_attribute { /* sysfs attribute bound to one info[] slot */
+       struct device_attribute attr;
+       u8 index; /* slot of survivability->info[] printed by survivability_info_show() */
+};
+
+static struct
+xe_survivability_attribute *dev_attr_to_survivability_attr(struct device_attribute *attr)
+{ /* map an embedded device_attribute back to its xe_survivability_attribute */
+       return container_of(attr, struct xe_survivability_attribute, attr);
+}
+
 static u32 aux_history_offset(u32 reg_value)
 {
        return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
 }
 
-static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
-                                  int id, char *name)
+static void set_survivability_info(struct xe_mmio *mmio, u32  *info, int id)
 {
-       strscpy(info[id].name, name, sizeof(info[id].name));
-       info[id].reg = PCODE_SCRATCH(id).raw;
-       info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
+       info[id] = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
 }
 
 static void populate_survivability_info(struct xe_device *xe)
 {
        struct xe_survivability *survivability = &xe->survivability;
-       struct xe_survivability_info *info = survivability->info;
+       u32 *info = survivability->info;
        struct xe_mmio *mmio;
        u32 id = 0, reg_value;
-       char name[NAME_MAX];
        int index;
 
        mmio = xe_root_tile_mmio(xe);
-       set_survivability_info(mmio, info, id, "Capability Info");
-       reg_value = info[id].value;
+       set_survivability_info(mmio, info, CAPABILITY_INFO);
+       reg_value = info[CAPABILITY_INFO];
 
        if (reg_value & HISTORY_TRACKING) {
-               id++;
-               set_survivability_info(mmio, info, id, "Postcode Info");
+               set_survivability_info(mmio, info, POSTCODE_TRACE);
 
-               if (reg_value & OVERFLOW_SUPPORT) {
-                       id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
-                       set_survivability_info(mmio, info, id, "Overflow Info");
-               }
+               if (reg_value & OVERFLOW_SUPPORT)
+                       set_survivability_info(mmio, info, POSTCODE_TRACE_OVERFLOW);
        }
 
        if (reg_value & AUXINFO_SUPPORT) {
                id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
 
-               for (index = 0; id && reg_value; index++, reg_value = info[id].value,
-                    id = aux_history_offset(reg_value)) {
-                       snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
-                       set_survivability_info(mmio, info, id, name);
+               for (index = 0; id >= AUX_INFO0 && id < MAX_SCRATCH_REG; index++) {
+                       set_survivability_info(mmio, info, id);
+                       id = aux_history_offset(info[id]);
                }
        }
 }
@@ -130,15 +163,14 @@ static void log_survivability_info(struct pci_dev *pdev)
 {
        struct xe_device *xe = pdev_to_xe_device(pdev);
        struct xe_survivability *survivability = &xe->survivability;
-       struct xe_survivability_info *info = survivability->info;
+       u32 *info = survivability->info;
        int id;
 
        dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
                 survivability->boot_status);
-       for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
-               if (info[id].reg)
-                       dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
-                                info[id].reg, info[id].value);
+       for (id = 0; id < MAX_SCRATCH_REG; id++) {
+               if (info[id])
+                       dev_info(&pdev->dev, "%s: 0x%x\n", reg_map[id], info[id]);
        }
 }
 
@@ -156,25 +188,38 @@ static ssize_t survivability_mode_show(struct device *dev,
        struct pci_dev *pdev = to_pci_dev(dev);
        struct xe_device *xe = pdev_to_xe_device(pdev);
        struct xe_survivability *survivability = &xe->survivability;
-       struct xe_survivability_info *info = survivability->info;
-       int index = 0, count = 0;
 
-       count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n",
-                              survivability->type ? "Runtime" : "Boot");
+       return sysfs_emit(buff, "%s\n", survivability->type ? "Runtime" : "Boot");
+}
 
-       if (!check_boot_failure(xe))
-               return count;
+static DEVICE_ATTR_ADMIN_RO(survivability_mode);
 
-       for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
-               if (info[index].reg)
-                       count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
-                                              info[index].reg, info[index].value);
-       }
+static ssize_t survivability_info_show(struct device *dev,
+                                      struct device_attribute *attr, char *buff)
+{ /* show callback shared by every attribute declared with SURVIVABILITY_ATTR_RO */
+       struct xe_survivability_attribute *sa = dev_attr_to_survivability_attr(attr);
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct xe_device *xe = pdev_to_xe_device(pdev);
+       struct xe_survivability *survivability = &xe->survivability;
+       u32 *info = survivability->info;
 
-       return count;
+       return sysfs_emit(buff, "0x%x\n", info[sa->index]); /* one hex value per file */
 }
 
-static DEVICE_ATTR_ADMIN_RO(survivability_mode);
+#define SURVIVABILITY_ATTR_RO(name, _index)                                    \
+       struct xe_survivability_attribute attr_##name = {                       \
+               .attr =  __ATTR(name, 0400, survivability_info_show, NULL),     \
+               .index = _index,                                                \
+       }
+
+SURVIVABILITY_ATTR_RO(capability_info, CAPABILITY_INFO);
+SURVIVABILITY_ATTR_RO(postcode_trace, POSTCODE_TRACE);
+SURVIVABILITY_ATTR_RO(postcode_trace_overflow, POSTCODE_TRACE_OVERFLOW);
+SURVIVABILITY_ATTR_RO(aux_info0, AUX_INFO0);
+SURVIVABILITY_ATTR_RO(aux_info1, AUX_INFO1);
+SURVIVABILITY_ATTR_RO(aux_info2, AUX_INFO2);
+SURVIVABILITY_ATTR_RO(aux_info3, AUX_INFO3);
+SURVIVABILITY_ATTR_RO(aux_info4, AUX_INFO4);
 
 static void xe_survivability_mode_fini(void *arg)
 {
@@ -182,17 +227,48 @@ static void xe_survivability_mode_fini(void *arg)
        struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
        struct device *dev = &pdev->dev;
 
-       sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
+       device_remove_file(dev, &dev_attr_survivability_mode);
 }
 
+static umode_t survivability_info_attrs_visible(struct kobject *kobj, struct attribute *attr,
+                                               int idx)
+{ /* .is_visible hook of survivability_info_group */
+       struct xe_device *xe = kdev_to_xe_device(kobj_to_dev(kobj));
+       struct xe_survivability *survivability = &xe->survivability;
+       u32 *info = survivability->info;
+
+       if (info[idx]) /* NOTE(review): assumes idx equals the enum scratch_reg value */
+               return 0400;
+
+       return 0; /* no mode bits when the stored value is zero */
+}
+
+/* Attributes are ordered according to enum scratch_reg */
+static struct attribute *survivability_info_attrs[] = {
+       &attr_capability_info.attr.attr,
+       &attr_postcode_trace.attr.attr,
+       &attr_postcode_trace_overflow.attr.attr,
+       &attr_aux_info0.attr.attr,
+       &attr_aux_info1.attr.attr,
+       &attr_aux_info2.attr.attr,
+       &attr_aux_info3.attr.attr,
+       &attr_aux_info4.attr.attr,
+       NULL,
+};
+
+static const struct attribute_group survivability_info_group = {
+       .name = "survivability_info",
+       .attrs = survivability_info_attrs,
+       .is_visible = survivability_info_attrs_visible, /* returns 0 when info[idx] is 0 */
+};
+
 static int create_survivability_sysfs(struct pci_dev *pdev)
 {
        struct device *dev = &pdev->dev;
        struct xe_device *xe = pdev_to_xe_device(pdev);
        int ret;
 
-       /* create survivability mode sysfs */
-       ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
+       ret = device_create_file(dev, &dev_attr_survivability_mode);
        if (ret) {
                dev_warn(dev, "Failed to create survivability sysfs files\n");
                return ret;
@@ -203,6 +279,12 @@ static int create_survivability_sysfs(struct pci_dev *pdev)
        if (ret)
                return ret;
 
+       if (check_boot_failure(xe)) {
+               ret = devm_device_add_group(dev, &survivability_info_group);
+               if (ret)
+                       return ret;
+       }
+
        return 0;
 }
 
@@ -239,25 +321,6 @@ err:
        return ret;
 }
 
-static int init_survivability_mode(struct xe_device *xe)
-{
-       struct xe_survivability *survivability = &xe->survivability;
-       struct xe_survivability_info *info;
-
-       survivability->size = MAX_SCRATCH_MMIO;
-
-       info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
-                           GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
-
-       survivability->info = info;
-
-       populate_survivability_info(xe);
-
-       return 0;
-}
-
 /**
  * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
  * @xe: xe device instance
@@ -325,9 +388,7 @@ int xe_survivability_mode_runtime_enable(struct xe_device *xe)
                return -EINVAL;
        }
 
-       ret = init_survivability_mode(xe);
-       if (ret)
-               return ret;
+       populate_survivability_info(xe);
 
        ret = create_survivability_sysfs(pdev);
        if (ret)
@@ -356,14 +417,11 @@ int xe_survivability_mode_boot_enable(struct xe_device *xe)
 {
        struct xe_survivability *survivability = &xe->survivability;
        struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
-       int ret;
 
        if (!xe_survivability_mode_is_requested(xe))
                return 0;
 
-       ret = init_survivability_mode(xe);
-       if (ret)
-               return ret;
+       populate_survivability_info(xe);
 
        /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
        if (survivability->boot_status == CRITICAL_FAILURE) {
index cd65a5d167c9c88b0985908c010d6bf7939227a6..f31b3907d933102ebe24db0bd6abc8071f187e09 100644 (file)
@@ -9,23 +9,29 @@
 #include <linux/limits.h>
 #include <linux/types.h>
 
+enum scratch_reg { /* indices into xe_survivability.info[] */
+       CAPABILITY_INFO,
+       POSTCODE_TRACE,
+       POSTCODE_TRACE_OVERFLOW,
+       AUX_INFO0, /* first auxiliary entry; lower bound of the aux-info walk */
+       AUX_INFO1,
+       AUX_INFO2,
+       AUX_INFO3,
+       AUX_INFO4,
+       MAX_SCRATCH_REG, /* entry count; sizes info[] and bounds the aux-info walk */
+};
+
 enum xe_survivability_type {
        XE_SURVIVABILITY_TYPE_BOOT,
        XE_SURVIVABILITY_TYPE_RUNTIME,
 };
 
-struct xe_survivability_info {
-       char name[NAME_MAX];
-       u32 reg;
-       u32 value;
-};
-
 /**
  * struct xe_survivability: Contains survivability mode information
  */
 struct xe_survivability {
-       /** @info: struct that holds survivability info from scratch registers */
-       struct xe_survivability_info *info;
+       /** @info: survivability debug info */
+       u32 info[MAX_SCRATCH_REG];
 
        /** @size: number of scratch registers */
        u32 size;