From f4e9fc967afdb53b1203f894fb4b68451a7fe202 Mon Sep 17 00:00:00 2001 From: Riana Tauro Date: Mon, 8 Dec 2025 14:15:41 +0530 Subject: [PATCH] drm/xe/xe_survivability: Redesign survivability mode MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Redesign survivability mode to have only one value per file. 1) Retain the survivability_mode sysfs to indicate the type cat /sys/bus/pci/devices/0000\:03\:00.0/survivability_mode (Boot / Runtime) 2) Add survivability_info directory to expose boot breadcrumbs. Entries in survivability mode sysfs are only visible when boot breadcrumb registers are populated. /sys/bus/pci/devices/0000:03:00.0/survivability_info ├── aux_info0 ├── aux_info1 ├── aux_info2 ├── aux_info3 ├── aux_info4 ├── capability_info ├── postcode_trace └── postcode_trace_overflow Capability Info: Provides data about boot status and has bits that indicate the support for the other breadcrumbs Postcode Trace / Postcode Trace Overflow : Each postcode is represented as an 8-bit value and represents a boot failure event. When a new failure event is logged by Pcode the existing postcodes are shifted left. These entries provide a history of 8 postcodes. Auxiliary Info: Some failures have additional debug information. Signed-off-by: Riana Tauro Reviewed-by: Rodrigo Vivi Link: https://patch.msgid.link/20251208084539.3652902-5-riana.tauro@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_survivability_mode.c | 222 +++++++++++------- .../gpu/drm/xe/xe_survivability_mode_types.h | 22 +- 2 files changed, 154 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 1662bfddd4bc9..b6ff5da86a4df 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -19,8 +19,6 @@ #include "xe_pcode_api.h" #include "xe_vsec.h" -#define MAX_SCRATCH_MMIO 8 - /** * DOC: Survivability Mode * @@ -48,19 +46,38 @@ * * Refer :ref:`xe_configfs` for more details on how to use configfs * - * Survivability mode is indicated by the below admin-only readable sysfs which provides additional - * debug information:: + * Survivability mode is indicated by the below admin-only readable sysfs entry. It + * provides information about the type of survivability mode (Boot/Runtime). + * + * .. code-block:: shell + * + * # cat /sys/bus/pci/devices//survivability_mode + * Boot + * + * + * Any additional debug information if present will be visible under the directory + * ``survivability_info``:: + * + * /sys/bus/pci/devices//survivability_info/ + * ├── aux_info0 + * ├── aux_info1 + * ├── aux_info2 + * ├── aux_info3 + * ├── aux_info4 + * ├── capability_info + * ├── fdo_mode + * ├── postcode_trace + * └── postcode_trace_overflow + * + * This directory has the following attributes * - * /sys/bus/pci/devices//survivability_mode + * - ``capability_info`` : Indicates Boot status and support for additional information * - * Capability Information: - * Provides boot status - * Postcode Information: - * Provides information about the failure - * Overflow Information - * Provides history of previous failures - * Auxiliary Information - * Certain failures may have information in addition to postcode information + * - ``postcode_trace``, ``postcode_trace_overflow`` : Each postcode is a 8bit value and + * represents a boot failure event. When a new failure event is logged by PCODE the + * existing postcodes are shifted left. These entries provide a history of 8 postcodes. + * + * - ``aux_info`` : Some failures have additional debug information * * Runtime Survivability * ===================== @@ -68,60 +85,76 @@ * Certain runtime firmware errors can cause the device to enter a wedged state * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation. * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and - * is indicated by the presence of survivability mode sysfs:: + * is indicated by the presence of survivability mode sysfs. + * Survivability mode sysfs provides information about the type of survivability mode. * - * /sys/bus/pci/devices//survivability_mode + * .. code-block:: shell * - * Survivability mode sysfs provides information about the type of survivability mode. + * # cat /sys/bus/pci/devices//survivability_mode + * Runtime * * When such errors occur, userspace is notified with the drm device wedged uevent and runtime * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd * to restore device to normal operation. */ +static const char * const reg_map[] = { + [CAPABILITY_INFO] = "Capability Info", + [POSTCODE_TRACE] = "Postcode trace", + [POSTCODE_TRACE_OVERFLOW] = "Postcode trace overflow", + [AUX_INFO0] = "Auxiliary Info 0", + [AUX_INFO1] = "Auxiliary Info 1", + [AUX_INFO2] = "Auxiliary Info 2", + [AUX_INFO3] = "Auxiliary Info 3", + [AUX_INFO4] = "Auxiliary Info 4", +}; + +struct xe_survivability_attribute { + struct device_attribute attr; + u8 index; +}; + +static struct +xe_survivability_attribute *dev_attr_to_survivability_attr(struct device_attribute *attr) +{ + return container_of(attr, struct xe_survivability_attribute, attr); +} + static u32 aux_history_offset(u32 reg_value) { return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); } -static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, - int id, char *name) +static void set_survivability_info(struct xe_mmio *mmio, u32 *info, int id) { - strscpy(info[id].name, name, sizeof(info[id].name)); - info[id].reg = PCODE_SCRATCH(id).raw; - info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); + info[id] = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); } static void populate_survivability_info(struct xe_device *xe) { struct xe_survivability *survivability = &xe->survivability; - struct xe_survivability_info *info = survivability->info; + u32 *info = survivability->info; struct xe_mmio *mmio; u32 id = 0, reg_value; - char name[NAME_MAX]; int index; mmio = xe_root_tile_mmio(xe); - set_survivability_info(mmio, info, id, "Capability Info"); - reg_value = info[id].value; + set_survivability_info(mmio, info, CAPABILITY_INFO); + reg_value = info[CAPABILITY_INFO]; if (reg_value & HISTORY_TRACKING) { - id++; - set_survivability_info(mmio, info, id, "Postcode Info"); + set_survivability_info(mmio, info, POSTCODE_TRACE); - if (reg_value & OVERFLOW_SUPPORT) { - id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); - set_survivability_info(mmio, info, id, "Overflow Info"); - } + if (reg_value & OVERFLOW_SUPPORT) + set_survivability_info(mmio, info, POSTCODE_TRACE_OVERFLOW); } if (reg_value & AUXINFO_SUPPORT) { id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); - for (index = 0; id && reg_value; index++, reg_value = info[id].value, - id = aux_history_offset(reg_value)) { - snprintf(name, NAME_MAX, "Auxiliary Info %d", index); - set_survivability_info(mmio, info, id, name); + for (index = 0; id >= AUX_INFO0 && id < MAX_SCRATCH_REG; index++) { + set_survivability_info(mmio, info, id); + id = aux_history_offset(info[id]); } } } @@ -130,15 +163,14 @@ static void log_survivability_info(struct pci_dev *pdev) { struct xe_device *xe = pdev_to_xe_device(pdev); struct xe_survivability *survivability = &xe->survivability; - struct xe_survivability_info *info = survivability->info; + u32 *info = survivability->info; int id; dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", survivability->boot_status); - for (id = 0; id < MAX_SCRATCH_MMIO; id++) { - if (info[id].reg) - dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, - info[id].reg, info[id].value); + for (id = 0; id < MAX_SCRATCH_REG; id++) { + if (info[id]) + dev_info(&pdev->dev, "%s: 0x%x\n", reg_map[id], info[id]); } } @@ -156,25 +188,38 @@ static ssize_t survivability_mode_show(struct device *dev, struct pci_dev *pdev = to_pci_dev(dev); struct xe_device *xe = pdev_to_xe_device(pdev); struct xe_survivability *survivability = &xe->survivability; - struct xe_survivability_info *info = survivability->info; - int index = 0, count = 0; - count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n", - survivability->type ? "Runtime" : "Boot"); + return sysfs_emit(buff, "%s\n", survivability->type ? "Runtime" : "Boot"); +} - if (!check_boot_failure(xe)) - return count; +static DEVICE_ATTR_ADMIN_RO(survivability_mode); - for (index = 0; index < MAX_SCRATCH_MMIO; index++) { - if (info[index].reg) - count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, - info[index].reg, info[index].value); - } +static ssize_t survivability_info_show(struct device *dev, + struct device_attribute *attr, char *buff) +{ + struct xe_survivability_attribute *sa = dev_attr_to_survivability_attr(attr); + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_survivability *survivability = &xe->survivability; + u32 *info = survivability->info; - return count; + return sysfs_emit(buff, "0x%x\n", info[sa->index]); } -static DEVICE_ATTR_ADMIN_RO(survivability_mode); +#define SURVIVABILITY_ATTR_RO(name, _index) \ + struct xe_survivability_attribute attr_##name = { \ + .attr = __ATTR(name, 0400, survivability_info_show, NULL), \ + .index = _index, \ + } + +SURVIVABILITY_ATTR_RO(capability_info, CAPABILITY_INFO); +SURVIVABILITY_ATTR_RO(postcode_trace, POSTCODE_TRACE); +SURVIVABILITY_ATTR_RO(postcode_trace_overflow, POSTCODE_TRACE_OVERFLOW); +SURVIVABILITY_ATTR_RO(aux_info0, AUX_INFO0); +SURVIVABILITY_ATTR_RO(aux_info1, AUX_INFO1); +SURVIVABILITY_ATTR_RO(aux_info2, AUX_INFO2); +SURVIVABILITY_ATTR_RO(aux_info3, AUX_INFO3); +SURVIVABILITY_ATTR_RO(aux_info4, AUX_INFO4); static void xe_survivability_mode_fini(void *arg) { @@ -182,17 +227,48 @@ static void xe_survivability_mode_fini(void *arg) struct pci_dev *pdev = to_pci_dev(xe->drm.dev); struct device *dev = &pdev->dev; - sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); + device_remove_file(dev, &dev_attr_survivability_mode); } +static umode_t survivability_info_attrs_visible(struct kobject *kobj, struct attribute *attr, + int idx) +{ + struct xe_device *xe = kdev_to_xe_device(kobj_to_dev(kobj)); + struct xe_survivability *survivability = &xe->survivability; + u32 *info = survivability->info; + + if (info[idx]) + return 0400; + + return 0; +} + +/* Attributes are ordered according to enum scratch_reg */ +static struct attribute *survivability_info_attrs[] = { + &attr_capability_info.attr.attr, + &attr_postcode_trace.attr.attr, + &attr_postcode_trace_overflow.attr.attr, + &attr_aux_info0.attr.attr, + &attr_aux_info1.attr.attr, + &attr_aux_info2.attr.attr, + &attr_aux_info3.attr.attr, + &attr_aux_info4.attr.attr, + NULL, +}; + +static const struct attribute_group survivability_info_group = { + .name = "survivability_info", + .attrs = survivability_info_attrs, + .is_visible = survivability_info_attrs_visible, +}; + static int create_survivability_sysfs(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct xe_device *xe = pdev_to_xe_device(pdev); int ret; - /* create survivability mode sysfs */ - ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); + ret = device_create_file(dev, &dev_attr_survivability_mode); if (ret) { dev_warn(dev, "Failed to create survivability sysfs files\n"); return ret; @@ -203,6 +279,12 @@ static int create_survivability_sysfs(struct pci_dev *pdev) if (ret) return ret; + if (check_boot_failure(xe)) { + ret = devm_device_add_group(dev, &survivability_info_group); + if (ret) + return ret; + } + return 0; } @@ -239,25 +321,6 @@ err: return ret; } -static int init_survivability_mode(struct xe_device *xe) -{ - struct xe_survivability *survivability = &xe->survivability; - struct xe_survivability_info *info; - - survivability->size = MAX_SCRATCH_MMIO; - - info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), - GFP_KERNEL); - if (!info) - return -ENOMEM; - - survivability->info = info; - - populate_survivability_info(xe); - - return 0; -} - /** * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled * @xe: xe device instance @@ -325,9 +388,7 @@ int xe_survivability_mode_runtime_enable(struct xe_device *xe) return -EINVAL; } - ret = init_survivability_mode(xe); - if (ret) - return ret; + populate_survivability_info(xe); ret = create_survivability_sysfs(pdev); if (ret) @@ -356,14 +417,11 @@ int xe_survivability_mode_boot_enable(struct xe_device *xe) { struct xe_survivability *survivability = &xe->survivability; struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - int ret; if (!xe_survivability_mode_is_requested(xe)) return 0; - ret = init_survivability_mode(xe); - if (ret) - return ret; + populate_survivability_info(xe); /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */ if (survivability->boot_status == CRITICAL_FAILURE) { diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h b/drivers/gpu/drm/xe/xe_survivability_mode_types.h index cd65a5d167c9c..f31b3907d9331 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode_types.h +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h @@ -9,23 +9,29 @@ #include #include +enum scratch_reg { + CAPABILITY_INFO, + POSTCODE_TRACE, + POSTCODE_TRACE_OVERFLOW, + AUX_INFO0, + AUX_INFO1, + AUX_INFO2, + AUX_INFO3, + AUX_INFO4, + MAX_SCRATCH_REG, +}; + enum xe_survivability_type { XE_SURVIVABILITY_TYPE_BOOT, XE_SURVIVABILITY_TYPE_RUNTIME, }; -struct xe_survivability_info { - char name[NAME_MAX]; - u32 reg; - u32 value; -}; - /** * struct xe_survivability: Contains survivability mode information */ struct xe_survivability { - /** @info: struct that holds survivability info from scratch registers */ - struct xe_survivability_info *info; + /** @info: survivability debug info */ + u32 info[MAX_SCRATCH_REG]; /** @size: number of scratch registers */ u32 size; -- 2.47.3