Previously, the driver's internal wedged.mode state was updated without
verifying whether the corresponding engine reset policy update in GuC
succeeded. This could leave the driver reporting a wedged.mode state
that doesn't match the actual reset behavior programmed in GuC.
With this change, the reset policy is updated first, and the driver's
wedged.mode state is modified only if the policy update succeeds on all
available GTs.
This patch also introduces two functional improvements:
- The policy is sent to GuC only when a change is required. An update
is needed only when entering or leaving XE_WEDGED_MODE_UPON_ANY_HANG,
because only in that case the reset policy changes. For example,
switching between XE_WEDGED_MODE_UPON_CRITICAL_ERROR and
XE_WEDGED_MODE_NEVER doesn't affect the reset policy, so there is no
need to send the same value to GuC.
- An inconsistent_reset flag is added to track cases where reset policy
update succeeds only on a subset of GTs. If such inconsistency is
detected, future wedged mode configuration will force a retry of the
reset policy update to restore a consistent state across all GTs.
Fixes: 6b8ef44cc0a9 ("drm/xe: Introduce the wedged_mode debugfs")
Signed-off-by: Lukasz Laguna <lukasz.laguna@intel.com>
Link: https://patch.msgid.link/20260107174741.29163-3-lukasz.laguna@intel.com
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
(cherry picked from commit
0f13dead4e0385859f5c9c3625a19df116b389d3)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
return simple_read_from_buffer(ubuf, size, pos, buf, len);
}
+static int __wedged_mode_set_reset_policy(struct xe_gt *gt, enum xe_wedged_mode mode)
+{
+ bool enable_engine_reset;
+ int ret;
+
+ enable_engine_reset = (mode != XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET);
+ ret = xe_guc_ads_scheduler_policy_toggle_reset(>->uc.guc.ads,
+ enable_engine_reset);
+ if (ret)
+ xe_gt_err(gt, "Failed to update GuC ADS scheduler policy (%pe)\n", ERR_PTR(ret));
+
+ return ret;
+}
+
+static int wedged_mode_set_reset_policy(struct xe_device *xe, enum xe_wedged_mode mode)
+{
+ struct xe_gt *gt;
+ int ret;
+ u8 id;
+
+ guard(xe_pm_runtime)(xe);
+ for_each_gt(gt, xe, id) {
+ ret = __wedged_mode_set_reset_policy(gt, mode);
+ if (ret) {
+ if (id > 0) {
+ xe->wedged.inconsistent_reset = true;
+ drm_err(&xe->drm, "Inconsistent reset policy state between GTs\n");
+ }
+ return ret;
+ }
+ }
+
+ xe->wedged.inconsistent_reset = false;
+
+ return 0;
+}
+
+static bool wedged_mode_needs_policy_update(struct xe_device *xe, enum xe_wedged_mode mode)
+{
+ if (xe->wedged.inconsistent_reset)
+ return true;
+
+ if (xe->wedged.mode == mode)
+ return false;
+
+ if (xe->wedged.mode == XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET ||
+ mode == XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET)
+ return true;
+
+ return false;
+}
+
static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
size_t size, loff_t *pos)
{
struct xe_device *xe = file_inode(f)->i_private;
- struct xe_gt *gt;
u32 wedged_mode;
ssize_t ret;
- u8 id;
ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
if (ret)
if (wedged_mode > 2)
return -EINVAL;
- if (xe->wedged.mode == wedged_mode)
- return size;
+ if (wedged_mode_needs_policy_update(xe, wedged_mode)) {
+ ret = wedged_mode_set_reset_policy(xe, wedged_mode);
+ if (ret)
+ return ret;
+ }
xe->wedged.mode = wedged_mode;
- xe_pm_runtime_get(xe);
- for_each_gt(gt, xe, id) {
- ret = xe_guc_ads_scheduler_policy_toggle_reset(>->uc.guc.ads);
- if (ret) {
- xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
- xe_pm_runtime_put(xe);
- return -EIO;
- }
- }
- xe_pm_runtime_put(xe);
-
return size;
}
struct xe_pxp;
struct xe_vram_region;
+/**
+ * enum xe_wedged_mode - possible wedged modes
+ * @XE_WEDGED_MODE_NEVER: Device will never be declared wedged.
+ * @XE_WEDGED_MODE_UPON_CRITICAL_ERROR: Device will be declared wedged only
+ * when critical error occurs like GT reset failure or firmware failure.
+ * This is the default mode.
+ * @XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET: Device will be declared wedged on
+ * any hang. In this mode, engine resets are disabled to avoid automatic
+ * recovery attempts. This mode is primarily intended for debugging hangs.
+ */
+enum xe_wedged_mode {
+ XE_WEDGED_MODE_NEVER = 0,
+ XE_WEDGED_MODE_UPON_CRITICAL_ERROR = 1,
+ XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET = 2,
+};
+
#define XE_BO_INVALID_OFFSET LONG_MAX
#define GRAPHICS_VER(xe) ((xe)->info.graphics_verx100 / 100)
int mode;
/** @wedged.method: Recovery method to be sent in the drm device wedged uevent */
unsigned long method;
+ /** @wedged.inconsistent_reset: Inconsistent reset policy state between GTs */
+ bool inconsistent_reset;
} wedged;
/** @bo_device: Struct to control async free of BOs */
/**
* xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
* @ads: Additional data structures object
+ * @enable_engine_reset: true to enable engine resets, false otherwise
*
- * This function update the GuC's engine reset policy based on wedged.mode.
+ * This function update the GuC's engine reset policy.
*
* Return: 0 on success, and negative error code otherwise.
*/
-int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
+int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads,
+ bool enable_engine_reset)
{
struct guc_policies *policies;
struct xe_guc *guc = ads_to_guc(ads);
- struct xe_device *xe = ads_to_xe(ads);
CLASS(xe_guc_buf, buf)(&guc->buf, sizeof(*policies));
if (!xe_guc_buf_is_valid(buf))
policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
policies->is_valid = 1;
- if (xe->wedged.mode == 2)
- policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
- else
+
+ if (enable_engine_reset)
policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;
+ else
+ policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
return guc_ads_action_update_policies(ads, xe_guc_buf_flush(buf));
}
#ifndef _XE_GUC_ADS_H_
#define _XE_GUC_ADS_H_
+#include <linux/types.h>
+
struct xe_guc_ads;
int xe_guc_ads_init(struct xe_guc_ads *ads);
void xe_guc_ads_populate(struct xe_guc_ads *ads);
void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
-int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads);
+int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads,
+ bool enable_engine_reset);
#endif