git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/i915/gt: Mark the GT as dead when mmio is unreliable
author Chris Wilson <chris.p.wilson@intel.com>
Wed, 7 Aug 2024 09:10:14 +0000 (10:10 +0100)
committer Andi Shyti <andi.shyti@linux.intel.com>
Fri, 9 Aug 2024 11:51:17 +0000 (12:51 +0100)
After we detect that mmio is returning all 0xff, we believe that the GPU
has dropped off the pci bus and is dead. Mark the device as wedged such
that we can propagate the failure back to userspace and wait for
recovery.

Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240807091014.469992-1-andi.shyti@linux.intel.com
drivers/gpu/drm/i915/gt/intel_gt.h
drivers/gpu/drm/i915/gt/intel_gt_types.h
drivers/gpu/drm/i915/gt/intel_reset.c
drivers/gpu/drm/i915/intel_uncore.c

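diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index b5e114d284ad3f4139724b1c8a5bfa525e19b363..b73555889d50befffe4fb71e4fdcc82ac6dd0e91 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h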
@@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
 void intel_gt_bind_context_set_ready(struct intel_gt *gt);
 void intel_gt_bind_context_set_unready(struct intel_gt *gt);
 bool intel_gt_is_bind_context_ready(struct intel_gt *gt);
+
+static inline void intel_gt_set_wedged_async(struct intel_gt *gt)
+{
+       queue_work(system_highpri_wq, &gt->wedge);
+}
+
 #endif /* __INTEL_GT_H__ */
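diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index cfdd2ad5e9549c3b50bfcbba9a02eb67d185a163..bcee084b1f272b7813b0c51645028be7547549cd 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h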
@@ -292,6 +292,8 @@ struct intel_gt {
        struct gt_defaults defaults;
        struct kobject *sysfs_defaults;
 
+       struct work_struct wedge;
+
        struct i915_perf_gt perf;
 
        /** link: &ggtt.gt_list */
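diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 6161f7a3ff708bf4452f7a0fb2d816aa43c69671..76810ebb5e83957aa13ef83d35c817153879a2bf 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c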
@@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
        GT_TRACE(gt, "end\n");
 }
 
+static void set_wedged_work(struct work_struct *w)
+{
+       struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
+       intel_wakeref_t wf;
+
+       with_intel_runtime_pm(gt->uncore->rpm, wf)
+               __intel_gt_set_wedged(gt);
+}
+
 void intel_gt_set_wedged(struct intel_gt *gt)
 {
        intel_wakeref_t wakeref;
@@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt)
        init_waitqueue_head(&gt->reset.queue);
        mutex_init(&gt->reset.mutex);
        init_srcu_struct(&gt->reset.backoff_srcu);
+       INIT_WORK(&gt->wedge, set_wedged_work);
 
        /*
         * While undesirable to wait inside the shrinker, complain anyway.
@@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work)
        struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
 
        gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
-       intel_gt_set_wedged(w->gt);
+       set_wedged_work(&w->gt->wedge);
 }
 
 void __intel_init_wedge(struct intel_wedge_me *w,
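diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 2eba289d88ad116f4e8ae292e3e5672c240caaf5..6aa179a3e92aac46f805461ed6eac3ce271049b8 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c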
@@ -24,6 +24,7 @@
 #include <drm/drm_managed.h>
 #include <linux/pm_runtime.h>
 
+#include "gt/intel_gt.h"
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_gt_regs.h"
 
@@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
        if (!wait_ack_clear(d, FORCEWAKE_KERNEL))
                return;
 
-       if (fw_ack(d) == ~0)
+       if (fw_ack(d) == ~0) {
                drm_err(&d->uncore->i915->drm,
                        "%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
                        intel_uncore_forcewake_domain_to_str(d->id));
-       else
+               intel_gt_set_wedged_async(d->uncore->gt);
+       } else {
                drm_err(&d->uncore->i915->drm,
                        "%s: timed out waiting for forcewake ack to clear.\n",
                        intel_uncore_forcewake_domain_to_str(d->id));
+       }
 
        add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */
 }