drm/i915: Flush TLBs before releasing backing store
author    Tvrtko Ursulin <tvrtko.ursulin@intel.com>        Tue, 19 Oct 2021 12:27:10 +0000 (13:27 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>  Sat, 29 Jan 2022 09:25:09 +0000 (10:25 +0100)
commit 7938d61591d33394a21bdd7797a245b65428f44c upstream.

We need to flush TLBs before releasing the backing store, otherwise
userspace can encounter stale entries if a) it does not declare access to
certain buffers and b) it races with the backing store release of such an
undeclared execution already running on the GPU in parallel.

The approach taken is to mark any buffer objects which were ever bound
to the GPU and to trigger a serialized TLB flush when their backing
store is released.
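
In outline the change is a mark-and-flush pattern: the bind path records
that an object has ever been made visible to the GPU, and the page release
path tests and clears that record, flushing the GT TLBs only when it was
set. A condensed sketch of the two halves of the diff below (locking and
error handling elided; all identifiers are the ones this patch uses):

	/* Bind side (i915_vma_bind): remember the object was GPU-visible. */
	if (vma->obj)
		set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);

	/*
	 * Release side (__i915_gem_object_unset_pages): flush once, only
	 * for objects that were ever bound, and only while the device is
	 * awake.
	 */
	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
		struct drm_i915_private *i915 = to_i915(obj->base.dev);
		intel_wakeref_t wakeref;

		with_intel_runtime_pm_if_in_use(&i915->runtime_pm, wakeref)
			intel_gt_invalidate_tlbs(&i915->gt);
	}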

Alternatively the flushing could be done on VMA unbind, at which point we
would be able to ascertain whether there is a potential parallel GPU
execution (which could race), but essentially it boils down to paying the
cost of TLB flushes potentially needlessly at VMA unbind time (when the
backing store is not known to be going away, so the flush is not needed for
safety), versus potentially needlessly at backing store release time (since
at that point we cannot tell whether anything executing on the GPU still
uses that object).
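
A purely hypothetical sketch of that unbind-time alternative, for contrast
(the hook point and the unconditional flush are illustrative assumptions
and not part of this commit):

	/*
	 * Hypothetical: flush in i915_vma_unbind() whenever a still-bound
	 * VMA is torn down, even though its backing store may not be going
	 * away at all.
	 */
	if (vma->flags & I915_VMA_BIND_MASK)
		intel_gt_invalidate_tlbs(&vma->vm->i915->gt);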

Therefore simplicity of implementation has been chosen for now, with scope
to benchmark and refine later as required.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reported-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Dave Airlie <airlied@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/gpu/drm/i915/gem/i915_gem_object_types.h
drivers/gpu/drm/i915/gem/i915_gem_pages.c
drivers/gpu/drm/i915/gt/intel_gt.c
drivers/gpu/drm/i915/gt/intel_gt.h
drivers/gpu/drm/i915/gt/intel_gt_types.h
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/i915_vma.c

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 08b35587bc6dc30f92d971ad7cd1bdaa6bd6e690..352c102f3459ca9b551856f7e9b2bf7aefbc9a1d 100644
@@ -118,6 +118,9 @@ struct drm_i915_gem_object {
 
        I915_SELFTEST_DECLARE(struct list_head st_link);
 
+       unsigned long flags;
+#define I915_BO_WAS_BOUND_BIT    0
+
        /*
         * Is the object to be mapped as read-only to the GPU
         * Only honoured if hardware has relevant pte bit
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 18f0ce0135c179745969da6abdb71a62bf0e826c..aa63fa0ab575eda2144dd456237c5b9156ea85e6 100644
@@ -8,6 +8,8 @@
 #include "i915_gem_object.h"
 #include "i915_scatterlist.h"
 
+#include "gt/intel_gt.h"
+
 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
                                 struct sg_table *pages,
                                 unsigned int sg_page_sizes)
@@ -176,6 +178,14 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
        __i915_gem_object_reset_page_iter(obj);
        obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
 
+       if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
+               struct drm_i915_private *i915 = to_i915(obj->base.dev);
+               intel_wakeref_t wakeref;
+
+               with_intel_runtime_pm_if_in_use(&i915->runtime_pm, wakeref)
+                       intel_gt_invalidate_tlbs(&i915->gt);
+       }
+
        return pages;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index d48ec9a76ed1635914e1dda606e29232ba1b2084..c8c070375d298cc432f7b5d66f513bb51fc28400 100644
@@ -15,6 +15,8 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
 
        spin_lock_init(&gt->irq_lock);
 
+       mutex_init(&gt->tlb_invalidate_lock);
+
        INIT_LIST_HEAD(&gt->closed_vma);
        spin_lock_init(&gt->closed_lock);
 
@@ -266,3 +268,100 @@ void intel_gt_driver_late_release(struct intel_gt *gt)
        intel_uc_driver_late_release(&gt->uc);
        intel_gt_fini_reset(gt);
 }
+
+struct reg_and_bit {
+       i915_reg_t reg;
+       u32 bit;
+};
+
+static struct reg_and_bit
+get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
+               const i915_reg_t *regs, const unsigned int num)
+{
+       const unsigned int class = engine->class;
+       struct reg_and_bit rb = { };
+
+       if (WARN_ON_ONCE(class >= num || !regs[class].reg))
+               return rb;
+
+       rb.reg = regs[class];
+       if (gen8 && class == VIDEO_DECODE_CLASS)
+               rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
+       else
+               rb.bit = engine->instance;
+
+       rb.bit = BIT(rb.bit);
+
+       return rb;
+}
+
+void intel_gt_invalidate_tlbs(struct intel_gt *gt)
+{
+       static const i915_reg_t gen8_regs[] = {
+               [RENDER_CLASS]                  = GEN8_RTCR,
+               [VIDEO_DECODE_CLASS]            = GEN8_M1TCR, /* , GEN8_M2TCR */
+               [VIDEO_ENHANCEMENT_CLASS]       = GEN8_VTCR,
+               [COPY_ENGINE_CLASS]             = GEN8_BTCR,
+       };
+       static const i915_reg_t gen12_regs[] = {
+               [RENDER_CLASS]                  = GEN12_GFX_TLB_INV_CR,
+               [VIDEO_DECODE_CLASS]            = GEN12_VD_TLB_INV_CR,
+               [VIDEO_ENHANCEMENT_CLASS]       = GEN12_VE_TLB_INV_CR,
+               [COPY_ENGINE_CLASS]             = GEN12_BLT_TLB_INV_CR,
+       };
+       struct drm_i915_private *i915 = gt->i915;
+       struct intel_uncore *uncore = gt->uncore;
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+       const i915_reg_t *regs;
+       unsigned int num = 0;
+
+       if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
+               return;
+
+       if (INTEL_GEN(i915) == 12) {
+               regs = gen12_regs;
+               num = ARRAY_SIZE(gen12_regs);
+       } else if (INTEL_GEN(i915) >= 8 && INTEL_GEN(i915) <= 11) {
+               regs = gen8_regs;
+               num = ARRAY_SIZE(gen8_regs);
+       } else if (INTEL_GEN(i915) < 8) {
+               return;
+       }
+
+       if (WARN_ONCE(!num, "Platform does not implement TLB invalidation!"))
+               return;
+
+       GEM_TRACE("\n");
+
+       assert_rpm_wakelock_held(&i915->runtime_pm);
+
+       mutex_lock(&gt->tlb_invalidate_lock);
+       intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
+
+       for_each_engine(engine, gt, id) {
+               /*
+                * HW architecture suggests typical invalidation time at 40us,
+                * with pessimistic cases up to 100us and a recommendation to
+                * cap at 1ms. We go a bit higher just in case.
+                */
+               const unsigned int timeout_us = 100;
+               const unsigned int timeout_ms = 4;
+               struct reg_and_bit rb;
+
+               rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
+               if (!i915_mmio_reg_offset(rb.reg))
+                       continue;
+
+               intel_uncore_write_fw(uncore, rb.reg, rb.bit);
+               if (__intel_wait_for_register_fw(uncore,
+                                                rb.reg, rb.bit, 0,
+                                                timeout_us, timeout_ms,
+                                                NULL))
+                       DRM_ERROR_RATELIMITED("%s TLB invalidation did not complete in %ums!\n",
+                                             engine->name, timeout_ms);
+       }
+
+       intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
+       mutex_unlock(&gt->tlb_invalidate_lock);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index 4920cb351f1093212b5b06a4809c1b4d63f5d4e3..4eab15bdcd97be895147ebf4c5e9fed75ea4dd12 100644
@@ -57,4 +57,6 @@ static inline bool intel_gt_is_wedged(struct intel_gt *gt)
 
 void intel_gt_queue_hangcheck(struct intel_gt *gt);
 
+void intel_gt_invalidate_tlbs(struct intel_gt *gt);
+
 #endif /* __INTEL_GT_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index dc295c196d11c60810c6ee0fad0f1334ef3ef8d8..82a78719b32d5ad0978d6e348ac01762bf4eb2fe 100644
@@ -40,6 +40,8 @@ struct intel_gt {
 
        struct intel_uc uc;
 
+       struct mutex tlb_invalidate_lock;
+
        struct intel_gt_timelines {
                spinlock_t lock; /* protects active_list */
                struct list_head active_list;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 7b6e68f082f8c58b7da765e2b4c182eb3d9fe010..1386d0f5eac63768f3aa4d01fc6b3ece30d484c0 100644
@@ -2519,6 +2519,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING     (1 << 28)
 #define   GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT       (1 << 24)
 
+#define GEN8_RTCR      _MMIO(0x4260)
+#define GEN8_M1TCR     _MMIO(0x4264)
+#define GEN8_M2TCR     _MMIO(0x4268)
+#define GEN8_BTCR      _MMIO(0x426c)
+#define GEN8_VTCR      _MMIO(0x4270)
+
 #if 0
 #define PRB0_TAIL      _MMIO(0x2030)
 #define PRB0_HEAD      _MMIO(0x2034)
@@ -2602,6 +2608,11 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   FAULT_VA_HIGH_BITS           (0xf << 0)
 #define   FAULT_GTT_SEL                        (1 << 4)
 
+#define GEN12_GFX_TLB_INV_CR   _MMIO(0xced8)
+#define GEN12_VD_TLB_INV_CR    _MMIO(0xcedc)
+#define GEN12_VE_TLB_INV_CR    _MMIO(0xcee0)
+#define GEN12_BLT_TLB_INV_CR   _MMIO(0xcee4)
+
 #define FPGA_DBG               _MMIO(0x42300)
 #define   FPGA_DBG_RM_NOCLAIM  (1 << 31)
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index e0e677b2a3a94e8348b70fa7bdc9a3a41ad06140..c24f49ee10d7378bcaffa1ff0449d3c16a9409ea 100644
@@ -341,6 +341,10 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
                return ret;
 
        vma->flags |= bind_flags;
+
+       if (vma->obj)
+               set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
+
        return 0;
 }