drm/i915: Flush TLBs before releasing backing store
author Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Tue, 19 Oct 2021 12:27:10 +0000 (13:27 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 29 Jan 2022 09:14:33 +0000 (10:14 +0100)
commit 7938d61591d33394a21bdd7797a245b65428f44c upstream.

We need to flush TLBs before releasing backing store, otherwise
userspace is able to encounter stale entries if a) it is not declaring
access to certain buffers and b) it races with the backing store
release from such an undeclared execution already running on the GPU
in parallel.

The approach taken is to mark any buffer objects which were ever bound
to the GPU and to trigger a serialized TLB flush when their backing
store is released.
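
As a rough userspace model of that mark-and-flush pattern (a sketch
only; the names below are illustrative stand-ins, not the driver's
API):

  #include <stdatomic.h>
  #include <stdio.h>

  #define BO_WAS_BOUND_BIT 0

  struct bo {
          atomic_ulong flags;
  };

  /* mark: remember the object has been visible to the GPU */
  static void bind_to_gpu(struct bo *bo)
  {
          atomic_fetch_or(&bo->flags, 1UL << BO_WAS_BOUND_BIT);
  }

  /* flush once, and only for objects that were ever bound */
  static void release_backing_store(struct bo *bo)
  {
          unsigned long old;

          old = atomic_fetch_and(&bo->flags, ~(1UL << BO_WAS_BOUND_BIT));
          if (old & (1UL << BO_WAS_BOUND_BIT))
                  printf("flush TLBs before freeing pages\n");
  }

  int main(void)
  {
          struct bo bo = { 0 };

          bind_to_gpu(&bo);
          release_backing_store(&bo);     /* flushes */
          release_backing_store(&bo);     /* no-op: bit already clear */
          return 0;
  }

In the patch itself the bit is I915_BO_WAS_BOUND_BIT, set in
i915_vma_bind() and tested-and-cleared in i915_gem_object_put_pages()
before ops->put_pages() runs.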

Alternatively the flushing could be done on VMA unbind, at which point
we would be able to ascertain whether there is potentially parallel
GPU execution (which could race), but essentially it boils down to
paying the cost of TLB flushes potentially needlessly at VMA unbind
time (when the backing store is not known to be going away, so the
flush is not needed for safety), versus potentially needlessly at
backing store release time (since at that point we cannot tell whether
anything executing on the GPU still uses that object).

Therefore simplicity of implementation has been chosen for now, with
scope to benchmark and refine later as required.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reported-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Dave Airlie <airlied@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gem_gtt.c
drivers/gpu/drm/i915/i915_reg.h

index adbbcaf14af676597c65fef6b27e0c297cf6dd86..8d7d102af52f4ade7738506825ba17f3bcb506ec 100644 (file)
@@ -1719,6 +1719,8 @@ struct drm_i915_private {
 
        struct intel_uncore uncore;
 
+       struct mutex tlb_invalidate_lock;
+
        struct i915_virtual_gpu vgpu;
 
        struct intel_guc guc;
@@ -2066,6 +2068,9 @@ struct drm_i915_gem_object {
         */
        unsigned int active:I915_NUM_RINGS;
 
+       unsigned long flags;
+#define I915_BO_WAS_BOUND_BIT    0
+
        /**
         * This is set if the object has been written to since last bound
         * to the GTT
index f56af0aaafde511c1add59ce5f89fe2fa9be5c42..ffce88930371ef9508762768c597f2a6b34cb38c 100644 (file)
@@ -2212,6 +2212,85 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
        kfree(obj->pages);
 }
 
+#define _wait_for_us(COND, US, W) ({ \
+       unsigned long timeout__ = jiffies + usecs_to_jiffies(US) + 1;   \
+       int ret__;                                                      \
+       for (;;) {                                                      \
+               bool expired__ = time_after(jiffies, timeout__);        \
+               if (COND) {                                             \
+                       ret__ = 0;                                      \
+                       break;                                          \
+               }                                                       \
+               if (expired__) {                                        \
+                       ret__ = -ETIMEDOUT;                             \
+                       break;                                          \
+               }                                                       \
+               usleep_range((W), (W)*2);                               \
+       }                                                               \
+       ret__;                                                          \
+})
+
+static int
+__intel_wait_for_register_fw(struct drm_i915_private *dev_priv,
+                            u32 reg,
+                            const u32 mask,
+                            const u32 value,
+                            const unsigned int timeout_us,
+                            const unsigned int timeout_ms)
+{
+#define done ((I915_READ_FW(reg) & mask) == value)
+       int ret = _wait_for_us(done, timeout_us, 2);
+       if (ret)
+               ret = wait_for(done, timeout_ms);
+       return ret;
+#undef done
+}
+
+static void invalidate_tlbs(struct drm_i915_private *dev_priv)
+{
+       static const u32 gen8_regs[] = {
+               [RCS]  = GEN8_RTCR,
+               [VCS]  = GEN8_M1TCR,
+               [VCS2] = GEN8_M2TCR,
+               [VECS] = GEN8_VTCR,
+               [BCS]  = GEN8_BTCR,
+       };
+       enum intel_ring_id id;
+
+       if (INTEL_INFO(dev_priv)->gen < 8)
+               return;
+
+       mutex_lock(&dev_priv->tlb_invalidate_lock);
+       intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+       for (id = 0; id < I915_NUM_RINGS; id++) {
+               struct intel_engine_cs *engine = &dev_priv->ring[id];
+               /*
+                * HW architecture suggests a typical invalidation time of 40us,
+                * with pessimistic cases up to 100us and a recommendation to
+                * cap at 1ms. We go a bit higher just in case.
+                */
+               const unsigned int timeout_us = 100;
+               const unsigned int timeout_ms = 4;
+
+               if (!intel_ring_initialized(engine))
+                       continue;
+
+               if (WARN_ON_ONCE(id >= ARRAY_SIZE(gen8_regs) || !gen8_regs[id]))
+                       continue;
+
+               I915_WRITE_FW(gen8_regs[id], 1);
+               if (__intel_wait_for_register_fw(dev_priv,
+                                                gen8_regs[id], 1, 0,
+                                                timeout_us, timeout_ms))
+                       DRM_ERROR_RATELIMITED("%s TLB invalidation did not complete in %ums!\n",
+                                             engine->name, timeout_ms);
+       }
+
+       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+       mutex_unlock(&dev_priv->tlb_invalidate_lock);
+}
+
 int
 i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 {
@@ -2230,6 +2309,14 @@ i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
         * lists early. */
        list_del(&obj->global_list);
 
+       if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
+               struct drm_i915_private *i915 = to_i915(obj->base.dev);
+
+               intel_runtime_pm_get(i915);
+               invalidate_tlbs(i915);
+               intel_runtime_pm_put(i915);
+       }
+
        ops->put_pages(obj);
        obj->pages = NULL;
 
@@ -5050,6 +5137,8 @@ i915_gem_load(struct drm_device *dev)
        i915_gem_shrinker_init(dev_priv);
 
        mutex_init(&dev_priv->fb_tracking.lock);
+
+       mutex_init(&dev_priv->tlb_invalidate_lock);
 }
 
 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
index 65a53ee398b8e7b1373ee10bd165f4e14456f538..b2bb0b268ea9c8bfac19a907dabcaf7f09e43be8 100644 (file)
@@ -3538,6 +3538,9 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
 
        vma->bound |= bind_flags;
 
+       if (vma->obj)
+               set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
+
        return 0;
 }
 
index 603d8cdfc5f1f14e417aca16432cf0f3d784d68c..33a9b80da5dc8f56862ab3b370f120f4212e334d 100644 (file)
@@ -1592,6 +1592,12 @@ enum skl_disp_power_wells {
 
 #define GEN7_TLB_RD_ADDR       0x4700
 
+#define GEN8_RTCR      0x4260
+#define GEN8_M1TCR     0x4264
+#define GEN8_M2TCR     0x4268
+#define GEN8_BTCR      0x426c
+#define GEN8_VTCR      0x4270
+
 #if 0
 #define PRB0_TAIL      0x02030
 #define PRB0_HEAD      0x02034