--- /dev/null
+From 7938d61591d33394a21bdd7797a245b65428f44c Mon Sep 17 00:00:00 2001
+From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Date: Tue, 19 Oct 2021 13:27:10 +0100
+Subject: drm/i915: Flush TLBs before releasing backing store
+
+From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+
+commit 7938d61591d33394a21bdd7797a245b65428f44c upstream.
+
+We need to flush TLBs before releasing the backing store, otherwise
+userspace is able to encounter stale entries if a) it is not declaring
+access to certain buffers and b) it races with the backing store
+release from such an undeclared execution already running on the GPU
+in parallel.
+
+The approach taken is to mark any buffer objects which were ever bound
+to the GPU and to trigger a serialized TLB flush when their backing
+store is released.
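+
+In sketch form, condensed from the diff below (locking, error handling
+and the runtime PM reference are elided), the two halves of the scheme
+are:
+
+  /* On bind: remember that the object was ever GPU-visible. */
+  if (vma->obj)
+          set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
+
+  /* On backing store release: flush TLBs once for such objects. */
+  if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags))
+          invalidate_tlbs(to_i915(obj->base.dev));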
+
+Alternatively, the flushing could be done on VMA unbind, at which point
+we would be able to ascertain whether there is a potential parallel GPU
+execution (which could race), but essentially it boils down to paying
+the cost of TLB flushes potentially needlessly at VMA unbind time (when
+the backing store is not known to be going away, so the flush is not
+needed for safety), versus potentially needlessly at backing store
+release time (when we cannot tell whether anything executing on the GPU
+still uses that object).
+
+Therefore, simplicity of implementation has been chosen for now, with
+scope to benchmark and refine later as required.
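+
+Concretely, the serialized flush in the diff below reduces to a
+per-engine register write and poll (condensed; forcewake, locking and
+the gen check are elided):
+
+  /* Request invalidation, then wait for HW to clear the bit again. */
+  I915_WRITE_FW(gen8_regs[id], 1);
+  if (__intel_wait_for_register_fw(dev_priv, gen8_regs[id], 1, 0,
+                                   timeout_us, timeout_ms))
+          DRM_ERROR_RATELIMITED("%s TLB invalidation did not complete in %ums!\n",
+                                engine->name, timeout_ms);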
+
+Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Reported-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
+Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Acked-by: Dave Airlie <airlied@redhat.com>
+Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
+Cc: Jon Bloomfield <jon.bloomfield@intel.com>
+Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+Cc: Jani Nikula <jani.nikula@intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/i915_drv.h | 5 ++
+ drivers/gpu/drm/i915/i915_gem.c | 89 ++++++++++++++++++++++++++++++++++++
+ drivers/gpu/drm/i915/i915_gem_gtt.c | 3 +
+ drivers/gpu/drm/i915/i915_reg.h | 6 ++
+ 4 files changed, 103 insertions(+)
+
+--- a/drivers/gpu/drm/i915/i915_drv.h
++++ b/drivers/gpu/drm/i915/i915_drv.h
+@@ -1719,6 +1719,8 @@ struct drm_i915_private {
+
+ struct intel_uncore uncore;
+
++ struct mutex tlb_invalidate_lock;
++
+ struct i915_virtual_gpu vgpu;
+
+ struct intel_guc guc;
+@@ -2066,6 +2068,9 @@ struct drm_i915_gem_object {
+ */
+ unsigned int active:I915_NUM_RINGS;
+
++ unsigned long flags;
++#define I915_BO_WAS_BOUND_BIT 0
++
+ /**
+ * This is set if the object has been written to since last bound
+ * to the GTT
+--- a/drivers/gpu/drm/i915/i915_gem.c
++++ b/drivers/gpu/drm/i915/i915_gem.c
+@@ -2212,6 +2212,85 @@ i915_gem_object_put_pages_gtt(struct drm
+ kfree(obj->pages);
+ }
+
++#define _wait_for_us(COND, US, W) ({ \
++ unsigned long timeout__ = jiffies + usecs_to_jiffies(US) + 1; \
++ int ret__; \
++ for (;;) { \
++ bool expired__ = time_after(jiffies, timeout__); \
++ if (COND) { \
++ ret__ = 0; \
++ break; \
++ } \
++ if (expired__) { \
++ ret__ = -ETIMEDOUT; \
++ break; \
++ } \
++ usleep_range((W), (W)*2); \
++ } \
++ ret__; \
++})
++
++static int
++__intel_wait_for_register_fw(struct drm_i915_private *dev_priv,
++ u32 reg,
++ const u32 mask,
++ const u32 value,
++ const unsigned int timeout_us,
++ const unsigned int timeout_ms)
++{
++#define done ((I915_READ_FW(reg) & mask) == value)
++ int ret = _wait_for_us(done, timeout_us, 2);
++ if (ret)
++ ret = wait_for(done, timeout_ms);
++ return ret;
++#undef done
++}
++
++static void invalidate_tlbs(struct drm_i915_private *dev_priv)
++{
++ static const u32 gen8_regs[] = {
++ [RCS] = GEN8_RTCR,
++ [VCS] = GEN8_M1TCR,
++ [VCS2] = GEN8_M2TCR,
++ [VECS] = GEN8_VTCR,
++ [BCS] = GEN8_BTCR,
++ };
++ enum intel_ring_id id;
++
++ if (INTEL_INFO(dev_priv)->gen < 8)
++ return;
++
++ mutex_lock(&dev_priv->tlb_invalidate_lock);
++ intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
++
++ for (id = 0; id < I915_NUM_RINGS; id++) {
++ struct intel_engine_cs *engine = &dev_priv->ring[id];
++ /*
++ * HW architecture suggests a typical invalidation time of 40us,
++ * with pessimistic cases up to 100us and a recommendation to
++ * cap at 1ms. We go a bit higher just in case.
++ */
++ const unsigned int timeout_us = 100;
++ const unsigned int timeout_ms = 4;
++
++ if (!intel_ring_initialized(engine))
++ continue;
++
++ if (WARN_ON_ONCE(id >= ARRAY_SIZE(gen8_regs) || !gen8_regs[id]))
++ continue;
++
++ I915_WRITE_FW(gen8_regs[id], 1);
++ if (__intel_wait_for_register_fw(dev_priv,
++ gen8_regs[id], 1, 0,
++ timeout_us, timeout_ms))
++ DRM_ERROR_RATELIMITED("%s TLB invalidation did not complete in %ums!\n",
++ engine->name, timeout_ms);
++ }
++
++ intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
++ mutex_unlock(&dev_priv->tlb_invalidate_lock);
++}
++
+ int
+ i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
+ {
+@@ -2230,6 +2309,14 @@ i915_gem_object_put_pages(struct drm_i91
+ * lists early. */
+ list_del(&obj->global_list);
+
++ if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
++ struct drm_i915_private *i915 = to_i915(obj->base.dev);
++
++ intel_runtime_pm_get(i915);
++ invalidate_tlbs(i915);
++ intel_runtime_pm_put(i915);
++ }
++
+ ops->put_pages(obj);
+ obj->pages = NULL;
+
+@@ -5050,6 +5137,8 @@ i915_gem_load(struct drm_device *dev)
+ i915_gem_shrinker_init(dev_priv);
+
+ mutex_init(&dev_priv->fb_tracking.lock);
++
++ mutex_init(&dev_priv->tlb_invalidate_lock);
+ }
+
+ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
+--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
++++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
+@@ -3538,6 +3538,9 @@ int i915_vma_bind(struct i915_vma *vma,
+
+ vma->bound |= bind_flags;
+
++ if (vma->obj)
++ set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
++
+ return 0;
+ }
+
+--- a/drivers/gpu/drm/i915/i915_reg.h
++++ b/drivers/gpu/drm/i915/i915_reg.h
+@@ -1592,6 +1592,12 @@ enum skl_disp_power_wells {
+
+ #define GEN7_TLB_RD_ADDR 0x4700
+
++#define GEN8_RTCR 0x4260
++#define GEN8_M1TCR 0x4264
++#define GEN8_M2TCR 0x4268
++#define GEN8_BTCR 0x426c
++#define GEN8_VTCR 0x4270
++
+ #if 0
+ #define PRB0_TAIL 0x02030
+ #define PRB0_HEAD 0x02034