From b402d4caa935f8b196154db6f5ae344d2e58954e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Thu, 27 Jan 2022 16:19:57 +0100
Subject: [PATCH] 4.4-stable patches

added patches:
	drm-i915-flush-tlbs-before-releasing-backing-store.patch
---
 ...-tlbs-before-releasing-backing-store.patch | 208 ++++++++++++++++++
 queue-4.4/series                              |   1 +
 2 files changed, 209 insertions(+)
 create mode 100644 queue-4.4/drm-i915-flush-tlbs-before-releasing-backing-store.patch
 create mode 100644 queue-4.4/series

diff --git a/queue-4.4/drm-i915-flush-tlbs-before-releasing-backing-store.patch b/queue-4.4/drm-i915-flush-tlbs-before-releasing-backing-store.patch
new file mode 100644
index 00000000000..5031c8ee7e6
--- /dev/null
+++ b/queue-4.4/drm-i915-flush-tlbs-before-releasing-backing-store.patch
@@ -0,0 +1,208 @@
+From 7938d61591d33394a21bdd7797a245b65428f44c Mon Sep 17 00:00:00 2001
+From: Tvrtko Ursulin
+Date: Tue, 19 Oct 2021 13:27:10 +0100
+Subject: drm/i915: Flush TLBs before releasing backing store
+
+From: Tvrtko Ursulin
+
+commit 7938d61591d33394a21bdd7797a245b65428f44c upstream.
+
+We need to flush TLBs before releasing the backing store, otherwise
+userspace is able to encounter stale entries if a) it is not declaring
+access to certain buffers and b) it races with the backing store release
+from such undeclared execution already executing on the GPU in parallel.
+
+The approach taken is to mark any buffer objects which were ever bound
+to the GPU and to trigger a serialized TLB flush when their backing
+store is released.
+
+Alternatively the flushing could be done on VMA unbind, at which point
+we would be able to ascertain whether there is potentially a parallel
+GPU execution (which could race), but essentially it boils down to
+paying the cost of TLB flushes potentially needlessly at VMA unbind
+time (when the backing store is not known to be going away, so not
+needed for safety), versus potentially needlessly at backing store
+release time (since at that point we cannot tell whether there is
+anything executing on the GPU which uses that object).
+
+Therefore simplicity of implementation has been chosen for now, with
+scope to benchmark and refine later as required.
+
+Signed-off-by: Tvrtko Ursulin
+Reported-by: Sushma Venkatesh Reddy
+Reviewed-by: Daniel Vetter
+Acked-by: Dave Airlie
+Cc: Daniel Vetter
+Cc: Jon Bloomfield
+Cc: Joonas Lahtinen
+Cc: Jani Nikula
+Cc: stable@vger.kernel.org
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/gpu/drm/i915/i915_drv.h     |  5 ++
+ drivers/gpu/drm/i915/i915_gem.c     | 89 ++++++++++++++++++++++++++++++++++++
+ drivers/gpu/drm/i915/i915_gem_gtt.c |  3 +
+ drivers/gpu/drm/i915/i915_reg.h     |  6 ++
+ 4 files changed, 103 insertions(+)
+
+--- a/drivers/gpu/drm/i915/i915_drv.h
++++ b/drivers/gpu/drm/i915/i915_drv.h
+@@ -1719,6 +1719,8 @@ struct drm_i915_private {
+ 
+ 	struct intel_uncore uncore;
+ 
++	struct mutex tlb_invalidate_lock;
++
+ 	struct i915_virtual_gpu vgpu;
+ 
+ 	struct intel_guc guc;
+@@ -2066,6 +2068,9 @@ struct drm_i915_gem_object {
+ 	 */
+ 	unsigned int active:I915_NUM_RINGS;
+ 
++	unsigned long flags;
++#define I915_BO_WAS_BOUND_BIT	0
++
+ 	/**
+ 	 * This is set if the object has been written to since last bound
+ 	 * to the GTT
+--- a/drivers/gpu/drm/i915/i915_gem.c
++++ b/drivers/gpu/drm/i915/i915_gem.c
+@@ -2212,6 +2212,85 @@ i915_gem_object_put_pages_gtt(struct drm
+ 	kfree(obj->pages);
+ }
+ 
++#define _wait_for_us(COND, US, W) ({ \
++	unsigned long timeout__ = jiffies + usecs_to_jiffies(US) + 1; \
++	int ret__; \
++	for (;;) { \
++		bool expired__ = time_after(jiffies, timeout__); \
++		if (COND) { \
++			ret__ = 0; \
++			break; \
++		} \
++		if (expired__) { \
++			ret__ = -ETIMEDOUT; \
++			break; \
++		} \
++		usleep_range((W), (W)*2); \
++	} \
++	ret__; \
++})
++
++static int
++__intel_wait_for_register_fw(struct drm_i915_private *dev_priv,
++			     u32 reg,
++			     const u32 mask,
++			     const u32 value,
++			     const unsigned int timeout_us,
++			     const unsigned int timeout_ms)
++{
++#define done ((I915_READ_FW(reg) & mask) == value)
++	int ret = _wait_for_us(done, timeout_us, 2);
++	if (ret)
++		ret = wait_for(done, timeout_ms);
++	return ret;
++#undef done
++}
++
++static void invalidate_tlbs(struct drm_i915_private *dev_priv)
++{
++	static const u32 gen8_regs[] = {
++		[RCS]  = GEN8_RTCR,
++		[VCS]  = GEN8_M1TCR,
++		[VCS2] = GEN8_M2TCR,
++		[VECS] = GEN8_VTCR,
++		[BCS]  = GEN8_BTCR,
++	};
++	enum intel_ring_id id;
++
++	if (INTEL_INFO(dev_priv)->gen < 8)
++		return;
++
++	mutex_lock(&dev_priv->tlb_invalidate_lock);
++	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
++
++	for (id = 0; id < I915_NUM_RINGS; id++) {
++		struct intel_engine_cs *engine = &dev_priv->ring[id];
++		/*
++		 * HW architecture suggest typical invalidation time at 40us,
++		 * with pessimistic cases up to 100us and a recommendation to
++		 * cap at 1ms. We go a bit higher just in case.
++		 */
++		const unsigned int timeout_us = 100;
++		const unsigned int timeout_ms = 4;
++
++		if (!intel_ring_initialized(engine))
++			continue;
++
++		if (WARN_ON_ONCE(id >= ARRAY_SIZE(gen8_regs) || !gen8_regs[id]))
++			continue;
++
++		I915_WRITE_FW(gen8_regs[id], 1);
++		if (__intel_wait_for_register_fw(dev_priv,
++						 gen8_regs[id], 1, 0,
++						 timeout_us, timeout_ms))
++			DRM_ERROR_RATELIMITED("%s TLB invalidation did not complete in %ums!\n",
++					      engine->name, timeout_ms);
++	}
++
++	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
++	mutex_unlock(&dev_priv->tlb_invalidate_lock);
++}
++
+ int
+ i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
+ {
+@@ -2230,6 +2309,14 @@ i915_gem_object_put_pages(struct drm_i91
+ 	 * lists early. */
+ 	list_del(&obj->global_list);
+ 
++	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
++		struct drm_i915_private *i915 = to_i915(obj->base.dev);
++
++		intel_runtime_pm_get(i915);
++		invalidate_tlbs(i915);
++		intel_runtime_pm_put(i915);
++	}
++
+ 	ops->put_pages(obj);
+ 	obj->pages = NULL;
+ 
+@@ -5050,6 +5137,8 @@ i915_gem_load(struct drm_device *dev)
+ 	i915_gem_shrinker_init(dev_priv);
+ 
+ 	mutex_init(&dev_priv->fb_tracking.lock);
++
++	mutex_init(&dev_priv->tlb_invalidate_lock);
+ }
+ 
+ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
+--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
++++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
+@@ -3538,6 +3538,9 @@ int i915_vma_bind(struct i915_vma *vma,
+ 
+ 	vma->bound |= bind_flags;
+ 
++	if (vma->obj)
++		set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
++
+ 	return 0;
+ }
+ 
+--- a/drivers/gpu/drm/i915/i915_reg.h
++++ b/drivers/gpu/drm/i915/i915_reg.h
+@@ -1592,6 +1592,12 @@ enum skl_disp_power_wells {
+ 
+ #define GEN7_TLB_RD_ADDR	0x4700
+ 
++#define GEN8_RTCR		0x4260
++#define GEN8_M1TCR		0x4264
++#define GEN8_M2TCR		0x4268
++#define GEN8_BTCR		0x426c
++#define GEN8_VTCR		0x4270
++
+ #if 0
+ #define PRB0_TAIL	0x02030
+ #define PRB0_HEAD	0x02034
diff --git a/queue-4.4/series b/queue-4.4/series
new file mode 100644
index 00000000000..4f89d40aca8
--- /dev/null
+++ b/queue-4.4/series
@@ -0,0 +1 @@
+drm-i915-flush-tlbs-before-releasing-backing-store.patch
-- 
2.47.2
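
The core of the fix is a mark-on-bind, flush-on-release pattern: i915_vma_bind()
cheaply sets I915_BO_WAS_BOUND_BIT, and i915_gem_object_put_pages() pays for the
expensive serialized TLB flush only for objects that were ever GPU-visible, with
test_and_clear_bit() guaranteeing at most one flush per release. Below is a
minimal userspace C sketch of that pattern; buffer_object, bo_bind(),
bo_put_pages() and expensive_tlb_flush() are hypothetical stand-ins for the i915
internals, not kernel code:

	#include <stdatomic.h>
	#include <stdio.h>

	#define BO_WAS_BOUND_BIT 0x1u	/* analogue of I915_BO_WAS_BOUND_BIT */

	struct buffer_object {
		atomic_uint flags;	/* analogue of obj->flags */
	};

	/* Stand-in for invalidate_tlbs(): serialized and slow, hence deferred. */
	static void expensive_tlb_flush(void)
	{
		puts("flushing TLBs");
	}

	/* Called on every GPU bind (analogue of the i915_vma_bind() hunk). */
	static void bo_bind(struct buffer_object *bo)
	{
		/* cheap: only record that the object was ever GPU-visible */
		atomic_fetch_or(&bo->flags, BO_WAS_BOUND_BIT);
	}

	/* Called when the backing store is released (analogue of put_pages). */
	static void bo_put_pages(struct buffer_object *bo)
	{
		/* atomic test-and-clear: exactly one caller pays for the flush */
		unsigned int old = atomic_fetch_and(&bo->flags, ~BO_WAS_BOUND_BIT);

		if (old & BO_WAS_BOUND_BIT)
			expensive_tlb_flush();
	}

	int main(void)
	{
		struct buffer_object bo = { .flags = 0 };

		bo_put_pages(&bo);	/* never bound: no flush */
		bo_bind(&bo);
		bo_bind(&bo);		/* binding twice still costs one flush */
		bo_put_pages(&bo);	/* flushes exactly once */
		return 0;
	}

As the commit message explains, this trades precision for simplicity: the flush
is paid only at release, where we can no longer tell whether anything on the GPU
still uses the object.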
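
The helper added to i915_gem.c waits for each invalidation register to clear in
two stages: _wait_for_us() polls with a fine-grained usleep_range() against a
microsecond deadline, and only on timeout does __intel_wait_for_register_fw()
retry with the coarser, millisecond-budget wait_for(). A standalone C sketch of
that two-stage structure, assuming a hypothetical monotonic-clock helper
now_us() in place of the kernel's jiffies arithmetic:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <time.h>
	#include <unistd.h>

	static uint64_t now_us(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000u + (uint64_t)(ts.tv_nsec / 1000);
	}

	/* Poll cond() roughly every sleep_us until it holds or timeout_us expires. */
	static int wait_for_us(bool (*cond)(void), unsigned int timeout_us,
			       unsigned int sleep_us)
	{
		uint64_t deadline = now_us() + timeout_us;

		for (;;) {
			/* sample the deadline before the condition, as the kernel
			 * macro does, so a success landing right at the timeout
			 * is still reported as a success */
			bool expired = now_us() > deadline;

			if (cond())
				return 0;
			if (expired)
				return -ETIMEDOUT;
			usleep(sleep_us);
		}
	}

	/* Two-stage wait: fast fine-grained poll, then a coarser fallback. */
	static int wait_for_register(bool (*done)(void),
				     unsigned int timeout_us, unsigned int timeout_ms)
	{
		int ret = wait_for_us(done, timeout_us, 2);	/* ~_wait_for_us() */

		if (ret)
			ret = wait_for_us(done, timeout_ms * 1000u, 1000u); /* ~wait_for() */
		return ret;
	}

	static bool always_done(void)
	{
		return true;
	}

	int main(void)
	{
		/* trivially succeeds on the fast path */
		return wait_for_register(always_done, 100, 4) ? 1 : 0;
	}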