Similar to i915's commit
cebc13de7e704b1355bea208a9f9cdb042c74588
("drm/i915: Whitelist COMMON_SLICE_CHICKEN3 for UMD access"), except
that instead of putting the register on the allowlist for UMD to
program, the KMD is doing the programming at context initialization
based on a queue creation flag.
This is a recommended tuning setting for both gen12 and Xe_HP
platforms.
If a render queue is created with
DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX, COMMON_SLICE_CHICKEN3 will
be programmed at initialization to enable the render color cache to
key with BTP+BTI (binding table pool + binding table entry) instead of
just BTI (binding table entry). This enables the UMD to avoid emitting
render-target-cache-flush + stall-at-pixel-scoreboard every time a
binding table entry pointing to a render target is changed.
v2: Use xe_lrc_write_ring()
v3: Update xe_query.c to report availability
v4: Rename defines to add DISABLE_
v5: update commit message
v6: rebase
Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39982
Bspec: 73993, 73994, 72161, 31870, 68331
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Link: https://patch.msgid.link/20260306075504.1288676-1-lionel.g.landwerlin@intel.com
#define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED)
#define XEHP_COMMON_SLICE_CHICKEN3 XE_REG_MCR(0x7304, XE_REG_OPTION_MASKED)
+#define DISABLE_STATE_CACHE_PERF_FIX REG_BIT(13)
#define DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN REG_BIT(12)
#define XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE REG_BIT(12)
#define BLEND_EMB_FIX_DISABLE_IN_RCC REG_BIT(11)
if (!(exec_queue_flags & EXEC_QUEUE_FLAG_KERNEL))
flags |= XE_LRC_CREATE_USER_CTX;
+ if (q->flags & EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX)
+ flags |= XE_LRC_DISABLE_STATE_CACHE_PERF_FIX;
+
err = q->ops->init(q);
if (err)
return err;
return q->ops->set_multi_queue_priority(q, value);
}
+static int exec_queue_set_state_cache_perf_fix(struct xe_device *xe, struct xe_exec_queue *q,
+ u64 value)
+{
+ if (XE_IOCTL_DBG(xe, q->class != XE_ENGINE_CLASS_RENDER))
+ return -EOPNOTSUPP;
+
+ q->flags |= value != 0 ? EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX : 0;
+
+ return 0;
+}
+
typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe,
struct xe_exec_queue *q,
u64 value);
[DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP] = exec_queue_set_multi_group,
[DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY] =
exec_queue_set_multi_queue_priority,
+ [DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX] =
+ exec_queue_set_state_cache_perf_fix,
};
/**
ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_PXP_TYPE &&
ext.property != DRM_XE_EXEC_QUEUE_SET_HANG_REPLAY_STATE &&
ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP &&
- ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY))
+ ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY &&
+ ext.property != DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX))
return -EINVAL;
idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs));
#define EXEC_QUEUE_FLAG_LOW_LATENCY BIT(5)
/* for migration (kernel copy, clear, bind) jobs */
#define EXEC_QUEUE_FLAG_MIGRATE BIT(6)
+/* for programming COMMON_SLICE_CHICKEN3 on first submission */
+#define EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX BIT(7)
/**
* @flags: flags for this exec queue, should statically setup aside from ban
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
+#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
struct xe_device *xe = gt_to_xe(gt);
struct iosys_map map;
u32 arb_enable;
+ u32 state_cache_perf_fix[3];
int err;
/*
arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
+ if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
+ state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
+ state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
+ state_cache_perf_fix[2] = _MASKED_BIT_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
+ xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
+ }
+
map = __xe_lrc_seqno_map(lrc);
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
#define XE_LRC_CREATE_RUNALONE BIT(0)
#define XE_LRC_CREATE_PXP BIT(1)
#define XE_LRC_CREATE_USER_CTX BIT(2)
+#define XE_LRC_DISABLE_STATE_CACHE_PERF_FIX BIT(3)
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT;
config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
+ config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
+ DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX;
config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;
* - %DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT - Flag is set if the
* device supports the userspace hint %DRM_XE_GEM_CREATE_FLAG_NO_COMPRESSION.
* This is exposed only on Xe2+.
+ * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX - Flag is set
+ * if a queue can be creaed with
+ * %DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX
* - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
* required by this device, typically SZ_4K or SZ_64K
* - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY (1 << 1)
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR (1 << 2)
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT (1 << 3)
+ #define DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX (1 << 4)
#define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT 2
#define DRM_XE_QUERY_CONFIG_VA_BITS 3
#define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY 4
* - %DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY - Set the queue
* priority within the multi-queue group. Current valid priority values are 0–2
* (default is 1), with higher values indicating higher priority.
+ * - %DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX - Set the queue to
+ * enable render color cache keying on BTP+BTI instead of just BTI
+ * (only valid for render queues).
*
* The example below shows how to use @drm_xe_exec_queue_create to create
* a simple exec_queue (no parallel submission) of class
#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP 4
#define DRM_XE_MULTI_GROUP_CREATE (1ull << 63)
#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY 5
+#define DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX 6
/** @extensions: Pointer to the first extension struct, if any */
__u64 extensions;