]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/xe/migrate: support MEM_COPY instruction
authorMatthew Auld <matthew.auld@intel.com>
Wed, 22 Oct 2025 16:38:35 +0000 (17:38 +0100)
committerMatthew Auld <matthew.auld@intel.com>
Thu, 23 Oct 2025 09:48:39 +0000 (10:48 +0100)
Make this the default on xe2+ when doing a copy. This has a few
advantages over the existing copy instruction:

1) It has a special PAGE_COPY mode that claims to be optimised for
   page-in/page-out, which is the vast majority of current users.

2) It also has a simple BYTE_COPY mode that supports byte granularity
   copying without any restrictions.

With 2) we can now easily skip the bounce buffer flow when copying
buffers with strange sizing/alignment, like for memory_access. But that
is left for the next patch.

v2 (Matt Brost):
  - Use device info to check whether device should use the MEM_COPY
    path. This should fit better with making this a configfs tunable.
  - And with that also keep old path still functional on xe2 for possible
    experimentation.
  - Add a define for PAGE_COPY page-size.
v3 (Matt Brost):
  - Fall back to an actual linear copy for pitch=1.
  - Also update NVL.

BSpec: 57561
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://lore.kernel.org/r/20251022163836.191405-7-matthew.auld@intel.com
drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
drivers/gpu/drm/xe/xe_device_types.h
drivers/gpu/drm/xe/xe_migrate.c
drivers/gpu/drm/xe/xe_pci.c
drivers/gpu/drm/xe/xe_pci_types.h

index 8cfcd3360896c2b542494c9da697b8cb012575be..5d41ca297447124be7c375f3bd649db24dd3e734 100644 (file)
 #define   XY_FAST_COPY_BLT_D1_DST_TILE4        REG_BIT(30)
 #define   XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK GENMASK(23, 20)
 
+/* MEM_COPY blitter instruction (xe2+), see BSpec: 57561 */
+#define MEM_COPY_CMD (2 << 29 | 0x5a << 22 | 0x8)
+#define   MEM_COPY_PAGE_COPY_MODE REG_BIT(19)
+#define   MEM_COPY_MATRIX_COPY REG_BIT(17)
+#define   MEM_COPY_SRC_MOCS_INDEX_MASK GENMASK(31, 28) /* in the last instruction dword */
+#define   MEM_COPY_DST_MOCS_INDEX_MASK GENMASK(6, 3) /* in the last instruction dword */
+
 #define        PVC_MEM_SET_CMD         (2 << 29 | 0x5b << 22)
 #define   PVC_MEM_SET_CMD_LEN_DW       7
 #define   PVC_MEM_SET_MATRIX           REG_BIT(17)
index 9e3666a226da8eb6c5dfa40685dac4af50e17727..8f62ee7a73acb410c157682c266b40f21b2854b5 100644 (file)
@@ -300,6 +300,8 @@ struct xe_device {
                 * pcode mailbox commands.
                 */
                u8 has_mbx_power_limits:1;
+               /** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */
+               u8 has_mem_copy_instr:1;
                /** @info.has_pxp: Device has PXP support */
                u8 has_pxp:1;
                /** @info.has_range_tlb_inval: Has range based TLB invalidations */
index 95aefe2e71f51f273d7a53d7c4926e429c07c702..1bbc7bca33ed3df4965ad798441da204dfdb038f 100644 (file)
@@ -699,9 +699,9 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
 }
 
 #define EMIT_COPY_DW 10
-static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
-                     u64 src_ofs, u64 dst_ofs, unsigned int size,
-                     unsigned int pitch)
+static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
+                             u64 dst_ofs, unsigned int size,
+                             unsigned int pitch)
 {
        struct xe_device *xe = gt_to_xe(gt);
        u32 mocs = 0;
@@ -730,6 +730,61 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
        bb->cs[bb->len++] = upper_32_bits(src_ofs);
 }
 
+#define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */
+/*
+ * Emit a MEM_COPY instruction copying @size bytes from @src_ofs to @dst_ofs.
+ *
+ * Mode selection: PAGE_COPY linear when size and both address low bits are
+ * 256B aligned (width counted in 256B pages); otherwise BYTE_COPY, as a
+ * matrix copy when pitch > 1 or as a plain linear copy when pitch == 1.
+ * Emits EMIT_COPY_DW (10) dwords into @bb; both MOCS fields use the GT's
+ * uncached index.
+ */
+static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
+                         u64 dst_ofs, unsigned int size, unsigned int pitch)
+{
+       u32 mode, copy_type, width;
+
+       /* size must be a whole number of pitch-sized rows */
+       xe_gt_assert(gt, IS_ALIGNED(size, pitch));
+       xe_gt_assert(gt, pitch <= U16_MAX);
+       xe_gt_assert(gt, pitch);
+       xe_gt_assert(gt, size);
+
+       if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) &&
+           IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) &&
+           IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) {
+               mode = MEM_COPY_PAGE_COPY_MODE;
+               copy_type = 0; /* linear copy */
+               width = size / PAGE_COPY_MODE_PS; /* width in 256B pages */
+       } else if (pitch > 1) {
+               xe_gt_assert(gt, size / pitch <= U16_MAX); /* row count is a 16-bit field */
+               mode = 0; /* BYTE_COPY */
+               copy_type = MEM_COPY_MATRIX_COPY;
+               width = pitch;
+       } else {
+               mode = 0; /* BYTE_COPY */
+               copy_type = 0; /* linear copy */
+               width = size;
+       }
+
+       xe_gt_assert(gt, width <= U16_MAX); /* width is a 16-bit field */
+
+       bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type;
+       bb->cs[bb->len++] = width - 1;
+       bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */
+       bb->cs[bb->len++] = pitch - 1;
+       bb->cs[bb->len++] = pitch - 1;
+       bb->cs[bb->len++] = lower_32_bits(src_ofs);
+       bb->cs[bb->len++] = upper_32_bits(src_ofs);
+       bb->cs[bb->len++] = lower_32_bits(dst_ofs);
+       bb->cs[bb->len++] = upper_32_bits(dst_ofs);
+       bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) |
+                           FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index);
+}
+
+/*
+ * Emit a copy of @size bytes from @src_ofs to @dst_ofs, dispatching to the
+ * MEM_COPY instruction when the device supports it (per
+ * xe->info.has_mem_copy_instr), otherwise to the legacy XY_FAST_COPY path.
+ * Both paths emit EMIT_COPY_DW dwords into @bb.
+ */
+static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
+                     u64 src_ofs, u64 dst_ofs, unsigned int size,
+                     unsigned int pitch)
+{
+       struct xe_device *xe = gt_to_xe(gt);
+
+       if (xe->info.has_mem_copy_instr)
+               emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
+       else
+               emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
+}
+
 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
 {
        return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
index c326430e75b539144d5c0ef454e60d561cf63044..ece45157fd31132afdc258c571635a1e0a25139a 100644 (file)
@@ -342,6 +342,7 @@ static const struct xe_device_desc lnl_desc = {
        .has_display = true,
        .has_flat_ccs = 1,
        .has_pxp = true,
+       .has_mem_copy_instr = true,
        .max_gt_per_tile = 2,
        .needs_scratch = true,
        .va_bits = 48,
@@ -362,6 +363,7 @@ static const struct xe_device_desc bmg_desc = {
        .has_heci_cscfi = 1,
        .has_late_bind = true,
        .has_sriov = true,
+       .has_mem_copy_instr = true,
        .max_gt_per_tile = 2,
        .needs_scratch = true,
        .subplatforms = (const struct xe_subplatform_desc[]) {
@@ -378,6 +380,7 @@ static const struct xe_device_desc ptl_desc = {
        .has_display = true,
        .has_flat_ccs = 1,
        .has_sriov = true,
+       .has_mem_copy_instr = true,
        .max_gt_per_tile = 2,
        .needs_scratch = true,
        .needs_shared_vf_gt_wq = true,
@@ -390,6 +393,7 @@ static const struct xe_device_desc nvls_desc = {
        .dma_mask_size = 46,
        .has_display = true,
        .has_flat_ccs = 1,
+       .has_mem_copy_instr = true,
        .max_gt_per_tile = 2,
        .require_force_probe = true,
        .va_bits = 48,
@@ -655,6 +659,7 @@ static int xe_info_init_early(struct xe_device *xe,
        xe->info.has_pxp = desc->has_pxp;
        xe->info.has_sriov = xe_configfs_primary_gt_allowed(to_pci_dev(xe->drm.dev)) &&
                desc->has_sriov;
+       xe->info.has_mem_copy_instr = desc->has_mem_copy_instr;
        xe->info.skip_guc_pc = desc->skip_guc_pc;
        xe->info.skip_mtcfg = desc->skip_mtcfg;
        xe->info.skip_pcode = desc->skip_pcode;
index a4451bdc79fb35ec0169ebc0fb366eab255a5612..9892c063a9c543a25f6f2d7d4e4cdce898d3effc 100644 (file)
@@ -46,6 +46,7 @@ struct xe_device_desc {
        u8 has_late_bind:1;
        u8 has_llc:1;
        u8 has_mbx_power_limits:1;
+       u8 has_mem_copy_instr:1;
        u8 has_pxp:1;
        u8 has_sriov:1;
        u8 needs_scratch:1;