return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
- MODE bytes. */
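+/* Pairs of load and store instructions queued for an inline copy expansion;
+ they are emitted later in an order suited to memcpy or memmove. */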
+typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
+/* Copy one block of size MODE from SRC to DST at offset OFFSET. The load and
+ store are added to OPS rather than emitted. */
static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
+ int offset, machine_mode mode)
{
- /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
- address copies using V4SImode so that we can use Q registers. */
- if (known_eq (GET_MODE_BITSIZE (mode), 256))
+ /* Generate explicit load/store pair instructions for 32-byte copies. */
+ if (known_eq (GET_MODE_SIZE (mode), 32))
{
mode = V4SImode;
+ rtx src1 = adjust_address (src, mode, offset);
+ rtx src2 = adjust_address (src, mode, offset + 16);
+ rtx dst1 = adjust_address (dst, mode, offset);
+ rtx dst2 = adjust_address (dst, mode, offset + 16);
rtx reg1 = gen_reg_rtx (mode);
rtx reg2 = gen_reg_rtx (mode);
- /* "Cast" the pointers to the correct mode. */
- *src = adjust_address (*src, mode, 0);
- *dst = adjust_address (*dst, mode, 0);
- /* Emit the memcpy. */
- emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
- aarch64_progress_pointer (*src)));
- emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
- aarch64_progress_pointer (*dst), reg2));
- /* Move the pointers forward. */
- *src = aarch64_move_pointer (*src, 32);
- *dst = aarch64_move_pointer (*dst, 32);
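+ /* Queue the load pair and store pair; the caller decides the emission order. */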
+ rtx load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+ rtx store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
+ ops.safe_push ({ load, store });
return;
}
rtx reg = gen_reg_rtx (mode);
-
- /* "Cast" the pointers to the correct mode. */
- *src = adjust_address (*src, mode, 0);
- *dst = adjust_address (*dst, mode, 0);
- /* Emit the memcpy. */
- emit_move_insn (reg, *src);
- emit_move_insn (*dst, reg);
- /* Move the pointers forward. */
- *src = aarch64_progress_pointer (*src);
- *dst = aarch64_progress_pointer (*dst);
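+ /* Generate, but do not yet emit, the load and store through the scratch register. */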
+ rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+ rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+ ops.safe_push ({ load, store });
}
/* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
rather than memcpy. Return true iff we succeeded. */
bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
{
if (!TARGET_MOPS)
return false;
return true;
}
-/* Expand cpymem, as if from a __builtin_memcpy. Return true if
- we succeed, otherwise return false, indicating that a libcall to
- memcpy should be emitted. */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+ OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
+ if this is a memmove rather than memcpy. Return true if we succeed,
+ otherwise return false, indicating that a libcall should be emitted. */
bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
{
- int mode_bits;
+ int mode_bytes;
rtx dst = operands[0];
rtx src = operands[1];
unsigned align = UINTVAL (operands[3]);
rtx base;
- machine_mode cur_mode = BLKmode;
- bool size_p = optimize_function_for_size_p (cfun);
+ machine_mode cur_mode = BLKmode, next_mode;
/* Variable-sized or strict-align copies may use the MOPS expansion. */
if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
- return aarch64_expand_cpymem_mops (operands);
+ return aarch64_expand_cpymem_mops (operands, is_memmove);
unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
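+ /* Q-register LDP/STP is used when SIMD is available and the tuning does not
+ avoid Q-register load/store pairs. */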
+ bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
- /* Try to inline up to 256 bytes. */
- unsigned max_copy_size = 256;
- unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+ /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
+ unsigned max_copy_size = use_ldpq ? 256 : 128;
+ unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
+ : aarch64_mops_memcpy_size_threshold;
+
+ /* Reduce the maximum size with -Os. */
+ if (optimize_function_for_size_p (cfun))
+ max_copy_size /= 4;
/* Large copies use MOPS when available or a library call. */
if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
- return aarch64_expand_cpymem_mops (operands);
+ return aarch64_expand_cpymem_mops (operands, is_memmove);
- int copy_bits = 256;
+ unsigned copy_max = 32;
- /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
- support or slow 256-bit LDP/STP fall back to 128-bit chunks.
+ /* Default to 32-byte LDP/STP on large copies; for small copies, missing SIMD
+ support or slow Q-register LDP/STP, fall back to 16-byte chunks.
??? Although it would be possible to use LDP/STP Qn in streaming mode
(so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
whether that would improve performance. */
- if (size <= 24
- || !TARGET_SIMD
- || (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
- copy_bits = 128;
-
- /* Emit an inline load+store sequence and count the number of operations
- involved. We use a simple count of just the loads and stores emitted
- rather than rtx_insn count as all the pointer adjustments and reg copying
- in this function will get optimized away later in the pipeline. */
- start_sequence ();
- unsigned nops = 0;
+ if (size <= 24 || !use_ldpq)
+ copy_max = 16;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
base = copy_to_mode_reg (Pmode, XEXP (src, 0));
src = adjust_automodify_address (src, VOIDmode, base, 0);
- /* Convert size to bits to make the rest of the code simpler. */
- int n = size * BITS_PER_UNIT;
+ copy_ops ops;
+ int offset = 0;
- while (n > 0)
+ while (size > 0)
{
/* Find the largest mode in which to do the copy without over-reading
or writing. */
opt_scalar_int_mode mode_iter;
FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
- if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
+ if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
cur_mode = mode_iter.require ();
gcc_assert (cur_mode != BLKmode);
- mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+ mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
/* Prefer Q-register accesses for the last bytes. */
- if (mode_bits == 128 && copy_bits == 256)
+ if (mode_bytes == 16 && copy_max == 32)
cur_mode = V4SImode;
-
- aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
- /* A single block copy is 1 load + 1 store. */
- nops += 2;
- n -= mode_bits;
+ aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+ size -= mode_bytes;
+ offset += mode_bytes;
/* Emit trailing copies using overlapping unaligned accesses
- (when !STRICT_ALIGNMENT) - this is smaller and faster. */
- if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
+ (when !STRICT_ALIGNMENT) - this is smaller and faster. */
+ if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
{
- machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
- int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
- gcc_assert (n_bits <= mode_bits);
- src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
- dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
- n = n_bits;
+ next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
+ int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+ gcc_assert (n_bytes <= mode_bytes);
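+ /* Move the offset back so this final, wider access ends exactly at the end
+ of the copy, overlapping bytes that have already been copied. */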
+ offset -= n_bytes - size;
+ size = n_bytes;
}
}
- rtx_insn *seq = get_insns ();
- end_sequence ();
- /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
- the constant size into a register. */
- unsigned mops_cost = 3 + 1;
-
- /* If MOPS is available at this point we don't consider the libcall as it's
- not a win even on code size. At this point only consider MOPS if
- optimizing for size. For speed optimizations we will have chosen between
- the two based on copy size already. */
- if (TARGET_MOPS)
- {
- if (size_p && mops_cost < nops)
- return aarch64_expand_cpymem_mops (operands);
- emit_insn (seq);
- return true;
- }
- /* A memcpy libcall in the worst case takes 3 instructions to prepare the
- arguments + 1 for the call. When MOPS is not available and we're
- optimizing for size a libcall may be preferable. */
- unsigned libcall_cost = 4;
- if (size_p && libcall_cost < nops)
- return false;
+ /* Memcpy interleaves loads with stores; memmove emits all loads first. */
+ int nops = ops.length ();
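+ /* All loads must precede the stores for memmove since the destination may
+ overlap the source; for memcpy use groups of 3 (2 when there are exactly
+ 4 operations) so loads and stores interleave. */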
+ int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
- emit_insn (seq);
+ for (int i = 0; i < nops; i += inc)
+ {
+ int m = MIN (nops, i + inc);
+ /* Emit loads. */
+ for (int j = i; j < m; j++)
+ emit_insn (ops[j].first);
+ /* Emit stores. */
+ for (int j = i; j < m; j++)
+ emit_insn (ops[j].second);
+ }
return true;
}