return false;
}
+/* Return a new RTX holding the result of moving POINTER forward by
+   AMOUNT bytes.  */
+
+static rtx
+aarch64_move_pointer (rtx pointer, int amount)
+{
+  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
+
+  return adjust_automodify_address (pointer, GET_MODE (pointer),
+                                    next, amount);
+}
+
+/* Return a new RTX holding the result of moving POINTER forward by the
+   size of the mode it points to.  */
+
+static rtx
+aarch64_progress_pointer (rtx pointer)
+{
+  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
+
+  return aarch64_move_pointer (pointer, amount);
+}
+
+/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
+   MODE bytes.  */
+
+static void
+aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
+                                              enum machine_mode mode)
+{
+  rtx reg = gen_reg_rtx (mode);
+
+  /* "Cast" the pointers to the correct mode.  */
+  *src = adjust_address (*src, mode, 0);
+  *dst = adjust_address (*dst, mode, 0);
+  /* Emit the memcpy.  */
+  emit_move_insn (reg, *src);
+  emit_move_insn (*dst, reg);
+  /* Move the pointers forward.  */
+  *src = aarch64_progress_pointer (*src);
+  *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Expand movmem, as if from a __builtin_memcpy.  Return true if
+   we succeed, otherwise return false.  */
+
+bool
+aarch64_expand_movmem (rtx *operands)
+{
+  unsigned int n;
+  rtx dst = operands[0];
+  rtx src = operands[1];
+  rtx base;
+  bool speed_p = !optimize_function_for_size_p (cfun);
+
+  /* When optimizing for size, give a better estimate of the length of a
+     memcpy call, but use the default otherwise.  The limit is halved
+     because each block move below is emitted as a load and a store.  */
+  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+
+  /* We can't do anything smart if the amount to copy is not constant.  */
+  if (!CONST_INT_P (operands[2]))
+    return false;
+
+  n = UINTVAL (operands[2]);
+
+  /* Try to keep the number of instructions low.  For cases below 16 bytes we
+     need to make at most two moves.  For cases above 16 bytes it will be one
+     move for each 16 byte chunk, then at most two additional moves.  */
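+  /* For example, a copy of 100 bytes is estimated at 100 / 16 + 2 = 8
+     moves, which exceeds the limit of (15 / 2) = 7 when optimizing for
+     speed and (8 / 2) = 4 when optimizing for size; in that case we
+     return false and fall back to the generic expansion (typically a
+     call to memcpy).  */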
+  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+    return false;
+
+  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
+
+  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
+  src = adjust_automodify_address (src, VOIDmode, base, 0);
+
+  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
+     1-byte chunk.  */
+  if (n < 4)
+    {
+      if (n >= 2)
+        {
+          aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+          n -= 2;
+        }
+
+      if (n == 1)
+        aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
+
+      return true;
+    }
+
+  /* Copy 4-7 bytes.  First a 4-byte chunk, then (if applicable) a second
+     4-byte chunk, partially overlapping with the previously copied chunk.  */
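+  /* For example, for n == 7 the first move copies bytes 0-3 and advances
+     both pointers by four; n is then 3, so move == -1 and the pointers
+     step back one byte before the second move copies bytes 3-6,
+     overlapping the first copy by one byte.  */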
+  if (n < 8)
+    {
+      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+      n -= 4;
+      if (n > 0)
+        {
+          int move = n - 4;
+
+          src = aarch64_move_pointer (src, move);
+          dst = aarch64_move_pointer (dst, move);
+          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+        }
+      return true;
+    }
+
+  /* Copy 8 or more bytes.  Copy chunks of 16 bytes until we run out of
+     them, then (if applicable) an 8-byte chunk.  */
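+  /* For example, a 28 byte copy is one 16-byte chunk, then one 8-byte
+     chunk, leaving 4 bytes for the tail code below.  */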
+  while (n >= 8)
+    {
+      if (n / 16)
+        {
+          aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
+          n -= 16;
+        }
+      else
+        {
+          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+          n -= 8;
+        }
+    }
+
+  /* Finish the final bytes of the copy.  We can always do this in one
+     more move.  We either copy the exact amount we need, or partially
+     overlap with the previous chunk we copied and copy 4 or 8 bytes.  */
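+  /* For example, with n == 5 we back the pointers up by 3 bytes and copy
+     the final 8 bytes with a single DImode move; with n == 3 we back up
+     by 1 byte and copy the final 4 bytes with an SImode move.  */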
+  if (n == 0)
+    return true;
+  else if (n == 1)
+    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
+  else if (n == 2)
+    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+  else if (n == 4)
+    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+  else
+    {
+      if (n == 3)
+        {
+          src = aarch64_move_pointer (src, -1);
+          dst = aarch64_move_pointer (dst, -1);
+          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+        }
+      else
+        {
+          int move = n - 8;
+
+          src = aarch64_move_pointer (src, move);
+          dst = aarch64_move_pointer (dst, move);
+          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+        }
+    }
+
+  return true;
+}
+
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends. */
#define AARCH64_CALL_RATIO 8
-/* When optimizing for size, give a better estimate of the length of a memcpy
-   call, but use the default otherwise.  But move_by_pieces_ninsns() counts
-   memory-to-memory moves, and we'll have to generate a load & store for each,
-   so halve the value to take that into account.  */
+/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
+   move_by_pieces will continually copy the largest safe chunks.  So a
+   7-byte copy is a 4-byte + 2-byte + 1-byte copy.  This is inefficient
+   for both the size and speed of the copy, so we instead use the "movmem"
+   standard name to implement the copy.  This logic does not apply when
+   targeting -mstrict-align, so keep a sensible default in that case.  */
#define MOVE_RATIO(speed) \
-  (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
+  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
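+/* Under the definition above, MOVE_RATIO evaluates to 2 unless
+   -mstrict-align is in effect, in which case it is (15 / 2) = 7 when
+   optimizing for speed and (8 / 2) = 4 when optimizing for size.  */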
/* For CLEAR_RATIO, when optimizing for size, give a better estimate
of the length of a memset call, but use the default otherwise. */