gcc/

author Julian Brown <julian@codesourcery.com>

Tue, 18 Oct 2011 10:49:44 +0000 (10:49 +0000)

committer Julian Brown <jules@gcc.gnu.org>

Tue, 18 Oct 2011 10:49:44 +0000 (10:49 +0000)
author Julian Brown <julian@codesourcery.com>
Tue, 18 Oct 2011 10:49:44 +0000 (10:49 +0000)
committer Julian Brown <jules@gcc.gnu.org>
Tue, 18 Oct 2011 10:49:44 +0000 (10:49 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 1cc34b217e253939db84f713344ce1e67528bf91..2968fb0de5e1c7727b63d35b02d125f93cd563a0 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2011-10-18  Julian Brown  <julian@codesourcery.com>
+
+       * config/arm/arm.c (arm_block_move_unaligned_straight)
+       (arm_adjust_block_mem, arm_block_move_unaligned_loop)
+       (arm_movmemqi_unaligned): New.
+       (arm_gen_movmemqi): Support unaligned block copies.
+
  2011-10-18  Ira Rosen  <ira.rosen@linaro.org>
  
         * doc/md.texi (vec_widen_ushiftl_hi, vec_widen_ushiftl_lo,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index a429c192fdbdcb540e1550431674b2b71a1e5710..f1ada6f9a7386dca53e59b350fbe0e1118788ad3 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -10766,6 +10766,335 @@ gen_const_stm_seq (rtx *operands, int nops)
    return true;
  }
  
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+   unaligned copies on processors which support unaligned semantics for those
+   instructions.  INTERLEAVE_FACTOR can be used to attempt to hide load latency
+   (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+   An interleave factor of 1 (the minimum) will perform no interleaving. 
+   Load/store multiple are used for aligned addresses where possible.  */
+
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+                                  HOST_WIDE_INT length,
+                                  unsigned int interleave_factor)
+{
+  rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+  int *regnos = XALLOCAVEC (int, interleave_factor);
+  HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+  HOST_WIDE_INT i, j;
+  HOST_WIDE_INT remaining = length, words;
+  rtx halfword_tmp = NULL, byte_tmp = NULL;
+  rtx dst, src;
+  bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+  bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+  HOST_WIDE_INT srcoffset, dstoffset;
+  HOST_WIDE_INT src_autoinc, dst_autoinc;
+  rtx mem, addr;
+  
+  gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+  
+  /* Use hard registers if we have aligned source or destination so we can use
+     load/store multiple with contiguous registers.  */
+  if (dst_aligned || src_aligned)
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_rtx_REG (SImode, i);
+  else
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_reg_rtx (SImode);
+
+  dst = copy_addr_to_reg (XEXP (dstbase, 0));
+  src = copy_addr_to_reg (XEXP (srcbase, 0));
+
+  srcoffset = dstoffset = 0;
+  
+  /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+     For copying the last bytes we want to subtract this offset again.  */
+  src_autoinc = dst_autoinc = 0;
+
+  for (i = 0; i < interleave_factor; i++)
+    regnos[i] = i;
+
+  /* Copy BLOCK_SIZE_BYTES chunks.  */
+
+  for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
+    {
+      /* Load words.  */
+      if (src_aligned && interleave_factor > 1)
+       {
+         emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+                                           TRUE, srcbase, &srcoffset));
+         src_autoinc += UNITS_PER_WORD * interleave_factor;
+       }
+      else
+       {
+         for (j = 0; j < interleave_factor; j++)
+           {
+             addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+                                        - src_autoinc);
+             mem = adjust_automodify_address (srcbase, SImode, addr,
+                                              srcoffset + j * UNITS_PER_WORD);
+             emit_insn (gen_unaligned_loadsi (regs[j], mem));
+           }
+         srcoffset += block_size_bytes;
+       }
+
+      /* Store words.  */
+      if (dst_aligned && interleave_factor > 1)
+       {
+         emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+                                            TRUE, dstbase, &dstoffset));
+         dst_autoinc += UNITS_PER_WORD * interleave_factor;
+       }
+      else
+       {
+         for (j = 0; j < interleave_factor; j++)
+           {
+             addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+                                        - dst_autoinc);
+             mem = adjust_automodify_address (dstbase, SImode, addr,
+                                              dstoffset + j * UNITS_PER_WORD);
+             emit_insn (gen_unaligned_storesi (mem, regs[j]));
+           }
+         dstoffset += block_size_bytes;
+       }
+
+      remaining -= block_size_bytes;
+    }
+  
+  /* Copy any whole words left (note these aren't interleaved with any
+     subsequent halfword/byte load/stores in the interests of simplicity).  */
+  
+  words = remaining / UNITS_PER_WORD;
+
+  gcc_assert (words < interleave_factor);
+  
+  if (src_aligned && words > 1)
+    {
+      emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+                                       &srcoffset));
+      src_autoinc += UNITS_PER_WORD * words;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+       {
+         addr = plus_constant (src,
+                               srcoffset + j * UNITS_PER_WORD - src_autoinc);
+         mem = adjust_automodify_address (srcbase, SImode, addr,
+                                          srcoffset + j * UNITS_PER_WORD);
+         emit_insn (gen_unaligned_loadsi (regs[j], mem));
+       }
+      srcoffset += words * UNITS_PER_WORD;
+    }
+
+  if (dst_aligned && words > 1)
+    {
+      emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+                                        &dstoffset));
+      dst_autoinc += words * UNITS_PER_WORD;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+       {
+         addr = plus_constant (dst,
+                               dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+         mem = adjust_automodify_address (dstbase, SImode, addr,
+                                          dstoffset + j * UNITS_PER_WORD);
+         emit_insn (gen_unaligned_storesi (mem, regs[j]));
+       }
+      dstoffset += words * UNITS_PER_WORD;
+    }
+
+  remaining -= words * UNITS_PER_WORD;
+  
+  gcc_assert (remaining < 4);
+  
+  /* Copy a halfword if necessary.  */
+  
+  if (remaining >= 2)
+    {
+      halfword_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+      emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
+
+      /* Either write out immediately, or delay until we've loaded the last
+        byte, depending on interleave factor.  */
+      if (interleave_factor == 1)
+       {
+         addr = plus_constant (dst, dstoffset - dst_autoinc);
+         mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+         emit_insn (gen_unaligned_storehi (mem,
+                      gen_lowpart (HImode, halfword_tmp)));
+         halfword_tmp = NULL;
+         dstoffset += 2;
+       }
+
+      remaining -= 2;
+      srcoffset += 2;
+    }
+  
+  gcc_assert (remaining < 2);
+  
+  /* Copy last byte.  */
+  
+  if ((remaining & 1) != 0)
+    {
+      byte_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+      emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
+
+      if (interleave_factor == 1)
+       {
+         addr = plus_constant (dst, dstoffset - dst_autoinc);
+         mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+         emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+         byte_tmp = NULL;
+         dstoffset++;
+       }
+
+      remaining--;
+      srcoffset++;
+    }
+  
+  /* Store last halfword if we haven't done so already.  */
+  
+  if (halfword_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+      emit_insn (gen_unaligned_storehi (mem,
+                  gen_lowpart (HImode, halfword_tmp)));
+      dstoffset += 2;
+    }
+
+  /* Likewise for last byte.  */
+
+  if (byte_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+      emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+      dstoffset++;
+    }
+  
+  gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+   Helper function for doing a loop-based block operation on memory
+   reference MEM.  Each iteration of the loop will operate on LENGTH
+   bytes of MEM.
+
+   Create a new base register for use within the loop and point it to
+   the start of MEM.  Create a new memory reference that uses this
+   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+                     rtx *loop_mem)
+{
+  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+  
+  /* Although the new mem does not refer to a known location,
+     it does keep up to LENGTH bytes of alignment.  */
+  *loop_mem = change_address (mem, BLKmode, *loop_reg);
+  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+   Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
+   the memory regions do not overlap.  */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+                              unsigned int interleave_factor,
+                              HOST_WIDE_INT bytes_per_iter)
+{
+  rtx label, src_reg, dest_reg, final_src, test;
+  HOST_WIDE_INT leftover;
+  
+  leftover = length % bytes_per_iter;
+  length -= leftover;
+  
+  /* Create registers and memory references for use within the loop.  */
+  arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+  arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+  
+  /* Calculate the value that SRC_REG should have after the last iteration of
+     the loop.  */
+  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+                                  0, 0, OPTAB_WIDEN);
+
+  /* Emit the start of the loop.  */
+  label = gen_label_rtx ();
+  emit_label (label);
+  
+  /* Emit the loop body.  */
+  arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+                                    interleave_factor);
+
+  /* Move on to the next block.  */
+  emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+  emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+  
+  /* Emit the loop condition.  */
+  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+  emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+  
+  /* Mop up any left-over bytes.  */
+  if (leftover)
+    arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+   aligned to a four-byte boundary).  This may need further tuning depending on
+   core type, optimize_size setting, etc.  */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+  HOST_WIDE_INT length = INTVAL (operands[2]);
+  
+  if (optimize_size)
+    {
+      bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+      bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+        size of code if optimizing for size.  We'll use ldm/stm if src_aligned
+        or dst_aligned though: allow more interleaving in those cases since the
+        resulting code can be smaller.  */
+      unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+      HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+      
+      if (length > 12)
+       arm_block_move_unaligned_loop (operands[0], operands[1], length,
+                                      interleave_factor, bytes_per_iter);
+      else
+       arm_block_move_unaligned_straight (operands[0], operands[1], length,
+                                          interleave_factor);
+    }
+  else
+    {
+      /* Note that the loop created by arm_block_move_unaligned_loop may be
+        subject to loop unrolling, which makes tuning this condition a little
+        redundant.  */
+      if (length > 32)
+       arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+      else
+       arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+    }
+  
+  return 1;
+}
+
  int
  arm_gen_movmemqi (rtx *operands)
  {
@@ -10778,8 +11107,13 @@ arm_gen_movmemqi (rtx *operands)
  
    if (GET_CODE (operands[2]) != CONST_INT
        || GET_CODE (operands[3]) != CONST_INT
-      || INTVAL (operands[2]) > 64
-      || INTVAL (operands[3]) & 3)
+      || INTVAL (operands[2]) > 64)
+    return 0;
+
+  if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+    return arm_movmemqi_unaligned (operands);
+
+  if (INTVAL (operands[3]) & 3)
      return 0;
  
    dstbase = operands[0];
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index db9417b9ddbddad45cb2aecc55d2040f7b4bc10a..3216bfcb47a5eb3f451e840bf71a9eb1e0fc0566 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2011-10-18  Julian Brown  <julian@codesourcery.com>
+
+       * lib/target-supports.exp (check_effective_target_arm_unaligned): New.
+       * gcc.target/arm/unaligned-memcpy-1.c: New.
+       * gcc.target/arm/unaligned-memcpy-2.c: New.
+       * gcc.target/arm/unaligned-memcpy-3.c: New.
+       * gcc.target/arm/unaligned-memcpy-4.c: New.
+
  2011-10-18  Janus Weil  <janus@gcc.gnu.org>
  
         PR fortran/47023
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c

new file mode 100644 (file)

index 0000000..c4f5640
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+void unknown_alignment (char *dest, char *src)
+{
+  memcpy (dest, src, 15);
+}
+
+/* We should see three unaligned word loads and store pairs, one unaligned
+   ldrh/strh pair, and an ldrb/strb pair.  Sanity check that.  */
+
+/* { dg-final { scan-assembler-times "@ unaligned" 8 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c

new file mode 100644 (file)

index 0000000..c7d24c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char dest[16];
+
+void aligned_dest (char *src)
+{
+  memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word store for the main part of the copy, but subword
+   loads/stores for the remainder.  */
+
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c

new file mode 100644 (file)

index 0000000..5f04137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+
+void aligned_src (char *dest)
+{
+  memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word load for the main part of the copy, but subword
+   loads/stores for the remainder.  */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c

new file mode 100644 (file)

index 0000000..9995708
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+char dest[16];
+
+void aligned_both (void)
+{
+  memcpy (dest, src, 15);
+}
+
+/* We know both src and dest to be aligned: expect multiword loads/stores.  */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp

index c4077ffabb6c8a87f7589b5f61005471bd6d17e1..f19c3c566c66f32ac0e09c6bdb332f920d17d6bf 100644 (file)
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1973,6 +1973,18 @@ proc check_effective_target_arm_dsp { } {
      }]
  }
  
+# Return 1 if this is an ARM target that supports unaligned word/halfword
+# load/store instructions.
+
+proc check_effective_target_arm_unaligned { } {
+    return [check_no_compiler_messages arm_unaligned assembly {
+       #ifndef __ARM_FEATURE_UNALIGNED
+       #error no unaligned support
+       #endif
+       int i;
+    }]
+}
+
  # Add the options needed for NEON.  We need either -mfloat-abi=softfp
  # or -mfloat-abi=hard, but if one is already specified by the
  # multilib, use it.  Similarly, if a -mfpu option already enables
author	Julian Brown <julian@codesourcery.com>
	Tue, 18 Oct 2011 10:49:44 +0000 (10:49 +0000)
committer	Julian Brown <jules@gcc.gnu.org>
	Tue, 18 Oct 2011 10:49:44 +0000 (10:49 +0000)
gcc/ChangeLog		patch \| blob \| blame \| history
gcc/config/arm/arm.c		patch \| blob \| blame \| history
gcc/testsuite/ChangeLog		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/lib/target-supports.exp		patch \| blob \| blame \| history