[ARM] PR68532: Fix up vuzp for big endian

author cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>

Tue, 9 Feb 2016 18:47:55 +0000 (18:47 +0000)

committer cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>

Tue, 9 Feb 2016 18:47:55 +0000 (18:47 +0000)
author cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>
Tue, 9 Feb 2016 18:47:55 +0000 (18:47 +0000)
committer cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>
Tue, 9 Feb 2016 18:47:55 +0000 (18:47 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index c4711f35c03e67f818b3375fb2fc8001e5f833e3..28029a8c05b24cff66299bddf12e99666f6e33c3 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2016-02-09  Charles Baylis  <charles.baylis@linaro.org>
+
+       PR target/68532
+       * config/arm/arm.c (neon_endian_lane_map): New function.
+       (neon_vector_pair_endian_lane_map): New function.
+       (arm_evpc_neon_vuzp): Allow for big endian lane order.
+       * config/arm/arm_neon.h (vuzpq_s8): Adjust shuffle patterns for big
+       endian.
+       (vuzpq_s16): Likewise.
+       (vuzpq_s32): Likewise.
+       (vuzpq_f32): Likewise.
+       (vuzpq_u8): Likewise.
+       (vuzpq_u16): Likewise.
+       (vuzpq_u32): Likewise.
+       (vuzpq_p8): Likewise.
+       (vuzpq_p16): Likewise.
+
  2016-02-11  Alexandre Oliva  <aoliva@redhat.com>
  
         PR target/69634
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index 2b82e13513279cea8cae6c3ccc3cd29b08ae72e0..ab8cfff3add173d0ff3aad805cff4ed61908785c 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28203,6 +28203,37 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
    arm_expand_vec_perm_1 (target, op0, op1, sel);
  }
  
+/* Map lane ordering between architectural lane order, and GCC lane order,
+   taking into account ABI.  See comment above output_move_neon for details.  */
+
+static int
+neon_endian_lane_map (machine_mode mode, int lane)
+{
+  if (BYTES_BIG_ENDIAN)
+  {
+    int nelems = GET_MODE_NUNITS (mode);
+    /* Reverse lane order.  */
+    lane = (nelems - 1 - lane);
+    /* Reverse D register order, to match ABI.  */
+    if (GET_MODE_SIZE (mode) == 16)
+      lane = lane ^ (nelems / 2);
+  }
+  return lane;
+}
+
+/* Some permutations index into pairs of vectors, this is a helper function
+   to map indexes into those pairs of vectors.  */
+
+static int
+neon_pair_endian_lane_map (machine_mode mode, int lane)
+{
+  int nelem = GET_MODE_NUNITS (mode);
+  if (BYTES_BIG_ENDIAN)
+    lane =
+      neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
+  return lane;
+}
+
  /* Generate or test for an insn that supports a constant permutation.  */
  
  /* Recognize patterns for the VUZP insns.  */
@@ -28213,14 +28244,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
    unsigned int i, odd, mask, nelt = d->nelt;
    rtx out0, out1, in0, in1;
    rtx (*gen)(rtx, rtx, rtx, rtx);
+  int first_elem;
+  int swap_nelt;
  
    if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
      return false;
  
-  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
-  if (d->perm[0] == 0)
+  /* arm_expand_vec_perm_const_1 () helpfully swaps the operands for the
+     big endian pattern on 64 bit vectors, so we correct for that.  */
+  swap_nelt = BYTES_BIG_ENDIAN && !d->one_vector_p
+    && GET_MODE_SIZE (d->vmode) == 8 ? d->nelt : 0;
+
+  first_elem = d->perm[neon_endian_lane_map (d->vmode, 0)] ^ swap_nelt;
+
+  if (first_elem == neon_endian_lane_map (d->vmode, 0))
      odd = 0;
-  else if (d->perm[0] == 1)
+  else if (first_elem == neon_endian_lane_map (d->vmode, 1))
      odd = 1;
    else
      return false;
@@ -28228,8 +28267,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
  
    for (i = 0; i < nelt; i++)
      {
-      unsigned elt = (i * 2 + odd) & mask;
-      if (d->perm[i] != elt)
+      unsigned elt =
+       (neon_pair_endian_lane_map (d->vmode, i) * 2 + odd) & mask;
+      if ((d->perm[i] ^ swap_nelt) != neon_pair_endian_lane_map (d->vmode, elt))
         return false;
      }
  
@@ -28253,11 +28293,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
  
    in0 = d->op0;
    in1 = d->op1;
-  if (BYTES_BIG_ENDIAN)
-    {
-      std::swap (in0, in1);
-      odd = !odd;
-    }
+  if (swap_nelt != 0)
+    std::swap (in0, in1);
  
    out0 = d->target;
    out1 = gen_reg_rtx (d->vmode);
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h

index 41e23476ad00859f16e46f2333ad15b31e211a20..e807ad1f2d3e6eb073a1ca54611e34cdd8bffdfb 100644 (file)
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8741,9 +8741,9 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
    int8x16x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
-      { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+      { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
-      { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+      { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
        { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8759,9 +8759,9 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
    int16x8x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
-      { 9, 11, 13, 15, 1, 3, 5, 7 });
+      { 5, 7, 1, 3, 13, 15, 9, 11 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
-      { 8, 10, 12, 14, 0, 2, 4, 6 });
+      { 4, 6, 0, 2, 12, 14, 8, 10 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
        { 0, 2, 4, 6, 8, 10, 12, 14 });
@@ -8776,8 +8776,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
  {
    int32x4x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
-  __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
-  __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8790,8 +8790,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
  {
    float32x4x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
-  __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
-  __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8805,9 +8805,9 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
    uint8x16x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
-      { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+      { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
-      { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+      { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
        { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8823,9 +8823,9 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
    uint16x8x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
-      { 9, 11, 13, 15, 1, 3, 5, 7 });
+      { 5, 7, 1, 3, 13, 15, 9, 11 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
-      { 8, 10, 12, 14, 0, 2, 4, 6 });
+      { 4, 6, 0, 2, 12, 14, 8, 10 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
        { 0, 2, 4, 6, 8, 10, 12, 14 });
@@ -8840,8 +8840,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
  {
    uint32x4x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
-  __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
-  __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
+  __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
+  __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
@@ -8855,9 +8855,9 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
    poly8x16x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
-      { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
+      { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
-      { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
+      { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
        { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
@@ -8873,9 +8873,9 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
    poly16x8x2_t __rv;
  #ifdef __ARM_BIG_ENDIAN
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
-      { 9, 11, 13, 15, 1, 3, 5, 7 });
+      { 5, 7, 1, 3, 13, 15, 9, 11 });
    __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
-      { 8, 10, 12, 14, 0, 2, 4, 6 });
+      { 4, 6, 0, 2, 12, 14, 8, 10 });
  #else
    __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
        { 0, 2, 4, 6, 8, 10, 12, 14 });
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 5fab7cb797d08a3ce8731f7b4115d3c8b8e38a7f..246f0caa20c0d7f6f75a8169dd5e3bb517ebe121 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2016-02-09  Charles Baylis  <charles.baylis@linaro.org>
+
+       PR target/68532
+       * gcc.c-torture/execute/pr68532.c: New test.
+
  2016-02-11  Alexandre Oliva  <aoliva@redhat.com>
  
         PR target/69634
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr68532.c b/gcc/testsuite/gcc.c-torture/execute/pr68532.c

new file mode 100644 (file)

index 0000000..5d4bd8e
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
@@ -0,0 +1,22 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#define SIZE 128
+unsigned short _Alignas (16) in[SIZE];
+
+__attribute__ ((noinline)) int
+test (unsigned short sum, unsigned short *in, int x)
+{
+  for (int j = 0; j < SIZE; j += 8)
+    sum += in[j] * x;
+  return sum;
+}
+
+int
+main ()
+{
+  for (int i = 0; i < SIZE; i++)
+    in[i] = i;
+  if (test (0, in, 1) != 960)
+    __builtin_abort ();
+  return 0;
+}
author	cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>
	Tue, 9 Feb 2016 18:47:55 +0000 (18:47 +0000)
committer	cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>
	Tue, 9 Feb 2016 18:47:55 +0000 (18:47 +0000)
gcc/ChangeLog		patch \| blob \| blame \| history
gcc/config/arm/arm.c		patch \| blob \| blame \| history
gcc/config/arm/arm_neon.h		patch \| blob \| blame \| history
gcc/testsuite/ChangeLog		patch \| blob \| blame \| history
gcc/testsuite/gcc.c-torture/execute/pr68532.c	[new file with mode: 0644]	patch \| blob