aarch64: Don't include vec_select high-half in SIMD subtract cost

author Jonathan Wright <jonathan.wright@arm.com>

Wed, 28 Jul 2021 16:45:36 +0000 (17:45 +0100)

committer Jonathan Wright <jonathan.wright@arm.com>

Thu, 5 Aug 2021 10:52:13 +0000 (11:52 +0100)
author Jonathan Wright <jonathan.wright@arm.com>
Wed, 28 Jul 2021 16:45:36 +0000 (17:45 +0100)
committer Jonathan Wright <jonathan.wright@arm.com>
Thu, 5 Aug 2021 10:52:13 +0000 (11:52 +0100)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index aa687c579468d45091a05cfc55ebbd873fb86630..30f836549c685e7f6cde05e72e3025c2db23b1e7 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
         op1 = XEXP (x, 1);
  
  cost_minus:
+       if (VECTOR_MODE_P (mode))
+         {
+           /* SUBL2 and SUBW2.  */
+           unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+           if (vec_flags & VEC_ADVSIMD)
+             {
+               /* The select-operand-high-half versions of the sub instruction
+                  have the same cost as the regular three vector version -
+                  don't add the costs of the select into the costs of the sub.
+                  */
+               op0 = aarch64_strip_extend_vec_half (op0);
+               op1 = aarch64_strip_extend_vec_half (op1);
+             }
+         }
+
         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
  
         /* Detect valid immediates.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c

new file mode 100644 (file)

index 0000000..09bc7fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_SUBL(rettype, intype, ts, rs) \
+  rettype test_vsubl_ ## ts (intype a, intype b, intype c) \
+       { \
+               rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \
+                                          vget_high_ ## ts (c)); \
+               rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \
+                                          vget_high_ ## ts (c)); \
+               return vaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_SUBL (int16x8_t, int8x16_t, s8, s16)
+TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16)
+TEST_SUBL (int32x4_t, int16x8_t, s16, s32)
+TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32)
+TEST_SUBL (int64x2_t, int32x4_t, s32, s64)
+TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64)
+
+#define TEST_SUBW(rettype, intype, intypel, ts, rs) \
+  rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \
+       { \
+               rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \
+               rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \
+               return vaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16)
+TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16)
+TEST_SUBW (int32x4_t, int32x4_t, int16x8_t, s16, s32)
+TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32)
+TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64)
+TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */
author	Jonathan Wright <jonathan.wright@arm.com>
	Wed, 28 Jul 2021 16:45:36 +0000 (17:45 +0100)
committer	Jonathan Wright <jonathan.wright@arm.com>
	Thu, 5 Aug 2021 10:52:13 +0000 (11:52 +0100)
gcc/config/aarch64/aarch64.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c	[new file with mode: 0644]	patch \| blob