git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Don't include vec_select high-half in SIMD subtract cost
authorJonathan Wright <jonathan.wright@arm.com>
Wed, 28 Jul 2021 16:45:36 +0000 (17:45 +0100)
committerJonathan Wright <jonathan.wright@arm.com>
Thu, 5 Aug 2021 10:52:13 +0000 (11:52 +0100)
The Neon subtract-long/subtract-widen instructions can select the top
or bottom half of the operand registers. This selection does not
change the cost of the underlying instruction and this should be
reflected by the RTL cost function.

This patch adds RTL tree traversal in the Neon subtract cost function
to match vec_select high-half of its operands. This traversal
prevents the cost of the vec_select from being added into the cost of
the subtract - meaning that these instructions can now be emitted in
the combine pass as they are no longer deemed prohibitively
expensive.

gcc/ChangeLog:

2021-07-28  Jonathan Wright  <jonathan.wright@arm.com>

* config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
of vec_select high-half from being added into Neon subtract
cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vsubX_high_cost.c: New test.

gcc/config/aarch64/aarch64.c
gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c [new file with mode: 0644]

index aa687c579468d45091a05cfc55ebbd873fb86630..30f836549c685e7f6cde05e72e3025c2db23b1e7 100644 (file)
@@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
        op1 = XEXP (x, 1);
 
 cost_minus:
+       if (VECTOR_MODE_P (mode))
+         {
+           /* SUBL2 and SUBW2.  */
+           unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+           if (vec_flags & VEC_ADVSIMD)
+             {
+               /* The select-operand-high-half versions of the sub instruction
+                  have the same cost as the regular three vector version -
+                  don't add the costs of the select into the costs of the sub.
+                  */
+               op0 = aarch64_strip_extend_vec_half (op0);
+               op1 = aarch64_strip_extend_vec_half (op1);
+             }
+         }
+
        *cost += rtx_cost (op0, mode, MINUS, 0, speed);
 
        /* Detect valid immediates.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c
new file mode 100644 (file)
index 0000000..09bc7fc
--- /dev/null
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_SUBL(rettype, intype, ts, rs) \
+  rettype test_vsubl_ ## ts (intype a, intype b, intype c) \
+       { \
+               rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \
+                                          vget_high_ ## ts (c)); \
+               rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \
+                                          vget_high_ ## ts (c)); \
+               return vaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_SUBL (int16x8_t, int8x16_t, s8, s16)
+TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16)
+TEST_SUBL (int32x4_t, int16x8_t, s16, s32)
+TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32)
+TEST_SUBL (int64x2_t, int32x4_t, s32, s64)
+TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64)
+
+#define TEST_SUBW(rettype, intype, intypel, ts, rs) \
+  rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \
+       { \
+               rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \
+               rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \
+               return vaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16)
+TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16)
+TEST_SUBW (int32x4_t, int32x4_t, int16x8_t, s16, s32)
+TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32)
+TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64)
+TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */