The final patch in the series tackles the most complex of this family of patterns, UABAL2 and SABAL2.
These extract the high halves of the sources, perform an absdiff on them, widen the result and accumulate it.
The motivating testcase for this patch (series) is included. The required simplification doesn't actually
trigger with just the RTL pattern change because rtx_costs blocks it.
So this patch also extends the rtx costs to recognise the (minus (smax (x, y)) (smin (x, y))) expression we use
to describe absdiff in the backend and to avoid recursing into its arms.
This allows us to generate the single-instruction sequence expected here.
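To make the operation concrete, here is a scalar C model of what UABAL2 computes on unsigned
bytes; the function and its names are purely illustrative and not part of the patch. SABAL2 is
the signed analogue of the same operation.

/* Take the high halves of the two 16-byte sources, form the element-wise
   absolute difference as max (x, y) - min (x, y), widen it to 16 bits and
   accumulate into the destination.  */
void
uabal2_model (unsigned short acc[8], const unsigned char a[16],
	      const unsigned char b[16])
{
  for (int i = 0; i < 8; i++)
    {
      unsigned char x = a[i + 8], y = b[i + 8];
      unsigned char mx = x > y ? x : y;
      unsigned char mn = x > y ? y : x;
      acc[i] += (unsigned short) (mx - mn);
    }
}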
Bootstrapped and tested on aarch64-none-linux-gnu.
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_<sur>abal2<mode>): Rename to...
(aarch64_<su>abal2<mode>_insn): ... This. Use RTL codes instead of unspec.
(aarch64_<su>abal2<mode>): New define_expand.
* config/aarch64/aarch64.cc (aarch64_abd_rtx_p): New function.
(aarch64_rtx_costs): Handle ABD rtxes.
* config/aarch64/aarch64.md (UNSPEC_SABAL2, UNSPEC_UABAL2): Delete.
* config/aarch64/iterators.md (ABAL2): Delete.
(sur): Remove handling of UNSPEC_UABAL2 and UNSPEC_SABAL2.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vabal_combine.c: New test.
[(set_attr "type" "neon_arith_acc<q>")]
)
-(define_insn "aarch64_<sur>abal2<mode>"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (unspec:<VWIDE> [(match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "register_operand" "w")
- (match_operand:<VWIDE> 1 "register_operand" "0")]
- ABAL2))]
+(define_insn "aarch64_<su>abal2<mode>_insn"
+ [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+ (plus:<VDBLW>
+ (zero_extend:<VDBLW>
+ (minus:<VHALF>
+ (USMAX:<VHALF>
+ (vec_select:<VHALF>
+ (match_operand:VQW 2 "register_operand" "w")
+ (match_operand:VQW 4 "vect_par_cnst_hi_half" ""))
+ (vec_select:<VHALF>
+ (match_operand:VQW 3 "register_operand" "w")
+ (match_dup 4)))
+ (<max_opp>:<VHALF>
+ (vec_select:<VHALF>
+ (match_dup 2)
+ (match_dup 4))
+ (vec_select:<VHALF>
+ (match_dup 3)
+ (match_dup 4)))))
+ (match_operand:<VDBLW> 1 "register_operand" "0")))]
"TARGET_SIMD"
- "<sur>abal2\t%0.<Vwtype>, %2.<Vtype>, %3.<Vtype>"
+ "<su>abal2\t%0.<Vwtype>, %2.<Vtype>, %3.<Vtype>"
[(set_attr "type" "neon_arith_acc<q>")]
)
+(define_expand "aarch64_<su>abal2<mode>"
+ [(match_operand:<VDBLW> 0 "register_operand")
+ (match_operand:<VDBLW> 1 "register_operand")
+ (USMAX:VQW
+ (match_operand:VQW 2 "register_operand")
+ (match_operand:VQW 3 "register_operand"))]
+ "TARGET_SIMD"
+ {
+ rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_<su>abal2<mode>_insn (operands[0], operands[1],
+ operands[2], operands[3], hi));
+ DONE;
+ }
+)
+
(define_insn "aarch64_<sur>adalp<mode>"
[(set (match_operand:<VDBLW> 0 "register_operand" "=w")
(unspec:<VDBLW> [(match_operand:VDQV_L 2 "register_operand" "w")
return (t == (t & -t));
}
+/* Return true if X is an RTX representing an operation in the ABD family
+ of instructions. */
+
+static bool
+aarch64_abd_rtx_p (rtx x)
+{
+ if (GET_CODE (x) != MINUS)
+ return false;
+ rtx max_arm = XEXP (x, 0);
+ rtx min_arm = XEXP (x, 1);
+ if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
+ return false;
+ bool signed_p = GET_CODE (max_arm) == SMAX;
+ if (signed_p && GET_CODE (min_arm) != SMIN)
+ return false;
+ else if (!signed_p && GET_CODE (min_arm) != UMIN)
+ return false;
+
+ rtx maxop0 = XEXP (max_arm, 0);
+ rtx maxop1 = XEXP (max_arm, 1);
+ rtx minop0 = XEXP (min_arm, 0);
+ rtx minop1 = XEXP (min_arm, 1);
+ return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
+}
+
/* Calculate the cost of calculating X, storing it in *COST. Result
is true if the total cost of the operation has now been calculated. */
static bool
cost_minus:
if (VECTOR_MODE_P (mode))
{
- /* SUBL2 and SUBW2. */
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
{
- /* The select-operand-high-half versions of the sub instruction
+ /* Recognise the SABD and UABD operations here.
+ Recursion from the PLUS case will catch the accumulating
+ forms. */
+ if (aarch64_abd_rtx_p (x))
+ {
+ if (speed)
+ *cost += extra_cost->vect.alu;
+ return true;
+ }
+ /* SUBL2 and SUBW2.
+ The select-operand-high-half versions of the sub instruction
have the same cost as the regular three vector version -
don't add the costs of the select into the costs of the sub.
*/
UNSPEC_PRLG_STK
UNSPEC_REV
UNSPEC_RBIT
- UNSPEC_SABAL2
UNSPEC_SADALP
UNSPEC_SCVTF
UNSPEC_SETMEM
UNSPEC_TLSLE24
UNSPEC_TLSLE32
UNSPEC_TLSLE48
- UNSPEC_UABAL2
UNSPEC_UADALP
UNSPEC_UCVTF
UNSPEC_USHL_2S
;; Int Iterators.
;; -------------------------------------------------------------------
-;; The unspec codes for the SABAL2, UABAL2 AdvancedSIMD instructions.
-(define_int_iterator ABAL2 [UNSPEC_SABAL2 UNSPEC_UABAL2])
-
;; The unspec codes for the SADALP, UADALP AdvancedSIMD instructions.
(define_int_iterator ADALP [UNSPEC_SADALP UNSPEC_UADALP])
(UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur")
(UNSPEC_SHSUB "s") (UNSPEC_UHSUB "u")
(UNSPEC_ADDHN "") (UNSPEC_RADDHN "r")
- (UNSPEC_SABAL2 "s") (UNSPEC_UABAL2 "u")
(UNSPEC_SADALP "s") (UNSPEC_UADALP "u")
(UNSPEC_SUBHN "") (UNSPEC_RSUBHN "r")
(UNSPEC_USQADD "us") (UNSPEC_SUQADD "su")
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** test_vabal_s8:
+** sabal2 v0.8h, v2.16b, v1.16b
+** ret
+*/
+int16x8_t
+test_vabal_s8 (int16x8_t sadv, int8x16_t pv, int8x16_t sv)
+{
+ return vabal_s8 (sadv, vget_high_s8 (pv), vget_high_s8 (sv));
+}
+
+/*
+** test_vabal_u8:
+** uabal2 v0.8h, v2.16b, v1.16b
+** ret
+*/
+uint16x8_t
+test_vabal_u8 (uint16x8_t sadv, uint8x16_t pv, uint8x16_t sv)
+{
+ return vabal_u8 (sadv, vget_high_u8 (pv), vget_high_u8 (sv));
+}
+
+/*
+** test_vabal_s16:
+** sabal2 v0.4s, v2.8h, v1.8h
+** ret
+*/
+int32x4_t
+test_vabal_s16 (int32x4_t sadv, int16x8_t pv, int16x8_t sv)
+{
+ return vabal_s16 (sadv, vget_high_s16 (pv), vget_high_s16 (sv));
+}
+
+/*
+** test_vabal_u16:
+** uabal2 v0.4s, v2.8h, v1.8h
+** ret
+*/
+uint32x4_t
+test_vabal_u16 (uint32x4_t sadv, uint16x8_t pv, uint16x8_t sv)
+{
+ return vabal_u16 (sadv, vget_high_u16 (pv), vget_high_u16 (sv));
+}
+
+/*
+** test_vabal_s32:
+** sabal2 v0.2d, v2.4s, v1.4s
+** ret
+*/
+int64x2_t
+test_vabal_s32 (int64x2_t sadv, int32x4_t pv, int32x4_t sv)
+{
+ return vabal_s32 (sadv, vget_high_s32 (pv), vget_high_s32 (sv));
+}
+
+/*
+** test_vabal_u32:
+** uabal2 v0.2d, v2.4s, v1.4s
+** ret
+*/
+uint64x2_t
+test_vabal_u32 (uint64x2_t sadv, uint32x4_t pv, uint32x4_t sv)
+{
+ return vabal_u32 (sadv, vget_high_u32 (pv), vget_high_u32 (sv));
+}
+