We've received requests to optimise the attached intrinsics testcase.
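For reference, each function in that testcase combines a pairwise widening accumulate intrinsic with an across-vector reduction; foo_1, taken from the attached test, is:
uint64_t
foo_1 (uint16x8_t b)
{
  return vaddlvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
}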
We currently generate:
foo_1:
uaddlp v0.4s, v0.8h
uaddlv d31, v0.4s
fmov x0, d31
ret
foo_2:
uaddlp v0.4s, v0.8h
addv s31, v0.4s
fmov w0, s31
ret
foo_3:
saddlp v0.4s, v0.8h
addv s31, v0.4s
fmov w0, s31
ret
The widening pairwise-addition ADDLP instructions can be omitted if we're just doing an ADDV reduction afterwards.
Making this optimisation would be quite simple if we had a standard RTL PLUS vector reduction code.
As we don't, we can use UNSPEC_ADDV as a stand-in.
This patch expresses the SADDLV and UADDLV instructions as an UNSPEC_ADDV over a widened input, thus removing
the need for separate UNSPEC_SADDLV and UNSPEC_UADDLV codes.
To optimise the testcases involved we add two splitters that match a vector addition where all participating elements
are taken and widened from the same vector and then fed into an UNSPEC_ADDV. In that case we can remove the
vector PLUS and emit the simpler RTL for SADDLV/UADDLV directly.
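With the splitters in place the ADDLP step is folded away and, going by the scan patterns in the new test, we should generate code along the lines of (register numbers may of course differ):
foo_1:
uaddlv s31, v0.8h
fmov x0, d31
ret
foo_2:
uaddlv s31, v0.8h
fmov w0, s31
ret
foo_3:
saddlv s31, v0.8h
fmov w0, s31
ret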
Bootstrapped and tested on aarch64-none-linux-gnu.
gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (aarch64_parallel_select_half_p):
Define prototype.
(aarch64_pars_overlap_p): Likewise.
* config/aarch64/aarch64-simd.md (aarch64_<su>addlv<mode>):
Express in terms of UNSPEC_ADDV.
(*aarch64_<su>addlv<VDQV_L:mode>_ze<GPI:mode>): Likewise.
(*aarch64_<su>addlv<mode>_reduction): Define.
(*aarch64_uaddlv<mode>_reduction_2): Likewise.
* config/aarch64/aarch64.cc (aarch64_parallel_select_half_p): Define.
(aarch64_pars_overlap_p): Likewise.
* config/aarch64/iterators.md (UNSPEC_SADDLV, UNSPEC_UADDLV): Delete.
(VQUADW): New mode attribute.
(VWIDE2X_S): Likewise.
(USADDLV): Delete.
(su): Delete handling of UNSPEC_SADDLV, UNSPEC_UADDLV.
* config/aarch64/predicates.md (vect_par_cnst_select_half): Define.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/addlv_1.c: New test.
bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
bool high);
+bool aarch64_parallel_select_half_p (machine_mode, rtx);
+bool aarch64_pars_overlap_p (rtx, rtx);
bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
DONE;
})
+;; SADDLV and UADDLV can be expressed as an ADDV instruction that first
+;; sign or zero-extends its elements.
(define_insn "aarch64_<su>addlv<mode>"
[(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
- (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
- USADDLV))]
+ (unspec:<VWIDE_S>
+ [(ANY_EXTEND:<V2XWIDE>
+ (match_operand:VDQV_L 1 "register_operand" "w"))]
+ UNSPEC_ADDV))]
"TARGET_SIMD"
"<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
[(set_attr "type" "neon_reduc_add<q>")]
)
+;; An ADDV over a vector PLUS of elements extracted and widened all from the
+;; same vector is the same as an [SU]ADDLV above, so long as all the elements
+;; of that vector are used. We can greatly simplify the RTL expression using
+;; this splitter.
+(define_insn_and_split "*aarch64_<su>addlv<mode>_reduction"
+ [(set (match_operand:<VWIDE_S> 0 "register_operand")
+ (unspec:<VWIDE_S>
+ [(plus:<VDBLW>
+ (vec_select:<VDBLW>
+ (ANY_EXTEND:<V2XWIDE>
+ (match_operand:VDQV_L 1 "register_operand"))
+ (match_operand:<V2XWIDE> 2 "vect_par_cnst_select_half"))
+ (vec_select:<VDBLW> (ANY_EXTEND:<V2XWIDE> (match_dup 1))
+ (match_operand:<V2XWIDE> 3 "vect_par_cnst_select_half")))]
+ UNSPEC_ADDV))]
+ "TARGET_SIMD && !aarch64_pars_overlap_p (operands[2], operands[3])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:<VWIDE_S>
+ [(ANY_EXTEND:<V2XWIDE>
+ (match_dup 1))]
+ UNSPEC_ADDV))]
+ {}
+)
+
+;; Similar to the above but for two-step zero-widening reductions.
+;; We can push the outer zero_extend outside the ADDV unspec and make
+;; use of the implicit high-part zeroing semantics of UADDLV to do it all
+;; in a single instruction.
+(define_insn_and_split "*aarch64_uaddlv<mode>_reduction_2"
+ [(set (match_operand:<VWIDE2X_S> 0 "register_operand" "=w")
+ (unspec:<VWIDE2X_S>
+ [(zero_extend:<VQUADW>
+ (plus:<VDBLW>
+ (vec_select:<VDBLW>
+ (zero_extend:<V2XWIDE>
+ (match_operand:VDQQH 1 "register_operand" "w"))
+ (match_operand:<V2XWIDE> 2 "vect_par_cnst_select_half"))
+ (vec_select:<VDBLW> (zero_extend:<V2XWIDE> (match_dup 1))
+ (match_operand:<V2XWIDE> 3 "vect_par_cnst_select_half"))))]
+ UNSPEC_ADDV))]
+ "TARGET_SIMD && !aarch64_pars_overlap_p (operands[2], operands[3])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (zero_extend:<VWIDE2X_S>
+ (unspec:<VWIDE_S>
+ [(zero_extend:<V2XWIDE>
+ (match_dup 1))]
+ UNSPEC_ADDV)))]
+ {}
+)
+
;; Zero-extending version of the above. As these intrinsics produce a scalar
;; value that may be used by further intrinsics we want to avoid moving the
;; result into GP regs to do a zero-extension that ADDLV/ADDLP gives for free.
(define_insn "*aarch64_<su>addlv<VDQV_L:mode>_ze<GPI:mode>"
[(set (match_operand:GPI 0 "register_operand" "=w")
(zero_extend:GPI
- (unspec:<VWIDE_S>
- [(match_operand:VDQV_L 1 "register_operand" "w")]
- USADDLV)))]
+ (unspec:<VWIDE_S>
+ [(ANY_EXTEND:<VDQV_L:V2XWIDE>
+ (match_operand:VDQV_L 1 "register_operand" "w"))]
+ UNSPEC_ADDV)))]
"TARGET_SIMD
&& (GET_MODE_SIZE (<GPI:MODE>mode) > GET_MODE_SIZE (<VWIDE_S>mode))"
"<su>addl<VDQV_L:vp>\\t%<VDQV_L:Vwstype>0<VDQV_L:Vwsuf>, %1.<VDQV_L:Vtype>"
reg_alloc_order[i] = i;
}
+/* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
+ of vector mode MODE to select half the elements of that vector.
+ Allow any combination of indices except duplicates (or out of range of
+ the mode units). */
+
+bool
+aarch64_parallel_select_half_p (machine_mode mode, rtx par)
+{
+ int nunits = XVECLEN (par, 0);
+ if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
+ return false;
+ int mode_nunits = nunits * 2;
+ /* Put all the elements of PAR into a hash_set and use its
+ uniqueness guarantees to check that we don't try to insert the same
+ element twice. */
+ hash_set<rtx> parset;
+ for (int i = 0; i < nunits; ++i)
+ {
+ rtx elt = XVECEXP (par, 0, i);
+ if (!CONST_INT_P (elt)
+ || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
+ || parset.add (elt))
+ return false;
+ }
+ return true;
+}
+
+/* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
+ contain any common elements. */
+
+bool
+aarch64_pars_overlap_p (rtx par1, rtx par2)
+{
+ int len1 = XVECLEN (par1, 0);
+ int len2 = XVECLEN (par2, 0);
+ hash_set<rtx> parset;
+ for (int i = 0; i < len1; ++i)
+ parset.add (XVECEXP (par1, 0, i));
+ for (int i = 0; i < len2; ++i)
+ if (parset.contains (XVECEXP (par2, 0, i)))
+ return true;
+ return false;
+}
+
/* Target-specific selftests. */
#if CHECKING_P
UNSPEC_FMINV ; Used in aarch64-simd.md.
UNSPEC_FADDV ; Used in aarch64-simd.md.
UNSPEC_ADDV ; Used in aarch64-simd.md.
- UNSPEC_SADDLV ; Used in aarch64-simd.md.
- UNSPEC_UADDLV ; Used in aarch64-simd.md.
UNSPEC_SMAXV ; Used in aarch64-simd.md.
UNSPEC_SMINV ; Used in aarch64-simd.md.
UNSPEC_UMAXV ; Used in aarch64-simd.md.
(V4HI "V2SI") (V8HI "V4SI")
(V2SI "DI") (V4SI "V2DI")])
+(define_mode_attr VQUADW [(V8QI "V4SI") (V16QI "V8SI")
+ (V4HI "V2DI") (V8HI "V4DI")])
+
;; Narrowed modes for VDN.
(define_mode_attr VNARROWD [(V4HI "V8QI") (V2SI "V4HI")
(DI "V2SI")])
(V2SI "DI") (V16QI "HI")
(V8HI "SI") (V4SI "DI")])
+(define_mode_attr VWIDE2X_S [(V8QI "SI") (V4HI "DI")
+ (V16QI "SI") (V8HI "DI")])
+
;; Widened mode with half the element register suffixes for VD_BHSI/VQW/VQ_HSF.
(define_mode_attr Vwhalf [(V8QI "4h") (V4HI "2s")
(V2SI "1d") (V16QI "8h")
(define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV])
-(define_int_iterator USADDLV [UNSPEC_SADDLV UNSPEC_UADDLV])
-
(define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
(define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
;; "s" for signed operations and "u" for unsigned ones.
(define_int_attr su [(UNSPEC_SADDV "s")
(UNSPEC_UADDV "u")
- (UNSPEC_SADDLV "s")
- (UNSPEC_UADDLV "u")
(UNSPEC_UNPACKSHI "s")
(UNSPEC_UNPACKUHI "u")
(UNSPEC_UNPACKSLO "s")
&& aarch64_stepped_int_parallel_p (op, 2);
})
+;; PARALLEL for a vec_select that selects half the elements in a vector of
+;; MODE. Allows any combination of elements, as long as there are no
+;; duplicate entries.
+(define_special_predicate "vect_par_cnst_select_half"
+ (match_code "parallel")
+{
+ return aarch64_parallel_select_half_p (mode, op);
+})
+
(define_predicate "descending_int_parallel"
(match_code "parallel")
{
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_neon.h>
+
+/*
+** foo_1:
+** uaddlv s([0-9]+), v0.8h
+** fmov x0, d\1
+** ret
+*/
+
+uint64_t
+foo_1 (uint16x8_t b)
+{
+ return vaddlvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
+}
+
+/*
+** foo_2:
+** uaddlv s([0-9]+), v0.8h
+** fmov w0, s\1
+** ret
+*/
+
+uint32_t
+foo_2 (uint16x8_t b)
+{
+ return vaddvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
+}
+
+/*
+** foo_3:
+** saddlv s([0-9]+), v0.8h
+** fmov w0, s\1
+** ret
+*/
+
+int32_t
+foo_3 (int16x8_t b)
+{
+ return vaddvq_s32 (vpadalq_s16 (vdupq_n_s32 (0), b));
+}