@tindex IFN_VEC_WIDEN_MINUS_LO
@tindex IFN_VEC_WIDEN_MINUS_EVEN
@tindex IFN_VEC_WIDEN_MINUS_ODD
+@tindex IFN_VEC_TRUNC_ADD_HIGH
@tindex VEC_UNPACK_HI_EXPR
@tindex VEC_UNPACK_LO_EXPR
@tindex VEC_UNPACK_FLOAT_HI_EXPR
vector are subtracted from the odd @code{N/2} of the first to produce the
vector of @code{N/2} subtractions.
+@item IFN_VEC_TRUNC_ADD_HIGH
+This internal function performs an element-wise addition of two input
+vectors, then extracts the most significant half of each result element,
+narrowing it to half the original element width.
+
+Concretely, it computes:
+@code{(bits(a)/2)((a + b) >> bits(a)/2)}
+
+where @code{bits(a)} is the width in bits of each input element and the
+leading @code{(bits(a)/2)} denotes truncation of the shifted sum to that
+narrower width.
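+
+As a minimal illustrative sketch (not part of the formal definition), each
+lane of the result can be thought of as the following scalar computation,
+here assuming 32-bit input elements and a hypothetical helper name:
+
+@smallexample
+/* One lane: add the two inputs modulo 2^32, then keep the most
+   significant 16 bits of the wrapped sum as a 16-bit result.  */
+uint16_t
+lane_trunc_add_high (uint32_t a, uint32_t b)
+@{
+  return (uint16_t) ((uint32_t) (a + b) >> 16);
+@}
+@end smallexample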
+
+Its operands are vectors containing the same number of elements (@code{N})
+of the same integral type. The result is a vector of length @code{N}, with
+elements of an integral type whose size is half that of the input element
+type.
+
+This operation is currently only used for early break result compression when
+the result of a vector boolean can be represented as 0 or -1.
+
@item VEC_UNPACK_HI_EXPR
@itemx VEC_UNPACK_LO_EXPR
These nodes represent unpacking of the high and low parts of the input vector,
difference between operands 1 and 2 and widen the resulting elements.
Put the N/2 results of size 2*S in the output vector (operand 0).
+@cindex @code{vec_trunc_add_high@var{m}} instruction pattern
+@item @samp{vec_trunc_add_high@var{m}}
+Perform signed or unsigned addition of two input integer vectors of mode
+@var{m}, then extract the most significant half of each result element and
+narrow it to half the original element width.
+
+Concretely, it computes:
+@code{(bits(a)/2)((a + b) >> bits(a)/2)}
+
+where @code{bits(a)} is the width in bits of each input element and the
+leading @code{(bits(a)/2)} denotes truncation of the shifted sum to that
+narrower width.
+
+Operands 1 and 2 are integer vectors of mode @var{m} containing the same
+number of signed or unsigned integral elements.  The result (operand
+@code{0}) has an integer vector mode with the same number of elements, but
+with elements half the width of those of mode @var{m}.
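+
+For instance (an illustrative choice of mode, not a requirement of the
+pattern), a target providing @samp{vec_trunc_add_highv4si} would take two
+@code{V4SI} operands and produce a @code{V4HI} result, each output element
+holding the most significant 16 bits of the corresponding 32-bit sum.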
+
+This operation is currently only used for early break result compression when
+the result of a vector boolean can be represented as 0 or -1.
+
@cindex @code{vec_addsub@var{m}3} instruction pattern
@item @samp{vec_addsub@var{m}3}
Alternating subtract, add with even lanes doing subtract and odd
DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
DEF_INTERNAL_OPTAB_FN (VEC_ADDSUB, ECF_CONST, vec_addsub, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_TRUNC_ADD_HIGH, ECF_CONST | ECF_NOTHROW,
+                       vec_trunc_add_high, binary)
DEF_INTERNAL_WIDENING_OPTAB_FN (VEC_WIDEN_PLUS,
                                ECF_CONST | ECF_NOTHROW,
                                first,
OPTAB_D (vec_widen_uabd_lo_optab, "vec_widen_uabd_lo_$a")
OPTAB_D (vec_widen_uabd_odd_optab, "vec_widen_uabd_odd_$a")
OPTAB_D (vec_widen_uabd_even_optab, "vec_widen_uabd_even_$a")
+OPTAB_D (vec_trunc_add_high_optab, "vec_trunc_add_high$a")
OPTAB_D (vec_addsub_optab, "vec_addsub$a3")
OPTAB_D (vec_fmaddsub_optab, "vec_fmaddsub$a4")
OPTAB_D (vec_fmsubadd_optab, "vec_fmsubadd$a4")
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE int
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/
+
+int foo ()
+{
+#pragma GCC unroll 8
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE long long
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/
+
+int foo ()
+{
+#pragma GCC unroll 4
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE short
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/
+
+int foo ()
+{
+#pragma GCC unroll 16
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+
+#define TYPE char
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+int foo ()
+{
+#pragma GCC unroll 32
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "VEC_TRUNC_ADD_HIGH" "vect" } } */
  gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
  gcond *cond_stmt = as_a <gcond *>(orig_stmt);
-  tree cst = build_zero_cst (vectype);
+  tree vectype_out = vectype;
  auto bb = gimple_bb (cond_stmt);
  edge exit_true_edge = EDGE_SUCC (bb, 0);
  if (exit_true_edge->flags & EDGE_FALSE_VALUE)
  bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
                                        exit_true_edge->dest);
+  /* See if we support ADDHN and use that for the reduction.  */
+  internal_fn ifn = IFN_VEC_TRUNC_ADD_HIGH;
+  bool addhn_supported_p
+    = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_BOTH);
+  tree narrow_type = NULL_TREE;
+  if (addhn_supported_p)
+    {
+      /* Calculate the narrowing type for the result.  */
+      auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2;
+      auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype));
+      tree itype = build_nonstandard_integer_type (halfprec, unsignedp);
+      tree tmp_type = build_vector_type (itype, TYPE_VECTOR_SUBPARTS (vectype));
+      narrow_type = truth_type_for (tmp_type);
+
+      if (direct_optab_handler (cbranch_optab, TYPE_MODE (narrow_type))
+          == CODE_FOR_nothing)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "can't use ADDHN reduction because cbranch for "
+                             "the narrowed type is not supported by the "
+                             "target.\n");
+          addhn_supported_p = false;
+        }
+    }
+
  /* Analyze only.  */
  if (cost_vec)
    {
-      if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
+      if (!addhn_supported_p
+          && direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
      while (workset.length () > 1)
        {
-          new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
          tree arg0 = workset.pop ();
          tree arg1 = workset.pop ();
-          new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
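+          /* On the final reduction step, use the narrowing high-half addition
+             to compress the reduction result when the target supports it.  */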
+          if (addhn_supported_p && workset.length () == 0)
+            {
+              new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1);
+              vectype_out = narrow_type;
+              new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc");
+              gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp);
+              gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true);
+            }
+          else
+            {
+              new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc");
+              new_stmt
+                = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
+            }
          vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
                                       &cond_gsi);
          workset.quick_insert (0, new_temp);
  gcc_assert (new_temp);
+  tree cst = build_zero_cst (vectype_out);
  gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst);
  update_stmt (orig_stmt);