@tindex IFN_VEC_WIDEN_MINUS_LO
@tindex IFN_VEC_WIDEN_MINUS_EVEN
@tindex IFN_VEC_WIDEN_MINUS_ODD
+@tindex IFN_VEC_TRUNC_ADD_HIGH
@tindex VEC_UNPACK_HI_EXPR
@tindex VEC_UNPACK_LO_EXPR
@tindex VEC_UNPACK_FLOAT_HI_EXPR
vector are subtracted from the odd @code{N/2} of the first to produce the
vector of @code{N/2} subtractions.
+@item IFN_VEC_TRUNC_ADD_HIGH
+This internal function performs an element-wise addition of two input
+vectors, then extracts the most significant half of each result element,
+narrowing it to half the original element width.
+
+Concretely, it computes:
+@code{(bits(a)/2)((a + b) >> bits(a)/2)}
+
+where @code{bits(a)} is the width in bits of each input element and the
+leading @code{(bits(a)/2)} denotes truncation of the shifted sum to that
+narrower width.
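+
+As a minimal illustrative sketch (not part of the formal definition), each
+lane of the result can be thought of as the following scalar computation,
+here assuming 32-bit input elements and a hypothetical helper name:
+
+@smallexample
+/* One lane: add the two inputs modulo 2^32, then keep the most
+   significant 16 bits of the wrapped sum as a 16-bit result.  */
+uint16_t
+lane_trunc_add_high (uint32_t a, uint32_t b)
+@{
+  return (uint16_t) ((uint32_t) (a + b) >> 16);
+@}
+@end smallexample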
+
+Its operands are vectors containing the same number of elements (@code{N})
+of the same integral type. The result is a vector of length @code{N}, with
+elements of an integral type whose size is half that of the input element
+type.
+
+This operation is currently only used for early break result compression when
+the result of a vector boolean can be represented as 0 or -1.
+
@item VEC_UNPACK_HI_EXPR
@itemx VEC_UNPACK_LO_EXPR
These nodes represent unpacking of the high and low parts of the input vector,
difference between operands 1 and 2 and widen the resulting elements.
Put the N/2 results of size 2*S in the output vector (operand 0).
+@cindex @code{vec_trunc_add_high@var{m}} instruction pattern
+@item @samp{vec_trunc_add_high@var{m}}
+Perform signed or unsigned addition of two input integer vectors of mode
+@var{m}, then extract the most significant half of each result element and
+narrow it to half the original element width.
+
+Concretely, it computes:
+@code{(bits(a)/2)((a + b) >> bits(a)/2)}
+
+where @code{bits(a)} is the width in bits of each input element and the
+leading @code{(bits(a)/2)} denotes truncation of the shifted sum to that
+narrower width.
+
+Operands 1 and 2 are integer vectors of mode @var{m} containing the same
+number of signed or unsigned integral elements.  The result (operand
+@code{0}) has an integer vector mode with the same number of elements, but
+with elements half the width of those of mode @var{m}.
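+
+For instance (an illustrative choice of mode, not a requirement of the
+pattern), a target providing @samp{vec_trunc_add_highv4si} would take two
+@code{V4SI} operands and produce a @code{V4HI} result, each output element
+holding the most significant 16 bits of the corresponding 32-bit sum.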
+
+This operation is currently only used for early break result compression when
+the result of a vector boolean can be represented as 0 or -1.
+
@cindex @code{vec_addsub@var{m}3} instruction pattern
@item @samp{vec_addsub@var{m}3}
Alternating subtract, add with even lanes doing subtract and odd
DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
DEF_INTERNAL_OPTAB_FN (VEC_ADDSUB, ECF_CONST, vec_addsub, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_TRUNC_ADD_HIGH, ECF_CONST | ECF_NOTHROW,
+                       vec_trunc_add_high, binary)
DEF_INTERNAL_WIDENING_OPTAB_FN (VEC_WIDEN_PLUS,
                                ECF_CONST | ECF_NOTHROW,
                                first,
OPTAB_D (vec_widen_uabd_lo_optab, "vec_widen_uabd_lo_$a")
OPTAB_D (vec_widen_uabd_odd_optab, "vec_widen_uabd_odd_$a")
OPTAB_D (vec_widen_uabd_even_optab, "vec_widen_uabd_even_$a")
+OPTAB_D (vec_trunc_add_high_optab, "vec_trunc_add_high$a")
OPTAB_D (vec_addsub_optab, "vec_addsub$a3")
OPTAB_D (vec_fmaddsub_optab, "vec_fmaddsub$a4")
OPTAB_D (vec_fmsubadd_optab, "vec_fmsubadd$a4")
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE int
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/
+
+int foo ()
+{
+#pragma GCC unroll 8
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE long long
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/
+
+int foo ()
+{
+#pragma GCC unroll 4
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE short
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/
+
+int foo ()
+{
+#pragma GCC unroll 16
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+
+#define TYPE char
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+int foo ()
+{
+#pragma GCC unroll 32
+ for (int i = 0; i < N; i++)
+ if (a[i] == 124)
+ return 1;
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "VEC_TRUNC_ADD_HIGH" "vect" } } */
  gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
  gcond *cond_stmt = as_a <gcond *>(orig_stmt);
-  tree cst = build_zero_cst (vectype);
+  tree vectype_out = vectype;
  auto bb = gimple_bb (cond_stmt);
  edge exit_true_edge = EDGE_SUCC (bb, 0);
  if (exit_true_edge->flags & EDGE_FALSE_VALUE)
  bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
                                        exit_true_edge->dest);
+  /* See if we support ADDHN and use that for the reduction.  */
+  internal_fn ifn = IFN_VEC_TRUNC_ADD_HIGH;
+  bool addhn_supported_p
+    = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_BOTH);
+  tree narrow_type = NULL_TREE;
+  if (addhn_supported_p)
+    {
+      /* Calculate the narrowing type for the result.  */
+      auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2;
+      auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype));
+      tree itype = build_nonstandard_integer_type (halfprec, unsignedp);
+      tree tmp_type = build_vector_type (itype, TYPE_VECTOR_SUBPARTS (vectype));
+      narrow_type = truth_type_for (tmp_type);
+
+      if (direct_optab_handler (cbranch_optab, TYPE_MODE (narrow_type))
+          == CODE_FOR_nothing)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "can't use ADDHN reduction because cbranch for "
+                             "the narrowed type is not supported by the "
+                             "target.\n");
+          addhn_supported_p = false;
+        }
+    }
+
  /* Analyze only.  */
  if (cost_vec)
    {
-      if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
+      if (!addhn_supported_p
+          && direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
      while (workset.length () > 1)
        {
-          new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
          tree arg0 = workset.pop ();
          tree arg1 = workset.pop ();
-          new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
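+          /* On the final reduction step, use the narrowing high-half addition
+             to compress the reduction result when the target supports it.  */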
+          if (addhn_supported_p && workset.length () == 0)
+            {
+              new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1);
+              vectype_out = narrow_type;
+              new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc");
+              gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp);
+              gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true);
+            }
+          else
+            {
+              new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc");
+              new_stmt
+                = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
+            }
          vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
                                       &cond_gsi);
          workset.quick_insert (0, new_temp);
  gcc_assert (new_temp);
+  tree cst = build_zero_cst (vectype_out);
  gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst);
  update_stmt (orig_stmt);