DONE;
})
+(define_expand "widen_ssum<mode><vsi2qi>3"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (sign_extend:VS
+ (match_operand:<VSI2QI> 1 "register_operand"))
+ (match_operand:VS 2 "register_operand")))]
+ "TARGET_DOTPROD"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+ operands[2]));
+ DONE;
+ }
+)
+
+;; Use dot product to perform double widening sum reductions by
+;; changing += a into += (a * 1). i.e. we seed the multiplication with 1.
+(define_expand "widen_usum<mode><vsi2qi>3"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (zero_extend:VS
+ (match_operand:<VSI2QI> 1 "register_operand"))
+ (match_operand:VS 2 "register_operand")))]
+ "TARGET_DOTPROD"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+ operands[2]));
+ DONE;
+ }
+)
+
+;; Use dot product to perform double widening sum reductions by
+;; changing += a into += (a * 1). i.e. we seed the multiplication with 1.
(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** udot v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uaddw2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw }*/
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file