From: Tamar Christina <tamar.christina@arm.com>
Date: Wed, 14 Jul 2021 14:19:32 +0000 (+0100)
Subject: AArch64: Add support for sign differing dot-product usdot for NEON and SVE.
X-Git-Tag: basepoints/gcc-13~6015
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=752045ed1eea0eddc48923df78999dab7f2827ba;p=thirdparty%2Fgcc.git

AArch64: Add support for sign differing dot-product usdot for NEON and SVE.

Hi All,

This adds optabs implementing usdot_prod.

The following testcase:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      int av = a[i];
      int bv = b[i];
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

Generates for NEON

f:
        movi    v0.4s, 0
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     q1, [x2, x3]
        ldr     q2, [x1, x3]
        usdot   v0.4s, v1.16b, v2.16b
        add     x3, x3, 16
        cmp     x3, 480
        bne     .L2
        addv    s0, v0.4s
        fmov    w1, s0
        add     w0, w0, w1
        ret

and for SVE

f:
        mov     x3, 0
        cntb    x5
        mov     w4, 480
        mov     z1.b, #0
        whilelo p0.b, wzr, w4
        mov     z3.b, #0
        ptrue   p1.b, all
        .p2align 3,,7
.L2:
        ld1b    z2.b, p0/z, [x1, x3]
        ld1b    z0.b, p0/z, [x2, x3]
        add     x3, x3, x5
        sel     z0.b, p0, z0.b, z3.b
        whilelo p0.b, w3, w4
        usdot   z1.s, z0.b, z2.b
        b.any   .L2
        uaddv   d0, p1, z1.s
        fmov    x1, d0
        add     w0, w0, w1
        ret

instead of

f:
        movi    v0.4s, 0
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        add     x3, x3, 16
        sxtl    v4.8h, v2.8b
        sxtl2   v3.8h, v2.16b
        uxtl    v2.8h, v1.8b
        uxtl2   v1.8h, v1.16b
        mul     v2.8h, v2.8h, v4.8h
        mul     v1.8h, v1.8h, v3.8h
        saddw   v0.4s, v0.4s, v2.4h
        saddw2  v0.4s, v0.4s, v2.8h
        saddw   v0.4s, v0.4s, v1.4h
        saddw2  v0.4s, v0.4s, v1.8h
        cmp     x3, 480
        bne     .L2
        addv    s0, v0.4s
        fmov    w1, s0
        add     w0, w0, w1
        ret

and

f:
        mov     x3, 0
        cnth    x5
        mov     w4, 480
        mov     z1.b, #0
        whilelo p0.h, wzr, w4
        ptrue   p2.b, all
        .p2align 3,,7
.L2:
        ld1sb   z2.h, p0/z, [x1, x3]
        punpklo p1.h, p0.b
        ld1b    z0.h, p0/z, [x2, x3]
        add     x3, x3, x5
        mul     z0.h, p2/m, z0.h, z2.h
        sunpklo z2.s, z0.h
        sunpkhi z0.s, z0.h
        add     z1.s, p1/m, z1.s, z2.s
        punpkhi p1.h, p0.b
        whilelo p0.h, w3, w4
        add     z1.s, p1/m, z1.s, z0.s
        b.any   .L2
        uaddv   d0, p2, z1.s
        fmov    x1, d0
        add     w0, w0, w1
        ret

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_usdot<vsi2qi>): Rename to...
	(usdot_prod<vsi2qi>): ... This.
	* config/aarch64/aarch64-simd-builtins.def (usdot): Rename to...
	(usdot_prod): ...This.
	* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Likewise.
	* config/aarch64/aarch64-sve.md (@aarch64_<sur>dot_prod<vsi2qi>):
	Rename to...
	(@<sur>dot_prod<vsi2qi>): ...This.
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svusdot_impl::expand): Use it.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vusdot-autovec.c: New test.
	* gcc.target/aarch64/sve/vusdot-autovec.c: New test.
---

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index ac5d4fc7ff1e..063f503ebd96 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -374,10 +374,11 @@
   BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE)
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
-  /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
+  /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
   BUILTIN_VB (TERNOP, sdot, 0, NONE)
   BUILTIN_VB (TERNOPU, udot, 0, NONE)
-  BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE)
+  BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
+  /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
   BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 540244cf0a91..74890989cb30 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -601,7 +601,7 @@
 
 ;; These instructions map to the __builtins for the armv8.6a I8MM usdot
 ;; (vector) Dot Product operation.
-(define_insn "aarch64_usdot<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 8fd6d3fb3171..02e42a71e5e8 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2366,7 +2366,7 @@ public:
        Hence we do the same rotation on arguments as svdot_impl does.  */
     e.rotate_inputs_left (0, 3);
     machine_mode mode = e.vector_mode (0);
-    insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode);
+    insn_code icode = code_for_dot_prod (UNSPEC_USDOT, mode);
     return e.use_exact_insn (icode);
   }
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 9e48c0ea2fb2..359fe0e45709 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6870,7 +6870,7 @@
   [(set_attr "movprfx" "*,yes")]
 )
 
-(define_insn "@aarch64_<sur>dot_prod<vsi2qi>"
+(define_insn "@<sur>dot_prod<vsi2qi>"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
         (plus:VNx4SI_ONLY
 	  (unspec:VNx4SI_ONLY
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 17e059efb80f..00d76ea937ac 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x2_t
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
new file mode 100644
index 000000000000..b99a945903c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
+   SIGNEDNESS_4 char *restrict a)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
new file mode 100644
index 000000000000..094dd51cea62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
+   SIGNEDNESS_4 char *restrict a)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */