git.ipfire.org Git - thirdparty/gcc.git/commit
AArch32: Add support for sign differing dot-product usdot for NEON.
author Tamar Christina <tamar.christina@arm.com>
Wed, 14 Jul 2021 14:20:45 +0000 (15:20 +0100)
committer Tamar Christina <tamar.christina@arm.com>
Wed, 14 Jul 2021 14:20:45 +0000 (15:20 +0100)
commit 6412c58c781f64b60e7353e762cd5cec62a863e7
tree 99440fc19492cfd9556e536ce27f5d2e4a332332
parent 752045ed1eea0eddc48923df78999dab7f2827ba
AArch32: Add support for sign differing dot-product usdot for NEON.

This adds optabs implementing usdot_prod.

The following testcase:

/* Number of elements read from each input array by the loop in f.  */
#define N 480
/* Signedness of the accumulator `res` and of f's return value.  */
#define SIGNEDNESS_1 unsigned
/* Signedness of the `short` intermediate product `mult`.  */
#define SIGNEDNESS_2 signed
/* Signedness of the char elements pointed to by `a`.  */
#define SIGNEDNESS_3 signed
/* Signedness of the char elements pointed to by `b`.  */
#define SIGNEDNESS_4 unsigned

/* Accumulate into RES the element-wise products A[i] * B[i] for
   i in [0, N) and return the result.  A holds signed chars and B holds
   unsigned chars, so the two multiplicands differ in signedness.
   `noipa` is the GCC attribute that disables interprocedural
   optimizations for this function.  */
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      /* Widen both elements to int before multiplying.  */
      int av = a[i];
      int bv = b[i];
      /* The int product is converted to a signed short before it is
	 added to the unsigned accumulator.  */
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

Generates

f:
        vmov.i32        q8, #0  @ v4si
        add     r3, r2, #480
.L2:
        vld1.8  {q10}, [r2]!
        vld1.8  {q9}, [r1]!
        vusdot.s8       q8, q9, q10
        cmp     r3, r2
        bne     .L2
        vadd.i32        d16, d16, d17
        vpadd.i32       d16, d16, d16
        vmov.32 r3, d16[0]
        add     r0, r0, r3
        bx      lr

instead of

f:
        vmov.i32        q8, #0  @ v4si
        add     r3, r2, #480
.L2:
        vld1.8  {q9}, [r2]!
        vld1.8  {q11}, [r1]!
        cmp     r3, r2
        vmull.s8 q10, d18, d22
        vmull.s8 q9, d19, d23
        vaddw.s16       q8, q8, d20
        vaddw.s16       q8, q8, d21
        vaddw.s16       q8, q8, d18
        vaddw.s16       q8, q8, d19
        bne     .L2
        vadd.i32        d16, d16, d17
        vpadd.i32       d16, d16, d16
        vmov.32 r3, d16[0]
        add     r0, r0, r3
        bx      lr

For NEON.  I couldn't figure out whether the MVE instruction vmlaldav.s16 could
be used to emulate this.  Because it would require additional widening to work,
I left MVE out of this patch set, but perhaps someone should take a look.

gcc/ChangeLog:

* config/arm/neon.md (usdot_prod<vsi2qi>): New.

gcc/testsuite/ChangeLog:

* gcc.target/arm/simd/vusdot-autovec.c: New test.
gcc/config/arm/neon.md
gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c [new file with mode: 0644]