git.ipfire.org Git - thirdparty/valgrind.git/commitdiff
Bug 460616 - Add support for aarch64 dotprod instructions
author William Ashley <wash@amazon.com>
Fri, 10 Nov 2023 16:51:12 +0000 (17:51 +0100)
committer Mark Wielaard <mark@klomp.org>
Fri, 10 Nov 2023 16:55:22 +0000 (17:55 +0100)
This change adds support for the FEAT_DotProd instructions
SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>]
SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>
UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>]
UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>

.gitignore
NEWS
VEX/priv/guest_arm64_toIR.c
configure.ac
coregrind/m_initimg/initimg-linux.c
none/tests/arm64/Makefile.am
none/tests/arm64/simd_dotprod.c [new file with mode: 0644]
none/tests/arm64/simd_dotprod.stderr.exp [new file with mode: 0644]
none/tests/arm64/simd_dotprod.stdout.exp [new file with mode: 0644]
none/tests/arm64/simd_dotprod.vgtest [new file with mode: 0644]

index 5090d773b5df74291ca42663fd8ae0fbd1f95295..8fc5bfd774e67b9866b0a75e5dadc47f29add615 100644 (file)
 /none/tests/arm64/fp_and_simd_v82
 /none/tests/arm64/integer
 /none/tests/arm64/memory
+/none/tests/arm64/simd_dotprod
 /none/tests/arm64/simd_v81
 
 # /none/tests/darwin/
diff --git a/NEWS b/NEWS
index 05fb4a8ddc8ab03e305525a152011d6c9ad995ee..33327182ee5bc5d4c45a0007cfd684f9692ae2f9 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,7 @@ are not entered into bugzilla tend to get forgotten about or ignored.
 
 
 401284  False positive "Source and destination overlap in strncat"
+460616  disInstr(arm64): unhandled instruction 0x4E819402 (dotprod/ASIMDDP)
 475498  Add reallocarray wrapper
 476320  Build failure with GCC
 476535  Difference in allocation size for massif/tests/overloaded-new between clang++/libc++ and g++/libstdc++
index 97b09417561214e7e1810bc03269b73feb19dabb..fcfeca70d86fbe683d55c8d380c22b09fe3fd0e4 100644 (file)
@@ -9113,6 +9113,21 @@ IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
 }
 
 
+/* Generate IR to do {U,S}ADDLP: add the widened odd and even lanes of |src|. */
+static
+IRTemp math_ADDLP ( UInt sizeNarrow, Bool isU, IRTemp src )
+{
+   IRTemp res = newTempV128();
+   assign(res,
+            binop(mkVecADD(sizeNarrow+1), /* add at size code sizeNarrow+1 */
+                  mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
+                           isU, True/*fromOdd*/, sizeNarrow, mkexpr(src))),
+                  mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
+                           isU, False/*!fromOdd*/, sizeNarrow, mkexpr(src)))));
+   return res; /* freshly allocated V128 temp holding the sum */
+}
+
+
 /* QCFLAG tracks the SIMD sticky saturation status.  Update the status
    thusly: if, after application of |opZHI| to both |qres| and |nres|,
    they have the same value, leave QCFLAG unchanged.  Otherwise, set it
@@ -13406,12 +13421,7 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
       IRTemp sum   = newTempV128();
       IRTemp res   = newTempV128();
       assign(src, getQReg128(nn));
-      assign(sum,
-             binop(mkVecADD(size+1),
-                   mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
-                             isU, True/*fromOdd*/, size, mkexpr(src))),
-                   mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
-                             isU, False/*!fromOdd*/, size, mkexpr(src)))));
+      sum = math_ADDLP(size, isU, src);
       assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
                         : mkexpr(sum));
       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
@@ -15692,6 +15702,91 @@ Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
 }
 
 
+static
+Bool dis_AdvSIMD_dot_product(/*MB_OUT*/DisResult* dres, UInt insn)
+{
+   /* {S,U}DOT decoder; returns False if |insn| is neither form.  By element:
+      31 30 29 28    23   21 20 15   11 10 9 4
+      0  Q  U  01111 size L  m  1110 H  0  n d
+      vector
+      31 30 29 28    23   21 20 15   11 10 9 4
+      0  Q  U  01110 size 0  m  1001 0  1  n d
+   */
+#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
+   if (INSN(31,31) != 0) {
+      return False;
+   }
+   UInt bitQ    = INSN(30,30);
+   UInt bitU    = INSN(29,29);
+   UInt opcode1 = INSN(28,24);
+   UInt size    = INSN(23,22);
+   UInt bitL    = INSN(21,21);
+   UInt mm      = INSN(20,16);
+   UInt opcode2 = INSN(15,12);
+   UInt bitH    = INSN(11,11);
+   UInt opcode3 = INSN(10,10);
+   UInt nn      = INSN(9,5);
+   UInt dd      = INSN(4,0);
+   UInt index   = (bitH << 1) + bitL; /* H:L; only used by the by-element form */
+   vassert(index <= 3);
+
+   Bool byElement;
+   if (opcode1 == BITS5(0,1,1,1,1)
+       && opcode2 == BITS4(1,1,1,0)
+       && opcode3 == 0) {
+      byElement = True;
+   } else if (opcode1 == BITS5(0,1,1,1,0)
+       && opcode2 == BITS4(1,0,0,1)
+       && opcode3 == 1
+       && bitL == 0 && bitH == 0) {
+      byElement = False;
+   } else {
+      return False;
+   }
+
+   // '10' is the only valid size; it is reused below as the destination lane size
+   if (size != X10) return False;
+
+   IRExpr* src1 = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn));
+   IRExpr* src2 = getQReg128(mm);
+   if (byElement) {
+      src2 = mkexpr(math_DUP_VEC_ELEM(src2, X10, index)); /* Vm.4B[index] */
+   }
+
+   IROp mulOp = bitU ? Iop_Mull8Ux8 : Iop_Mull8Sx8;
+   IRTemp loProductSums = math_ADDLP(
+         X01, bitU, math_BINARY_WIDENING_V128(False, mulOp, src1, src2));
+   IRTemp hiProductSums = math_ADDLP(
+         X01, bitU, math_BINARY_WIDENING_V128(True, mulOp, src1, src2));
+
+   IRTemp res = newTempV128();
+   assign(res, binop(Iop_Add32x4, /* even lanes + odd lanes of the pair sums */
+          mk_CatEvenLanes32x4(hiProductSums, loProductSums),
+          mk_CatOddLanes32x4(hiProductSums, loProductSums)));
+
+   // These instructions accumulate into the destination, but in non-q
+   // form the upper 64 bits get forced to 0
+   IRExpr* accVal = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(dd));
+   putQReg128(dd, binop(mkVecADD(size), mkexpr(res), accVal));
+
+   const HChar* nm = bitU ? "udot" : "sdot";
+   const HChar* destWidth = nameArr_Q_SZ(bitQ, size);
+   const HChar* srcWidth  = nameArr_Q_SZ(bitQ, X00);
+   if (byElement) {
+      DIP("%s v%u.%s, v%u.%s, v%u.4b[%u]\n", nm,
+         dd, destWidth,
+         nn, srcWidth, mm, index);
+   } else {
+      DIP("%s v%u.%s, v%u.%s, v%u.%s\n", nm,
+         dd, destWidth,
+         nn, srcWidth, mm, srcWidth);
+   }
+
+   return True;
+#  undef INSN
+}
+
+
 static
 Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn,
                            const VexArchInfo* archinfo, Bool sigill_diag)
@@ -15767,6 +15862,8 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn,
    if (UNLIKELY(ok)) return True;
    ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
    if (UNLIKELY(ok)) return True;
+   ok = dis_AdvSIMD_dot_product(dres, insn);
+   if (UNLIKELY(ok)) return True;
    return False;
 }
 
index b59f12efdb964baed5f24ca39ba0e2a4e8f28c2e..62d83371c1e333a58d19d05c0d9e14aaf018767e 100755 (executable)
@@ -3741,6 +3741,31 @@ CFLAGS="$save_CFLAGS"
 AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes)
 
 
+# Does the C compiler support the armv8.2-a+dotprod flag and assembler dotprod instructions?
+# Note, this doesn't generate a C-level symbol.  It generates an
+# automake-level symbol (BUILD_ARMV82_DOTPROD_TESTS), used in test Makefile.am's
+AC_MSG_CHECKING([if gcc supports the armv82-a+dotprod feature flag and assembler supports dotprod instructions])
+
+save_CFLAGS="$CFLAGS"
+CFLAGS="$CFLAGS -march=armv8.2-a+dotprod -Werror"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+int main()
+{
+    __asm__ __volatile__("sdot v1.4s, v2.16b, v3.16b");
+    return 0;
+}
+]])], [
+ac_have_armv82_dotprod_feature=yes
+AC_MSG_RESULT([yes])
+], [
+ac_have_armv82_dotprod_feature=no
+AC_MSG_RESULT([no])
+])
+CFLAGS="$save_CFLAGS"
+
+AM_CONDITIONAL(BUILD_ARMV82_DOTPROD_TESTS, test x$ac_have_armv82_dotprod_feature = xyes)
+
+
 # XXX JRS 2010 Oct 13: what is this for?  For sure, we don't need this
 # when building the tool executables.  I think we should get rid of it.
 #
index 7a7d453350e9f5a5288096f3788cd066c76b31da..7680baa8e75a576ad416512df839627406659ff6 100644 (file)
@@ -734,7 +734,8 @@ Addr setup_client_stack( void*  init_sp,
                                | VKI_HWCAP_SHA2         \
                                | VKI_HWCAP_CRC32        \
                                | VKI_HWCAP_FP           \
-                               | VKI_HWCAP_ASIMD)
+                               | VKI_HWCAP_ASIMD        \
+                               | VKI_HWCAP_ASIMDDP)
                auxv->u.a_val &= ARM64_SUPPORTED_HWCAP;
             }
 #           endif
index 4a06f099614fb0ff8c0d64489bcc1b66c24dad58..cc0ed1481178992af776d28dfccad5a796f77d61 100644 (file)
@@ -11,6 +11,7 @@ EXTRA_DIST = \
        memory.stdout.exp memory.stderr.exp memory.vgtest \
        atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \
        simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \
+       simd_dotprod.stdout.exp simd_dotprod.stderr.exp simd_dotprod.vgtest \
         fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
        fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
        fp_and_simd_v82.vgtest \
@@ -40,6 +41,10 @@ if BUILD_ARMV82_TESTS
   check_PROGRAMS += fp_and_simd_v82
 endif
 
+if BUILD_ARMV82_DOTPROD_TESTS
+  check_PROGRAMS += simd_dotprod
+endif
+
 AM_CFLAGS    += @FLAG_M64@
 AM_CXXFLAGS  += @FLAG_M64@
 AM_CCASFLAGS += @FLAG_M64@
@@ -49,6 +54,7 @@ allexec_CFLAGS     = $(AM_CFLAGS) @FLAG_W_NO_NONNULL@
 crc32_CFLAGS       = $(AM_CFLAGS) -march=armv8-a+crc
 atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a
 simd_v81_CFLAGS    = $(AM_CFLAGS) -march=armv8.1-a+crypto
+simd_dotprod_CFLAGS    = $(AM_CFLAGS) -march=armv8.2-a+dotprod
 fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto
 fp_and_simd_v82_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+fp16+crypto
 integer_CFLAGS     = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0
diff --git a/none/tests/arm64/simd_dotprod.c b/none/tests/arm64/simd_dotprod.c
new file mode 100644 (file)
index 0000000..ca67da5
--- /dev/null
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <assert.h>
+
+typedef  unsigned char UChar;
+typedef  unsigned int  UInt;
+typedef  signed int    Int;
+
+#define ITERS 1  /* loop count inside each generated test function */
+
+union _V128 {  /* 16 raw bytes; the tests ldr/str these via q registers */
+   UChar  u8[16];
+};
+typedef  union _V128   V128;
+
+static inline UChar randUChar ( void )
+{
+   static UInt seed = 80021;  /* fixed start value: same sequence every run */
+   seed = 1103515245 * seed + 12345;  /* state = a*state + c, wrapping in UInt */
+   return (seed >> 17) & 0xFF;  /* bits 17..24 of the updated state */
+}
+
+/* Fills all 16 bytes of |v| from randUChar(); prints a note every 256 calls. */
+static void randV128 ( /*OUT*/V128* v)
+{
+   static UInt nCalls = 0;
+   Int i;
+   nCalls++;
+   for (i = 0; i < 16; i++) {
+      v->u8[i] = randUChar();
+   }
+   if (0 == (nCalls & 0xFF))
+      printf("randV128: %u calls\n", nCalls);
+}
+
+static void showV128 ( V128* v )  /* prints |v| as 32 hex digits, no newline */
+{
+   Int i;
+   for (i = 15; i >= 0; i--)  /* u8[15] is printed first, u8[0] last */
+      printf("%02x", (Int)v->u8[i]);
+}
+
+#define GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,SUFFIXM) /* SUFFIXM: a string */ \
+      Int i; \
+      for (i = 0; i < ITERS; i++) { \
+         V128 block[3]; /* [0] -> q7 (Vn), [1] -> q8 (Vm), [2] <-> q9 (Vd) */ \
+         randV128(&block[0]); \
+         randV128(&block[1]); \
+         randV128(&block[2]); \
+         __asm__ __volatile__( \
+            "ldr q7, [%0, #0];" \
+            "ldr q8, [%0, #16];" \
+            "ldr q9, [%0, #32];" \
+            #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." SUFFIXM " ; " \
+            "str q9, [%0, #32];" \
+            : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \
+         ); \
+         printf(#INSN " v9." #SUFFIXD \
+                ", v7." #SUFFIXN ", v8." SUFFIXM " "); \
+         showV128(&block[0]); printf(" "); \
+         showV128(&block[1]); printf(" "); \
+         showV128(&block[2]); printf("\n"); /* block[2] now holds the stored q9 */ \
+      } \
+
+#define GEN_BINARY_TEST_BY_ELEM(INSN,SUFFIXD,SUFFIXN,MELEM) /* Vm as 4b[MELEM] */ \
+   __attribute__((noinline)) \
+   static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_elem_##MELEM () { \
+      GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,"4b[" #MELEM "]") \
+   }
+
+#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) /* Vm with suffix SUFFIXM */ \
+   __attribute__((noinline)) \
+   static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM () { \
+      GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,#SUFFIXM) \
+   }
+
+GEN_BINARY_TEST(sdot, 2s, 8b, 8b)
+GEN_BINARY_TEST(udot, 2s, 8b, 8b)
+GEN_BINARY_TEST(sdot, 4s, 16b, 16b)
+GEN_BINARY_TEST(udot, 4s, 16b, 16b)
+GEN_BINARY_TEST_BY_ELEM(sdot, 2s, 8b, 0)
+GEN_BINARY_TEST_BY_ELEM(udot, 2s, 8b, 1)
+GEN_BINARY_TEST_BY_ELEM(sdot, 4s, 16b, 2)
+GEN_BINARY_TEST_BY_ELEM(udot, 4s, 16b, 3)
+
+int main ( void )
+{
+   assert(sizeof(V128) == 16);  /* the ldr/str offsets #0/#16/#32 rely on this */
+
+   // ======================== {S,U}DOT by element ====================
+   // sdot 2s,8b,4b[0]
+   // udot 2s,8b,4b[1]
+   // sdot 4s,16b,4b[2]
+   // udot 4s,16b,4b[3]
+   test_sdot_2s_8b_elem_0();
+   test_udot_2s_8b_elem_1();
+   test_sdot_4s_16b_elem_2();
+   test_udot_4s_16b_elem_3();
+
+   // ======================== {S,U}DOT vector ========================
+   // sdot 2s,8b,8b
+   // udot 2s,8b,8b
+   // sdot 4s,16b,16b
+   // udot 4s,16b,16b
+   test_sdot_2s_8b_8b();
+   test_udot_2s_8b_8b();
+   test_sdot_4s_16b_16b();
+   test_udot_4s_16b_16b();
+
+   return 0;
+}
diff --git a/none/tests/arm64/simd_dotprod.stderr.exp b/none/tests/arm64/simd_dotprod.stderr.exp
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/none/tests/arm64/simd_dotprod.stdout.exp b/none/tests/arm64/simd_dotprod.stdout.exp
new file mode 100644 (file)
index 0000000..8872455
--- /dev/null
@@ -0,0 +1,8 @@
+sdot v9.2s, v7.8b, v8.4b[0] 5175e39d19c9ca1e98f24a4984175700 7d6528c5fa956a0d69c3e9a6af27d13b 000000000000000047b8fac3eeef3914
+udot v9.2s, v7.8b, v8.4b[1] b6d2fb5aa7bc5127fe9915e556a044b2 19a348215c3a67fd399182c2dbcc2d38 0000000000000000842c23cf5066b549
+sdot v9.4s, v7.16b, v8.4b[2] d89998df5035ed364a4bc43968bc40e5 cb509970b8136c85d740b80eb7839b97 f9dd31bff8c05f5456afd620b0ca1b30
+udot v9.4s, v7.16b, v8.4b[3] 5ff85bc9535c191fd3a727d1a705f65d d8bc5c6dee699597398e0039cf03663d 20a33823cbca1faf542f38453df87d2b
+sdot v9.2s, v7.8b, v8.8b d182c916cebc2e17cfaff39be272ef40 6897b536bbe4da8a369dab4f9465b86e 0000000000000000f4e068450523c8a1
+udot v9.2s, v7.8b, v8.8b 95264321bf3b68b255c2b9e2c95c9810 81f2a547be8d181184ededbc53239dcf 00000000000000008d6b78e8f7e97e90
+sdot v9.4s, v7.16b, v8.16b f0350ca70523e0e45ba1ec54e87d39b3 0a3e0f7c75cb0842b95ed64d3b13ff64 e98e9eeaa89323fc54cac842e13de403
+udot v9.4s, v7.16b, v8.16b 0a5f45c55f1c9202b76ddefcb0ebfe6e c84ab713406845904d325b2d5a70a792 5f49643cced88b926263a4c2727e0a11
diff --git a/none/tests/arm64/simd_dotprod.vgtest b/none/tests/arm64/simd_dotprod.vgtest
new file mode 100644 (file)
index 0000000..1997e64
--- /dev/null
@@ -0,0 +1,3 @@
+prog: simd_dotprod
+prereq: test -x simd_dotprod && ../../../tests/arm64_features asimddp
+vgopts: -q