From: William Ashley Date: Fri, 10 Nov 2023 16:51:12 +0000 (+0100) Subject: Bug 460616 - Add support for aarch64 dotprod instructions X-Git-Tag: VALGRIND_3_23_0~279 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f42b9a434e12bc14ec821183a69b86e91da0577c;p=thirdparty%2Fvalgrind.git Bug 460616 - Add support for aarch64 dotprod instructions This change adds support for the FEAT_DotProd instructions SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>] SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb> UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>] UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb> --- diff --git a/.gitignore b/.gitignore index 5090d773b5..8fc5bfd774 100644 --- a/.gitignore +++ b/.gitignore @@ -1764,6 +1764,7 @@ /none/tests/arm64/fp_and_simd_v82 /none/tests/arm64/integer /none/tests/arm64/memory +/none/tests/arm64/simd_dotprod /none/tests/arm64/simd_v81 # /none/tests/darwin/ diff --git a/NEWS b/NEWS index 05fb4a8ddc..33327182ee 100644 --- a/NEWS +++ b/NEWS @@ -25,6 +25,7 @@ are not entered into bugzilla tend to get forgotten about or ignored. 401284 False positive "Source and destination overlap in strncat" +460616 disInstr(arm64): unhandled instruction 0x4E819402 (dotprod/ASIMDDP) 475498 Add reallocarray wrapper 476320 Build failure with GCC 476535 Difference in allocation size for massif/tests/overloaded-new between clang++/libc++ and g++/libstdc++ diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 97b0941756..fcfeca70d8 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -9113,6 +9113,21 @@ IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb ) } +/* Generate IR to do {U,S}ADDLP */ +static +IRTemp math_ADDLP ( UInt sizeNarrow, Bool isU, IRTemp src ) +{ + IRTemp res = newTempV128(); + assign(res, + binop(mkVecADD(sizeNarrow+1), + mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( + isU, True/*fromOdd*/, sizeNarrow, mkexpr(src))), + mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( + isU, False/*!fromOdd*/, sizeNarrow, mkexpr(src))))); + return res; +} + + /* QCFLAG tracks the SIMD sticky saturation status. 
Update the status thusly: if, after application of |opZHI| to both |qres| and |nres|, they have the same value, leave QCFLAG unchanged. Otherwise, set it @@ -13406,12 +13421,7 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) IRTemp sum = newTempV128(); IRTemp res = newTempV128(); assign(src, getQReg128(nn)); - assign(sum, - binop(mkVecADD(size+1), - mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( - isU, True/*fromOdd*/, size, mkexpr(src))), - mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( - isU, False/*!fromOdd*/, size, mkexpr(src))))); + sum = math_ADDLP(size, isU, src); assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd)) : mkexpr(sum)); putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res)); @@ -15692,6 +15702,91 @@ Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn) } +static +Bool dis_AdvSIMD_dot_product(/*MB_OUT*/DisResult* dres, UInt insn) +{ + /* by element + 31 30 29 28 23 21 20 15 11 10 9 4 + 0 Q U 01111 size L m 1110 H 0 n d + vector + 31 30 29 28 23 21 20 15 11 10 9 4 + 0 Q U 01110 size 0 m 1001 0 1 n d + */ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + if (INSN(31,31) != 0) { + return False; + } + UInt bitQ = INSN(30,30); + UInt bitU = INSN(29,29); + UInt opcode1 = INSN(28,24); + UInt size = INSN(23,22); + UInt bitL = INSN(21,21); + UInt mm = INSN(20,16); + UInt opcode2 = INSN(15,12); + UInt bitH = INSN(11,11); + UInt opcode3 = INSN(10,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt index = (bitH << 1) + bitL; + vassert(index <= 3); + + Bool byElement; + if (opcode1 == BITS5(0,1,1,1,1) + && opcode2 == BITS4(1,1,1,0) + && opcode3 == 0) { + byElement = True; + } else if (opcode1 == BITS5(0,1,1,1,0) + && opcode2 == BITS4(1,0,0,1) + && opcode3 == 1 + && bitL == 0 && bitH == 0) { + byElement = False; + } else { + return False; + } + + // '10' is the only valid size + if (size != X10) return False; + + IRExpr* src1 = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)); + IRExpr* src2 = 
getQReg128(mm); + if (byElement) { + src2 = mkexpr(math_DUP_VEC_ELEM(src2, X10, index)); + } + + IROp mulOp = bitU ? Iop_Mull8Ux8 : Iop_Mull8Sx8; + IRTemp loProductSums = math_ADDLP( + X01, bitU, math_BINARY_WIDENING_V128(False, mulOp, src1, src2)); + IRTemp hiProductSums = math_ADDLP( + X01, bitU, math_BINARY_WIDENING_V128(True, mulOp, src1, src2)); + + IRTemp res = newTempV128(); + assign(res, binop(Iop_Add32x4, + mk_CatEvenLanes32x4(hiProductSums, loProductSums), + mk_CatOddLanes32x4(hiProductSums, loProductSums))); + + // These instructions accumulate into the destination, but in non-q + // form the upper 64 bits get forced to 0 + IRExpr* accVal = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(dd)); + putQReg128(dd, binop(mkVecADD(size), mkexpr(res), accVal)); + + const HChar* nm = bitU ? "udot" : "sdot"; + const HChar* destWidth = nameArr_Q_SZ(bitQ, size); + const HChar* srcWidth = nameArr_Q_SZ(bitQ, X00); + if (byElement) { + DIP("%s v%u.%s, v%u.%s, v%u.4b[%u]\n", nm, + dd, destWidth, + nn, srcWidth, mm, index); + } else { + DIP("%s v%u.%s, v%u.%s, v%u.%s\n", nm, + dd, destWidth, + nn, srcWidth, mm, srcWidth); + } + + return True; +# undef INSN +} + + static Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, const VexArchInfo* archinfo, Bool sigill_diag) @@ -15767,6 +15862,8 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn); if (UNLIKELY(ok)) return True; + ok = dis_AdvSIMD_dot_product(dres, insn); + if (UNLIKELY(ok)) return True; return False; } diff --git a/configure.ac b/configure.ac index b59f12efdb..62d83371c1 100755 --- a/configure.ac +++ b/configure.ac @@ -3741,6 +3741,31 @@ CFLAGS="$save_CFLAGS" AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes) +# Does the C compiler support the armv82-a+dotprod flag and assembler dotprod instructions +# Note, this doesn't generate a C-level symbol. 
It generates a +# automake-level symbol (BUILD_ARMV82_DOTPROD_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if gcc supports the armv82-a+dotprod feature flag and assembler supports dotprod instructions]) + +save_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -march=armv8.2-a+dotprod -Werror" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +int main() +{ + __asm__ __volatile__("sdot v1.4s, v2.16b, v3.16b"); + return 0; +} +]])], [ +ac_have_armv82_dotprod_feature=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_armv82_dotprod_feature=no +AC_MSG_RESULT([no]) +]) +CFLAGS="$save_CFLAGS" + +AM_CONDITIONAL(BUILD_ARMV82_DOTPROD_TESTS, test x$ac_have_armv82_dotprod_feature = xyes) + + # XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this # when building the tool executables. I think we should get rid of it. # diff --git a/coregrind/m_initimg/initimg-linux.c b/coregrind/m_initimg/initimg-linux.c index 7a7d453350..7680baa8e7 100644 --- a/coregrind/m_initimg/initimg-linux.c +++ b/coregrind/m_initimg/initimg-linux.c @@ -734,7 +734,8 @@ Addr setup_client_stack( void* init_sp, | VKI_HWCAP_SHA2 \ | VKI_HWCAP_CRC32 \ | VKI_HWCAP_FP \ - | VKI_HWCAP_ASIMD) + | VKI_HWCAP_ASIMD \ + | VKI_HWCAP_ASIMDDP) auxv->u.a_val &= ARM64_SUPPORTED_HWCAP; } # endif diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am index 4a06f09961..cc0ed14811 100644 --- a/none/tests/arm64/Makefile.am +++ b/none/tests/arm64/Makefile.am @@ -11,6 +11,7 @@ EXTRA_DIST = \ memory.stdout.exp memory.stderr.exp memory.vgtest \ atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ + simd_dotprod.stdout.exp simd_dotprod.stderr.exp simd_dotprod.vgtest \ fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \ fp_and_simd_v82.vgtest \ @@ -40,6 +41,10 @@ if BUILD_ARMV82_TESTS check_PROGRAMS += fp_and_simd_v82 endif +if BUILD_ARMV82_DOTPROD_TESTS + check_PROGRAMS += simd_dotprod +endif + 
AM_CFLAGS += @FLAG_M64@ AM_CXXFLAGS += @FLAG_M64@ AM_CCASFLAGS += @FLAG_M64@ @@ -49,6 +54,7 @@ allexec_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_NONNULL@ crc32_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crc atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a simd_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a+crypto +simd_dotprod_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+dotprod fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto fp_and_simd_v82_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+fp16+crypto integer_CFLAGS = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0 diff --git a/none/tests/arm64/simd_dotprod.c b/none/tests/arm64/simd_dotprod.c new file mode 100644 index 0000000000..ca67da5510 --- /dev/null +++ b/none/tests/arm64/simd_dotprod.c @@ -0,0 +1,110 @@ +#include <stdio.h> +#include <assert.h> + +typedef unsigned char UChar; +typedef unsigned int UInt; +typedef signed int Int; + +#define ITERS 1 + +union _V128 { + UChar u8[16]; +}; +typedef union _V128 V128; + +static inline UChar randUChar ( void ) +{ + static UInt seed = 80021; + seed = 1103515245 * seed + 12345; + return (seed >> 17) & 0xFF; +} + +/* Generates a random V128. */ +static void randV128 ( /*OUT*/V128* v) +{ + static UInt nCalls = 0; + Int i; + nCalls++; + for (i = 0; i < 16; i++) { + v->u8[i] = randUChar(); + } + if (0 == (nCalls & 0xFF)) + printf("randV128: %u calls\n", nCalls); +} + +static void showV128 ( V128* v ) +{ + Int i; + for (i = 15; i >= 0; i--) + printf("%02x", (Int)v->u8[i]); +} + +#define GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[3]; \ + randV128(&block[0]); \ + randV128(&block[1]); \ + randV128(&block[2]); \ + __asm__ __volatile__( \ + "ldr q7, [%0, #0];" \ + "ldr q8, [%0, #16];" \ + "ldr q9, [%0, #32];" \ + #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." SUFFIXM " ; " \ + "str q9, [%0, #32];" \ + : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \ + ); \ + printf(#INSN " v9." #SUFFIXD \ + ", v7." #SUFFIXN ", v8." 
SUFFIXM " "); \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf("\n"); \ + } \ + +#define GEN_BINARY_TEST_BY_ELEM(INSN,SUFFIXD,SUFFIXN,MELEM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_elem_##MELEM () { \ + GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,"4b[" #MELEM "]") \ + } + +#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM () { \ + GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,#SUFFIXM) \ + } + +GEN_BINARY_TEST(sdot, 2s, 8b, 8b) +GEN_BINARY_TEST(udot, 2s, 8b, 8b) +GEN_BINARY_TEST(sdot, 4s, 16b, 16b) +GEN_BINARY_TEST(udot, 4s, 16b, 16b) +GEN_BINARY_TEST_BY_ELEM(sdot, 2s, 8b, 0) +GEN_BINARY_TEST_BY_ELEM(udot, 2s, 8b, 1) +GEN_BINARY_TEST_BY_ELEM(sdot, 4s, 16b, 2) +GEN_BINARY_TEST_BY_ELEM(udot, 4s, 16b, 3) + +int main ( void ) +{ + assert(sizeof(V128) == 16); + + // ======================== {S,U}DOT by element ==================== + // sdot 2s,8b,4b[0] + // udot 2s,8b,4b[1] + // sdot 4s,16b,4b[2] + // udot 4s,16b,4b[3] + test_sdot_2s_8b_elem_0(); + test_udot_2s_8b_elem_1(); + test_sdot_4s_16b_elem_2(); + test_udot_4s_16b_elem_3(); + + // ======================== {S,U}DOT vector ======================== + // sdot 2s,8b,8b + // udot 2s,8b,8b + // sdot 4s,16b,16b + // udot 4s,16b,16b + test_sdot_2s_8b_8b(); + test_udot_2s_8b_8b(); + test_sdot_4s_16b_16b(); + test_udot_4s_16b_16b(); + + return 0; +} diff --git a/none/tests/arm64/simd_dotprod.stderr.exp b/none/tests/arm64/simd_dotprod.stderr.exp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/none/tests/arm64/simd_dotprod.stdout.exp b/none/tests/arm64/simd_dotprod.stdout.exp new file mode 100644 index 0000000000..88724550d1 --- /dev/null +++ b/none/tests/arm64/simd_dotprod.stdout.exp @@ -0,0 +1,8 @@ +sdot v9.2s, v7.8b, v8.4b[0] 5175e39d19c9ca1e98f24a4984175700 7d6528c5fa956a0d69c3e9a6af27d13b 
000000000000000047b8fac3eeef3914 +udot v9.2s, v7.8b, v8.4b[1] b6d2fb5aa7bc5127fe9915e556a044b2 19a348215c3a67fd399182c2dbcc2d38 0000000000000000842c23cf5066b549 +sdot v9.4s, v7.16b, v8.4b[2] d89998df5035ed364a4bc43968bc40e5 cb509970b8136c85d740b80eb7839b97 f9dd31bff8c05f5456afd620b0ca1b30 +udot v9.4s, v7.16b, v8.4b[3] 5ff85bc9535c191fd3a727d1a705f65d d8bc5c6dee699597398e0039cf03663d 20a33823cbca1faf542f38453df87d2b +sdot v9.2s, v7.8b, v8.8b d182c916cebc2e17cfaff39be272ef40 6897b536bbe4da8a369dab4f9465b86e 0000000000000000f4e068450523c8a1 +udot v9.2s, v7.8b, v8.8b 95264321bf3b68b255c2b9e2c95c9810 81f2a547be8d181184ededbc53239dcf 00000000000000008d6b78e8f7e97e90 +sdot v9.4s, v7.16b, v8.16b f0350ca70523e0e45ba1ec54e87d39b3 0a3e0f7c75cb0842b95ed64d3b13ff64 e98e9eeaa89323fc54cac842e13de403 +udot v9.4s, v7.16b, v8.16b 0a5f45c55f1c9202b76ddefcb0ebfe6e c84ab713406845904d325b2d5a70a792 5f49643cced88b926263a4c2727e0a11 diff --git a/none/tests/arm64/simd_dotprod.vgtest b/none/tests/arm64/simd_dotprod.vgtest new file mode 100644 index 0000000000..1997e64fa2 --- /dev/null +++ b/none/tests/arm64/simd_dotprod.vgtest @@ -0,0 +1,3 @@ +prog: simd_dotprod +prereq: test -x simd_dotprod && ../../../tests/arm64_features asimddp +vgopts: -q