/none/tests/arm64/fp_and_simd_v82
/none/tests/arm64/integer
/none/tests/arm64/memory
+/none/tests/arm64/simd_dotprod
/none/tests/arm64/simd_v81
# /none/tests/darwin/
401284 False positive "Source and destination overlap in strncat"
+460616 disInstr(arm64): unhandled instruction 0x4E819402 (dotprod/ASIMDDP)
475498 Add reallocarray wrapper
476320 Build failure with GCC
476535 Difference in allocation size for massif/tests/overloaded-new between clang++/libc++ and g++/libstdc++
}
+/* Generate IR to do {U,S}ADDLP on the value held in |src|.
+
+   The odd-numbered and the even-numbered lanes of |src| are each
+   widened with math_WIDEN_EVEN_OR_ODD_LANES, and the two widened
+   vectors are then added with mkVecADD(sizeNarrow+1).
+
+   sizeNarrow is the size code of the source lanes; the addition is
+   done one size code up (sizeNarrow+1).
+   isU is handed unchanged to math_WIDEN_EVEN_OR_ODD_LANES; presumably
+   it selects unsigned rather than signed widening -- confirm against
+   that helper.
+   src is expected to be a V128 temp (callers should confirm).
+
+   Returns a new V128 temp that has been assigned the sum. */
+static
+IRTemp math_ADDLP ( UInt sizeNarrow, Bool isU, IRTemp src )
+{
+ IRTemp res = newTempV128();
+ assign(res,
+ binop(mkVecADD(sizeNarrow+1),
+ mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
+ isU, True/*fromOdd*/, sizeNarrow, mkexpr(src))),
+ mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
+ isU, False/*!fromOdd*/, sizeNarrow, mkexpr(src)))));
+ return res;
+}
+
+
/* QCFLAG tracks the SIMD sticky saturation status. Update the status
thusly: if, after application of |opZHI| to both |qres| and |nres|,
they have the same value, leave QCFLAG unchanged. Otherwise, set it
IRTemp sum = newTempV128();
IRTemp res = newTempV128();
assign(src, getQReg128(nn));
- assign(sum,
- binop(mkVecADD(size+1),
- mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
- isU, True/*fromOdd*/, size, mkexpr(src))),
- mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
- isU, False/*!fromOdd*/, size, mkexpr(src)))));
+ sum = math_ADDLP(size, isU, src);
assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
: mkexpr(sum));
putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
}
+static
+Bool dis_AdvSIMD_dot_product(/*MB_OUT*/DisResult* dres, UInt insn)
+{
+ /* Decode the dot-product instructions UDOT and SDOT, in both the
+ by-element and the vector form, and generate IR for them.
+
+ Returns True if |insn| was recognised and handled. Returns False
+ otherwise; every False return happens before any register is read
+ or written. |dres| is not referenced in this function.
+
+ by element
+ 31 30 29 28 23 21 20 15 11 10 9 4
+ 0 Q U 01111 size L m 1110 H 0 n d
+ vector
+ 31 30 29 28 23 21 20 15 11 10 9 4
+ 0 Q U 01110 size 0 m 1001 0 1 n d
+ */
+# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+ if (INSN(31,31) != 0) {
+ return False;
+ }
+ UInt bitQ = INSN(30,30);
+ UInt bitU = INSN(29,29);
+ UInt opcode1 = INSN(28,24);
+ UInt size = INSN(23,22);
+ UInt bitL = INSN(21,21);
+ UInt mm = INSN(20,16);
+ UInt opcode2 = INSN(15,12);
+ UInt bitH = INSN(11,11);
+ UInt opcode3 = INSN(10,10);
+ UInt nn = INSN(9,5);
+ UInt dd = INSN(4,0);
+ UInt index = (bitH << 1) + bitL; /* H:L, used only in the by-element form */
+ vassert(index <= 3);
+
+ Bool byElement;
+ if (opcode1 == BITS5(0,1,1,1,1)
+ && opcode2 == BITS4(1,1,1,0)
+ && opcode3 == 0) {
+ byElement = True;
+ } else if (opcode1 == BITS5(0,1,1,1,0)
+ && opcode2 == BITS4(1,0,0,1)
+ && opcode3 == 1
+ && bitL == 0 && bitH == 0) { /* bits 21 and 11 are fixed at 0 in the vector form */
+ byElement = False;
+ } else {
+ return False;
+ }
+
+ // '10' is the only valid size; any other value is rejected here
+ if (size != X10) return False;
+
+ IRExpr* src1 = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn));
+ IRExpr* src2 = getQReg128(mm);
+ if (byElement) {
+ src2 = mkexpr(math_DUP_VEC_ELEM(src2, X10, index)); /* element |index| of Vm, at size X10 */
+ }
+ /* Multiply the sources with a widening 8-bit multiply (the unsigned
+ op for udot, the signed op for sdot) and feed each product vector
+ through math_ADDLP at narrow size X01. The False/True first
+ argument presumably selects the lower/upper half of the operands,
+ as the lo/hi names suggest -- confirm against
+ math_BINARY_WIDENING_V128. */
+ IROp mulOp = bitU ? Iop_Mull8Ux8 : Iop_Mull8Sx8;
+ IRTemp loProductSums = math_ADDLP(
+ X01, bitU, math_BINARY_WIDENING_V128(False, mulOp, src1, src2));
+ IRTemp hiProductSums = math_ADDLP(
+ X01, bitU, math_BINARY_WIDENING_V128(True, mulOp, src1, src2));
+ /* Add together the even and the odd 32-bit lanes gathered from the
+ hi and lo pairwise sums. */
+ IRTemp res = newTempV128();
+ assign(res, binop(Iop_Add32x4,
+ mk_CatEvenLanes32x4(hiProductSums, loProductSums),
+ mk_CatOddLanes32x4(hiProductSums, loProductSums)));
+
+ // These instructions accumulate into the destination, but in non-q
+ // form the upper 64 bits get forced to 0
+ IRExpr* accVal = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(dd));
+ putQReg128(dd, binop(mkVecADD(size), mkexpr(res), accVal));
+ /* Print the disassembly; the by-element form shows the index. */
+ const HChar* nm = bitU ? "udot" : "sdot";
+ const HChar* destWidth = nameArr_Q_SZ(bitQ, size);
+ const HChar* srcWidth = nameArr_Q_SZ(bitQ, X00);
+ if (byElement) {
+ DIP("%s v%u.%s, v%u.%s, v%u.4b[%u]\n", nm,
+ dd, destWidth,
+ nn, srcWidth, mm, index);
+ } else {
+ DIP("%s v%u.%s, v%u.%s, v%u.%s\n", nm,
+ dd, destWidth,
+ nn, srcWidth, mm, srcWidth);
+ }
+
+ return True;
+# undef INSN
+}
+
+
static
Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn,
const VexArchInfo* archinfo, Bool sigill_diag)
if (UNLIKELY(ok)) return True;
ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
if (UNLIKELY(ok)) return True;
+ ok = dis_AdvSIMD_dot_product(dres, insn);
+ if (UNLIKELY(ok)) return True;
return False;
}
AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes)
+# Does the C compiler support the -march=armv8.2-a+dotprod flag, and does the
+# assembler accept dotprod instructions?
+# Note, this doesn't generate a C-level symbol.  It generates an
+# automake-level symbol (BUILD_ARMV82_DOTPROD_TESTS), used in test Makefile.am's
+AC_MSG_CHECKING([if gcc supports the armv82-a+dotprod feature flag and assembler supports dotprod instructions])
+
+save_CFLAGS="$CFLAGS"
+CFLAGS="$CFLAGS -march=armv8.2-a+dotprod -Werror"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+int main()
+{
+ __asm__ __volatile__("sdot v1.4s, v2.16b, v3.16b");
+ return 0;
+}
+]])], [
+ac_have_armv82_dotprod_feature=yes
+AC_MSG_RESULT([yes])
+], [
+ac_have_armv82_dotprod_feature=no
+AC_MSG_RESULT([no])
+])
+CFLAGS="$save_CFLAGS"
+
+AM_CONDITIONAL(BUILD_ARMV82_DOTPROD_TESTS, test x$ac_have_armv82_dotprod_feature = xyes)
+
+
# XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this
# when building the tool executables. I think we should get rid of it.
#
| VKI_HWCAP_SHA2 \
| VKI_HWCAP_CRC32 \
| VKI_HWCAP_FP \
- | VKI_HWCAP_ASIMD)
+ | VKI_HWCAP_ASIMD \
+ | VKI_HWCAP_ASIMDDP)
auxv->u.a_val &= ARM64_SUPPORTED_HWCAP;
}
# endif
memory.stdout.exp memory.stderr.exp memory.vgtest \
atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \
simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \
+ simd_dotprod.stdout.exp simd_dotprod.stderr.exp simd_dotprod.vgtest \
fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
fp_and_simd_v82.vgtest \
check_PROGRAMS += fp_and_simd_v82
endif
+if BUILD_ARMV82_DOTPROD_TESTS
+ check_PROGRAMS += simd_dotprod
+endif
+
AM_CFLAGS += @FLAG_M64@
AM_CXXFLAGS += @FLAG_M64@
AM_CCASFLAGS += @FLAG_M64@
crc32_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crc
atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a
simd_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a+crypto
+simd_dotprod_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+dotprod
fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto
fp_and_simd_v82_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+fp16+crypto
integer_CFLAGS = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0
--- /dev/null
+#include <stdio.h>
+#include <assert.h>
+
+typedef unsigned char UChar;
+typedef unsigned int UInt;
+typedef signed int Int;
+
+#define ITERS 1
+/* A vector value viewed as 16 bytes; main() asserts sizeof(V128) == 16. */
+union _V128 {
+ UChar u8[16];
+};
+typedef union _V128 V128;
+/* Returns the next byte of a fixed-seed pseudo-random sequence.  The
+   static |seed| starts at 80021 and is advanced on every call as
+   seed = 1103515245 * seed + 12345; bits 17..24 of the new value are
+   returned.  Because the seed is fixed, the sequence is the same on
+   every run. */
+static inline UChar randUChar ( void )
+{
+ static UInt seed = 80021;
+ seed = 1103515245 * seed + 12345;
+ return (seed >> 17) & 0xFF;
+}
+
+/* Generates a random V128: fills all 16 bytes of |*v| with successive
+   values from randUChar().  Also counts its own invocations and prints
+   a "randV128: N calls" line on every 256th call. */
+static void randV128 ( /*OUT*/V128* v)
+{
+ static UInt nCalls = 0;
+ Int i;
+ nCalls++;
+ for (i = 0; i < 16; i++) {
+ v->u8[i] = randUChar();
+ }
+ if (0 == (nCalls & 0xFF))
+ printf("randV128: %u calls\n", nCalls);
+}
+/* Prints |*v| as 32 hex digits, going from byte 15 down to byte 0.
+   No separator and no trailing newline are printed. */
+static void showV128 ( V128* v )
+{
+ Int i;
+ for (i = 15; i >= 0; i--)
+ printf("%02x", (Int)v->u8[i]);
+}
+/* Function body shared by the test-generating macros that follow.
+   For each of ITERS iterations it fills three V128 blocks with
+   randV128(), loads them into q7, q8 and q9, executes
+   "INSN v9.SUFFIXD, v7.SUFFIXN, v8.SUFFIXM", stores q9 back into
+   block[2], and prints the instruction text followed by the three
+   blocks.  block[2] therefore supplies the initial contents of v9 and
+   receives the result.
+   INSN, SUFFIXD and SUFFIXN are stringified here; SUFFIXM is pasted
+   as-is and so must already be a string literal. */
+#define GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \
+ Int i; \
+ for (i = 0; i < ITERS; i++) { \
+ V128 block[3]; \
+ randV128(&block[0]); \
+ randV128(&block[1]); \
+ randV128(&block[2]); \
+ __asm__ __volatile__( \
+ "ldr q7, [%0, #0];" \
+ "ldr q8, [%0, #16];" \
+ "ldr q9, [%0, #32];" \
+ #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." SUFFIXM " ; " \
+ "str q9, [%0, #32];" \
+ : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \
+ ); \
+ printf(#INSN " v9." #SUFFIXD \
+ ", v7." #SUFFIXN ", v8." SUFFIXM " "); \
+ showV128(&block[0]); printf(" "); \
+ showV128(&block[1]); printf(" "); \
+ showV128(&block[2]); printf("\n"); \
+ } \
+
+#define GEN_BINARY_TEST_BY_ELEM(INSN,SUFFIXD,SUFFIXN,MELEM) \
+ __attribute__((noinline)) \
+ static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_elem_##MELEM () { \
+ GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,"4b[" #MELEM "]") \
+ } /* defines test_INSN_SUFFIXD_SUFFIXN_elem_MELEM; third operand is v8.4b[MELEM] */
+/* Defines test_INSN_SUFFIXD_SUFFIXN_SUFFIXM(), a noinline function that
+   runs GEN_BINARY_TEST_BODY with v8.SUFFIXM as the third operand. */
+#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \
+ __attribute__((noinline)) \
+ static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM () { \
+ GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,#SUFFIXM) \
+ }
+
+GEN_BINARY_TEST(sdot, 2s, 8b, 8b)
+GEN_BINARY_TEST(udot, 2s, 8b, 8b)
+GEN_BINARY_TEST(sdot, 4s, 16b, 16b)
+GEN_BINARY_TEST(udot, 4s, 16b, 16b)
+GEN_BINARY_TEST_BY_ELEM(sdot, 2s, 8b, 0)
+GEN_BINARY_TEST_BY_ELEM(udot, 2s, 8b, 1)
+GEN_BINARY_TEST_BY_ELEM(sdot, 4s, 16b, 2)
+GEN_BINARY_TEST_BY_ELEM(udot, 4s, 16b, 3)
+/* Checks that V128 is 16 bytes, then runs the four by-element tests
+   followed by the four vector tests.  The tests share one random
+   sequence (the static seed in randUChar), so the order of the calls
+   fixes both the order and the contents of the printed lines. */
+int main ( void )
+{
+ assert(sizeof(V128) == 16);
+
+ // ======================== {S,U}DOT by element ====================
+ // sdot 2s,8b,4b[0]
+ // udot 2s,8b,4b[1]
+ // sdot 4s,16b,4b[2]
+ // udot 4s,16b,4b[3]
+ test_sdot_2s_8b_elem_0();
+ test_udot_2s_8b_elem_1();
+ test_sdot_4s_16b_elem_2();
+ test_udot_4s_16b_elem_3();
+
+ // ======================== {S,U}DOT vector ========================
+ // sdot 2s,8b,8b
+ // udot 2s,8b,8b
+ // sdot 4s,16b,16b
+ // udot 4s,16b,16b
+ test_sdot_2s_8b_8b();
+ test_udot_2s_8b_8b();
+ test_sdot_4s_16b_16b();
+ test_udot_4s_16b_16b();
+
+ return 0;
+}
--- /dev/null
+sdot v9.2s, v7.8b, v8.4b[0] 5175e39d19c9ca1e98f24a4984175700 7d6528c5fa956a0d69c3e9a6af27d13b 000000000000000047b8fac3eeef3914
+udot v9.2s, v7.8b, v8.4b[1] b6d2fb5aa7bc5127fe9915e556a044b2 19a348215c3a67fd399182c2dbcc2d38 0000000000000000842c23cf5066b549
+sdot v9.4s, v7.16b, v8.4b[2] d89998df5035ed364a4bc43968bc40e5 cb509970b8136c85d740b80eb7839b97 f9dd31bff8c05f5456afd620b0ca1b30
+udot v9.4s, v7.16b, v8.4b[3] 5ff85bc9535c191fd3a727d1a705f65d d8bc5c6dee699597398e0039cf03663d 20a33823cbca1faf542f38453df87d2b
+sdot v9.2s, v7.8b, v8.8b d182c916cebc2e17cfaff39be272ef40 6897b536bbe4da8a369dab4f9465b86e 0000000000000000f4e068450523c8a1
+udot v9.2s, v7.8b, v8.8b 95264321bf3b68b255c2b9e2c95c9810 81f2a547be8d181184ededbc53239dcf 00000000000000008d6b78e8f7e97e90
+sdot v9.4s, v7.16b, v8.16b f0350ca70523e0e45ba1ec54e87d39b3 0a3e0f7c75cb0842b95ed64d3b13ff64 e98e9eeaa89323fc54cac842e13de403
+udot v9.4s, v7.16b, v8.16b 0a5f45c55f1c9202b76ddefcb0ebfe6e c84ab713406845904d325b2d5a70a792 5f49643cced88b926263a4c2727e0a11
--- /dev/null
+prog: simd_dotprod
+prereq: test -x simd_dotprod && ../../../tests/arm64_features asimddp
+vgopts: -q