From: Julian Seward Date: Thu, 7 Jan 2021 07:34:14 +0000 (+0100) Subject: Bug 413547 - regression test does not check for Arm 64 features. X-Git-Tag: VALGRIND_3_17_0~78 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3b1710d38cf19619242c9113a2dbe291e914a8c2;p=thirdparty%2Fvalgrind.git Bug 413547 - regression test does not check for Arm 64 features. Patches from/by Assad Hashmi (assad.hashmi@linaro.org). --- diff --git a/.gitignore b/.gitignore index edb8edd22b..dff20848e6 100644 --- a/.gitignore +++ b/.gitignore @@ -2078,6 +2078,7 @@ /tests/true /tests/vg_regtest /tests/x86_amd64_features +/tests/arm64_features # /VEX/ /VEX/libvex*.a diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 1121ce88af..a296565591 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -993,7 +993,7 @@ static IROp mkVecQSHLNSATSU ( UInt size ) { static IROp mkVecADDF ( UInt size ) { const IROp ops[4] - = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 }; + = { Iop_INVALID, Iop_Add16Fx8, Iop_Add32Fx4, Iop_Add64Fx2 }; vassert(size < 4); return ops[size]; } @@ -9806,7 +9806,8 @@ Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn) static -Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn) +Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn, + const VexArchInfo* archinfo) { /* 31 28 23 21 16 11 9 4 01 u 11110 sz 11000 opcode 10 n d @@ -9857,6 +9858,27 @@ Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + /* Half-precision floating point ADDP (v8.2). 
*/ + if (bitU == 0 && sz <= X00 && opcode == BITS5(0,1,1,0,1)) { + /* -------- 0,00,01101 ADDP h_2h -------- */ + if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0) + return False; + IROp opZHI = mkVecZEROHIxxOFV128(1); + IROp opADD = mkVecADDF(1); + IRTemp src = newTempV128(); + IRTemp argL = newTempV128(); + IRTemp argR = newTempV128(); + assign(src, getQReg128(nn)); + assign(argL, unop(opZHI, mkexpr(src))); + assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src), + mkU8(2)))); + putQReg128(dd, unop(opZHI, + triop(opADD, mkexpr(mk_get_IR_rounding_mode()), + mkexpr(argL), mkexpr(argR)))); + DIP("faddp h%u, v%u.2h\n", dd, nn); + return True; + } + if (bitU == 1 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) { /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */ @@ -14946,7 +14968,8 @@ Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn) static -Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) +Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, + const VexArchInfo* archinfo) { Bool ok; ok = dis_AdvSIMD_EXT(dres, insn); @@ -14963,7 +14986,7 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_scalar_copy(dres, insn); if (UNLIKELY(ok)) return True; - ok = dis_AdvSIMD_scalar_pairwise(dres, insn); + ok = dis_AdvSIMD_scalar_pairwise(dres, insn, archinfo); if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn); if (UNLIKELY(ok)) return True; @@ -15175,7 +15198,7 @@ Bool disInstr_ARM64_WRK ( break; case BITS4(0,1,1,1): case BITS4(1,1,1,1): // Data processing - SIMD and floating point - ok = dis_ARM64_simd_and_fp(dres, insn); + ok = dis_ARM64_simd_and_fp(dres, insn, archinfo); break; case BITS4(0,0,0,0): case BITS4(0,0,0,1): case BITS4(0,0,1,0): case BITS4(0,0,1,1): diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index 526da570a5..e6b06e5fbe 100644 --- a/VEX/priv/host_arm64_defs.c 
+++ b/VEX/priv/host_arm64_defs.c @@ -600,6 +600,7 @@ static void showARM64VecBinOp(/*OUT*/const HChar** nm, case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return; case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return; case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return; + case ARM64vecb_FADD16x8: *nm = "fadd "; *ar = "8h"; return; case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return; case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return; case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return; @@ -2869,6 +2870,7 @@ static inline UInt qregEnc ( HReg r ) #define X000010 BITS8(0,0, 0,0,0,0,1,0) #define X000011 BITS8(0,0, 0,0,0,0,1,1) #define X000100 BITS8(0,0, 0,0,0,1,0,0) +#define X000101 BITS8(0,0, 0,0,0,1,0,1) #define X000110 BITS8(0,0, 0,0,0,1,1,0) #define X000111 BITS8(0,0, 0,0,0,1,1,1) #define X001000 BITS8(0,0, 0,0,1,0,0,0) @@ -4831,6 +4833,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64vecb_FADD32x4: *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X110101, vN, vD); break; + case ARM64vecb_FADD16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110010, vM, X000101, vN, vD); + break; case ARM64vecb_FSUB64x2: *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X110101, vN, vD); break; diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 105d7ce843..8cece7b9c5 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -335,6 +335,7 @@ typedef ARM64vecb_MUL32x4, ARM64vecb_MUL16x8, ARM64vecb_MUL8x16, ARM64vecb_FADD64x2, ARM64vecb_FADD32x4, + ARM64vecb_FADD16x8, ARM64vecb_FSUB64x2, ARM64vecb_FSUB32x4, ARM64vecb_FMUL64x2, ARM64vecb_FMUL32x4, ARM64vecb_FDIV64x2, ARM64vecb_FDIV32x4, diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 1b8ad20a5a..c0464abf33 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -3157,6 +3157,7 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Sub32Fx4: vecbop = ARM64vecb_FSUB32x4; break; case Iop_Mul32Fx4: vecbop = 
ARM64vecb_FMUL32x4; break; case Iop_Div32Fx4: vecbop = ARM64vecb_FDIV32x4; break; + case Iop_Add16Fx8: vecbop = ARM64vecb_FADD16x8; break; default: break; } if (vecbop != ARM64vecb_INVALID) { diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 1359abb3f2..2734776f5b 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -640,6 +640,7 @@ void ppIROp ( IROp op ) case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return; case Iop_CmpNEZ8x8: vex_printf("CmpNEZ8x8"); return; + case Iop_Add16Fx8: vex_printf("Add16Fx8"); return; case Iop_Add32Fx4: vex_printf("Add32Fx4"); return; case Iop_Add32Fx2: vex_printf("Add32Fx2"); return; case Iop_Add32F0x4: vex_printf("Add32F0x4"); return; @@ -1546,6 +1547,7 @@ Bool primopMightTrap ( IROp op ) case Iop_DPBtoBCD: case Iop_BCDtoDPB: case Iop_BCDAdd: case Iop_BCDSub: case Iop_I128StoBCD128: case Iop_BCD128toI128S: case Iop_ReinterpI64asD64: case Iop_ReinterpD64asI64: + case Iop_Add16Fx8: case Iop_Add32Fx4: case Iop_Sub32Fx4: case Iop_Mul32Fx4: case Iop_Div32Fx4: case Iop_Max32Fx4: case Iop_Min32Fx4: case Iop_Add32Fx2: case Iop_Sub32Fx2: @@ -3760,6 +3762,7 @@ void typeOfPrimop ( IROp op, case Iop_Mul64Fx2: case Iop_Div64Fx2: case Iop_Add32Fx4: case Iop_Sub32Fx4: case Iop_Mul32Fx4: case Iop_Div32Fx4: + case Iop_Add16Fx8: case Iop_F64x2_2toQ32x4: case Iop_F32x4_2toQ16x8: TERNARY(ity_RMode,Ity_V128,Ity_V128, Ity_V128); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 6a854e43f1..00899e5335 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1344,6 +1344,11 @@ typedef /* ------------------ 128-bit SIMD FP. 
------------------ */ + /* --- 16x8 vector FP --- */ + + /* ternary :: IRRoundingMode(I16) x V128 x V128 -> V128 */ + Iop_Add16Fx8, + /* --- 32x4 vector FP --- */ /* ternary :: IRRoundingMode(I32) x V128 x V128 -> V128 */ diff --git a/configure.ac b/configure.ac index 2b949ed84e..41ae942429 100755 --- a/configure.ac +++ b/configure.ac @@ -3240,6 +3240,31 @@ CFLAGS="$save_CFLAGS" AM_CONDITIONAL(BUILD_ARMV81_TESTS, test x$ac_have_armv81_feature = xyes) +# Does the C compiler support the armv82 flag and the assembler v8.2 instructions +# Note, this doesn't generate a C-level symbol. It generates a +# automake-level symbol (BUILD_ARMV82_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if gcc supports the armv82 feature flag and assembler supports v8.2 instructions]) + +save_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -march=armv8.2-a+fp16 -Werror" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +int main() +{ + __asm__ __volatile__("faddp h0, v1.2h"); + return 0; +} +]])], [ +ac_have_armv82_feature=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_armv82_feature=no +AC_MSG_RESULT([no]) +]) +CFLAGS="$save_CFLAGS" + +AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes) + + # XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this # when building the tool executables. I think we should get rid of it. # diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index e91d510946..91f23ed60e 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -2692,6 +2692,23 @@ IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX ) return at; } +/* --- --- ... and ... 
16Fx8 versions of the same --- --- */ + +static +IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY ) +{ + IRAtom* at; + tl_assert(isShadowAtom(mce, vatomX)); + tl_assert(isShadowAtom(mce, vatomY)); + at = mkUifUV128(mce, vatomX, vatomY); + at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at)); + return at; +} + +/* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is + implemented. +*/ + /* --- --- ... and ... 32Fx2 versions of the same --- --- */ static @@ -2806,6 +2823,24 @@ IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, return t1; } +/* --- ... and ... 16Fx8 versions of the same --- */ + +static +IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, + IRAtom* vatomX, IRAtom* vatomY ) +{ + IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY); + // PCast the RM, and widen it to 128 bits + IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM); + // Roll it into the result + t1 = mkUifUV128(mce, t1, t2); + return t1; +} + +/* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is + implemented. +*/ + /* --- ... and ... 32Fx8 versions of the same --- */ static @@ -3393,6 +3428,12 @@ IRAtom* expr2vbits_Triop ( MCEnv* mce, case Iop_Div64Fx4: return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3); + /* TODO: remaining versions of 16x4 FP ops when more of the half-precision + IR is implemented. + */ + case Iop_Add16Fx8: + return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3); + case Iop_Add32Fx8: case Iop_Sub32Fx8: case Iop_Mul32Fx8: diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c index 1f1ba909ac..39939b711d 100644 --- a/memcheck/tests/vbit-test/irops.c +++ b/memcheck/tests/vbit-test/irops.c @@ -613,6 +613,7 @@ static irop_t irops[] = { { DEFOP(Iop_ReinterpI64asD64, UNDEF_SAME), .s390x = 1, .ppc64 = 1, .ppc32 = 1 }, { DEFOP(Iop_ReinterpD64asI64, UNDEF_SAME), .s390x = 1, .ppc64 = 1, .ppc32 = 1 }, /* ------------------ 128-bit SIMD FP. 
------------------ */ + { DEFOP(Iop_Add16Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Add32Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sub32Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_Mul32Fx4, UNDEF_UNKNOWN), }, diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am index 4ecab36add..00cbfa52c1 100644 --- a/none/tests/arm64/Makefile.am +++ b/none/tests/arm64/Makefile.am @@ -11,7 +11,8 @@ EXTRA_DIST = \ memory.stdout.exp memory.stderr.exp memory.vgtest \ atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ - fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest + fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ + fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp fp_and_simd_v82.vgtest check_PROGRAMS = \ allexec \ @@ -29,6 +30,10 @@ if BUILD_ARMV81_TESTS check_PROGRAMS += atomics_v81 simd_v81 endif +if BUILD_ARMV82_TESTS + check_PROGRAMS += fp_and_simd_v82 +endif + AM_CFLAGS += @FLAG_M64@ AM_CXXFLAGS += @FLAG_M64@ AM_CCASFLAGS += @FLAG_M64@ @@ -39,7 +44,9 @@ crc32_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crc atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a simd_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a+crypto fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto +fp_and_simd_v82_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+fp16+crypto integer_CFLAGS = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0 fp_and_simd_LDADD = -lm simd_v81_LDADD = -lm +fp_and_simd_v82_LDADD = -lm diff --git a/none/tests/arm64/fp_and_simd_v82.c b/none/tests/arm64/fp_and_simd_v82.c new file mode 100644 index 0000000000..8c66ff27bd --- /dev/null +++ b/none/tests/arm64/fp_and_simd_v82.c @@ -0,0 +1,2285 @@ + +#include +#include +#include // memalign +#include // memset +#include "tests/malloc.h" +#include // isnormal + +typedef unsigned char UChar; +typedef unsigned short int UShort; +typedef unsigned int UInt; +typedef signed int Int; +typedef unsigned char UChar; +typedef unsigned long long int ULong; +typedef signed long long 
int Long; +typedef double Double; +typedef float Float; +/* To test half-precision floating point instructions a synthesized 16 bit type + is used rather than native __fp16. This allows gradual support of v8.2 + instructions without test binaries like this failing to run with Valgrind + because a half-precision instruction which is not supported appears in the + test binary. The functions halfToSingleFPAsInt() and shortToSingle() below + are used to create a Float16 type for testing purposes. Float16 should be + typedefed to __fp16 when all v8.2 instructions are supported. +*/ +typedef unsigned short int Float16; + +typedef unsigned char Bool; +#define False ((Bool)0) +#define True ((Bool)1) + + +#define ITERS 1 + +typedef + enum { TyHF=1234, TySF, TyDF, TyB, TyH, TyS, TyD, TyNONE } + LaneTy; + +union _V128 { + UChar u8[16]; + UShort u16[8]; + UInt u32[4]; + ULong u64[2]; + Float16 f16[8]; + Float f32[4]; + Double f64[2]; +}; +typedef union _V128 V128; + +/* Conversion based on IEEE half-precision, as described in the IEEE 754-2008 + standard and Arm Reference Manual 'A1.4.2 Half-precision floating-point + formats' where hardware capability supports __fp16 (VEX_HWCAPS_ARM64_FP16 + and VEX_HWCAPS_ARM64_VFP16 set). 
+*/ +static UInt halfToSingleFPAsInt(UShort y) +{ + int s = (y >> 15) & 0x00000001; // Sign bit + int e = (y >> 10) & 0x0000001f; // Exponent + int f = y & 0x000003ff; // Fraction + + // Handle +/- INF (7c00 and fc00 -INF) and +/-0 + if (e == 0) { + if (f == 0) + return s << 31; + else { // Normalize + while (!(f & 0x00000400)) { + f <<= 1; + e -= 1; + } + e += 1; + f &= ~0x00000400; + } + } else if (e == 31) { + if (f == 0) // INF + return (s << 31) | 0x7f800000; + else // NaN + return (s << 31) | 0x7f800000 | (f << 13); + } + + e = e + (127 - 15); + f = f << 13; + + return ((s << 31) | (e << 23) | f); +} + +static float shortToSingle(UShort imm) +{ + union { float f; UInt i; } v; + v.i = halfToSingleFPAsInt(imm); + return v.f; +} + +static inline UChar randUChar ( void ) +{ + static UInt seed = 80021; + seed = 1103515245 * seed + 12345; + return (seed >> 17) & 0xFF; +} + +static ULong randULong ( LaneTy ty ) +{ + Int i; + ULong r = 0; + for (i = 0; i < 8; i++) { + r = (r << 8) | (ULong)(0xFF & randUChar()); + } + return r; +} + +/* Generates a random V128. Ensures that that it contains normalised FP numbers + when viewed as either F16x8, F32x4 or F64x2, so that it is reasonable to use + in FP test cases. 
*/ +static void randV128 ( /*OUT*/V128* v, LaneTy ty ) +{ + static UInt nCalls = 0, nIters = 0; + Int i; + nCalls++; + while (1) { + nIters++; + for (i = 0; i < 16; i++) { + v->u8[i] = randUChar(); + } + if (isnormal(v->f32[0]) && isnormal(v->f32[1]) && isnormal(v->f32[2]) + && isnormal(v->f32[3]) && isnormal(v->f64[0]) && isnormal(v->f64[1]) + && isnormal(shortToSingle(v->f16[0])) && isnormal(shortToSingle(v->f16[1])) + && isnormal(shortToSingle(v->f16[2])) && isnormal(shortToSingle(v->f16[3])) + && isnormal(shortToSingle(v->f16[4])) && isnormal(shortToSingle(v->f16[5])) + && isnormal(shortToSingle(v->f16[6])) && isnormal(shortToSingle(v->f16[7]))) { + break; + } + } + if (0 == (nCalls & 0xFF)) + printf("randV128: %u calls, %u iters\n", nCalls, nIters); +} + +static void showV128 ( V128* v ) +{ + Int i; + for (i = 15; i >= 0; i--) + printf("%02x", (Int)v->u8[i]); +} + +static void showBlock ( const char* msg, V128* block, Int nBlock ) +{ + Int i; + printf("%s\n", msg); + for (i = 0; i < nBlock; i++) { + printf(" "); + showV128(&block[i]); + printf("\n"); + } +} + +static ULong dup4x16 ( UInt x ) +{ + ULong r = x & 0xF; + r |= (r << 4); + r |= (r << 8); + r |= (r << 16); + r |= (r << 32); + return r; +} + +// Generate a random double-precision number. About 1 time in 2, +// instead return a special value (+/- Inf, +/-Nan, denorm). +// This ensures that many of the groups of 4 calls here will +// return a special value. 
+ +static Double special_values[10]; +static Bool special_values_initted = False; + +static __attribute__((noinline)) +Double negate ( Double d ) { return -d; } +static __attribute__((noinline)) +Double divf64 ( Double x, Double y ) { return x/y; } + +static __attribute__((noinline)) +Double plusZero ( void ) { return 0.0; } +static __attribute__((noinline)) +Double minusZero ( void ) { return negate(plusZero()); } + +static __attribute__((noinline)) +Double plusOne ( void ) { return 1.0; } +static __attribute__((noinline)) +Double minusOne ( void ) { return negate(plusOne()); } + +static __attribute__((noinline)) +Double plusInf ( void ) { return 1.0 / 0.0; } +static __attribute__((noinline)) +Double minusInf ( void ) { return negate(plusInf()); } + +static __attribute__((noinline)) +Double plusNaN ( void ) { return divf64(plusInf(),plusInf()); } +static __attribute__((noinline)) +Double minusNaN ( void ) { return negate(plusNaN()); } + +static __attribute__((noinline)) +Double plusDenorm ( void ) { return 1.23e-315 / 1e3; } +static __attribute__((noinline)) +Double minusDenorm ( void ) { return negate(plusDenorm()); } + + +static void ensure_special_values_initted ( void ) +{ + if (special_values_initted) return; + special_values[0] = plusZero(); + special_values[1] = minusZero(); + special_values[2] = plusOne(); + special_values[3] = minusOne(); + special_values[4] = plusInf(); + special_values[5] = minusInf(); + special_values[6] = plusNaN(); + special_values[7] = minusNaN(); + special_values[8] = plusDenorm(); + special_values[9] = minusDenorm(); + special_values_initted = True; + int i; + printf("\n"); + for (i = 0; i < 10; i++) { + printf("special value %d = %e\n", i, special_values[i]); + } + printf("\n"); +} + +static Double randDouble ( void ) +{ + ensure_special_values_initted(); + UChar c = randUChar(); + if (c >= 128) { + // return a normal number about half of the time. + // 0 .. 2^63-1 + ULong u64 = randULong(TyDF); + // -2^62 .. 
2^62-1 + Long s64 = (Long)u64; + // -2^55 .. 2^55-1 + s64 >>= (62-55); + // and now as a float + return (Double)s64; + } + c = randUChar() % 10; + return special_values[c]; +} + +static Float randFloat ( void ) +{ + ensure_special_values_initted(); + UChar c = randUChar(); + if (c >= 128) { + // return a normal number about half of the time. + // 0 .. 2^63-1 + ULong u64 = randULong(TyDF); + // -2^62 .. 2^62-1 + Long s64 = (Long)u64; + // -2^25 .. 2^25-1 + s64 >>= (62-25); + // and now as a float + return (Float)s64; + } + c = randUChar() % 10; + return special_values[c]; +} + +void randBlock_Doubles ( V128* block, Int nBlock ) +{ + Int i; + for (i = 0; i < nBlock; i++) { + block[i].f64[0] = randDouble(); + block[i].f64[1] = randDouble(); + } +} + +void randBlock_Floats ( V128* block, Int nBlock ) +{ + Int i; + for (i = 0; i < nBlock; i++) { + block[i].f32[0] = randFloat(); + block[i].f32[1] = randFloat(); + block[i].f32[2] = randFloat(); + block[i].f32[3] = randFloat(); + } +} + + +/* ---------------------------------------------------------------- */ +/* -- Parameterisable test macros -- */ +/* ---------------------------------------------------------------- */ + +#define DO50(_action) \ + do { \ + Int _qq; for (_qq = 0; _qq < 50; _qq++) { _action ; } \ + } while (0) + + +/* Note this also sets the destination register to a known value (0x55..55) + since it can sometimes be an input to the instruction too. */ +#define GEN_UNARY_TEST(INSN,SUFFIXD,SUFFIXN) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[2+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q7, [%0, #0] ; " \ + "ldr q8, [%0, #16] ; " \ + #INSN " v8." #SUFFIXD ", v7." 
#SUFFIXN " ; " \ + "str q8, [%0, #16] ; " \ + "mrs x30, fpsr ; str x30, [%0, #32] " \ + : : "r"(&block[0]) : "memory", "v7", "v8", "x30" \ + ); \ + printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN); \ + UInt fpsr = 0xFFFFFF60 & block[2].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Note this also sets the destination register to a known value (0x55..55) + since it can sometimes be an input to the instruction too. */ +#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[3+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q7, [%0, #0] ; " \ + "ldr q8, [%0, #16] ; " \ + "ldr q9, [%0, #32] ; " \ + #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." #SUFFIXM " ; " \ + "str q9, [%0, #32] ; " \ + "mrs x30, fpsr ; str x30, [%0, #48] " \ + : : "r"(&block[0]) : "memory", "v7", "v8", "v9", "x30" \ + ); \ + printf(#INSN " v9." #SUFFIXD \ + ", v7." #SUFFIXN ", v8." #SUFFIXM " "); \ + UInt fpsr = 0xFFFFFF60 & block[3].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Note this also sets the destination register to a known value (0x55..55) + since it can sometimes be an input to the instruction too. 
*/ +#define GEN_SHIFT_TEST(INSN,SUFFIXD,SUFFIXN,AMOUNT) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##AMOUNT ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[2+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q7, [%0, #0] ; " \ + "ldr q8, [%0, #16] ; " \ + #INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " ; " \ + "str q8, [%0, #16] ; " \ + "mrs x30, fpsr ; str x30, [%0, #32] " \ + : : "r"(&block[0]) : "memory", "v7", "v8", "x30" \ + ); \ + printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " "); \ + UInt fpsr = 0xFFFFFF60 & block[2].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test that involves one integer reg and one vector reg, + with no bias as towards which is input or output. */ +#define GEN_ONEINT_ONEVEC_TEST(TESTNAME,INSN,INTREGNO,VECREGNO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + assert(INTREGNO != 30); \ + for (i = 0; i < ITERS; i++) { \ + V128 block[4+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREGNO", [%0, #0] ; " \ + "ldr x"#INTREGNO", [%0, #16] ; " \ + INSN " ; " \ + "str q"#VECREGNO", [%0, #32] ; " \ + "str x"#INTREGNO", [%0, #48] ; " \ + "mrs x30, fpsr ; str x30, [%0, #64] " \ + : : "r"(&block[0]) : "memory", "v"#VECREGNO, "x"#INTREGNO, "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[4].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test 
that involves two vector regs, + with no bias as towards which is input or output. + It's OK to use x10 as scratch.*/ +#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[4+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREG1NO", [%0, #0] ; " \ + "ldr q"#VECREG2NO", [%0, #16] ; " \ + INSN " ; " \ + "str q"#VECREG1NO", [%0, #32] ; " \ + "str q"#VECREG2NO", [%0, #48] ; " \ + "mrs x30, fpsr ; str x30, [%0, #64] " \ + : : "r"(&block[0]) \ + : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "x10", "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[4].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test that involves three vector regs, + with no bias as towards which is input or output. It's also OK + to use v16, v17, v18 as scratch. 
*/ +#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[6+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + randV128(&block[4], ty); \ + randV128(&block[5], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREG1NO", [%0, #0] ; " \ + "ldr q"#VECREG2NO", [%0, #16] ; " \ + "ldr q"#VECREG3NO", [%0, #32] ; " \ + INSN " ; " \ + "str q"#VECREG1NO", [%0, #48] ; " \ + "str q"#VECREG2NO", [%0, #64] ; " \ + "str q"#VECREG3NO", [%0, #80] ; " \ + "mrs x30, fpsr ; str x30, [%0, #96] " \ + : : "r"(&block[0]) \ + : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "v"#VECREG3NO, \ + "v16", "v17", "v18", "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[6].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" "); \ + showV128(&block[4]); printf(" "); \ + showV128(&block[5]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test that involves four vector regs, + with no bias as towards which is input or output. It's also OK + to use v16, v17, v18 as scratch. 
*/ +#define GEN_FOURVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO, \ + VECREG3NO,VECREG4NO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[8+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + randV128(&block[4], ty); \ + randV128(&block[5], ty); \ + randV128(&block[6], ty); \ + randV128(&block[7], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREG1NO", [%0, #0] ; " \ + "ldr q"#VECREG2NO", [%0, #16] ; " \ + "ldr q"#VECREG3NO", [%0, #32] ; " \ + "ldr q"#VECREG4NO", [%0, #48] ; " \ + INSN " ; " \ + "str q"#VECREG1NO", [%0, #64] ; " \ + "str q"#VECREG2NO", [%0, #80] ; " \ + "str q"#VECREG3NO", [%0, #96] ; " \ + "str q"#VECREG4NO", [%0, #112] ; " \ + "mrs x30, fpsr ; str x30, [%0, #128] " \ + : : "r"(&block[0]) \ + : "memory", "v"#VECREG1NO, "v"#VECREG2NO, \ + "v"#VECREG3NO, "v"#VECREG4NO, \ + "v16", "v17", "v18", "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[8].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" "); \ + showV128(&block[4]); printf(" "); \ + showV128(&block[5]); printf(" "); \ + showV128(&block[6]); printf(" "); \ + showV128(&block[7]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* ---------------------------------------------------------------- */ +/* -- Test functions and non-parameterisable test macros -- */ +/* ---------------------------------------------------------------- */ + +void test_UMINV ( void ) +{ + int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv s8, v7.4s ; " + "str q8, [%0, #16] " + : : 
"r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV b8, v7.8b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + +} + + +void test_UMAXV ( void ) +{ + int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv 
s8, v7.4s ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV b8, v7.8b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + +} + + +void test_INS_general ( void ) +{ + V128 block[3]; + + /* -- D[0..1] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyD); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, 
[%0, #16] ; " + "ins v7.d[0], x19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.u64[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyD); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.d[1], x19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.d[1],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + /* -- S[0..3] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[0], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.s[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[1], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.s[1],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[2], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.s[2],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[3], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", 
"x19", "v7" + ); + printf("INS v7.s[3],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + /* -- H[0..7] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[0], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[1], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[1],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[2], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[2],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[3], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[3],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[4], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[4],x19 "); + showV128(&block[0]); printf(" %016llx ", 
block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[5], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[5],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[6], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[6],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[7], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[7],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + /* -- B[0,15] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.b[0], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.b[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.b[15], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.b[15],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); +} + + + +void test_SMINV ( void ) +{ 
+ int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv s8, v7.4s ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV b8, v7.8b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); 
+ } + +} + + +void test_SMAXV ( void ) +{ + int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv s8, v7.4s ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV b8, v7.8b "); + showV128(&block[0]); 
printf(" "); + showV128(&block[1]); printf("\n"); + } + +} + + +//======== FCCMP_D ========// + +#define GEN_test_FCCMP_D_D_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMP_D_D_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_D_D_0xF_NE \ + __attribute__((noinline)) static void test_FCCMP_D_D_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_D_D_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMP_D_D_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, 
[%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_D_D_0x0_NE \ + __attribute__((noinline)) static void test_FCCMP_D_D_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCCMP_S ========// + +#define GEN_test_FCCMP_S_S_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMP_S_S_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_S_S_0xF_NE \ + __attribute__((noinline)) static void test_FCCMP_S_S_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr 
x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_S_S_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMP_S_S_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_S_S_0x0_NE \ + __attribute__((noinline)) static void test_FCCMP_S_S_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCCMPE_D ========// + +#define GEN_test_FCCMPE_D_D_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); 
\ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_D_D_0xF_NE \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_D_D_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define 
GEN_test_FCCMPE_D_D_0x0_NE \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCCMPE_S ========// + +#define GEN_test_FCCMPE_S_S_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMPE_S_S_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe s29, s11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_S_S_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_S_S_0xF_NE \ + __attribute__((noinline)) static void test_FCCMPE_S_S_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_S_S_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe s29, s11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, 
[%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_S_S_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_S_S_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMPE_S_S_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe s29, s11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_S_S_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_S_S_0x0_NE \ + __attribute__((noinline)) static void test_FCCMPE_S_S_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe s29, s11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_S_S_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMEQ_D_D_D ========// + +#define GEN_test_FCMEQ_D_D_D \ + __attribute__((noinline)) static void test_FCMEQ_D_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMEQ_D_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, 
[%0, #16]; ldr q9, [%0, #32]; " \ + "fcmeq d29, d11, d9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMEQ_D_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMEQ_S_S_S ========// + +#define GEN_test_FCMEQ_S_S_S \ + __attribute__((noinline)) static void test_FCMEQ_S_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMEQ_S_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmeq s29, s11, s9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMEQ_S_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGE_D_D_D ========// + +#define GEN_test_FCMGE_D_D_D \ + __attribute__((noinline)) static void test_FCMGE_D_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMGE_D_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmge d29, d11, d9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGE_D_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGE_S_S_S ========// + +#define GEN_test_FCMGE_S_S_S \ + __attribute__((noinline)) static void test_FCMGE_S_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = 
dup4x16(0xA); \ + showBlock("FCMGE_S_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmge s29, s11, s9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGE_S_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGT_D_D_D ========// + +#define GEN_test_FCMGT_D_D_D \ + __attribute__((noinline)) static void test_FCMGT_D_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMGT_D_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmgt d29, d11, d9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGT_D_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGT_S_S_S ========// + +#define GEN_test_FCMGT_S_S_S \ + __attribute__((noinline)) static void test_FCMGT_S_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMGT_S_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmgt s29, s11, s9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGT_S_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FACGT_D_D_D ========// + +#define GEN_test_FACGT_D_D_D \ + 
__attribute__((noinline)) static void test_FACGT_D_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FACGT_D_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "facgt d29, d11, d9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FACGT_D_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FACGT_S_S_S ========// + +#define GEN_test_FACGT_S_S_S \ + __attribute__((noinline)) static void test_FACGT_S_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FACGT_S_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "facgt s29, s11, s9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FACGT_S_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FACGE_D_D_D ========// + +#define GEN_test_FACGE_D_D_D \ + __attribute__((noinline)) static void test_FACGE_D_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FACGE_D_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "facge d29, d11, d9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : 
"x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FACGE_D_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FACGE_S_S_S ========// + +#define GEN_test_FACGE_S_S_S \ + __attribute__((noinline)) static void test_FACGE_S_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FACGE_S_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "facge s29, s11, s9; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FACGE_S_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMEQ_D_D_Z ========// + +#define GEN_test_FCMEQ_D_D_Z \ + __attribute__((noinline)) static void test_FCMEQ_D_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMEQ_D_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmeq d29, d11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMEQ_D_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMEQ_S_S_Z ========// + +#define GEN_test_FCMEQ_S_S_Z \ + __attribute__((noinline)) static void test_FCMEQ_S_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMEQ_S_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; 
" \ + "fcmeq s29, s11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMEQ_S_S_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGE_D_D_Z ========// + +#define GEN_test_FCMGE_D_D_Z \ + __attribute__((noinline)) static void test_FCMGE_D_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMGE_D_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmge d29, d11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGE_D_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGE_S_S_Z ========// + +#define GEN_test_FCMGE_S_S_Z \ + __attribute__((noinline)) static void test_FCMGE_S_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMGE_S_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmge s29, s11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGE_S_S_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGT_D_D_Z ========// + +#define GEN_test_FCMGT_D_D_Z \ + __attribute__((noinline)) static void test_FCMGT_D_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + 
showBlock("FCMGT_D_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmgt d29, d11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGT_D_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMGT_S_S_Z ========// + +#define GEN_test_FCMGT_S_S_Z \ + __attribute__((noinline)) static void test_FCMGT_S_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMGT_S_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmgt s29, s11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMGT_S_S_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMLE_D_D_Z ========// + +#define GEN_test_FCMLE_D_D_Z \ + __attribute__((noinline)) static void test_FCMLE_D_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMLE_D_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmle d29, d11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMLE_D_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMLE_S_S_Z ========// + +#define GEN_test_FCMLE_S_S_Z \ + __attribute__((noinline)) static void 
test_FCMLE_S_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMLE_S_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmle s29, s11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMLE_S_S_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMLT_D_D_Z ========// + +#define GEN_test_FCMLT_D_D_Z \ + __attribute__((noinline)) static void test_FCMLT_D_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMLT_D_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmlt d29, d11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMLT_D_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMLT_S_S_Z ========// + +#define GEN_test_FCMLT_S_S_Z \ + __attribute__((noinline)) static void test_FCMLT_S_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMLT_S_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmlt s29, s11, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMLT_S_S_Z 
after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMP_D_D ========// + +#define GEN_test_FCMP_D_D \ + __attribute__((noinline)) static void test_FCMP_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMP_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmp d29, d11; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMP_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMP_S_S ========// + +#define GEN_test_FCMP_S_S \ + __attribute__((noinline)) static void test_FCMP_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMP_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmp s29, s11; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMP_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMPE_D_D ========// + +#define GEN_test_FCMPE_D_D \ + __attribute__((noinline)) static void test_FCMPE_D_D ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMPE_D_D before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmpe d29, d11; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, 
[%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMPE_D_D after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMPE_S_S ========// + +#define GEN_test_FCMPE_S_S \ + __attribute__((noinline)) static void test_FCMPE_S_S ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMPE_S_S before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmpe s29, s11; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMPE_S_S after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMP_D_Z ========// + +#define GEN_test_FCMP_D_Z \ + __attribute__((noinline)) static void test_FCMP_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMP_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmp d29, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMP_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMP_S_Z ========// + +#define GEN_test_FCMP_S_Z \ + __attribute__((noinline)) static void test_FCMP_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMP_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmp 
s29, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMP_S_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMPE_D_Z ========// + +#define GEN_test_FCMPE_D_Z \ + __attribute__((noinline)) static void test_FCMPE_D_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMPE_D_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmpe d29, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMPE_D_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCMPE_S_Z ========// + +#define GEN_test_FCMPE_S_Z \ + __attribute__((noinline)) static void test_FCMPE_S_Z ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCMPE_S_Z before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcmpe s29, #0; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCMPE_S_Z after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCSEL_D_D_D_EQ ========// + +#define GEN_test_FCSEL_D_D_D_EQ \ + __attribute__((noinline)) static void test_FCSEL_D_D_D_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCSEL_D_D_D_EQ before", &block[0], 4); \ + 
__asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcsel d29, d11, d9, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCSEL_D_D_D_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCSEL_D_D_D_NE ========// + +#define GEN_test_FCSEL_D_D_D_NE \ + __attribute__((noinline)) static void test_FCSEL_D_D_D_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCSEL_D_D_D_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcsel d29, d11, d9, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCSEL_D_D_D_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCSEL_S_S_S_EQ ========// + +#define GEN_test_FCSEL_S_S_S_EQ \ + __attribute__((noinline)) static void test_FCSEL_S_S_S_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCSEL_S_S_S_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcsel s29, s11, s9, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCSEL_S_S_S_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCSEL_S_S_S_NE ========// + +#define GEN_test_FCSEL_S_S_S_NE \ + __attribute__((noinline)) static void 
test_FCSEL_S_S_S_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCSEL_S_S_S_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fcsel s29, s11, s9, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCSEL_S_S_S_NE after", &block[0], 4); \ + printf("\n"); \ + } + + +/* ---------------------------------------------------------------- */ +/* -- Tests, in the same order that they appear in main() -- */ +/* ---------------------------------------------------------------- */ + +// ======================== FP ======================== + +GEN_TWOVEC_TEST(faddp_h_2h_00_00, "faddp h0, v0.2h", 0, 0) +GEN_TWOVEC_TEST(faddp_h_2h_01_01, "faddp h1, v1.2h", 1, 1) +GEN_TWOVEC_TEST(faddp_h_2h_02_02, "faddp h2, v2.2h", 2, 2) +GEN_TWOVEC_TEST(faddp_h_2h_03_03, "faddp h3, v3.2h", 3, 3) +GEN_TWOVEC_TEST(faddp_h_2h_04_04, "faddp h4, v4.2h", 4, 4) +GEN_TWOVEC_TEST(faddp_h_2h_05_05, "faddp h5, v5.2h", 5, 5) +GEN_TWOVEC_TEST(faddp_h_2h_06_06, "faddp h6, v6.2h", 6, 6) +GEN_TWOVEC_TEST(faddp_h_2h_07_07, "faddp h7, v7.2h", 7, 7) +GEN_TWOVEC_TEST(faddp_h_2h_08_08, "faddp h8, v8.2h", 8, 8) +GEN_TWOVEC_TEST(faddp_h_2h_09_09, "faddp h9, v9.2h", 9, 9) +GEN_TWOVEC_TEST(faddp_h_2h_10_10, "faddp h10, v10.2h", 10, 10) +GEN_TWOVEC_TEST(faddp_h_2h_11_11, "faddp h11, v11.2h", 11, 11) +GEN_TWOVEC_TEST(faddp_h_2h_12_12, "faddp h12, v12.2h", 12, 12) +GEN_TWOVEC_TEST(faddp_h_2h_13_13, "faddp h13, v13.2h", 13, 13) +GEN_TWOVEC_TEST(faddp_h_2h_14_14, "faddp h14, v14.2h", 14, 14) +GEN_TWOVEC_TEST(faddp_h_2h_15_15, "faddp h15, v15.2h", 15, 15) +GEN_TWOVEC_TEST(faddp_h_2h_16_16, "faddp h16, v16.2h", 16, 16) +GEN_TWOVEC_TEST(faddp_h_2h_17_17, "faddp h17, v17.2h", 
17, 17) +GEN_TWOVEC_TEST(faddp_h_2h_18_18, "faddp h18, v18.2h", 18, 18) +GEN_TWOVEC_TEST(faddp_h_2h_19_19, "faddp h19, v19.2h", 19, 19) +GEN_TWOVEC_TEST(faddp_h_2h_20_20, "faddp h20, v20.2h", 20, 20) +GEN_TWOVEC_TEST(faddp_h_2h_21_21, "faddp h21, v21.2h", 21, 21) +GEN_TWOVEC_TEST(faddp_h_2h_22_22, "faddp h22, v22.2h", 22, 22) +GEN_TWOVEC_TEST(faddp_h_2h_23_23, "faddp h23, v23.2h", 23, 23) +GEN_TWOVEC_TEST(faddp_h_2h_24_24, "faddp h24, v24.2h", 24, 24) +GEN_TWOVEC_TEST(faddp_h_2h_25_25, "faddp h25, v25.2h", 25, 25) +GEN_TWOVEC_TEST(faddp_h_2h_26_26, "faddp h26, v26.2h", 26, 26) +GEN_TWOVEC_TEST(faddp_h_2h_27_27, "faddp h27, v27.2h", 27, 27) +GEN_TWOVEC_TEST(faddp_h_2h_28_28, "faddp h28, v28.2h", 28, 28) +GEN_TWOVEC_TEST(faddp_h_2h_29_29, "faddp h29, v29.2h", 29, 29) +GEN_TWOVEC_TEST(faddp_h_2h_30_30, "faddp h30, v30.2h", 30, 30) +GEN_TWOVEC_TEST(faddp_h_2h_31_31, "faddp h31, v31.2h", 31, 31) + +GEN_TWOVEC_TEST(faddp_h_2h_00_01, "faddp h0, v1.2h", 0, 1) +GEN_TWOVEC_TEST(faddp_h_2h_01_02, "faddp h1, v2.2h", 1, 2) +GEN_TWOVEC_TEST(faddp_h_2h_02_03, "faddp h2, v3.2h", 2, 3) +GEN_TWOVEC_TEST(faddp_h_2h_03_04, "faddp h3, v4.2h", 3, 4) +GEN_TWOVEC_TEST(faddp_h_2h_04_05, "faddp h4, v5.2h", 4, 5) +GEN_TWOVEC_TEST(faddp_h_2h_05_06, "faddp h5, v6.2h", 5, 6) +GEN_TWOVEC_TEST(faddp_h_2h_06_07, "faddp h6, v7.2h", 6, 7) +GEN_TWOVEC_TEST(faddp_h_2h_07_08, "faddp h7, v8.2h", 7, 8) +GEN_TWOVEC_TEST(faddp_h_2h_08_09, "faddp h8, v9.2h", 8, 9) +GEN_TWOVEC_TEST(faddp_h_2h_09_10, "faddp h9, v10.2h", 9, 10) +GEN_TWOVEC_TEST(faddp_h_2h_10_11, "faddp h10, v11.2h", 10, 11) +GEN_TWOVEC_TEST(faddp_h_2h_11_12, "faddp h11, v12.2h", 11, 12) +GEN_TWOVEC_TEST(faddp_h_2h_12_13, "faddp h12, v13.2h", 12, 13) +GEN_TWOVEC_TEST(faddp_h_2h_13_14, "faddp h13, v14.2h", 13, 14) +GEN_TWOVEC_TEST(faddp_h_2h_14_15, "faddp h14, v15.2h", 14, 15) +GEN_TWOVEC_TEST(faddp_h_2h_15_16, "faddp h15, v16.2h", 15, 16) +GEN_TWOVEC_TEST(faddp_h_2h_16_17, "faddp h16, v17.2h", 16, 17) +GEN_TWOVEC_TEST(faddp_h_2h_17_18, "faddp 
h17, v18.2h", 17, 18) +GEN_TWOVEC_TEST(faddp_h_2h_18_19, "faddp h18, v19.2h", 18, 19) +GEN_TWOVEC_TEST(faddp_h_2h_19_20, "faddp h19, v20.2h", 19, 20) +GEN_TWOVEC_TEST(faddp_h_2h_20_21, "faddp h20, v21.2h", 20, 21) +GEN_TWOVEC_TEST(faddp_h_2h_21_22, "faddp h21, v22.2h", 21, 22) +GEN_TWOVEC_TEST(faddp_h_2h_22_23, "faddp h22, v23.2h", 22, 23) +GEN_TWOVEC_TEST(faddp_h_2h_23_24, "faddp h23, v24.2h", 23, 24) +GEN_TWOVEC_TEST(faddp_h_2h_24_25, "faddp h24, v25.2h", 24, 25) +GEN_TWOVEC_TEST(faddp_h_2h_25_26, "faddp h25, v26.2h", 25, 26) +GEN_TWOVEC_TEST(faddp_h_2h_26_27, "faddp h26, v27.2h", 26, 27) +GEN_TWOVEC_TEST(faddp_h_2h_27_28, "faddp h27, v28.2h", 27, 28) +GEN_TWOVEC_TEST(faddp_h_2h_28_29, "faddp h28, v29.2h", 28, 29) +GEN_TWOVEC_TEST(faddp_h_2h_29_30, "faddp h29, v30.2h", 29, 30) +GEN_TWOVEC_TEST(faddp_h_2h_30_31, "faddp h30, v31.2h", 30, 31) + + +/* ---------------------------------------------------------------- */ +/* -- main() -- */ +/* ---------------------------------------------------------------- */ + +int main ( void ) +{ + assert(sizeof(V128) == 16); + + // ======================== FP ======================== + + // faddp h (half-precision floating add pair) + // faddp 2h + if (1) test_faddp_h_2h_00_00(TyH); + if (1) test_faddp_h_2h_01_01(TyH); + if (1) test_faddp_h_2h_02_02(TyH); + if (1) test_faddp_h_2h_03_03(TyH); + if (1) test_faddp_h_2h_04_04(TyH); + if (1) test_faddp_h_2h_05_05(TyH); + if (1) test_faddp_h_2h_06_06(TyH); + if (1) test_faddp_h_2h_07_07(TyH); + if (1) test_faddp_h_2h_08_08(TyH); + if (1) test_faddp_h_2h_09_09(TyH); + if (1) test_faddp_h_2h_10_10(TyH); + if (1) test_faddp_h_2h_11_11(TyH); + if (1) test_faddp_h_2h_12_12(TyH); + if (1) test_faddp_h_2h_13_13(TyH); + if (1) test_faddp_h_2h_14_14(TyH); + if (1) test_faddp_h_2h_15_15(TyH); + if (1) test_faddp_h_2h_16_16(TyH); + if (1) test_faddp_h_2h_17_17(TyH); + if (1) test_faddp_h_2h_18_18(TyH); + if (1) test_faddp_h_2h_19_19(TyH); + if (1) test_faddp_h_2h_20_20(TyH); + if (1) 
test_faddp_h_2h_21_21(TyH); + if (1) test_faddp_h_2h_22_22(TyH); + if (1) test_faddp_h_2h_23_23(TyH); + if (1) test_faddp_h_2h_24_24(TyH); + if (1) test_faddp_h_2h_25_25(TyH); + if (1) test_faddp_h_2h_26_26(TyH); + if (1) test_faddp_h_2h_27_27(TyH); + if (1) test_faddp_h_2h_28_28(TyH); + if (1) test_faddp_h_2h_29_29(TyH); + if (1) test_faddp_h_2h_30_30(TyH); + if (1) test_faddp_h_2h_31_31(TyH); + + if (1) test_faddp_h_2h_00_01(TyH); + if (1) test_faddp_h_2h_01_02(TyH); + if (1) test_faddp_h_2h_02_03(TyH); + if (1) test_faddp_h_2h_03_04(TyH); + if (1) test_faddp_h_2h_04_05(TyH); + if (1) test_faddp_h_2h_05_06(TyH); + if (1) test_faddp_h_2h_06_07(TyH); + if (1) test_faddp_h_2h_07_08(TyH); + if (1) test_faddp_h_2h_08_09(TyH); + if (1) test_faddp_h_2h_09_10(TyH); + if (1) test_faddp_h_2h_10_11(TyH); + if (1) test_faddp_h_2h_11_12(TyH); + if (1) test_faddp_h_2h_12_13(TyH); + if (1) test_faddp_h_2h_13_14(TyH); + if (1) test_faddp_h_2h_14_15(TyH); + if (1) test_faddp_h_2h_15_16(TyH); + if (1) test_faddp_h_2h_16_17(TyH); + if (1) test_faddp_h_2h_17_18(TyH); + if (1) test_faddp_h_2h_18_19(TyH); + if (1) test_faddp_h_2h_19_20(TyH); + if (1) test_faddp_h_2h_20_21(TyH); + if (1) test_faddp_h_2h_21_22(TyH); + if (1) test_faddp_h_2h_22_23(TyH); + if (1) test_faddp_h_2h_23_24(TyH); + if (1) test_faddp_h_2h_24_25(TyH); + if (1) test_faddp_h_2h_25_26(TyH); + if (1) test_faddp_h_2h_26_27(TyH); + if (1) test_faddp_h_2h_27_28(TyH); + if (1) test_faddp_h_2h_28_29(TyH); + if (1) test_faddp_h_2h_29_30(TyH); + if (1) test_faddp_h_2h_30_31(TyH); + + return 0; +} + +/* ---------------------------------------------------------------- */ +/* -- Alphabetical list of insns -- */ +/* ---------------------------------------------------------------- */ +/* + faddp h (half-precision floating add pair) + faddp 2h +*/ diff --git a/none/tests/arm64/fp_and_simd_v82.stderr.exp b/none/tests/arm64/fp_and_simd_v82.stderr.exp new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/none/tests/arm64/fp_and_simd_v82.stdout.exp b/none/tests/arm64/fp_and_simd_v82.stdout.exp new file mode 100644 index 0000000000..8ef1dee2f1 --- /dev/null +++ b/none/tests/arm64/fp_and_simd_v82.stdout.exp @@ -0,0 +1,63 @@ +faddp h0, v0.2h 5175e39d19c9ca1e98f24a4984175700 19a348215c3a67fd399182c2dbcc2d38 0000000000000000000000000000dbcb 0000000000000000000000000000dbcb fpsr=00000000 +faddp h1, v1.2h cb509970b8136c85d740b80eb7839b97 f9dd4a29f8c093db56b01a12b0ca1583 0000000000000000000000000000b0bf 0000000000000000000000000000b0bf fpsr=00000000 +faddp h2, v2.2h d182c916cebc2e17cfaff39be272ef40 6897b536bbe4da8a369dab4f9465b86e 0000000000000000000000000000b870 0000000000000000000000000000b870 fpsr=00000000 +faddp h3, v3.2h 81f2a547be8d181184ededbc53239dcf 019963bf7459630b8d69483df7e8c6a9 0000000000000000000000000000f7e8 0000000000000000000000000000f7e8 fpsr=00000000 +faddp h4, v4.2h 5f490104ced83ff86262dd37727c80f3 e9b5f3f66b2e58c121a6c3476d21f1e5 0000000000000000000000000000eea9 0000000000000000000000000000eea9 fpsr=00000000 +faddp h5, v5.2h 61c82534e9bf6f37c9e25f72d82e582b ecb42ac54b0966d4089b756aa3f77018 00000000000000000000000000007018 00000000000000000000000000007018 fpsr=00000000 +faddp h6, v6.2h 36b2a38dcef18acf0e0f01a829ba3c66 65ce6d498492e7e796df010bf4b23b84 0000000000000000000000000000f4b2 0000000000000000000000000000f4b2 fpsr=00000000 +faddp h7, v7.2h 6d08ed19fa045f841810cd8c109ed568 1c4a678450562685769ab818a5b7985e 0000000000000000000000000000a643 0000000000000000000000000000a643 fpsr=00000000 +faddp h8, v8.2h 048612e51a468e36c51cdd8f87e12ab4 0c05cb6ebd128663d7568e3e8a3ac80e 0000000000000000000000000000c80e 0000000000000000000000000000c80e fpsr=00000000 +faddp h9, v9.2h 6489eab2c96df363d52c4330a7aae391 14575775bc3a12029d8e66ea90352a18 00000000000000000000000000002a07 00000000000000000000000000002a07 fpsr=00000000 +faddp h10, v10.2h 4784d95987cd4ed80c3ca578a32bd88e 08aebee85fda964fbba02737f3c98220 0000000000000000000000000000f3c9 
0000000000000000000000000000f3c9 fpsr=00000000 +faddp h11, v11.2h 4e92f1b240a122141a366d352714867e 0aaa836b194e242cc5fc3ae904033357 00000000000000000000000000003358 00000000000000000000000000003358 fpsr=00000000 +faddp h12, v12.2h ac82c1007a7d3cd8f54b130cdaa89cef 627bb6e12d1f6d4651ef145cb9b83843 0000000000000000000000000000b1d4 0000000000000000000000000000b1d4 fpsr=00000000 +faddp h13, v13.2h f0f1798fe3c1699cf02b3b25bca27a9c d4ba52a206ff21b170fbbab6a7f19faf 0000000000000000000000000000a8ee 0000000000000000000000000000a8ee fpsr=00000000 +faddp h14, v14.2h 16559ec50352a3d92d460a61a5dd0f6f 0784892e9360315bf0177599dbe14b46 0000000000000000000000000000db6d 0000000000000000000000000000db6d fpsr=00000000 +faddp h15, v15.2h 9432a2e46543b956b819f459105730e9 5da3cfd6aea6558e0c28728e28dc3c9c 00000000000000000000000000003cc3 00000000000000000000000000003cc3 fpsr=00000000 +faddp h16, v16.2h 89fba268812abdb21e4a9e0958fac555 e4450ababbfae0f9bc3127138b19183c 00000000000000000000000000001795 00000000000000000000000000001795 fpsr=00000000 +faddp h17, v17.2h 7acb193b9abab2f9e1917689e3f6bf86 2573776df1835e3ede9a220dce0e75e0 000000000000000000000000000075de 000000000000000000000000000075de fpsr=00000000 +faddp h18, v18.2h fa99500fef6024ba39dce32c239cf309 570037914d04ab3d05d75ec6f616ee9a 0000000000000000000000000000f7bc 0000000000000000000000000000f7bc fpsr=00000000 +faddp h19, v19.2h 8fcf04e5b2dca44fcf4c517ea3a413ff 5d700527e24d9241c57eb74d70183523 00000000000000000000000000007018 00000000000000000000000000007018 fpsr=00000000 +faddp h20, v20.2h f34428d9c8833f5b78fb29445f3bc8d7 da30ef8bc0b5573e34a901384a97a32f 00000000000000000000000000004a95 00000000000000000000000000004a95 fpsr=00000000 +faddp h21, v21.2h ac8dd5bbc503330eb9dd5dab8e212ab7 4e94ec120b386f523bfcd80321664d3e 00000000000000000000000000004d3f 00000000000000000000000000004d3f fpsr=00000000 +faddp h22, v22.2h 125934a781e479d33d431279cce48fce d4d14e592776b1ef0b40d58cb22d00b1 0000000000000000000000000000b22d 
0000000000000000000000000000b22d fpsr=00000000 +faddp h23, v23.2h 9e477892854b43e0beafe48541dc8da0 acb9433f079dacacabeb000208c90296 00000000000000000000000000000a14 00000000000000000000000000000a14 fpsr=00000000 +faddp h24, v24.2h ef56701db49bea4ce52e79ce9700a7f7 200d17261638b12a2a6a07863ec28077 00000000000000000000000000003ec2 00000000000000000000000000003ec2 fpsr=00000000 +faddp h25, v25.2h 445ef059e641a1ccb097e047aacc5b89 1c9c7740ef193457959960926235021b 00000000000000000000000000006235 00000000000000000000000000006235 fpsr=00000000 +faddp h26, v26.2h 80c745ef729f1792ccd7e987538166e1 f4ad41832c22ba116c949cea66e687ae 000000000000000000000000000066e6 000000000000000000000000000066e6 fpsr=00000000 +faddp h27, v27.2h e309aef8a605af130821eb96e737777e b5a9377eb31749ef710cf757885d2728 0000000000000000000000000000271f 0000000000000000000000000000271f fpsr=00000000 +faddp h28, v28.2h dbacfa35b7d2b75af8ad6b99bb3fa4c2 c673c91ec9aed3f8b9c3e32f2103009d 00000000000000000000000000002104 00000000000000000000000000002104 fpsr=00000000 +faddp h29, v29.2h 76f140aa4182b4e706a17746411ab40c 5e58aa8b4c88ae0d34fa174f9ce927c4 0000000000000000000000000000268a 0000000000000000000000000000268a fpsr=00000000 +faddp h30, v30.2h 61cd123e19cf1e2bb001f1161e946f5c d5f13a9ab645e140698bec649583f5aa 0000000000000000000000000000f5aa 0000000000000000000000000000f5aa fpsr=00000000 +faddp h31, v31.2h 2993e139f7d64ff4532f9ae1d7da8010 19714a711ce1284318b88425f2de758f 00000000000000000000000000007040 00000000000000000000000000007040 fpsr=00000000 +faddp h0, v1.2h 7af177f11da748fc8b9145fe16d0390f a77700084a491a0ef099b6dd61462ec3 00000000000000000000000000006146 a77700084a491a0ef099b6dd61462ec3 fpsr=00000000 +faddp h1, v2.2h c50f1401e45b82d3086a7a39a1e6217d b79cd058188318692112ca1cf9f1dd31 0000000000000000000000000000f9fb b79cd058188318692112ca1cf9f1dd31 fpsr=00000000 +faddp h2, v3.2h d4ec68f21f468712f7b8ab3708137382 0b9c016be95f18de62bba1a11cc04c89 00000000000000000000000000004c89 
0b9c016be95f18de62bba1a11cc04c89 fpsr=00000000 +faddp h3, v4.2h 30c9028972f8733d11f7fa4450de2529 a1cd852d9cd970502d146432e64644c9 0000000000000000000000000000e641 a1cd852d9cd970502d146432e64644c9 fpsr=00000000 +faddp h4, v5.2h 35e7926e777aa43f56470887bfdd3daf b2ed4ecc1e172df2d3a0a41fce854ae7 0000000000000000000000000000ca23 b2ed4ecc1e172df2d3a0a41fce854ae7 fpsr=00000000 +faddp h5, v6.2h 04b4378bce1492e08680a7399beeae16 09e14df041cdc14f0bf7ba2283e22a31 00000000000000000000000000002a2f 09e14df041cdc14f0bf7ba2283e22a31 fpsr=00000000 +faddp h6, v7.2h 9c86e5cb54c594021c25022200a7415e 1adad8978cbfb47829861f0d48dc87f5 000000000000000000000000000048dc 1adad8978cbfb47829861f0d48dc87f5 fpsr=00000000 +faddp h7, v8.2h b168a24af5479e7bc9f1d5f8e2de4bd3 894d9fe1f98d1aa0861ef69cf4e34e11 0000000000000000000000000000f4e1 894d9fe1f98d1aa0861ef69cf4e34e11 fpsr=00000000 +faddp h8, v9.2h 7b813bf15120fbc8683cbc58f8b23fca 74876ac63afb7562c67d2c86fa7c09a3 0000000000000000000000000000fa7c 74876ac63afb7562c67d2c86fa7c09a3 fpsr=00000000 +faddp h9, v10.2h c501b4c64209aa2e0719232dba0b82d5 077815d35567232e66c997070e860c39 00000000000000000000000000001160 077815d35567232e66c997070e860c39 fpsr=00000000 +faddp h10, v11.2h 462deabeada6093241150c7a1a4df892 89ad76dc21a1f8f15acd7ad9f991bada 0000000000000000000000000000f991 89ad76dc21a1f8f15acd7ad9f991bada fpsr=00000000 +faddp h11, v12.2h f82db3448c8c9a654f1c8c8db3b639e1 de62d56351fe96dabe7a2cefcf2b96bb 0000000000000000000000000000cf2b de62d56351fe96dabe7a2cefcf2b96bb fpsr=00000000 +faddp h12, v13.2h 8514e93e478d067a5a4ac156a6cb98bf d4442998096825896787a06c436d8e39 0000000000000000000000000000436d d4442998096825896787a06c436d8e39 fpsr=00000000 +faddp h13, v14.2h a20cab554a62dd2468a718ec4422710c b330aadc8a7cbfaf26fbc229d962e2d7 0000000000000000000000000000e418 b330aadc8a7cbfaf26fbc229d962e2d7 fpsr=00000000 +faddp h14, v15.2h 3028339e0d3a0c468e8f584ceae94e7a e33fad8f313a964967940f284cfce9a3 0000000000000000000000000000e999 e33fad8f313a964967940f284cfce9a3 
fpsr=00000000 +faddp h15, v16.2h 5df79fd3324f914fb79f41ec172107e2 d6006035af2e8bb7b3736be34585abe2 00000000000000000000000000004575 d6006035af2e8bb7b3736be34585abe2 fpsr=00000000 +faddp h16, v17.2h 57d0e8a18b5417adc6b295b85f1c3056 e70216ec5cbcf49e8a09cb539549408a 00000000000000000000000000004089 e70216ec5cbcf49e8a09cb539549408a fpsr=00000000 +faddp h17, v18.2h a9430469f9a6aaf90d07193d2e134034 e0fd1393714954977124406c74e81e7a 000000000000000000000000000074e8 e0fd1393714954977124406c74e81e7a fpsr=00000000 +faddp h18, v19.2h 3b947b8f0a536415b779aada6ea680b0 3fa5c4d84771e518605a54f56dfe15b7 00000000000000000000000000006dfe 3fa5c4d84771e518605a54f56dfe15b7 fpsr=00000000 +faddp h19, v20.2h b71315802c502c586d5043a8665c8797 d4eaedef93c21b55bdb0c6ce36392d36 00000000000000000000000000003786 d4eaedef93c21b55bdb0c6ce36392d36 fpsr=00000000 +faddp h20, v21.2h 98bf1ba36919393bc4d999db7390839e 44d5584589abea635dc49b10189f4c14 00000000000000000000000000004c14 44d5584589abea635dc49b10189f4c14 fpsr=00000000 +faddp h21, v22.2h 0b0b9f6018e987aeba97106bb88dbd45 9d5fe4af824eabd8f8f577d6f4dd0223 0000000000000000000000000000f4dd 9d5fe4af824eabd8f8f577d6f4dd0223 fpsr=00000000 +faddp h22, v23.2h 1eca927d6d5eee012a6fe8ae3cfe5e6a 22d9446284e6ae8126fc5ee9b286181e 0000000000000000000000000000b276 22d9446284e6ae8126fc5ee9b286181e fpsr=00000000 +faddp h23, v24.2h 3131620a2265f8c8f64df6cdcb51c286 6eeb8d90d86668b60a08b6d0cfc59797 0000000000000000000000000000cfc5 6eeb8d90d86668b60a08b6d0cfc59797 fpsr=00000000 +faddp h24, v25.2h 1854ddf6d8b991ce01deaf4923243fc0 4210b3d32431d146a45cad2eccb0e21a 0000000000000000000000000000e240 4210b3d32431d146a45cad2eccb0e21a fpsr=00000000 +faddp h25, v26.2h ee7d691b146130944d3d038a0b69312c 4df433720fd7245dafacd5bdced9cd88 0000000000000000000000000000d230 4df433720fd7245dafacd5bdced9cd88 fpsr=00000000 +faddp h26, v27.2h 9c423a145875f5144ccc5e105c99661d a353e8d137de89d3071b5bad6b52ee61 0000000000000000000000000000e970 a353e8d137de89d3071b5bad6b52ee61 fpsr=00000000 
+faddp h27, v28.2h d04b750405c33deba68d8a6feefdf8d2 e11053b38ffdcd305e88d8c318f5aa57 0000000000000000000000000000aa08 e11053b38ffdcd305e88d8c318f5aa57 fpsr=00000000 +faddp h28, v29.2h 2af3bd4b509e6608a513cfe482162be8 6f8ae74d5f7960b4a01933ef595f6af1 00000000000000000000000000006b47 6f8ae74d5f7960b4a01933ef595f6af1 fpsr=00000000 +faddp h29, v30.2h 95d26cc246074b10bda9f7bf92a71bac 8932e026330d2e5552f8564f761e13a8 0000000000000000000000000000761e 8932e026330d2e5552f8564f761e13a8 fpsr=00000000 +faddp h30, v31.2h 470818041ac5e9b218db305838ff3248 06ced856b4d04648a668c3da0fcbe652 0000000000000000000000000000e652 06ced856b4d04648a668c3da0fcbe652 fpsr=00000000 diff --git a/none/tests/arm64/fp_and_simd_v82.vgtest b/none/tests/arm64/fp_and_simd_v82.vgtest new file mode 100644 index 0000000000..992c87a51c --- /dev/null +++ b/none/tests/arm64/fp_and_simd_v82.vgtest @@ -0,0 +1,3 @@ +prog: fp_and_simd_v82 +prereq: test -x fp_and_simd_v82 && ../../../tests/arm64_features fphp +vgopts: -q diff --git a/tests/Makefile.am b/tests/Makefile.am index 7233626647..916e5085d0 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -51,7 +51,9 @@ check_PROGRAMS = \ mips_features \ power_insn_available \ is_ppc64_BE \ - min_power_isa + min_power_isa \ + arm64_features + AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) diff --git a/tests/arm64_features.c b/tests/arm64_features.c new file mode 100644 index 0000000000..01b12f784a --- /dev/null +++ b/tests/arm64_features.c @@ -0,0 +1,178 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/auxv.h> + +// This file determines arm64 features a processor supports. +// Arm processors do not have a x86-like cpuinfo instruction. Instead the +// getauxval() syscall is used with capabilities parameters: getauxval(AT_HWCAP) +// and getauxval(AT_HWCAP2). +// +// We return: +// - 0 if the machine has the asked-for feature. +// - 1 if the machine doesn't have the asked-for feature. 
+// - 2 if the asked-for feature isn't recognised (this will always be the case +// for any feature if run on a non-arm64 machine). +// - 3 if there was a usage error (it also prints an error message). +#define FEATURE_PRESENT 0 +#define FEATURE_NOT_PRESENT 1 +#define UNRECOGNISED_FEATURE 2 +#define USAGE_ERROR 3 + +#define False 0 +#define True 1 +typedef int Bool; + +#if defined(VGA_arm64) + +// The processor's capabilities/features are returned by getauxval() as an +// unsigned long with each bit representing a capability/feature. +#define HWCAP_FP (1 << 0) +#define HWCAP_ASIMD (1 << 1) +#define HWCAP_EVTSTRM (1 << 2) +#define HWCAP_AES (1 << 3) +#define HWCAP_PMULL (1 << 4) +#define HWCAP_SHA1 (1 << 5) +#define HWCAP_SHA2 (1 << 6) +#define HWCAP_CRC32 (1 << 7) +#define HWCAP_ATOMICS (1 << 8) +#define HWCAP_FPHP (1 << 9) +#define HWCAP_ASIMDHP (1 << 10) +#define HWCAP_CPUID (1 << 11) +#define HWCAP_ASIMDRDM (1 << 12) +#define HWCAP_JSCVT (1 << 13) +#define HWCAP_FCMA (1 << 14) +#define HWCAP_LRCPC (1 << 15) +#define HWCAP_DCPOP (1 << 16) +#define HWCAP_SHA3 (1 << 17) +#define HWCAP_SM3 (1 << 18) +#define HWCAP_SM4 (1 << 19) +#define HWCAP_ASIMDDP (1 << 20) +#define HWCAP_SHA512 (1 << 21) +#define HWCAP_SVE (1 << 22) +#define HWCAP_ASIMDFHM (1 << 23) +#define HWCAP_DIT (1 << 24) +#define HWCAP_USCAT (1 << 25) +#define HWCAP_ILRCPC (1 << 26) +#define HWCAP_FLAGM (1 << 27) +#define HWCAP_SSBS (1 << 28) +#define HWCAP_SB (1 << 29) +#define HWCAP_PACA (1 << 30) +#define HWCAP_PACG (1UL << 31) + +#define HWCAP2_DCPODP (1 << 0) +#define HWCAP2_SVE2 (1 << 1) +#define HWCAP2_SVEAES (1 << 2) +#define HWCAP2_SVEPMULL (1 << 3) +#define HWCAP2_SVEBITPERM (1 << 4) +#define HWCAP2_SVESHA3 (1 << 5) +#define HWCAP2_SVESM4 (1 << 6) +#define HWCAP2_FLAGM2 (1 << 7) +#define HWCAP2_FRINT (1 << 8) + +unsigned long hwcaps[] = { + HWCAP_FP, HWCAP_ASIMD, HWCAP_EVTSTRM, HWCAP_AES, HWCAP_PMULL, + HWCAP_SHA1, HWCAP_SHA2, HWCAP_CRC32, HWCAP_ATOMICS, HWCAP_FPHP, + HWCAP_ASIMDHP,HWCAP_CPUID, 
HWCAP_ASIMDRDM,HWCAP_JSCVT, HWCAP_FCMA, + HWCAP_LRCPC, HWCAP_DCPOP, HWCAP_SHA3, HWCAP_SM3, HWCAP_SM4, + HWCAP_ASIMDDP,HWCAP_SHA512, HWCAP_SVE, HWCAP_ASIMDFHM,HWCAP_DIT, + HWCAP_USCAT, HWCAP_ILRCPC, HWCAP_FLAGM, HWCAP_SSBS, HWCAP_SB, + HWCAP_PACA, HWCAP_PACG, 0ul}; + +unsigned long hwcaps2[] = { + HWCAP2_DCPODP, HWCAP2_SVE2, HWCAP2_SVEAES, HWCAP2_SVEPMULL, + HWCAP2_SVEBITPERM, HWCAP2_SVESHA3, HWCAP2_SVESM4, HWCAP2_FLAGM2, + HWCAP2_FRINT, 0ul}; + +typedef struct +{ + char name[16]; + unsigned long cap_bit; +} capability; + +capability capabilities[] = { + {"fp", HWCAP_FP}, {"asimd", HWCAP_ASIMD}, + {"evtstrm", HWCAP_EVTSTRM}, {"aes", HWCAP_AES}, + {"pmull", HWCAP_PMULL}, {"sha1", HWCAP_SHA1}, + {"sha2", HWCAP_SHA2}, {"crc32", HWCAP_CRC32}, + {"atomics", HWCAP_ATOMICS}, {"fphp", HWCAP_FPHP}, + {"asimdhp", HWCAP_ASIMDHP}, {"cpuid", HWCAP_CPUID}, + {"asimdrdm", HWCAP_ASIMDRDM}, {"jscvt", HWCAP_JSCVT}, + {"fcma", HWCAP_FCMA}, {"lrcpc", HWCAP_LRCPC}, + {"dcpop", HWCAP_DCPOP}, {"sha3", HWCAP_SHA3}, + {"sm3", HWCAP_SM3}, {"sm4", HWCAP_SM4}, + {"asimddp", HWCAP_ASIMDDP}, {"sha512", HWCAP_SHA512}, + {"sve", HWCAP_SVE}, {"asimdfhm", HWCAP_ASIMDFHM}, + {"dit", HWCAP_DIT}, {"uscat", HWCAP_USCAT}, + {"ilrcpc", HWCAP_ILRCPC}, {"flagm", HWCAP_FLAGM}, + {"ssbs", HWCAP_SSBS}, {"sb", HWCAP_SB}, + {"paca", HWCAP_PACA}, {"pacg", HWCAP_PACG}, + {"", 0ul} +}; + +capability capabilities2[] = { + {"dcpodp", HWCAP2_DCPODP}, {"sve2", HWCAP2_SVE2}, + {"sveaes", HWCAP2_SVEAES}, {"svepmull", HWCAP2_SVEPMULL}, + {"svebitperm", HWCAP2_SVEBITPERM}, {"svesha3", HWCAP2_SVESHA3}, + {"svesm4", HWCAP2_SVESM4}, {"flagm2", HWCAP2_FLAGM2}, + {"frint", HWCAP2_FRINT}, {"", 0ul} +}; + +typedef struct +{ + unsigned long hwcap; + unsigned long hwcap2; +} hwc; + +#define CAPABILITIES_SEARCH_LOOP(hwcversion) \ + for (int i = 0; capabilities ## hwcversion[i].cap_bit; ++i) \ + if (strcmp(name, capabilities ## hwcversion[i].name) == 0) { \ + caps->hwcap ## hwcversion = capabilities ## hwcversion[i].cap_bit; \ + 
return True; \ + } \ + +static Bool get_feature_from_string(const char *name, hwc *caps) +{ + caps->hwcap = caps->hwcap2 = 0; + CAPABILITIES_SEARCH_LOOP() + CAPABILITIES_SEARCH_LOOP(2) + return False; +} + +static int go(const char* feature_name) +{ + hwc hw; + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + if (!get_feature_from_string(feature_name, &hw)) + return UNRECOGNISED_FEATURE; + + if ((hw.hwcap & hwcap) || (hw.hwcap2 & hwcap2)) + return FEATURE_PRESENT; + + return FEATURE_NOT_PRESENT; +} + +#else + +static Bool go(const char* feature_name) +{ + // Feature not recognised (non-arm64 machine!) + return UNRECOGNISED_FEATURE; +} + +#endif // defined(VGA_arm64) + + +//--------------------------------------------------------------------------- +// main +//--------------------------------------------------------------------------- +int main(int argc, char **argv) +{ + if (argc != 2) { + fprintf(stderr, "usage: arm64_features <feature>\n"); + exit(USAGE_ERROR); + } + return go(argv[1]);