From 150edab1ac5a07927065412811f4859b2f881ac6 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Wed, 22 Oct 2014 13:52:51 +0000 Subject: [PATCH] Implement SIMD (de)interleaving loads/stores: LD1/ST1 (multiple 1-elem structs to/from 1 reg LD2/ST2 (multiple 2-elem structs to/from 2 regs LD3/ST3 (multiple 3-elem structs to/from 3 regs LD4/ST4 (multiple 4-elem structs to/from 4 regs Also: LDNP, STNP (load/store vector pair, non-temporal) git-svn-id: svn://svn.valgrind.org/vex/trunk@2976 --- VEX/priv/guest_arm64_toIR.c | 1471 ++++++++++++++++++++++++++++------- VEX/priv/host_arm64_defs.c | 75 +- VEX/priv/host_arm64_isel.c | 133 ++-- 3 files changed, 1313 insertions(+), 366 deletions(-) diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 5ebe0a6079..8021a4818a 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -404,18 +404,18 @@ void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3) *t3 = newTempV128(); } -//static -//void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4) -//{ -// vassert(t1 && *t1 == IRTemp_INVALID); -// vassert(t2 && *t2 == IRTemp_INVALID); -// vassert(t3 && *t3 == IRTemp_INVALID); -// vassert(t4 && *t4 == IRTemp_INVALID); -// *t1 = newTempV128(); -// *t2 = newTempV128(); -// *t3 = newTempV128(); -// *t4 = newTempV128(); -//} +static +void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4) +{ + vassert(t1 && *t1 == IRTemp_INVALID); + vassert(t2 && *t2 == IRTemp_INVALID); + vassert(t3 && *t3 == IRTemp_INVALID); + vassert(t4 && *t4 == IRTemp_INVALID); + *t1 = newTempV128(); + *t2 = newTempV128(); + *t3 = newTempV128(); + *t4 = newTempV128(); +} static void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3, @@ -3466,6 +3466,950 @@ Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres, } +/*------------------------------------------------------------*/ +/*--- Math helpers for vector interleave/deinterleave ---*/ +/*------------------------------------------------------------*/ + +#define EX(_tmp) \ + mkexpr(_tmp) +#define SL(_hi128,_lo128,_nbytes) \ + ( (_nbytes) == 0 \ + ? 
(_lo128) \ + : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) ) +#define ROR(_v128,_nbytes) \ + SL((_v128),(_v128),(_nbytes)) +#define ROL(_v128,_nbytes) \ + SL((_v128),(_v128),16-(_nbytes)) +#define SHR(_v128,_nbytes) \ + binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes))) +#define SHL(_v128,_nbytes) \ + binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes))) +#define ILO64x2(_argL,_argR) \ + binop(Iop_InterleaveLO64x2,(_argL),(_argR)) +#define IHI64x2(_argL,_argR) \ + binop(Iop_InterleaveHI64x2,(_argL),(_argR)) +#define ILO32x4(_argL,_argR) \ + binop(Iop_InterleaveLO32x4,(_argL),(_argR)) +#define IHI32x4(_argL,_argR) \ + binop(Iop_InterleaveHI32x4,(_argL),(_argR)) +#define ILO16x8(_argL,_argR) \ + binop(Iop_InterleaveLO16x8,(_argL),(_argR)) +#define IHI16x8(_argL,_argR) \ + binop(Iop_InterleaveHI16x8,(_argL),(_argR)) +#define ILO8x16(_argL,_argR) \ + binop(Iop_InterleaveLO8x16,(_argL),(_argR)) +#define IHI8x16(_argL,_argR) \ + binop(Iop_InterleaveHI8x16,(_argL),(_argR)) +#define CEV32x4(_argL,_argR) \ + binop(Iop_CatEvenLanes32x4,(_argL),(_argR)) +#define COD32x4(_argL,_argR) \ + binop(Iop_CatOddLanes32x4,(_argL),(_argR)) +#define COD16x8(_argL,_argR) \ + binop(Iop_CatOddLanes16x8,(_argL),(_argR)) +#define COD8x16(_argL,_argR) \ + binop(Iop_CatOddLanes8x16,(_argL),(_argR)) +#define CEV8x16(_argL,_argR) \ + binop(Iop_CatEvenLanes8x16,(_argL),(_argR)) +#define AND(_arg1,_arg2) \ + binop(Iop_AndV128,(_arg1),(_arg2)) +#define OR2(_arg1,_arg2) \ + binop(Iop_OrV128,(_arg1),(_arg2)) +#define OR3(_arg1,_arg2,_arg3) \ + binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3))) +#define OR4(_arg1,_arg2,_arg3,_arg4) \ + binop(Iop_OrV128, \ + binop(Iop_OrV128,(_arg1),(_arg2)), \ + binop(Iop_OrV128,(_arg3),(_arg4))) + + +/* Do interleaving for 1 128 bit vector, for ST1 insns. */ +static +void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0, + UInt laneSzBlg2, IRTemp u0 ) +{ + assign(*i0, mkexpr(u0)); +} + + +/* Do interleaving for 2 128 bit vectors, for ST2 insns. */ +static +void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1, + UInt laneSzBlg2, IRTemp u0, IRTemp u1 ) +{ + /* This is pretty easy, since we have primitives directly to + hand. */ + if (laneSzBlg2 == 3) { + // 64x2 + // u1 == B1 B0, u0 == A1 A0 + // i1 == B1 A1, i0 == B0 A0 + assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0))); + assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0))); + return; + } + if (laneSzBlg2 == 2) { + // 32x4 + // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0, + // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0 + assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0))); + assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0))); + return; + } + if (laneSzBlg2 == 1) { + // 16x8 + // u1 == B{7..0}, u0 == A{7..0} + // i0 == B3 A3 B2 A2 B1 A1 B0 A0 + // i1 == B7 A7 B6 A6 B5 A5 B4 A4 + assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0))); + assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0))); + return; + } + if (laneSzBlg2 == 0) { + // 8x16 + // u1 == B{f..0}, u0 == A{f..0} + // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0 + // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8 + assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0))); + assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0))); + return; + } + /*NOTREACHED*/ + vassert(0); +} + + +/* Do interleaving for 3 128 bit vectors, for ST3 insns. 
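   For the 32x4 and 16x8 cases the inputs are first interleaved at the
   next coarser lane size, giving partial results which are then fixed up
   with rotates, shifts and masks; the 8x16 case is generated directly.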
*/ +static +void math_INTERLEAVE3_128( + /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, + UInt laneSzBlg2, + IRTemp u0, IRTemp u1, IRTemp u2 ) +{ + if (laneSzBlg2 == 3) { + // 64x2 + // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0 + // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0, + assign(*i2, IHI64x2( EX(u2), EX(u1) )); + assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) )); + assign(*i0, ILO64x2( EX(u1), EX(u0) )); + return; + } + + if (laneSzBlg2 == 2) { + // 32x4 + // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0 + // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0 + // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0 + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp c1100 = newTempV128(); + IRTemp c0011 = newTempV128(); + IRTemp c0110 = newTempV128(); + assign(c1100, mkV128(0xFF00)); + assign(c0011, mkV128(0x00FF)); + assign(c0110, mkV128(0x0FF0)); + // First interleave them at 64x2 granularity, + // generating partial ("p") values. + math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2); + // And more shuffling around for the final answer + assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ), + AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) )); + assign(*i1, OR3( SHL(EX(p2),12), + AND(EX(p1),EX(c0110)), + SHR(EX(p0),12) )); + assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ), + AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) )); + return; + } + + if (laneSzBlg2 == 1) { + // 16x8 + // u2 == C7 C6 C5 C4 C3 C2 C1 C0 + // u1 == B7 B6 B5 B4 B3 B2 B1 B0 + // u0 == A7 A6 A5 A4 A3 A2 A1 A0 + // + // p2 == C7 C6 B7 B6 A7 A6 C5 C4 + // p1 == B5 B4 A5 A4 C3 C2 B3 B2 + // p0 == A3 A2 C1 C0 B1 B0 A1 A0 + // + // i2 == C7 B7 A7 C6 B6 A6 C5 B5 + // i1 == A5 C4 B4 A4 C4 B3 A3 C2 + // i0 == B2 A2 C1 B1 A1 C0 B0 A0 + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp c1000 = newTempV128(); + IRTemp c0100 = newTempV128(); + IRTemp c0010 = newTempV128(); + IRTemp c0001 = newTempV128(); + assign(c1000, mkV128(0xF000)); + assign(c0100, mkV128(0x0F00)); + assign(c0010, mkV128(0x00F0)); + assign(c0001, mkV128(0x000F)); + // First interleave them at 32x4 granularity, + // generating partial ("p") values. + math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2); + // And more shuffling around for the final answer + assign(*i2, + OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ), + AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ), + AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ), + AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) ) + )); + assign(*i1, + OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ), + AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ), + AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ), + AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) ) + )); + assign(*i0, + OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ), + AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ), + AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ), + AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) ) + )); + return; + } + + if (laneSzBlg2 == 0) { + // 8x16. It doesn't seem worth the hassle of first doing a + // 16x8 interleave, so just generate all 24 partial results + // directly :-( + // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0 + // i2 == Cf Bf Af Ce .. Bb Ab Ca + // i1 == Ba Aa C9 B9 .. A6 C5 B5 + // i0 == A5 C4 B4 A4 .. 
C0 B0 A0 + + IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128(); + IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128(); + IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128(); + IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128(); + IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128(); + IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128(); + IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128(); + IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128(); + IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128(); + + // eg XXXX(qqq, CC, 0xF, BB, 0xA)) sets qqq to be a vector + // of the form 14 bytes junk : CC[0xF] : BB[0xA] + // +# define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \ + IRTemp t_##_tempName = newTempV128(); \ + assign(t_##_tempName, \ + ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \ + ROR(EX(_srcVec2),(_srcShift2)) ) ) + + // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively + IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0; + + // The slicing and reassembly are done as interleavedly as possible, + // so as to minimise the demand for registers in the back end, which + // was observed to be a problem in testing. + + XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14] + XXXX(AfCe, AA, 0xf, CC, 0xe); + assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe))); + + XXXX(BeAe, BB, 0xe, AA, 0xe); + XXXX(CdBd, CC, 0xd, BB, 0xd); + assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd))); + assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98))); + + XXXX(AdCc, AA, 0xd, CC, 0xc); + XXXX(BcAc, BB, 0xc, AA, 0xc); + assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc))); + + XXXX(CbBb, CC, 0xb, BB, 0xb); + XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0] + assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa))); + assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210))); + assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64))); + + XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14] + XXXX(C9B9, CC, 0x9, BB, 0x9); + assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9))); + + XXXX(A9C8, AA, 0x9, CC, 0x8); + XXXX(B8A8, BB, 0x8, AA, 0x8); + assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8))); + assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98))); + + XXXX(C7B7, CC, 0x7, BB, 0x7); + XXXX(A7C6, AA, 0x7, CC, 0x6); + assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6))); + + XXXX(B6A6, BB, 0x6, AA, 0x6); + XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0] + assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5))); + assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210))); + assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64))); + + XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14] + XXXX(B4A4, BB, 0x4, AA, 0x4); + assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4))); + + XXXX(C3B3, CC, 0x3, BB, 0x3); + XXXX(A3C2, AA, 0x3, CC, 0x2); + assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2))); + assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98))); + + XXXX(B2A2, BB, 0x2, AA, 0x2); + XXXX(C1B1, CC, 0x1, BB, 0x1); + assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1))); + + XXXX(A1C0, AA, 0x1, CC, 0x0); + XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0] + assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0))); + assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210))); + assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64))); + +# undef XXXX + return; + } + + /*NOTREACHED*/ + vassert(0); +} + + +/* Do interleaving for 4 128 bit vectors, for ST4 insns. 
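   Apart from the 64x2 case, this interleaves the inputs at the next
   coarser lane size and then rearranges the lanes within each partial
   result.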
*/ +static +void math_INTERLEAVE4_128( + /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3, + UInt laneSzBlg2, + IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 ) +{ + if (laneSzBlg2 == 3) { + // 64x2 + assign(*i0, ILO64x2(EX(u1), EX(u0))); + assign(*i1, ILO64x2(EX(u3), EX(u2))); + assign(*i2, IHI64x2(EX(u1), EX(u0))); + assign(*i3, IHI64x2(EX(u3), EX(u2))); + return; + } + if (laneSzBlg2 == 2) { + // 32x4 + // First, interleave at the 64-bit lane size. + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp p3 = newTempV128(); + math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3); + // And interleave (cat) at the 32 bit size. + assign(*i0, CEV32x4(EX(p1), EX(p0))); + assign(*i1, COD32x4(EX(p1), EX(p0))); + assign(*i2, CEV32x4(EX(p3), EX(p2))); + assign(*i3, COD32x4(EX(p3), EX(p2))); + return; + } + if (laneSzBlg2 == 1) { + // 16x8 + // First, interleave at the 32-bit lane size. + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp p3 = newTempV128(); + math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3); + // And rearrange within each vector, to get the right 16 bit lanes. + assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2))); + assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2))); + assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2))); + assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2))); + return; + } + if (laneSzBlg2 == 0) { + // 8x16 + // First, interleave at the 16-bit lane size. + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp p3 = newTempV128(); + math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3); + // And rearrange within each vector, to get the right 8 bit lanes. + assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0)))); + assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1)))); + assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2)))); + assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3)))); + return; + } + /*NOTREACHED*/ + vassert(0); +} + + +/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */ +static +void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0, + UInt laneSzBlg2, IRTemp i0 ) +{ + assign(*u0, mkexpr(i0)); +} + + +/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */ +static +void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1, + UInt laneSzBlg2, IRTemp i0, IRTemp i1 ) +{ + /* This is pretty easy, since we have primitives directly to + hand. 
*/ + if (laneSzBlg2 == 3) { + // 64x2 + // i1 == B1 A1, i0 == B0 A0 + // u1 == B1 B0, u0 == A1 A0 + assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0))); + assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0))); + return; + } + if (laneSzBlg2 == 2) { + // 32x4 + // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0 + // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0, + assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0))); + assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0))); + return; + } + if (laneSzBlg2 == 1) { + // 16x8 + // i0 == B3 A3 B2 A2 B1 A1 B0 A0 + // i1 == B7 A7 B6 A6 B5 A5 B4 A4 + // u1 == B{7..0}, u0 == A{7..0} + assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0))); + assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0))); + return; + } + if (laneSzBlg2 == 0) { + // 8x16 + // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0 + // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8 + // u1 == B{f..0}, u0 == A{f..0} + assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0))); + assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0))); + return; + } + /*NOTREACHED*/ + vassert(0); +} + + +/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */ +static +void math_DEINTERLEAVE3_128( + /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, + UInt laneSzBlg2, + IRTemp i0, IRTemp i1, IRTemp i2 ) +{ + if (laneSzBlg2 == 3) { + // 64x2 + // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0, + // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0 + assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) )); + assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) )); + assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) )); + return; + } + + if (laneSzBlg2 == 2) { + // 32x4 + // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0 + // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0 + // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0 + IRTemp t_a1c0b0a0 = newTempV128(); + IRTemp t_a2c1b1a1 = newTempV128(); + IRTemp t_a3c2b2a2 = newTempV128(); + IRTemp t_a0c3b3a3 = newTempV128(); + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + // Compute some intermediate values. + assign(t_a1c0b0a0, EX(i0)); + assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4)); + assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4)); + assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4)); + // First deinterleave into lane-pairs + assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0))); + assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)), + IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0)))); + assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4))); + // Then deinterleave at 64x2 granularity. 
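+      // (From the layouts above, each 64-bit half of p0..p2 now holds an
+      //  adjacent pair of lanes taken from a single source register, so
+      //  the 64x2 step below finishes the deinterleave.)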
+ math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2); + return; + } + + if (laneSzBlg2 == 1) { + // 16x8 + // u2 == C7 C6 C5 C4 C3 C2 C1 C0 + // u1 == B7 B6 B5 B4 B3 B2 B1 B0 + // u0 == A7 A6 A5 A4 A3 A2 A1 A0 + // + // i2 == C7 B7 A7 C6 B6 A6 C5 B5 + // i1 == A5 C4 B4 A4 C4 B3 A3 C2 + // i0 == B2 A2 C1 B1 A1 C0 B0 A0 + // + // p2 == C7 C6 B7 B6 A7 A6 C5 C4 + // p1 == B5 B4 A5 A4 C3 C2 B3 B2 + // p0 == A3 A2 C1 C0 B1 B0 A1 A0 + + IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111; + s0 = s1 = s2 = s3 + = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID; + newTempsV128_4(&s0, &s1, &s2, &s3); + newTempsV128_4(&t0, &t1, &t2, &t3); + newTempsV128_4(&p0, &p1, &p2, &c00111111); + + // s0 == b2a2 c1b1a1 c0b0a0 + // s1 == b4a4 c3b3c3 c2b2a2 + // s2 == b6a6 c5b5a5 c4b4a4 + // s3 == b0a0 c7b7a7 c6b6a6 + assign(s0, EX(i0)); + assign(s1, SL(EX(i1),EX(i0),6*2)); + assign(s2, SL(EX(i2),EX(i1),4*2)); + assign(s3, SL(EX(i0),EX(i2),2*2)); + + // t0 == 0 0 c1c0 b1b0 a1a0 + // t1 == 0 0 c3c2 b3b2 a3a2 + // t2 == 0 0 c5c4 b5b4 a5a4 + // t3 == 0 0 c7c6 b7b6 a7a6 + assign(c00111111, mkV128(0x0FFF)); + assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111))); + assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111))); + assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111))); + assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111))); + + assign(p0, OR2(EX(t0), SHL(EX(t1),6*2))); + assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2))); + assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2))); + + // Then deinterleave at 32x4 granularity. + math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2); + return; + } + + if (laneSzBlg2 == 0) { + // 8x16. This is the same scheme as for 16x8, with twice the + // number of intermediate values. 
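+      // Notation for the layouts below: letters name the source
+      // registers and the hex digits in braces are lane numbers, highest
+      // first; e.g. CBA{9} is the byte triple C[9] B[9] A[9] and C{fe}
+      // is the byte pair C[f] C[e].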
+ // + // u2 == C{f..0} + // u1 == B{f..0} + // u0 == A{f..0} + // + // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a} + // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5} + // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0} + // + // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba} + // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54} + // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10} + // + IRTemp s0, s1, s2, s3, s4, s5, s6, s7, + t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK; + s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7 + = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK + = IRTemp_INVALID; + newTempsV128_4(&s0, &s1, &s2, &s3); + newTempsV128_4(&s4, &s5, &s6, &s7); + newTempsV128_4(&t0, &t1, &t2, &t3); + newTempsV128_4(&t4, &t5, &t6, &t7); + newTempsV128_4(&p0, &p1, &p2, &cMASK); + + // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0} + // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2} + // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4} + // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6} + // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8} + // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a} + // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c} + // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e} + assign(s0, SL(EX(i1),EX(i0), 0)); + assign(s1, SL(EX(i1),EX(i0), 6)); + assign(s2, SL(EX(i1),EX(i0),12)); + assign(s3, SL(EX(i2),EX(i1), 2)); + assign(s4, SL(EX(i2),EX(i1), 8)); + assign(s5, SL(EX(i2),EX(i1),14)); + assign(s6, SL(EX(i0),EX(i2), 4)); + assign(s7, SL(EX(i0),EX(i2),10)); + + // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0 + // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2 + // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4 + // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6 + // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8 + // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa + // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac + // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae + assign(cMASK, mkV128(0x003F)); + assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK))); + assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK))); + assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK))); + assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK))); + assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK))); + assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK))); + assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK))); + assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK))); + + assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) )); + assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8), + SHL(EX(t3),2), SHR(EX(t2),4) )); + assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) )); + + // Then deinterleave at 16x8 granularity. + math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2); + return; + } + + /*NOTREACHED*/ + vassert(0); +} + + +/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */ +static +void math_DEINTERLEAVE4_128( + /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3, + UInt laneSzBlg2, + IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 ) +{ + if (laneSzBlg2 == 3) { + // 64x2 + assign(*u0, ILO64x2(EX(i2), EX(i0))); + assign(*u1, IHI64x2(EX(i2), EX(i0))); + assign(*u2, ILO64x2(EX(i3), EX(i1))); + assign(*u3, IHI64x2(EX(i3), EX(i1))); + return; + } + if (laneSzBlg2 == 2) { + // 32x4 + IRTemp p0 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p3 = newTempV128(); + assign(p0, ILO32x4(EX(i1), EX(i0))); + assign(p1, IHI32x4(EX(i1), EX(i0))); + assign(p2, ILO32x4(EX(i3), EX(i2))); + assign(p3, IHI32x4(EX(i3), EX(i2))); + // And now do what we did for the 64-bit case. 
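+      // Writing the four struct elements as A..D, i0..i3 above are
+      // D0 C0 B0 A0, D1 C1 B1 A1, D2 C2 B2 A2 and D3 C3 B3 A3, so the
+      // interleaves give p0 == B1 B0 A1 A0, p1 == D1 D0 C1 C0,
+      // p2 == B3 B2 A3 A2, p3 == D3 D2 C3 C2; the 64x2 step below then
+      // separates out the A, B, C and D lanes.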
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3); + return; + } + if (laneSzBlg2 == 1) { + // 16x8 + // Deinterleave into 32-bit chunks, then do as the 32-bit case. + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp p3 = newTempV128(); + assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8))); + assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8))); + assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8))); + assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8))); + // From here on is like the 32 bit case. + math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3); + return; + } + if (laneSzBlg2 == 0) { + // 8x16 + // Deinterleave into 16-bit chunks, then do as the 16-bit case. + IRTemp p0 = newTempV128(); + IRTemp p1 = newTempV128(); + IRTemp p2 = newTempV128(); + IRTemp p3 = newTempV128(); + assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)), + ILO8x16(EX(i0),ROL(EX(i0),4)) )); + assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)), + ILO8x16(EX(i1),ROL(EX(i1),4)) )); + assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)), + ILO8x16(EX(i2),ROL(EX(i2),4)) )); + assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)), + ILO8x16(EX(i3),ROL(EX(i3),4)) )); + // From here on is like the 16 bit case. + math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3); + return; + } + /*NOTREACHED*/ + vassert(0); +} + + +/* Wrappers that use the full-width (de)interleavers to do half-width + (de)interleaving. The scheme is to clone each input lane in the + lower half of each incoming value, do a full width (de)interleave + at the next lane size up, and remove every other lane of the the + result. The returned values may have any old junk in the upper + 64 bits -- the caller must ignore that. */ + +/* Helper function -- get doubling and narrowing operations. */ +static +void math_get_doubler_and_halver ( /*OUT*/IROp* doubler, + /*OUT*/IROp* halver, + UInt laneSzBlg2 ) +{ + switch (laneSzBlg2) { + case 2: + *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4; + break; + case 1: + *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8; + break; + case 0: + *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16; + break; + default: + vassert(0); + } +} + +/* Do interleaving for 1 64 bit vector, for ST1 insns. */ +static +void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0, + UInt laneSzBlg2, IRTemp u0 ) +{ + assign(*i0, mkexpr(u0)); +} + + +/* Do interleaving for 2 64 bit vectors, for ST2 insns. */ +static +void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1, + UInt laneSzBlg2, IRTemp u0, IRTemp u1 ) +{ + if (laneSzBlg2 == 3) { + // 1x64, degenerate case + assign(*i0, EX(u0)); + assign(*i1, EX(u1)); + return; + } + + vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2); + IROp doubler = Iop_INVALID, halver = Iop_INVALID; + math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2); + + IRTemp du0 = newTempV128(); + IRTemp du1 = newTempV128(); + assign(du0, binop(doubler, EX(u0), EX(u0))); + assign(du1, binop(doubler, EX(u1), EX(u1))); + IRTemp di0 = newTempV128(); + IRTemp di1 = newTempV128(); + math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1); + assign(*i0, binop(halver, EX(di0), EX(di0))); + assign(*i1, binop(halver, EX(di1), EX(di1))); +} + + +/* Do interleaving for 3 64 bit vectors, for ST3 insns. 
*/ +static +void math_INTERLEAVE3_64( + /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, + UInt laneSzBlg2, + IRTemp u0, IRTemp u1, IRTemp u2 ) +{ + if (laneSzBlg2 == 3) { + // 1x64, degenerate case + assign(*i0, EX(u0)); + assign(*i1, EX(u1)); + assign(*i2, EX(u2)); + return; + } + + vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2); + IROp doubler = Iop_INVALID, halver = Iop_INVALID; + math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2); + + IRTemp du0 = newTempV128(); + IRTemp du1 = newTempV128(); + IRTemp du2 = newTempV128(); + assign(du0, binop(doubler, EX(u0), EX(u0))); + assign(du1, binop(doubler, EX(u1), EX(u1))); + assign(du2, binop(doubler, EX(u2), EX(u2))); + IRTemp di0 = newTempV128(); + IRTemp di1 = newTempV128(); + IRTemp di2 = newTempV128(); + math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2); + assign(*i0, binop(halver, EX(di0), EX(di0))); + assign(*i1, binop(halver, EX(di1), EX(di1))); + assign(*i2, binop(halver, EX(di2), EX(di2))); +} + + +/* Do interleaving for 4 64 bit vectors, for ST4 insns. */ +static +void math_INTERLEAVE4_64( + /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3, + UInt laneSzBlg2, + IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 ) +{ + if (laneSzBlg2 == 3) { + // 1x64, degenerate case + assign(*i0, EX(u0)); + assign(*i1, EX(u1)); + assign(*i2, EX(u2)); + assign(*i3, EX(u3)); + return; + } + + vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2); + IROp doubler = Iop_INVALID, halver = Iop_INVALID; + math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2); + + IRTemp du0 = newTempV128(); + IRTemp du1 = newTempV128(); + IRTemp du2 = newTempV128(); + IRTemp du3 = newTempV128(); + assign(du0, binop(doubler, EX(u0), EX(u0))); + assign(du1, binop(doubler, EX(u1), EX(u1))); + assign(du2, binop(doubler, EX(u2), EX(u2))); + assign(du3, binop(doubler, EX(u3), EX(u3))); + IRTemp di0 = newTempV128(); + IRTemp di1 = newTempV128(); + IRTemp di2 = newTempV128(); + IRTemp di3 = newTempV128(); + math_INTERLEAVE4_128(&di0, &di1, &di2, &di3, + laneSzBlg2 + 1, du0, du1, du2, du3); + assign(*i0, binop(halver, EX(di0), EX(di0))); + assign(*i1, binop(halver, EX(di1), EX(di1))); + assign(*i2, binop(halver, EX(di2), EX(di2))); + assign(*i3, binop(halver, EX(di3), EX(di3))); +} + + +/* Do deinterleaving for 1 64 bit vector, for LD1 insns. */ +static +void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0, + UInt laneSzBlg2, IRTemp i0 ) +{ + assign(*u0, mkexpr(i0)); +} + + +/* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */ +static +void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1, + UInt laneSzBlg2, IRTemp i0, IRTemp i1 ) +{ + if (laneSzBlg2 == 3) { + // 1x64, degenerate case + assign(*u0, EX(i0)); + assign(*u1, EX(i1)); + return; + } + + vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2); + IROp doubler = Iop_INVALID, halver = Iop_INVALID; + math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2); + + IRTemp di0 = newTempV128(); + IRTemp di1 = newTempV128(); + assign(di0, binop(doubler, EX(i0), EX(i0))); + assign(di1, binop(doubler, EX(i1), EX(i1))); + + IRTemp du0 = newTempV128(); + IRTemp du1 = newTempV128(); + math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1); + assign(*u0, binop(halver, EX(du0), EX(du0))); + assign(*u1, binop(halver, EX(du1), EX(du1))); +} + + +/* Do deinterleaving for 3 64 bit vectors, for LD3 insns. 
*/ +static +void math_DEINTERLEAVE3_64( + /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, + UInt laneSzBlg2, + IRTemp i0, IRTemp i1, IRTemp i2 ) +{ + if (laneSzBlg2 == 3) { + // 1x64, degenerate case + assign(*u0, EX(i0)); + assign(*u1, EX(i1)); + assign(*u2, EX(i2)); + return; + } + + vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2); + IROp doubler = Iop_INVALID, halver = Iop_INVALID; + math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2); + + IRTemp di0 = newTempV128(); + IRTemp di1 = newTempV128(); + IRTemp di2 = newTempV128(); + assign(di0, binop(doubler, EX(i0), EX(i0))); + assign(di1, binop(doubler, EX(i1), EX(i1))); + assign(di2, binop(doubler, EX(i2), EX(i2))); + IRTemp du0 = newTempV128(); + IRTemp du1 = newTempV128(); + IRTemp du2 = newTempV128(); + math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2); + assign(*u0, binop(halver, EX(du0), EX(du0))); + assign(*u1, binop(halver, EX(du1), EX(du1))); + assign(*u2, binop(halver, EX(du2), EX(du2))); +} + + +/* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */ +static +void math_DEINTERLEAVE4_64( + /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3, + UInt laneSzBlg2, + IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 ) +{ + if (laneSzBlg2 == 3) { + // 1x64, degenerate case + assign(*u0, EX(i0)); + assign(*u1, EX(i1)); + assign(*u2, EX(i2)); + assign(*u3, EX(i3)); + return; + } + + vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2); + IROp doubler = Iop_INVALID, halver = Iop_INVALID; + math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2); + + IRTemp di0 = newTempV128(); + IRTemp di1 = newTempV128(); + IRTemp di2 = newTempV128(); + IRTemp di3 = newTempV128(); + assign(di0, binop(doubler, EX(i0), EX(i0))); + assign(di1, binop(doubler, EX(i1), EX(i1))); + assign(di2, binop(doubler, EX(i2), EX(i2))); + assign(di3, binop(doubler, EX(i3), EX(i3))); + IRTemp du0 = newTempV128(); + IRTemp du1 = newTempV128(); + IRTemp du2 = newTempV128(); + IRTemp du3 = newTempV128(); + math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3, + laneSzBlg2 + 1, di0, di1, di2, di3); + assign(*u0, binop(halver, EX(du0), EX(du0))); + assign(*u1, binop(halver, EX(du1), EX(du1))); + assign(*u2, binop(halver, EX(du2), EX(du2))); + assign(*u3, binop(halver, EX(du3), EX(du3))); +} + + +#undef EX +#undef SL +#undef ROR +#undef ROL +#undef SHR +#undef SHL +#undef ILO64x2 +#undef IHI64x2 +#undef ILO32x4 +#undef IHI32x4 +#undef ILO16x8 +#undef IHI16x8 +#undef ILO16x8 +#undef IHI16x8 +#undef CEV32x4 +#undef COD32x4 +#undef COD16x8 +#undef COD8x16 +#undef CEV8x16 +#undef AND +#undef OR2 +#undef OR3 +#undef OR4 + + /*------------------------------------------------------------*/ /*--- Load and Store instructions ---*/ /*------------------------------------------------------------*/ @@ -3526,7 +4470,7 @@ static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt ) case BITS3(0,1,0): break; // 32 bit, valid for both int and vec case BITS3(0,1,1): break; // 64 bit, valid for both int and vec case BITS3(1,0,0): // can only ever be valid for the vector case - if (isInt) goto fail; else goto fail; + if (isInt) goto fail; else break; case BITS3(1,0,1): // these sizes are never valid case BITS3(1,1,0): case BITS3(1,1,1): goto fail; @@ -4250,21 +5194,21 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) sz==11 isn't allowed simm7 is scaled by the (single-register) transfer size - 31 29 22 21 14 9 4 - sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm - (at-Rn-then-Rn=EA) + 31 29 26 22 21 14 9 4 - sz 101 1011 L imm7 t2 n t1 mmP SDQt1, 
SDQt2, [Xn|SP, #imm]! - (at-EA-then-Rn=EA) + sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm] + (at-EA, with nontemporal hint) + + sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm + (at-Rn-then-Rn=EA) sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm] - (at-EA) - */ + (at-EA) - UInt insn_29_23 = INSN(29,23); - if (insn_29_23 == BITS7(1,0,1,1,0,0,1) - || insn_29_23 == BITS7(1,0,1,1,0,1,1) - || insn_29_23 == BITS7(1,0,1,1,0,1,0)) { + sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]! + (at-EA-then-Rn=EA) + */ + if (INSN(29,25) == BITS5(1,0,1,1,0)) { UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units Bool isLD = INSN(22,22) == 1; Bool wBack = INSN(23,23) == 1; @@ -4293,6 +5237,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) case BITS2(1,1): assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break; case BITS2(1,0): + case BITS2(0,0): assign(tTA, mkexpr(tEA)); /* tWA is unused */ break; default: vassert(0); /* NOTREACHED */ @@ -4358,6 +5303,9 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) case BITS2(1,0): fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n"; break; + case BITS2(0,0): + fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n"; + break; default: vassert(0); } @@ -4389,7 +5337,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30); Bool isLD = INSN(22,22) == 1; UInt tt = INSN(4,0); - if (szLg2 >= 4) goto after_LDR_STR_vector_register; + if (szLg2 > 4) goto after_LDR_STR_vector_register; IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/); if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register; switch (szLg2) { @@ -4433,8 +5381,17 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); } break; - case 4: return False; //ATC - default: vassert(0); + case 4: + if (isLD) { + putQReg128(tt, loadLE(Ity_V128, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf); + } else { + storeLE(mkexpr(ea), getQReg128(tt)); + DIP("str %s, %s\n", nameQReg128(tt), dis_buf); + } + break; + default: + vassert(0); } return True; } @@ -4643,140 +5600,217 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) return True; } - /* ---------- LD1/ST1 (single structure, no offset) ---------- */ - /* 31 23 - 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP] - 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP] - 0100 1100 0100 0000 0111 10 N T LD1 {vT.4s}, [Xn|SP] - 0100 1100 0000 0000 0111 10 N T ST1 {vT.4s}, [Xn|SP] - 0100 1100 0100 0000 0111 01 N T LD1 {vT.8h}, [Xn|SP] - 0100 1100 0000 0000 0111 01 N T ST1 {vT.8h}, [Xn|SP] - 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP] - 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP] - FIXME does this assume that the host is little endian? - */ - if ( (insn & 0xFFFFF000) == 0x4C407000 // LD1 cases - || (insn & 0xFFFFF000) == 0x4C007000 // ST1 cases - ) { - Bool isLD = INSN(22,22) == 1; - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); - const HChar* names[4] = { "2d", "4s", "8h", "16b" }; - const HChar* name = names[INSN(11,10)]; - assign(tEA, getIReg64orSP(rN)); - if (rN == 31) { /* FIXME generate stack alignment check */ } - if (isLD) { - putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA))); - } else { - storeLE(mkexpr(tEA), getQReg128(vT)); - } - DIP("%s {v%u.%s}, [%s]\n", isLD ? 
"ld1" : "st1", - vT, name, nameIReg64orSP(rN)); - return True; - } + /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg ------ */ + /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */ + /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */ + /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */ + /* 31 29 26 22 21 20 15 11 9 4 - /* 31 23 - 0000 1100 0100 0000 0111 11 N T LD1 {vT.1d}, [Xn|SP] - 0000 1100 0000 0000 0111 11 N T ST1 {vT.1d}, [Xn|SP] - 0000 1100 0100 0000 0111 10 N T LD1 {vT.2s}, [Xn|SP] - 0000 1100 0000 0000 0111 10 N T ST1 {vT.2s}, [Xn|SP] - 0000 1100 0100 0000 0111 01 N T LD1 {vT.4h}, [Xn|SP] - 0000 1100 0000 0000 0111 01 N T ST1 {vT.4h}, [Xn|SP] - 0000 1100 0100 0000 0111 00 N T LD1 {vT.8b}, [Xn|SP] - 0000 1100 0000 0000 0111 00 N T ST1 {vT.8b}, [Xn|SP] - FIXME does this assume that the host is little endian? - */ - if ( (insn & 0xFFFFF000) == 0x0C407000 // LD1 cases - || (insn & 0xFFFFF000) == 0x0C007000 // ST1 cases - ) { - Bool isLD = INSN(22,22) == 1; - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); - const HChar* names[4] = { "1d", "2s", "4h", "8b" }; - const HChar* name = names[INSN(11,10)]; - assign(tEA, getIReg64orSP(rN)); - if (rN == 31) { /* FIXME generate stack alignment check */ } - if (isLD) { - putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA))); - putQRegLane(vT, 1, mkU64(0)); - } else { - storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); - } - DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", - vT, name, nameIReg64orSP(rN)); - return True; - } + 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP] + 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step - /* ---------- LD1/ST1 (single structure, post index) ---------- */ - /* 31 23 - 0100 1100 1001 1111 0111 11 N T ST1 {vT.2d}, [xN|SP], #16 - 0100 1100 1101 1111 0111 11 N T LD1 {vT.2d}, [xN|SP], #16 - 0100 1100 1001 1111 0111 10 N T ST1 {vT.4s}, [xN|SP], #16 - 0100 1100 1101 1111 0111 10 N T LD1 {vT.4s}, [xN|SP], #16 - 0100 1100 1001 1111 0111 01 N T ST1 {vT.8h}, [xN|SP], #16 - 0100 1100 1101 1111 0111 01 N T LD1 {vT.8h}, [xN|SP], #16 - 0100 1100 1001 1111 0111 00 N T ST1 {vT.16b}, [xN|SP], #16 - 0100 1100 1101 1111 0111 00 N T LD1 {vT.16b}, [xN|SP], #16 - Note that #16 is implied and cannot be any other value. - FIXME does this assume that the host is little endian? 
+ 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP] + 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step + + 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP] + 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step + + 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP] + 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step + + T = defined by Q and sz in the normal way + step = if m == 11111 then transfer-size else Xm + xx = case L of 1 -> LD ; 0 -> ST */ - if ( (insn & 0xFFFFF000) == 0x4CDF7000 // LD1 cases - || (insn & 0xFFFFF000) == 0x4C9F7000 // ST1 cases - ) { - Bool isLD = INSN(22,22) == 1; - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); - const HChar* names[4] = { "2d", "4s", "8h", "16b" }; - const HChar* name = names[INSN(11,10)]; - assign(tEA, getIReg64orSP(rN)); - if (rN == 31) { /* FIXME generate stack alignment check */ } - if (isLD) { - putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA))); - } else { - storeLE(mkexpr(tEA), getQReg128(vT)); + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0) + && INSN(21,21) == 0) { + Bool bitQ = INSN(30,30); + Bool isPX = INSN(23,23) == 1; + Bool isLD = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt opc = INSN(15,12); + UInt sz = INSN(11,10); + UInt nn = INSN(9,5); + UInt tt = INSN(4,0); + Bool isQ = bitQ == 1; + Bool is1d = sz == BITS2(1,1) && !isQ; + UInt nRegs = 0; + switch (opc) { + case BITS4(0,0,0,0): nRegs = 4; break; + case BITS4(0,1,0,0): nRegs = 3; break; + case BITS4(1,0,0,0): nRegs = 2; break; + case BITS4(0,1,1,1): nRegs = 1; break; + default: break; } - putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16))); - DIP("%s {v%u.%s}, [%s], #16\n", isLD ? "ld1" : "st1", - vT, name, nameIReg64orSP(rN)); - return True; - } + + if (nRegs == 1 /* .1d is allowed */ + || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) { - /* 31 23 - 0000 1100 1001 1111 0111 11 N T ST1 {vT.1d}, [xN|SP], #8 - 0000 1100 1101 1111 0111 11 N T LD1 {vT.1d}, [xN|SP], #8 - 0000 1100 1001 1111 0111 10 N T ST1 {vT.2s}, [xN|SP], #8 - 0000 1100 1101 1111 0111 10 N T LD1 {vT.2s}, [xN|SP], #8 - 0000 1100 1001 1111 0111 01 N T ST1 {vT.4h}, [xN|SP], #8 - 0000 1100 1101 1111 0111 01 N T LD1 {vT.4h}, [xN|SP], #8 - 0000 1100 1001 1111 0111 00 N T ST1 {vT.8b}, [xN|SP], #8 - 0000 1100 1101 1111 0111 00 N T LD1 {vT.8b}, [xN|SP], #8 - Note that #8 is implied and cannot be any other value. - FIXME does this assume that the host is little endian? - */ - if ( (insn & 0xFFFFF000) == 0x0CDF7000 // LD1 cases - || (insn & 0xFFFFF000) == 0x0C9F7000 // ST1 cases - ) { - Bool isLD = INSN(22,22) == 1; - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); - const HChar* names[4] = { "1d", "2s", "4h", "8b" }; - const HChar* name = names[INSN(11,10)]; - assign(tEA, getIReg64orSP(rN)); - if (rN == 31) { /* FIXME generate stack alignment check */ } - if (isLD) { - putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA))); - putQRegLane(vT, 1, mkU64(0)); - } else { - storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + UInt xferSzB = (isQ ? 16 : 8) * nRegs; + + /* Generate the transfer address (TA) and if necessary the + writeback address (WB) */ + IRTemp tTA = newTemp(Ity_I64); + assign(tTA, getIReg64orSP(nn)); + if (nn == 31) { /* FIXME generate stack alignment check */ } + IRTemp tWB = IRTemp_INVALID; + if (isPX) { + tWB = newTemp(Ity_I64); + assign(tWB, binop(Iop_Add64, + mkexpr(tTA), + mm == BITS5(1,1,1,1,1) ? 
mkU64(xferSzB) + : getIReg64orZR(mm))); + } + + /* -- BEGIN generate the transfers -- */ + + IRTemp u0, u1, u2, u3, i0, i1, i2, i3; + u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID; + switch (nRegs) { + case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */ + case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */ + case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */ + case 1: u0 = newTempV128(); i0 = newTempV128(); break; + default: vassert(0); + } + + /* -- Multiple 128 or 64 bit stores -- */ + if (!isLD) { + switch (nRegs) { + case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */ + case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */ + case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */ + case 1: assign(u0, getQReg128((tt+0) % 32)); break; + default: vassert(0); + } + switch (nRegs) { + case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64) + (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3); + break; + case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64) + (&i0, &i1, &i2, sz, u0, u1, u2); + break; + case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64) + (&i0, &i1, sz, u0, u1); + break; + case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64) + (&i0, sz, u0); + break; + default: vassert(0); + } +# define MAYBE_NARROW_TO_64(_expr) \ + (isQ ? (_expr) : unop(Iop_V128to64,(_expr))) + UInt step = isQ ? 16 : 8; + switch (nRegs) { + case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)), + MAYBE_NARROW_TO_64(mkexpr(i3)) ); + /* fallthru */ + case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)), + MAYBE_NARROW_TO_64(mkexpr(i2)) ); + /* fallthru */ + case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)), + MAYBE_NARROW_TO_64(mkexpr(i1)) ); + /* fallthru */ + case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)), + MAYBE_NARROW_TO_64(mkexpr(i0)) ); + break; + default: vassert(0); + } +# undef MAYBE_NARROW_TO_64 + } + + /* -- Multiple 128 or 64 bit loads -- */ + else /* isLD */ { + UInt step = isQ ? 16 : 8; + IRType loadTy = isQ ? Ity_V128 : Ity_I64; +# define MAYBE_WIDEN_FROM_64(_expr) \ + (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr))) + switch (nRegs) { + case 4: + assign(i3, MAYBE_WIDEN_FROM_64( + loadLE(loadTy, + binop(Iop_Add64, mkexpr(tTA), + mkU64(3 * step))))); + /* fallthru */ + case 3: + assign(i2, MAYBE_WIDEN_FROM_64( + loadLE(loadTy, + binop(Iop_Add64, mkexpr(tTA), + mkU64(2 * step))))); + /* fallthru */ + case 2: + assign(i1, MAYBE_WIDEN_FROM_64( + loadLE(loadTy, + binop(Iop_Add64, mkexpr(tTA), + mkU64(1 * step))))); + /* fallthru */ + + case 1: + assign(i0, MAYBE_WIDEN_FROM_64( + loadLE(loadTy, + binop(Iop_Add64, mkexpr(tTA), + mkU64(0 * step))))); + break; + default: + vassert(0); + } +# undef MAYBE_WIDEN_FROM_64 + switch (nRegs) { + case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64) + (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3); + break; + case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64) + (&u0, &u1, &u2, sz, i0, i1, i2); + break; + case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64) + (&u0, &u1, sz, i0, i1); + break; + case 1: (isQ ? 
math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64) + (&u0, sz, i0); + break; + default: vassert(0); + } + switch (nRegs) { + case 4: putQReg128( (tt+3) % 32, + math_MAYBE_ZERO_HI64(bitQ, u3)); + /* fallthru */ + case 3: putQReg128( (tt+2) % 32, + math_MAYBE_ZERO_HI64(bitQ, u2)); + /* fallthru */ + case 2: putQReg128( (tt+1) % 32, + math_MAYBE_ZERO_HI64(bitQ, u1)); + /* fallthru */ + case 1: putQReg128( (tt+0) % 32, + math_MAYBE_ZERO_HI64(bitQ, u0)); + break; + default: vassert(0); + } + } + + /* -- END generate the transfers -- */ + + /* Do the writeback, if necessary */ + if (isPX) { + putIReg64orSP(nn, mkexpr(tWB)); + } + + HChar pxStr[20]; + pxStr[0] = pxStr[sizeof(pxStr)-1] = 0; + if (isPX) { + if (mm == BITS5(1,1,1,1,1)) + vex_sprintf(pxStr, ", #%u", xferSzB); + else + vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm)); + } + const HChar* arr = nameArr_Q_SZ(bitQ, sz); + DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n", + isLD ? "ld" : "st", nRegs, + (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn), + pxStr); + + return True; } - putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(8))); - DIP("%s {v%u.%s}, [%s], #8\n", isLD ? "ld1" : "st1", - vT, name, nameIReg64orSP(rN)); - return True; + /* else fall through */ } /* ---------- LD1R (single structure, replicate) ---------- */ @@ -4824,81 +5858,6 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) return False; } - /* -------- LD2/ST2 (multi 2-elem structs, 2 regs, post index) -------- */ - /* Only a very few cases. */ - /* 31 23 11 9 4 - 0100 1100 1101 1111 1000 11 n t LD2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32 - 0100 1100 1001 1111 1000 11 n t ST2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32 - 0100 1100 1101 1111 1000 10 n t LD2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32 - 0100 1100 1001 1111 1000 10 n t ST2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32 - */ - if ( (insn & 0xFFFFFC00) == 0x4CDF8C00 // LD2 .2d - || (insn & 0xFFFFFC00) == 0x4C9F8C00 // ST2 .2d - || (insn & 0xFFFFFC00) == 0x4CDF8800 // LD2 .4s - || (insn & 0xFFFFFC00) == 0x4C9F8800 // ST2 .4s - ) { - Bool isLD = INSN(22,22) == 1; - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); - UInt sz = INSN(11,10); - const HChar* name = "??"; - assign(tEA, getIReg64orSP(rN)); - if (rN == 31) { /* FIXME generate stack alignment check */ } - IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0)); - IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8)); - IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16)); - IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24)); - if (sz == BITS2(1,1)) { - name = "2d"; - if (isLD) { - putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0)); - putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_16)); - putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8)); - putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_24)); - } else { - storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64)); - storeLE(tEA_16, getQRegLane((vT+0) % 32, 1, Ity_I64)); - storeLE(tEA_8, getQRegLane((vT+1) % 32, 0, Ity_I64)); - storeLE(tEA_24, getQRegLane((vT+1) % 32, 1, Ity_I64)); - } - } - else if (sz == BITS2(1,0)) { - /* Uh, this is ugly. TODO: better. 
*/ - name = "4s"; - IRExpr* tEA_4 = binop(Iop_Add64, mkexpr(tEA), mkU64(4)); - IRExpr* tEA_12 = binop(Iop_Add64, mkexpr(tEA), mkU64(12)); - IRExpr* tEA_20 = binop(Iop_Add64, mkexpr(tEA), mkU64(20)); - IRExpr* tEA_28 = binop(Iop_Add64, mkexpr(tEA), mkU64(28)); - if (isLD) { - putQRegLane((vT+0) % 32, 0, loadLE(Ity_I32, tEA_0)); - putQRegLane((vT+0) % 32, 1, loadLE(Ity_I32, tEA_8)); - putQRegLane((vT+0) % 32, 2, loadLE(Ity_I32, tEA_16)); - putQRegLane((vT+0) % 32, 3, loadLE(Ity_I32, tEA_24)); - putQRegLane((vT+1) % 32, 0, loadLE(Ity_I32, tEA_4)); - putQRegLane((vT+1) % 32, 1, loadLE(Ity_I32, tEA_12)); - putQRegLane((vT+1) % 32, 2, loadLE(Ity_I32, tEA_20)); - putQRegLane((vT+1) % 32, 3, loadLE(Ity_I32, tEA_28)); - } else { - storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I32)); - storeLE(tEA_8, getQRegLane((vT+0) % 32, 1, Ity_I32)); - storeLE(tEA_16, getQRegLane((vT+0) % 32, 2, Ity_I32)); - storeLE(tEA_24, getQRegLane((vT+0) % 32, 3, Ity_I32)); - storeLE(tEA_4, getQRegLane((vT+1) % 32, 0, Ity_I32)); - storeLE(tEA_12, getQRegLane((vT+1) % 32, 1, Ity_I32)); - storeLE(tEA_20, getQRegLane((vT+1) % 32, 2, Ity_I32)); - storeLE(tEA_28, getQRegLane((vT+1) % 32, 3, Ity_I32)); - } - } - else { - vassert(0); // Can't happen. - } - putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(32))); - DIP("%s {v%u.%s, v%u.%s}, [%s], #32\n", isLD ? "ld2" : "st2", - (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN)); - return True; - } - /* -------- LD1/ST1 (multi 1-elem structs, 2 regs, no offset) -------- */ /* Only a very few cases. */ /* 31 23 @@ -4995,58 +5954,6 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) return True; } - /* -------- LD3/ST3 (multi 3-elem structs, 3 regs, post index) -------- */ - /* Only a very few cases. */ - /* 31 23 11 9 4 - 0100 1100 1101 1111 0100 11 n t LD3 {Vt.2d .. V(t+2)%32.2d}, [Xn|SP], #48 - 0100 1100 1001 1111 0100 11 n t ST3 {Vt.2d .. V(t+2)%32.2d}, [Xn|SP], #48 - */ - if ( (insn & 0xFFFFFC00) == 0x4CDF4C00 // LD3 .2d - || (insn & 0xFFFFFC00) == 0x4C9F4C00 // ST3 .2d - ) { - Bool isLD = INSN(22,22) == 1; - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); - UInt sz = INSN(11,10); - const HChar* name = "??"; - assign(tEA, getIReg64orSP(rN)); - if (rN == 31) { /* FIXME generate stack alignment check */ } - IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0)); - IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8)); - IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16)); - IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24)); - IRExpr* tEA_32 = binop(Iop_Add64, mkexpr(tEA), mkU64(32)); - IRExpr* tEA_40 = binop(Iop_Add64, mkexpr(tEA), mkU64(40)); - if (sz == BITS2(1,1)) { - name = "2d"; - if (isLD) { - putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0)); - putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_24)); - putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8)); - putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_32)); - putQRegLane((vT+2) % 32, 0, loadLE(Ity_I64, tEA_16)); - putQRegLane((vT+2) % 32, 1, loadLE(Ity_I64, tEA_40)); - } else { - storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64)); - storeLE(tEA_24, getQRegLane((vT+0) % 32, 1, Ity_I64)); - storeLE(tEA_8, getQRegLane((vT+1) % 32, 0, Ity_I64)); - storeLE(tEA_32, getQRegLane((vT+1) % 32, 1, Ity_I64)); - storeLE(tEA_16, getQRegLane((vT+2) % 32, 0, Ity_I64)); - storeLE(tEA_40, getQRegLane((vT+2) % 32, 1, Ity_I64)); - } - } - else { - vassert(0); // Can't happen. 
- } - putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(48))); - DIP("%s {v%u.%s, v%u.%s, v%u.%s}, [%s], #32\n", - isLD ? "ld3" : "st3", - (vT+0) % 32, name, (vT+1) % 32, name, (vT+2) % 32, name, - nameIReg64orSP(rN)); - return True; - } - /* ------------------ LD{,A}X{R,RH,RB} ------------------ */ /* ------------------ ST{,L}X{R,RH,RB} ------------------ */ /* 31 29 23 20 14 9 4 diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index 1b8c818a99..62db9921bb 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -1235,6 +1235,14 @@ ARM64Instr* ARM64Instr_VImmQ (HReg rQ, UShort imm) { i->tag = ARM64in_VImmQ; i->ARM64in.VImmQ.rQ = rQ; i->ARM64in.VImmQ.imm = imm; + /* Check that this is something that can actually be emitted. */ + switch (imm) { + case 0x0000: case 0x0001: case 0x0003: + case 0x000F: case 0x003F: case 0x00FF: case 0xFFFF: + break; + default: + vassert(0); + } return i; } ARM64Instr* ARM64Instr_VDfromX ( HReg rD, HReg rX ) { @@ -4925,41 +4933,38 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64in_VImmQ: { UInt rQ = qregNo(i->ARM64in.VImmQ.rQ); UShort imm = i->ARM64in.VImmQ.imm; - if (imm == 0x0000) { - /* movi rQ.4s, #0x0 == 0x4F 0x00 0x04 000 rQ */ - vassert(rQ < 32); - *p++ = 0x4F000400 | rQ; - goto done; - } - if (imm == 0x0001) { - /* movi rD, #0xFF == 0x2F 0x00 0xE4 001 rD */ - vassert(rQ < 32); - *p++ = 0x2F00E420 | rQ; - goto done; - } - if (imm == 0x0003) { - /* movi rD, #0xFFFF == 0x2F 0x00 0xE4 011 rD */ - vassert(rQ < 32); - *p++ = 0x2F00E460 | rQ; - goto done; - } - if (imm == 0x000F) { - /* movi rD, #0xFFFFFFFF == 0x2F 0x00 0xE5 111 rD */ - vassert(rQ < 32); - *p++ = 0x2F00E5E0 | rQ; - goto done; - } - if (imm == 0x00FF) { - /* movi rD, #0xFFFFFFFFFFFFFFFF == 0x2F 0x07 0xE7 111 rD */ - vassert(rQ < 32); - *p++ = 0x2F07E7E0 | rQ; - goto done; - } - if (imm == 0xFFFF) { - /* mvni rQ.4s, #0x0 == 0x6F 0x00 0x04 000 rQ */ - vassert(rQ < 32); - *p++ = 0x6F000400 | rQ; - goto done; + vassert(rQ < 32); + switch (imm) { + case 0x0000: + // movi rQ.4s, #0x0 == 0x4F 0x00 0x04 000 rQ + *p++ = 0x4F000400 | rQ; + goto done; + case 0x0001: + // movi rQ, #0xFF == 0x2F 0x00 0xE4 001 rQ + *p++ = 0x2F00E420 | rQ; + goto done; + case 0x0003: + // movi rQ, #0xFFFF == 0x2F 0x00 0xE4 011 rQ + *p++ = 0x2F00E460 | rQ; + goto done; + case 0x000F: + // movi rQ, #0xFFFFFFFF == 0x2F 0x00 0xE5 111 rQ + *p++ = 0x2F00E5E0 | rQ; + goto done; + case 0x003F: + // movi rQ, #0xFFFFFFFFFFFF == 0x2F 0x01 0xE7 111 rQ + *p++ = 0x2F01E7E0 | rQ; + goto done; + case 0x00FF: + // movi rQ, #0xFFFFFFFFFFFFFFFF == 0x2F 0x07 0xE7 111 rQ + *p++ = 0x2F07E7E0 | rQ; + goto done; + case 0xFFFF: + // mvni rQ.4s, #0x0 == 0x6F 0x00 0x04 000 rQ + *p++ = 0x6F000400 | rQ; + goto done; + default: + break; } goto bad; /* no other handled cases right now */ } diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 376170a3d2..f11e0b358f 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -2150,10 +2150,38 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) /* Only a very limited range of constants is handled. 
*/ vassert(e->Iex.Const.con->tag == Ico_V128); UShort con = e->Iex.Const.con->Ico.V128; - if (con == 0x0000 || con == 0xFFFF) { - HReg res = newVRegV(env); - addInstr(env, ARM64Instr_VImmQ(res, con)); - return res; + HReg res = newVRegV(env); + switch (con) { + case 0x0000: case 0x000F: case 0x003F: case 0x00FF: case 0xFFFF: + addInstr(env, ARM64Instr_VImmQ(res, con)); + return res; + case 0x00F0: + addInstr(env, ARM64Instr_VImmQ(res, 0x000F)); + addInstr(env, ARM64Instr_VExtV(res, res, res, 12)); + return res; + case 0x0F00: + addInstr(env, ARM64Instr_VImmQ(res, 0x000F)); + addInstr(env, ARM64Instr_VExtV(res, res, res, 8)); + return res; + case 0x0FF0: + addInstr(env, ARM64Instr_VImmQ(res, 0x00FF)); + addInstr(env, ARM64Instr_VExtV(res, res, res, 12)); + return res; + case 0x0FFF: + addInstr(env, ARM64Instr_VImmQ(res, 0x000F)); + addInstr(env, ARM64Instr_VExtV(res, res, res, 4)); + addInstr(env, ARM64Instr_VUnaryV(ARM64vecu_NOT, res, res)); + return res; + case 0xF000: + addInstr(env, ARM64Instr_VImmQ(res, 0x000F)); + addInstr(env, ARM64Instr_VExtV(res, res, res, 4)); + return res; + case 0xFF00: + addInstr(env, ARM64Instr_VImmQ(res, 0x00FF)); + addInstr(env, ARM64Instr_VExtV(res, res, res, 8)); + return res; + default: + break; } /* Unhandled */ goto v128_expr_bad; @@ -2753,51 +2781,50 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) break; } - // JRS 01 Sept 2014: these are tested and believed to be correct, - // but they are no longer used by the front end, hence commented - // out. They are replaced by Iop_SliceV128, which is more general - // and in many cases leads to better code overall. - //case Iop_ShlV128: - //case Iop_ShrV128: { - // Bool isSHR = e->Iex.Binop.op == Iop_ShrV128; - // /* This is tricky. Generate an EXT instruction with zeroes in - // the high operand (shift right) or low operand (shift left). - // Note that we can only slice in the EXT instruction at a byte - // level of granularity, so the shift amount needs careful - // checking. */ - // IRExpr* argL = e->Iex.Binop.arg1; - // IRExpr* argR = e->Iex.Binop.arg2; - // if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) { - // UInt amt = argR->Iex.Const.con->Ico.U8; - // Bool amtOK = False; - // switch (amt) { - // case 0x08: case 0x10: case 0x18: case 0x20: case 0x28: - // case 0x30: case 0x38: case 0x40: case 0x48: case 0x50: - // case 0x58: case 0x60: case 0x68: case 0x70: case 0x78: - // amtOK = True; break; - // } - // /* We could also deal with amt==0 by copying the source to - // the destination, but there's no need for that so far. */ - // if (amtOK) { - // HReg src = iselV128Expr(env, argL); - // HReg srcZ = newVRegV(env); - // addInstr(env, ARM64Instr_VImmQ(srcZ, 0x0000)); - // UInt immB = amt / 8; - // vassert(immB >= 1 && immB <= 15); - // HReg dst = newVRegV(env); - // if (isSHR) { - // addInstr(env, ARM64Instr_VExtV(dst, src/*lo*/, srcZ/*hi*/, - // immB)); - // } else { - // addInstr(env, ARM64Instr_VExtV(dst, srcZ/*lo*/, src/*hi*/, - // 16 - immB)); - // } - // return dst; - // } - // } - // /* else fall out; this is unhandled */ - // break; - //} + // Use Iop_SliceV128 in preference to Iop_ShlV128 and Iop_ShrV128, + // as it is in some ways more general and often leads to better + // code overall. + case Iop_ShlV128: + case Iop_ShrV128: { + Bool isSHR = e->Iex.Binop.op == Iop_ShrV128; + /* This is tricky. Generate an EXT instruction with zeroes in + the high operand (shift right) or low operand (shift left). 
+ Note that we can only slice in the EXT instruction at a byte + level of granularity, so the shift amount needs careful + checking. */ + IRExpr* argL = e->Iex.Binop.arg1; + IRExpr* argR = e->Iex.Binop.arg2; + if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) { + UInt amt = argR->Iex.Const.con->Ico.U8; + Bool amtOK = False; + switch (amt) { + case 0x08: case 0x10: case 0x18: case 0x20: case 0x28: + case 0x30: case 0x38: case 0x40: case 0x48: case 0x50: + case 0x58: case 0x60: case 0x68: case 0x70: case 0x78: + amtOK = True; break; + } + /* We could also deal with amt==0 by copying the source to + the destination, but there's no need for that so far. */ + if (amtOK) { + HReg src = iselV128Expr(env, argL); + HReg srcZ = newVRegV(env); + addInstr(env, ARM64Instr_VImmQ(srcZ, 0x0000)); + UInt immB = amt / 8; + vassert(immB >= 1 && immB <= 15); + HReg dst = newVRegV(env); + if (isSHR) { + addInstr(env, ARM64Instr_VExtV(dst, src/*lo*/, srcZ/*hi*/, + immB)); + } else { + addInstr(env, ARM64Instr_VExtV(dst, srcZ/*lo*/, src/*hi*/, + 16 - immB)); + } + return dst; + } + } + /* else fall out; this is unhandled */ + break; + } case Iop_PolynomialMull8x8: case Iop_Mull32Ux2: @@ -3092,6 +3119,14 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ) } } + if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { + vassert(e->Iex.Load.ty == Ity_F32); + HReg addr = iselIntExpr_R(env, e->Iex.Load.addr); + HReg res = newVRegD(env); + addInstr(env, ARM64Instr_VLdStS(True/*isLoad*/, res, addr, 0)); + return res; + } + if (e->tag == Iex_Get) { Int offs = e->Iex.Get.offset; if (offs >= 0 && offs < 16384 && 0 == (offs & 3)) { -- 2.47.2
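
As a summary of the semantics that the guest_arm64_toIR.c changes above
implement, the multi-register LDn/STn forms can be modelled with plain
scalar code.  The sketch below is illustrative only -- it is not part of
VEX, the names are invented here, and it assumes little-endian byte
ordering:

  #include <string.h>

  typedef unsigned char UChar;
  typedef unsigned int  UInt;

  /* Reference model for LDn (nRegs == n, 1..4): memory holds nLanes
     structures of nRegs elements, each eszB bytes wide; element k of
     structure j is de-interleaved into lane j of register k.  STn is
     the exact inverse (swap the two memcpy addresses). */
  static void model_LDn ( UChar* regs,  /* nRegs * nLanes * eszB bytes */
                          const UChar* mem,
                          UInt nRegs, UInt nLanes, UInt eszB )
  {
     UInt j, k;
     for (j = 0; j < nLanes; j++)
        for (k = 0; k < nRegs; k++)
           memcpy( regs + (k * nLanes + j) * eszB,
                   mem  + (j * nRegs + k) * eszB,
                   eszB );
  }

For example, "ld3 {v0.16b, v1.16b, v2.16b}, [x0]" corresponds to
model_LDn(regs, mem, 3, 16, 1); math_DEINTERLEAVE3_128 above performs the
same rearrangement on data already loaded into three 128-bit temporaries,
using only vector shuffles.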