*t3 = newTempV128();
}
-//static
-//void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
-//{
-// vassert(t1 && *t1 == IRTemp_INVALID);
-// vassert(t2 && *t2 == IRTemp_INVALID);
-// vassert(t3 && *t3 == IRTemp_INVALID);
-// vassert(t4 && *t4 == IRTemp_INVALID);
-// *t1 = newTempV128();
-// *t2 = newTempV128();
-// *t3 = newTempV128();
-// *t4 = newTempV128();
-//}
+static
+void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
+{
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+ vassert(t4 && *t4 == IRTemp_INVALID);
+ *t1 = newTempV128();
+ *t2 = newTempV128();
+ *t3 = newTempV128();
+ *t4 = newTempV128();
+}
static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
}
+/*------------------------------------------------------------*/
+/*--- Math helpers for vector interleave/deinterleave ---*/
+/*------------------------------------------------------------*/
+
+#define EX(_tmp) \
+ mkexpr(_tmp)
+#define SL(_hi128,_lo128,_nbytes) \
+ ( (_nbytes) == 0 \
+ ? (_lo128) \
+ : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
+#define ROR(_v128,_nbytes) \
+ SL((_v128),(_v128),(_nbytes))
+#define ROL(_v128,_nbytes) \
+ SL((_v128),(_v128),16-(_nbytes))
+#define SHR(_v128,_nbytes) \
+ binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
+#define SHL(_v128,_nbytes) \
+ binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
+#define ILO64x2(_argL,_argR) \
+ binop(Iop_InterleaveLO64x2,(_argL),(_argR))
+#define IHI64x2(_argL,_argR) \
+ binop(Iop_InterleaveHI64x2,(_argL),(_argR))
+#define ILO32x4(_argL,_argR) \
+ binop(Iop_InterleaveLO32x4,(_argL),(_argR))
+#define IHI32x4(_argL,_argR) \
+ binop(Iop_InterleaveHI32x4,(_argL),(_argR))
+#define ILO16x8(_argL,_argR) \
+ binop(Iop_InterleaveLO16x8,(_argL),(_argR))
+#define IHI16x8(_argL,_argR) \
+ binop(Iop_InterleaveHI16x8,(_argL),(_argR))
+#define ILO8x16(_argL,_argR) \
+ binop(Iop_InterleaveLO8x16,(_argL),(_argR))
+#define IHI8x16(_argL,_argR) \
+ binop(Iop_InterleaveHI8x16,(_argL),(_argR))
+#define CEV32x4(_argL,_argR) \
+ binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
+#define COD32x4(_argL,_argR) \
+ binop(Iop_CatOddLanes32x4,(_argL),(_argR))
+#define COD16x8(_argL,_argR) \
+ binop(Iop_CatOddLanes16x8,(_argL),(_argR))
+#define COD8x16(_argL,_argR) \
+ binop(Iop_CatOddLanes8x16,(_argL),(_argR))
+#define CEV8x16(_argL,_argR) \
+ binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
+#define AND(_arg1,_arg2) \
+ binop(Iop_AndV128,(_arg1),(_arg2))
+#define OR2(_arg1,_arg2) \
+ binop(Iop_OrV128,(_arg1),(_arg2))
+#define OR3(_arg1,_arg2,_arg3) \
+ binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
+#define OR4(_arg1,_arg2,_arg3,_arg4) \
+ binop(Iop_OrV128, \
+ binop(Iop_OrV128,(_arg1),(_arg2)), \
+ binop(Iop_OrV128,(_arg3),(_arg4)))
+
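+/* Informal notes on the helpers below, to aid reading:
+
+   - laneSzBlg2 is log2 of the lane size in bytes: 0 means 8x16
+     lanes, 1 means 16x8, 2 means 32x4 and 3 means 64x2.
+
+   - SL(hi,lo,n) yields bytes [n+15 : n] of the 256-bit concatenation
+     hi:lo, so ROR/ROL rotate a V128 right/left by n bytes, and
+     SHR/SHL shift by n bytes.
+
+   - The mask constants built with mkV128 (c1100, c0011, cMASK, etc)
+     are byte-granularity masks: each bit of the 16-bit immediate
+     selects one byte of the V128.
+*/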
+
+/* Do interleaving for 1 128 bit vector, for ST1 insns. */
+static
+void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
+ UInt laneSzBlg2, IRTemp u0 )
+{
+ assign(*i0, mkexpr(u0));
+}
+
+
+/* Do interleaving for 2 128 bit vectors, for ST2 insns. */
+static
+void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
+ UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
+{
+ /* This is pretty easy, since we have primitives directly to
+ hand. */
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // u1 == B1 B0, u0 == A1 A0
+ // i1 == B1 A1, i0 == B0 A0
+ assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
+ // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
+ assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // u1 == B{7..0}, u0 == A{7..0}
+ // i0 == B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == B7 A7 B6 A6 B5 A5 B4 A4
+ assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // u1 == B{f..0}, u0 == A{f..0}
+ // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
+ assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do interleaving for 3 128 bit vectors, for ST3 insns. */
+static
+void math_INTERLEAVE3_128(
+ /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
+ // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
+ assign(*i2, IHI64x2( EX(u2), EX(u1) ));
+ assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
+ assign(*i0, ILO64x2( EX(u1), EX(u0) ));
+ return;
+ }
+
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
+ // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
+ // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp c1100 = newTempV128();
+ IRTemp c0011 = newTempV128();
+ IRTemp c0110 = newTempV128();
+ assign(c1100, mkV128(0xFF00));
+ assign(c0011, mkV128(0x00FF));
+ assign(c0110, mkV128(0x0FF0));
+ // First interleave them at 64x2 granularity,
+ // generating partial ("p") values.
+ math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
+ // And more shuffling around for the final answer
+ assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
+ AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
+ assign(*i1, OR3( SHL(EX(p2),12),
+ AND(EX(p1),EX(c0110)),
+ SHR(EX(p0),12) ));
+ assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
+ AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
+ return;
+ }
+
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // u2 == C7 C6 C5 C4 C3 C2 C1 C0
+ // u1 == B7 B6 B5 B4 B3 B2 B1 B0
+ // u0 == A7 A6 A5 A4 A3 A2 A1 A0
+ //
+ // p2 == C7 C6 B7 B6 A7 A6 C5 C4
+ // p1 == B5 B4 A5 A4 C3 C2 B3 B2
+ // p0 == A3 A2 C1 C0 B1 B0 A1 A0
+ //
+ // i2 == C7 B7 A7 C6 B6 A6 C5 B5
+ // i1 == A5 C4 B4 A4 C3 B3 A3 C2
+ // i0 == B2 A2 C1 B1 A1 C0 B0 A0
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp c1000 = newTempV128();
+ IRTemp c0100 = newTempV128();
+ IRTemp c0010 = newTempV128();
+ IRTemp c0001 = newTempV128();
+ assign(c1000, mkV128(0xF000));
+ assign(c0100, mkV128(0x0F00));
+ assign(c0010, mkV128(0x00F0));
+ assign(c0001, mkV128(0x000F));
+ // First interleave them at 32x4 granularity,
+ // generating partial ("p") values.
+ math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
+ // And more shuffling around for the final answer
+ assign(*i2,
+ OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ),
+ AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ),
+ AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
+ AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
+ ));
+ assign(*i1,
+ OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
+ AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ),
+ AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
+ AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
+ ));
+ assign(*i0,
+ OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
+ AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
+ AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
+ AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
+ ));
+ return;
+ }
+
+ if (laneSzBlg2 == 0) {
+ // 8x16. It doesn't seem worth the hassle of first doing a
+ // 16x8 interleave, so just generate all 24 partial results
+ // directly :-(
+ // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
+ // i2 == Cf Bf Af Ce .. Bb Ab Ca
+ // i1 == Ba Aa C9 B9 .. A6 C5 B5
+ // i0 == A5 C4 B4 A4 .. C0 B0 A0
+
+ IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
+ IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
+ IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
+ IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
+ IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
+ IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
+ IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
+ IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
+ IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
+
+ // e.g. XXXX(qqq, CC, 0xF, BB, 0xA) sets qqq to be a vector
+ // of the form 14 bytes junk : CC[0xF] : BB[0xA]
+ //
+# define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
+ IRTemp t_##_tempName = newTempV128(); \
+ assign(t_##_tempName, \
+ ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
+ ROR(EX(_srcVec2),(_srcShift2)) ) )
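+ // Note that each use of XXXX declares, as well as assigns, a new
+ // IRTemp named t_<_tempName>; hence the t_CfBf etc references below.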
+
+ // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
+ IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
+
+ // The slicing and reassembly are done as interleavedly as possible,
+ // so as to minimise the demand for registers in the back end, which
+ // was observed to be a problem in testing.
+
+ XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
+ XXXX(AfCe, AA, 0xf, CC, 0xe);
+ assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
+
+ XXXX(BeAe, BB, 0xe, AA, 0xe);
+ XXXX(CdBd, CC, 0xd, BB, 0xd);
+ assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
+ assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
+
+ XXXX(AdCc, AA, 0xd, CC, 0xc);
+ XXXX(BcAc, BB, 0xc, AA, 0xc);
+ assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
+
+ XXXX(CbBb, CC, 0xb, BB, 0xb);
+ XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
+ assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
+ assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
+ assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
+
+ XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
+ XXXX(C9B9, CC, 0x9, BB, 0x9);
+ assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
+
+ XXXX(A9C8, AA, 0x9, CC, 0x8);
+ XXXX(B8A8, BB, 0x8, AA, 0x8);
+ assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
+ assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
+
+ XXXX(C7B7, CC, 0x7, BB, 0x7);
+ XXXX(A7C6, AA, 0x7, CC, 0x6);
+ assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
+
+ XXXX(B6A6, BB, 0x6, AA, 0x6);
+ XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
+ assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
+ assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
+ assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
+
+ XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
+ XXXX(B4A4, BB, 0x4, AA, 0x4);
+ assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
+
+ XXXX(C3B3, CC, 0x3, BB, 0x3);
+ XXXX(A3C2, AA, 0x3, CC, 0x2);
+ assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
+ assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
+
+ XXXX(B2A2, BB, 0x2, AA, 0x2);
+ XXXX(C1B1, CC, 0x1, BB, 0x1);
+ assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
+
+ XXXX(A1C0, AA, 0x1, CC, 0x0);
+ XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
+ assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
+ assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
+ assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
+
+# undef XXXX
+ return;
+ }
+
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do interleaving for 4 128 bit vectors, for ST4 insns. */
+static
+void math_INTERLEAVE4_128(
+ /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ assign(*i0, ILO64x2(EX(u1), EX(u0)));
+ assign(*i1, ILO64x2(EX(u3), EX(u2)));
+ assign(*i2, IHI64x2(EX(u1), EX(u0)));
+ assign(*i3, IHI64x2(EX(u3), EX(u2)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // First, interleave at the 64-bit lane size.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
+ // And interleave (cat) at the 32 bit size.
+ assign(*i0, CEV32x4(EX(p1), EX(p0)));
+ assign(*i1, COD32x4(EX(p1), EX(p0)));
+ assign(*i2, CEV32x4(EX(p3), EX(p2)));
+ assign(*i3, COD32x4(EX(p3), EX(p2)));
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // First, interleave at the 32-bit lane size.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
+ // And rearrange within each vector, to get the right 16 bit lanes.
+ assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
+ assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
+ assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
+ assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // First, interleave at the 16-bit lane size.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
+ // And rearrange within each vector, to get the right 8 bit lanes.
+ assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
+ assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
+ assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
+ assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
+static
+void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
+ UInt laneSzBlg2, IRTemp i0 )
+{
+ assign(*u0, mkexpr(i0));
+}
+
+
+/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
+static
+void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
+ UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
+{
+ /* This is pretty easy, since we have primitives directly to
+ hand. */
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // i1 == B1 A1, i0 == B0 A0
+ // u1 == B1 B0, u0 == A1 A0
+ assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
+ // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
+ assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // i0 == B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == B7 A7 B6 A6 B5 A5 B4 A4
+ // u1 == B{7..0}, u0 == A{7..0}
+ assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
+ // u1 == B{f..0}, u0 == A{f..0}
+ assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
+static
+void math_DEINTERLEAVE3_128(
+ /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
+ // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
+ assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) ));
+ assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) ));
+ assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) ));
+ return;
+ }
+
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
+ // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
+ // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
+ IRTemp t_a1c0b0a0 = newTempV128();
+ IRTemp t_a2c1b1a1 = newTempV128();
+ IRTemp t_a3c2b2a2 = newTempV128();
+ IRTemp t_a0c3b3a3 = newTempV128();
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ // Compute some intermediate values.
+ assign(t_a1c0b0a0, EX(i0));
+ assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
+ assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
+ assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
+ // First deinterleave into lane-pairs
+ assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
+ assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
+ IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
+ assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
+ // Then deinterleave at 64x2 granularity.
+ math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
+ return;
+ }
+
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // u2 == C7 C6 C5 C4 C3 C2 C1 C0
+ // u1 == B7 B6 B5 B4 B3 B2 B1 B0
+ // u0 == A7 A6 A5 A4 A3 A2 A1 A0
+ //
+ // i2 == C7 B7 A7 C6 B6 A6 C5 B5
+ // i1 == A5 C4 B4 A4 C3 B3 A3 C2
+ // i0 == B2 A2 C1 B1 A1 C0 B0 A0
+ //
+ // p2 == C7 C6 B7 B6 A7 A6 C5 C4
+ // p1 == B5 B4 A5 A4 C3 C2 B3 B2
+ // p0 == A3 A2 C1 C0 B1 B0 A1 A0
+
+ IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
+ s0 = s1 = s2 = s3
+ = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
+ newTempsV128_4(&s0, &s1, &s2, &s3);
+ newTempsV128_4(&t0, &t1, &t2, &t3);
+ newTempsV128_4(&p0, &p1, &p2, &c00111111);
+
+ // s0 == b2a2 c1b1a1 c0b0a0
+ // s1 == b4a4 c3b3a3 c2b2a2
+ // s2 == b6a6 c5b5a5 c4b4a4
+ // s3 == b0a0 c7b7a7 c6b6a6
+ assign(s0, EX(i0));
+ assign(s1, SL(EX(i1),EX(i0),6*2));
+ assign(s2, SL(EX(i2),EX(i1),4*2));
+ assign(s3, SL(EX(i0),EX(i2),2*2));
+
+ // t0 == 0 0 c1c0 b1b0 a1a0
+ // t1 == 0 0 c3c2 b3b2 a3a2
+ // t2 == 0 0 c5c4 b5b4 a5a4
+ // t3 == 0 0 c7c6 b7b6 a7a6
+ assign(c00111111, mkV128(0x0FFF));
+ assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
+ assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
+ assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
+ assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
+
+ assign(p0, OR2(EX(t0), SHL(EX(t1),6*2)));
+ assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
+ assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
+
+ // Then deinterleave at 32x4 granularity.
+ math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
+ return;
+ }
+
+ if (laneSzBlg2 == 0) {
+ // 8x16. This is the same scheme as for 16x8, with twice the
+ // number of intermediate values.
+ //
+ // u2 == C{f..0}
+ // u1 == B{f..0}
+ // u0 == A{f..0}
+ //
+ // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
+ // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
+ // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
+ //
+ // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
+ // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
+ // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
+ //
+ IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
+ t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
+ s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
+ = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
+ = IRTemp_INVALID;
+ newTempsV128_4(&s0, &s1, &s2, &s3);
+ newTempsV128_4(&s4, &s5, &s6, &s7);
+ newTempsV128_4(&t0, &t1, &t2, &t3);
+ newTempsV128_4(&t4, &t5, &t6, &t7);
+ newTempsV128_4(&p0, &p1, &p2, &cMASK);
+
+ // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
+ // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
+ // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
+ // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
+ // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
+ // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
+ // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
+ // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
+ assign(s0, SL(EX(i1),EX(i0), 0));
+ assign(s1, SL(EX(i1),EX(i0), 6));
+ assign(s2, SL(EX(i1),EX(i0),12));
+ assign(s3, SL(EX(i2),EX(i1), 2));
+ assign(s4, SL(EX(i2),EX(i1), 8));
+ assign(s5, SL(EX(i2),EX(i1),14));
+ assign(s6, SL(EX(i0),EX(i2), 4));
+ assign(s7, SL(EX(i0),EX(i2),10));
+
+ // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
+ // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
+ // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
+ // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
+ // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
+ // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
+ // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
+ // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
+ assign(cMASK, mkV128(0x003F));
+ assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
+ assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
+ assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
+ assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
+ assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
+ assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
+ assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
+ assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
+
+ assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
+ assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
+ SHL(EX(t3),2), SHR(EX(t2),4) ));
+ assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
+
+ // Then deinterleave at 16x8 granularity.
+ math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
+ return;
+ }
+
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
+static
+void math_DEINTERLEAVE4_128(
+ /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ assign(*u0, ILO64x2(EX(i2), EX(i0)));
+ assign(*u1, IHI64x2(EX(i2), EX(i0)));
+ assign(*u2, ILO64x2(EX(i3), EX(i1)));
+ assign(*u3, IHI64x2(EX(i3), EX(i1)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ IRTemp p0 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p3 = newTempV128();
+ assign(p0, ILO32x4(EX(i1), EX(i0)));
+ assign(p1, IHI32x4(EX(i1), EX(i0)));
+ assign(p2, ILO32x4(EX(i3), EX(i2)));
+ assign(p3, IHI32x4(EX(i3), EX(i2)));
+ // And now do what we did for the 64-bit case.
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // Deinterleave into 32-bit chunks, then do as the 32-bit case.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
+ assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
+ assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
+ assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
+ // From here on is like the 32 bit case.
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // Deinterleave into 16-bit chunks, then do as the 16-bit case.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
+ ILO8x16(EX(i0),ROL(EX(i0),4)) ));
+ assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
+ ILO8x16(EX(i1),ROL(EX(i1),4)) ));
+ assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
+ ILO8x16(EX(i2),ROL(EX(i2),4)) ));
+ assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
+ ILO8x16(EX(i3),ROL(EX(i3),4)) ));
+ // From here on is like the 16 bit case.
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Wrappers that use the full-width (de)interleavers to do half-width
+ (de)interleaving. The scheme is to clone each input lane in the
+ lower half of each incoming value, do a full width (de)interleave
+ at the next lane size up, and remove every other lane of the
+ result. The returned values may have any old junk in the upper
+ 64 bits -- the caller must ignore that. */
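+
+/* Worked example of the scheme (an informal sketch): take the
+   2-register, 32-bit lane case (laneSzBlg2 == 2), with
+   u1.lo64 == B1 B0 and u0.lo64 == A1 A0.  The doubler
+   (Iop_InterleaveLO32x4) gives du1 == B1 B1 B0 B0 and
+   du0 == A1 A1 A0 A0.  The 64x2 interleave then gives
+   di1 == B1 B1 A1 A1 and di0 == B0 B0 A0 A0, and the halver
+   (Iop_CatEvenLanes32x4) leaves i1.lo64 == B1 A1 and
+   i0.lo64 == B0 A0, which is the required ST2 memory layout.  The
+   upper 64 bits of the results are junk, as noted above. */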
+
+/* Helper function -- get doubling and narrowing operations. */
+static
+void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
+ /*OUT*/IROp* halver,
+ UInt laneSzBlg2 )
+{
+ switch (laneSzBlg2) {
+ case 2:
+ *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
+ break;
+ case 1:
+ *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
+ break;
+ case 0:
+ *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
+ break;
+ default:
+ vassert(0);
+ }
+}
+
+/* Do interleaving for 1 64 bit vector, for ST1 insns. */
+static
+void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
+ UInt laneSzBlg2, IRTemp u0 )
+{
+ assign(*i0, mkexpr(u0));
+}
+
+
+/* Do interleaving for 2 64 bit vectors, for ST2 insns. */
+static
+void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
+ UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*i0, EX(u0));
+ assign(*i1, EX(u1));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ assign(du0, binop(doubler, EX(u0), EX(u0)));
+ assign(du1, binop(doubler, EX(u1), EX(u1)));
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
+ assign(*i0, binop(halver, EX(di0), EX(di0)));
+ assign(*i1, binop(halver, EX(di1), EX(di1)));
+}
+
+
+/* Do interleaving for 3 64 bit vectors, for ST3 insns. */
+static
+void math_INTERLEAVE3_64(
+ /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*i0, EX(u0));
+ assign(*i1, EX(u1));
+ assign(*i2, EX(u2));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ assign(du0, binop(doubler, EX(u0), EX(u0)));
+ assign(du1, binop(doubler, EX(u1), EX(u1)));
+ assign(du2, binop(doubler, EX(u2), EX(u2)));
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
+ assign(*i0, binop(halver, EX(di0), EX(di0)));
+ assign(*i1, binop(halver, EX(di1), EX(di1)));
+ assign(*i2, binop(halver, EX(di2), EX(di2)));
+}
+
+
+/* Do interleaving for 4 64 bit vectors, for ST4 insns. */
+static
+void math_INTERLEAVE4_64(
+ /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*i0, EX(u0));
+ assign(*i1, EX(u1));
+ assign(*i2, EX(u2));
+ assign(*i3, EX(u3));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ IRTemp du3 = newTempV128();
+ assign(du0, binop(doubler, EX(u0), EX(u0)));
+ assign(du1, binop(doubler, EX(u1), EX(u1)));
+ assign(du2, binop(doubler, EX(u2), EX(u2)));
+ assign(du3, binop(doubler, EX(u3), EX(u3)));
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ IRTemp di3 = newTempV128();
+ math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
+ laneSzBlg2 + 1, du0, du1, du2, du3);
+ assign(*i0, binop(halver, EX(di0), EX(di0)));
+ assign(*i1, binop(halver, EX(di1), EX(di1)));
+ assign(*i2, binop(halver, EX(di2), EX(di2)));
+ assign(*i3, binop(halver, EX(di3), EX(di3)));
+}
+
+
+/* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
+static
+void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
+ UInt laneSzBlg2, IRTemp i0 )
+{
+ assign(*u0, mkexpr(i0));
+}
+
+
+/* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
+static
+void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
+ UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*u0, EX(i0));
+ assign(*u1, EX(i1));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ assign(di0, binop(doubler, EX(i0), EX(i0)));
+ assign(di1, binop(doubler, EX(i1), EX(i1)));
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
+ assign(*u0, binop(halver, EX(du0), EX(du0)));
+ assign(*u1, binop(halver, EX(du1), EX(du1)));
+}
+
+
+/* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
+static
+void math_DEINTERLEAVE3_64(
+ /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*u0, EX(i0));
+ assign(*u1, EX(i1));
+ assign(*u2, EX(i2));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ assign(di0, binop(doubler, EX(i0), EX(i0)));
+ assign(di1, binop(doubler, EX(i1), EX(i1)));
+ assign(di2, binop(doubler, EX(i2), EX(i2)));
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
+ assign(*u0, binop(halver, EX(du0), EX(du0)));
+ assign(*u1, binop(halver, EX(du1), EX(du1)));
+ assign(*u2, binop(halver, EX(du2), EX(du2)));
+}
+
+
+/* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
+static
+void math_DEINTERLEAVE4_64(
+ /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*u0, EX(i0));
+ assign(*u1, EX(i1));
+ assign(*u2, EX(i2));
+ assign(*u3, EX(i3));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ IRTemp di3 = newTempV128();
+ assign(di0, binop(doubler, EX(i0), EX(i0)));
+ assign(di1, binop(doubler, EX(i1), EX(i1)));
+ assign(di2, binop(doubler, EX(i2), EX(i2)));
+ assign(di3, binop(doubler, EX(i3), EX(i3)));
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ IRTemp du3 = newTempV128();
+ math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
+ laneSzBlg2 + 1, di0, di1, di2, di3);
+ assign(*u0, binop(halver, EX(du0), EX(du0)));
+ assign(*u1, binop(halver, EX(du1), EX(du1)));
+ assign(*u2, binop(halver, EX(du2), EX(du2)));
+ assign(*u3, binop(halver, EX(du3), EX(du3)));
+}
+
+
+#undef EX
+#undef SL
+#undef ROR
+#undef ROL
+#undef SHR
+#undef SHL
+#undef ILO64x2
+#undef IHI64x2
+#undef ILO32x4
+#undef IHI32x4
+#undef ILO16x8
+#undef IHI16x8
+#undef ILO8x16
+#undef IHI8x16
+#undef CEV32x4
+#undef COD32x4
+#undef COD16x8
+#undef COD8x16
+#undef CEV8x16
+#undef AND
+#undef OR2
+#undef OR3
+#undef OR4
+
+
/*------------------------------------------------------------*/
/*--- Load and Store instructions ---*/
/*------------------------------------------------------------*/
case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
case BITS3(1,0,0): // can only ever be valid for the vector case
- if (isInt) goto fail; else goto fail;
+ if (isInt) goto fail; else break;
case BITS3(1,0,1): // these sizes are never valid
case BITS3(1,1,0):
case BITS3(1,1,1): goto fail;
sz==11 isn't allowed
simm7 is scaled by the (single-register) transfer size
- 31 29 22 21 14 9 4
- sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
- (at-Rn-then-Rn=EA)
+ 31 29 26 22 21 14 9 4
- sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
- (at-EA-then-Rn=EA)
+ sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
+ (at-EA, with nontemporal hint)
+
+ sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
+ (at-Rn-then-Rn=EA)
sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
- (at-EA)
- */
+ (at-EA)
- UInt insn_29_23 = INSN(29,23);
- if (insn_29_23 == BITS7(1,0,1,1,0,0,1)
- || insn_29_23 == BITS7(1,0,1,1,0,1,1)
- || insn_29_23 == BITS7(1,0,1,1,0,1,0)) {
+ sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
+ (at-EA-then-Rn=EA)
+ */
+ if (INSN(29,25) == BITS5(1,0,1,1,0)) {
UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
Bool isLD = INSN(22,22) == 1;
Bool wBack = INSN(23,23) == 1;
case BITS2(1,1):
assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
case BITS2(1,0):
+ case BITS2(0,0):
assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
default:
vassert(0); /* NOTREACHED */
case BITS2(1,0):
fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
break;
+ case BITS2(0,0):
+ fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
+ break;
default:
vassert(0);
}
UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
Bool isLD = INSN(22,22) == 1;
UInt tt = INSN(4,0);
- if (szLg2 >= 4) goto after_LDR_STR_vector_register;
+ if (szLg2 > 4) goto after_LDR_STR_vector_register;
IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
switch (szLg2) {
DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
}
break;
- case 4: return False; //ATC
- default: vassert(0);
+ case 4:
+ if (isLD) {
+ putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
+ DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
+ } else {
+ storeLE(mkexpr(ea), getQReg128(tt));
+ DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
+ }
+ break;
+ default:
+ vassert(0);
}
return True;
}
return True;
}
- /* ---------- LD1/ST1 (single structure, no offset) ---------- */
- /* 31 23
- 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP]
- 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP]
- 0100 1100 0100 0000 0111 10 N T LD1 {vT.4s}, [Xn|SP]
- 0100 1100 0000 0000 0111 10 N T ST1 {vT.4s}, [Xn|SP]
- 0100 1100 0100 0000 0111 01 N T LD1 {vT.8h}, [Xn|SP]
- 0100 1100 0000 0000 0111 01 N T ST1 {vT.8h}, [Xn|SP]
- 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP]
- 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP]
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x4C407000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x4C007000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "2d", "4s", "8h", "16b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA)));
- } else {
- storeLE(mkexpr(tEA), getQReg128(vT));
- }
- DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
- }
+ /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg) ------ */
+ /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs) ------ */
+ /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs) ------ */
+ /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs) ------ */
+ /* 31 29 26 22 21 20 15 11 9 4
- /* 31 23
- 0000 1100 0100 0000 0111 11 N T LD1 {vT.1d}, [Xn|SP]
- 0000 1100 0000 0000 0111 11 N T ST1 {vT.1d}, [Xn|SP]
- 0000 1100 0100 0000 0111 10 N T LD1 {vT.2s}, [Xn|SP]
- 0000 1100 0000 0000 0111 10 N T ST1 {vT.2s}, [Xn|SP]
- 0000 1100 0100 0000 0111 01 N T LD1 {vT.4h}, [Xn|SP]
- 0000 1100 0000 0000 0111 01 N T ST1 {vT.4h}, [Xn|SP]
- 0000 1100 0100 0000 0111 00 N T LD1 {vT.8b}, [Xn|SP]
- 0000 1100 0000 0000 0111 00 N T ST1 {vT.8b}, [Xn|SP]
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x0C407000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x0C007000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "1d", "2s", "4h", "8b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA)));
- putQRegLane(vT, 1, mkU64(0));
- } else {
- storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64));
- }
- DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
- }
+ 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
- /* ---------- LD1/ST1 (single structure, post index) ---------- */
- /* 31 23
- 0100 1100 1001 1111 0111 11 N T ST1 {vT.2d}, [xN|SP], #16
- 0100 1100 1101 1111 0111 11 N T LD1 {vT.2d}, [xN|SP], #16
- 0100 1100 1001 1111 0111 10 N T ST1 {vT.4s}, [xN|SP], #16
- 0100 1100 1101 1111 0111 10 N T LD1 {vT.4s}, [xN|SP], #16
- 0100 1100 1001 1111 0111 01 N T ST1 {vT.8h}, [xN|SP], #16
- 0100 1100 1101 1111 0111 01 N T LD1 {vT.8h}, [xN|SP], #16
- 0100 1100 1001 1111 0111 00 N T ST1 {vT.16b}, [xN|SP], #16
- 0100 1100 1101 1111 0111 00 N T LD1 {vT.16b}, [xN|SP], #16
- Note that #16 is implied and cannot be any other value.
- FIXME does this assume that the host is little endian?
+ 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
+
+ T = defined by Q and sz in the normal way
+ step = if m == 11111 then transfer-size else Xm
+ xx = case L of 1 -> LD ; 0 -> ST
*/
- if ( (insn & 0xFFFFF000) == 0x4CDF7000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x4C9F7000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "2d", "4s", "8h", "16b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA)));
- } else {
- storeLE(mkexpr(tEA), getQReg128(vT));
+ if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
+ && INSN(21,21) == 0) {
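+ /* Overview: decode the fields, compute the transfer address (and
+ the writeback address, if post-indexing), move nRegs consecutive
+ 64- or 128-bit chunks to or from memory, and use the
+ math_INTERLEAVEn_{64,128} and math_DEINTERLEAVEn_{64,128} helpers
+ above to convert between the register and memory layouts. */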
+ UInt bitQ = INSN(30,30);
+ Bool isPX = INSN(23,23) == 1;
+ Bool isLD = INSN(22,22) == 1;
+ UInt mm = INSN(20,16);
+ UInt opc = INSN(15,12);
+ UInt sz = INSN(11,10);
+ UInt nn = INSN(9,5);
+ UInt tt = INSN(4,0);
+ Bool isQ = bitQ == 1;
+ Bool is1d = sz == BITS2(1,1) && !isQ;
+ UInt nRegs = 0;
+ switch (opc) {
+ case BITS4(0,0,0,0): nRegs = 4; break;
+ case BITS4(0,1,0,0): nRegs = 3; break;
+ case BITS4(1,0,0,0): nRegs = 2; break;
+ case BITS4(0,1,1,1): nRegs = 1; break;
+ default: break;
}
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16)));
- DIP("%s {v%u.%s}, [%s], #16\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
- }
+
+ if (nRegs == 1 /* .1d is allowed */
+ || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
- /* 31 23
- 0000 1100 1001 1111 0111 11 N T ST1 {vT.1d}, [xN|SP], #8
- 0000 1100 1101 1111 0111 11 N T LD1 {vT.1d}, [xN|SP], #8
- 0000 1100 1001 1111 0111 10 N T ST1 {vT.2s}, [xN|SP], #8
- 0000 1100 1101 1111 0111 10 N T LD1 {vT.2s}, [xN|SP], #8
- 0000 1100 1001 1111 0111 01 N T ST1 {vT.4h}, [xN|SP], #8
- 0000 1100 1101 1111 0111 01 N T LD1 {vT.4h}, [xN|SP], #8
- 0000 1100 1001 1111 0111 00 N T ST1 {vT.8b}, [xN|SP], #8
- 0000 1100 1101 1111 0111 00 N T LD1 {vT.8b}, [xN|SP], #8
- Note that #8 is implied and cannot be any other value.
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x0CDF7000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x0C9F7000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "1d", "2s", "4h", "8b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA)));
- putQRegLane(vT, 1, mkU64(0));
- } else {
- storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64));
+ UInt xferSzB = (isQ ? 16 : 8) * nRegs;
+
+ /* Generate the transfer address (TA) and if necessary the
+ writeback address (WB) */
+ IRTemp tTA = newTemp(Ity_I64);
+ assign(tTA, getIReg64orSP(nn));
+ if (nn == 31) { /* FIXME generate stack alignment check */ }
+ IRTemp tWB = IRTemp_INVALID;
+ if (isPX) {
+ tWB = newTemp(Ity_I64);
+ assign(tWB, binop(Iop_Add64,
+ mkexpr(tTA),
+ mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
+ : getIReg64orZR(mm)));
+ }
+
+ /* -- BEGIN generate the transfers -- */
+
+ IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
+ u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
+ switch (nRegs) {
+ case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
+ case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
+ case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
+ case 1: u0 = newTempV128(); i0 = newTempV128(); break;
+ default: vassert(0);
+ }
+
+ /* -- Multiple 128 or 64 bit stores -- */
+ if (!isLD) {
+ switch (nRegs) {
+ case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
+ case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
+ case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
+ case 1: assign(u0, getQReg128((tt+0) % 32)); break;
+ default: vassert(0);
+ }
+ switch (nRegs) {
+ case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
+ (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
+ break;
+ case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
+ (&i0, &i1, &i2, sz, u0, u1, u2);
+ break;
+ case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
+ (&i0, &i1, sz, u0, u1);
+ break;
+ case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
+ (&i0, sz, u0);
+ break;
+ default: vassert(0);
+ }
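+ // For the 64-bit (D register) forms, only the low half of each
+ // interleaved V128 is meaningful, so narrow it to I64 before storing.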
+# define MAYBE_NARROW_TO_64(_expr) \
+ (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
+ UInt step = isQ ? 16 : 8;
+ switch (nRegs) {
+ case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i3)) );
+ /* fallthru */
+ case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i2)) );
+ /* fallthru */
+ case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i1)) );
+ /* fallthru */
+ case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i0)) );
+ break;
+ default: vassert(0);
+ }
+# undef MAYBE_NARROW_TO_64
+ }
+
+ /* -- Multiple 128 or 64 bit loads -- */
+ else /* isLD */ {
+ UInt step = isQ ? 16 : 8;
+ IRType loadTy = isQ ? Ity_V128 : Ity_I64;
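+ // For the 64-bit (D register) forms, widen each loaded I64 to a
+ // V128 (upper half zeroed) so the deinterleavers can operate on it.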
+# define MAYBE_WIDEN_FROM_64(_expr) \
+ (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
+ switch (nRegs) {
+ case 4:
+ assign(i3, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(3 * step)))));
+ /* fallthru */
+ case 3:
+ assign(i2, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(2 * step)))));
+ /* fallthru */
+ case 2:
+ assign(i1, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(1 * step)))));
+ /* fallthru */
+
+ case 1:
+ assign(i0, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(0 * step)))));
+ break;
+ default:
+ vassert(0);
+ }
+# undef MAYBE_WIDEN_FROM_64
+ switch (nRegs) {
+ case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
+ (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
+ break;
+ case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
+ (&u0, &u1, &u2, sz, i0, i1, i2);
+ break;
+ case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
+ (&u0, &u1, sz, i0, i1);
+ break;
+ case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
+ (&u0, sz, i0);
+ break;
+ default: vassert(0);
+ }
+ switch (nRegs) {
+ case 4: putQReg128( (tt+3) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u3));
+ /* fallthru */
+ case 3: putQReg128( (tt+2) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u2));
+ /* fallthru */
+ case 2: putQReg128( (tt+1) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u1));
+ /* fallthru */
+ case 1: putQReg128( (tt+0) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u0));
+ break;
+ default: vassert(0);
+ }
+ }
+
+ /* -- END generate the transfers -- */
+
+ /* Do the writeback, if necessary */
+ if (isPX) {
+ putIReg64orSP(nn, mkexpr(tWB));
+ }
+
+ HChar pxStr[20];
+ pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
+ if (isPX) {
+ if (mm == BITS5(1,1,1,1,1))
+ vex_sprintf(pxStr, ", #%u", xferSzB);
+ else
+ vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
+ }
+ const HChar* arr = nameArr_Q_SZ(bitQ, sz);
+ DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
+ isLD ? "ld" : "st", nRegs,
+ (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
+ pxStr);
+
+ return True;
}
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(8)));
- DIP("%s {v%u.%s}, [%s], #8\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
+ /* else fall through */
}
/* ---------- LD1R (single structure, replicate) ---------- */
return False;
}
- /* -------- LD2/ST2 (multi 2-elem structs, 2 regs, post index) -------- */
- /* Only a very few cases. */
- /* 31 23 11 9 4
- 0100 1100 1101 1111 1000 11 n t LD2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32
- 0100 1100 1001 1111 1000 11 n t ST2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32
- 0100 1100 1101 1111 1000 10 n t LD2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32
- 0100 1100 1001 1111 1000 10 n t ST2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32
- */
- if ( (insn & 0xFFFFFC00) == 0x4CDF8C00 // LD2 .2d
- || (insn & 0xFFFFFC00) == 0x4C9F8C00 // ST2 .2d
- || (insn & 0xFFFFFC00) == 0x4CDF8800 // LD2 .4s
- || (insn & 0xFFFFFC00) == 0x4C9F8800 // ST2 .4s
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- UInt sz = INSN(11,10);
- const HChar* name = "??";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24));
- if (sz == BITS2(1,1)) {
- name = "2d";
- if (isLD) {
- putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0));
- putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_16));
- putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8));
- putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_24));
- } else {
- storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64));
- storeLE(tEA_16, getQRegLane((vT+0) % 32, 1, Ity_I64));
- storeLE(tEA_8, getQRegLane((vT+1) % 32, 0, Ity_I64));
- storeLE(tEA_24, getQRegLane((vT+1) % 32, 1, Ity_I64));
- }
- }
- else if (sz == BITS2(1,0)) {
- /* Uh, this is ugly. TODO: better. */
- name = "4s";
- IRExpr* tEA_4 = binop(Iop_Add64, mkexpr(tEA), mkU64(4));
- IRExpr* tEA_12 = binop(Iop_Add64, mkexpr(tEA), mkU64(12));
- IRExpr* tEA_20 = binop(Iop_Add64, mkexpr(tEA), mkU64(20));
- IRExpr* tEA_28 = binop(Iop_Add64, mkexpr(tEA), mkU64(28));
- if (isLD) {
- putQRegLane((vT+0) % 32, 0, loadLE(Ity_I32, tEA_0));
- putQRegLane((vT+0) % 32, 1, loadLE(Ity_I32, tEA_8));
- putQRegLane((vT+0) % 32, 2, loadLE(Ity_I32, tEA_16));
- putQRegLane((vT+0) % 32, 3, loadLE(Ity_I32, tEA_24));
- putQRegLane((vT+1) % 32, 0, loadLE(Ity_I32, tEA_4));
- putQRegLane((vT+1) % 32, 1, loadLE(Ity_I32, tEA_12));
- putQRegLane((vT+1) % 32, 2, loadLE(Ity_I32, tEA_20));
- putQRegLane((vT+1) % 32, 3, loadLE(Ity_I32, tEA_28));
- } else {
- storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I32));
- storeLE(tEA_8, getQRegLane((vT+0) % 32, 1, Ity_I32));
- storeLE(tEA_16, getQRegLane((vT+0) % 32, 2, Ity_I32));
- storeLE(tEA_24, getQRegLane((vT+0) % 32, 3, Ity_I32));
- storeLE(tEA_4, getQRegLane((vT+1) % 32, 0, Ity_I32));
- storeLE(tEA_12, getQRegLane((vT+1) % 32, 1, Ity_I32));
- storeLE(tEA_20, getQRegLane((vT+1) % 32, 2, Ity_I32));
- storeLE(tEA_28, getQRegLane((vT+1) % 32, 3, Ity_I32));
- }
- }
- else {
- vassert(0); // Can't happen.
- }
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(32)));
- DIP("%s {v%u.%s, v%u.%s}, [%s], #32\n", isLD ? "ld2" : "st2",
- (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN));
- return True;
- }
-
/* -------- LD1/ST1 (multi 1-elem structs, 2 regs, no offset) -------- */
/* Only a very few cases. */
/* 31 23
return True;
}
- /* -------- LD3/ST3 (multi 3-elem structs, 3 regs, post index) -------- */
- /* Only a very few cases. */
- /* 31 23 11 9 4
- 0100 1100 1101 1111 0100 11 n t LD3 {Vt.2d .. V(t+2)%32.2d}, [Xn|SP], #48
- 0100 1100 1001 1111 0100 11 n t ST3 {Vt.2d .. V(t+2)%32.2d}, [Xn|SP], #48
- */
- if ( (insn & 0xFFFFFC00) == 0x4CDF4C00 // LD3 .2d
- || (insn & 0xFFFFFC00) == 0x4C9F4C00 // ST3 .2d
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- UInt sz = INSN(11,10);
- const HChar* name = "??";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24));
- IRExpr* tEA_32 = binop(Iop_Add64, mkexpr(tEA), mkU64(32));
- IRExpr* tEA_40 = binop(Iop_Add64, mkexpr(tEA), mkU64(40));
- if (sz == BITS2(1,1)) {
- name = "2d";
- if (isLD) {
- putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0));
- putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_24));
- putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8));
- putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_32));
- putQRegLane((vT+2) % 32, 0, loadLE(Ity_I64, tEA_16));
- putQRegLane((vT+2) % 32, 1, loadLE(Ity_I64, tEA_40));
- } else {
- storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64));
- storeLE(tEA_24, getQRegLane((vT+0) % 32, 1, Ity_I64));
- storeLE(tEA_8, getQRegLane((vT+1) % 32, 0, Ity_I64));
- storeLE(tEA_32, getQRegLane((vT+1) % 32, 1, Ity_I64));
- storeLE(tEA_16, getQRegLane((vT+2) % 32, 0, Ity_I64));
- storeLE(tEA_40, getQRegLane((vT+2) % 32, 1, Ity_I64));
- }
- }
- else {
- vassert(0); // Can't happen.
- }
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(48)));
- DIP("%s {v%u.%s, v%u.%s, v%u.%s}, [%s], #32\n",
- isLD ? "ld3" : "st3",
- (vT+0) % 32, name, (vT+1) % 32, name, (vT+2) % 32, name,
- nameIReg64orSP(rN));
- return True;
- }
-
/* ------------------ LD{,A}X{R,RH,RB} ------------------ */
/* ------------------ ST{,L}X{R,RH,RB} ------------------ */
/* 31 29 23 20 14 9 4