From: Julian Seward
Date: Sat, 20 Apr 2013 21:19:44 +0000 (+0000)
Subject: VLD3/VST3: generate in-line interleave/de-interleave code, so that
X-Git-Tag: svn/VALGRIND_3_9_0^2~85
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0d8de47646a0d3fc66a263a07c3ca7e98029f454;p=thirdparty%2Fvalgrind.git

VLD3/VST3: generate in-line interleave/de-interleave code, so that
loads can always be done in 8 byte units.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2710
---

diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index 5df32b9270..694fce8be0 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -7972,6 +7972,163 @@ static void math_INTERLEAVE_2 (/*OUT*/IRTemp* i0, /*OUT*/IRTemp* i1,
    }
 }
 
+// Helper function for generating arbitrary slicing 'n' dicing of
+// 3 8x8 vectors, as needed for VLD3.8 and VST3.8.
+static IRExpr* math_PERM_8x8x3(const UChar* desc,
+                               IRTemp s0, IRTemp s1, IRTemp s2)
+{
+   // desc is an array of 8 pairs, encoded as 16 bytes,
+   // that describe how to assemble the result lanes, starting with
+   // lane 7.  Each pair is: first component (0..2) says which of
+   // s0/s1/s2 to use.  Second component (0..7) is the lane number
+   // in the source to use.
+   UInt si;
+   for (si = 0; si < 8; si++) {
+      vassert(desc[2 * si + 0] <= 2);
+      vassert(desc[2 * si + 1] <= 7);
+   }
+   IRTemp h3 = newTemp(Ity_I64);
+   IRTemp h2 = newTemp(Ity_I64);
+   IRTemp h1 = newTemp(Ity_I64);
+   IRTemp h0 = newTemp(Ity_I64);
+   IRTemp srcs[3] = {s0, s1, s2};
+#  define SRC_VEC(_lane)   mkexpr(srcs[desc[2 * (7-(_lane)) + 0]])
+#  define SRC_SHIFT(_lane) mkU8(56-8*(desc[2 * (7-(_lane)) + 1]))
+   assign(h3, binop(Iop_InterleaveHI8x8,
+                    binop(Iop_Shl64, SRC_VEC(7), SRC_SHIFT(7)),
+                    binop(Iop_Shl64, SRC_VEC(6), SRC_SHIFT(6))));
+   assign(h2, binop(Iop_InterleaveHI8x8,
+                    binop(Iop_Shl64, SRC_VEC(5), SRC_SHIFT(5)),
+                    binop(Iop_Shl64, SRC_VEC(4), SRC_SHIFT(4))));
+   assign(h1, binop(Iop_InterleaveHI8x8,
+                    binop(Iop_Shl64, SRC_VEC(3), SRC_SHIFT(3)),
+                    binop(Iop_Shl64, SRC_VEC(2), SRC_SHIFT(2))));
+   assign(h0, binop(Iop_InterleaveHI8x8,
+                    binop(Iop_Shl64, SRC_VEC(1), SRC_SHIFT(1)),
+                    binop(Iop_Shl64, SRC_VEC(0), SRC_SHIFT(0))));
+#  undef SRC_VEC
+#  undef SRC_SHIFT
+   // Now h3..h0 are 64 bit vectors with useful information only
+   // in the top 16 bits.  We now concatenate those four 16-bit
+   // groups so as to produce the final result.
+   IRTemp w1 = newTemp(Ity_I64);
+   IRTemp w0 = newTemp(Ity_I64);
+   assign(w1, binop(Iop_InterleaveHI16x4, mkexpr(h3), mkexpr(h2)));
+   assign(w0, binop(Iop_InterleaveHI16x4, mkexpr(h1), mkexpr(h0)));
+   return binop(Iop_InterleaveHI32x2, mkexpr(w1), mkexpr(w0));
+}
+
+/* Generate 3x64 -> 3x64 deinterleave code, for VLD3.  Caller must
+   make *u0, *u1 and *u2 be valid IRTemps before the call. */
+static void math_DEINTERLEAVE_3 (
+               /*OUT*/IRTemp* u0, /*OUT*/IRTemp* u1, /*OUT*/IRTemp* u2,
+               IRTemp i0, IRTemp i1, IRTemp i2, Int laneszB
+            )
+{
+#  define IHI32x2(_e1, _e2) binop(Iop_InterleaveHI32x2, (_e1), (_e2))
+#  define IHI16x4(_e1, _e2) binop(Iop_InterleaveHI16x4, (_e1), (_e2))
+#  define SHL64(_tmp, _amt) binop(Iop_Shl64, mkexpr(_tmp), mkU8(_amt))
+   /* The following assumes that the guest is little endian, and hence
+      that the memory-side (interleaved) data is stored
+      little-endianly. */
+   vassert(u0 && u1 && u2);
+   if (laneszB == 4) {
+      // memLE(192 bits) == A0 B0 C0 A1 B1 C1
+      // i0 == B0 A0, i1 == A1 C0, i2 == C1 B1
+      // u0 == A1 A0, u1 == B1 B0, u2 == C1 C0
+      assign(*u0, IHI32x2(SHL64(i1, 0), SHL64(i0, 32)));
+      assign(*u1, IHI32x2(SHL64(i2, 32), SHL64(i0, 0)));
+      assign(*u2, IHI32x2(SHL64(i2, 0), SHL64(i1, 32)));
+   } else if (laneszB == 2) {
+      // memLE(192 bits) == A0 B0 C0 A1, B1 C1 A2 B2, C2 A3 B3 C3
+      // i0 == A1 C0 B0 A0, i1 == B2 A2 C1 B1, i2 == C3 B3 A3 C2
+      // u0 == A3 A2 A1 A0, u1 == B3 B2 B1 B0, u2 == C3 C2 C1 C0
+#     define XXX(_tmp3,_la3,_tmp2,_la2,_tmp1,_la1,_tmp0,_la0) \
+        IHI32x2( \
+           IHI16x4(SHL64((_tmp3),48-16*(_la3)), \
+                   SHL64((_tmp2),48-16*(_la2))), \
+           IHI16x4(SHL64((_tmp1),48-16*(_la1)), \
+                   SHL64((_tmp0),48-16*(_la0))))
+      assign(*u0, XXX(i2,1, i1,2, i0,3, i0,0));
+      assign(*u1, XXX(i2,2, i1,3, i1,0, i0,1));
+      assign(*u2, XXX(i2,3, i2,0, i1,1, i0,2));
+#     undef XXX
+   } else if (laneszB == 1) {
+      // These describe how the result vectors [7..0] are
+      // assembled from the source vectors.  Each pair is
+      // (source vector number, lane number).
+      static const UChar de0[16] = {2,5, 2,2, 1,7, 1,4, 1,1, 0,6, 0,3, 0,0};
+      static const UChar de1[16] = {2,6, 2,3, 2,0, 1,5, 1,2, 0,7, 0,4, 0,1};
+      static const UChar de2[16] = {2,7, 2,4, 2,1, 1,6, 1,3, 1,0, 0,5, 0,2};
+      assign(*u0, math_PERM_8x8x3(de0, i0, i1, i2));
+      assign(*u1, math_PERM_8x8x3(de1, i0, i1, i2));
+      assign(*u2, math_PERM_8x8x3(de2, i0, i1, i2));
+   } else {
+      // Can never happen, since VLD3 only has valid lane widths of 32,
+      // 16 or 8 bits.
+      vpanic("math_DEINTERLEAVE_3");
+   }
+#  undef SHL64
+#  undef IHI16x4
+#  undef IHI32x2
+}
+
+/* Generate 3x64 -> 3x64 interleave code, for VST3.  Caller must
+   make *i0, *i1 and *i2 be valid IRTemps before the call. */
+static void math_INTERLEAVE_3 (
+               /*OUT*/IRTemp* i0, /*OUT*/IRTemp* i1, /*OUT*/IRTemp* i2,
+               IRTemp u0, IRTemp u1, IRTemp u2, Int laneszB
+            )
+{
+#  define IHI32x2(_e1, _e2) binop(Iop_InterleaveHI32x2, (_e1), (_e2))
+#  define IHI16x4(_e1, _e2) binop(Iop_InterleaveHI16x4, (_e1), (_e2))
+#  define SHL64(_tmp, _amt) binop(Iop_Shl64, mkexpr(_tmp), mkU8(_amt))
+   /* The following assumes that the guest is little endian, and hence
+      that the memory-side (interleaved) data is stored
+      little-endianly. */
+   vassert(i0 && i1 && i2);
+   if (laneszB == 4) {
+      // memLE(192 bits) == A0 B0 C0 A1 B1 C1
+      // i0 == B0 A0, i1 == A1 C0, i2 == C1 B1
+      // u0 == A1 A0, u1 == B1 B0, u2 == C1 C0
+      assign(*i0, IHI32x2(SHL64(u1, 32), SHL64(u0, 32)));
+      assign(*i1, IHI32x2(SHL64(u0, 0), SHL64(u2, 32)));
+      assign(*i2, IHI32x2(SHL64(u2, 0), SHL64(u1, 0)));
+   } else if (laneszB == 2) {
+      // memLE(192 bits) == A0 B0 C0 A1, B1 C1 A2 B2, C2 A3 B3 C3
+      // i0 == A1 C0 B0 A0, i1 == B2 A2 C1 B1, i2 == C3 B3 A3 C2
+      // u0 == A3 A2 A1 A0, u1 == B3 B2 B1 B0, u2 == C3 C2 C1 C0
+#     define XXX(_tmp3,_la3,_tmp2,_la2,_tmp1,_la1,_tmp0,_la0) \
+        IHI32x2( \
+           IHI16x4(SHL64((_tmp3),48-16*(_la3)), \
+                   SHL64((_tmp2),48-16*(_la2))), \
+           IHI16x4(SHL64((_tmp1),48-16*(_la1)), \
+                   SHL64((_tmp0),48-16*(_la0))))
+      assign(*i0, XXX(u0,1, u2,0, u1,0, u0,0));
+      assign(*i1, XXX(u1,2, u0,2, u2,1, u1,1));
+      assign(*i2, XXX(u2,3, u1,3, u0,3, u2,2));
+#     undef XXX
+   } else if (laneszB == 1) {
+      // These describe how the result vectors [7..0] are
+      // assembled from the source vectors.  Each pair is
+      // (source vector number, lane number).
+      static const UChar in0[16] = {1,2, 0,2, 2,1, 1,1, 0,1, 2,0, 1,0, 0,0};
+      static const UChar in1[16] = {0,5, 2,4, 1,4, 0,4, 2,3, 1,3, 0,3, 2,2};
+      static const UChar in2[16] = {2,7, 1,7, 0,7, 2,6, 1,6, 0,6, 2,5, 1,5};
+      assign(*i0, math_PERM_8x8x3(in0, u0, u1, u2));
+      assign(*i1, math_PERM_8x8x3(in1, u0, u1, u2));
+      assign(*i2, math_PERM_8x8x3(in2, u0, u1, u2));
+   } else {
+      // Can never happen, since VST3 only has valid lane widths of 32,
+      // 16 or 8 bits.
+      vpanic("math_INTERLEAVE_3");
+   }
+#  undef SHL64
+#  undef IHI16x4
+#  undef IHI32x2
+}
+
+
 
 /* A7.7 Advanced SIMD element or structure load/store instructions */
 static Bool dis_neon_load_or_store ( UInt theInstr,
@@ -8235,7 +8392,8 @@ Bool dis_neon_load_or_store ( UInt theInstr,
             addr = tmp;
          }
       }
-      else if (N == 1 /* 2-interleaving -- VLD2/VST2 */) {
+      else
+      if (N == 1 /* 2-interleaving -- VLD2/VST2 */) {
          vassert( (regs == 1 && (inc == 1 || inc == 2))
                   || (regs == 2 && inc == 2) );
         // Make 'nregs' be the number of registers and 'regstep'
@@ -8300,6 +8458,7 @@ Bool dis_neon_load_or_store ( UInt theInstr,
             assign(di1, loadLE(Ity_I64, a1));
             assign(di2, loadLE(Ity_I64, a2));
             assign(di3, loadLE(Ity_I64, a3));
+            // Note spooky interleaving: du0, du2, di0, di1 etc
             math_DEINTERLEAVE_2(&du0, &du2, di0, di1, 1 << size);
             math_DEINTERLEAVE_2(&du1, &du3, di2, di3, 1 << size);
             putDRegI64(rD + 0 * regstep, mkexpr(du0), IRTemp_INVALID);
@@ -8311,6 +8470,7 @@ Bool dis_neon_load_or_store ( UInt theInstr,
             assign(du1, getDRegI64(rD + 1 * regstep));
             assign(du2, getDRegI64(rD + 2 * regstep));
             assign(du3, getDRegI64(rD + 3 * regstep));
+            // Note spooky interleaving: du0, du2, di0, di1 etc
             math_INTERLEAVE_2(&di0, &di1, du0, du2, 1 << size);
             math_INTERLEAVE_2(&di2, &di3, du1, du3, 1 << size);
             storeLE(a0, mkexpr(di0));
@@ -8323,7 +8483,40 @@ Bool dis_neon_load_or_store ( UInt theInstr,
             assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(32)));
             addr = tmp;
          }
-
+      }
+      else if (N == 2 /* 3-interleaving -- VLD3/VST3 */) {
+         // Dd, Dd+1, Dd+2   regs = 1, inc = 1
+         // Dd, Dd+2, Dd+4   regs = 1, inc = 2
+         vassert(regs == 1 && (inc == 1 || inc == 2));
+         IRExpr* a0 = binop(Iop_Add32, mkexpr(addr), mkU32(0));
+         IRExpr* a1 = binop(Iop_Add32, mkexpr(addr), mkU32(8));
+         IRExpr* a2 = binop(Iop_Add32, mkexpr(addr), mkU32(16));
+         IRTemp di0 = newTemp(Ity_I64);
+         IRTemp di1 = newTemp(Ity_I64);
+         IRTemp di2 = newTemp(Ity_I64);
+         IRTemp du0 = newTemp(Ity_I64);
+         IRTemp du1 = newTemp(Ity_I64);
+         IRTemp du2 = newTemp(Ity_I64);
+         if (bL) {
+            assign(di0, loadLE(Ity_I64, a0));
+            assign(di1, loadLE(Ity_I64, a1));
+            assign(di2, loadLE(Ity_I64, a2));
+            math_DEINTERLEAVE_3(&du0, &du1, &du2, di0, di1, di2, 1 << size);
+            putDRegI64(rD + 0 * inc, mkexpr(du0), IRTemp_INVALID);
+            putDRegI64(rD + 1 * inc, mkexpr(du1), IRTemp_INVALID);
+            putDRegI64(rD + 2 * inc, mkexpr(du2), IRTemp_INVALID);
+         } else {
+            assign(du0, getDRegI64(rD + 0 * inc));
+            assign(du1, getDRegI64(rD + 1 * inc));
+            assign(du2, getDRegI64(rD + 2 * inc));
+            math_INTERLEAVE_3(&di0, &di1, &di2, du0, du1, du2, 1 << size);
+            storeLE(a0, mkexpr(di0));
+            storeLE(a1, mkexpr(di1));
+            storeLE(a2, mkexpr(di2));
+         }
+         IRTemp tmp = newTemp(Ity_I32);
+         assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(24)));
+         addr = tmp;
       } else {
          /* Fallback case */
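
The de0/de1/de2 and in0/in1/in2 descriptor tables above are compact but hard to read. The following host-side sketch is not part of the patch: it models the net effect of math_PERM_8x8x3 as a direct byte selection (rather than the Shl64 / InterleaveHI network the patch actually emits), and checks that the de0/de1/de2 tables turn three 8-byte little-endian loads of A0 B0 C0 A1 ... into the separated A, B and C streams that VLD3.8 must produce. The helper name (perm_8x8x3_model) and the test values are invented for illustration.

#include <assert.h>
#include <stdio.h>
#include <string.h>

typedef unsigned char UChar;

/* Scalar model of math_PERM_8x8x3: result lane L is byte
   desc[2*(7-L)+1] of source vector desc[2*(7-L)+0], which is the
   net effect of the SRC_VEC/SRC_SHIFT + InterleaveHI network. */
static void perm_8x8x3_model(UChar out[8], const UChar desc[16],
                             const UChar s0[8], const UChar s1[8],
                             const UChar s2[8])
{
   const UChar* srcs[3] = { s0, s1, s2 };
   for (int lane = 0; lane < 8; lane++) {
      UChar vec  = desc[2 * (7 - lane) + 0];
      UChar slot = desc[2 * (7 - lane) + 1];
      out[lane] = srcs[vec][slot];
   }
}

int main(void)
{
   /* Lane descriptors copied from the patch (VLD3.8 de-interleave). */
   static const UChar de0[16] = {2,5, 2,2, 1,7, 1,4, 1,1, 0,6, 0,3, 0,0};
   static const UChar de1[16] = {2,6, 2,3, 2,0, 1,5, 1,2, 0,7, 0,4, 0,1};
   static const UChar de2[16] = {2,7, 2,4, 2,1, 1,6, 1,3, 1,0, 0,5, 0,2};

   /* 24 bytes of interleaved memory: A0 B0 C0 A1 B1 C1 ... A7 B7 C7. */
   UChar mem[24];
   for (int k = 0; k < 8; k++) {
      mem[3*k + 0] = 0x10 + k;   /* A_k */
      mem[3*k + 1] = 0x20 + k;   /* B_k */
      mem[3*k + 2] = 0x30 + k;   /* C_k */
   }

   /* Three little-endian 8-byte loads, as the new VLD3 path does;
      byte k of each load is lane k of the corresponding i-value. */
   UChar i0[8], i1[8], i2[8];
   memcpy(i0, mem + 0,  8);
   memcpy(i1, mem + 8,  8);
   memcpy(i2, mem + 16, 8);

   UChar u0[8], u1[8], u2[8];
   perm_8x8x3_model(u0, de0, i0, i1, i2);
   perm_8x8x3_model(u1, de1, i0, i1, i2);
   perm_8x8x3_model(u2, de2, i0, i1, i2);

   /* After de-interleaving, u0/u1/u2 should hold the A, B and C streams. */
   for (int k = 0; k < 8; k++) {
      assert(u0[k] == 0x10 + k);
      assert(u1[k] == 0x20 + k);
      assert(u2[k] == 0x30 + k);
   }
   printf("de0/de1/de2 reproduce the VLD3.8 de-interleave\n");
   return 0;
}

Feeding the same harness the in0/in1/in2 tables, with the roles of the register-side and memory-side values swapped, reproduces the VST3.8 interleave in the same way.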
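
For 32-bit lanes the patch needs no descriptor tables: each output is one Shl64 feeding one InterleaveHI32x2. A minimal scalar sketch of that combination, assuming Iop_InterleaveHI32x2 puts the high 32-bit lane of its first operand in the high half of the result and the high lane of its second operand in the low half, confirms the "i0 == B0 A0, i1 == A1 C0, i2 == C1 B1" comments in math_DEINTERLEAVE_3. Again, this is an illustrative model, not code from the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Model of Iop_InterleaveHI32x2: result == { hi32(a), hi32(b) }. */
static uint64_t ihi32x2(uint64_t a, uint64_t b)
{
   return (a & 0xFFFFFFFF00000000ULL) | (b >> 32);
}

int main(void)
{
   /* 32-bit elements A0 B0 C0 A1 B1 C1, stored little-endianly. */
   uint32_t A0 = 0xA0, B0 = 0xB0, C0 = 0xC0;
   uint32_t A1 = 0xA1, B1 = 0xB1, C1 = 0xC1;

   /* Three little-endian 64-bit loads of the interleaved data. */
   uint64_t i0 = ((uint64_t)B0 << 32) | A0;   /* i0 == B0 A0 */
   uint64_t i1 = ((uint64_t)A1 << 32) | C0;   /* i1 == A1 C0 */
   uint64_t i2 = ((uint64_t)C1 << 32) | B1;   /* i2 == C1 B1 */

   /* Same expression shapes as the laneszB == 4 case above. */
   uint64_t u0 = ihi32x2(i1,       i0 << 32); /* A1 A0 */
   uint64_t u1 = ihi32x2(i2 << 32, i0      ); /* B1 B0 */
   uint64_t u2 = ihi32x2(i2,       i1 << 32); /* C1 C0 */

   assert(u0 == (((uint64_t)A1 << 32) | A0));
   assert(u1 == (((uint64_t)B1 << 32) | B0));
   assert(u2 == (((uint64_t)C1 << 32) | C0));
   printf("32-bit lane de-interleave matches the comments\n");
   return 0;
}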