From: Julian Seward
Date: Sat, 20 Apr 2013 23:27:36 +0000 (+0000)
Subject: VLD4/VST4: generate in-line interleave/de-interleave code, so that
X-Git-Tag: svn/VALGRIND_3_9_0^2~84
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f8c1196311483e70cec7839aa2beee97efc25cea;p=thirdparty%2Fvalgrind.git

VLD4/VST4: generate in-line interleave/de-interleave code, so that
loads can always be done in 8-byte units.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2711
---

diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index 694fce8be0..ac8524ee7a 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -8128,6 +8128,166 @@ static void math_INTERLEAVE_3 (
 # undef IHI32x2
 }
 
+/* Generate 4x64 -> 4x64 deinterleave code, for VLD4.  Caller must
+   make *u0, *u1, *u2 and *u3 be valid IRTemps before the call. */
+static void math_DEINTERLEAVE_4 (
+               /*OUT*/IRTemp* u0, /*OUT*/IRTemp* u1,
+               /*OUT*/IRTemp* u2, /*OUT*/IRTemp* u3,
+               IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3, Int laneszB
+            )
+{
+# define IHI32x2(_t1, _t2) \
+      binop(Iop_InterleaveHI32x2, mkexpr(_t1), mkexpr(_t2))
+# define ILO32x2(_t1, _t2) \
+      binop(Iop_InterleaveLO32x2, mkexpr(_t1), mkexpr(_t2))
+# define IHI16x4(_t1, _t2) \
+      binop(Iop_InterleaveHI16x4, mkexpr(_t1), mkexpr(_t2))
+# define ILO16x4(_t1, _t2) \
+      binop(Iop_InterleaveLO16x4, mkexpr(_t1), mkexpr(_t2))
+# define IHI8x8(_t1, _e2) \
+      binop(Iop_InterleaveHI8x8, mkexpr(_t1), _e2)
+# define SHL64(_tmp, _amt) \
+      binop(Iop_Shl64, mkexpr(_tmp), mkU8(_amt))
+   /* The following assumes that the guest is little endian, and hence
+      that the memory-side (interleaved) data is stored
+      little-endianly. */
+   vassert(u0 && u1 && u2 && u3);
+   if (laneszB == 4) {
+      assign(*u0, ILO32x2(i2, i0));
+      assign(*u1, IHI32x2(i2, i0));
+      assign(*u2, ILO32x2(i3, i1));
+      assign(*u3, IHI32x2(i3, i1));
+   } else if (laneszB == 2) {
+      IRTemp b1b0a1a0 = newTemp(Ity_I64);
+      IRTemp b3b2a3a2 = newTemp(Ity_I64);
+      IRTemp d1d0c1c0 = newTemp(Ity_I64);
+      IRTemp d3d2c3c2 = newTemp(Ity_I64);
+      assign(b1b0a1a0, ILO16x4(i1, i0));
+      assign(b3b2a3a2, ILO16x4(i3, i2));
+      assign(d1d0c1c0, IHI16x4(i1, i0));
+      assign(d3d2c3c2, IHI16x4(i3, i2));
+      // And now do what we did for the 32-bit case.
+      assign(*u0, ILO32x2(b3b2a3a2, b1b0a1a0));
+      assign(*u1, IHI32x2(b3b2a3a2, b1b0a1a0));
+      assign(*u2, ILO32x2(d3d2c3c2, d1d0c1c0));
+      assign(*u3, IHI32x2(d3d2c3c2, d1d0c1c0));
+   } else if (laneszB == 1) {
+      // Deinterleave into 16-bit chunks, then do as the 16-bit case.
+      IRTemp i0x = newTemp(Ity_I64);
+      IRTemp i1x = newTemp(Ity_I64);
+      IRTemp i2x = newTemp(Ity_I64);
+      IRTemp i3x = newTemp(Ity_I64);
+      assign(i0x, IHI8x8(i0, SHL64(i0, 32)));
+      assign(i1x, IHI8x8(i1, SHL64(i1, 32)));
+      assign(i2x, IHI8x8(i2, SHL64(i2, 32)));
+      assign(i3x, IHI8x8(i3, SHL64(i3, 32)));
+      // From here on it is like the 16-bit case.
+      IRTemp b1b0a1a0 = newTemp(Ity_I64);
+      IRTemp b3b2a3a2 = newTemp(Ity_I64);
+      IRTemp d1d0c1c0 = newTemp(Ity_I64);
+      IRTemp d3d2c3c2 = newTemp(Ity_I64);
+      assign(b1b0a1a0, ILO16x4(i1x, i0x));
+      assign(b3b2a3a2, ILO16x4(i3x, i2x));
+      assign(d1d0c1c0, IHI16x4(i1x, i0x));
+      assign(d3d2c3c2, IHI16x4(i3x, i2x));
+      // And now do what we did for the 32-bit case.
+      assign(*u0, ILO32x2(b3b2a3a2, b1b0a1a0));
+      assign(*u1, IHI32x2(b3b2a3a2, b1b0a1a0));
+      assign(*u2, ILO32x2(d3d2c3c2, d1d0c1c0));
+      assign(*u3, IHI32x2(d3d2c3c2, d1d0c1c0));
+   } else {
+      // Can never happen, since VLD4 only has valid lane widths of 32,
+      // 16 or 8 bits.
+ vpanic("math_DEINTERLEAVE_4"); + } +# undef SHL64 +# undef IHI8x8 +# undef ILO16x4 +# undef IHI16x4 +# undef ILO32x2 +# undef IHI32x2 +} + +/* Generate 4x64 -> 4x64 interleave code, for VST4. Caller must + make *i0, *i1, *i2 and *i3 be valid IRTemps before the call. */ +static void math_INTERLEAVE_4 ( + /*OUT*/IRTemp* i0, /*OUT*/IRTemp* i1, + /*OUT*/IRTemp* i2, /*OUT*/IRTemp* i3, + IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3, Int laneszB + ) +{ +# define IHI32x2(_t1, _t2) \ + binop(Iop_InterleaveHI32x2, mkexpr(_t1), mkexpr(_t2)) +# define ILO32x2(_t1, _t2) \ + binop(Iop_InterleaveLO32x2, mkexpr(_t1), mkexpr(_t2)) +# define CEV16x4(_t1, _t2) \ + binop(Iop_CatEvenLanes16x4, mkexpr(_t1), mkexpr(_t2)) +# define COD16x4(_t1, _t2) \ + binop(Iop_CatOddLanes16x4, mkexpr(_t1), mkexpr(_t2)) +# define COD8x8(_t1, _e2) \ + binop(Iop_CatOddLanes8x8, mkexpr(_t1), _e2) +# define SHL64(_tmp, _amt) \ + binop(Iop_Shl64, mkexpr(_tmp), mkU8(_amt)) + /* The following assumes that the guest is little endian, and hence + that the memory-side (interleaved) data is stored + little-endianly. */ + vassert(u0 && u1 && u2 && u3); + if (laneszB == 4) { + assign(*i0, ILO32x2(u1, u0)); + assign(*i1, ILO32x2(u3, u2)); + assign(*i2, IHI32x2(u1, u0)); + assign(*i3, IHI32x2(u3, u2)); + } else if (laneszB == 2) { + // First, interleave at the 32-bit lane size. + IRTemp b1b0a1a0 = newTemp(Ity_I64); + IRTemp b3b2a3a2 = newTemp(Ity_I64); + IRTemp d1d0c1c0 = newTemp(Ity_I64); + IRTemp d3d2c3c2 = newTemp(Ity_I64); + assign(b1b0a1a0, ILO32x2(u1, u0)); + assign(b3b2a3a2, IHI32x2(u1, u0)); + assign(d1d0c1c0, ILO32x2(u3, u2)); + assign(d3d2c3c2, IHI32x2(u3, u2)); + // And interleave (cat) at the 16 bit size. + assign(*i0, CEV16x4(d1d0c1c0, b1b0a1a0)); + assign(*i1, COD16x4(d1d0c1c0, b1b0a1a0)); + assign(*i2, CEV16x4(d3d2c3c2, b3b2a3a2)); + assign(*i3, COD16x4(d3d2c3c2, b3b2a3a2)); + } else if (laneszB == 1) { + // First, interleave at the 32-bit lane size. + IRTemp b1b0a1a0 = newTemp(Ity_I64); + IRTemp b3b2a3a2 = newTemp(Ity_I64); + IRTemp d1d0c1c0 = newTemp(Ity_I64); + IRTemp d3d2c3c2 = newTemp(Ity_I64); + assign(b1b0a1a0, ILO32x2(u1, u0)); + assign(b3b2a3a2, IHI32x2(u1, u0)); + assign(d1d0c1c0, ILO32x2(u3, u2)); + assign(d3d2c3c2, IHI32x2(u3, u2)); + // And interleave (cat) at the 16 bit size. + IRTemp i0x = newTemp(Ity_I64); + IRTemp i1x = newTemp(Ity_I64); + IRTemp i2x = newTemp(Ity_I64); + IRTemp i3x = newTemp(Ity_I64); + assign(i0x, CEV16x4(d1d0c1c0, b1b0a1a0)); + assign(i1x, COD16x4(d1d0c1c0, b1b0a1a0)); + assign(i2x, CEV16x4(d3d2c3c2, b3b2a3a2)); + assign(i3x, COD16x4(d3d2c3c2, b3b2a3a2)); + // And rearrange within each word, to get the right 8 bit lanes. + assign(*i0, COD8x8(i0x, SHL64(i0x, 8))); + assign(*i1, COD8x8(i1x, SHL64(i1x, 8))); + assign(*i2, COD8x8(i2x, SHL64(i2x, 8))); + assign(*i3, COD8x8(i3x, SHL64(i3x, 8))); + } else { + // Can never happen, since VLD4 only has valid lane widths of 32, + // 16 or 8 bits. 
+ vpanic("math_DEINTERLEAVE_4"); + } +# undef SHL64 +# undef COD8x8 +# undef COD16x4 +# undef CEV16x4 +# undef ILO32x2 +# undef IHI32x2 +} /* A7.7 Advanced SIMD element or structure load/store instructions */ static @@ -8329,7 +8489,8 @@ Bool dis_neon_load_or_store ( UInt theInstr, } else { /* ------------ Case (3) ------------ VSTn / VLDn (multiple n-element structures) */ - UInt r, lanes; + inc = (fB & 1) + 1; + if (fB == BITS4(0,0,1,0) // Dd, Dd+1, Dd+2, Dd+3 inc = 1 regs = 4 || fB == BITS4(0,1,1,0) // Dd, Dd+1, Dd+2 inc = 1 regs = 3 || fB == BITS4(0,1,1,1) // Dd inc = 2 regs = 1 @@ -8337,20 +8498,28 @@ Bool dis_neon_load_or_store ( UInt theInstr, N = 0; // VLD1/VST1. 'inc' does not appear to have any // meaning for the VLD1/VST1 cases. 'regs' is the number of // registers involved. + if (rD + regs > 32) return False; } else if (fB == BITS4(0,0,1,1) // Dd, Dd+1, Dd+2, Dd+3 inc=2 regs = 2 || fB == BITS4(1,0,0,0) // Dd, Dd+1 inc=1 regs = 1 || fB == BITS4(1,0,0,1)) { // Dd, Dd+2 inc=2 regs = 1 N = 1; // VLD2/VST2. 'regs' is the number of register-pairs involved + if (regs == 1 && inc == 1 && rD + 1 >= 32) return False; + if (regs == 1 && inc == 2 && rD + 2 >= 32) return False; + if (regs == 2 && inc == 2 && rD + 3 >= 32) return False; } else if (fB == BITS4(0,1,0,0) || fB == BITS4(0,1,0,1)) { - N = 2; + N = 2; // VLD3/VST3 + if (inc == 1 && rD + 2 >= 32) return False; + if (inc == 2 && rD + 4 >= 32) return False; } else if (fB == BITS4(0,0,0,0) || fB == BITS4(0,0,0,1)) { - N = 3; + N = 3; // VLD4/VST4 + if (inc == 1 && rD + 3 >= 32) return False; + if (inc == 2 && rD + 6 >= 32) return False; } else { return False; } - inc = (fB & 1) + 1; + if (N == 1 && fB == BITS4(0,0,1,1)) { regs = 2; } else if (N == 0) { @@ -8369,8 +8538,6 @@ Bool dis_neon_load_or_store ( UInt theInstr, if (size == 3) return False; - lanes = 8 / (1 << size); - // go uncond if (condT != IRTemp_INVALID) mk_skip_over_T32_if_cond_is_false(condT); @@ -8380,6 +8547,7 @@ Bool dis_neon_load_or_store ( UInt theInstr, assign(addr, mkexpr(initialRn)); if (N == 0 /* No interleaving -- VLD1/VST1 */) { + UInt r; vassert(regs == 1 || regs == 2 || regs == 3 || regs == 4); /* inc has no relevance here */ for (r = 0; r < regs; r++) { @@ -8484,7 +8652,8 @@ Bool dis_neon_load_or_store ( UInt theInstr, addr = tmp; } } - else if (N == 2 /* 3-interleaving -- VLD3/VST3 */) { + else + if (N == 2 /* 3-interleaving -- VLD3/VST3 */) { // Dd, Dd+1, Dd+2 regs = 1, inc = 1 // Dd, Dd+2, Dd+4 regs = 1, inc = 2 vassert(regs == 1 && (inc == 1 || inc == 2)); @@ -8518,21 +8687,52 @@ Bool dis_neon_load_or_store ( UInt theInstr, assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(24))); addr = tmp; } - else { - /* Fallback case */ - for (r = 0; r < regs; r++) { - for (i = 0; i < lanes; i++) { - if (bL) - mk_neon_elem_load_to_one_lane(rD + r, inc, i, N, size, addr); - else - mk_neon_elem_store_from_one_lane(rD + r, - inc, i, N, size, addr); - IRTemp tmp = newTemp(Ity_I32); - assign(tmp, binop(Iop_Add32, mkexpr(addr), - mkU32((1 << size) * (N + 1)))); - addr = tmp; - } + else + if (N == 3 /* 4-interleaving -- VLD4/VST4 */) { + // Dd, Dd+1, Dd+2, Dd+3 regs = 1, inc = 1 + // Dd, Dd+2, Dd+4, Dd+6 regs = 1, inc = 2 + vassert(regs == 1 && (inc == 1 || inc == 2)); + IRExpr* a0 = binop(Iop_Add32, mkexpr(addr), mkU32(0)); + IRExpr* a1 = binop(Iop_Add32, mkexpr(addr), mkU32(8)); + IRExpr* a2 = binop(Iop_Add32, mkexpr(addr), mkU32(16)); + IRExpr* a3 = binop(Iop_Add32, mkexpr(addr), mkU32(24)); + IRTemp di0 = newTemp(Ity_I64); + IRTemp di1 = newTemp(Ity_I64); + IRTemp di2 = 
+            IRTemp di3 = newTemp(Ity_I64);
+            IRTemp du0 = newTemp(Ity_I64);
+            IRTemp du1 = newTemp(Ity_I64);
+            IRTemp du2 = newTemp(Ity_I64);
+            IRTemp du3 = newTemp(Ity_I64);
+            if (bL) {
+               assign(di0, loadLE(Ity_I64, a0));
+               assign(di1, loadLE(Ity_I64, a1));
+               assign(di2, loadLE(Ity_I64, a2));
+               assign(di3, loadLE(Ity_I64, a3));
+               math_DEINTERLEAVE_4(&du0, &du1, &du2, &du3,
+                                   di0, di1, di2, di3, 1 << size);
+               putDRegI64(rD + 0 * inc, mkexpr(du0), IRTemp_INVALID);
+               putDRegI64(rD + 1 * inc, mkexpr(du1), IRTemp_INVALID);
+               putDRegI64(rD + 2 * inc, mkexpr(du2), IRTemp_INVALID);
+               putDRegI64(rD + 3 * inc, mkexpr(du3), IRTemp_INVALID);
+            } else {
+               assign(du0, getDRegI64(rD + 0 * inc));
+               assign(du1, getDRegI64(rD + 1 * inc));
+               assign(du2, getDRegI64(rD + 2 * inc));
+               assign(du3, getDRegI64(rD + 3 * inc));
+               math_INTERLEAVE_4(&di0, &di1, &di2, &di3,
+                                 du0, du1, du2, du3, 1 << size);
+               storeLE(a0, mkexpr(di0));
+               storeLE(a1, mkexpr(di1));
+               storeLE(a2, mkexpr(di2));
+               storeLE(a3, mkexpr(di3));
            }
+            IRTemp tmp = newTemp(Ity_I32);
+            assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(32)));
+            addr = tmp;
+         }
+         else {
+            vassert(0);
         }
 
         /* Writeback */
@@ -8556,6 +8756,7 @@ Bool dis_neon_load_or_store ( UInt theInstr,
            || (inc == 2 && regs > 1 && N > 0)) {
            DIP("d%u-d%u", rD, rD + regs * (N + 1) - 1);
         } else {
+            UInt r;
            for (r = 0; r < regs; r++) {
               for (i = 0; i <= N; i++) {
                  if (i || r)
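
Aside: the laneszB == 4 case in math_DEINTERLEAVE_4 can be sanity-checked with a
small scalar model of the two interleave ops.  The sketch below is hypothetical
stand-alone C, not VEX IR and not part of the patch; it assumes the usual VEX
lane convention described in its comments (the first operand's lane lands in
the upper half of the result).

/* Hypothetical stand-alone check of the laneszB == 4 path above.
   Plain C, not part of the patch.  Build: cc -std=c99 deint4.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar models of the two IROps used for 32-bit lanes:
   Iop_InterleaveLO32x2(a,b) = a.lo32 : b.lo32,
   Iop_InterleaveHI32x2(a,b) = a.hi32 : b.hi32. */
static uint64_t ILO32x2 ( uint64_t a, uint64_t b ) {
   return ((a & 0xFFFFFFFFULL) << 32) | (b & 0xFFFFFFFFULL);
}
static uint64_t IHI32x2 ( uint64_t a, uint64_t b ) {
   return (a & 0xFFFFFFFF00000000ULL) | (b >> 32);
}

int main ( void )
{
   /* Memory image for VLD4.32 {a,b,c,d}: a0 b0 c0 d0 a1 b1 c1 d1,
      read little-endianly as four 64-bit units i0..i3. */
   uint32_t mem[8] = { 0xA0, 0xB0, 0xC0, 0xD0, 0xA1, 0xB1, 0xC1, 0xD1 };
   uint64_t i0 = ((uint64_t)mem[1] << 32) | mem[0];   /* b0:a0 */
   uint64_t i1 = ((uint64_t)mem[3] << 32) | mem[2];   /* d0:c0 */
   uint64_t i2 = ((uint64_t)mem[5] << 32) | mem[4];   /* b1:a1 */
   uint64_t i3 = ((uint64_t)mem[7] << 32) | mem[6];   /* d1:c1 */

   /* The four assignments from the laneszB == 4 path. */
   uint64_t u0 = ILO32x2(i2, i0);   /* expect a1:a0 */
   uint64_t u1 = IHI32x2(i2, i0);   /* expect b1:b0 */
   uint64_t u2 = ILO32x2(i3, i1);   /* expect c1:c0 */
   uint64_t u3 = IHI32x2(i3, i1);   /* expect d1:d0 */

   assert(u0 == 0x000000A1000000A0ULL);
   assert(u1 == 0x000000B1000000B0ULL);
   assert(u2 == 0x000000C1000000C0ULL);
   assert(u3 == 0x000000D1000000D0ULL);
   printf("laneszB == 4 deinterleave OK\n");
   return 0;
}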
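
The laneszB == 1 path is the least obvious step: IHI8x8(i, SHL64(i, 32)) pairs
byte j of a 64-bit load with byte j+4, producing 16-bit chunks that the 16-bit
machinery can then deinterleave as if they were single elements.  A hypothetical
scalar check of just that widening step (again plain C, not VEX code, with the
IROp semantics assumed in the comment):

/* Hypothetical scalar model of IHI8x8(i, SHL64(i, 32)).  Not VEX code. */
#include <assert.h>
#include <stdint.h>

/* Model of Iop_InterleaveHI8x8(a,b): interleave the high four bytes
   of each operand; result, highest byte first, is
   a7 b7 a6 b6 a5 b5 a4 b4. */
static uint64_t IHI8x8 ( uint64_t a, uint64_t b )
{
   uint64_t r = 0;
   for (int j = 0; j < 4; j++) {
      r |= ((a >> (8 * (4 + j))) & 0xFF) << (8 * (2 * j + 1));
      r |= ((b >> (8 * (4 + j))) & 0xFF) << (8 * (2 * j));
   }
   return r;
}

int main ( void )
{
   uint64_t i0 = 0x0706050403020100ULL;   /* bytes b7..b0 = 7..0 */
   uint64_t x  = IHI8x8(i0, i0 << 32);
   /* Byte j is now paired with byte j+4, giving the 16-bit chunks
      (b7 b3)(b6 b2)(b5 b1)(b4 b0), highest chunk first. */
   assert(x == 0x0703060205010400ULL);
   return 0;
}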
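
Finally, the new early-return guards in dis_neon_load_or_store all encode one
condition: the last D register the access touches must not exceed D31.  A
hypothetical helper (the name lastDReg is invented here, purely for
illustration) makes the pattern explicit for the N > 0 cases:

/* Hypothetical illustration of the guard pattern -- not VEX code.
   For VLD2/VST2 .. VLD4/VST4 (N == 1..3) with regs == 1, the last
   D register accessed is rD + inc * N; the one exception is
   VLD2/VST2 with regs == 2, which touches Dd .. Dd+3. */
static int lastDReg ( int rD, int inc, int N, int regs )
{
   if (N == 1 && regs == 2)
      return rD + 3;
   return rD + inc * N;
}
/* Examples: VLD4 with inc == 2 gives lastDReg == rD + 6, matching
   "if (inc == 2 && rD + 6 >= 32) return False;".  For VLD1/VST1
   (N == 0) the guard is simply rD + regs > 32. */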