From: Julian Seward
Date: Tue, 12 Jun 2012 08:45:39 +0000 (+0000)
Subject: Make a start at implementing 256-bit AVX instructions generated by
X-Git-Tag: svn/VALGRIND_3_8_1^2~99
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4e77c93e2c608aea6a7ae94075e39b97b9873d5f;p=thirdparty%2Fvalgrind.git

Make a start at implementing 256-bit AVX instructions generated by
"gcc-4.7.0 -mavx -O3":

   VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r
   VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r
   VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r
   VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r
   VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r
   VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r
   VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r
   VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r
   VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r
   VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r
   VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r
   VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r
   VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r
   VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r
   VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r
   VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r
   VPSRLQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib
   VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r
   VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r
   VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib

git-svn-id: svn://svn.valgrind.org/vex/trunk@2379
---

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index fa58dd5bbb..7dfc3dd5ae 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -9751,17 +9751,35 @@ static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
-static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, UChar opc )
+/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
+static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
 {
    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
-   Bool hi = toBool(opc == 0x15);
-   vassert(opc == 0x15/*UNPCKLPS*/ || opc == 0x14/*UNPCKHPS*/);
    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
    breakup128to32s( dV, &d3, &d2, &d1, &d0 );
    breakup128to32s( sV, &s3, &s2, &s1, &s0 );
    IRTemp res = newTemp(Ity_V128);
-   assign(res, hi ? mk128from32s( s3, d3, s2, d2 )
-                  : mk128from32s( s1, d1, s0, d0 ));
+   assign(res, xIsH ? mk128from32s( s3, d3, s2, d2 )
+                    : mk128from32s( s1, d1, s0, d0 ));
+   return res;
+}
+
+
+/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
+/* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
+static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
+{
+   IRTemp s1 = newTemp(Ity_I64);
+   IRTemp s0 = newTemp(Ity_I64);
+   IRTemp d1 = newTemp(Ity_I64);
+   IRTemp d0 = newTemp(Ity_I64);
+   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+   assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+   assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+   IRTemp res = newTemp(Ity_V128);
+   assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
+                    : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
    return res;
 }
 
@@ -10135,7 +10153,7 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
"h" : "l", dis_buf, nameXMMReg(rG)); } - IRTemp res = math_UNPCKxPS_128( sV, dV, opc ); + IRTemp res = math_UNPCKxPS_128( sV, dV, hi ); putXMMReg( rG, mkexpr(res) ); goto decode_success; } @@ -10144,45 +10162,27 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, /* These just appear to be special cases of SHUFPS */ if (have66noF2noF3(pfx) && sz == 2 /* could be 8 if rex also present */) { - IRTemp s1 = newTemp(Ity_I64); - IRTemp s0 = newTemp(Ity_I64); - IRTemp d1 = newTemp(Ity_I64); - IRTemp d0 = newTemp(Ity_I64); + Bool hi = toBool(opc == 0x15); IRTemp sV = newTemp(Ity_V128); IRTemp dV = newTemp(Ity_V128); - Bool hi = toBool(opc == 0x15); - modrm = getUChar(delta); - assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); - + UInt rG = gregOfRexRM(pfx,modrm); + assign( dV, getXMMReg(rG) ); if (epartIsReg(modrm)) { - assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getXMMReg(rE) ); delta += 1; DIP("unpck%sps %s,%s\n", hi ? "h" : "l", - nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); + nameXMMReg(rE), nameXMMReg(rG)); } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); delta += alen; DIP("unpck%sps %s,%s\n", hi ? "h" : "l", - dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm))); - } - - assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) ); - assign( d0, unop(Iop_V128to64, mkexpr(dV)) ); - assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) ); - assign( s0, unop(Iop_V128to64, mkexpr(sV)) ); - - if (hi) { - putXMMReg( gregOfRexRM(pfx,modrm), - binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) ); - } else { - putXMMReg( gregOfRexRM(pfx,modrm), - binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) ); + dis_buf, nameXMMReg(rG)); } - + IRTemp res = math_UNPCKxPD_128( sV, dV, hi ); + putXMMReg( rG, mkexpr(res) ); goto decode_success; } break; @@ -15579,6 +15579,54 @@ static Long dis_PEXTRD ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PEXTRQ ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + UChar modrm = 0; + Int alen = 0; + HChar dis_buf[50]; + + Int imm8_0; + IRTemp xmm_vec = newTemp(Ity_V128); + IRTemp src_qword = newTemp(Ity_I64); + HChar* mbV = isAvx ? "v" : ""; + + vassert(1==getRexW(pfx)); /* ensured by caller */ + modrm = getUChar(delta); + assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) ); + + if ( epartIsReg( modrm ) ) { + imm8_0 = (Int)(getUChar(delta+1) & 1); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + imm8_0 = (Int)(getUChar(delta+alen) & 1); + } + + switch ( imm8_0 ) { + case 0: assign( src_qword, unop(Iop_V128to64, mkexpr(xmm_vec)) ); + break; + case 1: assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) ); + break; + default: vassert(0); + } + + if ( epartIsReg( modrm ) ) { + putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) ); + delta += 1+1; + DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0, + nameXMMReg( gregOfRexRM(pfx, modrm) ), + nameIReg64( eregOfRexRM(pfx, modrm) ) ); + } else { + storeLE( mkexpr(addr), mkexpr(src_qword) ); + delta += alen+1; + DIP( "%spextrq $%d, %s,%s\n", mbV, + imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf ); + } + return delta; +} + + /* This can fail, in which case it returns the original (unchanged) delta. 
    delta. */
 static Long dis_PCMPxSTRx ( VexAbiInfo* vbi, Prefix pfx,
@@ -16261,41 +16309,7 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
         here the REX.W bit is present */
      if (have66noF2noF3(pfx)
          && sz == 8 /* REX.W is present */) {
-
-         Int imm8_0;
-         IRTemp xmm_vec = newTemp(Ity_V128);
-         IRTemp src_qword = newTemp(Ity_I64);
-
-         modrm = getUChar(delta);
-         assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
-
-         if ( epartIsReg( modrm ) ) {
-            imm8_0 = (Int)(getUChar(delta+1) & 1);
-         } else {
-            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
-            imm8_0 = (Int)(getUChar(delta+alen) & 1);
-         }
-         switch ( imm8_0 ) {
-            case 0: assign( src_qword, unop(Iop_V128to64, mkexpr(xmm_vec)) );
-                    break;
-            case 1: assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
-                    break;
-            default: vassert(0);
-         }
-
-         if ( epartIsReg( modrm ) ) {
-            putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
-            delta += 1+1;
-            DIP( "pextrq $%d, %s,%s\n", imm8_0,
-                 nameXMMReg( gregOfRexRM(pfx, modrm) ),
-                 nameIReg64( eregOfRexRM(pfx, modrm) ) );
-         } else {
-            storeLE( mkexpr(addr), mkexpr(src_qword) );
-            delta += alen+1;
-            DIP( "pextrq $%d, %s,%s\n",
-                 imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
-         }
-
+         delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
          goto decode_success;
      }
      break;
@@ -19316,6 +19330,7 @@ Long dis_ESC_0F3A (
 /*---                                                      ---*/
 /*------------------------------------------------------------*/
 
+/* FIXME: common up with the _256_ version below? */
 static
 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
@@ -19436,7 +19451,7 @@ Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
       //case Iop_SarN32x4: sar = True; size = 32; break;
       case Iop_ShrN16x8: shr = True; size = 16; break;
       case Iop_ShrN32x4: shr = True; size = 32; break;
-      //case Iop_ShrN64x2: shr = True; size = 64; break;
+      case Iop_ShrN64x2: shr = True; size = 64; break;
       default: vassert(0);
    }
@@ -19594,7 +19609,7 @@ static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
 
 /* All-lanes AVX128 binary operation:
-      G[127:0] = V127:0] `op` E[127:0]
+      G[127:0] = V[127:0] `op` E[127:0]
    G[255:128] = 0.
 */
 static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
@@ -19752,6 +19767,80 @@ Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
 }
 
 
+/* FIXME: common up with the _128_ version above? */
+static
+Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
+        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
+        Prefix pfx, Long delta, HChar* name,
+        /* The actual operation.  Use either 'op' or 'opfn',
+           but not both. */
+        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
+        Bool invertLeftArg,
+        Bool swapArgs
+     )
+{
+   UChar  modrm = getUChar(delta);
+   UInt   rD    = gregOfRexRM(pfx, modrm);
+   UInt   rSL   = getVexNvvvv(pfx);
+   IRTemp tSL   = newTemp(Ity_V256);
+   IRTemp tSR   = newTemp(Ity_V256);
+   IRTemp addr  = IRTemp_INVALID;
+   HChar  dis_buf[50];
+   Int    alen  = 0;
+   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);
+
+   // Hmm. we don't actually have Iop_NotV256 (yet).  Hence kludge:
+   vassert(!invertLeftArg);
+   assign(tSL, /* invertLeftArg ?
+                  unop(Iop_NotV256, getYMMReg(rSL))
+                : */ getYMMReg(rSL));
+
+   if (epartIsReg(modrm)) {
+      UInt rSR = eregOfRexRM(pfx, modrm);
+      delta += 1;
+      assign(tSR, getYMMReg(rSR));
+      DIP("%s %s,%s,%s\n",
+          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
+   } else {
+      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
+      delta += alen;
+      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
+      DIP("%s %s,%s,%s\n",
+          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
+   }
+
+   IRTemp res = IRTemp_INVALID;
+   if (op != Iop_INVALID) {
+      vassert(opFn == NULL);
+      res = newTemp(Ity_V256);
+      assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL))
+                           : binop(op, mkexpr(tSL), mkexpr(tSR)));
+   } else {
+      vassert(opFn != NULL);
+      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
+   }
+
+   putYMMReg(rD, mkexpr(res));
+
+   *uses_vvvv = True;
+   return delta;
+}
+
+
+/* All-lanes AVX256 binary operation:
+      G[255:0] = V[255:0] `op` E[255:0]
+*/
+static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
+                                  VexAbiInfo* vbi,
+                                  Prefix pfx, Long delta,
+                                  HChar* opname, IROp op )
+{
+   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
+             uses_vvvv, vbi, pfx, delta, opname, op,
+             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
+          );
+}
+
+
 __attribute__((noinline))
 static
 Long dis_ESC_0F__VEX (
@@ -19810,6 +19899,23 @@ Long dis_ESC_0F__VEX (
          delta += alen;
          goto decode_success;
       }
+      /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+         UChar modrm = getUChar(delta);
+         UInt rG = gregOfRexRM(pfx, modrm);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx,modrm);
+            putYMMRegLoAndZU( rG, getXMMReg( rE ));
+            DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
+            delta += 1;
+         } else {
+            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
+            DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
+            delta += alen;
+         }
+         goto decode_success;
+      }
       /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
          UChar modrm = getUChar(delta);
@@ -19827,6 +19933,23 @@ Long dis_ESC_0F__VEX (
         }
         goto decode_success;
      }
+      /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
+      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+         UChar modrm = getUChar(delta);
+         UInt rG = gregOfRexRM(pfx, modrm);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx,modrm);
+            putYMMRegLoAndZU( rG, getXMMReg( rE ));
+            DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
+            delta += 1;
+         } else {
+            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
+            DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
+            delta += alen;
+         }
+         goto decode_success;
+      }
      break;
 
   case 0x11:
@@ -19935,7 +20058,9 @@ Long dis_ESC_0F__VEX (
      break;
 
   case 0x14:
+   case 0x15:
      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
+      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool hi = opc == 0x15;
         UChar modrm = getUChar(delta);
@@ -19957,7 +20082,35 @@ Long dis_ESC_0F__VEX (
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l", dis_buf, nameXMMReg(rG));
"h" : "l", dis_buf, nameXMMReg(rG)); } - IRTemp res = math_UNPCKxPS_128( eV, vV, opc ); + IRTemp res = math_UNPCKxPS_128( eV, vV, hi ); + putYMMRegLoAndZU( rG, mkexpr(res) ); + *uses_vvvv = True; + goto decode_success; + } + /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */ + /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + Bool hi = opc == 0x15; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp eV = newTemp(Ity_V128); + IRTemp vV = newTemp(Ity_V128); + assign( vV, getXMMReg(rV) ); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( eV, getXMMReg(rE) ); + delta += 1; + DIP("vunpck%spd %s,%s\n", hi ? "h" : "l", + nameXMMReg(rE), nameXMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( eV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += alen; + DIP("vunpck%spd %s,%s\n", hi ? "h" : "l", + dis_buf, nameXMMReg(rG)); + } + IRTemp res = math_UNPCKxPD_128( eV, vV, hi ); putYMMRegLoAndZU( rG, mkexpr(res) ); *uses_vvvv = True; goto decode_success; @@ -20386,6 +20539,24 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 ); goto decode_success; } + /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */ + if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 ); + goto decode_success; + } + /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_AVX128_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 ); + goto decode_success; + } + /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 ); + goto decode_success; + } break; case 0x59: @@ -20407,6 +20578,24 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 ); goto decode_success; } + /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */ + if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 ); + goto decode_success; + } + /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_AVX128_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 ); + goto decode_success; + } + /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 ); + goto decode_success; + } break; case 0x5A: @@ -20506,6 +20695,24 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 ); goto decode_success; } + /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */ + if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 ); + goto decode_success; + } + /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_AVX128_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 ); + goto decode_success; + } + /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */ + if (have66noF2noF3(pfx) && 
+         delta = dis_AVX256_E_V_to_G(
+                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
+         goto decode_success;
+      }
      break;
 
   case 0x5D:
@@ -20542,6 +20749,18 @@ Long dis_ESC_0F__VEX (
             uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
+      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
+      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+         delta = dis_AVX256_E_V_to_G(
+                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
+         goto decode_success;
+      }
+      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
+      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+         delta = dis_AVX256_E_V_to_G(
+                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
+         goto decode_success;
+      }
      break;
 
   case 0x5F:
@@ -20827,6 +21046,7 @@ Long dis_ESC_0F__VEX (
   case 0x73:
      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
+      /* VPSRLQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         Int rS = eregOfRexRM(pfx,getUChar(delta));
@@ -20850,6 +21070,12 @@ Long dis_ESC_0F__VEX (
            *uses_vvvv = True;
            goto decode_success;
         }
+         if (gregLO3ofRM(getUChar(delta)) == 2) {
+            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
+                                                "vpsrlq", Iop_ShrN64x2 );
+            *uses_vvvv = True;
+            goto decode_success;
+         }
         /* else fall through */
      }
      break;
@@ -21319,6 +21545,16 @@ Long dis_ESC_0F38__VEX (
      }
      break;
 
+   case 0x29:
+      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
+      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
+                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
+         goto decode_success;
+      }
+      break;
+
   case 0x30:
      /* VPMOVZXBW xmm2/m64, xmm1 */
      /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
@@ -21337,6 +21573,16 @@ Long dis_ESC_0F38__VEX (
      }
      break;
 
+   case 0x37:
+      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
+      /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
+                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
+         goto decode_success;
+      }
+      break;
+
   case 0x39:
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
@@ -21407,6 +21653,12 @@ Long dis_ESC_0F3A__VEX (
         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
+      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
+      if (have66noF2noF3(pfx)
+          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
+         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
+         goto decode_success;
+      }
      break;
 
   case 0x18:
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index c24678e9e8..c1efff0517 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3368,7 +3368,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
 /*---------------------------------------------------------*/
 
-static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
+static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
                            ISelEnv* env, IRExpr* e )
 {
    iselDVecExpr_wrk( rHi, rLo, env, e );
@@ -3383,13 +3383,15 @@ static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
 
 /* DO NOT CALL THIS DIRECTLY */
-static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
+static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
                                ISelEnv* env, IRExpr* e )
 {
    vassert(e);
    IRType ty = typeOfIRExpr(env->type_env,e);
    vassert(ty == Ity_V256);
 
+   AMD64SseOp op = Asse_INVALID;
+
    /* read 256-bit IRTemp */
    if (e->tag == Iex_RdTmp) {
       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
@@ -3422,6 +3424,54 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
       return;
    }
 
+   if (e->tag == Iex_Binop) {
+   switch (e->Iex.Binop.op) {
+
+      case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4;
+      case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4;
+      case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4;
+      case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4;
+      do_64Fx4:
+      {
+         HReg argLhi, argLlo, argRhi, argRlo;
+         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
+         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
+         HReg dstHi = newVRegV(env);
+         HReg dstLo = newVRegV(env);
+         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
+         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
+         addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
+         addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
+         *rHi = dstHi;
+         *rLo = dstLo;
+         return;
+      }
+
+      case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8;
+      case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8;
+      case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8;
+      case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8;
+      do_32Fx8:
+      {
+         HReg argLhi, argLlo, argRhi, argRlo;
+         iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
+         iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
+         HReg dstHi = newVRegV(env);
+         HReg dstLo = newVRegV(env);
+         addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
+         addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
+         addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
+         addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
+         *rHi = dstHi;
+         *rLo = dstLo;
+         return;
+      }
+
+      default:
+         break;
+   } /* switch (e->Iex.Binop.op) */
+   } /* if (e->tag == Iex_Binop) */
+
    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
       HReg rsp = hregAMD64_RSP();
       HReg vHi = newVRegV(env);
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index 050dfe9fae..5953a93a54 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -982,8 +982,16 @@ void ppIROp ( IROp op )
      case Iop_V256to64_2: vex_printf("V256to64_2"); return;
      case Iop_V256to64_3: vex_printf("V256to64_3"); return;
      case Iop_64x4toV256: vex_printf("64x4toV256"); return;
-      case Iop_DPBtoBCD: vex_printf("Iop_DPBtoBCD"); return;
-      case Iop_BCDtoDPB: vex_printf("Iop_BCDtoDPB"); return;
+      case Iop_DPBtoBCD: vex_printf("DPBtoBCD"); return;
+      case Iop_BCDtoDPB: vex_printf("BCDtoDPB"); return;
+      case Iop_Add64Fx4: vex_printf("Add64Fx4"); return;
+      case Iop_Sub64Fx4: vex_printf("Sub64Fx4"); return;
+      case Iop_Mul64Fx4: vex_printf("Mul64Fx4"); return;
+      case Iop_Div64Fx4: vex_printf("Div64Fx4"); return;
+      case Iop_Add32Fx8: vex_printf("Add32Fx8"); return;
+      case Iop_Sub32Fx8: vex_printf("Sub32Fx8"); return;
+      case Iop_Mul32Fx8: vex_printf("Mul32Fx8"); return;
+      case Iop_Div32Fx8: vex_printf("Div32Fx8"); return;
 
      default: vpanic("ppIROp(1)");
   }
@@ -2783,6 +2791,16 @@ void typeOfPrimop ( IROp op,
      case Iop_64x4toV256:
         QUATERNARY(Ity_I64, Ity_I64, Ity_I64, Ity_I64, Ity_V256);
 
+      case Iop_Add64Fx4:
+      case Iop_Sub64Fx4:
+      case Iop_Mul64Fx4:
+      case Iop_Div64Fx4:
+      case Iop_Add32Fx8:
+      case Iop_Sub32Fx8:
+      case Iop_Mul32Fx8:
+      case Iop_Div32Fx8:
+         BINARY(Ity_V256,Ity_V256, Ity_V256);
+
      default:
         ppIROp(op);
         vpanic("typeOfPrimop");
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 22fed0861e..5cc2dc8ac7 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1428,8 +1428,18 @@ typedef
      Iop_V256to64_2,
      Iop_V256to64_3,  // V256 -> I64, extract most significant lane
 
-      Iop_64x4toV256   // (I64,I64,I64,I64)->V256
+      Iop_64x4toV256,  // (I64,I64,I64,I64)->V256
                       // first arg is most significant lane
+
+      /* ------------------ 256-bit SIMD FP. ------------------ */
+      Iop_Add64Fx4,
+      Iop_Sub64Fx4,
+      Iop_Mul64Fx4,
+      Iop_Div64Fx4,
+      Iop_Add32Fx8,
+      Iop_Sub32Fx8,
+      Iop_Mul32Fx8,
+      Iop_Div32Fx8
   }
   IROp;
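
For reference, a sketch of the kind of C source that provokes these
instructions; this example is illustrative and not part of the commit,
and the function and array names are invented:

   /* Hypothetical input, compiled as "gcc-4.7 -mavx -O3 -S test.c".
      The loop is expected to auto-vectorize four doubles at a time:
      vmulpd/vaddpd on %ymm registers for the arithmetic, with the
      unaligned loads and stores done either as 256-bit vmovupd or as
      split 128-bit moves; that is why the instruction list above mixes
      128-bit and 256-bit VEX encodings.  Exact code generation depends
      on compiler version and tuning. */
   void scale_add ( double* dst, const double* a, const double* b,
                    double k, int n )
   {
      int i;
      for (i = 0; i < n; i++)
         dst[i] = k * a[i] + b[i];
   }

On the VEX side, each new 256-bit IROp (Iop_Add64Fx4 and friends) is
handled by iselDVecExpr_wrk as two 128-bit SSE operations, one per half
of the value, since a V256 lives in a pair of XMM registers (see
lookupIRTempPair above).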