From: Julian Seward
Date: Wed, 28 Jan 2015 10:52:36 +0000 (+0000)
Subject: Use IR conditional stores (IRStoreG) to implement AVX-2 conditional
X-Git-Tag: svn/VALGRIND_3_11_0^2~105
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=912fe87a435ac602985965044bcb24ade92f8583;p=thirdparty%2Fvalgrind.git

Use IR conditional stores (IRStoreG) to implement AVX-2 conditional
vector stores, VPMASKMOV{D,Q} xmm/ymm to memory.

git-svn-id: svn://svn.valgrind.org/vex/trunk@3077
---
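(Not part of the patch -- reference sketch only.)  The per-lane semantics that
the guarded-store translation below has to express are roughly as follows;
this is a sketch in plain C, and the helper name and layout are made up for
illustration:

   #include <stdint.h>

   /* Illustrative model of the VPMASKMOVD ymm-to-memory form: lane i is
      written only when mask lane i has its sign bit set; disabled lanes
      must produce no memory access at all. */
   void vpmaskmovd_store_256 ( int32_t *dst,           /* memory operand    */
                               const int32_t mask[8],  /* VEX.vvvv register */
                               const int32_t data[8] ) /* source register   */
   {
      for (int i = 0; i < 8; i++) {
         if (mask[i] < 0)     /* signed compare against zero, as in the IR */
            dst[i] = data[i];
      }
   }

In the IR this becomes one IRStmt_StoreG (guarded store) per lane, guarded by
a CmpLT{32,64}S of the mask lane against zero, mirroring the IRStmt_LoadG
scheme already used for the masked loads.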
diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index d0c223324a..35aaa73019 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -27235,10 +27235,11 @@ static Long dis_FMA ( const VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
 }
 
-/* Masked load. */
-static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, const VexAbiInfo* vbi,
-                                 Prefix pfx, Long delta,
-                                 const HChar* opname, Bool isYMM, IRType ty )
+/* Masked load or masked store. */
+static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
+                            Prefix pfx, Long delta,
+                            const HChar* opname, Bool isYMM, IRType ty,
+                            Bool isLoad )
 {
    HChar   dis_buf[50];
    Int     alen, i;
@@ -27246,50 +27247,55 @@ static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, const VexAbiInfo* vbi,
    UChar   modrm = getUChar(delta);
    UInt    rG    = gregOfRexRM(pfx,modrm);
    UInt    rV    = getVexNvvvv(pfx);
-   IRTemp  res[8], cond;
+
    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-   if (isYMM) {
+   delta += alen;
+
+   /**/ if (isLoad && isYMM) {
       DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
-   } else {
+   }
+   else if (isLoad && !isYMM) {
       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
    }
-   delta += alen;
-
-   for (i = 0; i < sizeof(res)/sizeof(res[0]); i++)
-      res[i] = IRTemp_INVALID;
+   else if (!isLoad && isYMM) {
+      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), nameYMMReg(rG), dis_buf );
+   }
+   else {
+      vassert(!isLoad && !isYMM);
+      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), nameXMMReg(rG), dis_buf );
+   }
 
-   for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) {
-      res[i] = newTemp(ty);
-      cond = newTemp(Ity_I1);
-      assign( cond,
-              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
-                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
-                                  : getYMMRegLane64( rV, i ),
+   vassert(ty == Ity_I32 || ty == Ity_I64);
+   Bool laneIs32 = ty == Ity_I32;
+
+   Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);
+
+   for (i = 0; i < nLanes; i++) {
+      IRTemp cond = newTemp(Ity_I1);
+      assign( cond,
+              binop(laneIs32 ? Iop_CmpLT32S : Iop_CmpLT64S,
+                    (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i ),
                     mkU(ty, 0) ));
-      stmt(
-         IRStmt_LoadG(
-            Iend_LE,
-            ty == Ity_I32 ? ILGop_Ident32 : ILGop_Ident64,
-            res[i],
-            binop(Iop_Add64, mkexpr(addr), mkU64(i * (ty == Ity_I32 ? 4 : 8))),
-            ty == Ity_I32 ? mkU32(0) : mkU64(0),
-            mkexpr(cond)
-      ));
-   }
-
-   switch (ty) {
-      case Ity_I32:
-         for (i = 0; i < 8; i++)
-            putYMMRegLane32( rG, i, (i < 4 || isYMM)
-                                    ? mkexpr(res[i]) : mkU32(0) );
-         break;
-      case Ity_I64:
-         for (i = 0; i < 4; i++)
-            putYMMRegLane64( rG, i, (i < 2 || isYMM)
-                                    ? mkexpr(res[i]) : mkU64(0) );
-         break;
-      default: vassert(0);
+      IRTemp data = newTemp(ty);
+      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr),
+                          mkU64(i * (laneIs32 ? 4 : 8)));
+      if (isLoad) {
+         stmt(
+            IRStmt_LoadG(
+               Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
+               data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
+         ));
+         (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
+      } else {
+         assign(data, (laneIs32 ? getYMMRegLane32
+                                : getYMMRegLane64)( rG, i ));
+         stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
+      }
    }
 
+   if (isLoad && !isYMM)
+      putYMMRegLane128( rG, 1, mkV128(0) );
+
    *uses_vvvv = True;
    return delta;
 }
@@ -28202,15 +28208,15 @@ Long dis_ESC_0F38__VEX (
       /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
-                                    /*!isYMM*/False, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
+                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
-                                    /*isYMM*/True, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
+                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       break;
@@ -28219,15 +28225,15 @@ Long dis_ESC_0F38__VEX (
       /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
-                                    /*!isYMM*/False, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
+                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
         goto decode_success;
       }
       /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
-                                    /*isYMM*/True, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
+                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
          goto decode_success;
       }
       break;
@@ -28788,29 +28794,60 @@ Long dis_ESC_0F38__VEX (
       /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
-                                    /*!isYMM*/False, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
-                                    /*isYMM*/True, Ity_I32 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
          goto decode_success;
       }
       /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
-                                    /*!isYMM*/False, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
          goto decode_success;
       }
       /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
-         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
-                                    /*isYMM*/True, Ity_I64 );
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
+         goto decode_success;
+      }
+      break;
+
+   case 0x8E:
+      /* VPMASKMOVD xmm2, xmm1, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
+          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
+         goto decode_success;
+      }
+      /* VPMASKMOVD ymm2, ymm1, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
+      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
+          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
+                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
+         goto decode_success;
+      }
+      /* VPMASKMOVQ xmm2, xmm1, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
+      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
+          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
+         goto decode_success;
+      }
+      /* VPMASKMOVQ ymm2, ymm1, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
+      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
+          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
+         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
+                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
          goto decode_success;
       }
       break;
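(Not part of the patch -- usage sketch only.)  A minimal way to exercise the
new store path, assuming an AVX2-capable compiler (gcc/clang with -mavx2): the
intrinsic below compiles to VPMASKMOVD with a memory destination, which the
0x8E cases added above now translate into per-lane IRStmt_StoreG statements
instead of leaving the instruction undecoded in dis_ESC_0F38__VEX.

   #include <immintrin.h>

   /* Illustrative only: store the even-numbered lanes of 'data' to dst[0..7];
      odd lanes of dst are left untouched. */
   void store_even_lanes ( int *dst, __m256i data )
   {
      __m256i mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
      _mm256_maskstore_epi32(dst, mask, data);
   }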
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index fdbf05e455..4cfd9a44b0 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -753,7 +753,18 @@ AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
    i->Ain.CLoad.szB  = szB;
    i->Ain.CLoad.addr = addr;
    i->Ain.CLoad.dst  = dst;
-   vassert(cond != Acc_ALWAYS);
+   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
+   return i;
+}
+AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
+                                HReg src, AMD64AMode* addr ) {
+   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag = Ain_CStore;
+   i->Ain.CStore.cond = cond;
+   i->Ain.CStore.szB  = szB;
+   i->Ain.CStore.src  = src;
+   i->Ain.CStore.addr = addr;
+   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
    return i;
 }
 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
@@ -1135,13 +1146,24 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
       case Ain_CLoad:
          vex_printf("if (%%rflags.%s) { ",
                     showAMD64CondCode(i->Ain.CLoad.cond));
-         vex_printf("mov%c (", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
+         vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
          ppAMD64AMode(i->Ain.CLoad.addr);
-         vex_printf("), ");
+         vex_printf(", ");
          (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.CLoad.dst);
          vex_printf(" }");
          return;
+      case Ain_CStore:
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.CStore.cond));
+         vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
+         (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
+            (i->Ain.CStore.src);
+         vex_printf(", ");
+         ppAMD64AMode(i->Ain.CStore.addr);
+         vex_printf(" }");
+         return;
+
       case Ain_MovxLQ:
          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
@@ -1488,6 +1510,10 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
          addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
          addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
          return;
+      case Ain_CStore:
+         addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
+         addHRegUse(u, HRmRead, i->Ain.CStore.src);
+         return;
       case Ain_MovxLQ:
         addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
         addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
@@ -1724,6 +1750,10 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
          mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
          mapReg(m, &i->Ain.CLoad.dst);
          return;
+      case Ain_CStore:
+         mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
+         mapReg(m, &i->Ain.CStore.src);
+         return;
       case Ain_MovxLQ:
          mapReg(m, &i->Ain.MovxLQ.src);
          mapReg(m, &i->Ain.MovxLQ.dst);
@@ -3035,6 +3065,35 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
          goto done;
      }
 
+      case Ain_CStore: {
+         /* AFAICS this is identical to Ain_CLoad except that the opcode
+            is 0x89 instead of 0x8B. */
+         vassert(i->Ain.CStore.cond != Acc_ALWAYS);
+
+         /* Only 32- or 64-bit variants are allowed. */
+         vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
+
+         /* Use ptmp for backpatching conditional jumps. */
+         ptmp = NULL;
+
+         /* jmp fwds if !condition */
+         *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
+         ptmp = p; /* fill in this bit later */
+         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+         /* Now the store. */
+         rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
+         *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
+         *p++ = 0x89;
+         p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
+
+         /* Fix up the conditional branch */
+         Int delta = p - ptmp;
+         vassert(delta > 0 && delta < 40);
+         *ptmp = toUChar(delta-1);
+         goto done;
+      }
+
       case Ain_MovxLQ:
          /* No, _don't_ ask me why the sense of the args has to be
             different in the S vs Z case.  I don't know. */
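(Not part of the patch -- reference note only.)  AMD64 has no conditional
store instruction, so Ain_CStore is emitted with the same trick the existing
Ain_CLoad uses: a one-byte-displacement Jcc on the inverted condition
(0x70 + (cond ^ 1)) branches over an ordinary mov-to-memory (opcode 0x89 /r,
where the load form uses 0x8B /r), and the displacement byte is back-patched
via ptmp once the length of the store encoding is known.  Consequently a
guarded store whose guard is false performs no memory access at all, which is
what the VPMASKMOV semantics require.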
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index 6ebe9b628e..b3959e47ee 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -369,6 +369,7 @@ typedef
       Ain_XAssisted,   /* assisted transfer to GA */
       Ain_CMov64,      /* conditional move, 64-bit reg-reg only */
       Ain_CLoad,       /* cond. load to int reg, 32 bit ZX or 64 bit only */
+      Ain_CStore,      /* cond. store from int reg, 32 or 64 bit only */
       Ain_MovxLQ,      /* reg-reg move, zx-ing/sx-ing top half */
       Ain_LoadEX,      /* mov{s,z}{b,w,l}q from mem to reg */
       Ain_Store,       /* store 32/16/8 bit value in memory */
@@ -514,6 +515,14 @@ typedef
             AMD64AMode* addr;
             HReg        dst;
          } CLoad;
+         /* cond. store from int reg, 32 or 64 bit only.
+            cond may not be Acc_ALWAYS. */
+         struct {
+            AMD64CondCode cond;
+            UChar         szB; /* 4 or 8 only */
+            HReg          src;
+            AMD64AMode*   addr;
+         } CStore;
          /* reg-reg move, sx-ing/zx-ing top half */
          struct {
             Bool syned;
@@ -721,6 +730,8 @@ extern AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
 extern AMD64Instr* AMD64Instr_CMov64    ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_CLoad     ( AMD64CondCode cond, UChar szB,
                                           AMD64AMode* addr, HReg dst );
+extern AMD64Instr* AMD64Instr_CStore    ( AMD64CondCode cond, UChar szB,
+                                          HReg src, AMD64AMode* addr );
 extern AMD64Instr* AMD64Instr_MovxLQ    ( Bool syned, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_LoadEX    ( UChar szSmall, Bool syned,
                                           AMD64AMode* src, HReg dst );
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 999ce955dd..a10e1fc95a 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -4314,6 +4314,28 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
          return;
       }
 
+      /* --------- STOREG (guarded store) --------- */
+      case Ist_StoreG: {
+         IRStoreG* sg = stmt->Ist.StoreG.details;
+         if (sg->end != Iend_LE)
+            goto stmt_fail;
+
+         UChar szB = 0; /* invalid */
+         switch (typeOfIRExpr(env->type_env, sg->data)) {
+            case Ity_I32: szB = 4; break;
+            case Ity_I64: szB = 8; break;
+            default: break;
+         }
+         if (szB == 0)
+            goto stmt_fail;
+
+         AMD64AMode* amAddr = iselIntExpr_AMode(env, sg->addr);
+         HReg rSrc = iselIntExpr_R(env, sg->data);
+         AMD64CondCode cc = iselCondCode(env, sg->guard);
+         addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+         return;
+      }
+
       /* --------- STORE --------- */
       case Ist_Store: {
          IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);