From: Julian Seward
Date: Wed, 11 May 2005 16:13:37 +0000 (+0000)
Subject: Make the amd64 back end capable of dealing with the stuff memcheck
X-Git-Tag: svn/VALGRIND_3_0_1^2~148
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1976b0855f89a4523fff7d943a4c6fd846b02363;p=thirdparty%2Fvalgrind.git

Make the amd64 back end capable of dealing with the stuff memcheck
generates for 128-bit vector primops.

git-svn-id: svn://svn.valgrind.org/vex/trunk@1186
---

diff --git a/VEX/priv/host-amd64/hdefs.c b/VEX/priv/host-amd64/hdefs.c
index 769d83e317..bd71c82102 100644
--- a/VEX/priv/host-amd64/hdefs.c
+++ b/VEX/priv/host-amd64/hdefs.c
@@ -1603,9 +1603,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i )
                              i->Ain.Sse64FLo.dst);
          return;
       case Ain_SseReRg:
-         if (i->Ain.SseReRg.op == Asse_XOR
-             && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
-            /* reg-alloc needs to understand 'xor r,r' as a write of r */
+         if ( (i->Ain.SseReRg.op == Asse_XOR
+              || i->Ain.SseReRg.op == Asse_CMPEQ32)
+             && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
+            /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
+               r,r' as a write of a value to r, and independent of any
+               previous value in r */
             /* (as opposed to a rite of passage :-) */
             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
          } else {
diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c
index 63eebda65c..0664d22538 100644
--- a/VEX/priv/host-amd64/isel.c
+++ b/VEX/priv/host-amd64/isel.c
@@ -698,17 +698,31 @@ void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
 }
 
 
-/* Generate !src into a new vector register.  Amazing that there isn't
-   a less crappy way to do this.
+/* Generate all-zeroes into a new vector register.
 */
-static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
+static HReg generate_zeroes_V128 ( ISelEnv* env )
 {
    HReg dst = newVRegV(env);
-   /* Set dst to zero.  Not strictly necessary. */
    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
-   /* And now make it all 1s ... */
+   return dst;
+}
+
+/* Generate all-ones into a new vector register.
+*/
+static HReg generate_ones_V128 ( ISelEnv* env )
+{
+   HReg dst = newVRegV(env);
    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
-   /* Finally, xor 'src' into it. */
+   return dst;
+}
+
+
+/* Generate !src into a new vector register.  Amazing that there isn't
+   a less crappy way to do this.
+*/
+static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
+{
+   HReg dst = generate_ones_V128(env);
    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
    return dst;
 }
@@ -3053,7 +3067,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       HReg dst = newVRegV(env);
       vassert(e->Iex.Const.con->tag == Ico_V128);
       if (e->Iex.Const.con->Ico.V128 == 0x0000) {
-         addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
+         dst = generate_zeroes_V128(env);
          return dst;
       } else
       if (e->Iex.Const.con->Ico.V128 == 0x00FF) {
@@ -3108,9 +3122,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
                   (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
          */
          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
-         HReg tmp  = newVRegV(env);
+         HReg tmp  = generate_zeroes_V128(env);
          HReg dst  = newVRegV(env);
-         addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, tmp));
          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
          tmp = do_sse_NotV128(env, tmp);
          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
@@ -3118,59 +3131,20 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
          return dst;
       }
 
-//..       case Iop_CmpNEZ32x4: {
-//..          /* Sigh, we have to generate lousy code since this has to
-//..             work on SSE1 hosts */
-//..          /* basically, the idea is: for each lane:
-//..               movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
-//..               sbbl %r, %r               (now %r = 1Sto32(CF))
-//..               movl %r, lane
-//..          */
-//..          Int       i;
-//..          X86AMode* am;
-//..          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
-//..          HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
-//..          HReg      dst  = newVRegV(env);
-//..          HReg      r32  = newVRegI(env);
-//..          sub_from_esp(env, 16);
-//..          addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
-//..          for (i = 0; i < 4; i++) {
-//..             am = X86AMode_IR(i*4, hregX86_ESP());
-//..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
-//..             addInstr(env, X86Instr_Unary32(Xun_NEG, X86RM_Reg(r32)));
-//..             addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
-//..             addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
-//..          }
-//..          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
-//..          add_to_esp(env, 16);
-//..          return dst;
-//..       }
-//..
-//..       case Iop_CmpNEZ8x16:
-//..       case Iop_CmpNEZ16x8: {
-//..          /* We can use SSE2 instructions for this. */
-//..          HReg arg;
-//..          HReg vec0 = newVRegV(env);
-//..          HReg vec1 = newVRegV(env);
-//..          HReg dst  = newVRegV(env);
-//..          X86SseOp cmpOp
-//..             = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
-//..                                              : Xsse_CMPEQ8;
-//..          REQUIRE_SSE2;
-//..          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
-//..          addInstr(env, mk_vMOVsd_RR(vec0, vec1));
-//..          addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
-//..          /* defer arg computation to here so as to give CMPEQF as long
-//..             as possible to complete */
-//..          arg = iselVecExpr(env, e->Iex.Unop.arg);
-//..          /* vec0 is all 0s; vec1 is all 1s */
-//..          addInstr(env, mk_vMOVsd_RR(arg, dst));
-//..          /* 16x8 or 8x16 comparison == */
-//..          addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
-//..          /* invert result */
-//..          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
-//..          return dst;
-//..       }
+      case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
+      case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
+      case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
+      do_CmpNEZ_vector:
+      {
+         HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
+         HReg tmp  = newVRegV(env);
+         HReg zero = generate_zeroes_V128(env);
+         HReg dst;
+         addInstr(env, mk_vMOVsd_RR(arg, tmp));
+         addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
+         dst = do_sse_NotV128(env, tmp);
+         return dst;
+      }
 
       case Iop_Recip32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
       case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
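For readers less familiar with the SSE2 idiom the new do_CmpNEZ_vector case relies on, the
standalone C program below (not VEX code) mirrors, via compiler intrinsics, the instruction
sequence the patch emits for Iop_CmpNEZ32x4: zero a register (pxor r,r), compare the argument
against it lane-wise (pcmpeqd), then invert the result by xor'ing with an all-ones register
built with pcmpeqd r,r.  This is only an illustrative sketch; the function name cmpnez32x4
and the test values are made up for the example.

/* Scalar-host model of the Iop_CmpNEZ32x4 lowering introduced above.
   Build with any SSE2-capable compiler, e.g. gcc -msse2 cmpnez.c */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i cmpnez32x4 ( __m128i arg )
{
   __m128i zero = _mm_setzero_si128();         /* generate_zeroes_V128: pxor r,r     */
   __m128i tmp  = _mm_cmpeq_epi32(arg, zero);  /* pcmpeqd: zero lanes -> all 1s      */
   __m128i ones = _mm_cmpeq_epi32(tmp, tmp);   /* generate_ones_V128: pcmpeqd r,r    */
   return _mm_xor_si128(tmp, ones);            /* do_sse_NotV128: invert via pxor    */
}

int main ( void )
{
   uint32_t in[4] = { 0, 1, 0, 0xFFFFFFFFu };
   uint32_t out[4];
   __m128i v = _mm_loadu_si128((const __m128i*)in);
   _mm_storeu_si128((__m128i*)out, cmpnez32x4(v));
   /* expected: 00000000 ffffffff 00000000 ffffffff */
   printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
   return 0;
}

The same shape covers CmpNEZ16x8 and CmpNEZ8x16 by swapping the comparison width, which is
exactly why the patch funnels all three ops through the single do_CmpNEZ_vector label with
op set to Asse_CMPEQ32/16/8.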