From: Julian Seward
Date: Wed, 3 Aug 2016 11:53:11 +0000 (+0000)
Subject: arm32 backend stuff needed to support IR artefacts resulting from
X-Git-Tag: svn/VALGRIND_3_12_0^2~31
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b082937aef13ddf595dcb3b387aa751e9d347bb;p=thirdparty%2Fvalgrind.git

arm32 backend stuff needed to support IR artefacts resulting from
guest support of 32-bit V8 crypto instructions:

* add new pseudo-instruction ARMin_VXferQ, to move values between two
  D regs and a Q reg, in either direction.  Use this to implement
  Iop_64HLtoV128 much more efficiently than before, and to implement
  Iop_V128HIto64 and Iop_V128to64.

* Generate code for helper calls which have four or more (32-bit)
  word-sized arguments and a V128 return value.  These require passing
  arguments on the stack.

git-svn-id: svn://svn.valgrind.org/vex/trunk@3235
---
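[Editorial note, not part of the patch.] The new ARMin_VXferQ pseudo-instruction leans on the NEON register aliasing rule that Q<n> is the concatenation D<2n+1>:D<2n>, so a 128-bit value can be split or assembled with two plain D-to-D moves and no memory traffic. A minimal standalone sketch of the value-level semantics the instruction selector now assumes (the names below are illustrative, not VEX APIs):

   #include <assert.h>
   #include <stdint.h>

   /* Model a NEON Q register as two 64-bit halves, following the ARM ARM
      rule that D<2n> is the least significant half of Q<n> and D<2n+1>
      the most significant half. */
   typedef struct { uint64_t lo64; uint64_t hi64; } QReg;

   /* What Iop_64HLtoV128 now lowers to: two D-to-D moves into a Q reg. */
   static QReg mk_64HLtoV128 ( uint64_t hi, uint64_t lo )
   {
      QReg q;
      q.lo64 = lo;   /* vmov D<2n>,   dLo */
      q.hi64 = hi;   /* vmov D<2n+1>, dHi */
      return q;
   }

   /* What Iop_V128to64 / Iop_V128HIto64 now lower to: the opposite pair. */
   static uint64_t v128_to_64    ( QReg q ) { return q.lo64; }
   static uint64_t v128_hi_to_64 ( QReg q ) { return q.hi64; }

   int main ( void )
   {
      QReg q = mk_64HLtoV128(0x1122334455667788ULL, 0x99AABBCCDDEEFF00ULL);
      assert(v128_hi_to_64(q) == 0x1122334455667788ULL);
      assert(v128_to_64(q)    == 0x99AABBCCDDEEFF00ULL);
      return 0;
   }

Previously Iop_64HLtoV128 went via four 32-bit stores and a Q-register load from the stack (see the code removed from iselNeonExpr_wrk below); the new lowering is just two register-to-register moves.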
diff --git a/VEX/priv/host_arm_defs.c b/VEX/priv/host_arm_defs.c
index 82d5519fec..dfcc578f07 100644
--- a/VEX/priv/host_arm_defs.c
+++ b/VEX/priv/host_arm_defs.c
@@ -1329,6 +1329,15 @@ ARMInstr* ARMInstr_VCvtSD ( Bool sToD, HReg dst, HReg src ) {
    i->ARMin.VCvtSD.src = src;
    return i;
 }
+ARMInstr* ARMInstr_VXferQ ( Bool toQ, HReg qD, HReg dHi, HReg dLo ) {
+   ARMInstr* i = LibVEX_Alloc_inline(sizeof(ARMInstr));
+   i->tag = ARMin_VXferQ;
+   i->ARMin.VXferQ.toQ = toQ;
+   i->ARMin.VXferQ.qD  = qD;
+   i->ARMin.VXferQ.dHi = dHi;
+   i->ARMin.VXferQ.dLo = dLo;
+   return i;
+}
 ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo ) {
    ARMInstr* i = LibVEX_Alloc_inline(sizeof(ARMInstr));
    i->tag = ARMin_VXferD;
@@ -1800,6 +1809,29 @@ void ppARMInstr ( const ARMInstr* i ) {
          vex_printf(", ");
          ppHRegARM(i->ARMin.VCvtSD.src);
          return;
+      case ARMin_VXferQ:
+         if (i->ARMin.VXferQ.toQ) {
+            vex_printf("vmov ");
+            ppHRegARM(i->ARMin.VXferQ.qD);
+            vex_printf("-lo64, ");
+            ppHRegARM(i->ARMin.VXferQ.dLo);
+            vex_printf(" ; vmov ");
+            ppHRegARM(i->ARMin.VXferQ.qD);
+            vex_printf("-hi64, ");
+            ppHRegARM(i->ARMin.VXferQ.dHi);
+         } else {
+            vex_printf("vmov ");
+            ppHRegARM(i->ARMin.VXferQ.dLo);
+            vex_printf(", ");
+            ppHRegARM(i->ARMin.VXferQ.qD);
+            vex_printf("-lo64");
+            vex_printf(" ; vmov ");
+            ppHRegARM(i->ARMin.VXferQ.dHi);
+            vex_printf(", ");
+            ppHRegARM(i->ARMin.VXferQ.qD);
+            vex_printf("-hi64");
+         }
+         return;
       case ARMin_VXferD:
          vex_printf("vmov ");
          if (i->ARMin.VXferD.toD) {
@@ -2201,6 +2233,17 @@ void getRegUsage_ARMInstr ( HRegUsage* u, const ARMInstr* i, Bool mode64 )
          addHRegUse(u, HRmWrite, i->ARMin.VCvtSD.dst);
          addHRegUse(u, HRmRead, i->ARMin.VCvtSD.src);
          return;
+      case ARMin_VXferQ:
+         if (i->ARMin.VXferQ.toQ) {
+            addHRegUse(u, HRmWrite, i->ARMin.VXferQ.qD);
+            addHRegUse(u, HRmRead, i->ARMin.VXferQ.dHi);
+            addHRegUse(u, HRmRead, i->ARMin.VXferQ.dLo);
+         } else {
+            addHRegUse(u, HRmRead, i->ARMin.VXferQ.qD);
+            addHRegUse(u, HRmWrite, i->ARMin.VXferQ.dHi);
+            addHRegUse(u, HRmWrite, i->ARMin.VXferQ.dLo);
+         }
+         return;
       case ARMin_VXferD:
          if (i->ARMin.VXferD.toD) {
             addHRegUse(u, HRmWrite, i->ARMin.VXferD.dD);
@@ -2422,6 +2465,11 @@ void mapRegs_ARMInstr ( HRegRemap* m, ARMInstr* i, Bool mode64 )
          i->ARMin.VCvtSD.dst = lookupHRegRemap(m, i->ARMin.VCvtSD.dst);
          i->ARMin.VCvtSD.src = lookupHRegRemap(m, i->ARMin.VCvtSD.src);
          return;
+      case ARMin_VXferQ:
+         i->ARMin.VXferQ.qD  = lookupHRegRemap(m, i->ARMin.VXferQ.qD);
+         i->ARMin.VXferQ.dHi = lookupHRegRemap(m, i->ARMin.VXferQ.dHi);
+         i->ARMin.VXferQ.dLo = lookupHRegRemap(m, i->ARMin.VXferQ.dLo);
+         return;
       case ARMin_VXferD:
          i->ARMin.VXferD.dD  = lookupHRegRemap(m, i->ARMin.VXferD.dD);
         i->ARMin.VXferD.rHi = lookupHRegRemap(m, i->ARMin.VXferD.rHi);
@@ -3682,6 +3730,46 @@ Int emit_ARMInstr ( /*MB_MOD*/Bool* is_profInc,
            goto done;
         }
      }
+      case ARMin_VXferQ: {
+         UInt insn;
+         UInt qD  = qregEnc(i->ARMin.VXferQ.qD);
+         UInt dHi = dregEnc(i->ARMin.VXferQ.dHi);
+         UInt dLo = dregEnc(i->ARMin.VXferQ.dLo);
+         /* This is a bit tricky.  We need to make 2 D-D moves and we rely
+            on the fact that the Q register can be treated as two D registers.
+            We also rely on the fact that the register allocator will allocate
+            the two D's and the Q to disjoint parts of the register file,
+            and so we don't have to worry about the first move's destination
+            being the same as the second move's source, etc.  We do have
+            assertions though. */
+         /* The ARM ARM specifies that
+              D<2n>   maps to the least significant half of Q<n>
+              D<2n+1> maps to the most significant half of Q<n>
+            So there are no issues with endianness here.
+         */
+         UInt qDlo = 2 * qD + 0;
+         UInt qDhi = 2 * qD + 1;
+         /* Stay sane .. */
+         vassert(qDhi != dHi && qDhi != dLo);
+         vassert(qDlo != dHi && qDlo != dLo);
+         /* vmov dX, dY is
+            F 2 (0,dX[4],1,0) dY[3:0] dX[3:0] 1 (dY[4],0,dY[4],1) dY[3:0]
+         */
+#        define VMOV_D_D(_xx,_yy) \
+            XXXXXXXX( 0xF, 0x2, BITS4(0, (((_xx) >> 4) & 1), 1, 0), \
+                      ((_yy) & 0xF), ((_xx) & 0xF), 0x1, \
+                      BITS4( (((_yy) >> 4) & 1), 0, (((_yy) >> 4) & 1), 1), \
+                      ((_yy) & 0xF) )
+         if (i->ARMin.VXferQ.toQ) {
+            insn = VMOV_D_D(qDlo, dLo); *p++ = insn;
+            insn = VMOV_D_D(qDhi, dHi); *p++ = insn;
+         } else {
+            insn = VMOV_D_D(dLo, qDlo); *p++ = insn;
+            insn = VMOV_D_D(dHi, qDhi); *p++ = insn;
+         }
+#        undef VMOV_D_D
+         goto done;
+      }
      case ARMin_VXferD: {
         UInt dD  = dregEnc(i->ARMin.VXferD.dD);
         UInt rHi = iregEnc(i->ARMin.VXferD.rHi);
diff --git a/VEX/priv/host_arm_defs.h b/VEX/priv/host_arm_defs.h
index 47f459d826..cd2051256a 100644
--- a/VEX/priv/host_arm_defs.h
+++ b/VEX/priv/host_arm_defs.h
@@ -591,6 +591,7 @@ typedef
       ARMin_VCMovD,
       ARMin_VCMovS,
       ARMin_VCvtSD,
+      ARMin_VXferQ,
       ARMin_VXferD,
       ARMin_VXferS,
       ARMin_VCvtID,
@@ -824,6 +825,13 @@ typedef
             HReg dst;
             HReg src;
          } VCvtSD;
+         /* Transfer a NEON Q reg to/from two D registers (VMOV x 2) */
+         struct {
+            Bool toQ;
+            HReg qD;
+            HReg dHi;
+            HReg dLo;
+         } VXferQ;
          /* Transfer a VFP D reg to/from two integer registers (VMOV) */
          struct {
            Bool toD;
@@ -994,6 +1002,7 @@ extern ARMInstr* ARMInstr_VCmpD ( HReg argL, HReg argR );
 extern ARMInstr* ARMInstr_VCMovD ( ARMCondCode, HReg dst, HReg src );
 extern ARMInstr* ARMInstr_VCMovS ( ARMCondCode, HReg dst, HReg src );
 extern ARMInstr* ARMInstr_VCvtSD ( Bool sToD, HReg dst, HReg src );
+extern ARMInstr* ARMInstr_VXferQ ( Bool toQ, HReg qD, HReg dHi, HReg dLo );
 extern ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo );
 extern ARMInstr* ARMInstr_VXferS ( Bool toS, HReg fD, HReg rLo );
 extern ARMInstr* ARMInstr_VCvtID ( Bool iToD, Bool syned,
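[Editorial note, not part of the patch.] The VMOV_D_D macro added to emit_ARMInstr() above encodes "vmov dX, dY" in its NEON register form (architecturally an alias of "vorr dX, dY, dY"). A hedged standalone sketch of the same bit layout, using plain shifts instead of the VEX XXXXXXXX/BITS4 helpers:

   #include <assert.h>
   #include <stdint.h>

   /* Build the 32-bit word for "vmov dX, dY", laid out as in the comment
      in the patch:
         F 2 (0,dX[4],1,0) dY[3:0] dX[3:0] 1 (dY[4],0,dY[4],1) dY[3:0]   */
   static uint32_t vmov_d_d ( uint32_t dX, uint32_t dY )
   {
      uint32_t D = (dX >> 4) & 1;   /* top bit of the destination D reg */
      uint32_t M = (dY >> 4) & 1;   /* top bit of the source D reg      */
      return (0xFu << 28) | (0x2u << 24)
             | (((D << 2) | 0x2u) << 20)            /* bits 0,D,1,0 */
             | ((dY & 0xFu) << 16) | ((dX & 0xFu) << 12)
             | (0x1u << 8)
             | (((M << 3) | (M << 1) | 0x1u) << 4)  /* bits M,0,M,1 */
             | (dY & 0xFu);
   }

   int main ( void )
   {
      /* "vmov d0, d1" assembles to 0xF2210111. */
      assert(vmov_d_d(0, 1) == 0xF2210111u);
      /* ARMin_VXferQ(toQ=False, q8, d5, d4) would emit the pair
         "vmov d4, d16" (low half of q8) and "vmov d5, d17" (high half). */
      assert(vmov_d_d(4, 16) == 0xF22041B0u);
      assert(vmov_d_d(5, 17) == 0xF22151B1u);
      return 0;
   }

The aliasing assertions in the patch (qDlo/qDhi distinct from dHi/dLo) matter because the two moves are emitted back to back: if the register allocator ever handed back overlapping D and Q registers, the first move could clobber the second move's source.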
diff --git a/VEX/priv/host_arm_isel.c b/VEX/priv/host_arm_isel.c
index 1513112184..a11bc6d269 100644
--- a/VEX/priv/host_arm_isel.c
+++ b/VEX/priv/host_arm_isel.c
@@ -368,6 +368,134 @@ Bool mightRequireFixedRegs ( IRExpr* e )
 }
 
 
+static
+Bool doHelperCallWithArgsOnStack ( /*OUT*/UInt* stackAdjustAfterCall,
+                                   /*OUT*/RetLoc* retloc,
+                                   ISelEnv* env,
+                                   IRExpr* guard,
+                                   IRCallee* cee, IRType retTy, IRExpr** args )
+{
+   /* This function deals just with the case where the arg sequence is:
+      VECRET followed by between 4 and 12 Ity_I32 values.  So far no other
+      cases are necessary or supported. */
+
+   /* Check this matches the required format. */
+   if (args[0] == NULL || args[0]->tag != Iex_VECRET)
+      goto no_match;
+
+   UInt i;
+   UInt n_real_args = 0;
+   for (i = 1; args[i]; i++) {
+      IRExpr* arg = args[i];
+      if (UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)))
+         goto no_match;
+      IRType argTy = typeOfIRExpr(env->type_env, arg);
+      if (UNLIKELY(argTy != Ity_I32))
+         goto no_match;
+      n_real_args++;
+   }
+
+   /* We expect to pass at least some args on the stack. */
+   if (n_real_args <= 3)
+      goto no_match;
+
+   /* But not too many. */
+   if (n_real_args > 12)
+      goto no_match;
+
+   /* General rules for a call:
+
+      Args 1 .. 4 go in R0 .. R3.  The rest are pushed R to L on the
+      stack; that is, arg 5 is at the lowest address, arg 6 at the
+      next lowest, etc.
+
+      The stack is to be kept 8 aligned.
+
+      It appears (for unclear reasons) that the highest 3 words made
+      available when moving SP downwards are not to be used.  For
+      example, if 5 args are to go on the stack, then SP must be moved
+      down 32 bytes, and the area at SP+20 .. SP+31 is not to be used
+      by the caller.
+   */
+
+   /* For this particular case, we use the following layout:
+
+        ------ original SP
+        112 bytes
+        ------
+        return value
+        ------ original SP - 128
+        space
+        args words, between 1 and 11
+        ------ new SP = original_SP - 256
+
+      Using 256 bytes is overkill, but it is simple and good enough. */
+
+   /* This should really be
+        HReg argVRegs[n_real_args];
+      but that makes it impossible to do 'goto's forward past.
+      Hence the following kludge. */
+   vassert(n_real_args <= 11);
+   HReg argVRegs[11];
+   for (i = 0; i < 11; i++)
+      argVRegs[i] = INVALID_HREG;
+
+   /* Compute args into vregs. */
+   for (i = 0; i < n_real_args; i++) {
+      argVRegs[i] = iselIntExpr_R(env, args[i+1]);
+   }
+
+   /* Now we can compute the condition.  We can't do it earlier
+      because the argument computations could trash the condition
+      codes.  Be a bit clever to handle the common case where the
+      guard is 1:Bit. */
+   ARMCondCode cc = ARMcc_AL;
+   if (guard) {
+      if (guard->tag == Iex_Const
+          && guard->Iex.Const.con->tag == Ico_U1
+          && guard->Iex.Const.con->Ico.U1 == True) {
+         /* unconditional -- do nothing */
+      } else {
+         goto no_match; //ATC
+         cc = iselCondCode( env, guard );
+      }
+   }
+
+   HReg r0 = hregARM_R0();
+   HReg sp = hregARM_R13();
+
+   ARMRI84* c256 = ARMRI84_I84(64, 15); // 64 `ror` (15 * 2)
+
+   addInstr(env, ARMInstr_Alu(ARMalu_SUB, r0, sp, ARMRI84_I84(128, 0)));
+
+   addInstr(env, mk_iMOVds_RR(hregARM_R1(), argVRegs[0]));
+   addInstr(env, mk_iMOVds_RR(hregARM_R2(), argVRegs[1]));
+   addInstr(env, mk_iMOVds_RR(hregARM_R3(), argVRegs[2]));
+
+   addInstr(env, ARMInstr_Alu(ARMalu_SUB, sp, sp, c256));
+
+   for (i = 3; i < n_real_args; i++) {
+      addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/, argVRegs[i],
+                                    ARMAMode1_RI(sp, (i-3) * 4)));
+   }
+
+   vassert(*stackAdjustAfterCall == 0);
+   vassert(is_RetLoc_INVALID(*retloc));
+
+   *stackAdjustAfterCall = 256;
+   *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 128);
+
+   Addr32 target = (Addr)cee->addr;
+   addInstr(env, ARMInstr_Call( cc, target, 4, *retloc ));
+
+   return True; /* success */
+
+  no_match:
+   return False;
+}
+
+
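[Editorial note, not part of the patch.] Two details of the function above are easy to miss. First, ARM ALU immediates are an 8-bit value rotated right by twice a 4-bit field, which is why the 256-byte frame constant is written ARMRI84_I84(64, 15): 64 rotated right by 30 is 256. Second, the V128 return buffer sits 128 bytes into that frame, which is where R0 is pointed (original SP minus 128) and what the RetLoc offset of 128 refers to. A small sketch, with purely illustrative values:

   #include <assert.h>
   #include <stdint.h>

   /* ARM "modified immediate": imm8 rotated right by (rot4 * 2) bits,
      i.e. what ARMRI84_I84(imm8, rot4) denotes in the VEX backend. */
   static uint32_t arm_imm8r4 ( uint32_t imm8, uint32_t rot4 )
   {
      uint32_t r = (rot4 * 2) & 31;
      return r == 0 ? imm8 : (imm8 >> r) | (imm8 << (32 - r));
   }

   int main ( void )
   {
      /* The frame constant used above: 64 `ror` 30 == 256. */
      assert(arm_imm8r4(64, 15) == 256);
      assert(arm_imm8r4(128, 0) == 128);

      /* Frame layout assumed by the generated code (addresses invented):
           new_SP             = original_SP - 256
           stack args         at new_SP + 0, +4, ...  (real args 4 .. 12)
           V128 return buffer at new_SP + 128         (= original_SP - 128) */
      uint32_t original_sp = 0x7FFF0000u;
      uint32_t new_sp      = original_sp - arm_imm8r4(64, 15);
      uint32_t retbuf      = new_sp + 128;
      assert(retbuf == original_sp - 128);
      assert((new_sp % 8) == 0 && (retbuf % 8) == 0);  /* 8-alignment kept */
      return 0;
   }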
 /* Do a complete function call.  |guard| is a Ity_Bit expression
    indicating whether or not the call happens.  If guard==NULL, the
    call is unconditional.  |retloc| is set to indicate where the
@@ -470,6 +598,21 @@ Bool doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
       n_args++;
    }
 
+   /* If there are more than 4 args, we are going to have to pass
+      some via memory.  Use a different function to (possibly) deal with
+      that; dealing with it here is too complex. */
+   if (n_args > ARM_N_ARGREGS) {
+      return doHelperCallWithArgsOnStack(stackAdjustAfterCall, retloc,
+                                         env, guard, cee, retTy, args );
+
+   }
+
+   /* After this point we make no attempt to pass args on the stack,
+      and just give up if that case (which is OK because it never
+      happens).  Even if there are for example only 3 args, it might
+      still be necessary to pass some of them on the stack if for example
+      two or more of them are 64-bit integers. */
+
    argregs[0] = hregARM_R0();
    argregs[1] = hregARM_R1();
    argregs[2] = hregARM_R2();
@@ -653,30 +796,30 @@ Bool doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
    vassert(*stackAdjustAfterCall == 0);
    vassert(is_RetLoc_INVALID(*retloc));
    switch (retTy) {
-   case Ity_INVALID:
-      /* Function doesn't return a value. */
-      *retloc = mk_RetLoc_simple(RLPri_None);
-      break;
-   case Ity_I64:
-      *retloc = mk_RetLoc_simple(RLPri_2Int);
-      break;
-   case Ity_I32: case Ity_I16: case Ity_I8:
-      *retloc = mk_RetLoc_simple(RLPri_Int);
-      break;
-   case Ity_V128:
-      vassert(0); // ATC
-      *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
-      *stackAdjustAfterCall = 16;
-      break;
-   case Ity_V256:
-      vassert(0); // ATC
-      *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
-      *stackAdjustAfterCall = 32;
-      break;
-   default:
-      /* IR can denote other possible return types, but we don't
-         handle those here. */
-      vassert(0);
+      case Ity_INVALID:
+         /* Function doesn't return a value. */
+         *retloc = mk_RetLoc_simple(RLPri_None);
+         break;
+      case Ity_I64:
+         *retloc = mk_RetLoc_simple(RLPri_2Int);
+         break;
+      case Ity_I32: case Ity_I16: case Ity_I8:
+         *retloc = mk_RetLoc_simple(RLPri_Int);
+         break;
+      case Ity_V128:
+         vassert(0); // ATC
+         *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
+         *stackAdjustAfterCall = 16;
+         break;
+      case Ity_V256:
+         vassert(0); // ATC
+         *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
+         *stackAdjustAfterCall = 32;
+         break;
+      default:
+         /* IR can denote other possible return types, but we don't
+            handle those here. */
+         vassert(0);
    }
 
    /* Finally, generate the call itself.  This needs the *retloc value
@@ -3714,6 +3857,14 @@ static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e )
                                       res, arg, 0, False));
          return res;
       }
+      case Iop_V128to64:
+      case Iop_V128HIto64: {
+         HReg src = iselNeonExpr(env, e->Iex.Unop.arg);
+         HReg resLo = newVRegD(env);
+         HReg resHi = newVRegD(env);
+         addInstr(env, ARMInstr_VXferQ(False/*!toQ*/, src, resHi, resLo));
+         return e->Iex.Unop.op == Iop_V128HIto64 ? resHi : resLo;
+      }
       default:
          break;
    }
@@ -4305,7 +4456,7 @@ static HReg iselNeonExpr_wrk ( ISelEnv* env, IRExpr* e )
 
    if (e->tag == Iex_Binop) {
       switch (e->Iex.Binop.op) {
-         case Iop_64HLtoV128:
+         case Iop_64HLtoV128: {
            /* Try to match into single "VMOV reg, imm" instruction */
            if (e->Iex.Binop.arg1->tag == Iex_Const &&
                e->Iex.Binop.arg2->tag == Iex_Const &&
@@ -4349,45 +4500,12 @@ static HReg iselNeonExpr_wrk ( ISelEnv* env, IRExpr* e )
            }
            /* Does not match "VMOV Reg, Imm" form.  We'll have to do it
               the slow way. */
-           {
-              /* local scope */
-              /* Done via the stack for ease of use. */
-              /* FIXME: assumes little endian host */
-              HReg w3, w2, w1, w0;
-              HReg res = newVRegV(env);
-              ARMAMode1* sp_0 = ARMAMode1_RI(hregARM_R13(), 0);
-              ARMAMode1* sp_4 = ARMAMode1_RI(hregARM_R13(), 4);
-              ARMAMode1* sp_8 = ARMAMode1_RI(hregARM_R13(), 8);
-              ARMAMode1* sp_12 = ARMAMode1_RI(hregARM_R13(), 12);
-              ARMRI84* c_16 = ARMRI84_I84(16,0);
-              /* Make space for SP */
-              addInstr(env, ARMInstr_Alu(ARMalu_SUB, hregARM_R13(),
-                                         hregARM_R13(), c_16));
-
-              /* Store the less significant 64 bits */
-              iselInt64Expr(&w1, &w0, env, e->Iex.Binop.arg2);
-              addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
-                                            w0, sp_0));
-              addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
-                                            w1, sp_4));
-
-              /* Store the more significant 64 bits */
-              iselInt64Expr(&w3, &w2, env, e->Iex.Binop.arg1);
-              addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
-                                            w2, sp_8));
-              addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
-                                            w3, sp_12));
-
-              /* Load result back from stack. */
-              addInstr(env, ARMInstr_NLdStQ(True/*load*/, res,
-                                            mkARMAModeN_R(hregARM_R13())));
-
-              /* Restore SP */
-              addInstr(env, ARMInstr_Alu(ARMalu_ADD, hregARM_R13(),
-                                         hregARM_R13(), c_16));
-              return res;
-           } /* local scope */
-           goto neon_expr_bad;
+           HReg dHi = iselNeon64Expr(env, e->Iex.Binop.arg1);
+           HReg dLo = iselNeon64Expr(env, e->Iex.Binop.arg2);
+           HReg res = newVRegV(env);
+           addInstr(env, ARMInstr_VXferQ(True/*toQ*/, res, dHi, dLo));
+           return res;
+         }
         case Iop_AndV128: {
            HReg res = newVRegV(env);
            HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
@@ -5359,7 +5477,7 @@ static HReg iselNeonExpr_wrk ( ISelEnv* env, IRExpr* e )
      return dst;
   }
 
-   neon_expr_bad:
+   /* neon_expr_bad: */
   ppIRExpr(e);
   vpanic("iselNeonExpr_wrk");
 }
@@ -5974,7 +6092,7 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
         switch (retty) {
            case Ity_INVALID: /* function doesn't return anything */
            case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
-           //case Ity_V128: //ATC
+           case Ity_V128:
              retty_ok = True; break;
            default:
              break;
@@ -5987,7 +6105,9 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
           call is skipped. */
        UInt addToSp = 0;
        RetLoc rloc = mk_RetLoc_INVALID();
-        doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
+        Bool ok = doHelperCall( &addToSp, &rloc, env,
+                                d->guard, d->cee, retty, d->args );
+        if (!ok) goto stmt_fail;
        vassert(is_sane_RetLoc(rloc));
 
        /* Now figure out what to do with the returned value, if any. */
@@ -6026,11 +6146,6 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
             return;
          }
          case Ity_V128: {
-            vassert(0); // ATC.  The code that this produces really
-            // needs to be looked at, to verify correctness.
-            // I don't think this can ever happen though, since the
-            // ARM front end never produces 128-bit loads/stores.
-            // Hence the following is mostly theoretical.
            /* The returned value is on the stack, and *retloc
               tells us where.  Fish it off the stack and then move
               the stack pointer upwards to clear it, as directed by
@@ -6038,16 +6153,26 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
            vassert(rloc.pri == RLPri_V128SpRel);
            vassert(rloc.spOff < 256); // else ARMRI84_I84(_,0) can't encode it
            vassert(addToSp >= 16);
-            vassert(addToSp < 256); // ditto reason as for rloc.spOff
+            vassert(addToSp <= 256);
+            /* Both the stack delta and the offset must be at least 8-aligned.
+               If that isn't so, doHelperCall() has generated bad code. */
+            vassert(0 == (rloc.spOff % 8));
+            vassert(0 == (addToSp % 8));
+
            HReg dst = lookupIRTemp(env, d->tmp);
            HReg tmp = newVRegI(env);
-            HReg r13 = hregARM_R13(); // sp
+            HReg sp  = hregARM_R13();
            addInstr(env, ARMInstr_Alu(ARMalu_ADD,
-                                       tmp, r13, ARMRI84_I84(rloc.spOff,0)));
+                                       tmp, sp, ARMRI84_I84(rloc.spOff,0)));
            ARMAModeN* am = mkARMAModeN_R(tmp);
+            /* This load could be done with its effective address 0 % 8,
+               because that's the best stack alignment that we can be
+               assured of. */
            addInstr(env, ARMInstr_NLdStQ(True/*load*/, dst, am));
-            addInstr(env, ARMInstr_Alu(ARMalu_ADD,
-                                       r13, r13, ARMRI84_I84(addToSp,0)));
+
+            ARMRI84* spAdj
+               = addToSp == 256 ? ARMRI84_I84(64, 15) // 64 `ror` (15 * 2)
+                                : ARMRI84_I84(addToSp, 0);
+            addInstr(env, ARMInstr_Alu(ARMalu_ADD, sp, sp, spAdj));
            return;
          }
          default:
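[Editorial note, not part of the patch.] Seen from the C side, a helper called through this path is an ordinary function: the VECRET slot becomes a hidden first argument (carried in R0 by the generated code) pointing at a 16-byte buffer inside the caller's 256-byte frame, and the iselStmt changes above reload that buffer into a Q register before popping the frame. A hedged sketch of the shape involved; the helper name and body are invented:

   #include <assert.h>
   #include <stdint.h>
   #include <string.h>

   typedef struct { uint32_t w32[4]; } V128;  /* illustrative, not the VEX type */

   /* A V128-returning helper with five Ity_I32 args: result pointer first,
      then the real args (R1..R3 and the stack, in the generated code). */
   static void example_crypto_helper ( /*OUT*/V128* res,
                                       uint32_t a0, uint32_t a1,
                                       uint32_t a2, uint32_t a3, uint32_t a4 )
   {
      res->w32[0] = a0 ^ a4;
      res->w32[1] = a1;
      res->w32[2] = a2;
      res->w32[3] = a3;
   }

   int main ( void )
   {
      /* Caller side, mirroring the generated code: a 256-byte frame with
         the return buffer at byte offset 128 (rloc.spOff). */
      union { uint32_t w[64]; V128 v[16]; } frame;
      V128* retbuf = &frame.v[8];              /* frame base + 128 bytes */
      example_crypto_helper(retbuf, 1, 2, 3, 4, 5);

      /* After the call, the Ity_V128 case loads 16 bytes from SP + 128
         into a Q register (here just copied), then adds 256 back to SP. */
      V128 result;
      memcpy(&result, retbuf, sizeof result);
      assert(result.w32[0] == (1u ^ 5u));
      assert(result.w32[3] == 4u);
      return 0;
   }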