i->ARMin.VCvtSD.src = src;
return i;
}
+ARMInstr* ARMInstr_VXferQ ( Bool toQ, HReg qD, HReg dHi, HReg dLo ) {
+ ARMInstr* i = LibVEX_Alloc_inline(sizeof(ARMInstr));
+ i->tag = ARMin_VXferQ;
+ i->ARMin.VXferQ.toQ = toQ;
+ i->ARMin.VXferQ.qD = qD;
+ i->ARMin.VXferQ.dHi = dHi;
+ i->ARMin.VXferQ.dLo = dLo;
+ return i;
+}
ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo ) {
ARMInstr* i = LibVEX_Alloc_inline(sizeof(ARMInstr));
i->tag = ARMin_VXferD;
vex_printf(", ");
ppHRegARM(i->ARMin.VCvtSD.src);
return;
+ case ARMin_VXferQ:
+ if (i->ARMin.VXferQ.toQ) {
+ vex_printf("vmov ");
+ ppHRegARM(i->ARMin.VXferQ.qD);
+ vex_printf("-lo64, ");
+ ppHRegARM(i->ARMin.VXferQ.dLo);
+ vex_printf(" ; vmov ");
+ ppHRegARM(i->ARMin.VXferQ.qD);
+ vex_printf("-hi64, ");
+ ppHRegARM(i->ARMin.VXferQ.dHi);
+ } else {
+ vex_printf("vmov ");
+ ppHRegARM(i->ARMin.VXferQ.dLo);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferQ.qD);
+ vex_printf("-lo64");
+ vex_printf(" ; vmov ");
+ ppHRegARM(i->ARMin.VXferQ.dHi);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferQ.qD);
+ vex_printf("-hi64");
+ }
+ return;
case ARMin_VXferD:
vex_printf("vmov ");
if (i->ARMin.VXferD.toD) {
addHRegUse(u, HRmWrite, i->ARMin.VCvtSD.dst);
addHRegUse(u, HRmRead, i->ARMin.VCvtSD.src);
return;
+ case ARMin_VXferQ:
+ if (i->ARMin.VXferQ.toQ) {
+ addHRegUse(u, HRmWrite, i->ARMin.VXferQ.qD);
+ addHRegUse(u, HRmRead, i->ARMin.VXferQ.dHi);
+ addHRegUse(u, HRmRead, i->ARMin.VXferQ.dLo);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.VXferQ.qD);
+ addHRegUse(u, HRmWrite, i->ARMin.VXferQ.dHi);
+ addHRegUse(u, HRmWrite, i->ARMin.VXferQ.dLo);
+ }
+ return;
case ARMin_VXferD:
if (i->ARMin.VXferD.toD) {
addHRegUse(u, HRmWrite, i->ARMin.VXferD.dD);
i->ARMin.VCvtSD.dst = lookupHRegRemap(m, i->ARMin.VCvtSD.dst);
i->ARMin.VCvtSD.src = lookupHRegRemap(m, i->ARMin.VCvtSD.src);
return;
+ case ARMin_VXferQ:
+ i->ARMin.VXferQ.qD = lookupHRegRemap(m, i->ARMin.VXferQ.qD);
+ i->ARMin.VXferQ.dHi = lookupHRegRemap(m, i->ARMin.VXferQ.dHi);
+ i->ARMin.VXferQ.dLo = lookupHRegRemap(m, i->ARMin.VXferQ.dLo);
+ return;
case ARMin_VXferD:
i->ARMin.VXferD.dD = lookupHRegRemap(m, i->ARMin.VXferD.dD);
i->ARMin.VXferD.rHi = lookupHRegRemap(m, i->ARMin.VXferD.rHi);
goto done;
}
}
+ case ARMin_VXferQ: {
+ UInt insn;
+ UInt qD = qregEnc(i->ARMin.VXferQ.qD);
+ UInt dHi = dregEnc(i->ARMin.VXferQ.dHi);
+ UInt dLo = dregEnc(i->ARMin.VXferQ.dLo);
+ /* This is a bit tricky. We need to emit two D-to-D moves, and we
+ rely on the fact that a Q register can be treated as two D
+ registers. We also rely on the register allocator assigning the
+ two D's and the Q to disjoint parts of the register file, so that
+ the first move's destination can never be the same as the second
+ move's source, etc. The assertions below check this. */
+ /* The ARM ARM specifies that
+ D<2n> maps to the least significant half of Q<n>
+ D<2n+1> maps to the most significant half of Q<n>
+ So there are no issues with endianness here.
+ */
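+ /* For example, with qD == 7 (Q7), qDlo and qDhi below are 14 (D14)
+ and 15 (D15) respectively. */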
+ UInt qDlo = 2 * qD + 0;
+ UInt qDhi = 2 * qD + 1;
+ /* Stay sane .. */
+ vassert(qDhi != dHi && qDhi != dLo);
+ vassert(qDlo != dHi && qDlo != dLo);
+ /* vmov dX, dY is
+ F 2 (0,dX[4],1,0) dY[3:0] dX[3:0] 1 (dY[4],0,dY[4],1) dY[3:0]
+ */
+# define VMOV_D_D(_xx,_yy) \
+ XXXXXXXX( 0xF, 0x2, BITS4(0, (((_xx) >> 4) & 1), 1, 0), \
+ ((_yy) & 0xF), ((_xx) & 0xF), 0x1, \
+ BITS4( (((_yy) >> 4) & 1), 0, (((_yy) >> 4) & 1), 1), \
+ ((_yy) & 0xF) )
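+ /* Sanity-check example (not emitted): VMOV_D_D(14, 3) should give
+ 0xF223E113, that is, vmov d14, d3, encoded as vorr d14, d3, d3. */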
+ if (i->ARMin.VXferQ.toQ) {
+ insn = VMOV_D_D(qDlo, dLo); *p++ = insn;
+ insn = VMOV_D_D(qDhi, dHi); *p++ = insn;
+ } else {
+ insn = VMOV_D_D(dLo, qDlo); *p++ = insn;
+ insn = VMOV_D_D(dHi, qDhi); *p++ = insn;
+ }
+# undef VMOV_D_D
+ goto done;
+ }
case ARMin_VXferD: {
UInt dD = dregEnc(i->ARMin.VXferD.dD);
UInt rHi = iregEnc(i->ARMin.VXferD.rHi);
}
+static
+Bool doHelperCallWithArgsOnStack ( /*OUT*/UInt* stackAdjustAfterCall,
+ /*OUT*/RetLoc* retloc,
+ ISelEnv* env,
+ IRExpr* guard,
+ IRCallee* cee, IRType retTy, IRExpr** args )
+{
+ /* This function deals just with the case where the arg sequence is:
+ VECRET followed by between 4 and 12 Ity_I32 values. So far no other
+ cases are necessary or supported. */
+
+ /* Check this matches the required format. */
+ if (args[0] == NULL || args[0]->tag != Iex_VECRET)
+ goto no_match;
+
+ UInt i;
+ UInt n_real_args = 0;
+ for (i = 1; args[i]; i++) {
+ IRExpr* arg = args[i];
+ if (UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)))
+ goto no_match;
+ IRType argTy = typeOfIRExpr(env->type_env, arg);
+ if (UNLIKELY(argTy != Ity_I32))
+ goto no_match;
+ n_real_args++;
+ }
+
+ /* We expect to pass at least some args on the stack. */
+ if (n_real_args <= 3)
+ goto no_match;
+
+ /* But not too many. */
+ if (n_real_args > 12)
+ goto no_match;
+
+ /* General rules for a call:
+
+ Args 1 .. 4 go in R0 .. R3. The rest are pushed R to L on the
+ stack; that is, arg 5 is at the lowest address, arg 6 at the
+ next lowest, etc.
+
+ The stack is to be kept 8 aligned.
+
+ It appears (for unclear reasons) that the highest 3 words made
+ available when moving SP downwards are not to be used. For
+ example, if 5 args are to go on the stack, then SP must be moved
+ down 32 bytes, and the area at SP+20 .. SP+31 is not to be used
+ by the caller.
+ */
+
+ /* For this particular case, we use the following layout:
+
+ ------ original SP
+ 112 bytes
+ ------
+ return value
+ ------ original SP - 128
+ space
+ arg words, between 1 and 9
+ ------ new SP = original_SP - 256
+
+ Using 256 bytes is overkill, but it is simple and good enough.
+ */
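+ /* Concretely: r0 below is set to original_SP - 128 == new_SP + 128,
+ which is also the 128 passed to mk_RetLoc_spRel further down, and
+ the j'th stacked arg word (counting from 0) lives at new_SP + 4*j. */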
+
+ /* This should really be
+ HReg argVRegs[n_real_args];
+ but C does not allow the earlier 'goto's to jump forward past a
+ variable-length array declaration. Hence the following kludge. */
+ vassert(n_real_args <= 12);
+ HReg argVRegs[12];
+ for (i = 0; i < 12; i++)
+ argVRegs[i] = INVALID_HREG;
+
+ /* Compute args into vregs. */
+ for (i = 0; i < n_real_args; i++) {
+ argVRegs[i] = iselIntExpr_R(env, args[i+1]);
+ }
+
+ /* Now we can compute the condition. We can't do it earlier
+ because the argument computations could trash the condition
+ codes. Be a bit clever to handle the common case where the
+ guard is 1:Bit. */
+ ARMCondCode cc = ARMcc_AL;
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional -- do nothing */
+ } else {
+ goto no_match; //ATC
+ cc = iselCondCode( env, guard );
+ }
+ }
+
+ HReg r0 = hregARM_R0();
+ HReg sp = hregARM_R13();
+
+ ARMRI84* c256 = ARMRI84_I84(64, 15); // 64 `ror` (15 * 2)
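+ // (rotating 64 right by 30 bits == shifting it left by 2 == 256,
+ // which is not otherwise encodable as an 8-bit immediate)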
+
+ addInstr(env, ARMInstr_Alu(ARMalu_SUB, r0, sp, ARMRI84_I84(128, 0)));
+
+ addInstr(env, mk_iMOVds_RR(hregARM_R1(), argVRegs[0]));
+ addInstr(env, mk_iMOVds_RR(hregARM_R2(), argVRegs[1]));
+ addInstr(env, mk_iMOVds_RR(hregARM_R3(), argVRegs[2]));
+
+ addInstr(env, ARMInstr_Alu(ARMalu_SUB, sp, sp, c256));
+
+ for (i = 3; i < n_real_args; i++) {
+ addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/, argVRegs[i],
+ ARMAMode1_RI(sp, (i-3) * 4)));
+ }
+
+ vassert(*stackAdjustAfterCall == 0);
+ vassert(is_RetLoc_INVALID(*retloc));
+
+ *stackAdjustAfterCall = 256;
+ *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 128);
+
+ Addr32 target = (Addr)cee->addr;
+ addInstr(env, ARMInstr_Call( cc, target, 4, *retloc ));
+
+ return True; /* success */
+
+ no_match:
+ return False;
+}
+
+
/* Do a complete function call. |guard| is a Ity_Bit expression
indicating whether or not the call happens. If guard==NULL, the
call is unconditional. |retloc| is set to indicate where the
n_args++;
}
+ /* If there are more than 4 args, we are going to have to pass
+ some via memory. Use a different function to (possibly) deal with
+ that; dealing with it here is too complex. */
+ if (n_args > ARM_N_ARGREGS) {
+ return doHelperCallWithArgsOnStack(stackAdjustAfterCall, retloc,
+ env, guard, cee, retTy, args);
+ }
+
+ /* After this point we make no attempt to pass args on the stack,
+ and simply give up if that turns out to be necessary (which is OK,
+ because in practice it never is). Note that even with only 3 args
+ it might still be necessary to pass some of them on the stack, for
+ example if two or more of them are 64-bit integers. */
+
argregs[0] = hregARM_R0();
argregs[1] = hregARM_R1();
argregs[2] = hregARM_R2();
vassert(*stackAdjustAfterCall == 0);
vassert(is_RetLoc_INVALID(*retloc));
switch (retTy) {
- case Ity_INVALID:
- /* Function doesn't return a value. */
- *retloc = mk_RetLoc_simple(RLPri_None);
- break;
- case Ity_I64:
- *retloc = mk_RetLoc_simple(RLPri_2Int);
- break;
- case Ity_I32: case Ity_I16: case Ity_I8:
- *retloc = mk_RetLoc_simple(RLPri_Int);
- break;
- case Ity_V128:
- vassert(0); // ATC
- *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
- *stackAdjustAfterCall = 16;
- break;
- case Ity_V256:
- vassert(0); // ATC
- *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
- *stackAdjustAfterCall = 32;
- break;
- default:
- /* IR can denote other possible return types, but we don't
- handle those here. */
- vassert(0);
+ case Ity_INVALID:
+ /* Function doesn't return a value. */
+ *retloc = mk_RetLoc_simple(RLPri_None);
+ break;
+ case Ity_I64:
+ *retloc = mk_RetLoc_simple(RLPri_2Int);
+ break;
+ case Ity_I32: case Ity_I16: case Ity_I8:
+ *retloc = mk_RetLoc_simple(RLPri_Int);
+ break;
+ case Ity_V128:
+ vassert(0); // ATC
+ *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
+ *stackAdjustAfterCall = 16;
+ break;
+ case Ity_V256:
+ vassert(0); // ATC
+ *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
+ *stackAdjustAfterCall = 32;
+ break;
+ default:
+ /* IR can denote other possible return types, but we don't
+ handle those here. */
+ vassert(0);
}
/* Finally, generate the call itself. This needs the *retloc value
res, arg, 0, False));
return res;
}
+ case Iop_V128to64:
+ case Iop_V128HIto64: {
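+ /* Transfer the whole Q register into two fresh D vregs, then
+ hand back whichever half the Iop asks for; the other vreg is
+ simply left unused. */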
+ HReg src = iselNeonExpr(env, e->Iex.Unop.arg);
+ HReg resLo = newVRegD(env);
+ HReg resHi = newVRegD(env);
+ addInstr(env, ARMInstr_VXferQ(False/*!toQ*/, src, resHi, resLo));
+ return e->Iex.Unop.op == Iop_V128HIto64 ? resHi : resLo;
+ }
default:
break;
}
if (e->tag == Iex_Binop) {
switch (e->Iex.Binop.op) {
- case Iop_64HLtoV128:
+ case Iop_64HLtoV128: {
/* Try to match into single "VMOV reg, imm" instruction */
if (e->Iex.Binop.arg1->tag == Iex_Const &&
e->Iex.Binop.arg2->tag == Iex_Const &&
}
/* Does not match "VMOV Reg, Imm" form. We'll have to do
it the slow way. */
- {
- /* local scope */
- /* Done via the stack for ease of use. */
- /* FIXME: assumes little endian host */
- HReg w3, w2, w1, w0;
- HReg res = newVRegV(env);
- ARMAMode1* sp_0 = ARMAMode1_RI(hregARM_R13(), 0);
- ARMAMode1* sp_4 = ARMAMode1_RI(hregARM_R13(), 4);
- ARMAMode1* sp_8 = ARMAMode1_RI(hregARM_R13(), 8);
- ARMAMode1* sp_12 = ARMAMode1_RI(hregARM_R13(), 12);
- ARMRI84* c_16 = ARMRI84_I84(16,0);
- /* Make space for SP */
- addInstr(env, ARMInstr_Alu(ARMalu_SUB, hregARM_R13(),
- hregARM_R13(), c_16));
-
- /* Store the less significant 64 bits */
- iselInt64Expr(&w1, &w0, env, e->Iex.Binop.arg2);
- addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
- w0, sp_0));
- addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
- w1, sp_4));
-
- /* Store the more significant 64 bits */
- iselInt64Expr(&w3, &w2, env, e->Iex.Binop.arg1);
- addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
- w2, sp_8));
- addInstr(env, ARMInstr_LdSt32(ARMcc_AL, False/*store*/,
- w3, sp_12));
-
- /* Load result back from stack. */
- addInstr(env, ARMInstr_NLdStQ(True/*load*/, res,
- mkARMAModeN_R(hregARM_R13())));
-
- /* Restore SP */
- addInstr(env, ARMInstr_Alu(ARMalu_ADD, hregARM_R13(),
- hregARM_R13(), c_16));
- return res;
- } /* local scope */
- goto neon_expr_bad;
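+ /* Glue the two 64-bit (D-register) halves together with a
+ single VXferQ. */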
+ HReg dHi = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg dLo = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ HReg res = newVRegV(env);
+ addInstr(env, ARMInstr_VXferQ(True/*toQ*/, res, dHi, dLo));
+ return res;
+ }
case Iop_AndV128: {
HReg res = newVRegV(env);
HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
return dst;
}
- neon_expr_bad:
+ /* neon_expr_bad: */
ppIRExpr(e);
vpanic("iselNeonExpr_wrk");
}
switch (retty) {
case Ity_INVALID: /* function doesn't return anything */
case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
- //case Ity_V128: //ATC
+ case Ity_V128:
retty_ok = True; break;
default:
break;
call is skipped. */
UInt addToSp = 0;
RetLoc rloc = mk_RetLoc_INVALID();
- doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
+ Bool ok = doHelperCall( &addToSp, &rloc, env,
+ d->guard, d->cee, retty, d->args );
+ if (!ok) goto stmt_fail;
vassert(is_sane_RetLoc(rloc));
/* Now figure out what to do with the returned value, if any. */
return;
}
case Ity_V128: {
- vassert(0); // ATC. The code that this produces really
- // needs to be looked at, to verify correctness.
- // I don't think this can ever happen though, since the
- // ARM front end never produces 128-bit loads/stores.
- // Hence the following is mostly theoretical.
/* The returned value is on the stack, and *retloc tells
us where. Fish it off the stack and then move the
stack pointer upwards to clear it, as directed by
vassert(rloc.pri == RLPri_V128SpRel);
vassert(rloc.spOff < 256); // else ARMRI84_I84(_,0) can't encode it
vassert(addToSp >= 16);
- vassert(addToSp < 256); // ditto reason as for rloc.spOff
+ vassert(addToSp <= 256);
+ /* Both the stack delta and the offset must be at least 8-aligned.
+ If that isn't so, doHelperCall() has generated bad code. */
+ vassert(0 == (rloc.spOff % 8));
+ vassert(0 == (addToSp % 8));
HReg dst = lookupIRTemp(env, d->tmp);
HReg tmp = newVRegI(env);
- HReg r13 = hregARM_R13(); // sp
+ HReg sp = hregARM_R13();
addInstr(env, ARMInstr_Alu(ARMalu_ADD,
- tmp, r13, ARMRI84_I84(rloc.spOff,0)));
+ tmp, sp, ARMRI84_I84(rloc.spOff,0)));
ARMAModeN* am = mkARMAModeN_R(tmp);
+ /* The effective address of this load may be only 8-aligned
+ (0 % 8), since that is the best stack alignment we can be
+ assured of. */
addInstr(env, ARMInstr_NLdStQ(True/*load*/, dst, am));
- addInstr(env, ARMInstr_Alu(ARMalu_ADD,
- r13, r13, ARMRI84_I84(addToSp,0)));
+
+ ARMRI84* spAdj
+ = addToSp == 256 ? ARMRI84_I84(64, 15) // 64 `ror` (15 * 2)
+ : ARMRI84_I84(addToSp, 0);
+ addInstr(env, ARMInstr_Alu(ARMalu_ADD, sp, sp, spAdj));
return;
}
default: