}
}
+/* op rm/mrm, i
+**
+** Emit a group op (xg) with the immediate i against the operand rb. The
+** immediate is written first, then the opcode and ModRM via emit_mrm().
+** The low 3 bits of xg are passed as the reg operand; a REX_64 flag set in
+** rb is moved from the rm operand over to the reg operand.
+** NOTE(review): machine code appears to be generated backwards (emit_sjmp
+** writes below as->mcp), which would explain the emit order -- confirm.
+*/
+static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i)
+{
+ x86Op xo;
+ if (checki8(i)) { /* Immediate passes checki8(): use the imm8 opcode form. */
+ emit_i8(as, i);
+ xo = XG_TOXOi8(xg);
+ } else { /* Otherwise use the imm32 opcode form. */
+ emit_i32(as, i);
+ xo = XG_TOXOi(xg);
+ }
+ emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64));
+}
+
/* -- Emit moves ---------------------------------------------------------- */
/* mov [base+ofs], i */
/* mov r, i / xor r, r */
static void emit_loadi(ASMState *as, Reg r, int32_t i)
{
- if (i == 0) {
+ /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
+ if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP ||
+ (as->curins+1 < as->T->nins &&
+ IR(as->curins+1)->o == IR_HIOP)))) {
emit_rr(as, XO_ARITH(XOg_XOR), r, r);
} else {
MCode *p = as->mcp;
/* Label for short jumps. */
typedef MCode *MCLabel;
+#if LJ_32 && LJ_HASFFI
+/* jmp short target
+**
+** Writes a two-byte short jump (XI_JMPs followed by an 8 bit displacement)
+** directly below as->mcp and moves as->mcp down by two bytes. The
+** displacement is target minus the old as->mcp and must fit into an int8_t
+** (asserted).
+*/
+static void emit_sjmp(ASMState *as, MCLabel target)
+{
+ MCode *p = as->mcp;
+ ptrdiff_t delta = target - p; /* Distance from the current mcp to target. */
+ lua_assert(delta == (int8_t)delta);
+ p[-1] = (MCode)(int8_t)delta; /* Displacement byte. */
+ p[-2] = XI_JMPs; /* Opcode byte sits right before the displacement. */
+ as->mcp = p - 2;
+}
+#endif
+
/* jcc short target */
static void emit_sjcc(ASMState *as, int cc, MCLabel target)
{
} else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */
lua_assert(irt_isnil(ir->t));
emit_getgl(as, r, jit_L);
-#if LJ_64 /* NYI: 32 bit register pairs. */
+#if LJ_64
} else if (ir->o == IR_KINT64) {
emit_loadu64(as, r, ir_kint64(ir)->u64);
#endif
#if LJ_64
#define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
#else
-/* NYI: 32 bit register pairs. */
-#define REX_64IR(ir, r) check_exp(!irt_is64((ir)->t), (r))
+#define REX_64IR(ir, r) (r)
#endif
/* Generic move between two regs. */
emit_loadn(as, dest, tv);
return;
}
-#if LJ_64 /* NYI: 32 bit register pairs. */
+#if LJ_64
} else if (ir->o == IR_KINT64) {
emit_loadu64(as, dest, ir_kint64(ir)->u64);
return;
#endif
if (r) { /* Argument is in a register. */
if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
-#if LJ_64 /* NYI: 32 bit register pairs. */
+#if LJ_64
if (ir->o == IR_KINT64)
emit_loadu64(as, r, ir_kint64(ir)->u64);
else
ra_evictset(as, drop); /* Evictions must be performed first. */
if (ra_used(ir)) {
if (irt_isfp(ir->t)) {
- int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
+ int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
#if LJ_64
if ((ci->flags & CCI_CASTU64)) {
Reg dest = ir->r;
int stfp = (st == IRT_NUM || st == IRT_FLOAT);
IRRef lref = ir->op1;
lua_assert(irt_type(ir->t) != st);
+ lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */
if (irt_isfp(ir->t)) {
Reg dest = ra_dest(as, ir, RSET_FPR);
if (stfp) { /* FP to FP conversion. */
Reg left = asm_fuseload(as, lref, RSET_FPR);
emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
if (left == dest) return; /* Avoid the XO_XORPS. */
-#if LJ_32
- } else if (st >= IRT_U32) {
- /* NYI: 64 bit integer or uint32_t to number conversion. */
- setintV(&as->J->errinfo, ir->o);
- lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
+ } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */
+ /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
+ cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
+ Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
+ if (irt_isfloat(ir->t))
+ emit_rr(as, XO_CVTSD2SS, dest, dest);
+ emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
+ emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
+ emit_loadn(as, bias, k);
+ emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
return;
-#endif
} else { /* Integer to FP conversion. */
Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
ra_alloc1(as, lref, RSET_GPR) :
emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
} else if (stfp) { /* FP to integer conversion. */
if (irt_isguard(ir->t)) {
- lua_assert(!irt_is64(ir->t)); /* No support for checked 64 bit conv. */
+ /* Checked conversions are only supported from number to int. */
+ lua_assert(irt_isint(ir->t) && st == IRT_NUM);
asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
-#if LJ_32
- } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) {
- /* NYI: number to 64 bit integer or uint32_t conversion. */
- setintV(&as->J->errinfo, ir->o);
- lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#endif
} else {
Reg dest = ra_dest(as, ir, RSET_GPR);
x86Op op = st == IRT_NUM ?
((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
- if (LJ_64 && irt_isu64(ir->t)) {
- const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000));
- MCLabel l_end = emit_label(as);
- Reg left = IR(lref)->r;
+ if (LJ_32 && irt_isu32(ir->t)) { /* FP to U32 conversion on x86. */
+ /* u32 = (int32_t)(number - 2^31) + 2^31 */
+ Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
+ ra_scratch(as, RSET_FPR);
+ emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
+ emit_rr(as, op, dest, tmp);
+ if (st == IRT_NUM)
+ emit_rma(as, XO_ADDSD, tmp,
+ lj_ir_k64_find(as->J, U64x(c1e00000,00000000)));
+ else
+ emit_rma(as, XO_ADDSS, tmp,
+ lj_ir_k64_find(as->J, U64x(00000000,cf000000)));
+ ra_left(as, tmp, lref);
+ } else if (LJ_64 && irt_isu64(ir->t)) {
/* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
- if (ra_hasreg(left)) {
- Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left));
- emit_rr(as, op, dest|REX_64, tmpn);
- emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left);
- emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k);
- } else {
- left = ra_allocref(as, lref, RSET_FPR);
- emit_rr(as, op, dest|REX_64, left);
- emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k);
- }
+ Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
+ ra_scratch(as, RSET_FPR);
+ MCLabel l_end = emit_label(as);
+ emit_rr(as, op, dest|REX_64, tmp);
+ if (st == IRT_NUM)
+ emit_rma(as, XO_ADDSD, tmp,
+ lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+ else
+ emit_rma(as, XO_ADDSS, tmp,
+ lj_ir_k64_find(as->J, U64x(00000000,df800000)));
emit_sjcc(as, CC_NS, l_end);
emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */
- emit_rr(as, op, dest|REX_64, left);
+ emit_rr(as, op, dest|REX_64, tmp);
+ ra_left(as, tmp, lref);
} else {
Reg left = asm_fuseload(as, lref, RSET_FPR);
if (LJ_64 && irt_isu32(ir->t))
- emit_rr(as, XO_MOV, dest, dest); /* Zero upper 32 bits. */
+ emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */
emit_mrm(as, op,
dest|((LJ_64 &&
(irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
emit_mrm(as, op, dest, left);
}
} else { /* 32/64 bit integer conversions. */
- if (irt_is64(ir->t)) {
-#if LJ_32
- /* NYI: conversion to 64 bit integers. */
- setintV(&as->J->errinfo, ir->o);
- lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#else
+ if (LJ_32) { /* Only need to handle 32/32 bit no-op (cast) on x86. */
+ Reg dest = ra_dest(as, ir, RSET_GPR);
+ ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
+ } else if (irt_is64(ir->t)) {
Reg dest = ra_dest(as, ir, RSET_GPR);
if (st64 || !(ir->op2 & IRCONV_SEXT)) {
/* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
Reg left = asm_fuseload(as, lref, RSET_GPR);
emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
}
-#endif
} else {
Reg dest = ra_dest(as, ir, RSET_GPR);
if (st64) {
-#if LJ_32
- /* NYI: conversion from 64 bit integers. */
- setintV(&as->J->errinfo, ir->o);
- lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
-#else
Reg left = asm_fuseload(as, lref, RSET_GPR);
- /* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits
- ** or a load of the lower 32 bits from a 64 bit address.
+ /* This is either a 32 bit reg/reg mov which zeroes the hiword
+ ** or a load of the loword from a 64 bit address.
*/
emit_mrm(as, XO_MOV, dest, left);
-#endif
} else { /* 32/32 bit no-op (cast). */
ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
}
}
}
+#if LJ_32 && LJ_HASFFI
+/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
+
+/* 64 bit integer to FP conversion in 32 bit mode.
+**
+** ir is the hiword instruction (hiword input in op1); ir-1 holds the loword
+** input in op1 and the conversion mode in op2. The emitted code stores both
+** words to [esp+0]/[esp+4], loads them with FILDq, stores the FP result to
+** the slot at ofs and, if a destination register is assigned, loads that
+** register from the same slot.
+** NOTE(review): the emit calls below read in the reverse of that order;
+** machine code appears to be generated backwards (cf. emit_sjmp) -- confirm.
+*/
+static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
+{
+ Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
+ Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
+ int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
+ Reg dest = ir->r;
+ if (ra_hasreg(dest)) { /* Result has a register: load it from the slot. */
+ ra_free(as, dest);
+ ra_modified(as, dest);
+ emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
+ dest, RID_ESP, ofs);
+ }
+ emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
+ irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
+ if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
+ /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
+ MCLabel l_end = emit_label(as);
+ emit_rma(as, XO_FADDq, XOg_FADDq,
+ lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
+ emit_sjcc(as, CC_NS, l_end);
+ emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */
+ } else { /* The only other source type accepted here is I64. */
+ lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
+ }
+ emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
+ /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
+ emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
+ emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
+}
+
+/* FP to 64 bit integer conversion in 32 bit mode.
+**
+** ir is the hiword instruction and receives the hiword result; ir-1 is the
+** CONV carrying the source/destination types in op2 and receives the loword
+** result. Only truncating conversions from NUM/FLOAT to I64/U64 are
+** accepted (asserted). With JIT_F_SSE3 the store uses FISTTPq; without it
+** the x87 control word is saved, modified and reloaded around FISTPq.
+** NOTE(review): the emit calls below appear to be in reverse execution
+** order (machine code seems to be generated backwards) -- confirm.
+*/
+static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
+{
+ IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
+ IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
+ Reg lo, hi;
+ lua_assert(st == IRT_NUM || st == IRT_FLOAT);
+ lua_assert(dt == IRT_I64 || dt == IRT_U64);
+ lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
+ hi = ra_dest(as, ir, RSET_GPR);
+ lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
+ if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
+ /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
+ if (!(as->flags & JIT_F_SSE3)) { /* Set FPU rounding mode to default. */
+ emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
+ emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
+ emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff); /* Clears the 0xc00 bits. */
+ }
+ if (dt == IRT_U64) {
+ /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
+ MCLabel l_pop, l_end = emit_label(as);
+ emit_x87op(as, XI_FPOP);
+ l_pop = emit_label(as);
+ emit_sjmp(as, l_end);
+ emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
+ if ((as->flags & JIT_F_SSE3))
+ emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
+ else
+ emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
+ emit_rma(as, XO_FADDq, XOg_FADDq,
+ lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+ emit_sjcc(as, CC_NS, l_pop);
+ emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */
+ }
+ emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
+ if ((as->flags & JIT_F_SSE3)) { /* Truncation is easy with SSE3. */
+ emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
+ } else { /* Otherwise set FPU rounding mode to truncate before the store. */
+ emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
+ emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
+ emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
+ emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
+ emit_loadi(as, lo, 0xc00); /* Bits ORed with the saved control word. */
+ emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
+ }
+ if (dt == IRT_U64)
+ emit_x87op(as, XI_FDUP);
+ emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
+ st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
+ asm_fuseload(as, ir->op1, RSET_EMPTY));
+}
+#endif
+
static void asm_strto(ASMState *as, IRIns *ir)
{
/* Force a spill slot for the destination register (if any). */
ra_left(as, RID_EAX, ir->op2);
}
+#if LJ_64 && LJ_HASFFI
+static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
+{ /* Assemble a two-operand op as a call to the C helper selected by id. */
+ const CCallInfo *ci = &lj_ir_callinfo[id]; /* Call info for the helper. */
+ IRRef args[2];
+ args[0] = ir->op1; /* Both IR operands are passed as call arguments. */
+ args[1] = ir->op2;
+ asm_setupresult(as, ir, ci);
+ asm_gencall(as, ci, args);
+}
+#endif
+
/* Find out whether swapping operands might be beneficial. */
static int swapops(ASMState *as, IRIns *ir)
{
/* -- Comparisons --------------------------------------------------------- */
/* Virtual flags for unordered FP comparisons. */
-#define VCC_U 0x100 /* Unordered. */
-#define VCC_P 0x200 /* Needs extra CC_P branch. */
-#define VCC_S 0x400 /* Swap avoids CC_P branch. */
+#define VCC_U 0x1000 /* Unordered. */
+#define VCC_P 0x2000 /* Needs extra CC_P branch. */
+#define VCC_S 0x4000 /* Swap avoids CC_P branch. */
#define VCC_PS (VCC_P|VCC_S)
-static void asm_comp_(ASMState *as, IRIns *ir, int cc)
+/* Map of comparisons to flags. ORDER IR.
+**
+** The table is indexed by IR opcode. COMPFLAGS packs one entry as follows:
+**   bits 0-3:  ci  (column "signed")
+**   bits 4-7:  cu  (column "unsigned")
+**   bits 8-11: cin (column "non-eq")
+**   bits 12+:  cf  (VCC_U/VCC_P/VCC_S flags)
+** Note that the macro arguments are (ci, cin, cu, cf), i.e. the argument
+** order differs from the bit order. This layout assumes that every CC_*
+** value fits into 4 bits -- TODO confirm against the CC_* definitions.
+*/
+#define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf))
+static const uint16_t asm_compmap[IR_ABC+1] = {
+ /* signed non-eq unsigned flags */
+ /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS),
+ /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0),
+ /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS),
+ /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0),
+ /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U),
+ /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS),
+ /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U),
+ /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS),
+ /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
+ /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P),
+ /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */
+};
+
+/* FP and integer comparisons. */
+static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
{
if (irt_isnum(ir->t)) {
IRRef lref = ir->op1;
if (irl+1 == ir) /* Referencing previous ins? */
as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */
} else {
- x86Op xo;
- if (checki8(imm)) {
- emit_i8(as, imm);
- xo = XO_ARITHi8;
- } else {
- emit_i32(as, imm);
- xo = XO_ARITHi;
- }
- emit_mrm(as, xo, r64 + XOg_CMP, left);
+ emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
}
}
} else {
}
}
-#define asm_comp(as, ir, ci, cf, cu) \
- asm_comp_(as, ir, (ci)+((cf)<<4)+(cu))
+#if LJ_32 && LJ_HASFFI
+/* 64 bit integer comparisons in 32 bit mode.
+**
+** ir is the hiword instruction; ir-1 is the loword comparison whose opcode
+** selects the condition codes from asm_compmap. Operands are allocated or
+** fused for the hiword first, then for the loword. The as->mrm state of the
+** hiword fusion is saved and restored before the hiword compare is emitted,
+** since the loword fusion may overwrite it.
+*/
+static void asm_comp_int64(ASMState *as, IRIns *ir)
+{
+ uint32_t cc = asm_compmap[(ir-1)->o]; /* Flags keyed by the loword opcode. */
+ RegSet allow = RSET_GPR;
+ Reg lefthi = RID_NONE, leftlo = RID_NONE;
+ Reg righthi = RID_NONE, rightlo = RID_NONE;
+ MCLabel l_around;
+ x86ModRM mrm;
+
+ as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */
+
+ /* Allocate/fuse hiword operands. */
+ if (irref_isk(ir->op2)) { /* Constant right operand: compare with immediate. */
+ lefthi = asm_fuseload(as, ir->op1, allow);
+ } else {
+ lefthi = ra_alloc1(as, ir->op1, allow);
+ righthi = asm_fuseload(as, ir->op2, allow);
+ if (righthi == RID_MRM) { /* Keep the fused address registers reserved. */
+ if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
+ if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
+ } else {
+ rset_clear(allow, righthi);
+ }
+ }
+ mrm = as->mrm; /* Save state for hiword instruction. */
+
+ /* Allocate/fuse loword operands. */
+ if (irref_isk((ir-1)->op2)) {
+ leftlo = asm_fuseload(as, (ir-1)->op1, allow);
+ } else {
+ leftlo = ra_alloc1(as, (ir-1)->op1, allow);
+ rightlo = asm_fuseload(as, (ir-1)->op2, allow);
+ if (rightlo == RID_MRM) {
+ if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
+ if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
+ } else {
+ rset_clear(allow, rightlo);
+ }
+ }
+
+ /* All register allocations must be performed _before_ this point. */
+ l_around = emit_label(as);
+ as->invmcp = as->testmcp = NULL; /* Cannot use these optimizations. */
+
+ /* Loword comparison and branch. */
+ asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */
+ if (ra_noreg(rightlo)) { /* No right register: the operand is a constant. */
+ int32_t imm = IR((ir-1)->op2)->i;
+ if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
+ emit_rr(as, XO_TEST, leftlo, leftlo);
+ else
+ emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
+ } else {
+ emit_mrm(as, XO_CMP, leftlo, rightlo);
+ }
+
+ /* Hiword comparison and branches. */
+ if ((cc & 15) != CC_NE)
+ emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. */
+ if ((cc & 15) != CC_E)
+ asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */
+ as->mrm = mrm; /* Restore state. */
+ if (ra_noreg(righthi)) {
+ int32_t imm = IR(ir->op2)->i;
+ if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
+ emit_rr(as, XO_TEST, lefthi, lefthi);
+ else
+ emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
+ } else {
+ emit_mrm(as, XO_CMP, lefthi, righthi);
+ }
+}
+#endif
+
+/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
+
+/* Hiword op of a split 64 bit op. Previous op must be the loword op.
+**
+** Dispatches on the opcode of ir-1: CONV goes to the 64 bit conversion
+** helpers, opcodes up to IR_NE go to asm_comp_int64(), and ADD, SUB, NEG and
+** CALLN are handled inline. Any other opcode trips an assertion. When not
+** built with LJ_32 && LJ_HASFFI the function only asserts.
+*/
+static void asm_hiop(ASMState *as, IRIns *ir)
+{
+#if LJ_32 && LJ_HASFFI
+ /* HIOP is marked as a store because it needs its own DCE logic. */
+ int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
+ if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
+ if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
+ if (usehi || uselo) {
+ if (irt_isfp(ir->t)) /* FP result type: 64 bit integer to FP. */
+ asm_conv_fp_int64(as, ir);
+ else
+ asm_conv_int64_fp(as, ir);
+ }
+ as->curins--; /* Always skip the CONV. */
+ return;
+ } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
+ asm_comp_int64(as, ir);
+ return;
+ }
+ if (!usehi) return; /* Skip unused hiword op for all remaining ops. */
+ switch ((ir-1)->o) {
+ case IR_ADD: /* ADC if the loword add is emitted too, plain ADD otherwise. */
+ asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD);
+ break;
+ case IR_SUB: /* SBB if the loword sub is emitted too, plain SUB otherwise. */
+ asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB);
+ break;
+ case IR_NEG: {
+ Reg dest = ra_dest(as, ir, RSET_GPR);
+ emit_rr(as, XO_GROUP3, XOg_NEG, dest);
+ if (uselo) { /* Emits an ADC dest, 0 in addition to the NEG. */
+ emit_i8(as, 0);
+ emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
+ }
+ ra_left(as, dest, ir->op1);
+ break;
+ }
+ case IR_CALLN: /* Hiword of the call result is taken from RID_RETHI. */
+ ra_destreg(as, ir, RID_RETHI);
+ if (!uselo)
+ ra_allocref(as, ir->op1, RID2RSET(RID_RET)); /* Mark call as used. */
+ break;
+ default: lua_assert(0); break;
+ }
+#else
+ UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */
+#endif
+}
/* -- Stack handling ------------------------------------------------------ */
switch ((IROp)ir->o) {
/* Miscellaneous ops. */
case IR_LOOP: asm_loop(as); break;
- case IR_NOP: break;
+ case IR_NOP: lua_assert(!ra_used(ir)); break;
case IR_PHI: asm_phi(as, ir); break;
+ case IR_HIOP: asm_hiop(as, ir); break;
/* Guarded assertions. */
- case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break;
- case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break;
- case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break;
- case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break;
- case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break;
- case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
- case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
- case IR_ABC:
- case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
- case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
- case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;
+ case IR_LT: case IR_GE: case IR_LE: case IR_GT:
+ case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
+ case IR_EQ: case IR_NE: case IR_ABC:
+ asm_comp(as, ir, asm_compmap[ir->o]);
+ break;
case IR_RETF: asm_retf(as, ir); break;
case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
asm_fpmath(as, ir);
break;
- case IR_POWI: asm_powi(as, ir); break;
+ case IR_POWI:
+#if LJ_64 && LJ_HASFFI
+ if (!irt_isnum(ir->t))
+ asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+ IRCALL_lj_carith_powu64);
+ else
+#endif
+ asm_powi(as, ir);
+ break;
/* Overflow-checking arithmetic ops. Note: don't use LEA here! */
case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
{
for (as->curins--; as->curins > as->stopins; as->curins--) {
IRIns *ir = IR(as->curins);
+ lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */
if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
continue; /* Dead-code elimination can be soooo easy. */
if (irt_isguard(ir->t))
case IR_CALLN: case IR_CALLL: case IR_CALLS: {
const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
#if LJ_64
- /* NYI: add stack slots for x64 calls with many args. */
lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6));
ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
#else
- /* NYI: not fastcall-aware, but doesn't matter (yet). */
+ lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2);
if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */
as->evenspill = (int32_t)CCI_NARGS(ci);
ir->prev = REGSP_HINT(RID_RET);
(RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
continue;
}
+#if LJ_32 && LJ_HASFFI
+ case IR_HIOP:
+ if ((ir-1)->o == IR_CALLN)
+ ir->prev = REGSP_HINT(RID_RETHI);
+ break;
+#endif
/* C calls evict all scratch regs and return results in RID_RET. */
case IR_SNEW: case IR_NEWREF:
#if !LJ_64
as->modset = RSET_SCRATCH;
break;
case IR_POWI:
+#if LJ_64 && LJ_HASFFI
+ if (!irt_isnum(ir->t)) {
+ ir->prev = REGSP_HINT(RID_RET);
+ if (inloop)
+ as->modset |= (RSET_SCRATCH & RSET_GPR);
+ continue;
+ }
+#endif
ir->prev = REGSP_HINT(RID_XMM0);
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
--- /dev/null
+/*
+** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
+** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_split_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT && LJ_HASFFI && LJ_32
+
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_vm.h"
+
+/* SPLIT pass:
+**
+** This pass splits up 64 bit IR instructions into multiple 32 bit IR
+** instructions. It's only active for 32 bit CPUs which lack native 64 bit
+** operations. The FFI is currently the only emitter for 64 bit
+** instructions, so this pass is disabled if the FFI is disabled.
+**
+** Splitting the IR in a separate pass keeps each 32 bit IR assembler
+** backend simple. Only a small amount of extra functionality needs to be
+** implemented. This is much easier than adding support for allocating
+** register pairs to each backend (believe me, I tried). A few simple, but
+** important optimizations can be performed by the SPLIT pass, which would
+** be tedious to do in the backend.
+**
+** The basic idea is to replace each 64 bit IR instruction with its 32 bit
+** equivalent plus an extra HIOP instruction. The split IR is not passed
+** through FOLD or any other optimizations, so each HIOP is guaranteed to
+** immediately follow its counterpart. The actual functionality of HIOP is
+** inferred from the previous instruction.
+**
+** The operands of HIOP hold the hiword input references. The output of HIOP
+** is the hiword output reference, which is also used to hold the hiword
+** register or spill slot information. The register allocator treats this
+** instruction independently of any other instruction, which improves code
+** quality compared to using fixed register pairs.
+**
+** It's easier to split up some instructions into two regular 32 bit
+** instructions. E.g. XLOAD is split up into two XLOADs with two different
+** addresses. Obviously 64 bit constants need to be split up into two 32 bit
+** constants, too. Some hiword instructions can be entirely omitted, e.g.
+** when zero-extending a 32 bit value to 64 bits.
+**
+** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
+** two int64_t fields:
+**
+** 0100 p32 ADD base +8
+** 0101 i64 XLOAD 0100
+** 0102 i64 ADD 0101 +1
+** 0103 p32 ADD base +16
+** 0104 i64 XSTORE 0103 0102
+**
+** mov rax, [esi+0x8]
+** add rax, +0x01
+** mov [esi+0x10], rax
+**
+** Here's the transformed IR and the x86 machine code after the SPLIT pass:
+**
+** 0100 p32 ADD base +8
+** 0101 int XLOAD 0100
+** 0102 p32 ADD base +12
+** 0103 int XLOAD 0102
+** 0104 int ADD 0101 +1
+** 0105 int HIOP 0103 +0
+** 0106 p32 ADD base +16
+** 0107 int XSTORE 0106 0104
+** 0108 p32 ADD base +20
+** 0109 int XSTORE 0108 0105
+**
+** mov eax, [esi+0x8]
+** mov ecx, [esi+0xc]
+** add eax, +0x01
+** adc ecx, +0x00
+** mov [esi+0x10], eax
+** mov [esi+0x14], ecx
+**
+** You may notice the reassociated hiword address computation, which is
+** later fused into the mov operands by the assembler.
+*/
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref) (&J->cur.ir[(ref)])
+
+/* Directly emit the transformed IR without updating chains etc.
+**
+** Fetches the next instruction reference via lj_ir_nextins(), fills in the
+** ot, op1 and op2 fields and returns the new reference. The prev field of
+** the new instruction is not written here.
+*/
+static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2)
+{
+ IRRef nref = lj_ir_nextins(J);
+ IRIns *ir = IR(nref); /* Look up only after nextins returned the ref. */
+ ir->ot = ot;
+ ir->op1 = op1;
+ ir->op2 = op2;
+ return nref;
+}
+
+/* Emit a CALLN with two split 64 bit arguments.
+**
+** Drops the most recently emitted instruction (J->cur.nins--), then emits a
+** chain of three CARG instructions carrying the loword/hiword refs of op1
+** and op2 (word order depends on LJ_LE), a CALLN for the helper id and a
+** trailing HIOP. The CALLN ref is stored in ir->prev as loword substitution;
+** the HIOP ref is returned as hiword substitution.
+** NOTE(review): the dropped instruction is presumably the copy of ir that
+** split_ir allocated just before calling this -- confirm.
+*/
+static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
+ IRIns *ir, IRCallID id)
+{
+ IRRef tmp, op1 = ir->op1, op2 = ir->op2;
+ J->cur.nins--;
+#if LJ_LE
+ tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
+ tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
+ tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
+#else
+ tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
+ tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
+ tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
+#endif
+ ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
+ return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
+}
+
+/* Get a pointer to the other 32 bit word (LE: hiword, BE: loword).
+**
+** Returns a reference to ref+4. If ref is itself an ADD with a constant
+** op2, that constant is folded into the offset and the ADD's op1 is used as
+** the base; if the combined offset is zero the base is returned directly
+** and no new instruction is emitted.
+*/
+static IRRef split_ptr(jit_State *J, IRRef ref)
+{
+ IRIns *ir = IR(ref);
+ int32_t ofs = 4; /* Byte distance between the two 32 bit words. */
+ if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */
+ ofs += IR(ir->op2)->i;
+ ref = ir->op1;
+ if (ofs == 0) return ref;
+ }
+ return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs));
+}
+
+/* Transform the old IR to the new IR.
+**
+** The old instructions are copied to a temporary buffer (oir), the trace's
+** instruction count is reset to REF_FIRST and every old instruction is then
+** re-emitted. For each old ref, oir[ref].prev holds the loword substitution
+** and hisubst[ref] the hiword substitution. Finally the PHI marks are
+** re-added and all snapshot refs are rewritten via the loword substitution.
+**
+** Any call that may add an IR constant or instruction (lj_ir_kint,
+** split_emit, ...) may move the IR buffer, so nir must be refreshed with
+** IR(nref) before it is dereferenced again.
+*/
+static void split_ir(jit_State *J)
+{
+ IRRef nins = J->cur.nins, nk = J->cur.nk;
+ MSize irlen = nins - nk;
+ MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
+ IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
+ IRRef1 *hisubst;
+ IRRef ref;
+
+ /* Copy old IR to buffer. */
+ memcpy(oir, IR(nk), irlen*sizeof(IRIns));
+ /* Bias hiword substitution table and old IR. Loword kept in field prev. */
+ hisubst = (IRRef1 *)&oir[irlen] - nk;
+ oir -= nk;
+
+ /* Remove all IR instructions, but retain IR constants. */
+ J->cur.nins = REF_FIRST;
+
+ /* Process constants and fixed references. */
+ for (ref = nk; ref <= REF_BASE; ref++) {
+ IRIns *ir = &oir[ref];
+ if (ir->o == IR_KINT64) { /* Split up 64 bit constant. */
+ TValue tv = *ir_k64(ir);
+ ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
+ hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
+ } else {
+ ir->prev = (IRRef1)ref; /* Identity substitution for loword. */
+ }
+ }
+
+ /* Process old IR instructions. */
+ for (ref = REF_FIRST; ref < nins; ref++) {
+ IRIns *ir = &oir[ref];
+ IRRef nref = lj_ir_nextins(J);
+ IRIns *nir = IR(nref);
+
+ /* Copy-substitute old instruction to new instruction. */
+ nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
+ nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
+ ir->prev = nref; /* Loword substitution. */
+ nir->o = ir->o;
+ nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);
+
+ /* Split 64 bit instructions. */
+ if (irt_isint64(ir->t)) {
+ IRRef hi = hisubst[ir->op1];
+ nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD); /* Turn into INT op. */
+ switch (ir->o) {
+ case IR_ADD:
+ case IR_SUB:
+ /* Use plain op for hiword if loword cannot produce a carry/borrow. */
+ if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
+ ir->prev = nir->op1; /* Pass through loword. */
+ nir->op1 = hi; nir->op2 = hisubst[ir->op2];
+ hi = nref;
+ break;
+ }
+ /* fallthrough */
+ case IR_NEG:
+ hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
+ break;
+ case IR_MUL:
+ hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
+ break;
+ case IR_POWI:
+ hi = split_call64(J, hisubst, oir, ir,
+ irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+ IRCALL_lj_carith_powu64);
+ break;
+ case IR_XLOAD:
+ hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
+#if LJ_BE
+ ir->prev = hi; hi = nref;
+#endif
+ break;
+ case IR_XSTORE:
+#if LJ_LE
+ hi = hisubst[ir->op2];
+#else
+ hi = nir->op2; nir->op2 = hisubst[ir->op2];
+#endif
+ split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
+ continue;
+ case IR_CONV: { /* Conversion to 64 bit integer. Others handled below. */
+ IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+ if (st == IRT_NUM || st == IRT_FLOAT) { /* FP to 64 bit int conv. */
+ hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
+ } else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */
+ /* Drop cast, since assembler doesn't care. */
+ hisubst[ref] = hi;
+ goto fwdlo;
+ } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */
+ IRRef k31 = lj_ir_kint(J, 31);
+ nir = IR(nref); /* May have been reallocated. */
+ ir->prev = nir->op1; /* Pass through loword. */
+ nir->o = IR_BSAR; /* hi = bsar(lo, 31). */
+ nir->op2 = k31;
+ hi = nref;
+ } else { /* Zero-extend to 64 bit. */
+ hisubst[ref] = lj_ir_kint(J, 0);
+ nir = IR(nref); /* May have been reallocated; fwdlo dereferences it. */
+ goto fwdlo;
+ }
+ break;
+ }
+ case IR_PHI: {
+ IRRef hi2;
+ if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
+ nir->op1 == nir->op2)
+ J->cur.nins--; /* Drop useless PHIs. */
+ hi2 = hisubst[ir->op2];
+ if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
+ split_emit(J, IRTI(IR_PHI), hi, hi2);
+ continue;
+ }
+ default:
+ lua_assert(ir->o <= IR_NE);
+ split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]); /* Comparisons. */
+ continue;
+ }
+ hisubst[ref] = hi; /* Store hiword substitution. */
+ } else if (ir->o == IR_CONV) { /* See above, too. */
+ IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+ if (st == IRT_I64 || st == IRT_U64) { /* Conversion from 64 bit int. */
+ if (irt_isfp(ir->t)) { /* 64 bit integer to FP conversion. */
+ ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
+ hisubst[ir->op1], nref);
+ } else { /* Truncate to lower 32 bits. */
+ fwdlo:
+ ir->prev = nir->op1; /* Forward loword. */
+ /* Replace with NOP to avoid messing up the snapshot logic. */
+ nir->ot = IRT(IR_NOP, IRT_NIL);
+ nir->op1 = nir->op2 = 0;
+ }
+ }
+ } else if (ir->o == IR_LOOP) {
+ J->loopref = nref; /* Needed by assembler. */
+ }
+ }
+
+ /* Add PHI marks. */
+ for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
+ IRIns *ir = IR(ref);
+ if (ir->o != IR_PHI) break; /* PHIs are scanned from the end of the IR. */
+ if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
+ if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
+ }
+
+ /* Substitute snapshot maps. */
+ oir[nins].prev = J->cur.nins; /* Substitution for last snapshot. */
+ {
+ SnapNo i, nsnap = J->cur.nsnap;
+ for (i = 0; i < nsnap; i++) {
+ SnapShot *snap = &J->cur.snap[i];
+ SnapEntry *map = &J->cur.snapmap[snap->mapofs];
+ MSize n, nent = snap->nent;
+ snap->ref = oir[snap->ref].prev;
+ for (n = 0; n < nent; n++) {
+ SnapEntry sn = map[n];
+ map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev); /* Keep hi 16 bits. */
+ }
+ }
+ }
+}
+
+/* Protected callback for split pass.
+**
+** Runs split_ir() on the jit_State passed in ud. L and dummy are unused.
+** Always returns NULL.
+*/
+static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
+{
+ jit_State *J = (jit_State *)ud;
+ split_ir(J);
+ UNUSED(L); UNUSED(dummy);
+ return NULL;
+}
+
+#ifdef LUA_USE_ASSERT
+/* Slow, but sure way to check whether a SPLIT pass is needed.
+**
+** Returns 1 if any instruction has a 64 bit integer result type or if any
+** CONV on the IR_CONV chain has an I64/U64 source type; returns 0 otherwise.
+** Only compiled with LUA_USE_ASSERT.
+*/
+static int split_needsplit(jit_State *J)
+{
+ IRIns *ir, *irend;
+ IRRef ref;
+ for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++)
+ if (irt_isint64(ir->t)) /* 64 bit integer result. */
+ return 1;
+ for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev)
+ if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 ||
+ (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64) /* 64 bit integer source. */
+ return 1;
+ return 0; /* Nope. */
+}
+#endif
+
+/* SPLIT pass.
+**
+** Entry point. Does nothing unless J->needsplit is set. Otherwise runs
+** split_ir() through lj_vm_cpcall(); if that reports an error, the trace's
+** instruction, constant and snapshot counters are reset and the error is
+** re-thrown with lj_err_throw().
+*/
+void lj_opt_split(jit_State *J)
+{
+ lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */
+ if (J->needsplit) {
+ int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit);
+ if (errcode) {
+ /* Completely reset the trace to avoid inconsistent dump on abort. */
+ J->cur.nins = J->cur.nk = REF_BASE;
+ J->cur.nsnap = 0;
+ lj_err_throw(J->L, errcode); /* Propagate errors. */
+ }
+ }
+}
+
+#undef IR
+
+#endif