From 829549ae33cf40d553e26df28e5cc1e914068c1c Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Tue, 26 Sep 2017 16:30:12 +0200 Subject: [PATCH] First attempt at an assembler for the new IfThenElse stuff. Currently this will only work for x86-linux. --- VEX/priv/host_generic_regs.h | 72 +++++ VEX/priv/host_x86_defs.c | 139 +++++++-- VEX/priv/host_x86_defs.h | 36 ++- VEX/priv/main_main.c | 572 ++++++++++++++++++++++++++++++++--- 4 files changed, 753 insertions(+), 66 deletions(-) diff --git a/VEX/priv/host_generic_regs.h b/VEX/priv/host_generic_regs.h index 4dad207bcb..add30e2e9a 100644 --- a/VEX/priv/host_generic_regs.h +++ b/VEX/priv/host_generic_regs.h @@ -555,6 +555,78 @@ extern HInstrSB* doRegisterAllocation( ); +/*---------------------------------------------------------*/ +/*--- The assembler ---*/ +/*---------------------------------------------------------*/ + +/* A struct that carries constants to the emit_*Instr functions. */ +typedef + struct { + // Are we emitting 64-bit code? Is only relevant on hosts that share + // the same backend for 32- and 64-bit variants: PPC and MIPS. + const Bool mode64; + // What's the endianness of the host? + const VexEndness endness_host; + // Host addresses for various block-exit paths + const void* disp_cp_chain_me_to_slowEP; + const void* disp_cp_chain_me_to_fastEP; + const void* disp_cp_xindir; + const void* disp_cp_xassisted; + } + EmitConstants; + + +// An offset in the assembly buffer. Wrapped up in a struct so we don't +// confuse it with any other kind of number. +//typedef struct { UInt u; } AssemblyBufferOffset; +typedef UInt AssemblyBufferOffset; // for now + + +// Information about a relocation that we will need to do. +// +// What this describes is as follows. Let |where| be an offset in the +// assembly buffer. Let |dst| be some other offset in the assembly buffer. +// +// What this describes is a modification of a 32 bit integer located at +// |where[0..3]|, interpreted in the host's endianness. The 32 bit value at +// that location has bits [bitNoMax .. bitNoMin] inclusive replaced by the +// least significant bits of the following expression +// +// E = (dst - where + sign_extend(bias)) >>signed rshift +// +// and all other bits of that 32 bit value are left unchanged. Observe that +// this description assumes E is signed. If the bits that we copy out of E +// and into the 32 bit word do not sign extend back to E, then the offset is +// too large to be represented and the relocation cannot be performed. +// +// The use of |bitNo{Max,Min}| facilitates writing an arbitrary bitfield in +// the middle of (eg) an ARM branch instruction. For amd64-style 32-bit +// branch offsets we expect the values to be 0 and 31 respectively. +// +// The use of |rshift| facilitates architectures (eg ARM) that use fixed +// length instructions, and hence require the offset to be stated in number +// of instructions instead of (eg on amd64) number of bytes. +// +// The use of |bias| is necessary because on some targets (eg amd64) the +// processor expects the branch offset to be stated relative to the first +// byte of the instruction, but |where| points somewhere further along the +// instruction. Hence we have to add a small negative bias to "back up" +// |where| to the start of the instruction. +// +// Now, we don't actually store |dst|, because we know it "from context" at +// the time we want to perform the relocation: it is the current offset in +// the assembly buffer, for the entire assembly process. 
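+//
+// A concrete example, with illustrative (made up) numbers: suppose an x86
+// "Jcc <32-bit offset>" has its opcode byte at assembly buffer offset 0x40.
+// The offset field then starts 2 bytes into the instruction, so |where| is
+// 0x42, and the next instruction starts at 0x46.  With bitNoMin = 0,
+// bitNoMax = 31, rshift = 0 and bias = -4 (this is exactly the setup that
+// collectRelocInfo_X86 produces), we get
+//
+//    E = dst - 0x42 + (-4)  =  dst - 0x46
+//
+// that is, the branch distance measured from the start of the following
+// instruction, which is what the Jcc encoding requires.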
+typedef + struct { + AssemblyBufferOffset where; // where the relocation should be applied + UChar bitNoMin; // 0 .. 31 inclusive + UChar bitNoMax; // bitNoMin .. 31 inclusive + Int bias; // arbitrary, but in practice very small + UChar rshift; // 0, 1 or 2 only + } + Relocation; + + #endif /* ndef __VEX_HOST_GENERIC_REGS_H */ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_x86_defs.c b/VEX/priv/host_x86_defs.c index f4ff04917c..272828ab6f 100644 --- a/VEX/priv/host_x86_defs.c +++ b/VEX/priv/host_x86_defs.c @@ -654,6 +654,21 @@ X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms, vassert(is_sane_RetLoc(rloc)); return i; } +X86Instr* X86Instr_Jmp ( UInt hereOffs, UInt dstOffs ) { + X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr)); + i->tag = Xin_Jmp; + i->Xin.Jmp.hereOffs = hereOffs; + i->Xin.Jmp.dstOffs = dstOffs; + return i; +} +X86Instr* X86Instr_JmpCond ( X86CondCode cond, + UInt dst_qentno /* FOR DEBUG PRINTING ONLY */ ) { + X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr)); + i->tag = Xin_JmpCond; + i->Xin.JmpCond.cond = cond; + i->Xin.JmpCond.dst_qentno = dst_qentno; + return i; +} X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP, X86CondCode cond, Bool toFastEP ) { X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr)); @@ -1001,7 +1016,17 @@ void ppX86Instr ( const X86Instr* i, Bool mode64 ) { i->Xin.Call.regparms); ppRetLoc(i->Xin.Call.rloc); vex_printf("] 0x%x", i->Xin.Call.target); - break; + return; + case Xin_Jmp: + vex_printf("jmp from [%03u] to [%03u] (delta = %d)", + i->Xin.Jmp.hereOffs, i->Xin.Jmp.dstOffs, + (Int)(i->Xin.Jmp.dstOffs) - (Int)(i->Xin.Jmp.hereOffs)); + return; + case Xin_JmpCond: + vex_printf("j%s to queue[%u] (delta currently unknown)", + showX86CondCode(i->Xin.JmpCond.cond), + i->Xin.JmpCond.dst_qentno); + return; case Xin_XDirect: vex_printf("(xDirect) "); vex_printf("if (%%eflags.%s) { ", @@ -1225,7 +1250,7 @@ void ppX86Instr ( const X86Instr* i, Bool mode64 ) { "adcl $0,NotKnownYet+4"); return; case Xin_IfThenElse: - vex_printf("if (!%s) then {...", + vex_printf("if (ccOOL=%s) then IL {...", showX86CondCode(i->Xin.IfThenElse.hite->ccOOL)); return; default: @@ -2162,13 +2187,9 @@ static UChar* push_word_from_tags ( UChar* p, UShort tags ) instruction was a profiler inc, set *is_profInc to True, else leave it unchanged. */ -Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, - UChar* buf, Int nbuf, const X86Instr* i, - Bool mode64, VexEndness endness_host, - const void* disp_cp_chain_me_to_slowEP, - const void* disp_cp_chain_me_to_fastEP, - const void* disp_cp_xindir, - const void* disp_cp_xassisted ) +UInt emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, + UChar* buf, UInt nbuf, const X86Instr* i, + const EmitConstants* emitConsts ) { UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc; @@ -2176,7 +2197,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, UChar* p = &buf[0]; UChar* ptmp; vassert(nbuf >= 32); - vassert(mode64 == False); + vassert(emitConsts->mode64 == False); /* vex_printf("asm ");ppX86Instr(i, mode64); vex_printf("\n"); */ @@ -2484,14 +2505,56 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = toUChar(0xD0 + irno); goto done; + case Xin_Jmp: { + Long deltaLL + = ((Long)(i->Xin.Jmp.hereOffs)) - ((Long)(i->Xin.Jmp.dstOffs)); + /* Stay sane .. */ + vassert(-1000000LL <= deltaLL && deltaLL <= 1000000LL); + Int delta = (Int)deltaLL; + /* The offset that we must encode is actually relative to the start of + the next instruction. 
Also, there are short and long encodings of + this instruction. Try to use the short one if possible. */ + if (delta >= -0x78 && delta <= 0x78) { + delta += 2; + *p++ = toUChar(0xEB); + *p++ = toUChar(delta & 0xFF); + delta >>= 8; + vassert(delta == 0 || delta == -1); + } else { + delta += 5; + *p++ = toUChar(0xE9); + p = emit32(p, (UInt)delta); + } + goto done; + } + + case Xin_JmpCond: { + /* We don't know the destination yet, so just emit this so that it + loops back to itself. The main (host-independent) assembler logic + will later patch it when the destination is known. Until then, + emitting an instruction that jumps to itself seems like a good + idea, since if we mistakenly forget to patch it, the generated code + will spin at this point, and when we attach a debugger, it will be + obvious what has happened. */ + *p++ = toUChar(0x0F); + *p++ = toUChar(0x80 + (UInt)i->Xin.JmpCond.cond); + // FFFFFFFA == -6, which, relative to the next insn, points to + // the start of this one. + *p++ = toUChar(0xFA); + *p++ = toUChar(0xFF); + *p++ = toUChar(0xFF); + *p++ = toUChar(0xFF); + goto done; + } + case Xin_XDirect: { /* NB: what goes on here has to be very closely coordinated with the chainXDirect_X86 and unchainXDirect_X86 below. */ /* We're generating chain-me requests here, so we need to be sure this is actually allowed -- no-redir translations can't use chain-me's. Hence: */ - vassert(disp_cp_chain_me_to_slowEP != NULL); - vassert(disp_cp_chain_me_to_fastEP != NULL); + vassert(emitConsts->disp_cp_chain_me_to_slowEP != NULL); + vassert(emitConsts->disp_cp_chain_me_to_fastEP != NULL); /* Use ptmp for backpatching conditional jumps. */ ptmp = NULL; @@ -2519,8 +2582,9 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */ *p++ = 0xBA; const void* disp_cp_chain_me - = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP - : disp_cp_chain_me_to_slowEP; + = i->Xin.XDirect.toFastEP + ? emitConsts->disp_cp_chain_me_to_fastEP + : emitConsts->disp_cp_chain_me_to_slowEP; p = emit32(p, (UInt)(Addr)disp_cp_chain_me); /* call *%edx */ *p++ = 0xFF; @@ -2543,7 +2607,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, translations without going through the scheduler. That means no XDirects or XIndirs out from no-redir translations. Hence: */ - vassert(disp_cp_xindir != NULL); + vassert(emitConsts->disp_cp_xindir != NULL); /* Use ptmp for backpatching conditional jumps. */ ptmp = NULL; @@ -2563,7 +2627,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, /* movl $disp_indir, %edx */ *p++ = 0xBA; - p = emit32(p, (UInt)(Addr)disp_cp_xindir); + p = emit32(p, (UInt)(Addr)emitConsts->disp_cp_xindir); /* jmp *%edx */ *p++ = 0xFF; *p++ = 0xE2; @@ -2627,7 +2691,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, /* movl $disp_indir, %edx */ *p++ = 0xBA; - p = emit32(p, (UInt)(Addr)disp_cp_xassisted); + p = emit32(p, (UInt)(Addr)emitConsts->disp_cp_xassisted); /* jmp *%edx */ *p++ = 0xFF; *p++ = 0xE2; @@ -3366,7 +3430,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, } bad: - ppX86Instr(i, mode64); + ppX86Instr(i, emitConsts->mode64); vpanic("emit_X86Instr"); /*NOTREACHED*/ @@ -3515,6 +3579,45 @@ VexInvalRange patchProfInc_X86 ( VexEndness endness_host, } +/* Create relocation info needed to patch a branch offset for instruction I + whose first instruction is at WHERE in the assembly buffer. 
*/ +Relocation collectRelocInfo_X86 ( AssemblyBufferOffset where, + X86Instr* i ) +{ + /* Xin_JmpCond produces a conditional branch, of the form + 0F 8x <32-bit-offset> + where 'x' encodes the condition code. + + When we come to patch it so as to jump to a particular destination, we + must be aware that: + + (1) the field we want to patch starts 2 bytes into the instruction. + Hence ".where = where + 2" + + (2) the processor expects the 32-bit-offset value to be relative to + the start of the *next* instruction. The patcher will patch + the location we specified ("where + 2", but that is 4 bytes + before the start of the next instruction. Hence we set the bias + to -4, so that it reduces the computed offset by 4, which makes + it relative to the start of the next instruction. + */ + switch (i->tag) { + case Xin_JmpCond: { + Relocation rel = { .where = where + 2, + .bitNoMin = 0, + .bitNoMax = 31, + .bias = -4, + .rshift = 0 }; + return rel; + } + default: + // We don't expect to be asked to compute relocation information + // for any other kind of instruction. + vpanic("collectRelocInfo_X86"); + } +} + + /*---------------------------------------------------------------*/ /*--- end host_x86_defs.c ---*/ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_x86_defs.h b/VEX/priv/host_x86_defs.h index d32ff9862d..ea02c4aa0a 100644 --- a/VEX/priv/host_x86_defs.h +++ b/VEX/priv/host_x86_defs.h @@ -355,6 +355,8 @@ typedef Xin_Sh3232, /* shldl or shrdl */ Xin_Push, /* push (32-bit?) value on stack */ Xin_Call, /* call to address in register */ + Xin_Jmp, /* unconditional branch */ + Xin_JmpCond, /* conditional branch */ Xin_XDirect, /* direct transfer to GA */ Xin_XIndir, /* indirect transfer to GA */ Xin_XAssisted, /* assisted transfer to GA */ @@ -456,6 +458,21 @@ typedef Int regparms; /* 0 .. 3 */ RetLoc rloc; /* where the return value will be */ } Call; + /* Unconditional branch, using the offset (dstOffs - hereOffs). + This could be done with just one offset, but storing two values + makes it easier to understand assembly debug printing. */ + struct { + UInt hereOffs; + UInt dstOffs; + } Jmp; + /* Conditional branch. At the time this is generated, we don't + know the offset and so this merely denotes a conditional branch + with a 32-bit unknown offset. For debug printing ONLY, we also + record the assembler queue entry number. */ + struct { + X86CondCode cond; + UInt dst_qentno; // FOR DEBUG PRINTING ONLY + } JmpCond; /* Update the guest EIP value, then exit requesting to chain to it. May be conditional. Urr, use of Addr32 implicitly assumes that wordsize(guest) == wordsize(host). 
*/ @@ -673,6 +690,9 @@ extern X86Instr* X86Instr_Div ( Bool syned, X86RM* ); extern X86Instr* X86Instr_Sh3232 ( X86ShiftOp, UInt amt, HReg src, HReg dst ); extern X86Instr* X86Instr_Push ( X86RMI* ); extern X86Instr* X86Instr_Call ( X86CondCode, Addr32, Int, RetLoc ); +extern X86Instr* X86Instr_Jmp ( UInt hereOffs, UInt dstOffs ); +extern X86Instr* X86Instr_JmpCond ( X86CondCode, + UInt/*FOR DEBUG PRINTING ONLY*/); extern X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP, X86CondCode cond, Bool toFastEP ); extern X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP, @@ -724,14 +744,10 @@ extern void getRegUsage_X86Instr ( HRegUsage*, const X86Instr*, Bool ); extern void mapRegs_X86Instr ( HRegRemap*, X86Instr*, Bool ); extern Bool isMove_X86Instr ( const X86Instr*, HReg*, HReg* ); extern HInstrIfThenElse* isIfThenElse_X86Instr(X86Instr*); -extern Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, - UChar* buf, Int nbuf, const X86Instr* i, - Bool mode64, - VexEndness endness_host, - const void* disp_cp_chain_me_to_slowEP, - const void* disp_cp_chain_me_to_fastEP, - const void* disp_cp_xindir, - const void* disp_cp_xassisted ); +extern UInt emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, + UChar* buf, UInt nbuf, + const X86Instr* i, + const EmitConstants* emitConsts ); extern void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); @@ -775,6 +791,10 @@ extern VexInvalRange patchProfInc_X86 ( VexEndness endness_host, void* place_to_patch, const ULong* location_of_counter ); +/* Create relocation info needed to patch a branch offset for instruction I + whose first instruction is at WHERE in the assembly buffer. */ +extern Relocation collectRelocInfo_X86 ( AssemblyBufferOffset where, + X86Instr* i ); #endif /* ndef __VEX_HOST_X86_DEFS_H */ diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index 689c2e8cdf..a42e6792e4 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -294,7 +294,9 @@ void LibVEX_Init ( } -/* --------- Make a translation. --------- */ +/*---------------------------------------------------------*/ +/*--- The compilation pipeline: front end ---*/ +/*---------------------------------------------------------*/ /* KLUDGE: S390 need to know the hwcaps of the host when generating code. But that info is not passed to emit_S390Instr. Only mode64 is @@ -676,7 +678,7 @@ IRSB* LibVEX_FrontEnd ( /*MOD*/ VexTranslateArgs* vta, True/*must be flat*/, guest_word_type ); /* Do a post-instrumentation cleanup pass. 
*/ - if (vta->instrument1 || vta->instrument2) { + if (1 || vta->instrument1 || vta->instrument2) { do_deadcode_BB( irsb ); irsb = cprop_BB( irsb ); do_deadcode_BB( irsb ); @@ -698,6 +700,199 @@ IRSB* LibVEX_FrontEnd ( /*MOD*/ VexTranslateArgs* vta, } +/*---------------------------------------------------------*/ +/*--- The compilation pipeline: back end ---*/ +/*---------------------------------------------------------*/ + +/* ---- Code patching helpers for the assembler ---- */ + +static UInt loadUnaligned32 ( UChar* where, VexEndness endness ) +{ + UInt w32 = 0; + switch (endness) { + case VexEndnessLE: + w32 = (w32 << 8) | where[3]; + w32 = (w32 << 8) | where[2]; + w32 = (w32 << 8) | where[1]; + w32 = (w32 << 8) | where[0]; + break; + case VexEndnessBE: + w32 = (w32 << 8) | where[0]; + w32 = (w32 << 8) | where[1]; + w32 = (w32 << 8) | where[2]; + w32 = (w32 << 8) | where[3]; + break; + default: + vassert(0); + } + return w32; +} + +static void storeUnaligned32 ( UChar* where, UInt w32, VexEndness endness ) +{ + switch (endness) { + case VexEndnessLE: + where[0] = (w32 & 0xFF); w32 >>= 8; + where[1] = (w32 & 0xFF); w32 >>= 8; + where[2] = (w32 & 0xFF); w32 >>= 8; + where[3] = (w32 & 0xFF); w32 >>= 8; + break; + case VexEndnessBE: + where[3] = (w32 & 0xFF); w32 >>= 8; + where[2] = (w32 & 0xFF); w32 >>= 8; + where[1] = (w32 & 0xFF); w32 >>= 8; + where[0] = (w32 & 0xFF); w32 >>= 8; + break; + default: + vassert(0); + } +} + + +/* ---- Helpers for processing relocations ---- */ + +/* Print a relocation. */ +static +void ppRelocation ( Relocation rel ) +{ + vex_printf("Reloc{where=[%03u], bits=[%u..%u], bias=%d, rshift=%u}", + rel.where, (UInt)rel.bitNoMax, (UInt)rel.bitNoMin, + rel.bias, (UInt)rel.rshift); +} + +/* Apply a relocation, so that the instruction detailed in REL (which is + assumed already to be in ASSEMBLY_BUF) will jump to DST. */ +static +void applyRelocation ( Relocation rel, + UChar* assembly_buf, + AssemblyBufferOffset assembly_buf_used, + AssemblyBufferOffset dst, + VexEndness host_endness, + Bool verbose_asm ) +{ + const Bool debug_reloc = False; + + /* Sanity check the location. "+4" because we will patch a 32 bit + field. */ + vassert(rel.where + 4 <= assembly_buf_used); + + /* Figure out where the relocation should be applied. */ + UChar* whereP = assembly_buf + rel.where; + + /* And where the jump destination is. */ + UChar* dstP = assembly_buf + dst; + + /* Compute the 'E' value, using 64 bit arithmetic. See comment on + definition of type Relocation. */ + vassert(rel.rshift <= 2); + Long E = ((Long)(ULong)(HWord)dstP) + ((Long)rel.bias) + - ((Long)(ULong)(HWord)whereP); + E = E >>/*signed*/rel.rshift; + if (debug_reloc) + vex_printf("E = 0x%llx\n", E); + + /* Figure out how many significant bits of E we need, and check that they + sign extend back to E. If they don't, the relocation is impossible to + perform, and the system is in deep trouble -- we have to abort. It + should never fail in practice because the jump distances are small and + so are representable in all reasonable instruction sets. */ + vassert(rel.bitNoMin <= 31); + vassert(rel.bitNoMax <= 31); + vassert(rel.bitNoMax >= rel.bitNoMin); + UInt nBitsOfE = (UInt)rel.bitNoMax - (UInt)rel.bitNoMin + 1; + vassert(nBitsOfE >= 1 && nBitsOfE <= 32); + if (debug_reloc) + vex_printf("nBitsOfE = %u\n", nBitsOfE); + + /* "The lowest nBitsOfE of E sign extend back to E itself". 
*/ + vassert(E == ((E << (64-nBitsOfE)) >>/*signed*/ (64-nBitsOfE))); + + /* Figure out the 32-bit mask for the location at |where| to be modified. + It contains |nBitsOfE| bits that need to be modified. */ + vassert(nBitsOfE + rel.bitNoMin <= 32); + ULong mask64 = ((1ULL << nBitsOfE) - 1) << rel.bitNoMin; + vassert(0 == (mask64 & 0xFFFFFFFF00000000ULL)); + UInt mask = (UInt)mask64; + if (debug_reloc) + vex_printf("mask = 0x%x\n", mask); + UInt eBits = (((UInt)E) << rel.bitNoMin) & mask; + + /* Actually apply it. */ + UInt w32old = loadUnaligned32(whereP, host_endness); + UInt w32new = (w32old & ~mask) | eBits; + + if (UNLIKELY(verbose_asm)) { + vex_printf(" -- RELOC changing 0x%08x to 0x%08x\n", w32old, w32new); + } + storeUnaligned32(whereP, w32new, host_endness); + + /* We are done. Sheesh. */ +} + + +/* ---- A helper for emitting simple instructions ---- */ + +/* Emit a simple insn INSN into BUF at offset BUF_USED. Returns the new + number of bytes used, which will be > BUF_USED to denote success and == + BUF_USED to denote failure (output buffer full). If INSN is a + profiler-inc, *OFFS_PROFINC will be set to show its offset in the output + buffer, else *OFFS_PROFINC will be unchanged. +*/ +static +AssemblyBufferOffset emitSimpleInsn ( /*MB_MOD*/Int* offs_profInc, + UChar* buf, + AssemblyBufferOffset buf_used, + AssemblyBufferOffset buf_limit, + const HInstr* insn, + const EmitConstants* emitConsts, + const VexTranslateArgs* vta ) +{ + /* Emit into a 128 byte temp buffer */ + UChar insn_bytes[128]; + Bool isProfInc = False; + UInt j = emit_X86Instr(&isProfInc, insn_bytes, sizeof(insn_bytes), + insn, emitConsts); + /* Debug printing? */ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) { + const Int min_spacing = 9; + vex_printf(" [%03u] ", buf_used); + for (Int k = 0; k < j; k++) { + vex_printf("%02x ", (UInt)insn_bytes[k]); + } + for (Int k = j; k < min_spacing; k++) { + vex_printf(" . "); + } + vex_printf(" "); + ppX86Instr(insn, False); + vex_printf("\n"); + } + /* Will it fit? */ + if (UNLIKELY(buf_used + j > buf_limit)) { + return buf_used; // denotes FAIL + } + /* Do we need to record the ProfInc offset? */ + if (UNLIKELY(isProfInc)) { + vassert(vta->addProfInc); /* else where did it come from? */ + vassert(*offs_profInc == -1); /* there can be only one (tm) */ + vassert(buf_used >= 0); + *offs_profInc = (Int)buf_used; + } + /* Copy to output. */ + { UChar* dst = &vta->host_bytes[buf_used]; + for (UInt k = 0; k < j; k++) { + dst[k] = insn_bytes[k]; + } + } + /* This needs to return a value greater than the original OFFS, in order + to indicate that this function succeeded. That depends on the + assumption that the emit_*Insn call above will emit at least one byte + for any instruction it is given, which seems reasonable. */ + return buf_used + j; +} + + +/* ---- The back end proper ---- */ + /* Back end of the compilation pipeline. Is not exported. */ static void libvex_BackEnd ( const VexTranslateArgs *vta, @@ -737,7 +932,6 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta, Int offB_HOST_EvC_COUNTER; Int offB_HOST_EvC_FAILADDR; Addr max_ga; - UChar insn_bytes[128]; HInstrSB* vcode; HInstrSB* rcode; @@ -1117,49 +1311,340 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta, "------------------------\n\n"); } - out_used = 0; /* tracks along the host_bytes array */ - /* TODO-JIT: This needs another interface when assembler/flattener - is given whole HInstrSB and also pointer to function - which prints emitted bytes. 
*/ - for (UInt i = 0; i < rcode->insns->insns_used; i++) { - HInstr* hi = rcode->insns->insns[i]; - Bool hi_isProfInc = False; - if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) { - ppInstr(hi, mode64); - vex_printf("\n"); + //////////////////////////////////////////////////////// + //// BEGIN the assembler + + // QElem are work Queue elements. The work Queue is the top level data + // structure for the emitter. It is initialised with the HInstrVec* of + // the overall HInstrSB. Every OOL HInstrVec* in the tree will at some + // point be present in the Queue. IL HInstrVec*s are never present in + // the Queue because the inner emitter loop processes them in-line, using + // a Stack (see below) to keep track of its nesting level. + // + // The Stack (see below) is empty before and after every Queue element is + // processed. In other words, the Stack only holds state needed during + // the processing of a single Queue element. + // + // The ordering of elements in the Queue is irrelevant -- correct code + // will be emitted even with set semantics (arbitrary order). However, + // the FIFOness of the queue is believed to generate code in which + // colder and colder code (more deeply nested OOLs) is placed further + // and further from the start of the emitted machine code, which sounds + // like a layout which should minimise icache misses. + // + // QElems also contain two pieces of jump-fixup information. When we + // finally come to process a QElem, we need to know: + // + // * |jumpToOOLpoint|: the place which wants to jump to the start of the + // emitted insns for this QElem. We must have already emitted that, + // since it will be the conditional jump that leads to this QElem (OOL + // block). + // + // * |resumePoint|: the place we should jump back to after the QElem is + // finished (the "resume point"), which is the emitted code of the + // HInstr immediately following the HInstrIfThenElse that has this + // QElem as its OOL block. + // + // When the QElem is processed, we know both the |jumpToOOLpoint| and + // the |resumePoint|, and so the first can be patched, and the second + // we generate an instruction to jump to. + // + // There are three complications with patching: + // + // (1) per comments on Stack elems, we do not know the |resumePoint| when + // creating a QElem. That will only be known when processing of the + // corresponding IL block is completed. + // + // (2) The top level HInstrVec* has neither a |jumpToOOLpoint| nor a + // |resumePoint|. + // + // (3) Non-top-level OOLs may not have a valid |resumePoint| if they do + // an unconditional IR-level Exit. We can generate the resume point + // branch, but it will be never be used. + typedef + struct { + // The HInstrs for this OOL. + HInstrVec* oolVec; + // Where we should patch to jump to the OOL ("how do we get here?") + Bool jumpToOOLpoint_valid; + Relocation jumpToOOLpoint; + // Resume point offset, in bytes from start of output buffer + // ("where do we go after this block is completed?") + Bool resumePoint_valid; + AssemblyBufferOffset resumePoint; } - Int j = emit(&hi_isProfInc, - insn_bytes, sizeof insn_bytes, hi, - mode64, vta->archinfo_host.endness, - vta->disp_cp_chain_me_to_slowEP, - vta->disp_cp_chain_me_to_fastEP, - vta->disp_cp_xindir, - vta->disp_cp_xassisted); - if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) { - for (Int k = 0; k < j; k++) - vex_printf("%02x ", (UInt)insn_bytes[k]); - vex_printf("\n\n"); + QElem; + + + // SElem are stack elements. 
When we suspend processing a HInstrVec* in + // order to process an IL path in an IfThenElse, we push the HInstrVec* + // and the next index to process on the stack, so that we know where to + // resume when the nested IL sequence is completed. |vec| and |vec_next| + // record the resume HInstr. + // + // A second effect of processing a nested IL sequence is that we will + // have to (later) process the corresponding OOL sequence. And that OOL + // sequence will have to finish with a jump back to the "resume point" + // (the emitted instruction immediately following the IfThenElse). We + // only know the offset of the resume point instruction in the output + // buffer when we actually resume emitted from there -- that is, when the + // entry we pushed, is popped. So, when we pop, we must mark the + // corresponding OOL entry in the Queue to record there the resume point + // offset. For this reason we also carry |ool_qindex|, which is the + // index of the corresponding OOL entry in the Queue. + typedef + struct { + HInstrVec* vec; // resume point HInstr vector + UInt vec_next; // resume point HInstr vector index + Int ool_qindex; // index in Queue of OOL to mark when we resume } - if (UNLIKELY(out_used + j > vta->host_bytes_size)) { - vexSetAllocModeTEMP_and_clear(); - vex_traceflags = 0; - res->status = VexTransOutputFull; - return; + SElem; + + // The Stack. The stack depth is bounded by maximum number of nested + // hot (IL) sections, so in practice it is going to be very small. + const Int nSTACK = 4; + + SElem stack[nSTACK]; + Int stackPtr; // points to most recently pushed entry <=> "-1 means empty" + + // The Queue. The queue size is bounded by the number of cold (OOL) + // sections in the entire HInstrSB, so it's also going to be pretty + // small. + const Int nQUEUE = 8; + + QElem queue[nQUEUE]; + Int queueOldest; // index of oldest entry, initially 0 + Int queueNewest; // index of newest entry, + // initially -1, otherwise must be >= queueOldest + + /////////////////////////////////////////////////////// + + const Bool verbose_asm = (vex_traceflags & VEX_TRACE_ASM) != 0; + + const EmitConstants emitConsts + = { .mode64 = mode64, + .endness_host = vta->archinfo_host.endness, + .disp_cp_chain_me_to_slowEP = vta->disp_cp_chain_me_to_slowEP, + .disp_cp_chain_me_to_fastEP = vta->disp_cp_chain_me_to_fastEP, + .disp_cp_xindir = vta->disp_cp_xindir, + .disp_cp_xassisted = vta->disp_cp_xassisted }; + + AssemblyBufferOffset cursor = 0; + AssemblyBufferOffset cursor_limit = vta->host_bytes_size; + + queueOldest = 0; + queueNewest = -1; + + vassert(queueNewest < nQUEUE); + queueNewest++; + { + QElem* qe = &queue[queueNewest]; + vex_bzero(qe, sizeof(*qe)); + qe->oolVec = rcode->insns; + qe->jumpToOOLpoint_valid = False; + qe->resumePoint_valid = False; + } + vassert(queueNewest == 0); + + /* Main loop, processing Queue entries, until there are no more. */ + while (queueOldest <= queueNewest) { + + Int qCur = queueOldest; + if (UNLIKELY(verbose_asm)) + vex_printf("BEGIN queue[%d]\n", qCur); + + // Take the oldest entry in the queue + QElem* qe = &queue[queueOldest]; + queueOldest++; + + // Stay sane. Only the top level block has no branch to it and no + // resume point. 
+ if (qe->oolVec == rcode->insns) { + // This is the top level block + vassert(!qe->jumpToOOLpoint_valid); + vassert(!qe->resumePoint_valid); + } else { + vassert(qe->jumpToOOLpoint_valid); + vassert(qe->resumePoint_valid); + // In the future, we might be able to allow the resume point to be + // invalid for non-top-level blocks, if the block contains an + // unconditional exit. Currently the IR can't represent that, so + // the assertion is valid. + } + + // Processing |qe| + if (qe->jumpToOOLpoint_valid) { + // patch qe->jmpToOOLpoint to jump to |here| + if (UNLIKELY(verbose_asm)) { + vex_printf(" -- APPLY "); + ppRelocation(qe->jumpToOOLpoint); + vex_printf("\n"); + } + applyRelocation(qe->jumpToOOLpoint, &vta->host_bytes[0], + cursor, cursor, vta->archinfo_host.endness, + verbose_asm); } - if (UNLIKELY(hi_isProfInc)) { - vassert(vta->addProfInc); /* else where did it come from? */ - vassert(res->offs_profInc == -1); /* there can be only one (tm) */ - vassert(out_used >= 0); - res->offs_profInc = out_used; + + // Initialise the stack, for processing of |qe|. + stackPtr = 0; // "contains one element" + + stack[stackPtr].vec = qe->oolVec; + stack[stackPtr].vec_next = 0; + stack[stackPtr].ool_qindex = -1; // INVALID + + // Iterate till the stack is empty. This effectively does a + // depth-first traversal of the hot-path (IL) tree reachable from + // here, and at the same time adds any encountered cold-path (OOL) + // blocks to the Queue for later processing. This is the heart of the + // flattening algorithm. + while (stackPtr >= 0) { + + if (UNLIKELY(verbose_asm)) + vex_printf(" -- CONSIDER stack[%d]\n", stackPtr); + + HInstrVec* vec = stack[stackPtr].vec; + UInt vec_next = stack[stackPtr].vec_next; + Int ool_qindex = stack[stackPtr].ool_qindex; + stackPtr--; + + if (vec_next > 0) { + // We're resuming the current IL block having just finished + // processing a nested IL. The OOL counterpart to the nested IL + // we just finished processing will have to jump back to here. + // So we'll need to mark its Queue entry to record that fact. + + // First assert that the OOL actually *is* in the Queue (it + // must be, since we can't have processed it yet). + vassert(queueOldest <= queueNewest); // "at least 1 entry in Q" + vassert(queueOldest <= ool_qindex && ool_qindex <= queueNewest); + + vassert(!queue[ool_qindex].resumePoint_valid); + queue[ool_qindex].resumePoint = cursor; + queue[ool_qindex].resumePoint_valid = True; + if (UNLIKELY(verbose_asm)) + vex_printf(" -- RESUME previous IL\n"); + } else { + // We're starting a new IL. Due to the tail-recursive nature of + // entering ILs, this means we can actually only be starting the + // outermost (top level) block for this particular Queue entry. + vassert(ool_qindex == -1); + vassert(vec == qe->oolVec); + if (UNLIKELY(verbose_asm)) + vex_printf(" -- START new IL\n"); + } + + // Repeatedly process "zero or more simple HInstrs followed by (an + // IfThenElse or end-of-block)" + while (True) { + + // Process "zero or more simple HInstrs" + while (vec_next < vec->insns_used + && !isIfThenElse(vec->insns[vec_next])) { + AssemblyBufferOffset cursor_next + = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0], + cursor, cursor_limit, vec->insns[vec_next], + &emitConsts, vta ); + if (UNLIKELY(cursor_next == cursor)) { + // We ran out of output space. Give up. + goto out_of_buffer_space; + } + vec_next++; + cursor = cursor_next; + } + + // Now we've either got to the end of the hot path, or we have + // an IfThenElse. 
+ if (vec_next >= vec->insns_used) + break; + + // So we have an IfThenElse. + HInstrIfThenElse* hite = isIfThenElse(vec->insns[vec_next]); + vassert(hite); + vassert(hite->n_phis == 0); // the regalloc will have removed them + + // Put |ite|'s OOL block in the Queue. We'll deal with it + // later. Also, generate the (skeleton) conditional branch to it, + // and collect enough information that we can create patch the + // branch later, once we know where the destination is. + vassert(queueNewest < nQUEUE-1); // else out of Queue space + queueNewest++; + queue[queueNewest].oolVec = hite->outOfLine; + queue[queueNewest].resumePoint_valid = False; // not yet known + queue[queueNewest].resumePoint = -1; // invalid + + HInstr* cond_branch + = X86Instr_JmpCond(hite->ccOOL, + queueNewest/*FOR DEBUG PRINTING ONLY*/); + AssemblyBufferOffset cursor_next + = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0], + cursor, cursor_limit, cond_branch, + &emitConsts, vta ); + if (UNLIKELY(cursor_next == cursor)) { + // We ran out of output space. Give up. + goto out_of_buffer_space; + } + queue[queueNewest].jumpToOOLpoint_valid = True; + queue[queueNewest].jumpToOOLpoint + = collectRelocInfo_X86(cursor, cond_branch); + + cursor = cursor_next; + + // Now we descend into |ite's| IL block. So we need to save + // where we are in this block, so we can resume when the inner + // one is done. + vassert(stackPtr < nSTACK-1); // else out of Stack space + stackPtr++; + stack[stackPtr].vec = vec; + stack[stackPtr].vec_next = vec_next+1; + stack[stackPtr].ool_qindex = queueNewest; + + // And now descend into the inner block. We could have just + // pushed its details on the stack and immediately pop it, but + // it seems simpler to update |vec| and |vec_next| and continue + // directly. + if (UNLIKELY(verbose_asm)) { + vex_printf(" -- START inner IL\n"); + } + vec = hite->fallThrough; + vec_next = 0; + + // And continue with "Repeatedly process ..." + } + + // Getting here means we've completed an inner IL and now want to + // resume the parent IL. That is, pop a saved context off the + // stack. } - { UChar* dst = &vta->host_bytes[out_used]; - for (Int k = 0; k < j; k++) { - dst[k] = insn_bytes[k]; - } - out_used += j; + + // Hot path is complete. Now, probably, we have to add a jump + // back to the resume point. + if (qe->resumePoint_valid) { + if (0) + vex_printf(" // Generate jump to resume point [%03u]\n", + qe->resumePoint); + HInstr* jmp = X86Instr_Jmp(cursor, qe->resumePoint); + AssemblyBufferOffset cursor_next + = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0], + cursor, cursor_limit, jmp, + &emitConsts, vta ); + if (UNLIKELY(cursor_next == cursor)) { + // We ran out of output space. Give up. + goto out_of_buffer_space; + } + cursor = cursor_next; } + + if (UNLIKELY(verbose_asm)) + vex_printf("END queue[%d]\n\n", qCur); + // Finished with this Queue entry. } - *(vta->host_bytes_used) = out_used; + // Queue empty, all blocks processed + + *(vta->host_bytes_used) = cursor; + out_used = cursor; + //// + //// END of the assembler + //////////////////////////////////////////////////////// vexAllocSanityCheck(); @@ -1178,6 +1663,13 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta, vex_traceflags = 0; res->status = VexTransOK; return; + + // Return path for when we've run out of output buffer space. + out_of_buffer_space: + vexSetAllocModeTEMP_and_clear(); + vex_traceflags = 0; + res->status = VexTransOutputFull; + return; } -- 2.47.2