From: Julian Seward
Date: Sat, 2 Jan 2021 16:18:53 +0000 (+0100)
Subject: More arm64 isel tuning: create {and,orr,eor,add,sub} reg,reg,reg-shifted-by-imm
X-Git-Tag: VALGRIND_3_17_0~82
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d2a22725450016759f3bbd22d849e5e82fd0c822;p=thirdparty%2Fvalgrind.git

More arm64 isel tuning: create {and,orr,eor,add,sub} reg,reg,reg-shifted-by-imm

Thus far the arm64 isel has been unable to generate instructions of the
form

   {and,or,xor,add,sub} reg, reg, reg-shifted-by-imm

and hence sometimes winds up generating pairs like

   lsl x2, x1, #13 ; orr x4, x3, x2

when it could instead have generated the single instruction

   orr x4, x3, x1, lsl #13

This commit fixes that, although only for the 64-bit case, not the
32-bit case.  Specifically, it can transform the IR forms

   {Add,Sub,And,Or,Xor}(E1, {Shl,Shr,Sar}(E2, immediate))
and
   {Add,And,Or,Xor}({Shl,Shr,Sar}(E1, immediate), E2)

into a single arm64 instruction.  Note that `Sub` is not included in the
second form, because shifting the first operand would require inverting
the argument order in the arm64 instruction.  That isn't allowable for
`Sub`, since it is not commutative and arm64 doesn't offer a
reverse-subtract instruction to use instead.

This gives a 1.1% reduction in generated code size when running
/usr/bin/date on Memcheck.
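As a concrete illustration, reusing the register assignments of the
example above (all temporary and register names here are arbitrary):
the IR tree

   Or64(t3, Shl64(t1, 0xD:I8))

matches the first of the two forms and is now selected as a single
ARM64in_RRS instruction, corresponding to

   orr x4, x3, x1, lsl #13

rather than to a Shift instruction followed by a Logic instruction.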
---

diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
index 6ea67ef319..67dd06c78a 100644
--- a/VEX/priv/host_arm64_defs.c
+++ b/VEX/priv/host_arm64_defs.c
@@ -498,6 +498,17 @@ static const HChar* showARM64ShiftOp ( ARM64ShiftOp op ) {
    }
 }
 
+static const HChar* showARM64RRSOp ( ARM64RRSOp op ) {
+   switch (op) {
+      case ARM64rrs_ADD: return "add";
+      case ARM64rrs_SUB: return "sub";
+      case ARM64rrs_AND: return "and";
+      case ARM64rrs_OR:  return "orr";
+      case ARM64rrs_XOR: return "eor";
+      default: vpanic("showARM64RRSOp");
+   }
+}
+
 static const HChar* showARM64UnaryOp ( ARM64UnaryOp op ) {
    switch (op) {
       case ARM64un_NEG: return "neg";
@@ -858,6 +869,20 @@ ARM64Instr* ARM64Instr_Logic ( HReg dst,
    i->ARM64in.Logic.op   = op;
    return i;
 }
+ARM64Instr* ARM64Instr_RRS ( HReg dst, HReg argL, HReg argR,
+                             ARM64ShiftOp shiftOp, UChar amt,
+                             ARM64RRSOp mainOp ) {
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
+   i->tag                 = ARM64in_RRS;
+   i->ARM64in.RRS.dst     = dst;
+   i->ARM64in.RRS.argL    = argL;
+   i->ARM64in.RRS.argR    = argR;
+   i->ARM64in.RRS.shiftOp = shiftOp;
+   i->ARM64in.RRS.amt     = amt;
+   i->ARM64in.RRS.mainOp  = mainOp;
+   vassert(amt >= 1 && amt <= 63);
+   return i;
+}
 ARM64Instr* ARM64Instr_Test ( HReg argL, ARM64RIL* argR ) {
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
    i->tag              = ARM64in_Test;
@@ -1446,6 +1471,16 @@ void ppARM64Instr ( const ARM64Instr* i ) {
          vex_printf(", ");
          ppARM64RIL(i->ARM64in.Logic.argR);
          return;
+      case ARM64in_RRS:
+         vex_printf("%s    ", showARM64RRSOp(i->ARM64in.RRS.mainOp));
+         ppHRegARM64(i->ARM64in.RRS.dst);
+         vex_printf(", ");
+         ppHRegARM64(i->ARM64in.RRS.argL);
+         vex_printf(", ");
+         ppHRegARM64(i->ARM64in.RRS.argR);
+         vex_printf(", %s #%u", showARM64ShiftOp(i->ARM64in.RRS.shiftOp),
+                    i->ARM64in.RRS.amt);
+         return;
       case ARM64in_Test:
          vex_printf("tst    ");
          ppHRegARM64(i->ARM64in.Test.argL);
@@ -2018,6 +2053,11 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
          addHRegUse(u, HRmRead, i->ARM64in.Logic.argL);
          addRegUsage_ARM64RIL(u, i->ARM64in.Logic.argR);
          return;
+      case ARM64in_RRS:
+         addHRegUse(u, HRmWrite, i->ARM64in.RRS.dst);
+         addHRegUse(u, HRmRead, i->ARM64in.RRS.argL);
+         addHRegUse(u, HRmRead, i->ARM64in.RRS.argR);
+         return;
       case ARM64in_Test:
          addHRegUse(u, HRmRead, i->ARM64in.Test.argL);
         addRegUsage_ARM64RIL(u, i->ARM64in.Test.argR);
@@ -2386,6 +2426,11 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 )
          i->ARM64in.Logic.argL = lookupHRegRemap(m, i->ARM64in.Logic.argL);
          mapRegs_ARM64RIL(m, i->ARM64in.Logic.argR);
          return;
+      case ARM64in_RRS:
+         i->ARM64in.RRS.dst  = lookupHRegRemap(m, i->ARM64in.RRS.dst);
+         i->ARM64in.RRS.argL = lookupHRegRemap(m, i->ARM64in.RRS.argL);
+         i->ARM64in.RRS.argR = lookupHRegRemap(m, i->ARM64in.RRS.argR);
+         return;
       case ARM64in_Test:
          i->ARM64in.Test.argL = lookupHRegRemap(m, i->ARM64in.Test.argL);
          mapRegs_ARM64RIL(m, i->ARM64in.Logic.argR);
@@ -2892,8 +2937,13 @@ static inline UInt qregEnc ( HReg r )
 #define X01110101  BITS8(0,1,1,1,0,1,0,1)
 #define X01110110  BITS8(0,1,1,1,0,1,1,0)
 #define X01110111  BITS8(0,1,1,1,0,1,1,1)
+#define X10001010  BITS8(1,0,0,0,1,0,1,0)
+#define X10001011  BITS8(1,0,0,0,1,0,1,1)
+#define X10101010  BITS8(1,0,1,0,1,0,1,0)
 #define X11000001  BITS8(1,1,0,0,0,0,0,1)
 #define X11000011  BITS8(1,1,0,0,0,0,1,1)
+#define X11001010  BITS8(1,1,0,0,1,0,1,0)
+#define X11001011  BITS8(1,1,0,0,1,0,1,1)
 #define X11010100  BITS8(1,1,0,1,0,1,0,0)
 #define X11010110  BITS8(1,1,0,1,0,1,1,0)
 #define X11011000  BITS8(1,1,0,1,1,0,0,0)
@@ -3064,7 +3114,6 @@ static inline UInt X_3_6_1_6_6_5_5 ( UInt f1, UInt f2, UInt f3,
    return w;
 }
 
-
 static inline UInt X_3_8_5_1_5_5_5 ( UInt f1, UInt f2, UInt f3, UInt f4,
                                      UInt f5, UInt f6, UInt f7 ) {
    vassert(3+8+5+1+5+5+5 == 32);
@@ -3086,6 +3135,27 @@ static inline UInt X_3_8_5_1_5_5_5 ( UInt f1, UInt f2, UInt f3, UInt f4,
    return w;
 }
 
+static inline UInt X_8_2_1_5_6_5_5 ( UInt f1, UInt f2, UInt f3, UInt f4,
+                                     UInt f5, UInt f6, UInt f7 ) {
+   vassert(8+2+1+5+6+5+5 == 32);
+   vassert(f1 < (1<<8));
+   vassert(f2 < (1<<2));
+   vassert(f3 < (1<<1));
+   vassert(f4 < (1<<5));
+   vassert(f5 < (1<<6));
+   vassert(f6 < (1<<5));
+   vassert(f7 < (1<<5));
+   UInt w = 0;
+   w = (w << 8) | f1;
+   w = (w << 2) | f2;
+   w = (w << 1) | f3;
+   w = (w << 5) | f4;
+   w = (w << 6) | f5;
+   w = (w << 5) | f6;
+   w = (w << 5) | f7;
+   return w;
+}
+
 //ZZ #define X0000  BITS4(0,0,0,0)
 //ZZ #define X0001  BITS4(0,0,0,1)
 //ZZ #define X0010  BITS4(0,0,1,0)
@@ -3543,6 +3613,31 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
          }
          goto done;
       }
+      case ARM64in_RRS: {
+         UInt top8 = 0;
+         switch (i->ARM64in.RRS.mainOp) {
+            case ARM64rrs_ADD: top8 = X10001011; break;
+            case ARM64rrs_SUB: top8 = X11001011; break;
+            case ARM64rrs_AND: top8 = X10001010; break;
+            case ARM64rrs_XOR: top8 = X11001010; break;
+            case ARM64rrs_OR:  top8 = X10101010; break;
+            default: vassert(0); /*NOTREACHED*/
+         }
+         UInt sh = 0;
+         switch (i->ARM64in.RRS.shiftOp) {
+            case ARM64sh_SHL: sh = X00; break;
+            case ARM64sh_SHR: sh = X01; break;
+            case ARM64sh_SAR: sh = X10; break;
+            default: vassert(0); /*NOTREACHED*/
+         }
+         UInt amt = i->ARM64in.RRS.amt;
+         vassert(amt >= 1 && amt <= 63);
+         *p++ = X_8_2_1_5_6_5_5(top8, sh, 0,
+                                iregEnc(i->ARM64in.RRS.argR), amt,
+                                iregEnc(i->ARM64in.RRS.argL),
+                                iregEnc(i->ARM64in.RRS.dst));
+         goto done;
+      }
       case ARM64in_Test: {
         UInt rD = 31; /* XZR, we are going to dump the result */
         UInt rN = iregEnc(i->ARM64in.Test.argL);
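The new X_8_2_1_5_6_5_5 helper above packs the AArch64 "shifted
register" data-processing layout: 8 bits of sf/opc/fixed opcode, a
2-bit shift kind, the N bit, Rm, a 6-bit shift amount, Rn, and Rd.
The following standalone sketch (not part of the patch; the packer is
restated locally with fixed-width types) replays that packing against
a hand-assembled encoding of the instruction from the commit message:

   #include <assert.h>
   #include <stdint.h>

   /* Same field packing as the patch's X_8_2_1_5_6_5_5. */
   static uint32_t pack_8_2_1_5_6_5_5 ( uint32_t f1, uint32_t f2,
                                        uint32_t f3, uint32_t f4,
                                        uint32_t f5, uint32_t f6,
                                        uint32_t f7 )
   {
      uint32_t w = 0;
      w = (w << 8) | f1;   /* sf/opc/fixed bits: 0xAA is 64-bit ORR */
      w = (w << 2) | f2;   /* shift kind: 00=LSL, 01=LSR, 10=ASR */
      w = (w << 1) | f3;   /* N bit: 0 for these non-inverted forms */
      w = (w << 5) | f4;   /* Rm: the register that gets shifted */
      w = (w << 6) | f5;   /* imm6: shift amount, 1 .. 63 here */
      w = (w << 5) | f6;   /* Rn: the unshifted register */
      w = (w << 5) | f7;   /* Rd: the destination */
      return w;
   }

   int main ( void )
   {
      /* orr x4, x3, x1, lsl #13 assembles to 0xAA013464. */
      assert(pack_8_2_1_5_6_5_5(0xAA, 0, 0, 1, 13, 3, 4) == 0xAA013464u);
      return 0;
   }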
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
index 24da64e22b..105d7ce843 100644
--- a/VEX/priv/host_arm64_defs.h
+++ b/VEX/priv/host_arm64_defs.h
@@ -254,6 +254,17 @@ typedef
    }
    ARM64ShiftOp;
 
+typedef
+   enum {
+      ARM64rrs_ADD=54,
+      ARM64rrs_SUB,
+      ARM64rrs_AND,
+      ARM64rrs_OR,
+      ARM64rrs_XOR,
+      ARM64rrs_INVALID
+   }
+   ARM64RRSOp;
+
 typedef
    enum {
       ARM64un_NEG=60,
@@ -475,6 +486,7 @@ typedef
       ARM64in_Arith=1220,
       ARM64in_Cmp,
       ARM64in_Logic,
+      ARM64in_RRS,
       ARM64in_Test,
       ARM64in_Shift,
       ARM64in_Unary,
@@ -567,6 +579,15 @@ typedef
             ARM64RIL* argR;
             ARM64LogicOp op;
          } Logic;
+         /* 64 bit AND/OR/XOR/ADD/SUB, reg, reg-with-imm-shift */
+         struct {
+            HReg         dst;
+            HReg         argL;
+            HReg         argR;
+            ARM64ShiftOp shiftOp;
+            UChar        amt;  /* 1 to 63 only */
+            ARM64RRSOp   mainOp;
+         } RRS;
          /* 64 bit TST reg, reg or bimm (AND and set flags) */
          struct {
             HReg argL;
@@ -956,6 +977,8 @@ typedef
 extern ARM64Instr* ARM64Instr_Arith   ( HReg, HReg, ARM64RIA*, Bool isAdd );
 extern ARM64Instr* ARM64Instr_Cmp     ( HReg, ARM64RIA*, Bool is64 );
 extern ARM64Instr* ARM64Instr_Logic   ( HReg, HReg, ARM64RIL*, ARM64LogicOp );
+extern ARM64Instr* ARM64Instr_RRS     ( HReg, HReg, HReg, ARM64ShiftOp,
+                                        UChar amt, ARM64RRSOp mainOp );
 extern ARM64Instr* ARM64Instr_Test    ( HReg, ARM64RIL* );
 extern ARM64Instr* ARM64Instr_Shift   ( HReg, HReg, ARM64RI6*, ARM64ShiftOp );
 extern ARM64Instr* ARM64Instr_Unary   ( HReg, HReg, ARM64UnaryOp );
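Note the operand convention fixed by the new instruction: argL is used
unshifted and becomes the Rn field when emitted, while argR is the
operand that gets shifted and becomes Rm.  So, with illustrative
register names,

   ARM64Instr_RRS(dst, rA, rB, ARM64sh_SHL, 13, ARM64rrs_OR)

describes the instruction orr dst, rA, rB, lsl #13.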
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
index 517b7b15b8..689cdba969 100644
--- a/VEX/priv/host_arm64_isel.c
+++ b/VEX/priv/host_arm64_isel.c
@@ -791,6 +791,94 @@ Bool doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
    partial values if necessary. */
 
+/* ---------------- RRS matching helper ---------------- */
+
+/* This helper matches 64-bit integer expressions of the form
+     {Add,Sub,And,Or,Xor}(E1, {Shl,Shr,Sar}(E2, immediate))
+   and
+     {Add,And,Or,Xor}({Shl,Shr,Sar}(E1, immediate), E2)
+   which is a useful thing to do because AArch64 can compute those in
+   a single instruction.
+*/
+static Bool matchesRegRegShift(/*OUT*/ARM64RRSOp* mainOp,
+                               /*OUT*/ARM64ShiftOp* shiftOp,
+                               /*OUT*/UChar* amt,
+                               /*OUT*/IRExpr** argUnshifted,
+                               /*OUT*/IRExpr** argToBeShifted,
+                               IRExpr* e)
+{
+   *mainOp         = (ARM64RRSOp)0;
+   *shiftOp        = (ARM64ShiftOp)0;
+   *amt            = 0;
+   *argUnshifted   = NULL;
+   *argToBeShifted = NULL;
+   if (e->tag != Iex_Binop) {
+      return False;
+   }
+   const IROp irMainOp = e->Iex.Binop.op;
+   Bool canSwap = True;
+   switch (irMainOp) {
+      case Iop_And64: *mainOp = ARM64rrs_AND; break;
+      case Iop_Or64:  *mainOp = ARM64rrs_OR;  break;
+      case Iop_Xor64: *mainOp = ARM64rrs_XOR; break;
+      case Iop_Add64: *mainOp = ARM64rrs_ADD; break;
+      case Iop_Sub64: *mainOp = ARM64rrs_SUB; canSwap = False; break;
+      default: return False;
+   }
+   /* The root node is OK.  Now check the right (2nd) arg. */
+   IRExpr* argL = e->Iex.Binop.arg1;
+   IRExpr* argR = e->Iex.Binop.arg2;
+
+   // This loop runs either one or two iterations.  In the first iteration,
+   // we check for a shiftable right (second) arg.  If that fails, at the
+   // end of the first iteration, the args are swapped, if that is valid,
+   // and we go round again, hence checking for a shiftable left (first) arg.
+   UInt iterNo = 1;
+   while (True) {
+      vassert(iterNo == 1 || iterNo == 2);
+      if (argR->tag == Iex_Binop) {
+         const IROp irShiftOp = argR->Iex.Binop.op;
+         if (irShiftOp == Iop_Shl64
+             || irShiftOp == Iop_Shr64 || irShiftOp == Iop_Sar64) {
+            IRExpr*       argRL = argR->Iex.Binop.arg1;
+            const IRExpr* argRR = argR->Iex.Binop.arg2;
+            if (argRR->tag == Iex_Const) {
+               const IRConst* argRRconst = argRR->Iex.Const.con;
+               vassert(argRRconst->tag == Ico_U8); // due to typecheck rules
+               const UChar amount = argRRconst->Ico.U8;
+               if (amount >= 1 && amount <= 63) {
+                  // We got a match \o/
+                  // *mainOp is already set
+                  switch (irShiftOp) {
+                     case Iop_Shl64: *shiftOp = ARM64sh_SHL; break;
+                     case Iop_Shr64: *shiftOp = ARM64sh_SHR; break;
+                     case Iop_Sar64: *shiftOp = ARM64sh_SAR; break;
+                     default: vassert(0); // guarded above
+                  }
+                  *amt            = amount;
+                  *argUnshifted   = argL;
+                  *argToBeShifted = argRL;
+                  return True;
+               }
+            }
+         }
+      }
+      // We failed to get a match in the first iteration.  So, provided the
+      // root node isn't SUB, swap the arguments and make one further
+      // iteration.  If that doesn't succeed, we must give up.
+      if (iterNo == 1 && canSwap) {
+         IRExpr* tmp = argL;
+         argL = argR;
+         argR = tmp;
+         iterNo = 2;
+         continue;
+      }
+      // Give up.
+      return False;
+   }
+   /*NOTREACHED*/
+}
+
 /* --------------------- AMode --------------------- */
 
 /* Return an AMode which computes the value of the specified
@@ -1577,7 +1665,34 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          break;
       }
 
-      /* ADD/SUB */
+      /* AND64/OR64/XOR64/ADD64/SUB64(e1, e2 shifted by imm)
+         AND64/OR64/XOR64/ADD64(e1 shifted by imm, e2)
+      */
+      {
+         switch (e->Iex.Binop.op) {
+            case Iop_And64: case Iop_Or64: case Iop_Xor64:
+            case Iop_Add64: case Iop_Sub64: {
+               ARM64RRSOp   mainOp  = ARM64rrs_INVALID;
+               ARM64ShiftOp shiftOp = (ARM64ShiftOp)0; // Invalid
+               IRExpr* argUnshifted   = NULL;
+               IRExpr* argToBeShifted = NULL;
+               UChar   amt = 0;
+               if (matchesRegRegShift(&mainOp, &shiftOp, &amt, &argUnshifted,
+                                      &argToBeShifted, e)) {
+                  HReg rDst         = newVRegI(env);
+                  HReg rUnshifted   = iselIntExpr_R(env, argUnshifted);
+                  HReg rToBeShifted = iselIntExpr_R(env, argToBeShifted);
+                  addInstr(env, ARM64Instr_RRS(rDst, rUnshifted, rToBeShifted,
+                                               shiftOp, amt, mainOp));
+                  return rDst;
+               }
+            }
+            default:
+               break;
+         }
+      }
+
+      /* ADD/SUB(e1, e2) (for any e1, e2) */
       switch (e->Iex.Binop.op) {
          case Iop_Add64: case Iop_Add32:
          case Iop_Sub64: case Iop_Sub32: {
@@ -1593,7 +1708,7 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          break;
       }
 
-      /* AND/OR/XOR */
+      /* AND/OR/XOR(e1, e2) (for any e1, e2) */
       switch (e->Iex.Binop.op) {
         case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop;
         case Iop_Or64:  case Iop_Or32:  lop = ARM64lo_OR;  goto log_binop;
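To trace the matcher on a concrete tree (temporary names arbitrary):
given Add64(Shl64(t1, 0x2:I8), t2), the first iteration fails, because
the right-hand arg t2 is not a shift.  Add64 is commutative, so canSwap
is True, the args are swapped, and the second iteration matches,
yielding add dst, rT2, rT1, lsl #2.  With Sub64 at the root, canSwap is
False, so the same shape is rejected by the matcher and instead goes
through the generic ADD/SUB selection shown above.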