From fd9ffff05bcda961da13d1212a466d591e558fbe Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Sun, 1 May 2011 18:47:10 +0000
Subject: [PATCH] Improvements to condition code handling on ARM.

(1) guest_arm_spechelper: add another spec rule for
    armg_calculate_condition.  Add spec rules for
    armg_calculate_flag_c and armg_calculate_flag_v.

(2) guest_arm_toIR.c: when storing oldC (shifter carry out) and oldV
    values in the thunk, ensure the top 31 bits are zero.  This
    improves the effectiveness of the new spec rules in (1): it avoids
    situations where we have Mux0X(c, x, And32(x,1)) in which x in
    fact has bits 31:1 zero, yet iropt cannot fold the And32 away.
    With that invariant in place, the spec rules need not generate the
    unnecessary And32(x,1) at all; the expression becomes
    Mux0X(c, x, x), which iropt can reduce simply to "x".

(3) armg_calculate_flags_nzcv: assert the invariant from (2) wherever
    possible.

These changes drastically improve --tool=none performance in some
cases (eg, perf/fbench with softfloat doubles in speed).

git-svn-id: svn://svn.valgrind.org/vex/trunk@2138
---
 VEX/priv/guest_arm_defs.h    |  14 ++--
 VEX/priv/guest_arm_helpers.c | 157 ++++++++++++++++++++++++++++++++---
 VEX/priv/guest_arm_toIR.c    |  99 +++++++++++++---
 3 files changed, 212 insertions(+), 58 deletions(-)

diff --git a/VEX/priv/guest_arm_defs.h b/VEX/priv/guest_arm_defs.h
index 02078c4595..8724d8edb5 100644
--- a/VEX/priv/guest_arm_defs.h
+++ b/VEX/priv/guest_arm_defs.h
@@ -148,6 +148,10 @@ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
    that the definedness of the stored flags always depends on all 3
    DEP values.
 
+   Fields carrying only 1 or 2 bits of useful information (old_C,
+   shifter_co, old_V, oldC:oldV) must have their top 31 or 30 bits
+   (respectively) zero.  The text "31x0:" or "30x0:" denotes this.
+
    A summary of the field usages is:
 
    OP                DEP1              DEP2              DEP3
@@ -156,11 +160,11 @@ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
    OP_COPY           current NZCV      unused            unused
    OP_ADD            argL              argR              unused
    OP_SUB            argL              argR              unused
-   OP_ADC            argL              argR              old_C
-   OP_SBB            argL              argR              old_C
-   OP_LOGIC          result            shifter_co        old_V
-   OP_MUL            result            unused            old_C:old_V
-   OP_MULL           resLO32           resHI32           old_C:old_V
+   OP_ADC            argL              argR              31x0:old_C
+   OP_SBB            argL              argR              31x0:old_C
+   OP_LOGIC          result            31x0:shifter_co   31x0:old_V
+   OP_MUL            result            unused            30x0:old_C:old_V
+   OP_MULL           resLO32           resHI32           30x0:old_C:old_V
 */
 
 enum {
diff --git a/VEX/priv/guest_arm_helpers.c b/VEX/priv/guest_arm_helpers.c
index 93f81150d5..96d5824377 100644
--- a/VEX/priv/guest_arm_helpers.c
+++ b/VEX/priv/guest_arm_helpers.c
@@ -110,6 +110,7 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt argL = cc_dep1;
          UInt argR = cc_dep2;
          UInt oldC = cc_dep3;
+         vassert((oldC & ~1) == 0);
          UInt res  = (argL + argR) + oldC;
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
@@ -127,6 +128,7 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt argL = cc_dep1;
          UInt argR = cc_dep2;
          UInt oldC = cc_dep3;
+         vassert((oldC & ~1) == 0);
          UInt res  = argL - argR - (oldC ^ 1);
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
@@ -143,11 +145,13 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          /* (res, shco, oldV) */
          UInt res  = cc_dep1;
          UInt shco = cc_dep2;
+         vassert((shco & ~1) == 0);
          UInt oldV = cc_dep3;
+         vassert((oldV & ~1) == 0);
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
-         UInt cf   = lshift( shco & 1, ARMG_CC_SHIFT_C );
-         UInt vf   = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+         UInt cf   = lshift( shco, ARMG_CC_SHIFT_C );
+         UInt vf   = lshift( oldV, ARMG_CC_SHIFT_V );
          return nf | zf | cf | vf;
       }
       case ARMG_CC_OP_MUL: {
@@ -155,10 +159,11 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt res  = cc_dep1;
          UInt oldC = (cc_dep3 >> 1) & 1;
          UInt oldV = (cc_dep3 >> 0) & 1;
+         vassert((cc_dep3 & ~3) == 0);
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
-         UInt cf   = lshift( oldC & 1, ARMG_CC_SHIFT_C );
-         UInt vf   = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+         UInt cf   = lshift( oldC, ARMG_CC_SHIFT_C );
+         UInt vf   = lshift( oldV, ARMG_CC_SHIFT_V );
          return nf | zf | cf | vf;
       }
       case ARMG_CC_OP_MULL: {
@@ -167,10 +172,11 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt resHi32 = cc_dep2;
          UInt oldC    = (cc_dep3 >> 1) & 1;
          UInt oldV    = (cc_dep3 >> 0) & 1;
+         vassert((cc_dep3 & ~3) == 0);
          UInt nf  = lshift( resHi32 & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf  = lshift( (resHi32|resLo32) == 0, ARMG_CC_SHIFT_Z );
-         UInt cf  = lshift( oldC & 1, ARMG_CC_SHIFT_C );
-         UInt vf  = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+         UInt cf  = lshift( oldC, ARMG_CC_SHIFT_C );
+         UInt vf  = lshift( oldV, ARMG_CC_SHIFT_V );
          return nf | zf | cf | vf;
       }
       default:
@@ -185,7 +191,7 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate the C flag from the thunk components, in the lowest bit
-   of the word (bit 0). */
+   of the word (bit 0).  Bits 31:1 of the returned value are zero. */
 UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
                              UInt cc_dep2, UInt cc_dep3 )
 {
@@ -196,7 +202,7 @@ UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate the V flag from the thunk components, in the lowest bit
-   of the word (bit 0). */
+   of the word (bit 0).  Bits 31:1 of the returned value are zero. */
 UInt armg_calculate_flag_v ( UInt cc_op, UInt cc_dep1,
                              UInt cc_dep2, UInt cc_dep3 )
 {
@@ -221,7 +227,7 @@ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
 /* Calculate the specified condition from the thunk components, in the
    lowest bit of the word (bit 0). */
 extern
-UInt armg_calculate_condition ( UInt cond_n_op /* ARMCondcode << 4 | cc_op */,
+UInt armg_calculate_condition ( UInt cond_n_op /* (ARMCondcode << 4) | cc_op */,
                                 UInt cc_dep1,
                                 UInt cc_dep2, UInt cc_dep3 )
 {
@@ -332,12 +338,17 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
 
    /* --------- specialising "armg_calculate_condition" --------- */
 
    if (vex_streq(function_name, "armg_calculate_condition")) {
-      /* specialise calls to above "armg_calculate condition" function */
-      IRExpr *cond_n_op, *cc_dep1, *cc_dep2;
+
+      /* specialise calls to the "armg_calculate_condition" function.
+         Not sure whether this is strictly necessary, but: the
+         replacement IR must produce only the values 0 or 1.  Bits
+         31:1 are required to be zero. */
+      IRExpr *cond_n_op, *cc_dep1, *cc_dep2, *cc_ndep;
       vassert(arity == 4);
-      cond_n_op = args[0]; /* ARMCondcode << 4 | ARMG_CC_OP_* */
+      cond_n_op = args[0]; /* (ARMCondcode << 4) | ARMG_CC_OP_* */
       cc_dep1   = args[1];
       cc_dep2   = args[2];
+      cc_ndep   = args[3];
 
       /*---------------- SUB ----------------*/
 
@@ -384,7 +395,27 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
                      binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
       }
 
+      /*---------------- SBB ----------------*/
+
+      if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SBB)) {
+         /* This seems to happen a lot in softfloat code, eg __divdf3+140 */
+         /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+         /* HS after SBB (same as C after SBB below)
+            --> oldC ? (argL >=u argR) : (argL >u argR)
+            --> oldC ? (argR <=u argL) : (argR <u argL)
+         */
+         return
+            IRExpr_Mux0X(
+               unop(Iop_32to8, cc_ndep),
+               /* case oldC == 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1)),
+               /* case oldC != 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1))
+            );
+      }
+
       /*---------------- LOGIC ----------------*/
 
       if (isU32(cond_n_op, (ARMCondEQ << 4) | ARMG_CC_OP_LOGIC)) {
          /* EQ after LOGIC --> test res == 0 */
          return unop(Iop_1Uto32,
@@ -397,6 +428,7 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
       }
 
       /*----------------- AL -----------------*/
+
       /* A critically important case for Thumb code.
 
          What we're trying to spot is the case where cond_n_op is an
@@ -443,6 +475,107 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
          }
       }
 
+   /* --------- specialising "armg_calculate_flag_c" --------- */
+
+   else
+   if (vex_streq(function_name, "armg_calculate_flag_c")) {
+
+      /* specialise calls to the "armg_calculate_flag_c" function.
+         Note that the returned value must be either 0 or 1; nonzero
+         bits 31:1 are not allowed.  In turn, incoming oldV and oldC
+         values (from the thunk) are assumed to have bits 31:1
+         clear. */
+      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+      vassert(arity == 4);
+      cc_op   = args[0]; /* ARMG_CC_OP_* */
+      cc_dep1 = args[1];
+      cc_dep2 = args[2];
+      cc_ndep = args[3];
+
+      if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
+         /* Thunk args are (result, shco, oldV) */
+         /* C after LOGIC --> shco */
+         return cc_dep2;
+      }
+
+      if (isU32(cc_op, ARMG_CC_OP_SUB)) {
+         /* Thunk args are (argL, argR, unused) */
+         /* C after SUB --> argL >=u argR
+                        --> argR <=u argL */
+         return unop(Iop_1Uto32,
+                     binop(Iop_CmpLE32U, cc_dep2, cc_dep1));
+      }
+
+      if (isU32(cc_op, ARMG_CC_OP_SBB)) {
+         /* This happens occasionally in softfloat code, eg __divdf3+140 */
+         /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+         /* C after SBB (same as HS after SBB above)
+            --> oldC ? (argL >=u argR) : (argL >u argR)
+            --> oldC ? (argR <=u argL) : (argR <u argL)
+         */
+         return
+            IRExpr_Mux0X(
+               unop(Iop_32to8, cc_ndep),
+               /* case oldC == 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1)),
+               /* case oldC != 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1))
+            );
+      }
+
+   }
+
+   /* --------- specialising "armg_calculate_flag_v" --------- */
+
+   else
+   if (vex_streq(function_name, "armg_calculate_flag_v")) {
+
+      /* specialise calls to the "armg_calculate_flag_v" function.
+         Note that the returned value must be either 0 or 1; nonzero
+         bits 31:1 are not allowed.  In turn, incoming oldV and oldC
+         values (from the thunk) are assumed to have bits 31:1
+         clear. */
+      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+      vassert(arity == 4);
+      cc_op   = args[0]; /* ARMG_CC_OP_* */
+      cc_dep1 = args[1];
+      cc_dep2 = args[2];
+      cc_ndep = args[3];
+
+      if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
+         /* Thunk args are (result, shco, oldV) */
+         /* V after LOGIC --> oldV */
+         return cc_ndep;
+      }
+
+      if (isU32(cc_op, ARMG_CC_OP_SBB)) {
+         /* This happens occasionally in softfloat code, eg __divdf3+140 */
+         /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+         /* V after SBB
+            --> let res = argL - argR - (oldC ^ 1)
+                in (argL ^ argR) & (argL ^ res) & 1
+         */
+         return
+            binop(
+               Iop_And32,
+               binop(
+                  Iop_And32,
+                  // argL ^ argR
+                  binop(Iop_Xor32, cc_dep1, cc_dep2),
+                  // argL ^ (argL - argR - (oldC ^ 1))
+                  binop(Iop_Xor32,
+                        cc_dep1,
+                        binop(Iop_Sub32,
+                              binop(Iop_Sub32, cc_dep1, cc_dep2),
+                              binop(Iop_Xor32, cc_ndep, mkU32(1)))
+                  )
+               ),
+               mkU32(1)
+            );
+      }
+
+   }
+
 #  undef unop
 #  undef binop
 #  undef mkU32
diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index 696c6cdeab..d9433881d1 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -1088,14 +1088,13 @@ static HChar* nCC ( ARMCondcode cond ) {
 static IRExpr* mk_armg_calculate_condition_dyn ( IRExpr* cond )
 {
    vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I32);
-   /* And 'cond' had better produce a value in which only bits 7:4
-      bits are nonzero.  However, obviously we can't assert for
-      that. */
+   /* And 'cond' had better produce a value in which only bits 7:4 are
+      nonzero.  However, obviously we can't assert for that. */
 
    /* So what we're constructing for the first argument is
-      "(cond << 4) | stored-operation-operation".  However,
-      as per comments above, must be supplied pre-shifted to this
-      function.
+      "(cond << 4) | stored-operation".
+      However, as per comments above, 'cond' must be supplied
+      pre-shifted to this function.
 
       This pairing scheme requires that the ARM_CC_OP_ values all fit
       in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
@@ -1700,6 +1699,12 @@ IRExpr* signed_overflow_after_Add32 ( IRExpr* resE,
 
    The calling convention for res and newC is a bit funny.  They could
    be passed by value, but instead are passed by ref.
+
+   The C (shco) value computed must be zero in bits 31:1, as the IR
+   optimisations for flag handling (guest_arm_spechelper) rely on
+   that, and the slow-path handlers (armg_calculate_flags_nzcv) assert
+   for it.  Same applies to all these functions that compute shco
+   after a shift or rotate, not just this one.
 */
 static void compute_result_and_C_after_LSL_by_imm5 (
@@ -1751,7 +1756,7 @@ static void compute_result_and_C_after_LSL_by_reg (
    /* mux0X(amt == 0,
             mux0X(amt < 32,
                   0,
-                  Rm[(32-amt) & 31])
+                  Rm[(32-amt) & 31]),
             oldC)
    */
    /* About the best you can do is pray that iropt is able
@@ -1767,16 +1772,19 @@ static void compute_result_and_C_after_LSL_by_reg (
            unop(Iop_1Uto8,
                 binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
            mkU32(0),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 unop(Iop_32to8,
-                      binop(Iop_And32,
-                            binop(Iop_Sub32,
-                                  mkU32(32),
-                                  mkexpr(amtT)),
-                            mkU32(31)
-                      )
-                 )
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       unop(Iop_32to8,
+                            binop(Iop_And32,
+                                  binop(Iop_Sub32,
+                                        mkU32(32),
+                                        mkexpr(amtT)),
+                                  mkU32(31)
+                            )
+                       )
+                 ),
+                 mkU32(1)
            )
         ),
         mkexpr(oldC)
@@ -1862,7 +1870,7 @@ static void compute_result_and_C_after_LSR_by_reg (
    /* mux0X(amt == 0,
             mux0X(amt < 32,
                   0,
-                  Rm[(amt-1) & 31])
+                  Rm[(amt-1) & 31]),
             oldC)
    */
    IRTemp oldC = newTemp(Ity_I32);
@@ -1876,16 +1884,19 @@ static void compute_result_and_C_after_LSR_by_reg (
            unop(Iop_1Uto8,
                 binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
            mkU32(0),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 unop(Iop_32to8,
-                      binop(Iop_And32,
-                            binop(Iop_Sub32,
-                                  mkexpr(amtT),
-                                  mkU32(1)),
-                            mkU32(31)
-                      )
-                 )
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       unop(Iop_32to8,
+                            binop(Iop_And32,
+                                  binop(Iop_Sub32,
+                                        mkexpr(amtT),
+                                        mkU32(1)),
+                                  mkU32(31)
+                            )
+                       )
+                 ),
+                 mkU32(1)
            )
        ),
         mkexpr(oldC)
@@ -1984,20 +1995,26 @@ static void compute_result_and_C_after_ASR_by_reg (
         IRExpr_Mux0X(
            unop(Iop_1Uto8,
                 binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 mkU8(31)
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       mkU8(31)
+                 ),
+                 mkU32(1)
            ),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 unop(Iop_32to8,
-                      binop(Iop_And32,
-                            binop(Iop_Sub32,
-                                  mkexpr(amtT),
-                                  mkU32(1)),
-                            mkU32(31)
-                      )
-                 )
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       unop(Iop_32to8,
+                            binop(Iop_And32,
+                                  binop(Iop_Sub32,
+                                        mkexpr(amtT),
+                                        mkU32(1)),
+                                  mkU32(31)
+                            )
+                       )
+                 ),
+                 mkU32(1)
            )
         ),
         mkexpr(oldC)
-- 
2.47.2
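
Notes (not part of the commit)
------------------------------

To make point (2) of the commit message concrete: Mux0X(cond8, expr0,
exprX) yields expr0 when cond8 is zero and exprX otherwise, and And32
is a plain bitwise AND.  Once the thunk guarantees that a stored
shco/oldV value x has bits 31:1 clear, x == And32(x,1), so both arms
of Mux0X(c, x, And32(x,1)) are the same expression and the whole
select collapses to x.  Here is a minimal standalone C sketch of that
argument; mux0x is an invented model of the IR primitive, not a VEX
function:

    #include <assert.h>
    #include <stdint.h>

    /* Model of IRExpr_Mux0X: yields expr0 when cond8 is zero. */
    static uint32_t mux0x ( uint8_t cond8, uint32_t expr0, uint32_t exprX )
    {
       return cond8 == 0 ? expr0 : exprX;
    }

    int main ( void )
    {
       /* Under the new invariant a stored flag value is only 0 or 1,
          so the And32(x,1) arm equals x and the select is dead. */
       for (uint32_t x = 0; x <= 1; x++)
          for (uint8_t c = 0; c <= 1; c++)
             assert(mux0x(c, x, x & 1) == x);
       return 0;
    }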
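The new HS-after-SBB and C-after-SBB rules rest on the identity that
ARM's SBC sets C to NOT(borrow), which equals
oldC ? (argL >=u argR) : (argL >u argR).  The following standalone C
sketch checks that identity against a 64-bit-wide reference
computation; the helper names are invented for this sketch and are
not VEX functions:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Reference: res = argL - argR - (oldC ^ 1), computed 64 bits
       wide; C is NOT(borrow), i.e. 1 iff the true result is
       non-negative. */
    static uint32_t sbc_carry_ref ( uint32_t argL, uint32_t argR,
                                    uint32_t oldC )
    {
       uint64_t wide = (uint64_t)argL - (uint64_t)argR
                       - (uint64_t)(oldC ^ 1);
       return ((wide >> 63) & 1) ? 0 : 1;
    }

    /* The comparison form used by the spec rules. */
    static uint32_t sbc_carry_spec ( uint32_t argL, uint32_t argR,
                                     uint32_t oldC )
    {
       return oldC ? (argL >= argR) : (argL > argR);
    }

    int main ( void )
    {
       for (int i = 0; i < 10*1000*1000; i++) {
          uint32_t argL = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
          uint32_t argR = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
          uint32_t oldC = (uint32_t)rand() & 1;
          assert(sbc_carry_ref(argL, argR, oldC)
                 == sbc_carry_spec(argL, argR, oldC));
       }
       return 0;
    }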
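The masking added in guest_arm_toIR.c narrows each shifter carry out
to bit 0 before it reaches the thunk.  For LSL by register the masked
value is Rm[(32-amt) & 31], guarded by the two selects shown in the
mux0X comment above the code.  This last sketch models the patched
computation in plain C and checks it against a 64-bit wide-shift
reference; the function names are again invented for illustration:

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors the IR: the "& 1" narrowing, the (32-amt) & 31 fold,
       and the selects mux0X(amt == 0, mux0X(amt <= 32, 0, ...), oldC). */
    static uint32_t lsl_carry_ir ( uint32_t rm, uint32_t amt,
                                   uint32_t oldC )
    {
       uint32_t bit = (rm >> ((32 - amt) & 31)) & 1;
       uint32_t nz  = (amt <= 32) ? bit : 0;
       return (amt == 0) ? oldC : nz;
    }

    /* Reference: for 1 <= amt <= 63 the carry out of "rm << amt" is
       bit 32 of the 64-bit shift result; 0 for larger amounts. */
    static uint32_t lsl_carry_ref ( uint32_t rm, uint32_t amt,
                                    uint32_t oldC )
    {
       if (amt == 0) return oldC;
       if (amt > 63) return 0;
       return (uint32_t)((((uint64_t)rm) << amt) >> 32) & 1;
    }

    int main ( void )
    {
       const uint32_t rms[]
          = { 0, 1, 0x80000000u, 0xFFFFFFFFu, 0x12345678u };
       for (int i = 0; i < 5; i++)
          for (uint32_t amt = 0; amt < 256; amt++)  /* Rs is 8 bits */
             for (uint32_t oldC = 0; oldC <= 1; oldC++)
                assert(lsl_carry_ir(rms[i], amt, oldC)
                       == lsl_carry_ref(rms[i], amt, oldC));
       return 0;
    }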