From fd9ffff05bcda961da13d1212a466d591e558fbe Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Sun, 1 May 2011 18:47:10 +0000
Subject: [PATCH] Improvements to condition code handling on ARM.

(1) guest_arm_spechelper: add another spec rule for
    armg_calculate_condition.  Add spec rules for
    armg_calculate_flag_c and armg_calculate_flag_v.

(2) guest_arm_toIR.c: when storing oldC (shifter carry out) and oldV
    values in the thunk, ensure the top 31 bits are zero.  This
    improves the effectiveness of the new spec rules in (1): it avoids
    situations where we have Mux0X(c, x, And32(x,1)) in which x in
    fact has bits 31:1 zero, yet iropt cannot fold the And32 away.
    With that invariant in place, the spec rules need not generate the
    unnecessary And32(x,1) at all; the expression becomes
    Mux0X(c, x, x), which iropt can reduce simply to "x".

(3) armg_calculate_flags_nzcv: assert the invariant from (2) wherever
    possible.

These changes drastically improve --tool=none performance in some
cases (eg, perf/fbench with softfloat doubles in speed).

git-svn-id: svn://svn.valgrind.org/vex/trunk@2138
---
 VEX/priv/guest_arm_defs.h    |  14 ++--
 VEX/priv/guest_arm_helpers.c | 157 ++++++++++++++++++++++++++++++++---
 VEX/priv/guest_arm_toIR.c    |  99 +++++++++++++---
 3 files changed, 212 insertions(+), 58 deletions(-)

diff --git a/VEX/priv/guest_arm_defs.h b/VEX/priv/guest_arm_defs.h
index 02078c4595..8724d8edb5 100644
--- a/VEX/priv/guest_arm_defs.h
+++ b/VEX/priv/guest_arm_defs.h
@@ -148,6 +148,10 @@ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
    that the definedness of the stored flags always depends on all 3
    DEP values.
 
+   Fields carrying only 1 or 2 bits of useful information (old_C,
+   shifter_co, old_V, oldC:oldV) must have their top 31 or 30 bits
+   (respectively) zero.  The text "31x0:" or "30x0:" denotes this.
+
    A summary of the field usages is:
 
    OP                DEP1              DEP2              DEP3
@@ -156,11 +160,11 @@ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
    OP_COPY           current NZCV      unused            unused
    OP_ADD            argL              argR              unused
    OP_SUB            argL              argR              unused
-   OP_ADC            argL              argR              old_C
-   OP_SBB            argL              argR              old_C
-   OP_LOGIC          result            shifter_co        old_V
-   OP_MUL            result            unused            old_C:old_V
-   OP_MULL           resLO32           resHI32           old_C:old_V
+   OP_ADC            argL              argR              31x0:old_C
+   OP_SBB            argL              argR              31x0:old_C
+   OP_LOGIC          result            31x0:shifter_co   31x0:old_V
+   OP_MUL            result            unused            30x0:old_C:old_V
+   OP_MULL           resLO32           resHI32           30x0:old_C:old_V
 */
 
 enum {
diff --git a/VEX/priv/guest_arm_helpers.c b/VEX/priv/guest_arm_helpers.c
index 93f81150d5..96d5824377 100644
--- a/VEX/priv/guest_arm_helpers.c
+++ b/VEX/priv/guest_arm_helpers.c
@@ -110,6 +110,7 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt argL = cc_dep1;
          UInt argR = cc_dep2;
          UInt oldC = cc_dep3;
+         vassert((oldC & ~1) == 0);
          UInt res  = (argL + argR) + oldC;
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
@@ -127,6 +128,7 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt argL = cc_dep1;
          UInt argR = cc_dep2;
          UInt oldC = cc_dep3;
+         vassert((oldC & ~1) == 0);
          UInt res  = argL - argR - (oldC ^ 1);
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
@@ -143,11 +145,13 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          /* (res, shco, oldV) */
          UInt res  = cc_dep1;
          UInt shco = cc_dep2;
+         vassert((shco & ~1) == 0);
          UInt oldV = cc_dep3;
+         vassert((oldV & ~1) == 0);
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
-         UInt cf   = lshift( shco & 1, ARMG_CC_SHIFT_C );
-         UInt vf   = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+         UInt cf   = lshift( shco, ARMG_CC_SHIFT_C );
+         UInt vf   = lshift( oldV, ARMG_CC_SHIFT_V );
          return nf | zf | cf | vf;
       }
       case ARMG_CC_OP_MUL: {
@@ -155,10 +159,11 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt res  = cc_dep1;
          UInt oldC = (cc_dep3 >> 1) & 1;
          UInt oldV = (cc_dep3 >> 0) & 1;
+         vassert((cc_dep3 & ~3) == 0);
          UInt nf   = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf   = lshift( res == 0, ARMG_CC_SHIFT_Z );
-         UInt cf   = lshift( oldC & 1, ARMG_CC_SHIFT_C );
-         UInt vf   = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+         UInt cf   = lshift( oldC, ARMG_CC_SHIFT_C );
+         UInt vf   = lshift( oldV, ARMG_CC_SHIFT_V );
          return nf | zf | cf | vf;
       }
       case ARMG_CC_OP_MULL: {
@@ -167,10 +172,11 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
          UInt resHi32 = cc_dep2;
          UInt oldC    = (cc_dep3 >> 1) & 1;
          UInt oldV    = (cc_dep3 >> 0) & 1;
+         vassert((cc_dep3 & ~3) == 0);
          UInt nf  = lshift( resHi32 & (1<<31), ARMG_CC_SHIFT_N - 31 );
          UInt zf  = lshift( (resHi32|resLo32) == 0, ARMG_CC_SHIFT_Z );
-         UInt cf  = lshift( oldC & 1, ARMG_CC_SHIFT_C );
-         UInt vf  = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+         UInt cf  = lshift( oldC, ARMG_CC_SHIFT_C );
+         UInt vf  = lshift( oldV, ARMG_CC_SHIFT_V );
          return nf | zf | cf | vf;
       }
       default:
@@ -185,7 +191,7 @@ UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate the C flag from the thunk components, in the lowest bit
-   of the word (bit 0). */
+   of the word (bit 0).  Bits 31:1 of the returned value are zero. */
 UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
                              UInt cc_dep2, UInt cc_dep3 )
 {
@@ -196,7 +202,7 @@ UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
 
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate the V flag from the thunk components, in the lowest bit
-   of the word (bit 0). */
+   of the word (bit 0).  Bits 31:1 of the returned value are zero. */
 UInt armg_calculate_flag_v ( UInt cc_op, UInt cc_dep1,
                              UInt cc_dep2, UInt cc_dep3 )
 {
@@ -221,7 +227,7 @@ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
 /* Calculate the specified condition from the thunk components, in the
    lowest bit of the word (bit 0). */
 extern
-UInt armg_calculate_condition ( UInt cond_n_op /* ARMCondcode << 4 | cc_op */,
+UInt armg_calculate_condition ( UInt cond_n_op /* (ARMCondcode << 4) | cc_op */,
                                 UInt cc_dep1,
                                 UInt cc_dep2, UInt cc_dep3 )
 {
@@ -332,12 +338,17 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
 
    /* --------- specialising "armg_calculate_condition" --------- */
 
    if (vex_streq(function_name, "armg_calculate_condition")) {
-      /* specialise calls to above "armg_calculate condition" function */
-      IRExpr *cond_n_op, *cc_dep1, *cc_dep2;
+
+      /* specialise calls to the "armg_calculate_condition" function.
+         Not sure whether this is strictly necessary, but: the
+         replacement IR must produce only the values 0 or 1.  Bits
+         31:1 are required to be zero. */
+      IRExpr *cond_n_op, *cc_dep1, *cc_dep2, *cc_ndep;
       vassert(arity == 4);
-      cond_n_op = args[0]; /* ARMCondcode << 4 | ARMG_CC_OP_* */
+      cond_n_op = args[0]; /* (ARMCondcode << 4) | ARMG_CC_OP_* */
       cc_dep1   = args[1];
       cc_dep2   = args[2];
+      cc_ndep   = args[3];
 
       /*---------------- SUB ----------------*/
 
@@ -384,7 +395,27 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
                      binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
       }
 
+      /*---------------- SBB ----------------*/
+
+      if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SBB)) {
+         /* This seems to happen a lot in softfloat code, eg __divdf3+140 */
+         /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+         /* HS after SBB (same as C after SBB below)
+            --> oldC ? (argL >=u argR) : (argL >u argR)
+            --> oldC ? (argR <=u argL) : (argR <u argL)
+         */
+         return
+            IRExpr_Mux0X(
+               unop(Iop_32to8, cc_ndep),
+               /* case oldC == 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1)),
+               /* case oldC != 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1))
+            );
+      }
+
       /*---------------- LOGIC ----------------*/
 
       if (isU32(cond_n_op, (ARMCondEQ << 4) | ARMG_CC_OP_LOGIC)) {
          /* EQ after LOGIC --> test res == 0 */
          return unop(Iop_1Uto32,
@@ -397,6 +428,7 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
       }
 
       /*----------------- AL -----------------*/
+
       /* A critically important case for Thumb code.
 
          What we're trying to spot is the case where cond_n_op is an
@@ -443,6 +475,107 @@ IRExpr* guest_arm_spechelper ( HChar* function_name,
          }
       }
 
+   /* --------- specialising "armg_calculate_flag_c" --------- */
+
+   else
+   if (vex_streq(function_name, "armg_calculate_flag_c")) {
+
+      /* specialise calls to the "armg_calculate_flag_c" function.
+         Note that the returned value must be either 0 or 1; nonzero
+         bits 31:1 are not allowed.  In turn, incoming oldV and oldC
+         values (from the thunk) are assumed to have bits 31:1
+         clear. */
+      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+      vassert(arity == 4);
+      cc_op   = args[0]; /* ARMG_CC_OP_* */
+      cc_dep1 = args[1];
+      cc_dep2 = args[2];
+      cc_ndep = args[3];
+
+      if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
+         /* Thunk args are (result, shco, oldV) */
+         /* C after LOGIC --> shco */
+         return cc_dep2;
+      }
+
+      if (isU32(cc_op, ARMG_CC_OP_SUB)) {
+         /* Thunk args are (argL, argR, unused) */
+         /* C after SUB --> argL >=u argR
+                        --> argR <=u argL */
+         return unop(Iop_1Uto32,
+                     binop(Iop_CmpLE32U, cc_dep2, cc_dep1));
+      }
+
+      if (isU32(cc_op, ARMG_CC_OP_SBB)) {
+         /* This happens occasionally in softfloat code, eg __divdf3+140 */
+         /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+         /* C after SBB (same as HS after SBB above)
+            --> oldC ? (argL >=u argR) : (argL >u argR)
+            --> oldC ? (argR <=u argL) : (argR <u argL)
+         */
+         return
+            IRExpr_Mux0X(
+               unop(Iop_32to8, cc_ndep),
+               /* case oldC == 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1)),
+               /* case oldC != 0 */
+               unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1))
+            );
+      }
+
+   }
+
+   /* --------- specialising "armg_calculate_flag_v" --------- */
+
+   else
+   if (vex_streq(function_name, "armg_calculate_flag_v")) {
+
+      /* specialise calls to the "armg_calculate_flag_v" function.
+         Note that the returned value must be either 0 or 1; nonzero
+         bits 31:1 are not allowed.  In turn, incoming oldV and oldC
+         values (from the thunk) are assumed to have bits 31:1
+         clear. */
+      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+      vassert(arity == 4);
+      cc_op   = args[0]; /* ARMG_CC_OP_* */
+      cc_dep1 = args[1];
+      cc_dep2 = args[2];
+      cc_ndep = args[3];
+
+      if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
+         /* Thunk args are (result, shco, oldV) */
+         /* V after LOGIC --> oldV */
+         return cc_ndep;
+      }
+
+      if (isU32(cc_op, ARMG_CC_OP_SBB)) {
+         /* This happens occasionally in softfloat code, eg __divdf3+140 */
+         /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+         /* V after SBB
+            --> let res = argL - argR - (oldC ^ 1)
+                in (argL ^ argR) & (argL ^ res) & 1
+         */
+         return
+            binop(
+               Iop_And32,
+               binop(
+                  Iop_And32,
+                  // argL ^ argR
+                  binop(Iop_Xor32, cc_dep1, cc_dep2),
+                  // argL ^ (argL - argR - (oldC ^ 1))
+                  binop(Iop_Xor32,
+                        cc_dep1,
+                        binop(Iop_Sub32,
+                              binop(Iop_Sub32, cc_dep1, cc_dep2),
+                              binop(Iop_Xor32, cc_ndep, mkU32(1)))
+                  )
+               ),
+               mkU32(1)
+            );
+      }
+
+   }
+
 #  undef unop
 #  undef binop
 #  undef mkU32
diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index 696c6cdeab..d9433881d1 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -1088,14 +1088,13 @@ static HChar* nCC ( ARMCondcode cond ) {
 static IRExpr* mk_armg_calculate_condition_dyn ( IRExpr* cond )
 {
    vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I32);
-   /* And 'cond' had better produce a value in which only bits 7:4
-      bits are nonzero.  However, obviously we can't assert for
-      that. */
+   /* And 'cond' had better produce a value in which only bits 7:4 are
+      nonzero.  However, obviously we can't assert for that. */
 
    /* So what we're constructing for the first argument is
-      "(cond << 4) | stored-operation-operation".  However,
-      as per comments above, must be supplied pre-shifted to this
-      function.
+      "(cond << 4) | stored-operation".
+      However, as per comments above, 'cond' must be supplied
+      pre-shifted to this function.
 
       This pairing scheme requires that the ARM_CC_OP_ values all fit
       in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
@@ -1700,6 +1699,12 @@ IRExpr* signed_overflow_after_Add32 ( IRExpr* resE,
 
    The calling convention for res and newC is a bit funny.  They could
    be passed by value, but instead are passed by ref.
+
+   The C (shco) value computed must be zero in bits 31:1, as the IR
+   optimisations for flag handling (guest_arm_spechelper) rely on
+   that, and the slow-path handlers (armg_calculate_flags_nzcv) assert
+   for it.  Same applies to all these functions that compute shco
+   after a shift or rotate, not just this one.
 */
 static void compute_result_and_C_after_LSL_by_imm5 (
@@ -1751,7 +1756,7 @@ static void compute_result_and_C_after_LSL_by_reg (
    /* mux0X(amt == 0,
             mux0X(amt < 32,
                   0,
-                  Rm[(32-amt) & 31])
+                  Rm[(32-amt) & 31]),
             oldC)
    */
    /* About the best you can do is pray that iropt is able
@@ -1767,16 +1772,19 @@ static void compute_result_and_C_after_LSL_by_reg (
            unop(Iop_1Uto8,
                 binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
            mkU32(0),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 unop(Iop_32to8,
-                      binop(Iop_And32,
-                            binop(Iop_Sub32,
-                                  mkU32(32),
-                                  mkexpr(amtT)),
-                            mkU32(31)
-                      )
-                 )
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       unop(Iop_32to8,
+                            binop(Iop_And32,
+                                  binop(Iop_Sub32,
+                                        mkU32(32),
+                                        mkexpr(amtT)),
+                                  mkU32(31)
+                            )
+                       )
+                 ),
+                 mkU32(1)
            )
         ),
         mkexpr(oldC)
@@ -1862,7 +1870,7 @@ static void compute_result_and_C_after_LSR_by_reg (
    /* mux0X(amt == 0,
             mux0X(amt < 32,
                   0,
-                  Rm[(amt-1) & 31])
+                  Rm[(amt-1) & 31]),
             oldC)
    */
    IRTemp oldC = newTemp(Ity_I32);
@@ -1876,16 +1884,19 @@ static void compute_result_and_C_after_LSR_by_reg (
            unop(Iop_1Uto8,
                 binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
            mkU32(0),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 unop(Iop_32to8,
-                      binop(Iop_And32,
-                            binop(Iop_Sub32,
-                                  mkexpr(amtT),
-                                  mkU32(1)),
-                            mkU32(31)
-                      )
-                 )
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       unop(Iop_32to8,
+                            binop(Iop_And32,
+                                  binop(Iop_Sub32,
+                                        mkexpr(amtT),
+                                        mkU32(1)),
+                                  mkU32(31)
+                            )
+                       )
+                 ),
+                 mkU32(1)
            )
        ),
         mkexpr(oldC)
@@ -1984,20 +1995,26 @@ static void compute_result_and_C_after_ASR_by_reg (
         IRExpr_Mux0X(
            unop(Iop_1Uto8,
                 binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 mkU8(31)
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       mkU8(31)
+                 ),
+                 mkU32(1)
            ),
-           binop(Iop_Shr32,
-                 mkexpr(rMt),
-                 unop(Iop_32to8,
-                      binop(Iop_And32,
-                            binop(Iop_Sub32,
-                                  mkexpr(amtT),
-                                  mkU32(1)),
-                            mkU32(31)
-                      )
-                 )
+           binop(Iop_And32,
+                 binop(Iop_Shr32,
+                       mkexpr(rMt),
+                       unop(Iop_32to8,
+                            binop(Iop_And32,
+                                  binop(Iop_Sub32,
+                                        mkexpr(amtT),
+                                        mkU32(1)),
+                                  mkU32(31)
+                            )
+                       )
+                 ),
+                 mkU32(1)
            )
         ),
         mkexpr(oldC)
-- 
2.47.2
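
Notes (not part of the commit)
------------------------------

To make point (2) of the commit message concrete: Mux0X(cond8, expr0,
exprX) yields expr0 when cond8 is zero and exprX otherwise, and And32
is a plain bitwise AND.  Once the thunk guarantees that a stored
shco/oldV value x has bits 31:1 clear, x == And32(x,1), so both arms
of Mux0X(c, x, And32(x,1)) are the same expression and the whole
select collapses to x.  Here is a minimal standalone C sketch of that
argument; mux0x is an invented model of the IR primitive, not a VEX
function:

    #include <assert.h>
    #include <stdint.h>

    /* Model of IRExpr_Mux0X: yields expr0 when cond8 is zero. */
    static uint32_t mux0x ( uint8_t cond8, uint32_t expr0, uint32_t exprX )
    {
       return cond8 == 0 ? expr0 : exprX;
    }

    int main ( void )
    {
       /* Under the new invariant a stored flag value is only 0 or 1,
          so the And32(x,1) arm equals x and the select is dead. */
       for (uint32_t x = 0; x <= 1; x++)
          for (uint8_t c = 0; c <= 1; c++)
             assert(mux0x(c, x, x & 1) == x);
       return 0;
    }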
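The new HS-after-SBB and C-after-SBB rules rest on the identity that
ARM's SBC sets C to NOT(borrow), which equals
oldC ? (argL >=u argR) : (argL >u argR).  The following standalone C
sketch checks that identity against a 64-bit-wide reference
computation; the helper names are invented for this sketch and are
not VEX functions:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Reference: res = argL - argR - (oldC ^ 1), computed 64 bits
       wide; C is NOT(borrow), i.e. 1 iff the true result is
       non-negative. */
    static uint32_t sbc_carry_ref ( uint32_t argL, uint32_t argR,
                                    uint32_t oldC )
    {
       uint64_t wide = (uint64_t)argL - (uint64_t)argR
                       - (uint64_t)(oldC ^ 1);
       return ((wide >> 63) & 1) ? 0 : 1;
    }

    /* The comparison form used by the spec rules. */
    static uint32_t sbc_carry_spec ( uint32_t argL, uint32_t argR,
                                     uint32_t oldC )
    {
       return oldC ? (argL >= argR) : (argL > argR);
    }

    int main ( void )
    {
       for (int i = 0; i < 10*1000*1000; i++) {
          uint32_t argL = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
          uint32_t argR = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
          uint32_t oldC = (uint32_t)rand() & 1;
          assert(sbc_carry_ref(argL, argR, oldC)
                 == sbc_carry_spec(argL, argR, oldC));
       }
       return 0;
    }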
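The masking added in guest_arm_toIR.c narrows each shifter carry out
to bit 0 before it reaches the thunk.  For LSL by register the masked
value is Rm[(32-amt) & 31], guarded by the two selects shown in the
mux0X comment above the code.  This last sketch models the patched
computation in plain C and checks it against a 64-bit wide-shift
reference; the function names are again invented for illustration:

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors the IR: the "& 1" narrowing, the (32-amt) & 31 fold,
       and the selects mux0X(amt == 0, mux0X(amt <= 32, 0, ...), oldC). */
    static uint32_t lsl_carry_ir ( uint32_t rm, uint32_t amt,
                                   uint32_t oldC )
    {
       uint32_t bit = (rm >> ((32 - amt) & 31)) & 1;
       uint32_t nz  = (amt <= 32) ? bit : 0;
       return (amt == 0) ? oldC : nz;
    }

    /* Reference: for 1 <= amt <= 63 the carry out of "rm << amt" is
       bit 32 of the 64-bit shift result; 0 for larger amounts. */
    static uint32_t lsl_carry_ref ( uint32_t rm, uint32_t amt,
                                    uint32_t oldC )
    {
       if (amt == 0) return oldC;
       if (amt > 63) return 0;
       return (uint32_t)((((uint64_t)rm) << amt) >> 32) & 1;
    }

    int main ( void )
    {
       const uint32_t rms[]
          = { 0, 1, 0x80000000u, 0xFFFFFFFFu, 0x12345678u };
       for (int i = 0; i < 5; i++)
          for (uint32_t amt = 0; amt < 256; amt++)  /* Rs is 8 bits */
             for (uint32_t oldC = 0; oldC <= 1; oldC++)
                assert(lsl_carry_ir(rms[i], amt, oldC)
                       == lsl_carry_ref(rms[i], amt, oldC));
       return 0;
    }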