author     Julian Seward <jseward@acm.org>
           Sun, 5 Jun 2011 17:56:03 +0000 (17:56 +0000)
committer  Julian Seward <jseward@acm.org>
           Sun, 5 Jun 2011 17:56:03 +0000 (17:56 +0000)

Improvements to code generation for 32 bit instructions. When
appropriate, generate 32 bit add/sub/and/or/xor/cmp, so as to avoid a
bunch of cases where previously values would have been widened to 64
bits, or shifted left 32 bits, before being used.  Reduces the size of
the generated code by up to 2.8%.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2156
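
For example (first hunk below), the Z-after-SUBL specialisation changes
from a pair of 64-bit shifts feeding a 64-bit compare to a direct
compare of the low halves:

    before:  1Uto64( CmpEQ64( Shl64(cc_dep1,32), Shl64(cc_dep2,32) ) )
    after:   1Uto64( CmpEQ32( 64to32(cc_dep1), 64to32(cc_dep2) ) )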

VEX/priv/guest_amd64_helpers.c
VEX/priv/host_amd64_defs.c
VEX/priv/host_amd64_defs.h
VEX/priv/host_amd64_isel.c

diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c
index cea850f42373256fa4d6b1c57efb9dcc085d9044..9375f1db9740d876f25c74339dec77e3325702fe 100644
@@ -877,6 +877,7 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
 #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
 #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
 #  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
+#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
 #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
 
    Int i, arity = 0;
@@ -959,34 +960,34 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
          /* long sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpEQ64, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep2,mkU8(32))));
+                     binop(Iop_CmpEQ32,
+                           unop(Iop_64to32, cc_dep1),
+                           unop(Iop_64to32, cc_dep2)));
       }
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
          /* long sub/cmp, then NZ --> test dst!=src */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpNE64, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep2,mkU8(32))));
+                     binop(Iop_CmpNE32,
+                           unop(Iop_64to32, cc_dep1),
+                           unop(Iop_64to32, cc_dep2)));
       }
 
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
          /* long sub/cmp, then L (signed less than) 
             --> test dst <s src */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLT64S, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep2,mkU8(32))));
+                     binop(Iop_CmpLT32S,
+                           unop(Iop_64to32, cc_dep1),
+                           unop(Iop_64to32, cc_dep2)));
       }
 
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
          /* long sub/cmp, then LE (signed less than or equal) 
             --> test dst <=s src */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLE64S, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep2,mkU8(32))));
+                     binop(Iop_CmpLE32S,
+                           unop(Iop_64to32, cc_dep1),
+                           unop(Iop_64to32, cc_dep2)));
 
       }
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
@@ -995,9 +996,9 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
             --> test (dst >s src)
             --> test (src <s dst) */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLT64S,
-                           binop(Iop_Shl64,cc_dep2,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep1,mkU8(32))));
+                     binop(Iop_CmpLT32S,
+                           unop(Iop_64to32, cc_dep2),
+                           unop(Iop_64to32, cc_dep1)));
 
       }
 
@@ -1005,28 +1006,28 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
          /* long sub/cmp, then BE (unsigned less than or equal)
             --> test dst <=u src */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLE64U, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep2,mkU8(32))));
+                     binop(Iop_CmpLE32U, 
+                           unop(Iop_64to32, cc_dep1),
+                           unop(Iop_64to32, cc_dep2)));
       }
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
          /* long sub/cmp, then NBE (unsigned greater than)
             --> test src <u dst */
          /* Note, args are opposite way round from the usual */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLT64U, 
-                           binop(Iop_Shl64,cc_dep2,mkU8(32)),
-                           binop(Iop_Shl64,cc_dep1,mkU8(32))));
+                     binop(Iop_CmpLT32U, 
+                           unop(Iop_64to32, cc_dep2),
+                           unop(Iop_64to32, cc_dep1)));
       }
 
       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
          /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLT64S,
-                           binop(Iop_Sub64,
-                                 binop(Iop_Shl64, cc_dep1, mkU8(32)), 
-                                 binop(Iop_Shl64, cc_dep2, mkU8(32))),
-                           mkU64(0)));
+                     binop(Iop_CmpLT32S,
+                           binop(Iop_Sub32,
+                                 unop(Iop_64to32, cc_dep1), 
+                                 unop(Iop_64to32, cc_dep2)),
+                           mkU32(0)));
       }
 
       /*---------------- SUBW ----------------*/
@@ -1126,17 +1127,17 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
          /* long and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpEQ64, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)), 
-                           mkU64(0)));
+                     binop(Iop_CmpEQ32,
+                           unop(Iop_64to32, cc_dep1), 
+                           mkU32(0)));
       }
 
       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
          /* long and/or/xor, then NZ --> test dst!=0 */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpNE64, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)), 
-                           mkU64(0)));
+                     binop(Iop_CmpNE32,
+                           unop(Iop_64to32, cc_dep1), 
+                           mkU32(0)));
       }
 
       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
@@ -1147,9 +1148,9 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
             the result is <=signed 0.  Hence ...
          */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLE64S, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)), 
-                           mkU64(0)));
+                     binop(Iop_CmpLE32S,
+                           unop(Iop_64to32, cc_dep1), 
+                           mkU32(0)));
       }
 
       /*---------------- LOGICB ----------------*/
@@ -1214,9 +1215,9 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
          /* dec L, then Z --> test dst == 0 */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpEQ64,
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
-                           mkU64(0)));
+                     binop(Iop_CmpEQ32,
+                           unop(Iop_64to32, cc_dep1),
+                           mkU32(0)));
       }
 
       /*---------------- DECW ----------------*/
@@ -1337,9 +1338,9 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
          /* C after sub denotes unsigned less than */
          return unop(Iop_1Uto64,
-                     binop(Iop_CmpLT64U, 
-                           binop(Iop_Shl64,cc_dep1,mkU8(32)), 
-                           binop(Iop_Shl64,cc_dep2,mkU8(32))));
+                     binop(Iop_CmpLT32U,
+                           unop(Iop_64to32, cc_dep1), 
+                           unop(Iop_64to32, cc_dep2)));
       }
       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
          /* C after sub denotes unsigned less than */
@@ -1373,6 +1374,7 @@ IRExpr* guest_amd64_spechelper ( HChar* function_name,
 #  undef unop
 #  undef binop
 #  undef mkU64
+#  undef mkU32
 #  undef mkU8
 
    return NULL;
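
The rewrites above rely on (x << 32) and (y << 32) ordering at 64 bits
exactly as the low 32 bits of x and y do, for equality and for signed
and unsigned comparison alike.  A stand-alone C99 check of that
equivalence (not part of the commit; all names here are ours):

    #include <assert.h>
    #include <stdint.h>

    /* Old idiom: shift left 32 bits, compare at 64 bits. */
    static int eq_old  (uint64_t a, uint64_t b) { return (a << 32) == (b << 32); }
    static int lts_old (uint64_t a, uint64_t b)
                       { return (int64_t)(a << 32) < (int64_t)(b << 32); }
    static int ltu_old (uint64_t a, uint64_t b) { return (a << 32) < (b << 32); }

    /* New idiom: narrow to 32 bits, compare directly. */
    static int eq_new  (uint64_t a, uint64_t b) { return (uint32_t)a == (uint32_t)b; }
    static int lts_new (uint64_t a, uint64_t b) { return (int32_t)a  < (int32_t)b; }
    static int ltu_new (uint64_t a, uint64_t b) { return (uint32_t)a < (uint32_t)b; }

    int main ( void )
    {
       /* Edge cases around the 32-bit sign bit, plus junk in the top half. */
       uint64_t vs[6] = { 0, 1, 0x7FFFFFFFULL, 0x80000000ULL,
                          0xFFFFFFFFULL, 0xDEAD0000FFFFFFFFULL };
       for (int i = 0; i < 6; i++)
          for (int j = 0; j < 6; j++) {
             assert(eq_old (vs[i], vs[j]) == eq_new (vs[i], vs[j]));
             assert(lts_old(vs[i], vs[j]) == lts_new(vs[i], vs[j]));
             assert(ltu_old(vs[i], vs[j]) == ltu_new(vs[i], vs[j]));
          }
       return 0;
    }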
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index 90119f2b172d596f5603a93d1f9177a3a18989ce..834a4aad911e858d879ec6ccbc96b84e869bfc00 100644
@@ -314,13 +314,16 @@ AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
    return op;
 }
 
-void ppAMD64RMI ( AMD64RMI* op ) {
+static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
    switch (op->tag) {
       case Armi_Imm: 
          vex_printf("$0x%x", op->Armi.Imm.imm32);
          return;
-      case Armi_Reg: 
-         ppHRegAMD64(op->Armi.Reg.reg);
+      case Armi_Reg:
+         if (lo32)
+            ppHRegAMD64_lo32(op->Armi.Reg.reg);
+         else
+            ppHRegAMD64(op->Armi.Reg.reg);
          return;
       case Armi_Mem: 
          ppAMD64AMode(op->Armi.Mem.am);
@@ -329,6 +332,12 @@ void ppAMD64RMI ( AMD64RMI* op ) {
          vpanic("ppAMD64RMI");
    }
 }
+void ppAMD64RMI ( AMD64RMI* op ) {
+   ppAMD64RMI_wrk(op, False/*!lo32*/);
+}
+void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
+   ppAMD64RMI_wrk(op, True/*lo32*/);
+}
 
 /* An AMD64RMI can only be used in a "read" context (what would it mean
    to write or modify a literal?) and so we enumerate its registers
@@ -679,6 +688,19 @@ AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
    i->Ain.Lea64.dst   = dst;
    return i;
 }
+AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
+   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag            = Ain_Alu32R;
+   i->Ain.Alu32R.op  = op;
+   i->Ain.Alu32R.src = src;
+   i->Ain.Alu32R.dst = dst;
+   switch (op) {
+      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
+      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
+      default: vassert(0);
+   }
+   return i;
+}
 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    i->tag            = Ain_MulL;
@@ -1083,6 +1105,12 @@ void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
          vex_printf(",");
          ppHRegAMD64(i->Ain.Lea64.dst);
          return;
+      case Ain_Alu32R:
+         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
+         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
+         vex_printf(",");
+         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
+         return;
       case Ain_MulL:
          vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
          ppAMD64RM(i->Ain.MulL.src);
@@ -1423,6 +1451,15 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
          return;
+      case Ain_Alu32R:
+         vassert(i->Ain.Alu32R.op != Aalu_MOV);
+         addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
+         if (i->Ain.Alu32R.op == Aalu_CMP) { 
+            addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
+            return;
+         }
+         addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
+         return;
       case Ain_MulL:
          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
          addHRegUse(u, HRmModify, hregAMD64_RAX());
@@ -1719,6 +1756,10 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
          mapReg(m, &i->Ain.Lea64.dst);
          return;
+      case Ain_Alu32R:
+         mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
+         mapReg(m, &i->Ain.Alu32R.dst);
+         return;
       case Ain_MulL:
          mapRegs_AMD64RM(m, i->Ain.MulL.src);
          return;
@@ -2586,6 +2627,69 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i,
       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
       goto done;
 
+   case Ain_Alu32R:
+      /* ADD/SUB/AND/OR/XOR/CMP */
+      opc = opc_rr = subopc_imm = opc_imma = 0;
+      switch (i->Ain.Alu32R.op) {
+         case Aalu_ADD: opc = 0x03; opc_rr = 0x01; 
+                        subopc_imm = 0; opc_imma = 0x05; break;
+         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29; 
+                        subopc_imm = 5; opc_imma = 0x2D; break;
+         case Aalu_AND: opc = 0x23; opc_rr = 0x21; 
+                        subopc_imm = 4; opc_imma = 0x25; break;
+         case Aalu_XOR: opc = 0x33; opc_rr = 0x31; 
+                        subopc_imm = 6; opc_imma = 0x35; break;
+         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09; 
+                        subopc_imm = 1; opc_imma = 0x0D; break;
+         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39; 
+                        subopc_imm = 7; opc_imma = 0x3D; break;
+         default: goto bad;
+      }
+      switch (i->Ain.Alu32R.src->tag) {
+         case Armi_Imm:
+            if (i->Ain.Alu32R.dst == hregAMD64_RAX()
+                && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
+               goto bad; /* FIXME: awaiting test case */
+               *p++ = toUChar(opc_imma);
+               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
+            } else
+            if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
+               rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst ) );
+               if (rex != 0x40) *p++ = rex;
+               *p++ = 0x83; 
+               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
+               *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
+            } else {
+               rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst) );
+               if (rex != 0x40) *p++ = rex;
+               *p++ = 0x81; 
+               p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
+               p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
+            }
+            goto done;
+         case Armi_Reg:
+            rex  = clearWBit( 
+                   rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
+                               i->Ain.Alu32R.dst) );
+            if (rex != 0x40) *p++ = rex;
+            *p++ = toUChar(opc_rr);
+            p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
+                             i->Ain.Alu32R.dst);
+            goto done;
+         case Armi_Mem:
+            rex  = clearWBit(
+                   rexAMode_M( i->Ain.Alu32R.dst,
+                               i->Ain.Alu32R.src->Armi.Mem.am) );
+            if (rex != 0x40) *p++ = rex;
+            *p++ = toUChar(opc);
+            p = doAMode_M(p, i->Ain.Alu32R.dst,
+                             i->Ain.Alu32R.src->Armi.Mem.am);
+            goto done;
+         default: 
+            goto bad;
+      }
+      break;
+
    case Ain_MulL:
       subopc = i->Ain.MulL.syned ? 5 : 4;
       switch (i->Ain.MulL.src->tag)  {
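
The size win from Ain_Alu32R shows up directly in the emitted bytes:
clearWBit drops REX.W, and a REX byte of plain 0x40 is omitted entirely,
so the common register-register forms shrink by one byte.  Hand-assembled
for illustration (not output from the commit):

    addl %esi,%edi     01 F7       (2 bytes: no REX needed)
    addq %rsi,%rdi     48 01 F7    (3 bytes: REX.W=1)
    addl %r8d,%edi     44 01 C7    (REX.R survives clearWBit for r8-r15)
    cmpl $1,%edi       83 FF 01    (8-bit-immediate form, opcode 0x83 /7)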
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index eecb24bbf5a434a9d1cdae815085e611fc773edc..6f74cd0f8513890b1ec35d6b906a11aca2f50cbb 100644
@@ -189,7 +189,8 @@ extern AMD64RMI* AMD64RMI_Imm ( UInt );
 extern AMD64RMI* AMD64RMI_Reg ( HReg );
 extern AMD64RMI* AMD64RMI_Mem ( AMD64AMode* );
 
-extern void ppAMD64RMI ( AMD64RMI* );
+extern void ppAMD64RMI      ( AMD64RMI* );
+extern void ppAMD64RMI_lo32 ( AMD64RMI* );
 
 
 /* --------- Operand, which can be reg or immediate only. --------- */
@@ -359,6 +360,7 @@ typedef
       Ain_Test64,      /* 64-bit test (AND, set flags, discard result) */
       Ain_Unary64,     /* 64-bit not and neg */
       Ain_Lea64,       /* 64-bit compute EA into a reg */
+      Ain_Alu32R,      /* 32-bit add/sub/and/or/xor/cmp, dst=REG (a la Alu64R) */
       Ain_MulL,        /* widening multiply */
       Ain_Div,         /* div and mod */
 //..       Xin_Sh3232,    /* shldl or shrdl */
@@ -449,6 +451,12 @@ typedef
             AMD64AMode* am;
             HReg        dst;
          } Lea64;
+         /* 32-bit add/sub/and/or/xor/cmp, dst=REG (a la Alu64R) */
+         struct {
+            AMD64AluOp op;
+            AMD64RMI*  src;
+            HReg       dst;
+         } Alu32R;
          /* 64 x 64 -> 128 bit widening multiply: RDX:RAX = RAX *s/u
             r/m64 */
          struct {
@@ -676,6 +684,7 @@ extern AMD64Instr* AMD64Instr_Alu64R     ( AMD64AluOp, AMD64RMI*, HReg );
 extern AMD64Instr* AMD64Instr_Alu64M     ( AMD64AluOp, AMD64RI*,  AMD64AMode* );
 extern AMD64Instr* AMD64Instr_Unary64    ( AMD64UnaryOp op, HReg dst );
 extern AMD64Instr* AMD64Instr_Lea64      ( AMD64AMode* am, HReg dst );
+extern AMD64Instr* AMD64Instr_Alu32R     ( AMD64AluOp, AMD64RMI*, HReg );
 extern AMD64Instr* AMD64Instr_Sh64       ( AMD64ShiftOp, UInt, HReg );
 extern AMD64Instr* AMD64Instr_Test64     ( UInt imm32, HReg dst );
 extern AMD64Instr* AMD64Instr_MulL       ( Bool syned, AMD64RM* );
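
Typical use of the new constructor, mirroring what the instruction
selector below does for 32Uto64(Add32(x,y)) (a fragment only: env, argL,
argR and the isel helpers come from host_amd64_isel.c):

    HReg dst      = newVRegI(env);
    HReg reg      = iselIntExpr_R(env, argL);
    AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
    addInstr(env, mk_iMOVsd_RR(reg, dst));                /* movq reg,dst */
    addInstr(env, AMD64Instr_Alu32R(Aalu_ADD, rmi, dst)); /* addl rmi,dst */

Note the constructor vasserts that op is one of ADD/SUB/CMP/AND/OR/XOR;
Aalu_MOV and the shift ops are rejected.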
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 1198cec506c137251722c5c67fdc1d0787c758f8..7e6cfe3eaa4155e0528f56dbfd9e552ee5985c54 100644
@@ -1173,19 +1173,11 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
       /* Handle misc other ops. */
 
       if (e->Iex.Binop.op == Iop_Max32U) {
-         /* This generates a truly rotten piece of code.  Just as well
-            it doesn't happen very often. */
-         HReg src1  = iselIntExpr_R(env, e->Iex.Binop.arg1);
-         HReg src1L = newVRegI(env);
-         HReg src2  = iselIntExpr_R(env, e->Iex.Binop.arg2);
-         HReg src2L = newVRegI(env);
-         HReg dst   = newVRegI(env);
-         addInstr(env, mk_iMOVsd_RR(src1,dst));
-         addInstr(env, mk_iMOVsd_RR(src1,src1L));
-         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, src1L));
-         addInstr(env, mk_iMOVsd_RR(src2,src2L));
-         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, src2L));
-         addInstr(env, AMD64Instr_Alu64R(Aalu_CMP, AMD64RMI_Reg(src2L), src1L));
+         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+         HReg dst  = newVRegI(env);
+         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
+         addInstr(env, mk_iMOVsd_RR(src1, dst));
+         addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
          addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
          return dst;
       }
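
Roughly (modulo register allocation), Max32U now selects as:

    before:  movq src1,dst  ; movq src1,src1L ; shlq $32,src1L
             movq src2,src2L; shlq $32,src2L
             cmpq src2L,src1L ; cmovb src2,dst          (7 insns, 3 new vregs)
    after:   movq src1,dst ; cmpl src2,dst ; cmovb src2,dst   (3 insns, 1)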
@@ -1422,6 +1414,36 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          }
       }
 
+      /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
+         Use 32 bit arithmetic and let the default zero-extend rule
+         do the 32Uto64 for free. */
+      if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
+         IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
+         IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
+         IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
+         AMD64AluOp aluOp = Aalu_INVALID;
+         switch (opi) {
+            case Iop_Add32: aluOp = Aalu_ADD; break;
+            case Iop_Sub32: aluOp = Aalu_SUB; break;
+            case Iop_And32: aluOp = Aalu_AND; break;
+            case Iop_Or32:  aluOp = Aalu_OR;  break;
+            case Iop_Xor32: aluOp = Aalu_XOR; break;
+            default: break;
+         }
+         if (aluOp != Aalu_INVALID) {
+            /* For commutative ops we assume any literal values are on
+               the second operand. */
+            HReg dst      = newVRegI(env);
+            HReg reg      = iselIntExpr_R(env, argL);
+            AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
+            addInstr(env, mk_iMOVsd_RR(reg,dst));
+            addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
+            return dst;
+         }
+         /* just fall through to normal handling for Iop_32Uto64 */
+      }
+
+      /* Fallback cases */
       switch (e->Iex.Unop.op) {
          case Iop_32Uto64:
          case Iop_32Sto64: {
@@ -2176,10 +2198,8 @@ static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
    if (e->tag == Iex_Unop 
        && e->Iex.Unop.op == Iop_CmpNEZ32) {
       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
-      HReg      tmp  = newVRegI(env);
       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
-      addInstr(env, AMD64Instr_MovxLQ(False, r1, tmp));
-      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,tmp));
+      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
       return Acc_NZ;
    }
 
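
Roughly, the CmpNEZ32 case loses both the fresh vreg and the widening
move:

    before:  movl %r1d,%tmpd ; cmpq $0,%tmp
    after:   cmpl $0,%r1d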
@@ -2249,25 +2269,6 @@ static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
       }
    }
 
-   /* CmpEQ32 / CmpNE32 */
-   if (e->tag == Iex_Binop 
-       && (e->Iex.Binop.op == Iop_CmpEQ32
-           || e->Iex.Binop.op == Iop_CmpNE32
-           || e->Iex.Binop.op == Iop_CasCmpEQ32
-           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
-      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
-      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
-      HReg      r    = newVRegI(env);
-      addInstr(env, mk_iMOVsd_RR(r1,r));
-      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
-      addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, r));
-      switch (e->Iex.Binop.op) {
-         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
-         case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
-         default: vpanic("iselCondCode(amd64): CmpXX32");
-      }
-   }
-
    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
       Saves a "movq %rax, %tmp" compared to the default route. */
    if (e->tag == Iex_Binop 
@@ -2312,6 +2313,30 @@ static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
       }
    }
 
+   /* Cmp*32*(x,y) */
+   if (e->tag == Iex_Binop 
+       && (e->Iex.Binop.op == Iop_CmpEQ32
+           || e->Iex.Binop.op == Iop_CmpNE32
+           || e->Iex.Binop.op == Iop_CmpLT32S
+           || e->Iex.Binop.op == Iop_CmpLT32U
+           || e->Iex.Binop.op == Iop_CmpLE32S
+           || e->Iex.Binop.op == Iop_CmpLE32U
+           || e->Iex.Binop.op == Iop_CasCmpEQ32
+           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
+      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
+      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+      addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
+      switch (e->Iex.Binop.op) {
+         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
+         case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
+         case Iop_CmpLT32S: return Acc_L;
+         case Iop_CmpLT32U: return Acc_B;
+         case Iop_CmpLE32S: return Acc_LE;
+         case Iop_CmpLE32U: return Acc_BE;
+         default: vpanic("iselCondCode(amd64): CmpXX32");
+      }
+   }
+
    ppIRExpr(e);
    vpanic("iselCondCode(amd64)");
 }
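
Net effect on the 32-bit condition-code cases: CmpEQ32/CmpNE32, which
previously went through xor-then-shift-left-32, now select to a single
32-bit compare, and the LT/LE variants are handled here directly instead
of falling back to the widened 64-bit path.  Roughly, for CmpEQ32:

    before:  movq r1,r ; xorq rmi2,r ; shlq $32,r   --> test Acc_Z
    after:   cmpl rmi2,r1                           --> test Acc_Z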