From: Julian Seward
Date: Mon, 11 Jul 2011 11:43:38 +0000 (+0000)
Subject: Complete the implementation of ARM atomic ops: {LD,ST}REX{,B,H,D} in
X-Git-Tag: svn/VALGRIND_3_7_0^2~52
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=775a45280c4366d5571e5bbd72da5dda9877050b;p=thirdparty%2Fvalgrind.git

Complete the implementation of ARM atomic ops: {LD,ST}REX{,B,H,D} in
both ARM and Thumb encodings, for NEON and non-NEON capable backends.
Bug 266035 comments 4, 43, 51.  Derived from patches by Jeff Brown,
Igor Saenko and Dr. David Alan Gilbert.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2172
---

diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index bc05bc8c01..5fdd1dd679 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -13370,52 +13370,107 @@ DisResult disInstr_ARM_WRK (
    /* -- ARMv6 instructions                                    -- */
    /* ----------------------------------------------------------- */
 
-   /* --------------------- ldrex, strex --------------------- */
-
-   // LDREX
-   if (0x01900F9F == (insn & 0x0FF00FFF)) {
-      UInt rT = INSN(15,12);
-      UInt rN = INSN(19,16);
-      if (rT == 15 || rN == 15) {
-         /* undecodable; fall through */
+   /* ------------------- {ldr,str}ex{,b,h,d} ------------------- */
+
+   // LDREXD, LDREX, LDREXH, LDREXB
+   if (0x01900F9F == (insn & 0x0F900FFF)) {
+      UInt   rT    = INSN(15,12);
+      UInt   rN    = INSN(19,16);
+      IRType ty    = Ity_INVALID;
+      IROp   widen = Iop_INVALID;
+      HChar* nm    = NULL;
+      Bool   valid = True;
+      switch (INSN(22,21)) {
+         case 0: nm = "";  ty = Ity_I32; break;
+         case 1: nm = "d"; ty = Ity_I64; break;
+         case 2: nm = "b"; ty = Ity_I8;  widen = Iop_8Uto32; break;
+         case 3: nm = "h"; ty = Ity_I16; widen = Iop_16Uto32; break;
+         default: vassert(0);
+      }
+      if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
+         if (rT == 15 || rN == 15)
+            valid = False;
       } else {
+         vassert(ty == Ity_I64);
+         if ((rT & 1) == 1 || rT == 14 || rN == 15)
+            valid = False;
+      }
+      if (valid) {
          IRTemp res;
          /* make unconditional */
          if (condT != IRTemp_INVALID) {
-            mk_skip_over_A32_if_cond_is_false( condT );
-            condT = IRTemp_INVALID;
+           mk_skip_over_A32_if_cond_is_false( condT );
+           condT = IRTemp_INVALID;
          }
          /* Ok, now we're unconditional.  Do the load. */
-         res = newTemp(Ity_I32);
+         res = newTemp(ty);
+         // FIXME: assumes little-endian guest
         stmt( IRStmt_LLSC(Iend_LE, res, getIRegA(rN),
                           NULL/*this is a load*/) );
-         putIRegA(rT, mkexpr(res), IRTemp_INVALID, Ijk_Boring);
-         DIP("ldrex%s r%u, [r%u]\n", nCC(INSN_COND), rT, rN);
+         if (ty == Ity_I64) {
+            // FIXME: assumes little-endian guest
+            putIRegA(rT+0, unop(Iop_64to32, mkexpr(res)),
+                           IRTemp_INVALID, Ijk_Boring);
+            putIRegA(rT+1, unop(Iop_64HIto32, mkexpr(res)),
+                           IRTemp_INVALID, Ijk_Boring);
+            DIP("ldrex%s%s r%u, r%u, [r%u]\n",
+                nm, nCC(INSN_COND), rT+0, rT+1, rN);
+         } else {
+            putIRegA(rT, widen == Iop_INVALID
+                            ? mkexpr(res) : unop(widen, mkexpr(res)),
+                         IRTemp_INVALID, Ijk_Boring);
+            DIP("ldrex%s%s r%u, [r%u]\n", nm, nCC(INSN_COND), rT, rN);
+         }
          goto decode_success;
       }
-      /* fall through */
+      /* undecodable; fall through */
    }
 
-   // STREX
-   if (0x01800F90 == (insn & 0x0FF00FF0)) {
-      UInt rT = INSN(3,0);
-      UInt rN = INSN(19,16);
-      UInt rD = INSN(15,12);
-      if (rT == 15 || rN == 15 || rD == 15
-          || rD == rT || rD == rN) {
-         /* undecodable; fall through */
+   // STREXD, STREX, STREXH, STREXB
+   if (0x01800F90 == (insn & 0x0F900FF0)) {
+      UInt   rT     = INSN(3,0);
+      UInt   rN     = INSN(19,16);
+      UInt   rD     = INSN(15,12);
+      IRType ty     = Ity_INVALID;
+      IROp   narrow = Iop_INVALID;
+      HChar* nm     = NULL;
+      Bool   valid  = True;
+      switch (INSN(22,21)) {
+         case 0: nm = "";  ty = Ity_I32; break;
+         case 1: nm = "d"; ty = Ity_I64; break;
+         case 2: nm = "b"; ty = Ity_I8;  narrow = Iop_32to8; break;
+         case 3: nm = "h"; ty = Ity_I16; narrow = Iop_32to16; break;
+         default: vassert(0);
+      }
+      if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
+         if (rD == 15 || rN == 15 || rT == 15
+             || rD == rN || rD == rT)
+            valid = False;
       } else {
-         IRTemp resSC1, resSC32;
-
+         vassert(ty == Ity_I64);
+         if (rD == 15 || (rT & 1) == 1 || rT == 14 || rN == 15
+             || rD == rN || rD == rT || rD == rT+1)
+            valid = False;
+      }
+      if (valid) {
+         IRTemp resSC1, resSC32, data;
          /* make unconditional */
         if (condT != IRTemp_INVALID) {
            mk_skip_over_A32_if_cond_is_false( condT );
            condT = IRTemp_INVALID;
         }
-         /* Ok, now we're unconditional.  Do the store. */
+         data = newTemp(ty);
+         assign(data,
+                ty == Ity_I64
+                   // FIXME: assumes little-endian guest
+                   ? binop(Iop_32HLto64, getIRegA(rT+1), getIRegA(rT+0))
+                   : narrow == Iop_INVALID
+                      ? getIRegA(rT)
+                      : unop(narrow, getIRegA(rT)));
          resSC1 = newTemp(Ity_I1);
-         stmt( IRStmt_LLSC(Iend_LE, resSC1, getIRegA(rN), getIRegA(rT)) );
+         // FIXME: assumes little-endian guest
+         stmt( IRStmt_LLSC(Iend_LE, resSC1, getIRegA(rN), mkexpr(data)) );
 
         /* Set rD to 1 on failure, 0 on success.  Currently we have
            resSC1 == 0 on failure, 1 on success. */
@@ -13425,7 +13480,13 @@ DisResult disInstr_ARM_WRK (
         putIRegA(rD, mkexpr(resSC32),
                      IRTemp_INVALID, Ijk_Boring);
 
-         DIP("strex%s r%u, r%u, [r%u]\n", nCC(INSN_COND), rD, rT, rN);
+         if (ty == Ity_I64) {
+            DIP("strex%s%s r%u, r%u, r%u, [r%u]\n",
+                nm, nCC(INSN_COND), rD, rT, rT+1, rN);
+         } else {
+            DIP("strex%s%s r%u, r%u, [r%u]\n",
+                nm, nCC(INSN_COND), rD, rT, rN);
+         }
         goto decode_success;
      }
      /* fall through */
@@ -17771,6 +17832,49 @@ DisResult disInstr_THUMB_WRK (
       }
    }
 
+   /* --------------- (T1) LDREX{B,H} --------------- */
+   if (INSN0(15,4) == 0xE8D
+       && (INSN1(11,0) == 0xF4F || INSN1(11,0) == 0xF5F)) {
+      UInt rN  = INSN0(3,0);
+      UInt rT  = INSN1(15,12);
+      Bool isH = INSN1(11,0) == 0xF5F;
+      if (!isBadRegT(rT) && rN != 15) {
+         IRTemp res;
+         // go uncond
+         mk_skip_over_T32_if_cond_is_false( condT );
+         // now uncond
+         res = newTemp(isH ? Ity_I16 : Ity_I8);
+         stmt( IRStmt_LLSC(Iend_LE, res, getIRegT(rN),
+                           NULL/*this is a load*/ ));
+         putIRegT(rT, unop(isH ? Iop_16Uto32 : Iop_8Uto32, mkexpr(res)),
+                      IRTemp_INVALID);
+         DIP("ldrex%c r%u, [r%u]\n", isH ? 'h' : 'b', rT, rN);
+         goto decode_success;
+      }
+   }
+
+   /* --------------- (T1) LDREXD --------------- */
+   if (INSN0(15,4) == 0xE8D && INSN1(7,0) == 0x7F) {
+      UInt rN  = INSN0(3,0);
+      UInt rT  = INSN1(15,12);
+      UInt rT2 = INSN1(11,8);
+      if (!isBadRegT(rT) && !isBadRegT(rT2) && rT != rT2 && rN != 15) {
+         IRTemp res;
+         // go uncond
+         mk_skip_over_T32_if_cond_is_false( condT );
+         // now uncond
+         res = newTemp(Ity_I64);
+         // FIXME: assumes little-endian guest
+         stmt( IRStmt_LLSC(Iend_LE, res, getIRegT(rN),
+                           NULL/*this is a load*/ ));
+         // FIXME: assumes little-endian guest
+         putIRegT(rT,  unop(Iop_64to32,   mkexpr(res)), IRTemp_INVALID);
+         putIRegT(rT2, unop(Iop_64HIto32, mkexpr(res)), IRTemp_INVALID);
+         DIP("ldrexd r%u, r%u, [r%u]\n", rT, rT2, rN);
+         goto decode_success;
+      }
+   }
+
    /* ----------------- (T1) STREX ----------------- */
    if (INSN0(15,4) == 0xE84) {
       UInt rN   = INSN0(3,0);
@@ -17780,30 +17884,84 @@ DisResult disInstr_THUMB_WRK (
       if (!isBadRegT(rD) && !isBadRegT(rT) && rN != 15
           && rD != rN && rD != rT) {
          IRTemp resSC1, resSC32;
-
         // go uncond
         mk_skip_over_T32_if_cond_is_false( condT );
         // now uncond
-
         /* Ok, now we're unconditional.  Do the store. */
         resSC1 = newTemp(Ity_I1);
         stmt( IRStmt_LLSC(Iend_LE,
                           resSC1,
                           binop(Iop_Add32, getIRegT(rN), mkU32(imm8 * 4)),
                           getIRegT(rT)) );
-
         /* Set rD to 1 on failure, 0 on success.  Currently we have
            resSC1 == 0 on failure, 1 on success. */
         resSC32 = newTemp(Ity_I32);
         assign(resSC32,
                unop(Iop_1Uto32, unop(Iop_Not1, mkexpr(resSC1))));
-
         putIRegT(rD, mkexpr(resSC32), IRTemp_INVALID);
         DIP("strex r%u, r%u, [r%u, #+%u]\n", rD, rT, rN, imm8 * 4);
         goto decode_success;
      }
   }
 
+   /* --------------- (T1) STREX{B,H} --------------- */
+   if (INSN0(15,4) == 0xE8C
+       && (INSN1(11,4) == 0xF4 || INSN1(11,4) == 0xF5)) {
+      UInt rN  = INSN0(3,0);
+      UInt rT  = INSN1(15,12);
+      UInt rD  = INSN1(3,0);
+      Bool isH = INSN1(11,4) == 0xF5;
+      if (!isBadRegT(rD) && !isBadRegT(rT) && rN != 15
+          && rD != rN && rD != rT) {
+         IRTemp resSC1, resSC32;
+         // go uncond
+         mk_skip_over_T32_if_cond_is_false( condT );
+         // now uncond
+         /* Ok, now we're unconditional.  Do the store. */
+         resSC1 = newTemp(Ity_I1);
+         stmt( IRStmt_LLSC(Iend_LE, resSC1, getIRegT(rN),
+                           unop(isH ? Iop_32to16 : Iop_32to8,
+                                getIRegT(rT))) );
+         /* Set rD to 1 on failure, 0 on success.  Currently we have
+            resSC1 == 0 on failure, 1 on success. */
+         resSC32 = newTemp(Ity_I32);
+         assign(resSC32,
+                unop(Iop_1Uto32, unop(Iop_Not1, mkexpr(resSC1))));
+         putIRegT(rD, mkexpr(resSC32), IRTemp_INVALID);
+         DIP("strex%c r%u, r%u, [r%u]\n", isH ? 'h' : 'b', rD, rT, rN);
+         goto decode_success;
+      }
+   }
+
+   /* ---------------- (T1) STREXD ---------------- */
+   if (INSN0(15,4) == 0xE8C && INSN1(7,4) == BITS4(0,1,1,1)) {
+      UInt rN  = INSN0(3,0);
+      UInt rT  = INSN1(15,12);
+      UInt rT2 = INSN1(11,8);
+      UInt rD  = INSN1(3,0);
+      if (!isBadRegT(rD) && !isBadRegT(rT) && !isBadRegT(rT2)
+          && rN != 15 && rD != rN && rD != rT && rD != rT) {
+         IRTemp resSC1, resSC32, data;
+         // go uncond
+         mk_skip_over_T32_if_cond_is_false( condT );
+         // now uncond
+         /* Ok, now we're unconditional.  Do the store. */
+         resSC1 = newTemp(Ity_I1);
+         data = newTemp(Ity_I64);
+         // FIXME: assumes little-endian guest
+         assign(data, binop(Iop_32HLto64, getIRegT(rT2), getIRegT(rT)));
+         // FIXME: assumes little-endian guest
+         stmt( IRStmt_LLSC(Iend_LE, resSC1, getIRegT(rN), mkexpr(data)));
+         /* Set rD to 1 on failure, 0 on success.  Currently we have
+            resSC1 == 0 on failure, 1 on success. */
+         resSC32 = newTemp(Ity_I32);
+         assign(resSC32,
+                unop(Iop_1Uto32, unop(Iop_Not1, mkexpr(resSC1))));
+         putIRegT(rD, mkexpr(resSC32), IRTemp_INVALID);
+         DIP("strexd r%u, r%u, r%u, [r%u]\n", rD, rT, rT2, rN);
+         goto decode_success;
+      }
+   }
 
    /* -------------- v7 barrier insns -------------- */
    if (INSN0(15,0) == 0xF3BF && (INSN1(15,0) & 0xFF00) == 0x8F00) {
       /* XXX this isn't really right, is it?  The generated IR does
diff --git a/VEX/priv/host_arm_defs.c b/VEX/priv/host_arm_defs.c
index fd3719756b..fb8df5c38c 100644
--- a/VEX/priv/host_arm_defs.c
+++ b/VEX/priv/host_arm_defs.c
@@ -1206,14 +1206,14 @@ ARMInstr* ARMInstr_LdrEX ( Int szB ) {
    ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
    i->tag                 = ARMin_LdrEX;
    i->ARMin.LdrEX.szB     = szB;
-   vassert(szB == 4 || szB == 1);
+   vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
    return i;
 }
 ARMInstr* ARMInstr_StrEX ( Int szB ) {
    ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
    i->tag                 = ARMin_StrEX;
    i->ARMin.StrEX.szB     = szB;
-   vassert(szB == 4 || szB == 1);
+   vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
    return i;
 }
 ARMInstr* ARMInstr_VLdStD ( Bool isLoad, HReg dD, ARMAModeV* am ) {
@@ -1603,16 +1603,28 @@ void ppARMInstr ( ARMInstr* i ) {
            vex_printf("r1:r0, r2, r3");
         }
         return;
-      case ARMin_LdrEX:
-         vex_printf("ldrex%s ", i->ARMin.LdrEX.szB == 1 ? "b"
-                                : i->ARMin.LdrEX.szB == 2 ? "h" : "");
-         vex_printf("r0, [r1]");
+      case ARMin_LdrEX: {
+         HChar* sz = "";
+         switch (i->ARMin.LdrEX.szB) {
+            case 1: sz = "b"; break; case 2: sz = "h"; break;
+            case 8: sz = "d"; break; case 4: break;
+            default: vassert(0);
+         }
+         vex_printf("ldrex%s %sr2, [r4]",
+                    sz, i->ARMin.LdrEX.szB == 8 ? "r3:" : "");
         return;
-      case ARMin_StrEX:
-         vex_printf("strex%s ", i->ARMin.StrEX.szB == 1 ? "b"
-                                : i->ARMin.StrEX.szB == 2 ? "h" : "");
-         vex_printf("r0, r1, [r2]");
+      }
+      case ARMin_StrEX: {
+         HChar* sz = "";
+         switch (i->ARMin.StrEX.szB) {
+            case 1: sz = "b"; break; case 2: sz = "h"; break;
+            case 8: sz = "d"; break; case 4: break;
+            default: vassert(0);
+         }
+         vex_printf("strex%s r0, %sr2, [r4]",
+                    sz, i->ARMin.StrEX.szB == 8 ? "r3:" : "");
         return;
+      }
      case ARMin_VLdStD:
         if (i->ARMin.VLdStD.isLoad) {
            vex_printf("fldd ");
@@ -1989,13 +2001,17 @@ void getRegUsage_ARMInstr ( HRegUsage* u, ARMInstr* i, Bool mode64 )
         addHRegUse(u, HRmWrite, hregARM_R1());
         return;
      case ARMin_LdrEX:
-         addHRegUse(u, HRmWrite, hregARM_R0());
-         addHRegUse(u, HRmRead, hregARM_R1());
+         addHRegUse(u, HRmRead, hregARM_R4());
+         addHRegUse(u, HRmWrite, hregARM_R2());
+         if (i->ARMin.LdrEX.szB == 8)
+            addHRegUse(u, HRmWrite, hregARM_R3());
         return;
      case ARMin_StrEX:
+         addHRegUse(u, HRmRead, hregARM_R4());
         addHRegUse(u, HRmWrite, hregARM_R0());
-         addHRegUse(u, HRmRead, hregARM_R1());
         addHRegUse(u, HRmRead, hregARM_R2());
+         if (i->ARMin.StrEX.szB == 8)
+            addHRegUse(u, HRmRead, hregARM_R3());
         return;
      case ARMin_VLdStD:
         addRegUsage_ARMAModeV(u, i->ARMin.VLdStD.amode);
@@ -2959,27 +2975,31 @@ Int emit_ARMInstr ( UChar* buf, Int nbuf, ARMInstr* i,
         goto bad;
      }
      case ARMin_LdrEX: {
-         /* E1910F9F   ldrex    r0, [r1]
-            E1F10F9F   ldrexh   r0, [r1]
-            E1D10F9F   ldrexb   r0, [r1]
+         /* E1D42F9F   ldrexb   r2, [r4]
+            E1F42F9F   ldrexh   r2, [r4]
+            E1942F9F   ldrex    r2, [r4]
+            E1B42F9F   ldrexd   r2, r3, [r4]
         */
         switch (i->ARMin.LdrEX.szB) {
-            case 4: *p++ = 0xE1910F9F; goto done;
-            //case 2: *p++ = 0xE1F10F9F; goto done;
-            case 1: *p++ = 0xE1D10F9F; goto done;
+            case 1: *p++ = 0xE1D42F9F; goto done;
+            case 2: *p++ = 0xE1F42F9F; goto done;
+            case 4: *p++ = 0xE1942F9F; goto done;
+            case 8: *p++ = 0xE1B42F9F; goto done;
            default: break;
         }
         goto bad;
      }
      case ARMin_StrEX: {
-         /* E1820F91   strex    r0, r1, [r2]
-            E1E20F91   strexh   r0, r1, [r2]
-            E1C20F91   strexb   r0, r1, [r2]
+         /* E1C40F92   strexb   r0, r2, [r4]
+            E1E40F92   strexh   r0, r2, [r4]
+            E1840F92   strex    r0, r2, [r4]
+            E1A40F92   strexd   r0, r2, r3, [r4]
         */
         switch (i->ARMin.StrEX.szB) {
-            case 4: *p++ = 0xE1820F91; goto done;
-            //case 2: *p++ = 0xE1E20F91; goto done;
-            case 1: *p++ = 0xE1C20F91; goto done;
+            case 1: *p++ = 0xE1C40F92; goto done;
+            case 2: *p++ = 0xE1E40F92; goto done;
+            case 4: *p++ = 0xE1840F92; goto done;
+            case 8: *p++ = 0xE1A40F92; goto done;
            default: break;
         }
         goto bad;
diff --git a/VEX/priv/host_arm_defs.h b/VEX/priv/host_arm_defs.h
index b96ec3ab23..92bdbe06f9 100644
--- a/VEX/priv/host_arm_defs.h
+++ b/VEX/priv/host_arm_defs.h
@@ -709,18 +709,21 @@ typedef
         struct {
            ARMMulOp op;
         } Mul;
-         /* LDREX{,H,B} r0, [r1]
+         /* LDREX{,H,B} r2, [r4]  and
+            LDREXD r2, r3, [r4]   (on LE hosts, transferred value is r3:r2)
            Again, hardwired registers since this is not performance critical,
            and there are possibly constraints on the registers that
            we can't express in the register allocator.*/
         struct {
-            Int  szB; /* currently only 4 is allowed */
+            Int  szB; /* 1, 2, 4 or 8 */
         } LdrEX;
-         /* STREX{,H,B} r0, r1, [r2]
-            r0 = SC( [r2] = r1 )
+         /* STREX{,H,B} r0, r2, [r4]  and
+            STREXD r0, r2, r3, [r4]   (on LE hosts, transferred value is r3:r2)
+            r0 = SC( [r4] = r2 )      (8, 16, 32 bit transfers)
+            r0 = SC( [r4] = r3:r2)    (64 bit transfers)
            Ditto comment re fixed registers. */
         struct {
-            Int  szB; /* currently only 4 is allowed */
+            Int  szB; /* 1, 2, 4 or 8 */
         } StrEX;
         /* VFP INSTRUCTIONS */
         /* 64-bit Fp load/store */
diff --git a/VEX/priv/host_arm_isel.c b/VEX/priv/host_arm_isel.c
index d4d9c86ff5..35c4c65b6b 100644
--- a/VEX/priv/host_arm_isel.c
+++ b/VEX/priv/host_arm_isel.c
@@ -211,8 +211,8 @@ static ARMAMode2*  iselIntExpr_AMode2     ( ISelEnv* env, IRExpr* e );
 static ARMAModeV*  iselIntExpr_AModeV_wrk ( ISelEnv* env, IRExpr* e );
 static ARMAModeV*  iselIntExpr_AModeV     ( ISelEnv* env, IRExpr* e );
 
-static ARMAModeN* iselIntExpr_AModeN_wrk ( ISelEnv* env, IRExpr* e );
-static ARMAModeN* iselIntExpr_AModeN     ( ISelEnv* env, IRExpr* e );
+static ARMAModeN*  iselIntExpr_AModeN_wrk ( ISelEnv* env, IRExpr* e );
+static ARMAModeN*  iselIntExpr_AModeN     ( ISelEnv* env, IRExpr* e );
 
 static ARMRI84*    iselIntExpr_RI84_wrk
        ( /*OUT*/Bool* didInv, Bool mayInv, ISelEnv* env, IRExpr* e );
@@ -5820,50 +5820,86 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
         /* LL */
         IRTemp res = stmt->Ist.LLSC.result;
         IRType ty  = typeOfIRTemp(env->type_env, res);
-         if (ty == Ity_I32 || ty == Ity_I8) {
+         if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
            Int  szB   = 0;
            HReg r_dst = lookupIRTemp(env, res);
            HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
            switch (ty) {
               case Ity_I8:  szB = 1; break;
+               case Ity_I16: szB = 2; break;
               case Ity_I32: szB = 4; break;
               default:      vassert(0);
            }
-            addInstr(env, mk_iMOVds_RR(hregARM_R1(), raddr));
+            addInstr(env, mk_iMOVds_RR(hregARM_R4(), raddr));
            addInstr(env, ARMInstr_LdrEX(szB));
-            addInstr(env, mk_iMOVds_RR(r_dst, hregARM_R0()));
+            addInstr(env, mk_iMOVds_RR(r_dst, hregARM_R2()));
            return;
         }
-         /* else fall thru; is unhandled */
+         if (ty == Ity_I64) {
+            HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
+            addInstr(env, mk_iMOVds_RR(hregARM_R4(), raddr));
+            addInstr(env, ARMInstr_LdrEX(8));
+            /* Result is in r3:r2.  On a non-NEON capable CPU, we must
+               move it into a result register pair.  On a NEON capable
+               CPU, the result register will be a 64 bit NEON
+               register, so we must move it there instead. */
+            if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+               HReg dst = lookupIRTemp(env, res);
+               addInstr(env, ARMInstr_VXferD(True, dst, hregARM_R3(),
+                                                        hregARM_R2()));
+            } else {
+               HReg r_dst_hi, r_dst_lo;
+               lookupIRTemp64(&r_dst_hi, &r_dst_lo, env, res);
+               addInstr(env, mk_iMOVds_RR(r_dst_lo, hregARM_R2()));
+               addInstr(env, mk_iMOVds_RR(r_dst_hi, hregARM_R3()));
+            }
+            return;
+         }
+         /*NOTREACHED*/
+         vassert(0);
      } else {
         /* SC */
-         IRTemp res = stmt->Ist.LLSC.result;
-         IRType ty  = typeOfIRTemp(env->type_env, res);
         IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata);
-         vassert(ty == Ity_I1);
-         if (tyd == Ity_I32 || tyd == Ity_I8) {
-            Int  szB = 0;
-            HReg r_res = lookupIRTemp(env, res);
-            HReg rD  = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
-            HReg rA  = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
-            ARMRI84* one = ARMRI84_I84(1,0);
+         if (tyd == Ity_I32 || tyd == Ity_I16 || tyd == Ity_I8) {
+            Int  szB = 0;
+            HReg rD  = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
+            HReg rA  = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
            switch (tyd) {
               case Ity_I8:  szB = 1; break;
+               case Ity_I16: szB = 2; break;
               case Ity_I32: szB = 4; break;
               default:      vassert(0);
            }
-            addInstr(env, mk_iMOVds_RR(hregARM_R1(), rD));
-            addInstr(env, mk_iMOVds_RR(hregARM_R2(), rA));
+            addInstr(env, mk_iMOVds_RR(hregARM_R2(), rD));
+            addInstr(env, mk_iMOVds_RR(hregARM_R4(), rA));
            addInstr(env, ARMInstr_StrEX(szB));
-            /* now r0 is 1 if failed, 0 if success.  Change to IR
-               conventions (0 is fail, 1 is success).  Also transfer
-               result to r_res. */
-            addInstr(env, ARMInstr_Alu(ARMalu_XOR, r_res, hregARM_R0(), one));
-            /* And be conservative -- mask off all but the lowest bit */
-            addInstr(env, ARMInstr_Alu(ARMalu_AND, r_res, r_res, one));
-            return;
-         }
-         /* else fall thru; is unhandled */
+         } else {
+            vassert(tyd == Ity_I64);
+            /* This is really ugly.  There is no is/is-not NEON
+               decision akin to the case for LL, because iselInt64Expr
+               fudges this for us, and always gets the result into two
+               GPRs even if this means moving it from a NEON
+               register. */
+            HReg rDhi, rDlo;
+            iselInt64Expr(&rDhi, &rDlo, env, stmt->Ist.LLSC.storedata);
+            HReg rA = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
+            addInstr(env, mk_iMOVds_RR(hregARM_R2(), rDlo));
+            addInstr(env, mk_iMOVds_RR(hregARM_R3(), rDhi));
+            addInstr(env, mk_iMOVds_RR(hregARM_R4(), rA));
+            addInstr(env, ARMInstr_StrEX(8));
+         }
+         /* now r0 is 1 if failed, 0 if success.  Change to IR
+            conventions (0 is fail, 1 is success).  Also transfer
+            result to r_res. */
+         IRTemp   res   = stmt->Ist.LLSC.result;
+         IRType   ty    = typeOfIRTemp(env->type_env, res);
+         HReg     r_res = lookupIRTemp(env, res);
+         ARMRI84* one   = ARMRI84_I84(1,0);
+         vassert(ty == Ity_I1);
+         addInstr(env, ARMInstr_Alu(ARMalu_XOR, r_res, hregARM_R0(), one));
+         /* And be conservative -- mask off all but the lowest bit */
+         addInstr(env, ARMInstr_Alu(ARMalu_AND, r_res, r_res, one));
+         return;
      }
      break;
   }
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index f5ac5fc94c..f44ac87367 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -3344,14 +3344,16 @@ void tcStmt ( IRSB* bb, IRStmt* stmt, IRType gWordTy )
         tyRes = typeOfIRTemp(tyenv, stmt->Ist.LLSC.result);
         if (stmt->Ist.LLSC.storedata == NULL) {
            /* it's a LL */
-            if (tyRes != Ity_I64 && tyRes != Ity_I32 && tyRes != Ity_I8)
+            if (tyRes != Ity_I64 && tyRes != Ity_I32
+                && tyRes != Ity_I16 && tyRes != Ity_I8)
               sanityCheckFail(bb,stmt,"Ist.LLSC(LL).result :: bogus");
         } else {
            /* it's a SC */
            if (tyRes != Ity_I1)
               sanityCheckFail(bb,stmt,"Ist.LLSC(SC).result: not :: Ity_I1");
            tyData = typeOfIRExpr(tyenv, stmt->Ist.LLSC.storedata);
-            if (tyData != Ity_I64 && tyData != Ity_I32 && tyData != Ity_I8)
+            if (tyData != Ity_I64 && tyData != Ity_I32
+                && tyData != Ity_I16 && tyData != Ity_I8)
               sanityCheckFail(bb,stmt,
                               "Ist.LLSC(SC).result :: storedata bogus");
         }