Merge in branches/DCAS.

author Julian Seward <jseward@acm.org>

Wed, 1 Jul 2009 08:10:49 +0000 (08:10 +0000)

committer Julian Seward <jseward@acm.org>

Wed, 1 Jul 2009 08:10:49 +0000 (08:10 +0000)
author Julian Seward <jseward@acm.org>
Wed, 1 Jul 2009 08:10:49 +0000 (08:10 +0000)
committer Julian Seward <jseward@acm.org>
Wed, 1 Jul 2009 08:10:49 +0000 (08:10 +0000)
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c

index 6c4e1e4712dbcd8138bcf69bc60119aad8442a92..6c3ab6d91f871807dc51e8e9e25198febf93eb92 100644 (file)
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -1032,6 +1032,27 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
              break;
           }
  
+         case Ist_CAS: {
+            /* We treat it as a read and a write of the location.  I
+               think that is the same behaviour as it was before IRCAS
+               was introduced, since prior to that point, the Vex
+               front ends would translate a lock-prefixed instruction
+               into a (normal) read followed by a (normal) write. */
+            Int    dataSize;
+            IRCAS* cas = st->Ist.CAS.details;
+            tl_assert(cas->addr != NULL);
+            tl_assert(cas->dataLo != NULL);
+            dataSize = sizeofIRType(typeOfIRExpr(tyenv, cas->dataLo));
+            if (cas->dataHi != NULL)
+               dataSize *= 2; /* since it's a doubleword-CAS */
+            /* I don't think this can ever happen, but play safe. */
+            if (dataSize > MIN_LINE_SIZE)
+               dataSize = MIN_LINE_SIZE;
+            addEvent_Dr( &cgs, curr_inode, dataSize, cas->addr );
+            addEvent_Dw( &cgs, curr_inode, dataSize, cas->addr );
+            break;
+         }
+
           case Ist_Exit: {
              /* Stuff to widen the guard expression to a host word, so
                 we can pass it to the branch predictor simulation
diff --git a/callgrind/main.c b/callgrind/main.c

index f2d1250371d8c06a1b34a7915d552b54725d5930..a2a62539b224f3edb25031c0647534d938b1b8a4 100644 (file)
--- a/callgrind/main.c
+++ b/callgrind/main.c
@@ -657,8 +657,14 @@ void CLG_(collectBlockInfo)(IRSB* sbIn,
  static
  void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
  {
+    /* JRS 2009june01: re IRTemp_INVALID, am assuming that this
+       function is used only to create instrumentation, and not to
+       copy/reconstruct IRStmt_Stores that were in the incoming IR
+       superblock.  If that is not a correct assumption, then things
+       will break badly on PowerPC, esp w/ threaded apps. */
      addStmtToIRSB( bbOut,
                    IRStmt_Store(CLGEndness,
+                                IRTemp_INVALID,
                                 IRExpr_Const(hWordTy == Ity_I32 ?
                                              IRConst_U32( addr ) :
                                              IRConst_U64( addr )),
@@ -841,6 +847,24 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
             break;
          }
  
+         case Ist_CAS: {
+            /* We treat it as a read and a write of the location.  I
+               think that is the same behaviour as it was before IRCAS
+               was introduced, since prior to that point, the Vex
+               front ends would translate a lock-prefixed instruction
+               into a (normal) read followed by a (normal) write. */
+            Int    dataSize;
+            IRCAS* cas = st->Ist.CAS.details;
+            CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
+            CLG_ASSERT(cas->dataLo);
+            dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
+            if (cas->dataHi != NULL)
+               dataSize *= 2; /* since this is a doubleword-cas */
+            addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
+            addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
+            break;
+         }
+ 
          case Ist_Exit: {
             UInt jmps_passed;
  
@@ -1101,7 +1125,8 @@ UInt syscalltime[VG_N_THREADS];
  #endif
  
  static
-void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno)
+void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
+                           UWord* args, UInt nArgs)
  {
    if (CLG_(clo).collect_systime) {
  #if CLG_MICROSYSTIME
@@ -1115,7 +1140,8 @@ void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno)
  }
  
  static
-void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno, SysRes res)
+void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
+                            UWord* args, UInt nArgs, SysRes res)
  {
    if (CLG_(clo).collect_systime &&
        CLG_(current_state).bbcc) {
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c

index 7e3980699ee35b49e18c30401e1d31913f7e4e1b..19b7a2f6e9b9cd60948ac5134dccf6a81eee89ab 100644 (file)
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -350,7 +350,7 @@ Bool VG_(machine_get_hwcaps)( void )
     LibVEX_default_VexArchInfo(&vai);
  
  #if defined(VGA_x86)
-   { Bool have_sse1, have_sse2;
+   { Bool have_sse1, have_sse2, have_cx8;
       UInt eax, ebx, ecx, edx;
  
       if (!VG_(has_cpuid)())
@@ -368,6 +368,13 @@ Bool VG_(machine_get_hwcaps)( void )
       have_sse1 = (edx & (1<<25)) != 0; /* True => have sse insns */
       have_sse2 = (edx & (1<<26)) != 0; /* True => have sse2 insns */
  
+     /* cmpxchg8b is a minimum requirement now; if we don't have it we
+        must simply give up.  But all CPUs since Pentium-I have it, so
+        that doesn't seem like much of a restriction. */
+     have_cx8 = (edx & (1<<8)) != 0; /* True => have cmpxchg8b */
+     if (!have_cx8)
+        return False;
+
       if (have_sse2 && have_sse1) {
          va          = VexArchX86;
          vai.hwcaps  = VEX_HWCAPS_X86_SSE1;
@@ -390,10 +397,40 @@ Bool VG_(machine_get_hwcaps)( void )
     }
  
  #elif defined(VGA_amd64)
-   vg_assert(VG_(has_cpuid)());
-   va         = VexArchAMD64;
-   vai.hwcaps = 0; /*baseline - SSE2 */
-   return True;
+   { Bool have_sse1, have_sse2, have_sse3, have_cx8, have_cx16;
+     UInt eax, ebx, ecx, edx;
+
+     if (!VG_(has_cpuid)())
+        /* we can't do cpuid at all.  Give up. */
+        return False;
+
+     VG_(cpuid)(0, &eax, &ebx, &ecx, &edx);
+     if (eax < 1)
+        /* we can't ask for cpuid(x) for x > 0.  Give up. */
+        return False;
+
+     /* get capabilities bits into edx */
+     VG_(cpuid)(1, &eax, &ebx, &ecx, &edx);
+
+     have_sse1 = (edx & (1<<25)) != 0; /* True => have sse insns */
+     have_sse2 = (edx & (1<<26)) != 0; /* True => have sse2 insns */
+     have_sse3 = (ecx & (1<<9)) != 0;  /* True => have sse3 insns */
+
+     /* cmpxchg8b is a minimum requirement now; if we don't have it we
+        must simply give up.  But all CPUs since Pentium-I have it, so
+        that doesn't seem like much of a restriction. */
+     have_cx8 = (edx & (1<<8)) != 0; /* True => have cmpxchg8b */
+     if (!have_cx8)
+        return False;
+
+     /* on amd64 we tolerate older cpus, which don't have cmpxchg16b */
+     have_cx16 = (ecx & (1<<13)) != 0; /* True => have cmpxchg16b */
+
+     va         = VexArchAMD64;
+     vai.hwcaps = (have_sse3 ? VEX_HWCAPS_AMD64_SSE3 : 0)
+                  | (have_cx16 ? VEX_HWCAPS_AMD64_CX16 : 0);
+     return True;
+   }
  
  #elif defined(VGA_ppc32)
     {
@@ -549,7 +586,6 @@ Bool VG_(machine_get_hwcaps)( void )
       VG_(sigaction)(VKI_SIGFPE, NULL, &saved_sigfpe_act);
       tmp_sigfpe_act = saved_sigfpe_act;
  
-
       /* NODEFER: signal handler does not return (from the kernel's point of
          view), hence if it is to successfully catch a signal more than once,
          we need the NODEFER flag. */
diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c

index d0d17e6c9d8f0058f5c311fe583c5215e2ef7f39..59f59686be83ff858f86cf0550c354c9f547bd7f 100644 (file)
--- a/coregrind/m_scheduler/scheduler.c
+++ b/coregrind/m_scheduler/scheduler.c
@@ -679,22 +679,6 @@ static UInt run_thread_for_a_while ( ThreadId tid )
     trc = 0;
     dispatch_ctr_SAVED = VG_(dispatch_ctr);
  
-#  if defined(VGA_ppc32) || defined(VGA_ppc64)
-   /* This is necessary due to the hacky way vex models reservations
-      on ppc.  It's really quite incorrect for each thread to have its
-      own reservation flag/address, since it's really something that
-      all threads share (that's the whole point).  But having shared
-      guest state is something we can't model with Vex.  However, as
-      per PaulM's 2.4.0ppc, the reservation is modelled using a
-      reservation flag which is cleared at each context switch.  So it
-      is indeed possible to get away with a per thread-reservation if
-      the thread's reservation is cleared before running it.
-   */
-   /* Clear any existing reservation that this thread might have made
-      last time it was running. */
-   VG_(threads)[tid].arch.vex.guest_RESVN = 0;
-#  endif   
-
  #  if defined(VGP_ppc32_aix5) || defined(VGP_ppc64_aix5)
     /* On AIX, we need to get a plausible value for SPRG3 for this
        thread, since it's used I think as a thread-state pointer.  It
@@ -1169,6 +1153,10 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid )
           VG_(synth_fault)(tid);
           break;
  
+      case VEX_TRC_JMP_SIGBUS:
+         VG_(synth_sigbus)(tid);
+         break;
+
        case VEX_TRC_JMP_NODECODE:
           VG_(message)(Vg_UserMsg,
              "valgrind: Unrecognised instruction at address %#lx.",
diff --git a/coregrind/m_signals.c b/coregrind/m_signals.c

index 43c9aecf98db2ee80ef20d1dcc6fc0f00879441c..c4a1a365072cb266b8fa1c80e625c7642c050007 100644 (file)
--- a/coregrind/m_signals.c
+++ b/coregrind/m_signals.c
@@ -1766,6 +1766,27 @@ void VG_(synth_sigill)(ThreadId tid, Addr addr)
     deliver_signal(tid, &info, NULL);
  }
  
+// Synthesise a SIGBUS.
+void VG_(synth_sigbus)(ThreadId tid)
+{
+   vki_siginfo_t info;
+
+   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
+
+   VG_(memset)(&info, 0, sizeof(info));
+   info.si_signo = VKI_SIGBUS;
+   /* There are several meanings to SIGBUS (as per POSIX, presumably),
+      but the most widely understood is "invalid address alignment",
+      so let's use that. */
+   info.si_code  = VKI_BUS_ADRALN;
+   /* If we knew the invalid address in question, we could put it
+      in .si_addr.  Oh well. */
+   /* info.VKI_SIGINFO_si_addr = (void*)addr; */
+
+   resume_scheduler(tid);
+   deliver_signal(tid, &info, NULL);
+}
+
  // Synthesise a SIGTRAP.
  void VG_(synth_sigtrap)(ThreadId tid)
  {
diff --git a/coregrind/m_syswrap/syswrap-main.c b/coregrind/m_syswrap/syswrap-main.c

index 11f56ec600cf1ab39d8c212ad94a4c96213aa188..5b8fc70feb214dfba16750bfe002094f368d870e 100644 (file)
--- a/coregrind/m_syswrap/syswrap-main.c
+++ b/coregrind/m_syswrap/syswrap-main.c
@@ -431,7 +431,6 @@ void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs*       canonical,
     canonical->arg7  = 0;
     canonical->arg8  = 0;
  
-
  #elif defined(VGP_ppc32_linux)
     VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
     canonical->sysno = gst->guest_GPR0;
@@ -444,7 +443,6 @@ void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs*       canonical,
     canonical->arg7  = 0;
     canonical->arg8  = 0;
  
-
  #elif defined(VGP_ppc64_linux)
     VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
     canonical->sysno = gst->guest_GPR0;
@@ -457,7 +455,6 @@ void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs*       canonical,
     canonical->arg7  = 0;
     canonical->arg8  = 0;
  
-
  #elif defined(VGP_ppc32_aix5)
     VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
     canonical->sysno = gst->guest_GPR2;
@@ -1377,7 +1374,17 @@ void VG_(client_syscall) ( ThreadId tid, UInt trc )
  
     /* Do any pre-syscall actions */
     if (VG_(needs).syscall_wrapper) {
-      VG_TDICT_CALL(tool_pre_syscall, tid, sysno);
+      UWord tmpv[8];
+      tmpv[0] = sci->orig_args.arg1;
+      tmpv[1] = sci->orig_args.arg2;
+      tmpv[2] = sci->orig_args.arg3;
+      tmpv[3] = sci->orig_args.arg4;
+      tmpv[4] = sci->orig_args.arg5;
+      tmpv[5] = sci->orig_args.arg6;
+      tmpv[6] = sci->orig_args.arg7;
+      tmpv[7] = sci->orig_args.arg8;
+      VG_TDICT_CALL(tool_pre_syscall, tid, sysno,
+                    &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]));
     }
  
     vg_assert(ent);
@@ -1655,8 +1662,21 @@ void VG_(post_syscall) (ThreadId tid)
        putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );
  
     /* Do any post-syscall actions required by the tool. */
-   if (VG_(needs).syscall_wrapper)
-      VG_TDICT_CALL(tool_post_syscall, tid, sysno, sci->status.sres);
+   if (VG_(needs).syscall_wrapper) {
+      UWord tmpv[8];
+      tmpv[0] = sci->orig_args.arg1;
+      tmpv[1] = sci->orig_args.arg2;
+      tmpv[2] = sci->orig_args.arg3;
+      tmpv[3] = sci->orig_args.arg4;
+      tmpv[4] = sci->orig_args.arg5;
+      tmpv[5] = sci->orig_args.arg6;
+      tmpv[6] = sci->orig_args.arg7;
+      tmpv[7] = sci->orig_args.arg8;
+      VG_TDICT_CALL(tool_post_syscall, tid, 
+                    sysno,
+                    &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]),
+                    sci->status.sres);
+   }
  
     /* The syscall is done. */
     vg_assert(sci->status.what == SsComplete);
diff --git a/coregrind/m_tooliface.c b/coregrind/m_tooliface.c

index 79a5e4bbddb369ae7684befd7cc419f8dc287e6d..4ed966d00e0c7622339621a41a2101095835cd66 100644 (file)
--- a/coregrind/m_tooliface.c
+++ b/coregrind/m_tooliface.c
@@ -269,8 +269,8 @@ void VG_(needs_client_requests)(
  }
  
  void VG_(needs_syscall_wrapper)(
-   void(*pre) (ThreadId, UInt),
-   void(*post)(ThreadId, UInt, SysRes res)
+   void(*pre) (ThreadId, UInt, UWord*, UInt),
+   void(*post)(ThreadId, UInt, UWord*, UInt, SysRes res)
  )
  {
     VG_(needs).syscall_wrapper = True;
diff --git a/coregrind/pub_core_signals.h b/coregrind/pub_core_signals.h

index 6d6f68364c9ec694c3122d528e3f15eff52d5614..92875bd26aec499f2ec29cc5c680111def81bc5d 100644 (file)
--- a/coregrind/pub_core_signals.h
+++ b/coregrind/pub_core_signals.h
@@ -73,6 +73,7 @@ extern void VG_(synth_fault_mapping)(ThreadId tid, Addr addr);
  extern void VG_(synth_fault_perms)  (ThreadId tid, Addr addr);
  extern void VG_(synth_sigill)       (ThreadId tid, Addr addr);
  extern void VG_(synth_sigtrap)      (ThreadId tid);
+extern void VG_(synth_sigbus)       (ThreadId tid);
  
  /* Extend the stack to cover addr, if possible */
  extern Bool VG_(extend_stack)(Addr addr, UInt maxsize);
diff --git a/coregrind/pub_core_tooliface.h b/coregrind/pub_core_tooliface.h

index 1131127ce4a920dd2230afaff5d0a99f7c28181e..25cfb2652e4435bc050786126f693ba3b98c0336 100644 (file)
--- a/coregrind/pub_core_tooliface.h
+++ b/coregrind/pub_core_tooliface.h
@@ -138,8 +138,8 @@ typedef struct {
     Bool (*tool_handle_client_request)(ThreadId, UWord*, UWord*);
  
     // VG_(needs).syscall_wrapper
-   void (*tool_pre_syscall) (ThreadId, UInt);
-   void (*tool_post_syscall)(ThreadId, UInt, SysRes);
+   void (*tool_pre_syscall) (ThreadId, UInt, UWord*, UInt);
+   void (*tool_post_syscall)(ThreadId, UInt, UWord*, UInt, SysRes);
  
     // VG_(needs).sanity_checks
     Bool (*tool_cheap_sanity_check)(void);
diff --git a/drd/drd_load_store.c b/drd/drd_load_store.c

index 2172c4716ec823db105d219867756d78115f626c..6398dac0d9edc1154fbe2a70589e5dd37bde5255 100644 (file)
--- a/drd/drd_load_store.c
+++ b/drd/drd_load_store.c
@@ -450,7 +450,6 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure,
     IRSB*    bb;
     IRExpr** argv;
     Bool     instrument = True;
-   Bool     bus_locked = False;
  
     /* Set up BB */
     bb           = emptyIRSB();
@@ -484,16 +483,6 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure,
           {
           case Imbe_Fence:
              break; /* not interesting */
-         case Imbe_BusLock:
-         case Imbe_SnoopedStoreBegin:
-            tl_assert(! bus_locked);
-            bus_locked = True;
-            break;
-         case Imbe_BusUnlock:
-         case Imbe_SnoopedStoreEnd:
-            tl_assert(bus_locked);
-            bus_locked = False;
-            break;
           default:
              tl_assert(0);
           }
@@ -501,7 +490,8 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure,
           break;
  
        case Ist_Store:
-         if (instrument && ! bus_locked)
+         if (instrument && /* ignore stores resulting from st{d,w}cx. */
+                           st->Ist.Store.resSC == IRTemp_INVALID)
           {
              instrument_store(bb,
                               st->Ist.Store.addr,
@@ -547,8 +537,7 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure,
                            argv);
                    addStmtToIRSB(bb, IRStmt_Dirty(di));
                 }
-               if ((mFx == Ifx_Write || mFx == Ifx_Modify)
-                   && ! bus_locked)
+               if (mFx == Ifx_Write || mFx == Ifx_Modify)
                 {
                    di = unsafeIRDirty_0_N(
                            /*regparms*/2,
@@ -565,14 +554,32 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure,
           addStmtToIRSB(bb, st);
           break;
  
+      case Ist_CAS:
+         if (instrument)
+         {
+            /* Just treat this as a read of the location.  I believe
+               this is equivalent to the previous logic, which
+               observed bus-lock/unlock Ist_MBEs, and ignored all
+               writes within sections bracketed by bus-lock and
+               bus-unlock annotations. */
+            Int    dataSize;
+            IRCAS* cas = st->Ist.CAS.details;
+            tl_assert(cas->addr != NULL);
+            tl_assert(cas->dataLo != NULL);
+            dataSize = sizeofIRType(typeOfIRExpr(bb->tyenv, cas->dataLo));
+            if (cas->dataHi != NULL)
+               dataSize *= 2; /* since it's a doubleword-CAS */
+            instrument_load(bb, cas->addr, dataSize);
+         }
+         addStmtToIRSB(bb, st);
+         break;
+
        default:
           addStmtToIRSB(bb, st);
           break;
        }
     }
  
-   tl_assert(! bus_locked);
-
     return bb;
  }
  
diff --git a/exp-ptrcheck/h_main.c b/exp-ptrcheck/h_main.c

index c2c2aa3dba1892fb4f1cfce9c39f44690e8f1414..d5e532fb1c6fd1eaebce69e18f7fe98335dd4613 100644 (file)
--- a/exp-ptrcheck/h_main.c
+++ b/exp-ptrcheck/h_main.c
@@ -1536,7 +1536,6 @@ static void get_IntRegInfo ( /*OUT*/IntRegInfo* iii, Int offset, Int szB )
     if (o == GOF(CTR)       && is4) goto exactly1;
     if (o == GOF(CIA)       && is4) goto none;
     if (o == GOF(IP_AT_SYSCALL) && is4) goto none;
-   if (o == GOF(RESVN)     && is4) goto none;
     if (o == GOF(TISTART)   && is4) goto none;
     if (o == GOF(TILEN)     && is4) goto none;
     if (o == GOF(REDIR_SP)  && is4) goto none;
@@ -1700,7 +1699,6 @@ static void get_IntRegInfo ( /*OUT*/IntRegInfo* iii, Int offset, Int szB )
     if (o == GOF(CTR)       && is8) goto exactly1;
     if (o == GOF(CIA)       && is8) goto none;
     if (o == GOF(IP_AT_SYSCALL) && is8) goto none;
-   if (o == GOF(RESVN)     && is8) goto none;
     if (o == GOF(TISTART)   && is8) goto none;
     if (o == GOF(TILEN)     && is8) goto none;
     if (o == GOF(REDIR_SP)  && is8) goto none;
@@ -2115,7 +2113,8 @@ void h_post_reg_write_clientcall(ThreadId tid, PtrdiffT guest_state_offset,
  /*--- System calls                                                 ---*/
  /*--------------------------------------------------------------------*/
  
-void h_pre_syscall ( ThreadId tid, UInt sysno )
+void h_pre_syscall ( ThreadId tid, UInt sysno,
+                     UWord* args, UInt nArgs )
  {
     /* we don't do anything at the pre-syscall point */
  }
@@ -2415,6 +2414,9 @@ static void setup_post_syscall_table ( void )
  #     if defined(__NR_shmget)
        ADD(1, __NR_shmget);
  #     endif
+#     if defined(__NR_ipc) && defined(VKI_SHMAT)
+      ADD(1, __NR_ipc); /* ppc{32,64}-linux horrors */
+#     endif
  
     /* --------------- AIX5 --------------- */
  
@@ -2473,7 +2475,8 @@ static void setup_post_syscall_table ( void )
  }
  
  
-void h_post_syscall ( ThreadId tid, UInt sysno, SysRes res )
+void h_post_syscall ( ThreadId tid, UInt sysno, 
+                      UWord* args, UInt nArgs, SysRes res )
  {
     Word i, n;
     UWordPair* pair;
@@ -2517,14 +2520,9 @@ void h_post_syscall ( ThreadId tid, UInt sysno, SysRes res )
  
     /* Deal with the common case */
     pair = VG_(indexXA)( post_syscall_table, i );
-   if (pair->uw2 == 0) {
-     /* the common case */
-      VG_(set_syscall_return_shadows)( 
-         tid, /* retval */ (UWord)NONPTR, 0,
-              /* error */  (UWord)NONPTR, 0
-      );
-      return;
-   }
+   if (pair->uw2 == 0)
+      /* the common case */
+      goto res_NONPTR_err_NONPTR;
  
     /* Special handling for all remaining cases */
     tl_assert(pair->uw2 == 1);
@@ -2537,24 +2535,15 @@ void h_post_syscall ( ThreadId tid, UInt sysno, SysRes res )
           syscall completes. */
        post_reg_write_nonptr_or_unknown( tid, PC_OFF_FS_ZERO, 
                                               PC_SZB_FS_ZERO );
-      VG_(set_syscall_return_shadows)( 
-         tid, /* retval */ (UWord)NONPTR, 0,
-              /* error */  (UWord)NONPTR, 0
-      );
-      return;
+      goto res_NONPTR_err_NONPTR;
     }
  #  endif
  
  #  if defined(__NR_brk)
     // With brk(), result (of kernel syscall, not glibc wrapper) is a heap
     // pointer.  Make the shadow UNKNOWN.
-   if (sysno ==  __NR_brk) {
-      VG_(set_syscall_return_shadows)( 
-         tid, /* retval */ (UWord)UNKNOWN, 0,
-              /* error */  (UWord)NONPTR,  0
-      );
-      return;
-   }
+   if (sysno == __NR_brk)
+      goto res_UNKNOWN_err_NONPTR;
  #  endif
  
     // With mmap, new_mem_mmap() has already been called and added the
@@ -2573,13 +2562,9 @@ void h_post_syscall ( ThreadId tid, UInt sysno, SysRes res )
        ) {
        if (sr_isError(res)) {
           // mmap() had an error, return value is a small negative integer
-         VG_(set_syscall_return_shadows)( tid, /*val*/ (UWord)NONPTR, 0,
-                                               /*err*/ (UWord)NONPTR, 0 );
-         if (0) VG_(printf)("ZZZZZZZ mmap res -> NONPTR\n");
+         goto res_NONPTR_err_NONPTR;
        } else {
-         VG_(set_syscall_return_shadows)( tid, /*val*/ (UWord)UNKNOWN, 0,
-                                               /*err*/ (UWord)NONPTR, 0 );
-         if (0) VG_(printf)("ZZZZZZZ mmap res -> UNKNOWN\n");
+         goto res_UNKNOWN_err_NONPTR;
        }
        return;
     }
@@ -2589,24 +2574,40 @@ void h_post_syscall ( ThreadId tid, UInt sysno, SysRes res )
  #  if defined(__NR_shmat)
     if (sysno == __NR_shmat) {
        if (sr_isError(res)) {
-         VG_(set_syscall_return_shadows)( tid, /*val*/ (UWord)NONPTR, 0,
-                                               /*err*/ (UWord)NONPTR, 0 );
-         if (0) VG_(printf)("ZZZZZZZ shmat res -> NONPTR\n");
+         goto res_NONPTR_err_NONPTR;
        } else {
-         VG_(set_syscall_return_shadows)( tid, /*val*/ (UWord)UNKNOWN, 0,
-                                               /*err*/ (UWord)NONPTR, 0 );
-         if (0) VG_(printf)("ZZZZZZZ shmat res -> UNKNOWN\n");
+         goto res_UNKNOWN_err_NONPTR;
        }
-      return;
     }
  #  endif
  
  #  if defined(__NR_shmget)
-   if (sysno == __NR_shmget) {
+   if (sysno == __NR_shmget)
        // FIXME: is this correct?
-      VG_(set_syscall_return_shadows)( tid, /*val*/ (UWord)UNKNOWN, 0,
-                                            /*err*/ (UWord)NONPTR, 0 );
-      return;
+      goto res_UNKNOWN_err_NONPTR;
+#  endif
+
+#  if defined(__NR_ipc) && defined(VKI_SHMAT)
+   /* perhaps this should be further conditionalised with
+      && (defined(VGP_ppc32_linux) || defined(VGP_ppc64_linux))
+      Note, this just copies the behaviour of __NR_shmget above.
+
+      JRS 2009 June 02: it seems that the return value from
+      sys_ipc(VKI_SHMAT, ...) doesn't have much relationship to the
+      result returned by the originating user-level shmat call.  It's
+      different (and much lower) by a large but integral number of
+      pages.  I don't have time to chase this right now.  Observed on
+      ppc{32,64}-linux.  Result appears to be false errors from apps
+      using shmat.  Confusion though -- shouldn't be related to the
+      actual numeric values returned by the syscall, though, should
+      it?  Confused.  Maybe some bad interaction with a
+      nonpointer-or-unknown heuristic? */
+   if (sysno == __NR_ipc) {
+      if (args[0] == VKI_SHMAT) {
+         goto res_UNKNOWN_err_NONPTR;
+      } else {
+         goto res_NONPTR_err_NONPTR;
+      }
     }
  #  endif
  
@@ -2614,6 +2615,16 @@ void h_post_syscall ( ThreadId tid, UInt sysno, SysRes res )
        post_syscall_table has .w2 == 1, which in turn implies there
        should be special-case code for it above. */
     tl_assert(0);
+
+  res_NONPTR_err_NONPTR:
+   VG_(set_syscall_return_shadows)( tid, /* retval */ (UWord)NONPTR, 0,
+                                         /* error */  (UWord)NONPTR, 0 );
+   return;
+
+  res_UNKNOWN_err_NONPTR:
+   VG_(set_syscall_return_shadows)( tid, /* retval */ (UWord)UNKNOWN, 0,
+                                         /* error */  (UWord)NONPTR, 0 );
+   return;
  }
  
  
@@ -2916,25 +2927,73 @@ void check_load1(Addr m, Seg* mptr_vseg)
  // ------------------ Store handlers ------------------ //
  
  /* On 32 bit targets, we will use:
-      check_store1 check_store2 check_store4_P
+      check_store1 check_store2 check_store4_P check_store4C_P
        check_store4 (for 32-bit nonpointer stores)
        check_store8_ms4B_ls4B (for 64-bit stores)
        check_store16_ms4B_4B_4B_ls4B (for xmm/altivec stores)
  
     On 64 bit targets, we will use:
-      check_store1 check_store2 check_store4 check_store8_P
+      check_store1 check_store2 check_store4 check_store4C
+      check_store8_P check_store8C_P
        check_store8_all8B (for 64-bit nonpointer stores)
        check_store16_ms8B_ls8B (for xmm/altivec stores)
  
     A "_P" handler writes a pointer to memory, and so has an extra
     argument -- the pointer's shadow value.  That implies that
-   check_store4_P is only to be called on a 32 bit host and
-   check_store8_P is only to be called on a 64 bit host.  For all
+   check_store4{,C}_P is only to be called on a 32 bit host and
+   check_store8{,C}_P is only to be called on a 64 bit host.  For all
     other cases, and for the misaligned _P cases, the strategy is to
     let the store go through, and then snoop around with
     nonptr_or_unknown to fix up the shadow values of any affected
     words. */
  
+/* Helpers for store-conditionals.  Ugly kludge :-(
+   They all return 1 if the SC was successful and 0 if it failed. */
+static inline UWord do_store_conditional_32( Addr m/*dst*/, UInt t/*val*/ )
+{
+#  if defined(VGA_ppc32) || defined(VGA_ppc64)
+   UWord success;
+   /* If this assertion fails, the underlying IR is (semantically) ill-formed
+      as per the IR spec for IRStmt_Store. */
+   tl_assert(VG_IS_4_ALIGNED(m));
+   __asm__ __volatile__(
+      "stwcx. %2,0,%1"    "\n\t" /* data,0,addr */
+      "mfcr   %0"         "\n\t"
+      "srwi   %0,%0,29"   "\n\t" /* move relevant CR bit to LSB */
+      : /*out*/"=b"(success) 
+      : /*in*/ "b"(m), "b"( (UWord)t ) 
+      : /*trash*/ "memory", "cc"
+        /* Note: srwi is OK even on 64-bit host because we're
+           after bit 29 (normal numbering) and we mask off all the
+           other junk just below. */
+   );
+   return success & (UWord)1;
+#  else
+   tl_assert(0); /* not implemented on other platforms */
+#  endif
+}
+
+static inline UWord do_store_conditional_64( Addr m/*dst*/, ULong t/*val*/ )
+{
+#  if defined(VGA_ppc64)
+   UWord success;
+   /* If this assertion fails, the underlying IR is (semantically) ill-formed
+      as per the IR spec for IRStmt_Store. */
+   tl_assert(VG_IS_8_ALIGNED(m));
+   __asm__ __volatile__(
+      "stdcx. %2,0,%1"    "\n\t" /* data,0,addr */
+      "mfcr   %0"         "\n\t"
+      "srdi   %0,%0,29"   "\n\t" /* move relevant CR bit to LSB */
+      : /*out*/"=b"(success) 
+      : /*in*/ "b"(m), "b"( (UWord)t ) 
+      : /*trash*/ "memory", "cc"
+   );
+   return success & (UWord)1;
+#  else
+   tl_assert(0); /* not implemented on other platforms */
+#  endif
+}
+
  /* Apply nonptr_or_unknown to all the words intersecting
     [a, a+len). */
  static VG_REGPARM(2)
@@ -3066,6 +3125,29 @@ void check_store8_P(Addr m, Seg* mptr_vseg, UWord t, Seg* t_vseg)
     }
  }
  
+// This handles 64 bit store-conditionals on 64 bit targets.  It must
+// not be called on 32 bit targets.
+static VG_REGPARM(3)
+UWord check_store8C_P(Addr m, Seg* mptr_vseg, UWord t, Seg* t_vseg)
+{
+   UWord success;
+   tl_assert(sizeof(UWord) == 8); /* DO NOT REMOVE */
+#  if SC_SEGS
+   checkSeg(t_vseg);
+   checkSeg(mptr_vseg);
+#  endif
+   check_load_or_store(/*is_write*/True, m, 8, mptr_vseg);
+   // Actually *do* the STORE here
+   success = do_store_conditional_64( m, t );
+   if (VG_IS_8_ALIGNED(m)) {
+      set_mem_vseg( m, t_vseg );
+   } else {
+      // straddling two words
+      nonptr_or_unknown_range(m, 8);
+   }
+   return success;
+}
+
  // This handles 32 bit stores on 32 bit targets.  It must
  // not be called on 64 bit targets.
  static VG_REGPARM(3)
@@ -3087,6 +3169,29 @@ void check_store4_P(Addr m, Seg* mptr_vseg, UWord t, Seg* t_vseg)
     }
  }
  
+// This handles 32 bit store-conditionals on 32 bit targets.  It must
+// not be called on 64 bit targets.
+static VG_REGPARM(3)
+UWord check_store4C_P(Addr m, Seg* mptr_vseg, UWord t, Seg* t_vseg)
+{
+   UWord success;
+   tl_assert(sizeof(UWord) == 4); /* DO NOT REMOVE */
+#  if SC_SEGS
+   checkSeg(t_vseg);
+   checkSeg(mptr_vseg);
+#  endif
+   check_load_or_store(/*is_write*/True, m, 4, mptr_vseg);
+   // Actually *do* the STORE here
+   success = do_store_conditional_32( m, t );
+   if (VG_IS_4_ALIGNED(m)) {
+      set_mem_vseg( m, t_vseg );
+   } else {
+      // straddling two words
+      nonptr_or_unknown_range(m, 4);
+   }
+   return success;
+}
+
  // Used for both 32 bit and 64 bit targets.
  static VG_REGPARM(3)
  void check_store4(Addr m, Seg* mptr_vseg, UWord t)
@@ -3100,6 +3205,23 @@ void check_store4(Addr m, Seg* mptr_vseg, UWord t)
     nonptr_or_unknown_range(m, 4);
  }
  
+// Used for 32-bit store-conditionals on 64 bit targets only.  It must
+// not be called on 32 bit targets.
+static VG_REGPARM(3)
+UWord check_store4C(Addr m, Seg* mptr_vseg, UWord t)
+{
+   UWord success;
+   tl_assert(sizeof(UWord) == 8); /* DO NOT REMOVE */
+#  if SC_SEGS
+   checkSeg(mptr_vseg);
+#  endif
+   check_load_or_store(/*is_write*/True, m, 4, mptr_vseg);
+   // Actually *do* the STORE here
+   success = do_store_conditional_32( m, t );
+   nonptr_or_unknown_range(m, 4);
+   return success;
+}
+
  // Used for both 32 bit and 64 bit targets.
  static VG_REGPARM(3)
  void check_store2(Addr m, Seg* mptr_vseg, UWord t)
@@ -4084,8 +4206,8 @@ static void gen_nonptr_or_unknown_for_III( PCEnv* pce, IntRegInfo* iii )
     }
  }
  
-/* Generate into 'ane', instrumentation for 'st'.  Also copy 'st'
-   itself into 'ane' (the caller does not do so).  This is somewhat
+/* Generate into 'pce', instrumentation for 'st'.  Also copy 'st'
+   itself into 'pce' (the caller does not do so).  This is somewhat
     complex and relies heavily on the assumption that the incoming IR
     is in flat form.
  
@@ -4243,20 +4365,54 @@ static void schemeS ( PCEnv* pce, IRStmt* st )
              the post-hoc ugly hack of inspecting and "improving" the
              shadow data after the store, in the case where it isn't an
              aligned word store.
+
+            Only word-sized values are shadowed.  If this is a
+            store-conditional, .resSC will denote a non-word-typed
+            temp, and so we don't need to shadow it.  Assert about the
+            type, tho.  However, since we're not re-emitting the
+            original IRStmt_Store, but rather doing it as part of the
+            helper function, we need to actually do a SC in the
+            helper, and assign the result bit to .resSC.  Ugly.
           */
           IRExpr* data  = st->Ist.Store.data;
           IRExpr* addr  = st->Ist.Store.addr;
           IRType  d_ty  = typeOfIRExpr(pce->bb->tyenv, data);
           IRExpr* addrv = schemeEw_Atom( pce, addr );
+         IRTemp  resSC = st->Ist.Store.resSC;
+         if (resSC != IRTemp_INVALID) {
+            tl_assert(typeOfIRTemp(pce->bb->tyenv, resSC) == Ity_I1);
+            /* viz, not something we want to shadow */
+            /* also, throw out all store-conditional cases that
+               we can't handle */
+            if (pce->gWordTy == Ity_I32 && d_ty != Ity_I32)
+               goto unhandled;
+            if (pce->gWordTy == Ity_I64 && d_ty != Ity_I32 && d_ty != Ity_I64)
+               goto unhandled;
+         }
           if (pce->gWordTy == Ity_I32) {
              /* ------ 32 bit host/guest (cough, cough) ------ */
              switch (d_ty) {
                 /* Integer word case */
                 case Ity_I32: {
                    IRExpr* datav = schemeEw_Atom( pce, data );
-                  gen_dirty_v_WWWW( pce,
-                                    &check_store4_P, "check_store4_P",
-                                    addr, addrv, data, datav );
+                  if (resSC == IRTemp_INVALID) {
+                     /* "normal" store */
+                     gen_dirty_v_WWWW( pce,
+                                       &check_store4_P, "check_store4_P",
+                                       addr, addrv, data, datav );
+                  } else {
+                     /* store-conditional; need to snarf the success bit */
+                     IRTemp resSC32
+                         = gen_dirty_W_WWWW( pce,
+                                             &check_store4C_P,
+                                             "check_store4C_P",
+                                             addr, addrv, data, datav );
+                     /* presumably resSC32 will really be Ity_I32.  In
+                        any case we'll get jumped by the IR sanity
+                        checker if it's not, when it sees the
+                        following statement. */
+                     assign( 'I', pce, resSC, unop(Iop_32to1, mkexpr(resSC32)) );
+                  }
                    break;
                 }
                 /* Integer subword cases */
@@ -4345,17 +4501,39 @@ static void schemeS ( PCEnv* pce, IRStmt* st )
                 /* Integer word case */
                 case Ity_I64: {
                    IRExpr* datav = schemeEw_Atom( pce, data );
-                  gen_dirty_v_WWWW( pce,
-                                    &check_store8_P, "check_store8_P",
-                                    addr, addrv, data, datav );
+                  if (resSC == IRTemp_INVALID) {
+                     /* "normal" store */
+                     gen_dirty_v_WWWW( pce,
+                                       &check_store8_P, "check_store8_P",
+                                       addr, addrv, data, datav );
+                  } else {
+                     IRTemp resSC64
+                         = gen_dirty_W_WWWW( pce,
+                                             &check_store8C_P,
+                                             "check_store8C_P",
+                                             addr, addrv, data, datav );
+                     assign( 'I', pce, resSC, unop(Iop_64to1, mkexpr(resSC64)) );
+                  }
                    break;
                 }
                 /* Integer subword cases */
                 case Ity_I32:
-                  gen_dirty_v_WWW( pce,
-                                   &check_store4, "check_store4",
-                                   addr, addrv,
-                                   uwiden_to_host_word( pce, data ));
+                  if (resSC == IRTemp_INVALID) {
+                     /* "normal" store */
+                     gen_dirty_v_WWW( pce,
+                                      &check_store4, "check_store4",
+                                      addr, addrv,
+                                      uwiden_to_host_word( pce, data ));
+                  } else {
+                     /* store-conditional; need to snarf the success bit */
+                     IRTemp resSC64
+                         = gen_dirty_W_WWW( pce,
+                                            &check_store4C,
+                                            "check_store4C",
+                                            addr, addrv,
+                                            uwiden_to_host_word( pce, data ));
+                     assign( 'I', pce, resSC, unop(Iop_64to1, mkexpr(resSC64)) );
+                  }
                    break;
                 case Ity_I16:
                    gen_dirty_v_WWW( pce,
diff --git a/exp-ptrcheck/h_main.h b/exp-ptrcheck/h_main.h

index de626227e92c10c0f303e9793265e2106cdcd929..8bbe07a218f04f06d3b35f59dee6fc83edc7d8a0 100644 (file)
--- a/exp-ptrcheck/h_main.h
+++ b/exp-ptrcheck/h_main.h
@@ -82,8 +82,10 @@ void h_post_reg_write_demux ( CorePart part, ThreadId tid,
  void h_post_reg_write_clientcall(ThreadId tid, PtrdiffT guest_state_offset,
                                   SizeT size, Addr f );
  
-void h_pre_syscall ( ThreadId tid, UInt syscallno );
-void h_post_syscall ( ThreadId tid, UInt syscallno, SysRes res );
+void h_pre_syscall ( ThreadId tid, UInt syscallno,
+                     UWord* args, UInt nArgs );
+void h_post_syscall ( ThreadId tid, UInt syscallno,
+                      UWord* args, UInt nArgs, SysRes res );
  
  /* Note that this also does the sg_ instrumentation. */
  IRSB* h_instrument ( VgCallbackClosure* closure,
diff --git a/exp-ptrcheck/sg_main.c b/exp-ptrcheck/sg_main.c

index e285a3d1dba871b8879f0db0ab57714c5f447b29..20b53823e6d6259c769abd2fa52752e7e8c1b902 100644 (file)
--- a/exp-ptrcheck/sg_main.c
+++ b/exp-ptrcheck/sg_main.c
@@ -2226,6 +2226,33 @@ void sg_instrument_IRStmt ( /*MOD*/struct _SGEnv * env,
           break;
        }
  
+      case Ist_CAS: {
+         /* We treat it as a read and a write of the location.  I
+            think that is the same behaviour as it was before IRCAS
+            was introduced, since prior to that point, the Vex front
+            ends would translate a lock-prefixed instruction into a
+            (normal) read followed by a (normal) write. */
+         if (env->firstRef) {
+            Int    dataSize;
+            IRCAS* cas = st->Ist.CAS.details;
+            tl_assert(cas->addr != NULL);
+            tl_assert(cas->dataLo != NULL);
+            dataSize = sizeofIRType(typeOfIRExpr(sbOut->tyenv, cas->dataLo));
+            if (cas->dataHi != NULL)
+               dataSize *= 2; /* since it's a doubleword-CAS */
+            instrument_mem_access(
+               sbOut, cas->addr, dataSize, False/*!isStore*/,
+               sizeofIRType(hWordTy), env->curr_IP, layout
+            );
+            instrument_mem_access(
+               sbOut, cas->addr, dataSize, True/*isStore*/,
+               sizeofIRType(hWordTy), env->curr_IP, layout
+            );
+            env->firstRef = False;
+         }
+         break;
+      }
+
        default:
           tl_assert(0);
  
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c

index bff2fd59ebd85ece529f8aca4ab34c981b963638..9080ca6d5b3f4f7016baffbf564f08e711470203 100644 (file)
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -3603,40 +3603,6 @@ static void instrument_mem_access ( IRSB*   bbOut,
  }
  
  
-//static void instrument_memory_bus_event ( IRSB* bbOut, IRMBusEvent event )
-//{
-//   switch (event) {
-//      case Imbe_SnoopedStoreBegin:
-//      case Imbe_SnoopedStoreEnd:
-//         /* These arise from ppc stwcx. insns.  They should perhaps be
-//            handled better. */
-//         break;
-//      case Imbe_Fence:
-//         break; /* not interesting */
-//      case Imbe_BusLock:
-//      case Imbe_BusUnlock:
-//         addStmtToIRSB(
-//            bbOut,
-//            IRStmt_Dirty(
-//               unsafeIRDirty_0_N( 
-//                  0/*regparms*/, 
-//                  event == Imbe_BusLock ? "evh__bus_lock"
-//                                        : "evh__bus_unlock",
-//                  VG_(fnptr_to_fnentry)(
-//                     event == Imbe_BusLock ? &evh__bus_lock 
-//                                           : &evh__bus_unlock 
-//                  ),
-//                  mkIRExprVec_0() 
-//               )
-//            )
-//         );
-//         break;
-//      default:
-//         tl_assert(0);
-//   }
-//}
-
-
  static
  IRSB* hg_instrument ( VgCallbackClosure* closure,
                        IRSB* bbIn,
@@ -3644,10 +3610,10 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
                        VexGuestExtents* vge,
                        IRType gWordTy, IRType hWordTy )
  {
-   Int   i;
-   IRSB* bbOut;
-   Bool  x86busLocked   = False;
-   Bool  isSnoopedStore = False;
+   Int     i;
+   IRSB*   bbOut;
+   Addr64  cia; /* address of current insn */
+   IRStmt* st;
  
     if (gWordTy != hWordTy) {
        /* We don't currently support this case. */
@@ -3667,8 +3633,16 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
        i++;
     }
  
+   // Get the first statement, and initial cia from it
+   tl_assert(bbIn->stmts_used > 0);
+   tl_assert(i < bbIn->stmts_used);
+   st = bbIn->stmts[i];
+   tl_assert(Ist_IMark == st->tag);
+   cia = st->Ist.IMark.addr;
+   st = NULL;
+
     for (/*use current i*/; i < bbIn->stmts_used; i++) {
-      IRStmt* st = bbIn->stmts[i];
+      st = bbIn->stmts[i];
        tl_assert(st);
        tl_assert(isFlatIRStmt(st));
        switch (st->tag) {
@@ -3676,43 +3650,45 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
           case Ist_AbiHint:
           case Ist_Put:
           case Ist_PutI:
-         case Ist_IMark:
           case Ist_Exit:
              /* None of these can contain any memory references. */
              break;
  
+         case Ist_IMark:
+            /* no mem refs, but note the insn address. */
+            cia = st->Ist.IMark.addr;
+            break;
+
           case Ist_MBE:
              //instrument_memory_bus_event( bbOut, st->Ist.MBE.event );
              switch (st->Ist.MBE.event) {
                 case Imbe_Fence:
                    break; /* not interesting */
-               /* Imbe_Bus{Lock,Unlock} arise from x86/amd64 LOCK
-                  prefixed instructions. */
-               case Imbe_BusLock:
-                  tl_assert(x86busLocked == False);
-                  x86busLocked = True;
-                  break;
-               case Imbe_BusUnlock:
-                  tl_assert(x86busLocked == True);
-                  x86busLocked = False;
-                  break;
-                  /* Imbe_SnoopedStore{Begin,End} arise from ppc
-                     stwcx. instructions. */
-               case Imbe_SnoopedStoreBegin:
-                  tl_assert(isSnoopedStore == False);
-                  isSnoopedStore = True;
-                  break;
-               case Imbe_SnoopedStoreEnd:
-                  tl_assert(isSnoopedStore == True);
-                  isSnoopedStore = False;
-                  break;
                 default:
                    goto unhandled;
              }
              break;
  
+         case Ist_CAS: {
+            /* Atomic read-modify-write cycle.  Just pretend it's a
+               read. */
+            IRCAS* cas    = st->Ist.CAS.details;
+            Bool   isDCAS = cas->dataHi != NULL;
+            instrument_mem_access(
+               bbOut,
+               cas->addr,
+               (isDCAS ? 2 : 1)
+                  * sizeofIRType(typeOfIRExpr(bbIn->tyenv, cas->dataLo)),
+               False/*!isStore*/,
+               sizeofIRType(hWordTy)
+            );
+            break;
+         }
+
           case Ist_Store:
-            if (!x86busLocked && !isSnoopedStore)
+            /* It seems we pretend that store-conditionals don't
+               exist, viz, just ignore them ... */
+            if (st->Ist.Store.resSC == IRTemp_INVALID) {
                 instrument_mem_access( 
                    bbOut, 
                    st->Ist.Store.addr, 
@@ -3720,9 +3696,12 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
                    True/*isStore*/,
                    sizeofIRType(hWordTy)
                 );
+            }
              break;
  
           case Ist_WrTmp: {
+            /* ... whereas here we don't care whether a load is a
+               vanilla one or a load-linked. */
              IRExpr* data = st->Ist.WrTmp.data;
              if (data->tag == Iex_Load) {
                 instrument_mem_access(
@@ -3751,11 +3730,6 @@ IRSB* hg_instrument ( VgCallbackClosure* closure,
                       sizeofIRType(hWordTy)
                    );
                 }
-               /* This isn't really correct.  Really the
-                  instrumentation should be only added when
-                  (!x86busLocked && !isSnoopedStore), just like with
-                  Ist_Store.  Still, I don't think this is
-                  particularly important. */
                 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
                    instrument_mem_access( 
                       bbOut, d->mAddr, dataSize, True/*isStore*/,
diff --git a/helgrind/libhb_core.c b/helgrind/libhb_core.c

index 1a4c046446803159279d8541eaf37e272f6bbece..6deccf0596dbf6f0e013410032eef2061bbdf861 100644 (file)
--- a/helgrind/libhb_core.c
+++ b/helgrind/libhb_core.c
@@ -2393,7 +2393,7 @@ static POrd VtsID__getOrdering_WRK ( VtsID vi1, VtsID vi2 ) {
     return ord;
  }
  static inline POrd VtsID__getOrdering ( VtsID vi1, VtsID vi2 ) {
-   return vi1 == vi2  ? POrd_EQ  : VtsID__getOrdering_WRK(vi1, vi2);
+   return LIKELY(vi1 == vi2)  ? POrd_EQ  : VtsID__getOrdering_WRK(vi1, vi2);
  }
  
  /* compute binary join */
@@ -2424,7 +2424,7 @@ static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
     return res;
  }
  static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
-   return vi1 == vi2  ? vi1  : VtsID__join2_WRK(vi1, vi2);
+   return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
  }
  
  /* create a singleton VTS, namely [thr:1] */
@@ -3653,7 +3653,7 @@ static inline SVal msm_read ( SVal svOld,
        tl_assert(is_sane_SVal_C(svOld));
     }
  
-   if (SVal__isC(svOld)) {
+   if (LIKELY(SVal__isC(svOld))) {
        POrd  ord;
        VtsID tviR  = acc_thr->viR;
        VtsID tviW  = acc_thr->viW;
@@ -3661,7 +3661,7 @@ static inline SVal msm_read ( SVal svOld,
        VtsID wmini = SVal__unC_Wmin(svOld);
  
        ord = VtsID__getOrdering(rmini,tviR);
-      if (ord == POrd_EQ || ord == POrd_LT) {
+      if (LIKELY(ord == POrd_EQ || ord == POrd_LT)) {
           /* no race */
           /* Note: RWLOCK subtlety: use tviW, not tviR */
           svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
@@ -3708,9 +3708,10 @@ static inline SVal msm_read ( SVal svOld,
     if (CHECK_MSM) {
        tl_assert(is_sane_SVal_C(svNew));
     }
-   tl_assert(svNew != SVal_INVALID);
-   if (svNew != svOld && HG_(clo_show_conflicts)) {
-      if (SVal__isC(svOld) && SVal__isC(svNew)) {
+   if (UNLIKELY(svNew != svOld)) {
+      tl_assert(svNew != SVal_INVALID);
+      if (HG_(clo_show_conflicts)
+          && SVal__isC(svOld) && SVal__isC(svNew)) {
           event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
           stats__msm_read_change++;
        }
@@ -3734,13 +3735,13 @@ static inline SVal msm_write ( SVal svOld,
        tl_assert(is_sane_SVal_C(svOld));
     }
  
-   if (SVal__isC(svOld)) {
+   if (LIKELY(SVal__isC(svOld))) {
        POrd  ord;
        VtsID tviW  = acc_thr->viW;
        VtsID wmini = SVal__unC_Wmin(svOld);
  
        ord = VtsID__getOrdering(wmini,tviW);
-      if (ord == POrd_EQ || ord == POrd_LT) {
+      if (LIKELY(ord == POrd_EQ || ord == POrd_LT)) {
           /* no race */
           svNew = SVal__mkC( tviW, tviW );
           goto out;
@@ -3807,9 +3808,10 @@ static inline SVal msm_write ( SVal svOld,
     if (CHECK_MSM) {
        tl_assert(is_sane_SVal_C(svNew));
     }
-   tl_assert(svNew != SVal_INVALID);
-   if (svNew != svOld && HG_(clo_show_conflicts)) {
-      if (SVal__isC(svOld) && SVal__isC(svNew)) {
+   if (UNLIKELY(svNew != svOld)) {
+      tl_assert(svNew != SVal_INVALID);
+      if (HG_(clo_show_conflicts)
+          && SVal__isC(svOld) && SVal__isC(svNew)) {
           event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
           stats__msm_write_change++;
        }
@@ -3845,7 +3847,8 @@ void zsm_apply8___msm_read ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_read( svOld, thr,a,1 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
  }
  
@@ -3868,7 +3871,8 @@ void zsm_apply8___msm_write ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_write( svOld, thr,a,1 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
  }
  
@@ -3898,7 +3902,8 @@ void zsm_apply16___msm_read ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_read( svOld, thr,a,2 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
     return;
    slowcase: /* misaligned, or must go further down the tree */
@@ -3931,7 +3936,8 @@ void zsm_apply16___msm_write ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_write( svOld, thr,a,2 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
     return;
    slowcase: /* misaligned, or must go further down the tree */
@@ -3965,7 +3971,8 @@ void zsm_apply32___msm_read ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_read( svOld, thr,a,4 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
     return;
    slowcase: /* misaligned, or must go further down the tree */
@@ -3997,7 +4004,8 @@ void zsm_apply32___msm_write ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_write( svOld, thr,a,4 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
     return;
    slowcase: /* misaligned, or must go further down the tree */
@@ -4026,7 +4034,8 @@ void zsm_apply64___msm_read ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_read( svOld, thr,a,8 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
     return;
    slowcase: /* misaligned, or must go further down the tree */
@@ -4053,7 +4062,8 @@ void zsm_apply64___msm_write ( Thr* thr, Addr a ) {
     }
     svOld = cl->svals[cloff];
     svNew = msm_write( svOld, thr,a,8 );
-   tl_assert(svNew != SVal_INVALID);
+   if (CHECK_ZSM)
+      tl_assert(svNew != SVal_INVALID);
     cl->svals[cloff] = svNew;
     return;
    slowcase: /* misaligned, or must go further down the tree */
diff --git a/include/pub_tool_tooliface.h b/include/pub_tool_tooliface.h

index 5b1725c2cef759309dc8a6647b564a217699b620..a006c4b6453e7d42eb518215ae5a0e7a324e4632 100644 (file)
--- a/include/pub_tool_tooliface.h
+++ b/include/pub_tool_tooliface.h
@@ -386,9 +386,19 @@ extern void VG_(needs_client_requests) (
  /* Tool does stuff before and/or after system calls? */
  // Nb: If either of the pre_ functions malloc() something to return, the
  // corresponding post_ function had better free() it!
+// Also, the args are the 'original args' -- that is, it may be
+// that the syscall pre-wrapper will modify the args before the
+// syscall happens.  So these args are the original, un-modified
+// args.  Finally, nArgs merely indicates the length of args[..],
+// it does not indicate how many of those values are actually
+// relevant to the syscall.  args[0 .. nArgs-1] is guaranteed
+// to be defined and to contain all the args for this syscall,
+// possibly including some trailing zeroes.
  extern void VG_(needs_syscall_wrapper) (
-   void (* pre_syscall)(ThreadId tid, UInt syscallno),
-   void (*post_syscall)(ThreadId tid, UInt syscallno, SysRes res)
+               void (* pre_syscall)(ThreadId tid, UInt syscallno,
+                                    UWord* args, UInt nArgs),
+               void (*post_syscall)(ThreadId tid, UInt syscallno,
+                                    UWord* args, UInt nArgs, SysRes res)
  );
  
  /* Are tool-state sanity checks performed? */
diff --git a/lackey/lk_main.c b/lackey/lk_main.c

index c0394806c474dde711ac14044df4beb808755ea4..1a52c6611498ae59d7ab6de64b5b1058841406de 100644 (file)
--- a/lackey/lk_main.c
+++ b/lackey/lk_main.c
@@ -784,6 +784,27 @@ IRSB* lk_instrument ( VgCallbackClosure* closure,
              break;
           }
  
+         case Ist_CAS: {
+            /* We treat it as a read and a write of the location.  I
+               think that is the same behaviour as it was before IRCAS
+               was introduced, since prior to that point, the Vex
+               front ends would translate a lock-prefixed instruction
+               into a (normal) read followed by a (normal) write. */
+            if (clo_trace_mem) {
+               Int    dataSize;
+               IRCAS* cas = st->Ist.CAS.details;
+               tl_assert(cas->addr != NULL);
+               tl_assert(cas->dataLo != NULL);
+               dataSize = sizeofIRType(typeOfIRExpr(tyenv, cas->dataLo));
+               if (cas->dataHi != NULL)
+                  dataSize *= 2; /* since it's a doubleword-CAS */
+               addEvent_Dr( sbOut, cas->addr, dataSize );
+               addEvent_Dw( sbOut, cas->addr, dataSize );
+            }
+            addStmtToIRSB( sbOut, st );
+            break;
+         }
+
           case Ist_Exit:
              if (clo_basic_counts) {
                 // The condition of a branch was inverted by VEX if a taken
diff --git a/massif/ms_main.c b/massif/ms_main.c

index c54fc3cf1d581904bb4a4068e26dae18abb0d323..867652c84ba7fa664168bd134286a8572d26cfb8 100644 (file)
--- a/massif/ms_main.c
+++ b/massif/ms_main.c
@@ -1899,12 +1899,14 @@ static void add_counter_update(IRSB* sbOut, Int n)
     IRTemp t2 = newIRTemp(sbOut->tyenv, Ity_I64);
     IRExpr* counter_addr = mkIRExpr_HWord( (HWord)&guest_instrs_executed );
  
-   IRStmt* st1 = IRStmt_WrTmp(t1, IRExpr_Load(END, Ity_I64, counter_addr));
+   IRStmt* st1 = IRStmt_WrTmp(t1, IRExpr_Load(False/*!isLL*/,
+                                              END, Ity_I64, counter_addr));
     IRStmt* st2 =
        IRStmt_WrTmp(t2,
                     IRExpr_Binop(Iop_Add64, IRExpr_RdTmp(t1),
                                             IRExpr_Const(IRConst_U64(n))));
-   IRStmt* st3 = IRStmt_Store(END, counter_addr, IRExpr_RdTmp(t2));
+   IRStmt* st3 = IRStmt_Store(END, IRTemp_INVALID/*"not store-conditional"*/,
+                              counter_addr, IRExpr_RdTmp(t2));
  
     addStmtToIRSB( sbOut, st1 );
     addStmtToIRSB( sbOut, st2 );
diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c

index 05f6da4f4e889e87ab80882129824187e6ba8458..752f1a4f0ea7465781c27fda27d7c7824d7899b1 100644 (file)
--- a/memcheck/mc_machine.c
+++ b/memcheck/mc_machine.c
@@ -182,7 +182,6 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
  
     if (o == GOF(CIA)       && sz == 8) return -1;
     if (o == GOF(IP_AT_SYSCALL) && sz == 8) return -1; /* slot unused */
-   if (o == GOF(RESVN)     && sz == 8) return -1;
     if (o == GOF(FPROUND)   && sz == 4) return -1;
     if (o == GOF(EMWARN)    && sz == 4) return -1;
     if (o == GOF(TISTART)   && sz == 8) return -1;
@@ -341,7 +340,6 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
  
     if (o == GOF(CIA)       && sz == 4) return -1;
     if (o == GOF(IP_AT_SYSCALL) && sz == 4) return -1; /* slot unused */
-   if (o == GOF(RESVN)     && sz == 4) return -1;
     if (o == GOF(FPROUND)   && sz == 4) return -1;
     if (o == GOF(VRSAVE)    && sz == 4) return -1;
     if (o == GOF(EMWARN)    && sz == 4) return -1;
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c

index bc8d698ada30e592ac4b0ff522f0668b0ec3003f..65b266ae3f24f459acbb5819ecbb6aca2bf0f7c3 100644 (file)
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -126,25 +126,56 @@ static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
  /*--- Memcheck running state, and tmp management.          ---*/
  /*------------------------------------------------------------*/
  
+/* Carries info about a particular tmp.  The tmp's number is not
+   recorded, as this is implied by (equal to) its index in the tmpMap
+   in MCEnv.  The tmp's type is also not recorded, as this is present
+   in MCEnv.sb->tyenv.
+
+   When .kind is Orig, .shadowV and .shadowB may give the identities
+   of the temps currently holding the associated definedness (shadowV)
+   and origin (shadowB) values, or these may be IRTemp_INVALID if code
+   to compute such values has not yet been emitted.
+
+   When .kind is VSh or BSh then the tmp holds a V- or B- value,
+   and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
+   illogical for a shadow tmp itself to be shadowed.
+*/
+typedef
+   enum { Orig=1, VSh=2, BSh=3 }
+   TempKind;
+
+typedef
+   struct {
+      TempKind kind;
+      IRTemp   shadowV;
+      IRTemp   shadowB;
+   }
+   TempMapEnt;
+
+
  /* Carries around state during memcheck instrumentation. */
  typedef
     struct _MCEnv {
        /* MODIFIED: the superblock being constructed.  IRStmts are
           added. */
-      IRSB* bb;
+      IRSB* sb;
        Bool  trace;
  
-      /* MODIFIED: a table [0 .. #temps_in_original_bb-1] which maps
-         original temps to their current their current shadow temp.
-         Initially all entries are IRTemp_INVALID.  Entries are added
-         lazily since many original temps are not used due to
-         optimisation prior to instrumentation.  Note that floating
-         point original tmps are shadowed by integer tmps of the same
-         size, and Bit-typed original tmps are shadowed by the type
-         Ity_I8.  See comment below. */
-      IRTemp* tmpMapV;        /* V-bit tmp shadows */
-      IRTemp* tmpMapB; /* origin tracking tmp shadows */
-      Int     n_originalTmps; /* for range checking */
+      /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
+         current kind and possibly shadow temps for each temp in the
+         IRSB being constructed.  Note that it does not contain the
+         type of each tmp.  If you want to know the type, look at the
+         relevant entry in sb->tyenv.  It follows that at all times
+         during the instrumentation process, the valid indices for
+         tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
+         the total number of Orig, V- and B- temps allocated so far.
+
+         The reason for this strange split (types in one place, all
+         other info in another) is that we need the types to be
+         attached to sb so as to make it possible to do
+         "typeOfIRExpr(mce->sb->tyenv, ...)" at various places in the
+         instrumentation process. */
+      XArray* /* of TempMapEnt */ tmpMap;
  
        /* MODIFIED: indicates whether "bogus" literals have so far been
           found.  Starts off False, and may change to True. */
@@ -185,17 +216,44 @@ typedef
     sanity checker should catch all such anomalies, however.  
  */
  
+/* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
+   both the table in mce->sb and to our auxiliary mapping.  Note that
+   newTemp may cause mce->tmpMap to resize, hence previous results
+   from VG_(indexXA)(mce->tmpMap) are invalidated. */
+static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
+{
+   Word       newIx;
+   TempMapEnt ent;
+   IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
+   ent.kind    = kind;
+   ent.shadowV = IRTemp_INVALID;
+   ent.shadowB = IRTemp_INVALID;
+   newIx = VG_(addToXA)( mce->tmpMap, &ent );
+   tl_assert(newIx == (Word)tmp);
+   return tmp;
+}
+
+
  /* Find the tmp currently shadowing the given original tmp.  If none
     so far exists, allocate one.  */
  static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
  {
-   tl_assert(orig < mce->n_originalTmps);
-   if (mce->tmpMapV[orig] == IRTemp_INVALID) {
-      mce->tmpMapV[orig] 
-         = newIRTemp(mce->bb->tyenv, 
-                     shadowTypeV(mce->bb->tyenv->types[orig]));
+   TempMapEnt* ent;
+   /* VG_(indexXA) range-checks 'orig', hence no need to check
+      here. */
+   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
+   tl_assert(ent->kind == Orig);
+   if (ent->shadowV == IRTemp_INVALID) {
+      IRTemp tmpV
+        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
+      /* newTemp may cause mce->tmpMap to resize, hence previous results
+         from VG_(indexXA) are invalid. */
+      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
+      tl_assert(ent->kind == Orig);
+      tl_assert(ent->shadowV == IRTemp_INVALID);
+      ent->shadowV = tmpV;
     }
-   return mce->tmpMapV[orig];
+   return ent->shadowV;
  }
  
  /* Allocate a new shadow for the given original tmp.  This means any
@@ -203,13 +261,27 @@ static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
     necessary to give a new value to a shadow once it has been tested
     for undefinedness, but unfortunately IR's SSA property disallows
     this.  Instead we must abandon the old shadow, allocate a new one
-   and use that instead. */
+   and use that instead.
+
+   This is the same as findShadowTmpV, except we don't bother to see
+   if a shadow temp already existed -- we simply allocate a new one
+   regardless. */
  static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
  {
-   tl_assert(orig < mce->n_originalTmps);
-   mce->tmpMapV[orig] 
-      = newIRTemp(mce->bb->tyenv, 
-                  shadowTypeV(mce->bb->tyenv->types[orig]));
+   TempMapEnt* ent;
+   /* VG_(indexXA) range-checks 'orig', hence no need to check
+      here. */
+   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
+   tl_assert(ent->kind == Orig);
+   if (1) {
+      IRTemp tmpV
+        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
+      /* newTemp may cause mce->tmpMap to resize, hence previous results
+         from VG_(indexXA) are invalid. */
+      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
+      tl_assert(ent->kind == Orig);
+      ent->shadowV = tmpV;
+   }
  }
  
  
@@ -232,8 +304,10 @@ static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
  {
     if (a1->tag == Iex_Const)
        return True;
-   if (a1->tag == Iex_RdTmp && a1->Iex.RdTmp.tmp < mce->n_originalTmps)
-      return True;
+   if (a1->tag == Iex_RdTmp) {
+      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
+      return ent->kind == Orig;
+   }
     return False;
  }
  
@@ -243,8 +317,10 @@ static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
  {
     if (a1->tag == Iex_Const)
        return True;
-   if (a1->tag == Iex_RdTmp && a1->Iex.RdTmp.tmp >= mce->n_originalTmps)
-      return True;
+   if (a1->tag == Iex_RdTmp) {
+      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
+      return ent->kind == VSh || ent->kind == BSh;
+   }
     return False;
  }
  
@@ -312,13 +388,13 @@ static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
        ppIRStmt(st);
        VG_(printf)("\n");
     }
-   addStmtToIRSB(mce->bb, st);
+   addStmtToIRSB(mce->sb, st);
  }
  
  /* assign value to tmp */
  static inline 
  void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
-  stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
+   stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
  }
  
  /* build various kinds of expressions */
@@ -336,14 +412,24 @@ void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
     an atom.
  
     'ty' is the type of 'e' and hence the type that the new temporary
-   needs to be.  But passing it is redundant, since we can deduce the
-   type merely by inspecting 'e'.  So at least that fact to assert
-   that the two types agree. */
-static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e ) {
-   IRTemp t;
-   IRType tyE = typeOfIRExpr(mce->bb->tyenv, e);
+   needs to be.  But passing it in is redundant, since we can deduce
+   the type merely by inspecting 'e'.  So at least use that fact to
+   assert that the two types agree. */
+static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
+{
+   TempKind k;
+   IRTemp   t;
+   IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
     tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
-   t = newIRTemp(mce->bb->tyenv, ty);
+   switch (cat) {
+      case 'V': k = VSh;  break;
+      case 'B': k = BSh;  break;
+      case 'C': k = Orig; break; 
+                /* happens when we are making up new "orig"
+                   expressions, for IRCAS handling */
+      default: tl_assert(0);
+   }
+   t = newTemp(mce, ty, k);
     assign(cat, mce, t, e);
     return mkexpr(t);
  }
@@ -569,7 +655,7 @@ static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
     /* Note, dst_ty is a shadow type, not an original type. */
     /* First of all, collapse vbits down to a single bit. */
     tl_assert(isShadowAtom(mce,vbits));
-   src_ty = typeOfIRExpr(mce->bb->tyenv, vbits);
+   src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
  
     /* Fast-track some common cases */
     if (src_ty == Ity_I32 && dst_ty == Ity_I32)
@@ -928,7 +1014,7 @@ static void complainIfUndefined ( MCEnv* mce, IRAtom* atom )
     tl_assert(isShadowAtom(mce, vatom));
     tl_assert(sameKindedAtoms(atom, vatom));
  
-   ty = typeOfIRExpr(mce->bb->tyenv, vatom);
+   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
  
     /* sz is only used for constructing the error message */
     sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
@@ -1112,7 +1198,7 @@ void do_shadow_PUT ( MCEnv* mce,  Int offset,
        tl_assert(isShadowAtom(mce, vatom));
     }
  
-   ty = typeOfIRExpr(mce->bb->tyenv, vatom);
+   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
     tl_assert(ty != Ity_I1);
     if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
        /* later: no ... */
@@ -1226,8 +1312,8 @@ static
  IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
  {
     IRAtom* at;
-   IRType t1 = typeOfIRExpr(mce->bb->tyenv, va1);
-   IRType t2 = typeOfIRExpr(mce->bb->tyenv, va2);
+   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
+   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
     tl_assert(isShadowAtom(mce,va1));
     tl_assert(isShadowAtom(mce,va2));
  
@@ -1275,9 +1361,9 @@ IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
                    IRAtom* va1, IRAtom* va2, IRAtom* va3 )
  {
     IRAtom* at;
-   IRType t1 = typeOfIRExpr(mce->bb->tyenv, va1);
-   IRType t2 = typeOfIRExpr(mce->bb->tyenv, va2);
-   IRType t3 = typeOfIRExpr(mce->bb->tyenv, va3);
+   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
+   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
+   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
     tl_assert(isShadowAtom(mce,va1));
     tl_assert(isShadowAtom(mce,va2));
     tl_assert(isShadowAtom(mce,va3));
@@ -1344,10 +1430,10 @@ IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
                    IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
  {
     IRAtom* at;
-   IRType t1 = typeOfIRExpr(mce->bb->tyenv, va1);
-   IRType t2 = typeOfIRExpr(mce->bb->tyenv, va2);
-   IRType t3 = typeOfIRExpr(mce->bb->tyenv, va3);
-   IRType t4 = typeOfIRExpr(mce->bb->tyenv, va4);
+   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
+   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
+   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
+   IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
     tl_assert(isShadowAtom(mce,va1));
     tl_assert(isShadowAtom(mce,va2));
     tl_assert(isShadowAtom(mce,va3));
@@ -1416,7 +1502,7 @@ IRAtom* mkLazyN ( MCEnv* mce,
        tl_assert(isOriginalAtom(mce, exprvec[i]));
        if (cee->mcx_mask & (1<<i))
           continue;
-      if (typeOfIRExpr(mce->bb->tyenv, exprvec[i]) != Ity_I64)
+      if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
           mergeTy64 = False;
     }
  
@@ -2726,7 +2812,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
  
     /* We need to have a place to park the V bits we're just about to
        read. */
-   datavbits = newIRTemp(mce->bb->tyenv, ty);
+   datavbits = newTemp(mce, ty, VSh);
     di = unsafeIRDirty_1_N( datavbits, 
                             1/*regparms*/, 
                             hname, VG_(fnptr_to_fnentry)( helper ), 
@@ -2786,7 +2872,7 @@ IRAtom* expr2vbits_Mux0X ( MCEnv* mce,
     vbitsC = expr2vbits(mce, cond);
     vbits0 = expr2vbits(mce, expr0);
     vbitsX = expr2vbits(mce, exprX);
-   ty = typeOfIRExpr(mce->bb->tyenv, vbits0);
+   ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
  
     return
        mkUifU(mce, ty, assignNew('V', mce, ty, 
@@ -2812,7 +2898,7 @@ IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
           return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
  
        case Iex_Const:
-         return definedOfType(shadowTypeV(typeOfIRExpr(mce->bb->tyenv, e)));
+         return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
  
        case Iex_Qop:
           return expr2vbits_Qop(
@@ -2875,7 +2961,7 @@ IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
     /* vatom is vbits-value and as such can only have a shadow type. */
     tl_assert(isShadowAtom(mce,vatom));
  
-   ty  = typeOfIRExpr(mce->bb->tyenv, vatom);
+   ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
     tyH = mce->hWordTy;
  
     if (tyH == Ity_I32) {
@@ -2914,13 +3000,17 @@ IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
  
  /* Generate a shadow store.  addr is always the original address atom.
     You can pass in either originals or V-bits for the data atom, but
-   obviously not both.  */
+   obviously not both.  guard :: Ity_I1 controls whether the store
+   really happens; NULL means it unconditionally does.  Note that
+   guard itself is not checked for definedness; the caller of this
+   function must do that if necessary. */
  
  static 
  void do_shadow_Store ( MCEnv* mce, 
                         IREndness end,
                         IRAtom* addr, UInt bias,
-                       IRAtom* data, IRAtom* vdata )
+                       IRAtom* data, IRAtom* vdata,
+                       IRAtom* guard )
  {
     IROp     mkAdd;
     IRType   ty, tyAddr;
@@ -2945,14 +3035,20 @@ void do_shadow_Store ( MCEnv* mce,
     tl_assert(isOriginalAtom(mce,addr));
     tl_assert(isShadowAtom(mce,vdata));
  
-   ty = typeOfIRExpr(mce->bb->tyenv, vdata);
+   if (guard) {
+      tl_assert(isOriginalAtom(mce, guard));
+      tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
+   }
+
+   ty = typeOfIRExpr(mce->sb->tyenv, vdata);
  
     // If we're not doing undefined value checking, pretend that this value
     // is "all valid".  That lets Vex's optimiser remove some of the V bit
     // shadow computation ops that precede it.
     if (MC_(clo_mc_level) == 1) {
        switch (ty) {
-         case Ity_V128: c = IRConst_V128(V_BITS16_DEFINED); break; // V128 weirdness
+         case Ity_V128: // V128 weirdness
+                        c = IRConst_V128(V_BITS16_DEFINED); break;
           case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
           case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
           case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
@@ -3040,6 +3136,8 @@ void do_shadow_Store ( MCEnv* mce,
                       hname, VG_(fnptr_to_fnentry)( helper ), 
                       mkIRExprVec_2( addrHi64, vdataHi64 )
                    );
+      if (guard) diLo64->guard = guard;
+      if (guard) diHi64->guard = guard;
        setHelperAnns( mce, diLo64 );
        setHelperAnns( mce, diHi64 );
        stmt( 'V', mce, IRStmt_Dirty(diLo64) );
@@ -3076,6 +3174,7 @@ void do_shadow_Store ( MCEnv* mce,
                                  zwidenToHostWord( mce, vdata ))
                );
        }
+      if (guard) di->guard = guard;
        setHelperAnns( mce, di );
        stmt( 'V', mce, IRStmt_Dirty(di) );
     }
@@ -3180,7 +3279,7 @@ void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
        tl_assert(d->mAddr);
        complainIfUndefined(mce, d->mAddr);
  
-      tyAddr = typeOfIRExpr(mce->bb->tyenv, d->mAddr);
+      tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
        tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
        tl_assert(tyAddr == mce->hWordTy); /* not really right */
     }
@@ -3221,7 +3320,7 @@ void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
     /* Outputs: the destination temporary, if there is one. */
     if (d->tmp != IRTemp_INVALID) {
        dst   = findShadowTmpV(mce, d->tmp);
-      tyDst = typeOfIRTemp(mce->bb->tyenv, d->tmp);
+      tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
        assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
     }
  
@@ -3261,14 +3360,16 @@ void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
        while (toDo >= 4) {
           do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
                            NULL, /* original data */
-                          mkPCastTo( mce, Ity_I32, curr ) );
+                          mkPCastTo( mce, Ity_I32, curr ),
+                          NULL/*guard*/ );
           toDo -= 4;
        }
        /* chew off 16-bit chunks */
        while (toDo >= 2) {
           do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
                            NULL, /* original data */
-                          mkPCastTo( mce, Ity_I16, curr ) );
+                          mkPCastTo( mce, Ity_I16, curr ),
+                          NULL/*guard*/ );
           toDo -= 2;
        }
        tl_assert(toDo == 0); /* also need to handle 1-byte excess */
@@ -3276,6 +3377,7 @@ void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
  
  }
  
+
  /* We have an ABI hint telling us that [base .. base+len-1] is to
     become undefined ("writable").  Generate code to call a helper to
     notify the A/V bit machinery of this fact.
@@ -3306,6 +3408,457 @@ void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
  }
  
  
+/* ------ Dealing with IRCAS (big and complex) ------ */
+
+/* FWDS */
+static IRAtom* gen_load_b  ( MCEnv* mce, Int szB, 
+                             IRAtom* baseaddr, Int offset );
+static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
+static void    gen_store_b ( MCEnv* mce, Int szB,
+                             IRAtom* baseaddr, Int offset, IRAtom* dataB,
+                             IRAtom* guard );
+
+static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
+static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
+
+
+/* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
+   IRExpr.Consts, else this asserts.  If they are both Consts, it
+   doesn't do anything.  So that just leaves the RdTmp case.
+
+   In which case: this assigns the shadow value SHADOW to the IR
+   shadow temporary associated with ORIG.  That is, ORIG, being an
+   original temporary, will have a shadow temporary associated with
+   it.  However, in the case envisaged here, there will so far have
+   been no IR emitted to actually write a shadow value into that
+   temporary.  What this routine does is to (emit IR to) copy the
+   value in SHADOW into said temporary, so that after this call,
+   IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
+   value in SHADOW.
+
+   Point is to allow callers to compute "by hand" a shadow value for
+   ORIG, and force it to be associated with ORIG.
+
+   How do we know that that shadow associated with ORIG has not so far
+   been assigned to?  Well, we don't per se know that, but supposing
+   it had.  Then this routine would create a second assignment to it,
+   and later the IR sanity checker would barf.  But that never
+   happens.  QED.
+*/
+static void bind_shadow_tmp_to_orig ( UChar how,
+                                      MCEnv* mce,
+                                      IRAtom* orig, IRAtom* shadow )
+{
+   tl_assert(isOriginalAtom(mce, orig));
+   tl_assert(isShadowAtom(mce, shadow));
+   switch (orig->tag) {
+      case Iex_Const:
+         tl_assert(shadow->tag == Iex_Const);
+         break;
+      case Iex_RdTmp:
+         tl_assert(shadow->tag == Iex_RdTmp);
+         if (how == 'V') {
+            assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
+                   shadow);
+         } else {
+            tl_assert(how == 'B');
+            assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
+                   shadow);
+         }
+         break;
+      default:
+         tl_assert(0);
+   }
+}
+
+
+static
+void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
+{
+   /* Scheme is (both single- and double- cases):
+
+      1. fetch data#,dataB (the proposed new value)
+
+      2. fetch expd#,expdB (what we expect to see at the address)
+
+      3. check definedness of address
+
+      4. load old#,oldB from shadow memory; this also checks
+         addressability of the address
+
+      5. the CAS itself
+
+      6. complain if "expected == old" is undefined
+
+      7. if "expected == old"
+            store data#,dataB to shadow memory
+
+      Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
+      'data' but 7 stores 'data#'.  Hence it is possible for the
+      shadow data to be incorrectly checked and/or updated:
+
+      * 6 could falsely complain if 4 read old# as undefined, but some
+        other thread wrote a defined value to the location after 4 but
+        before 5.
+
+      * 6 could falsely not-complain if 4 read old# as defined, but
+        some other thread wrote an undefined value to the location
+        after 4 but before 5.
+
+      * 7 is at least gated correctly, since the 'expected == old'
+        condition is derived from outputs of 5.  However, the shadow
+        write could happen too late: imagine after 5 we are
+        descheduled, a different thread runs, writes a different
+        (shadow) value at the address, and then we resume, hence
+        overwriting the shadow value written by the other thread.
+
+      Because the original memory access is atomic, there's no way to
+      make both the original and shadow accesses into a single atomic
+      thing, hence this is unavoidable.
+
+      At least as Valgrind stands, I don't think it's a problem, since
+      we're single threaded *and* we guarantee that there are no
+      context switches during the execution of any specific superblock
+      -- context switches can only happen at superblock boundaries.
+
+      If Valgrind ever becomes MT in the future, then it might be more
+      of a problem.  A possible kludge would be to artificially
+      associate with the location, a lock, which we must acquire and
+      release around the transaction as a whole.  Hmm, that probably
+      wouldn't work properly since it only guards us against other
+      threads doing CASs on the same location, not against other
+      threads doing normal reads and writes.
+   */
+   if (cas->oldHi == IRTemp_INVALID) {
+      do_shadow_CAS_single( mce, cas );
+   } else {
+      do_shadow_CAS_double( mce, cas );
+   }
+}
+
+
+static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
+{
+   IRAtom *vdataLo = NULL, *bdataLo = NULL;
+   IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
+   IRAtom *voldLo  = NULL, *boldLo  = NULL;
+   IRAtom *expd_eq_old_V = NULL, *expd_eq_old_B = NULL;
+   IRAtom *expd_eq_old   = NULL;
+   IROp   opCmpEQ;
+   Int    elemSzB;
+   IRType elemTy;
+   Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
+
+   /* single CAS */
+   tl_assert(cas->oldHi == IRTemp_INVALID);
+   tl_assert(cas->expdHi == NULL);
+   tl_assert(cas->dataHi == NULL);
+
+   elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
+   switch (elemTy) {
+      case Ity_I8:  elemSzB = 1; opCmpEQ = Iop_CmpEQ8;  break;
+      case Ity_I16: elemSzB = 2; opCmpEQ = Iop_CmpEQ16; break;
+      case Ity_I32: elemSzB = 4; opCmpEQ = Iop_CmpEQ32; break;
+      case Ity_I64: elemSzB = 8; opCmpEQ = Iop_CmpEQ64; break;
+      default: tl_assert(0); /* IR defn disallows any other types */
+   }
+
+   /* 1. fetch data# (the proposed new value) */
+   tl_assert(isOriginalAtom(mce, cas->dataLo));
+   vdataLo
+      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
+   tl_assert(isShadowAtom(mce, vdataLo));
+   if (otrak) {
+      bdataLo
+         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
+      tl_assert(isShadowAtom(mce, bdataLo));
+   }
+
+   /* 2. fetch expected# (what we expect to see at the address) */
+   tl_assert(isOriginalAtom(mce, cas->expdLo));
+   vexpdLo
+      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
+   tl_assert(isShadowAtom(mce, vexpdLo));
+   if (otrak) {
+      bexpdLo
+         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
+      tl_assert(isShadowAtom(mce, bexpdLo));
+   }
+
+   /* 3. check definedness of address */
+   /* 4. fetch old# from shadow memory; this also checks
+         addressability of the address */
+   voldLo
+      = assignNew(
+           'V', mce, elemTy,
+           expr2vbits_Load( 
+              mce,
+              cas->end, elemTy, cas->addr, 0/*Addr bias*/
+        ));
+   if (otrak) {
+      boldLo
+         = assignNew('B', mce, Ity_I32,
+                     gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
+   }
+
+   /* 5. the CAS itself */
+   stmt( 'C', mce, IRStmt_CAS(cas) );
+
+   /* 6. complain if "expected == old" is undefined */
+   /* Doing this directly interacts in a complex way with origin
+      tracking.  Much easier to make up an expression tree and hand
+      that off to expr2vbits_Binop.  We will need the expression
+      tree in any case in order to decide whether or not to do a
+      shadow store. */
+   /* Note that 'C' is kinda faking it; it is indeed a non-shadow
+      tree, but it's not copied from the input block. */
+   expd_eq_old
+      = assignNew('C', mce, Ity_I1,
+                  binop(opCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
+
+   /* Compute into expd_eq_old_V the definedness for expd_eq_old.
+      First we need to ensure that cas->oldLo's V-shadow is bound
+      to voldLo, since expr2vbits_Binop will generate a use of it. */
+   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
+   expd_eq_old_V
+     = expr2vbits_Binop( mce, opCmpEQ, cas->expdLo, mkexpr(cas->oldLo) );
+   if (otrak) {
+      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
+      expd_eq_old_B
+         = gen_maxU32( mce, bexpdLo, boldLo );
+   }
+
+   /* Generate a complaint if expd_eq_old is undefined.  As above,
+      first force expd_eq_old's definedness to be bound to its
+      V-shadow tmp. */
+   bind_shadow_tmp_to_orig('V', mce, expd_eq_old, expd_eq_old_V);
+   if (otrak)
+      bind_shadow_tmp_to_orig('B', mce, expd_eq_old, expd_eq_old_B);
+   complainIfUndefined(mce, expd_eq_old);
+
+   /* 7. if "expected == old"
+            store data# to shadow memory */
+   do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
+                    NULL/*data*/, vdataLo/*vdata*/,
+                    expd_eq_old/*guard for store*/ );
+   if (otrak) {
+      gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
+                   bdataLo/*bdata*/,
+                   expd_eq_old/*guard for store*/ );
+   }
+}
+
+
+static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
+{
+   IRAtom *vdataHi = NULL, *bdataHi = NULL;
+   IRAtom *vdataLo = NULL, *bdataLo = NULL;
+   IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
+   IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
+   IRAtom *voldHi  = NULL, *boldHi  = NULL;
+   IRAtom *voldLo  = NULL, *boldLo  = NULL;
+   IRAtom *xHi   = NULL, *xLo   = NULL, *xHL   = NULL;
+   IRAtom *xHi_V = NULL, *xLo_V = NULL, *xHL_V = NULL;
+   IRAtom *xHi_B = NULL, *xLo_B = NULL, *xHL_B = NULL;
+   IRAtom *expd_eq_old_V = NULL, *expd_eq_old_B = NULL;
+   IRAtom *expd_eq_old   = NULL, *zero = NULL;
+   IROp   opCmpEQ, opOr, opXor;
+   Int    elemSzB, memOffsLo, memOffsHi;
+   IRType elemTy;
+   Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
+
+   /* double CAS */
+   tl_assert(cas->oldHi != IRTemp_INVALID);
+   tl_assert(cas->expdHi != NULL);
+   tl_assert(cas->dataHi != NULL);
+
+   elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
+   switch (elemTy) {
+      case Ity_I8:
+         opCmpEQ = Iop_CmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8; 
+         elemSzB = 1; zero = mkU8(0);
+         break;
+      case Ity_I16:
+         opCmpEQ = Iop_CmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
+         elemSzB = 2; zero = mkU16(0);
+         break;
+      case Ity_I32:
+         opCmpEQ = Iop_CmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
+         elemSzB = 4; zero = mkU32(0);
+         break;
+      case Ity_I64:
+         opCmpEQ = Iop_CmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
+         elemSzB = 8; zero = mkU64(0);
+         break;
+      default:
+         tl_assert(0); /* IR defn disallows any other types */
+   }
+
+   /* 1. fetch data# (the proposed new value) */
+   tl_assert(isOriginalAtom(mce, cas->dataHi));
+   tl_assert(isOriginalAtom(mce, cas->dataLo));
+   vdataHi
+      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
+   vdataLo
+      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
+   tl_assert(isShadowAtom(mce, vdataHi));
+   tl_assert(isShadowAtom(mce, vdataLo));
+   if (otrak) {
+      bdataHi
+         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
+      bdataLo
+         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
+      tl_assert(isShadowAtom(mce, bdataHi));
+      tl_assert(isShadowAtom(mce, bdataLo));
+   }
+
+   /* 2. fetch expected# (what we expect to see at the address) */
+   tl_assert(isOriginalAtom(mce, cas->expdHi));
+   tl_assert(isOriginalAtom(mce, cas->expdLo));
+   vexpdHi
+      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
+   vexpdLo
+      = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
+   tl_assert(isShadowAtom(mce, vexpdHi));
+   tl_assert(isShadowAtom(mce, vexpdLo));
+   if (otrak) {
+      bexpdHi
+         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
+      bexpdLo
+         = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
+      tl_assert(isShadowAtom(mce, bexpdHi));
+      tl_assert(isShadowAtom(mce, bexpdLo));
+   }
+
+   /* 3. check definedness of address */
+   /* 4. fetch old# from shadow memory; this also checks
+         addressability of the address */
+   if (cas->end == Iend_LE) {
+      memOffsLo = 0;
+      memOffsHi = elemSzB;
+   } else {
+      tl_assert(cas->end == Iend_BE);
+      memOffsLo = elemSzB;
+      memOffsHi = 0;
+   }
+   voldHi
+      = assignNew(
+           'V', mce, elemTy,
+           expr2vbits_Load( 
+              mce,
+              cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/
+        ));
+   voldLo
+      = assignNew(
+           'V', mce, elemTy,
+           expr2vbits_Load( 
+              mce,
+              cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/
+        ));
+   if (otrak) {
+      boldHi
+         = assignNew('B', mce, Ity_I32,
+                     gen_load_b(mce, elemSzB, cas->addr,
+                                memOffsHi/*addr bias*/));
+      boldLo
+         = assignNew('B', mce, Ity_I32,
+                     gen_load_b(mce, elemSzB, cas->addr,
+                                memOffsLo/*addr bias*/));
+   }
+
+   /* 5. the CAS itself */
+   stmt( 'C', mce, IRStmt_CAS(cas) );
+
+   /* 6. complain if "expected == old" is undefined */
+   /* Doing this directly interacts in a complex way with origin
+      tracking.  Much easier to make up an expression tree and hand
+      that off to expr2vbits_Binop.  We will need the expression
+      tree in any case in order to decide whether or not to do a
+      shadow store. */
+   /* Note that 'C' is kinda faking it; it is indeed a non-shadow
+      tree, but it's not copied from the input block. */
+   /*
+      xHi = oldHi ^ expdHi;
+      xLo = oldLo ^ expdLo;
+      xHL = xHi | xLo;
+      expd_eq_old = xHL == 0;
+   */
+
+   /* --- xHi = oldHi ^ expdHi --- */
+   xHi = assignNew('C', mce, elemTy,
+                   binop(opXor, cas->expdHi, mkexpr(cas->oldHi))); 
+   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
+   xHi_V
+      = expr2vbits_Binop( mce, opXor, cas->expdHi, mkexpr(cas->oldHi));
+   if (otrak) {
+      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
+      xHi_B = gen_maxU32( mce, bexpdHi, boldHi );
+   }
+
+   /* --- xLo = oldLo ^ expdLo --- */
+   xLo = assignNew('C', mce, elemTy,
+                   binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
+   bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
+   xLo_V
+      = expr2vbits_Binop( mce, opXor, cas->expdLo, mkexpr(cas->oldLo));
+   if (otrak) {
+      bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
+      xLo_B = gen_maxU32( mce, bexpdLo, boldLo );
+   }
+
+   /* --- xHL = xHi | xLo --- */
+   xHL = assignNew('C', mce, elemTy,
+                   binop(opOr, xHi, xLo));
+   bind_shadow_tmp_to_orig('V', mce, xHi, xHi_V);
+   bind_shadow_tmp_to_orig('V', mce, xLo, xLo_V);
+   xHL_V
+      = expr2vbits_Binop( mce, opOr, xHi, xLo );
+   if (otrak) {
+      bind_shadow_tmp_to_orig('B', mce, xHi, xHi_B);
+      bind_shadow_tmp_to_orig('B', mce, xLo, xLo_B);
+      xHL_B = gen_maxU32( mce, xHi_B, xLo_B );
+   }
+
+   /* --- expd_eq_old = xHL == 0 --- */
+   expd_eq_old
+      = assignNew('C', mce, Ity_I1,
+                  binop(opCmpEQ, xHL, zero));
+   bind_shadow_tmp_to_orig('V', mce, xHL, xHL_V);
+   expd_eq_old_V
+      = expr2vbits_Binop( mce, opCmpEQ, xHL, zero);
+   if (otrak) {
+      expd_eq_old_B = xHL_B; /* since the zero literal isn't going to
+                                contribute any interesting origin */
+   }
+
+   /* The backend's register allocator is probably on fire by now :-) */
+   /* Generate a complaint if expd_eq_old is undefined.  As above,
+      first force expd_eq_old's definedness to be bound to its
+      V-shadow tmp. */
+   bind_shadow_tmp_to_orig('V', mce, expd_eq_old, expd_eq_old_V);
+   if (otrak)
+      bind_shadow_tmp_to_orig('B', mce, expd_eq_old, expd_eq_old_B);
+   complainIfUndefined(mce, expd_eq_old);
+
+   /* 7. if "expected == old"
+            store data# to shadow memory */
+   do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
+                    NULL/*data*/, vdataHi/*vdata*/,
+                    expd_eq_old/*guard for store*/ );
+   do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
+                    NULL/*data*/, vdataLo/*vdata*/,
+                    expd_eq_old/*guard for store*/ );
+   if (otrak) {
+      gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
+                   bdataHi/*bdata*/,
+                   expd_eq_old/*guard for store*/ );
+      gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
+                   bdataLo/*bdata*/,
+                   expd_eq_old/*guard for store*/ );
+   }
+}
+
+
  /*------------------------------------------------------------*/
  /*--- Memcheck main                                        ---*/
  /*------------------------------------------------------------*/
@@ -3349,6 +3902,7 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
     Int      i;
     IRExpr*  e;
     IRDirty* d;
+   IRCAS*   cas;
     switch (st->tag) {
        case Ist_WrTmp:
           e = st->Ist.WrTmp.data;
@@ -3415,6 +3969,13 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
        case Ist_IMark:
        case Ist_MBE:
           return False;
+      case Ist_CAS:
+         cas = st->Ist.CAS.details;
+         return isBogusAtom(cas->addr)
+                || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
+                || isBogusAtom(cas->expdLo)
+                || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
+                || isBogusAtom(cas->dataLo);
        default: 
        unhandled:
           ppIRStmt(st);
@@ -3424,7 +3985,7 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
  
  
  IRSB* MC_(instrument) ( VgCallbackClosure* closure,
-                        IRSB* bb_in, 
+                        IRSB* sb_in, 
                          VexGuestLayout* layout, 
                          VexGuestExtents* vge,
                          IRType gWordTy, IRType hWordTy )
@@ -3434,7 +3995,7 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
     Int     i, j, first_stmt;
     IRStmt* st;
     MCEnv   mce;
-   IRSB*   bb;
+   IRSB*   sb_out;
  
     if (gWordTy != hWordTy) {
        /* We don't currently support this case. */
@@ -3454,22 +4015,29 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
     tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
  
     /* Set up SB */
-   bb = deepCopyIRSBExceptStmts(bb_in);
-
-   /* Set up the running environment.  Only .bb is modified as we go
-      along. */
-   mce.bb             = bb;
+   sb_out = deepCopyIRSBExceptStmts(sb_in);
+
+   /* Set up the running environment.  Both .sb and .tmpMap are
+      modified as we go along.  Note that tmps are added to both
+      .sb->tyenv and .tmpMap together, so the valid index-set for
+      those two arrays should always be identical. */
+   VG_(memset)(&mce, 0, sizeof(mce));
+   mce.sb             = sb_out;
     mce.trace          = verboze;
     mce.layout         = layout;
-   mce.n_originalTmps = bb->tyenv->types_used;
     mce.hWordTy        = hWordTy;
     mce.bogusLiterals  = False;
-   mce.tmpMapV        = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
-   mce.tmpMapB        = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
-   for (i = 0; i < mce.n_originalTmps; i++) {
-      mce.tmpMapV[i] = IRTemp_INVALID;
-      mce.tmpMapB[i] = IRTemp_INVALID;
+
+   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
+                            sizeof(TempMapEnt));
+   for (i = 0; i < sb_in->tyenv->types_used; i++) {
+      TempMapEnt ent;
+      ent.kind    = Orig;
+      ent.shadowV = IRTemp_INVALID;
+      ent.shadowB = IRTemp_INVALID;
+      VG_(addToXA)( mce.tmpMap, &ent );
     }
+   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
  
     /* Make a preliminary inspection of the statements, to see if there
        are any dodgy-looking literals.  If there are, we generate
@@ -3479,9 +4047,9 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
  
     bogus = False;
  
-   for (i = 0; i < bb_in->stmts_used; i++) {
+   for (i = 0; i < sb_in->stmts_used; i++) {
  
-      st = bb_in->stmts[i];
+      st = sb_in->stmts[i];
        tl_assert(st);
        tl_assert(isFlatIRStmt(st));
  
@@ -3500,16 +4068,17 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
  
     /* Copy verbatim any IR preamble preceding the first IMark */
  
-   tl_assert(mce.bb == bb);
+   tl_assert(mce.sb == sb_out);
+   tl_assert(mce.sb != sb_in);
  
     i = 0;
-   while (i < bb_in->stmts_used && bb_in->stmts[i]->tag != Ist_IMark) {
+   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
  
-      st = bb_in->stmts[i];
+      st = sb_in->stmts[i];
        tl_assert(st);
        tl_assert(isFlatIRStmt(st));
  
-      stmt( 'C', &mce, bb_in->stmts[i] );
+      stmt( 'C', &mce, sb_in->stmts[i] );
        i++;
     }
  
@@ -3536,16 +4105,16 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
        no-origin, as appropriate for a defined value.
     */
     for (j = 0; j < i; j++) {
-      if (bb_in->stmts[j]->tag == Ist_WrTmp) {
+      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
           /* findShadowTmpV checks its arg is an original tmp;
              no need to assert that here. */
-         IRTemp tmp_o = bb_in->stmts[j]->Ist.WrTmp.tmp;
+         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
           IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
-         IRType ty_v  = typeOfIRTemp(bb->tyenv, tmp_v);
+         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
           assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
           if (MC_(clo_mc_level) == 3) {
              IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
-            tl_assert(typeOfIRTemp(bb->tyenv, tmp_b) == Ity_I32);
+            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
              assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
           }
           if (0) {
@@ -3558,15 +4127,15 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
  
     /* Iterate over the remaining stmts to generate instrumentation. */
  
-   tl_assert(bb_in->stmts_used > 0);
+   tl_assert(sb_in->stmts_used > 0);
     tl_assert(i >= 0);
-   tl_assert(i < bb_in->stmts_used);
-   tl_assert(bb_in->stmts[i]->tag == Ist_IMark);
+   tl_assert(i < sb_in->stmts_used);
+   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
  
-   for (/* use current i*/; i <  bb_in->stmts_used; i++) {
+   for (/* use current i*/; i < sb_in->stmts_used; i++) {
  
-      st = bb_in->stmts[i];
-      first_stmt = bb->stmts_used;
+      st = sb_in->stmts[i];
+      first_stmt = sb_out->stmts_used;
  
        if (verboze) {
           VG_(printf)("\n");
@@ -3574,8 +4143,11 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
           VG_(printf)("\n");
        }
  
-      if (MC_(clo_mc_level) == 3)
-         schemeS( &mce, st );
+      if (MC_(clo_mc_level) == 3) {
+         /* See comments on case Ist_CAS below. */
+         if (st->tag != Ist_CAS) 
+            schemeS( &mce, st );
+      }
  
        /* Generate instrumentation code for each stmt ... */
  
@@ -3605,7 +4177,34 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
              do_shadow_Store( &mce, st->Ist.Store.end,
                                     st->Ist.Store.addr, 0/* addr bias */,
                                     st->Ist.Store.data,
-                                   NULL /* shadow data */ );
+                                   NULL /* shadow data */,
+                                   NULL/*guard*/ );
+            /* If this is a store conditional, it writes to .resSC a
+               value indicating whether or not the store succeeded.
+               Just claim this value is always defined.  In the
+               PowerPC interpretation of store-conditional,
+               definedness of the success indication depends on
+               whether the address of the store matches the
+               reservation address.  But we can't tell that here (and
+               anyway, we're not being PowerPC-specific).  At least we
+               are guaranteed that the definedness of the store
+               address, and its addressability, will be checked as per
+               normal.  So it seems pretty safe to just say that the
+               success indication is always defined.
+
+               In schemeS, for origin tracking, we must
+               correspondingly set a no-origin value for the origin
+               shadow of resSC.
+            */
+            if (st->Ist.Store.resSC != IRTemp_INVALID) {
+               assign( 'V', &mce,
+                       findShadowTmpV(&mce, st->Ist.Store.resSC),
+                       definedOfType(
+                          shadowTypeV(
+                             typeOfIRTemp(mce.sb->tyenv,
+                                          st->Ist.Store.resSC)
+                     )));
+            }
              break;
  
           case Ist_Exit:
@@ -3629,6 +4228,16 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
                                st->Ist.AbiHint.nia );
              break;
  
+         case Ist_CAS:
+            do_shadow_CAS( &mce, st->Ist.CAS.details );
+            /* Note, do_shadow_CAS copies the CAS itself to the output
+               block, because it needs to add instrumentation both
+               before and after it.  Hence skip the copy below.  Also
+               skip the origin-tracking stuff (call to schemeS) above,
+               since that's all tangled up with it too; do_shadow_CAS
+               does it all. */
+            break;
+
           default:
              VG_(printf)("\n");
              ppIRStmt(st);
@@ -3638,40 +4247,48 @@ IRSB* MC_(instrument) ( VgCallbackClosure* closure,
        } /* switch (st->tag) */
  
        if (0 && verboze) {
-         for (j = first_stmt; j < bb->stmts_used; j++) {
+         for (j = first_stmt; j < sb_out->stmts_used; j++) {
              VG_(printf)("   ");
-            ppIRStmt(bb->stmts[j]);
+            ppIRStmt(sb_out->stmts[j]);
              VG_(printf)("\n");
           }
           VG_(printf)("\n");
        }
  
-      /* ... and finally copy the stmt itself to the output. */
-      stmt('C', &mce, st);
-
+      /* ... and finally copy the stmt itself to the output.  Except,
+         skip the copy of IRCASs; see comments on case Ist_CAS
+         above. */
+      if (st->tag != Ist_CAS)
+         stmt('C', &mce, st);
     }
  
     /* Now we need to complain if the jump target is undefined. */
-   first_stmt = bb->stmts_used;
+   first_stmt = sb_out->stmts_used;
  
     if (verboze) {
-      VG_(printf)("bb->next = ");
-      ppIRExpr(bb->next);
+      VG_(printf)("sb_in->next = ");
+      ppIRExpr(sb_in->next);
        VG_(printf)("\n\n");
     }
  
-   complainIfUndefined( &mce, bb->next );
+   complainIfUndefined( &mce, sb_in->next );
  
     if (0 && verboze) {
-      for (j = first_stmt; j < bb->stmts_used; j++) {
+      for (j = first_stmt; j < sb_out->stmts_used; j++) {
           VG_(printf)("   ");
-         ppIRStmt(bb->stmts[j]);
+         ppIRStmt(sb_out->stmts[j]);
           VG_(printf)("\n");
        }
        VG_(printf)("\n");
     }
  
-   return bb;
+   /* If this fails, there's been some serious snafu with tmp management,
+      that should be investigated. */
+   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
+   VG_(deleteXA)( mce.tmpMap );
+
+   tl_assert(mce.sb == sb_out);
+   return sb_out;
  }
  
  /*------------------------------------------------------------*/
@@ -3826,14 +4443,25 @@ IRSB* MC_(final_tidy) ( IRSB* sb_in )
  /*--- Origin tracking stuff                                ---*/
  /*------------------------------------------------------------*/
  
+/* Almost identical to findShadowTmpV. */
  static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
  {
-   tl_assert(orig < mce->n_originalTmps);
-   if (mce->tmpMapB[orig] == IRTemp_INVALID) {
-      mce->tmpMapB[orig] 
-         = newIRTemp(mce->bb->tyenv, Ity_I32);
+   TempMapEnt* ent;
+   /* VG_(indexXA) range-checks 'orig', hence no need to check
+      here. */
+   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
+   tl_assert(ent->kind == Orig);
+   if (ent->shadowB == IRTemp_INVALID) {
+      IRTemp tmpB
+        = newTemp( mce, Ity_I32, BSh );
+      /* newTemp may cause mce->tmpMap to resize, hence previous results
+         from VG_(indexXA) are invalid. */
+      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
+      tl_assert(ent->kind == Orig);
+      tl_assert(ent->shadowB == IRTemp_INVALID);
+      ent->shadowB = tmpB;
     }
-   return mce->tmpMapB[orig];
+   return ent->shadowB;
  }
  
  static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
@@ -3848,7 +4476,7 @@ static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
     HChar*   hName;
     IRTemp   bTmp;
     IRDirty* di;
-   IRType   aTy   = typeOfIRExpr( mce->bb->tyenv, baseaddr );
+   IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
     IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
     IRAtom*  ea    = baseaddr;
     if (offset != 0) {
@@ -3856,7 +4484,7 @@ static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
                                     : mkU64( (Long)(Int)offset );
        ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
     }
-   bTmp = newIRTemp(mce->bb->tyenv, mce->hWordTy);
+   bTmp = newTemp(mce, mce->hWordTy, BSh);
  
     switch (szB) {
        case 1: hFun  = (void*)&MC_(helperc_b_load1);
@@ -3887,7 +4515,7 @@ static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
     stmt( 'B', mce, IRStmt_Dirty(di) );
     if (mce->hWordTy == Ity_I64) {
        /* 64-bit host */
-      IRTemp bTmp32 = newIRTemp(mce->bb->tyenv, Ity_I32);
+      IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
        assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
        return mkexpr(bTmp32);
     } else {
@@ -3895,15 +4523,23 @@ static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
        return mkexpr(bTmp);
     }
  }
+
+/* Generate a shadow store.  guard :: Ity_I1 controls whether the
+   store really happens; NULL means it unconditionally does. */
  static void gen_store_b ( MCEnv* mce, Int szB,
-                          IRAtom* baseaddr, Int offset, IRAtom* dataB )
+                          IRAtom* baseaddr, Int offset, IRAtom* dataB,
+                          IRAtom* guard )
  {
     void*    hFun;
     HChar*   hName;
     IRDirty* di;
-   IRType   aTy   = typeOfIRExpr( mce->bb->tyenv, baseaddr );
+   IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
     IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
     IRAtom*  ea    = baseaddr;
+   if (guard) {
+      tl_assert(isOriginalAtom(mce, guard));
+      tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
+   }
     if (offset != 0) {
        IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
                                     : mkU64( (Long)(Int)offset );
@@ -3937,11 +4573,12 @@ static void gen_store_b ( MCEnv* mce, Int szB,
          );
     /* no need to mess with any annotations.  This call accesses
        neither guest state nor guest memory. */
+   if (guard) di->guard = guard;
     stmt( 'B', mce, IRStmt_Dirty(di) );
  }
  
  static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
-   IRType eTy = typeOfIRExpr(mce->bb->tyenv, e);
+   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
     if (eTy == Ity_I64)
        return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
     if (eTy == Ity_I32)
@@ -3950,7 +4587,7 @@ static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
  }
  
  static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
-   IRType eTy = typeOfIRExpr(mce->bb->tyenv, e);
+   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
     tl_assert(eTy == Ity_I32);
     if (dstTy == Ity_I64)
        return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
@@ -4220,12 +4857,14 @@ static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
        toDo   = d->mSize;
        /* chew off 32-bit chunks */
        while (toDo >= 4) {
-         gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr );
+         gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
+                      NULL/*guard*/ );
           toDo -= 4;
        }
        /* handle possible 16-bit excess */
        while (toDo >= 2) {
-         gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr );
+        gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
+                     NULL/*guard*/ );
           toDo -= 2;
        }
        tl_assert(toDo == 0); /* also need to handle 1-byte excess */
@@ -4282,16 +4921,25 @@ static void schemeS ( MCEnv* mce, IRStmt* st )
              available (somewhere) */
           tl_assert(isIRAtom(st->Ist.Store.addr));
           dszB = sizeofIRType(
-                   typeOfIRExpr(mce->bb->tyenv, st->Ist.Store.data ));
+                   typeOfIRExpr(mce->sb->tyenv, st->Ist.Store.data ));
           dataB = schemeE( mce, st->Ist.Store.data );
-         gen_store_b( mce, dszB, st->Ist.Store.addr, 0/*offset*/, dataB );
+         gen_store_b( mce, dszB, st->Ist.Store.addr, 0/*offset*/, dataB,
+                      NULL/*guard*/ );
+         /* For the rationale behind this, see comments at the place
+            where the V-shadow for .resSC is constructed, in the main
+            loop in MC_(instrument).  In short, we regard .resSC as
+            always-defined. */
+         if (st->Ist.Store.resSC != IRTemp_INVALID) {
+            assign( 'B', mce, findShadowTmpB(mce, st->Ist.Store.resSC),
+                    mkU32(0) );
+         }
           break;
        }
        case Ist_Put: {
           Int b_offset
              = MC_(get_otrack_shadow_offset)(
                   st->Ist.Put.offset,
-                 sizeofIRType(typeOfIRExpr(mce->bb->tyenv, st->Ist.Put.data))
+                 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
                );
           if (b_offset >= 0) {
              /* FIXME: this isn't an atom! */
author	Julian Seward <jseward@acm.org>
	Wed, 1 Jul 2009 08:10:49 +0000 (08:10 +0000)
committer	Julian Seward <jseward@acm.org>
	Wed, 1 Jul 2009 08:10:49 +0000 (08:10 +0000)
cachegrind/cg_main.c		patch \| blob \| blame \| history
callgrind/main.c		patch \| blob \| blame \| history
coregrind/m_machine.c		patch \| blob \| blame \| history
coregrind/m_scheduler/scheduler.c		patch \| blob \| blame \| history
coregrind/m_signals.c		patch \| blob \| blame \| history
coregrind/m_syswrap/syswrap-main.c		patch \| blob \| blame \| history
coregrind/m_tooliface.c		patch \| blob \| blame \| history
coregrind/pub_core_signals.h		patch \| blob \| blame \| history
coregrind/pub_core_tooliface.h		patch \| blob \| blame \| history
drd/drd_load_store.c		patch \| blob \| blame \| history
exp-ptrcheck/h_main.c		patch \| blob \| blame \| history
exp-ptrcheck/h_main.h		patch \| blob \| blame \| history
exp-ptrcheck/sg_main.c		patch \| blob \| blame \| history
helgrind/hg_main.c		patch \| blob \| blame \| history
helgrind/libhb_core.c		patch \| blob \| blame \| history
include/pub_tool_tooliface.h		patch \| blob \| blame \| history
lackey/lk_main.c		patch \| blob \| blame \| history
massif/ms_main.c		patch \| blob \| blame \| history
memcheck/mc_machine.c		patch \| blob \| blame \| history
memcheck/mc_translate.c		patch \| blob \| blame \| history