git.ipfire.org Git - thirdparty/valgrind.git/commitdiff
Add tests to verify behaviour of atomic instruction handling.
author    Julian Seward <jseward@acm.org>
Sat, 4 Jul 2009 12:44:08 +0000 (12:44 +0000)
committer Julian Seward <jseward@acm.org>
Sat, 4 Jul 2009 12:44:08 +0000 (12:44 +0000)
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10410
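
In outline, the new memcheck test (atomic_incs.c, below) forks two processes that share an anonymous MAP_SHARED page; both then bump counters in that page with lock-prefixed add instructions, so that if the translated code drops the atomicity, increments are lost and the final totals come out short.  The amd64/x86 "locked" tests instead sweep many operand and flag combinations through each lock-prefixed instruction form and compare a CRC of the results.  A minimal sketch of the idiom being exercised (hypothetical helper, not code from this commit):

/* Sketch only: a lock-prefixed, in-place increment of a shared counter,
   assuming GCC-style inline asm on x86 or amd64.  Without the "lock"
   prefix, two processes incrementing the same location through a
   MAP_SHARED page can lose updates. */
static void atomic_inc32 ( int* p )
{
#if defined(__x86_64__) || defined(__i386__)
   __asm__ __volatile__( "lock; incl %0" : "+m"(*p) : : "cc", "memory" );
#else
#  error "sketch covers x86/amd64 only"
#endif
}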

16 files changed:
memcheck/tests/Makefile.am
memcheck/tests/atomic_incs.c [new file with mode: 0644]
memcheck/tests/atomic_incs.stderr.exp [new file with mode: 0644]
memcheck/tests/atomic_incs.stdout.exp-32bit [new file with mode: 0644]
memcheck/tests/atomic_incs.stdout.exp-64bit [new file with mode: 0644]
memcheck/tests/atomic_incs.vgtest [new file with mode: 0644]
none/tests/amd64/Makefile.am
none/tests/amd64/amd64locked.c [new file with mode: 0644]
none/tests/amd64/amd64locked.stderr.exp [new file with mode: 0644]
none/tests/amd64/amd64locked.stdout.exp [new file with mode: 0644]
none/tests/amd64/amd64locked.vgtest [new file with mode: 0644]
none/tests/x86/Makefile.am
none/tests/x86/x86locked.c [new file with mode: 0644]
none/tests/x86/x86locked.stderr.exp [new file with mode: 0644]
none/tests/x86/x86locked.stdout.exp [new file with mode: 0644]
none/tests/x86/x86locked.vgtest [new file with mode: 0644]

index ab4ae0b824a963dc5fcee2a59ba5a41c99153404..212ca0f43248f6ad4abe2481edff1d802058e017 100644 (file)
@@ -36,6 +36,8 @@ noinst_HEADERS = leak.h
 
 EXTRA_DIST = \
        addressable.stderr.exp addressable.stdout.exp addressable.vgtest \
+       atomic_incs.stderr.exp atomic_incs.vgtest \
+       atomic_incs.stdout.exp-32bit atomic_incs.stdout.exp-64bit \
        badaddrvalue.stderr.exp \
        badaddrvalue.stdout.exp badaddrvalue.vgtest \
        badfree-2trace.stderr.exp badfree-2trace.vgtest \
@@ -183,6 +185,7 @@ EXTRA_DIST = \
 
 check_PROGRAMS = \
        addressable \
+       atomic_incs \
        badaddrvalue badfree badjump badjump2 \
        badloop badpoll badrw brk2 buflen_check \
        clientperm custom_alloc \
diff --git a/memcheck/tests/atomic_incs.c b/memcheck/tests/atomic_incs.c
new file mode 100644 (file)
index 0000000..07d6a06
--- /dev/null
@@ -0,0 +1,192 @@
+
+/* This is an example of a program which does atomic memory operations
+   between two processes which share a page.  Valgrind 3.4.1 and
+   earlier produce incorrect answers because they do not preserve
+   atomicity of the relevant instructions in the generated code; but
+   the post-DCAS-merge versions of Valgrind do behave correctly. */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+
+#define NNN 3456987
+
+__attribute__((noinline)) void atomic_add_8bit ( char* p, int n ) 
+{
+   unsigned long block[2];
+   block[0] = (unsigned long)p;
+   block[1] = n;
+#if defined(VGA_x86)
+   __asm__ __volatile__(
+      "movl 0(%%esi),%%eax"      "\n\t"
+      "movl 4(%%esi),%%ebx"      "\n\t"
+      "lock; addb %%bl,(%%eax)"  "\n"
+      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
+   );
+#elif defined(VGA_amd64)
+   __asm__ __volatile__(
+      "movq 0(%%rsi),%%rax"      "\n\t"
+      "movq 8(%%rsi),%%rbx"      "\n\t"
+      "lock; addb %%bl,(%%rax)"  "\n"
+      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
+   );
+#else
+# error "Unsupported arch"
+#endif
+}
+
+__attribute__((noinline)) void atomic_add_16bit ( short* p, int n ) 
+{
+   unsigned long block[2];
+   block[0] = (unsigned long)p;
+   block[1] = n;
+#if defined(VGA_x86)
+   __asm__ __volatile__(
+      "movl 0(%%esi),%%eax"      "\n\t"
+      "movl 4(%%esi),%%ebx"      "\n\t"
+      "lock; addw %%bx,(%%eax)"  "\n"
+      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
+   );
+#elif defined(VGA_amd64)
+   __asm__ __volatile__(
+      "movq 0(%%rsi),%%rax"      "\n\t"
+      "movq 8(%%rsi),%%rbx"      "\n\t"
+      "lock; addw %%bx,(%%rax)"  "\n"
+      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
+   );
+#else
+# error "Unsupported arch"
+#endif
+}
+
+__attribute__((noinline)) void atomic_add_32bit ( int* p, int n ) 
+{
+   unsigned long block[2];
+   block[0] = (unsigned long)p;
+   block[1] = n;
+#if defined(VGA_x86)
+   __asm__ __volatile__(
+      "movl 0(%%esi),%%eax"       "\n\t"
+      "movl 4(%%esi),%%ebx"       "\n\t"
+      "lock; addl %%ebx,(%%eax)"  "\n"
+      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
+   );
+#elif defined(VGA_amd64)
+   __asm__ __volatile__(
+      "movq 0(%%rsi),%%rax"       "\n\t"
+      "movq 8(%%rsi),%%rbx"       "\n\t"
+      "lock; addl %%ebx,(%%rax)"  "\n"
+      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
+   );
+#else
+# error "Unsupported arch"
+#endif
+}
+
+__attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n ) 
+{
+  // this is a bit subtle.  It relies on the fact that, on a 64-bit platform,
+  // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*) 
+   unsigned long long int block[2];
+   block[0] = (unsigned long long int)(unsigned long)p;
+   block[1] = n;
+#if defined(VGA_x86)
+   /* do nothing; 64-bit atomic add is not supported on x86 here */
+#elif defined(VGA_amd64)
+   __asm__ __volatile__(
+      "movq 0(%%rsi),%%rax"      "\n\t"
+      "movq 8(%%rsi),%%rbx"      "\n\t"
+      "lock; addq %%rbx,(%%rax)" "\n"
+      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
+   );
+#else
+# error "Unsupported arch"
+#endif
+}
+
+int main ( int argc, char** argv )
+{
+   int    i, status;
+   char*  page;
+   char*  p8;
+   short* p16;
+   int*   p32;
+   long long int* p64;
+   pid_t  child, p2;
+
+   printf("parent, pre-fork\n");
+
+   page = mmap( 0, sysconf(_SC_PAGESIZE),
+                   PROT_READ|PROT_WRITE,
+                   MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
+   if (page == MAP_FAILED) {
+      perror("mmap failed");
+      exit(1);
+   }
+
+   p8  = (char*)(page+0);
+   p16 = (short*)(page+256);
+   p32 = (int*)(page+512);
+   p64 = (long long int*)(page+768);
+
+   *p8  = 0;
+   *p16 = 0;
+   *p32 = 0;
+   *p64 = 0;
+
+   child = fork();
+   if (child == -1) {
+      perror("fork() failed");
+      return 1;
+   }
+
+   if (child == 0) {
+      /* --- CHILD --- */
+      printf("child\n");
+      for (i = 0; i < NNN; i++) {
+         atomic_add_8bit(p8, 1);
+         atomic_add_16bit(p16, 1);
+         atomic_add_32bit(p32, 1);
+         atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
+      }
+      return 1;
+      /* NOTREACHED */
+
+   }
+
+   /* --- PARENT --- */
+
+   printf("parent\n");
+
+   for (i = 0; i < NNN; i++) {
+      atomic_add_8bit(p8, 1);
+      atomic_add_16bit(p16, 1);
+      atomic_add_32bit(p32, 1);
+      atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
+   }
+
+   p2 = waitpid(child, &status, 0);
+   assert(p2 == child);
+
+   /* assert that child finished normally */
+   assert(WIFEXITED(status));
+
+   printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
+          (int)(*p8), (int)(*p16), *p32, *p64 );
+
+   if (-74 == (int)(*p8) 
+       && 32694 == (int)(*p16) 
+       && 6913974 == *p32
+       && (0LL == *p64 || 682858642110 == *p64)) {
+      printf("PASS\n");
+   } else {
+      printf("FAIL -- see source code for expected values\n");
+   }
+
+   printf("parent exits\n");
+
+   return 0;
+}
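
For reference, the expected values checked just above follow directly from NNN: parent and child each do NNN = 3456987 iterations, so the 8-, 16- and 32-bit counters end at 2*3456987 = 6913974 truncated to the operand width, and the 64-bit counter at 6913974 * 98765 = 682858642110 (or stays 0 on x86, where the 64-bit case is a no-op).  A tiny stand-alone recomputation (hypothetical, not part of the commit):

/* Recompute the expected final values of atomic_incs.c; assumes the
   usual two's-complement, signed-char x86 ABI for the 8-bit case. */
#include <stdio.h>
int main ( void )
{
   long long NNN     = 3456987;            /* as in atomic_incs.c */
   long long total   = 2 * NNN;            /* parent + child, +1 per iteration */
   long long total64 = 2 * NNN * 98765LL;  /* parent + child, +98765 per iteration */
   printf(" 8 bit %d\n",   (int)(signed char)(total & 0xFF));   /* -74 */
   printf("16 bit %d\n",   (int)(short)(total & 0xFFFF));       /* 32694 */
   printf("32 bit %lld\n", total);                              /* 6913974 */
   printf("64 bit %lld\n", total64);                            /* 682858642110 */
   return 0;
}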
diff --git a/memcheck/tests/atomic_incs.stderr.exp b/memcheck/tests/atomic_incs.stderr.exp
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/memcheck/tests/atomic_incs.stdout.exp-32bit b/memcheck/tests/atomic_incs.stdout.exp-32bit
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit b/memcheck/tests/atomic_incs.stdout.exp-64bit
new file mode 100644 (file)
index 0000000..82405c5
--- /dev/null
@@ -0,0 +1,7 @@
+parent, pre-fork
+child
+parent, pre-fork
+parent
+FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 682858642110
+PASS
+parent exits
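
(The repeated "parent, pre-fork" line above is expected rather than a defect: with stdout redirected to a file it is fully buffered, so that line is still sitting in the stdio buffer at fork() and is later flushed once by the child and once by the parent.)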
diff --git a/memcheck/tests/atomic_incs.vgtest b/memcheck/tests/atomic_incs.vgtest
new file mode 100644 (file)
index 0000000..f433317
--- /dev/null
@@ -0,0 +1,2 @@
+prog: atomic_incs
+vgopts: -q --track-origins=yes
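
Here prog: names the binary to run and vgopts: the Valgrind options; the regression driver compares the tool's stderr against atomic_incs.stderr.exp (empty, i.e. Memcheck in -q mode should report nothing) and the program's stdout against whichever atomic_incs.stdout.exp-* file matches the word size of the build.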
index 314b627166c962eb37def0183f7e885bd0cbbd4a..51ec345ce2c3202b1d99572cd8bbdbf2879cf356 100644 (file)
@@ -17,6 +17,7 @@ endif
 # to avoid packaging screwups if 'make dist' is run on a machine
 # which failed the BUILD_SSE3_TESTS test in configure.in.
 EXTRA_DIST = \
+       amd64locked.vgtest amd64locked.stdout.exp amd64locked.stderr.exp \
        bug127521-64.vgtest bug127521-64.stdout.exp bug127521-64.stderr.exp \
        bug132813-amd64.vgtest bug132813-amd64.stdout.exp \
        bug132813-amd64.stderr.exp \
@@ -50,6 +51,7 @@ EXTRA_DIST = \
        slahf-amd64.vgtest
 
 check_PROGRAMS = \
+       amd64locked \
        bug127521-64 bug132813-amd64 bug132918 \
        clc \
        $(INSN_TESTS) \
@@ -80,6 +82,7 @@ AM_CXXFLAGS  += @FLAG_M64@
 AM_CCASFLAGS += @FLAG_M64@
 
 # generic C ones
+amd64locked_CFLAGS     = $(AM_CFLAGS) -O
 bug132918_LDADD                = -lm
 insn_basic_SOURCES     = insn_basic.def
 insn_basic_LDADD       = -lm
diff --git a/none/tests/amd64/amd64locked.c b/none/tests/amd64/amd64locked.c
new file mode 100644 (file)
index 0000000..d464857
--- /dev/null
@@ -0,0 +1,1062 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define VERBOSE 0
+
+typedef  unsigned int            UInt;
+typedef  unsigned char           UChar;
+typedef  unsigned long long int  ULong;
+typedef  signed long long int    Long;
+typedef  signed int              Int;
+typedef  unsigned short          UShort;
+typedef  unsigned long           UWord;
+typedef  char                    HChar;
+
+/////////////////////////////////////////////////////////////////
+// BEGIN crc32 stuff                                           //
+/////////////////////////////////////////////////////////////////
+
+static const UInt crc32Table[256] = {
+
+   /*-- Ugly, innit? --*/
+
+   0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
+   0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
+   0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
+   0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
+   0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
+   0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
+   0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
+   0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
+   0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
+   0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
+   0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
+   0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
+   0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
+   0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
+   0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
+   0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
+   0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
+   0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
+   0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
+   0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
+   0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
+   0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
+   0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
+   0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
+   0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
+   0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
+   0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
+   0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
+   0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
+   0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
+   0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
+   0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
+   0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
+   0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
+   0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
+   0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
+   0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
+   0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
+   0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
+   0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
+   0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
+   0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
+   0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
+   0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
+   0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
+   0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
+   0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
+   0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
+   0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
+   0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
+   0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
+   0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
+   0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
+   0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
+   0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
+   0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
+   0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
+   0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
+   0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
+   0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
+   0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
+   0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
+   0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
+   0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
+};
+
+#define UPDATE_CRC(crcVar,cha)                 \
+{                                              \
+   crcVar = (crcVar << 8) ^                    \
+            crc32Table[(crcVar >> 24) ^        \
+                       ((UChar)cha)];          \
+}
+
+static UInt crcBytes ( UChar* bytes, UWord nBytes, UInt crcIn )
+{
+   UInt crc = crcIn;
+   while (nBytes >= 4) {
+      UPDATE_CRC(crc, bytes[0]);
+      UPDATE_CRC(crc, bytes[1]);
+      UPDATE_CRC(crc, bytes[2]);
+      UPDATE_CRC(crc, bytes[3]);
+      bytes += 4;
+      nBytes -= 4;
+   }
+   while (nBytes >= 1) {
+      UPDATE_CRC(crc, bytes[0]);
+      bytes += 1;
+      nBytes -= 1;
+   }
+   return crc;
+}
+
+static UInt crcFinalise ( UInt crc ) {
+   return ~crc;
+}
+
+////////
+
+static UInt theCRC = 0xFFFFFFFF;
+
+static HChar outBuf[1024];
+// take output that's in outBuf, length as specified, and
+// update the running crc.
+static void send ( int nbytes )
+{
+   assert( ((unsigned int)nbytes) < sizeof(outBuf)-1);
+   assert(outBuf[nbytes] == 0);
+   theCRC = crcBytes( (UChar*)&outBuf[0], nbytes, theCRC );
+   if (VERBOSE) printf("SEND %08x %s", theCRC, outBuf);
+}
+
+
+/////////////////////////////////////////////////////////////////
+// END crc32 stuff                                             //
+/////////////////////////////////////////////////////////////////
+
+#if 0
+
+// full version
+#define NVALS 76
+
+static ULong val[NVALS]
+    = { 0x00ULL, 0x01ULL, 0x02ULL, 0x03ULL,
+        0x3FULL, 0x40ULL, 0x41ULL,
+        0x7EULL, 0x7FULL, 0x80ULL, 0x81ULL, 0x82ULL,
+        0xBFULL, 0xC0ULL, 0xC1ULL,
+        0xFCULL, 0xFDULL, 0xFEULL, 0xFFULL,
+
+        0xFF00ULL, 0xFF01ULL, 0xFF02ULL, 0xFF03ULL,
+        0xFF3FULL, 0xFF40ULL, 0xFF41ULL,
+        0xFF7EULL, 0xFF7FULL, 0xFF80ULL, 0xFF81ULL, 0xFF82ULL,
+        0xFFBFULL, 0xFFC0ULL, 0xFFC1ULL,
+        0xFFFCULL, 0xFFFDULL, 0xFFFEULL, 0xFFFFULL,
+
+        0xFFFFFF00ULL, 0xFFFFFF01ULL, 0xFFFFFF02ULL, 0xFFFFFF03ULL,
+        0xFFFFFF3FULL, 0xFFFFFF40ULL, 0xFFFFFF41ULL,
+        0xFFFFFF7EULL, 0xFFFFFF7FULL, 0xFFFFFF80ULL, 0xFFFFFF81ULL, 0xFFFFFF82ULL,
+        0xFFFFFFBFULL, 0xFFFFFFC0ULL, 0xFFFFFFC1ULL,
+        0xFFFFFFFCULL, 0xFFFFFFFDULL, 0xFFFFFFFEULL, 0xFFFFFFFFULL,
+
+        0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL, 0xFFFFFFFFFFFFFF02ULL,
+                               0xFFFFFFFFFFFFFF03ULL,
+        0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL, 0xFFFFFFFFFFFFFF41ULL,
+        0xFFFFFFFFFFFFFF7EULL, 0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
+                               0xFFFFFFFFFFFFFF81ULL, 0xFFFFFFFFFFFFFF82ULL,
+        0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL, 0xFFFFFFFFFFFFFFC1ULL,
+        0xFFFFFFFFFFFFFFFCULL, 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFEULL,
+                               0xFFFFFFFFFFFFFFFFULL
+      };
+
+#else
+
+// shortened version, for use as valgrind regtest
+#define NVALS 36
+
+static ULong val[NVALS]
+    = { 0x00ULL, 0x01ULL,
+        0x3FULL, 0x40ULL,
+        0x7FULL, 0x80ULL,
+        0xBFULL, 0xC0ULL,
+        0xFFULL,
+
+        0xFF00ULL, 0xFF01ULL,
+        0xFF3FULL, 0xFF40ULL,
+        0xFF7FULL, 0xFF80ULL,
+        0xFFBFULL, 0xFFC0ULL,
+        0xFFFFULL,
+
+        0xFFFFFF00ULL, 0xFFFFFF01ULL,
+        0xFFFFFF3FULL, 0xFFFFFF40ULL,
+        0xFFFFFF7EULL, 0xFFFFFF7FULL,
+        0xFFFFFFBFULL, 0xFFFFFFC0ULL,
+        0xFFFFFFFFULL,
+
+        0xFFFFFFFFFFFFFF00ULL, 0xFFFFFFFFFFFFFF01ULL,
+        0xFFFFFFFFFFFFFF3FULL, 0xFFFFFFFFFFFFFF40ULL,
+        0xFFFFFFFFFFFFFF7FULL, 0xFFFFFFFFFFFFFF80ULL,
+        0xFFFFFFFFFFFFFFBFULL, 0xFFFFFFFFFFFFFFC0ULL,
+        0xFFFFFFFFFFFFFFFFULL
+      };
+
+#endif
+
+/////////////////////////////////////
+
+#define CC_C    0x0001
+#define CC_P    0x0004
+#define CC_A    0x0010
+#define CC_Z    0x0040
+#define CC_S    0x0080
+#define CC_O    0x0800
+
+#define CC_MASK (CC_C | CC_P | CC_A | CC_Z | CC_S | CC_O)
+
+#define GEN_do_locked_G_E(_name,_eax)   \
+  \
+  __attribute__((noinline)) void do_locked_G_E_##_name ( void )  \
+  {   \
+    volatile Long e_val, g_val, e_val_before;   \
+    Long o, s, z, a, c, p, v1, v2, flags_in;       \
+    Long block[4];   \
+    \
+    for (v1 = 0; v1 < NVALS; v1++) {   \
+    for (v2 = 0; v2 < NVALS; v2++) {   \
+    \
+    for (o = 0; o < 2; o++) {   \
+    for (s = 0; s < 2; s++) {   \
+    for (z = 0; z < 2; z++) {   \
+    for (a = 0; a < 2; a++) {   \
+    for (c = 0; c < 2; c++) {   \
+    for (p = 0; p < 2; p++) {   \
+      \
+      flags_in = (o ? CC_O : 0)   \
+               | (s ? CC_S : 0)   \
+               | (z ? CC_Z : 0)   \
+               | (a ? CC_A : 0)   \
+               | (c ? CC_C : 0)   \
+               | (p ? CC_P : 0);   \
+      \
+      g_val = val[v1];   \
+      e_val = val[v2];   \
+      e_val_before = e_val;   \
+      \
+      block[0] = flags_in;   \
+      block[1] = g_val;   \
+      block[2] = (long)&e_val;   \
+      block[3] = 0;   \
+      __asm__ __volatile__(   \
+          "movq 0(%0), %%rax\n\t"   \
+          "pushq %%rax\n\t"   \
+          "popfq\n\t"   \
+          "movq 8(%0), %%rax\n\t"   \
+          "movq 16(%0), %%rbx\n\t"   \
+          "lock; " #_name " %%" #_eax ",(%%rbx)\n\t"   \
+          "pushfq\n\t"   \
+          "popq %%rax\n\t"   \
+          "movq %%rax, 24(%0)\n\t"   \
+          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
+      );   \
+      \
+      send( \
+      sprintf(outBuf, \
+             "%s G=%016llx E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",       \
+             #_name, g_val, e_val_before, flags_in,   \
+              e_val, block[3] & CC_MASK));            \
+      \
+    }}}}}}   \
+    \
+    }}   \
+  }
+
+GEN_do_locked_G_E(addb,al)
+GEN_do_locked_G_E(addw,ax)
+GEN_do_locked_G_E(addl,eax)
+GEN_do_locked_G_E(addq,rax)
+
+GEN_do_locked_G_E(orb, al)
+GEN_do_locked_G_E(orw, ax)
+GEN_do_locked_G_E(orl, eax)
+GEN_do_locked_G_E(orq, rax)
+
+GEN_do_locked_G_E(adcb,al)
+GEN_do_locked_G_E(adcw,ax)
+GEN_do_locked_G_E(adcl,eax)
+GEN_do_locked_G_E(adcq,rax)
+
+GEN_do_locked_G_E(sbbb,al)
+GEN_do_locked_G_E(sbbw,ax)
+GEN_do_locked_G_E(sbbl,eax)
+GEN_do_locked_G_E(sbbq,rax)
+
+GEN_do_locked_G_E(andb,al)
+GEN_do_locked_G_E(andw,ax)
+GEN_do_locked_G_E(andl,eax)
+GEN_do_locked_G_E(andq,rax)
+
+GEN_do_locked_G_E(subb,al)
+GEN_do_locked_G_E(subw,ax)
+GEN_do_locked_G_E(subl,eax)
+GEN_do_locked_G_E(subq,rax)
+
+GEN_do_locked_G_E(xorb,al)
+GEN_do_locked_G_E(xorw,ax)
+GEN_do_locked_G_E(xorl,eax)
+GEN_do_locked_G_E(xorq,rax)
+
+
+
+
+#define GEN_do_locked_imm_E(_name,_eax,_imm)        \
+  \
+  __attribute__((noinline)) void do_locked_imm_E_##_name##_##_imm ( void )  \
+  {   \
+    volatile Long e_val, e_val_before;   \
+    Long o, s, z, a, c, p, v2, flags_in;   \
+    Long block[3];   \
+    \
+    for (v2 = 0; v2 < NVALS; v2++) {   \
+    \
+    for (o = 0; o < 2; o++) {   \
+    for (s = 0; s < 2; s++) {   \
+    for (z = 0; z < 2; z++) {   \
+    for (a = 0; a < 2; a++) {   \
+    for (c = 0; c < 2; c++) {   \
+    for (p = 0; p < 2; p++) {   \
+      \
+      flags_in = (o ? CC_O : 0)   \
+               | (s ? CC_S : 0)   \
+               | (z ? CC_Z : 0)   \
+               | (a ? CC_A : 0)   \
+               | (c ? CC_C : 0)   \
+               | (p ? CC_P : 0);   \
+      \
+      e_val = val[v2];   \
+      e_val_before = e_val;   \
+      \
+      block[0] = flags_in;   \
+      block[1] = (long)&e_val;   \
+      block[2] = 0;   \
+      __asm__ __volatile__(   \
+          "movq 0(%0), %%rax\n\t"   \
+          "pushq %%rax\n\t"   \
+          "popfq\n\t"   \
+          "movq 8(%0), %%rbx\n\t"   \
+          "lock; " #_name " $" #_imm ",(%%rbx)\n\t"   \
+          "pushfq\n\t"   \
+          "popq %%rax\n\t"   \
+          "movq %%rax, 16(%0)\n\t"   \
+          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
+      );   \
+      \
+      send( \
+           sprintf(outBuf, \
+           "%s I=%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n",    \
+             #_name, #_imm, e_val_before, flags_in,         \
+                   e_val, block[2] & CC_MASK));             \
+      \
+    }}}}}}   \
+    \
+    }   \
+  }
+
+GEN_do_locked_imm_E(addb,al,0x7F)
+GEN_do_locked_imm_E(addb,al,0xF1)
+GEN_do_locked_imm_E(addw,ax,0x7E)
+GEN_do_locked_imm_E(addw,ax,0x9325)
+GEN_do_locked_imm_E(addl,eax,0x7D)
+GEN_do_locked_imm_E(addl,eax,0x31415927)
+GEN_do_locked_imm_E(addq,rax,0x7D)
+GEN_do_locked_imm_E(addq,rax,0x31415927)
+
+GEN_do_locked_imm_E(orb,al,0x7F)
+GEN_do_locked_imm_E(orb,al,0xF1)
+GEN_do_locked_imm_E(orw,ax,0x7E)
+GEN_do_locked_imm_E(orw,ax,0x9325)
+GEN_do_locked_imm_E(orl,eax,0x7D)
+GEN_do_locked_imm_E(orl,eax,0x31415927)
+GEN_do_locked_imm_E(orq,rax,0x7D)
+GEN_do_locked_imm_E(orq,rax,0x31415927)
+
+GEN_do_locked_imm_E(adcb,al,0x7F)
+GEN_do_locked_imm_E(adcb,al,0xF1)
+GEN_do_locked_imm_E(adcw,ax,0x7E)
+GEN_do_locked_imm_E(adcw,ax,0x9325)
+GEN_do_locked_imm_E(adcl,eax,0x7D)
+GEN_do_locked_imm_E(adcl,eax,0x31415927)
+GEN_do_locked_imm_E(adcq,rax,0x7D)
+GEN_do_locked_imm_E(adcq,rax,0x31415927)
+
+GEN_do_locked_imm_E(sbbb,al,0x7F)
+GEN_do_locked_imm_E(sbbb,al,0xF1)
+GEN_do_locked_imm_E(sbbw,ax,0x7E)
+GEN_do_locked_imm_E(sbbw,ax,0x9325)
+GEN_do_locked_imm_E(sbbl,eax,0x7D)
+GEN_do_locked_imm_E(sbbl,eax,0x31415927)
+GEN_do_locked_imm_E(sbbq,rax,0x7D)
+GEN_do_locked_imm_E(sbbq,rax,0x31415927)
+
+GEN_do_locked_imm_E(andb,al,0x7F)
+GEN_do_locked_imm_E(andb,al,0xF1)
+GEN_do_locked_imm_E(andw,ax,0x7E)
+GEN_do_locked_imm_E(andw,ax,0x9325)
+GEN_do_locked_imm_E(andl,eax,0x7D)
+GEN_do_locked_imm_E(andl,eax,0x31415927)
+GEN_do_locked_imm_E(andq,rax,0x7D)
+GEN_do_locked_imm_E(andq,rax,0x31415927)
+
+GEN_do_locked_imm_E(subb,al,0x7F)
+GEN_do_locked_imm_E(subb,al,0xF1)
+GEN_do_locked_imm_E(subw,ax,0x7E)
+GEN_do_locked_imm_E(subw,ax,0x9325)
+GEN_do_locked_imm_E(subl,eax,0x7D)
+GEN_do_locked_imm_E(subl,eax,0x31415927)
+GEN_do_locked_imm_E(subq,rax,0x7D)
+GEN_do_locked_imm_E(subq,rax,0x31415927)
+
+GEN_do_locked_imm_E(xorb,al,0x7F)
+GEN_do_locked_imm_E(xorb,al,0xF1)
+GEN_do_locked_imm_E(xorw,ax,0x7E)
+GEN_do_locked_imm_E(xorw,ax,0x9325)
+GEN_do_locked_imm_E(xorl,eax,0x7D)
+GEN_do_locked_imm_E(xorl,eax,0x31415927)
+GEN_do_locked_imm_E(xorq,rax,0x7D)
+GEN_do_locked_imm_E(xorq,rax,0x31415927)
+
+#define GEN_do_locked_unary_E(_name,_eax)        \
+  \
+  __attribute__((noinline)) void do_locked_unary_E_##_name ( void )  \
+  {   \
+    volatile Long e_val, e_val_before;   \
+    Long o, s, z, a, c, p, v2, flags_in;     \
+    Long block[3];   \
+    \
+    for (v2 = 0; v2 < NVALS; v2++) {   \
+    \
+    for (o = 0; o < 2; o++) {   \
+    for (s = 0; s < 2; s++) {   \
+    for (z = 0; z < 2; z++) {   \
+    for (a = 0; a < 2; a++) {   \
+    for (c = 0; c < 2; c++) {   \
+    for (p = 0; p < 2; p++) {   \
+      \
+      flags_in = (o ? CC_O : 0)   \
+               | (s ? CC_S : 0)   \
+               | (z ? CC_Z : 0)   \
+               | (a ? CC_A : 0)   \
+               | (c ? CC_C : 0)   \
+               | (p ? CC_P : 0);   \
+      \
+      e_val = val[v2];   \
+      e_val_before = e_val;   \
+      \
+      block[0] = flags_in;   \
+      block[1] = (long)&e_val;   \
+      block[2] = 0;   \
+      __asm__ __volatile__(   \
+          "movq 0(%0), %%rax\n\t"   \
+          "pushq %%rax\n\t"   \
+          "popfq\n\t"   \
+          "movq 8(%0), %%rbx\n\t"   \
+          "lock; " #_name " (%%rbx)\n\t"   \
+          "pushfq\n\t"   \
+          "popq %%rax\n\t"   \
+          "movq %%rax, 16(%0)\n\t"   \
+          : : "r"(&block[0]) : "rax","rbx","cc","memory"   \
+      );   \
+      \
+      send( \
+           sprintf(outBuf, \
+            "%s E=%016llx CCIN=%08llx -> E=%016llx CCOUT=%08llx\n", \
+             #_name, e_val_before, flags_in,         \
+            e_val, block[2] & CC_MASK));                       \
+      \
+    }}}}}}   \
+    \
+    }   \
+  }
+
+GEN_do_locked_unary_E(decb,al)
+GEN_do_locked_unary_E(decw,ax)
+GEN_do_locked_unary_E(decl,eax)
+GEN_do_locked_unary_E(decq,rax)
+
+GEN_do_locked_unary_E(incb,al)
+GEN_do_locked_unary_E(incw,ax)
+GEN_do_locked_unary_E(incl,eax)
+GEN_do_locked_unary_E(incq,rax)
+
+GEN_do_locked_unary_E(negb,al)
+GEN_do_locked_unary_E(negw,ax)
+GEN_do_locked_unary_E(negl,eax)
+GEN_do_locked_unary_E(negq,rax)
+
+GEN_do_locked_unary_E(notb,al)
+GEN_do_locked_unary_E(notw,ax)
+GEN_do_locked_unary_E(notl,eax)
+GEN_do_locked_unary_E(notq,rax)
+
+
+/////////////////////////////////////////////////////////////////
+
+ULong btsq_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btsq\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory" );
+   /* Pretty meaningless to dereference base here, but that's what you
+      have to do to get a btsq insn which refers to memory starting at
+      base. */
+   return res;
+}
+ULong btsl_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btsl\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((UInt)bitno) : "rdx","cc","memory");
+   return res;
+}
+ULong btsw_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btsw\t%w2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory");
+   return res;
+}
+
+ULong btrq_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btrq\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory");
+   return res;
+}
+ULong btrl_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btrl\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((UInt)bitno) : "rdx","cc","memory");
+   return res;
+}
+ULong btrw_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btrw\t%w2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory");
+   return res;
+}
+
+ULong btcq_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btcq\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory");
+   return res;
+}
+ULong btcl_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btcl\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((UInt)bitno) : "rdx","cc","memory");
+   return res;
+}
+ULong btcw_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("lock; btcw\t%w2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory");
+   return res;
+}
+
+ULong btq_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("btq\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno)
+                : "rdx", "cc", "memory");
+   return res;
+}
+ULong btl_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("btl\t%2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((UInt)bitno)
+                : "rdx", "cc", "memory");
+   return res;
+}
+ULong btw_mem ( UChar* base, int bitno )
+{
+   ULong res;
+   __asm__ 
+   __volatile__("btw\t%w2, %0\n\t"
+                "setc   %%dl\n\t" 
+                "movzbq %%dl,%1\n"
+                : "=m" (*base), "=r" (res)
+                : "r" ((ULong)bitno) : "rdx","cc","memory");
+   return res;
+}
+
+ULong rol1 ( ULong x )
+{
+  return (x << 1) | (x >> 63);
+}
+
+void do_bt_G_E_tests ( void )
+{
+   ULong  n, bitoff, op;
+   ULong  c;
+   UChar* block;
+   ULong  carrydep, res;
+
+   /*------------------------ MEM-Q -----------------------*/
+
+   carrydep = 0;
+   block = calloc(200,1);
+   block += 100;
+   /* Valid bit offsets are -800 .. 799 inclusive. */
+
+   for (n = 0; n < 10000; n++) {
+      bitoff = (random() % 1600) - 800;
+      op = random() % 4;
+      c = 2;
+      switch (op) {
+         case 0: c = btsq_mem(block, bitoff); break;
+         case 1: c = btrq_mem(block, bitoff); break;
+         case 2: c = btcq_mem(block, bitoff); break;
+         case 3: c = btq_mem(block, bitoff); break;
+      }
+      c &= 255;
+      assert(c == 0 || c == 1);
+      carrydep = c ? (rol1(carrydep) ^ (Long)bitoff) : carrydep;
+   }
+
+   /* Compute final result */
+   block -= 100;   
+   res = 0;
+   for (n = 0; n < 200; n++) {
+      UChar ch = block[n];
+      /* printf("%d ", (int)block[n]); */
+      res = rol1(res) ^ (ULong)ch;
+   }
+
+   send( sprintf(outBuf,
+                 "bt{s,r,c}q: final res 0x%llx, carrydep 0x%llx\n",
+                 res, carrydep));
+   free(block);
+
+   /*------------------------ MEM-L -----------------------*/
+
+   carrydep = 0;
+   block = calloc(200,1);
+   block += 100;
+   /* Valid bit offsets are -800 .. 799 inclusive. */
+
+   for (n = 0; n < 10000; n++) {
+      bitoff = (random() % 1600) - 800;
+      op = random() % 4;
+      c = 2;
+      switch (op) {
+         case 0: c = btsl_mem(block, bitoff); break;
+         case 1: c = btrl_mem(block, bitoff); break;
+         case 2: c = btcl_mem(block, bitoff); break;
+         case 3: c = btl_mem(block, bitoff); break;
+      }
+      c &= 255;
+      assert(c == 0 || c == 1);
+      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
+   }
+
+   /* Compute final result */
+   block -= 100;   
+   res = 0;
+   for (n = 0; n < 200; n++) {
+      UChar ch = block[n];
+      /* printf("%d ", (int)block[n]); */
+      res = rol1(res) ^ (ULong)ch;
+   }
+
+   send( sprintf(outBuf,
+                 "bt{s,r,c}l: final res 0x%llx, carrydep 0x%llx\n",
+                 res, carrydep));
+   free(block);
+
+   /*------------------------ MEM-W -----------------------*/
+
+   carrydep = 0;
+   block = calloc(200,1);
+   block += 100;
+   /* Valid bit offsets are -800 .. 799 inclusive. */
+
+   for (n = 0; n < 10000; n++) {
+      bitoff = (random() % 1600) - 800;
+      op = random() % 4;
+      c = 2;
+      switch (op) {
+         case 0: c = btsw_mem(block, bitoff); break;
+         case 1: c = btrw_mem(block, bitoff); break;
+         case 2: c = btcw_mem(block, bitoff); break;
+         case 3: c = btw_mem(block, bitoff); break;
+      }
+      c &= 255;
+      assert(c == 0 || c == 1);
+      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
+   }
+
+   /* Compute final result */
+   block -= 100;   
+   res = 0;
+   for (n = 0; n < 200; n++) {
+      UChar ch = block[n];
+      /* printf("%d ", (int)block[n]); */
+      res = rol1(res) ^ (ULong)ch;
+   }
+
+   send(sprintf(outBuf,
+                "bt{s,r,c}w: final res 0x%llx, carrydep 0x%llx\n",
+                res, carrydep));
+   free(block);
+}
+
+
+/////////////////////////////////////////////////////////////////
+
+/* Given a word, do bt/bts/btr/btc on bits 0, 1, 2 and 3 of it, and
+   also reconstruct the original bits 0, 1, 2, 3 by looking at the
+   carry flag.  Returned result has mashed bits 0-3 at the bottom and
+   the reconstructed original bits 0-3 as bits 4-7. */
+
+ULong mash_mem_Q ( ULong* origp )
+{
+  ULong reconstructed, mashed;
+  __asm__ __volatile__ (
+     "movq %2, %%rdx\n\t"
+     ""
+     "movq $0, %%rax\n\t"
+     "\n\t"
+     "btq  $0, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btsq $1, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $1, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btrq $2, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $2, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btcq $3, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $3, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "movq %%rax, %0\n\t"
+     "movq (%%rdx), %1"
+     : "=r" (reconstructed), "=r" (mashed)
+     : "r" (origp)
+     : "rax", "rcx", "rdx", "cc");
+  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
+}
+
+ULong mash_mem_L ( UInt* origp )
+{
+  ULong reconstructed; UInt mashed;
+  __asm__ __volatile__ (
+     "movq %2, %%rdx\n\t"
+     ""
+     "movq $0, %%rax\n\t"
+     "\n\t"
+     "btl  $0, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btsl $1, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $1, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btrl $2, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $2, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btcl $3, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $3, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "movq %%rax, %0\n\t"
+     "movl (%%rdx), %1"
+     : "=r" (reconstructed), "=r" (mashed)
+     : "r" (origp)
+     : "rax", "rcx", "rdx", "cc");
+  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
+}
+
+ULong mash_mem_W ( UShort* origp )
+{
+  ULong reconstructed, mashed;
+  __asm__ __volatile__ (
+     "movq %2, %%rdx\n\t"
+     ""
+     "movq $0, %%rax\n\t"
+     "\n\t"
+     "btw  $0, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btsw $1, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $1, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btrw $2, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $2, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "lock; btcw $3, (%%rdx)\n\t"
+     "setb %%cl\n\t"
+     "movzbq %%cl, %%rcx\n\t"
+     "shlq $3, %%rcx\n\t"
+     "orq %%rcx, %%rax\n\t"
+     "\n\t"
+     "movq %%rax, %0\n\t"
+     "movzwq (%%rdx), %1"
+     : "=r" (reconstructed), "=r" (mashed)
+     : "r" (origp)
+     : "rax", "rcx", "rdx", "cc");
+  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
+}
+
+
+void do_bt_imm_E_tests( void )
+{
+  ULong i;
+  ULong*  iiq = malloc(sizeof(ULong));
+  UInt*   iil = malloc(sizeof(UInt));
+  UShort* iiw = malloc(sizeof(UShort));
+  for (i = 0; i < 0x10; i++) {
+    *iiq = i;
+    *iil = i;
+    *iiw = i;
+    send(sprintf(outBuf,"0x%llx -> 0x%02llx 0x%02llx 0x%02llx\n", i, 
+                 mash_mem_Q(iiq), mash_mem_L(iil), mash_mem_W(iiw)));
+  }
+  free(iiq);
+  free(iil);
+  free(iiw);
+}
+
+
+/////////////////////////////////////////////////////////////////
+
+int main ( void )
+{
+  do_locked_G_E_addb();
+  do_locked_G_E_addw();
+  do_locked_G_E_addl();
+  do_locked_G_E_addq();
+
+  do_locked_G_E_orb();
+  do_locked_G_E_orw();
+  do_locked_G_E_orl();
+  do_locked_G_E_orq();
+
+  do_locked_G_E_adcb();
+  do_locked_G_E_adcw();
+  do_locked_G_E_adcl();
+  do_locked_G_E_adcq();
+
+  do_locked_G_E_sbbb();
+  do_locked_G_E_sbbw();
+  do_locked_G_E_sbbl();
+  do_locked_G_E_sbbq();
+
+  do_locked_G_E_andb();
+  do_locked_G_E_andw();
+  do_locked_G_E_andl();
+  do_locked_G_E_andq();
+
+  do_locked_G_E_subb();
+  do_locked_G_E_subw();
+  do_locked_G_E_subl();
+  do_locked_G_E_subq();
+
+  do_locked_G_E_xorb();
+  do_locked_G_E_xorw();
+  do_locked_G_E_xorl();
+  do_locked_G_E_xorq();
+  // 4 * 7
+
+  do_locked_imm_E_addb_0x7F();
+  do_locked_imm_E_addb_0xF1();
+  do_locked_imm_E_addw_0x7E();
+  do_locked_imm_E_addw_0x9325();
+  do_locked_imm_E_addl_0x7D();
+  do_locked_imm_E_addl_0x31415927();
+  do_locked_imm_E_addq_0x7D();
+  do_locked_imm_E_addq_0x31415927();
+
+  do_locked_imm_E_orb_0x7F();
+  do_locked_imm_E_orb_0xF1();
+  do_locked_imm_E_orw_0x7E();
+  do_locked_imm_E_orw_0x9325();
+  do_locked_imm_E_orl_0x7D();
+  do_locked_imm_E_orl_0x31415927();
+  do_locked_imm_E_orq_0x7D();
+  do_locked_imm_E_orq_0x31415927();
+
+  do_locked_imm_E_adcb_0x7F();
+  do_locked_imm_E_adcb_0xF1();
+  do_locked_imm_E_adcw_0x7E();
+  do_locked_imm_E_adcw_0x9325();
+  do_locked_imm_E_adcl_0x7D();
+  do_locked_imm_E_adcl_0x31415927();
+  do_locked_imm_E_adcq_0x7D();
+  do_locked_imm_E_adcq_0x31415927();
+
+  do_locked_imm_E_sbbb_0x7F();
+  do_locked_imm_E_sbbb_0xF1();
+  do_locked_imm_E_sbbw_0x7E();
+  do_locked_imm_E_sbbw_0x9325();
+  do_locked_imm_E_sbbl_0x7D();
+  do_locked_imm_E_sbbl_0x31415927();
+  do_locked_imm_E_sbbq_0x7D();
+  do_locked_imm_E_sbbq_0x31415927();
+
+  do_locked_imm_E_andb_0x7F();
+  do_locked_imm_E_andb_0xF1();
+  do_locked_imm_E_andw_0x7E();
+  do_locked_imm_E_andw_0x9325();
+  do_locked_imm_E_andl_0x7D();
+  do_locked_imm_E_andl_0x31415927();
+  do_locked_imm_E_andq_0x7D();
+  do_locked_imm_E_andq_0x31415927();
+
+  do_locked_imm_E_subb_0x7F();
+  do_locked_imm_E_subb_0xF1();
+  do_locked_imm_E_subw_0x7E();
+  do_locked_imm_E_subw_0x9325();
+  do_locked_imm_E_subl_0x7D();
+  do_locked_imm_E_subl_0x31415927();
+  do_locked_imm_E_subq_0x7D();
+  do_locked_imm_E_subq_0x31415927();
+
+  do_locked_imm_E_xorb_0x7F();
+  do_locked_imm_E_xorb_0xF1();
+  do_locked_imm_E_xorw_0x7E();
+  do_locked_imm_E_xorw_0x9325();
+  do_locked_imm_E_xorl_0x7D();
+  do_locked_imm_E_xorl_0x31415927();
+  do_locked_imm_E_xorq_0x7D();
+  do_locked_imm_E_xorq_0x31415927();
+  // 4 * 7 + 8 * 7 == 84
+
+  do_locked_unary_E_decb();
+  do_locked_unary_E_decw();
+  do_locked_unary_E_decl();
+  do_locked_unary_E_decq();
+
+  do_locked_unary_E_incb();
+  do_locked_unary_E_incw();
+  do_locked_unary_E_incl();
+  do_locked_unary_E_incq();
+
+  do_locked_unary_E_negb();
+  do_locked_unary_E_negw();
+  do_locked_unary_E_negl();
+  do_locked_unary_E_negq();
+
+  do_locked_unary_E_notb();
+  do_locked_unary_E_notw();
+  do_locked_unary_E_notl();
+  do_locked_unary_E_notq();
+  // 100
+
+  do_bt_G_E_tests();
+  // 109
+  do_bt_imm_E_tests();
+  // 118
+
+  // So there should be 118 lock-prefixed instructions in the
+  // disassembly of this compilation unit.
+  // confirm with
+  // objdump -d ./amd64locked | grep lock | grep -v do_lock | grep -v elf64 | wc
+
+
+  { UInt crcExpd = 0x1F677629;
+    theCRC = crcFinalise( theCRC );
+    if (theCRC == crcExpd) {
+       printf("amd64locked: PASS: CRCs actual 0x%08X expected 0x%08X\n",
+              theCRC, crcExpd);
+    } else {
+       printf("amd64locked: FAIL: CRCs actual 0x%08X expected 0x%08X\n",
+              theCRC, crcExpd);
+       printf("amd64locked: set #define VERBOSE 1 to diagnose\n");
+    }
+  }
+
+  return 0;
+}
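
Rather than shipping the several million formatted result lines these loops would produce, amd64locked.c folds each sprintf'd line into a running CRC32 via send()/crcBytes() and compares only the final checksum against 0x1F677629, so the expected stdout (next file) is a single PASS line; building with VERBOSE set to 1 prints every line together with the running CRC when a mismatch needs to be tracked down.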
diff --git a/none/tests/amd64/amd64locked.stderr.exp b/none/tests/amd64/amd64locked.stderr.exp
new file mode 100644 (file)
index 0000000..139597f
--- /dev/null
@@ -0,0 +1,2 @@
+
+
diff --git a/none/tests/amd64/amd64locked.stdout.exp b/none/tests/amd64/amd64locked.stdout.exp
new file mode 100644 (file)
index 0000000..97202d2
--- /dev/null
@@ -0,0 +1 @@
+amd64locked: PASS: CRCs actual 0x1F677629 expected 0x1F677629
diff --git a/none/tests/amd64/amd64locked.vgtest b/none/tests/amd64/amd64locked.vgtest
new file mode 100644 (file)
index 0000000..df55c5a
--- /dev/null
@@ -0,0 +1 @@
+prog: amd64locked
index 693cec0f33b30a7a8174206fe4722b6598c1f2aa..47d6a3492a0370bc9ad34c861c29faaf6b7f839c 100644 (file)
@@ -52,6 +52,7 @@ EXTRA_DIST = \
        smc1.stderr.exp smc1.stdout.exp smc1.vgtest \
        ssse3_misaligned.stderr.exp ssse3_misaligned.stdout.exp \
        ssse3_misaligned.vgtest ssse3_misaligned.c \
+       x86locked.vgtest x86locked.stdout.exp x86locked.stderr.exp \
        yield.stderr.exp yield.stdout.exp yield.disabled
 
 check_PROGRAMS = \
@@ -82,6 +83,7 @@ check_PROGRAMS = \
        pushpopseg \
        sbbmisc \
        smc1 \
+       x86locked \
        yield
 if BUILD_SSSE3_TESTS
    check_PROGRAMS += ssse3_misaligned
@@ -123,6 +125,7 @@ insn_sse3_SOURCES   = insn_sse3.def
 insn_sse3_LDADD                = -lm
 insn_ssse3_SOURCES     = insn_ssse3.def
 insn_ssse3_LDADD       = -lm
+x86locked_CFLAGS       = $(AM_CFLAGS) -O
 yield_LDADD            = -lpthread
 
 .def.c: $(srcdir)/gen_insn_test.pl
diff --git a/none/tests/x86/x86locked.c b/none/tests/x86/x86locked.c
new file mode 100644 (file)
index 0000000..38d1108
--- /dev/null
@@ -0,0 +1,864 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define VERBOSE 0
+
+typedef  unsigned int            UInt;
+typedef  unsigned char           UChar;
+typedef  unsigned long long int  ULong;
+typedef  signed long long int    Long;
+typedef  signed int              Int;
+typedef  unsigned short          UShort;
+typedef  unsigned long           UWord;
+typedef  char                    HChar;
+
+/////////////////////////////////////////////////////////////////
+// BEGIN crc32 stuff                                           //
+/////////////////////////////////////////////////////////////////
+
+static const UInt crc32Table[256] = {
+
+   /*-- Ugly, innit? --*/
+
+   0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
+   0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
+   0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
+   0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
+   0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
+   0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
+   0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
+   0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
+   0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
+   0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
+   0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
+   0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
+   0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
+   0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
+   0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
+   0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
+   0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
+   0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
+   0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
+   0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
+   0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
+   0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
+   0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
+   0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
+   0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
+   0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
+   0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
+   0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
+   0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
+   0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
+   0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
+   0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
+   0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
+   0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
+   0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
+   0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
+   0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
+   0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
+   0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
+   0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
+   0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
+   0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
+   0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
+   0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
+   0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
+   0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
+   0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
+   0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
+   0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
+   0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
+   0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
+   0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
+   0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
+   0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
+   0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
+   0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
+   0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
+   0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
+   0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
+   0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
+   0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
+   0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
+   0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
+   0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
+};
+
+#define UPDATE_CRC(crcVar,cha)                 \
+{                                              \
+   crcVar = (crcVar << 8) ^                    \
+            crc32Table[(crcVar >> 24) ^        \
+                       ((UChar)cha)];          \
+}
+
+static UInt crcBytes ( UChar* bytes, UWord nBytes, UInt crcIn )
+{
+   UInt crc = crcIn;
+   while (nBytes >= 4) {
+      UPDATE_CRC(crc, bytes[0]);
+      UPDATE_CRC(crc, bytes[1]);
+      UPDATE_CRC(crc, bytes[2]);
+      UPDATE_CRC(crc, bytes[3]);
+      bytes += 4;
+      nBytes -= 4;
+   }
+   while (nBytes >= 1) {
+      UPDATE_CRC(crc, bytes[0]);
+      bytes += 1;
+      nBytes -= 1;
+   }
+   return crc;
+}
+
+static UInt crcFinalise ( UInt crc ) {
+   return ~crc;
+}
+
+////////
+
+static UInt theCRC = 0xFFFFFFFF;
+
+static HChar outBuf[1024];
+// take output that's in outBuf, length as specified, and
+// update the running crc.
+static void send ( int nbytes )
+{
+   assert( ((unsigned int)nbytes) < sizeof(outBuf)-1);
+   assert(outBuf[nbytes] == 0);
+   theCRC = crcBytes( (UChar*)&outBuf[0], nbytes, theCRC );
+   if (VERBOSE) printf("SEND %08x %s", theCRC, outBuf);
+}
+
+
+/////////////////////////////////////////////////////////////////
+// END crc32 stuff                                             //
+/////////////////////////////////////////////////////////////////
+
+#if 0
+
+// full version
+#define NVALS 57
+
+static unsigned int val[NVALS]
+    = { 0x00, 0x01, 0x02, 0x03,
+        0x3F, 0x40, 0x41,
+        0x7E, 0x7F, 0x80, 0x81, 0x82,
+        0xBF, 0xC0, 0xC1,
+        0xFC, 0xFD, 0xFE, 0xFF,
+
+        0xFF00, 0xFF01, 0xFF02, 0xFF03,
+        0xFF3F, 0xFF40, 0xFF41,
+        0xFF7E, 0xFF7F, 0xFF80, 0xFF81, 0xFF82,
+        0xFFBF, 0xFFC0, 0xFFC1,
+        0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF,
+
+        0xFFFFFF00, 0xFFFFFF01, 0xFFFFFF02, 0xFFFFFF03,
+        0xFFFFFF3F, 0xFFFFFF40, 0xFFFFFF41,
+        0xFFFFFF7E, 0xFFFFFF7F, 0xFFFFFF80, 0xFFFFFF81, 0xFFFFFF82,
+        0xFFFFFFBF, 0xFFFFFFC0, 0xFFFFFFC1,
+        0xFFFFFFFC, 0xFFFFFFFD, 0xFFFFFFFE, 0xFFFFFFFF
+      };
+
+#else
+
+// shortened version, for use as valgrind regtest
+#define NVALS 27
+
+static unsigned int val[NVALS]
+    = { 0x00, 0x01,
+        0x3F, 0x40,
+        0x7F, 0x80,
+        0xBF, 0xC0,
+        0xFF,
+
+        0xFF00, 0xFF01,
+        0xFF3F, 0xFF40,
+        0xFF7F, 0xFF80,
+        0xFFBF, 0xFFC0,
+        0xFFFF,
+
+        0xFFFFFF00, 0xFFFFFF01,
+        0xFFFFFF3F, 0xFFFFFF40,
+        0xFFFFFF7F, 0xFFFFFF80,
+        0xFFFFFFBF, 0xFFFFFFC0,
+        0xFFFFFFFF
+      };
+
+#endif
+
+/////////////////////////////////////
+
+#define CC_C    0x0001
+#define CC_P    0x0004
+#define CC_A    0x0010
+#define CC_Z    0x0040
+#define CC_S    0x0080
+#define CC_O    0x0800
+
+#define CC_MASK (CC_C | CC_P | CC_A | CC_Z | CC_S | CC_O)
+
+#define GEN_do_locked_G_E(_name,_eax)   \
+  \
+  __attribute__((noinline)) void do_locked_G_E_##_name ( void )  \
+  {   \
+    volatile int e_val, g_val, e_val_before;   \
+    int o, s, z, a, c, p, v1, v2, flags_in;   \
+    int block[4];   \
+    \
+    for (v1 = 0; v1 < NVALS; v1++) {   \
+    for (v2 = 0; v2 < NVALS; v2++) {   \
+    \
+    for (o = 0; o < 2; o++) {   \
+    for (s = 0; s < 2; s++) {   \
+    for (z = 0; z < 2; z++) {   \
+    for (a = 0; a < 2; a++) {   \
+    for (c = 0; c < 2; c++) {   \
+    for (p = 0; p < 2; p++) {   \
+      \
+      flags_in = (o ? CC_O : 0)   \
+               | (s ? CC_S : 0)   \
+               | (z ? CC_Z : 0)   \
+               | (a ? CC_A : 0)   \
+               | (c ? CC_C : 0)   \
+               | (p ? CC_P : 0);   \
+      \
+      g_val = val[v1];   \
+      e_val = val[v2];   \
+      e_val_before = e_val;   \
+      \
+      block[0] = flags_in;   \
+      block[1] = g_val;   \
+      block[2] = (int)(long)&e_val;   \
+      block[3] = 0;   \
+      __asm__ __volatile__(   \
+          "movl 0(%0), %%eax\n\t"   \
+          "pushl %%eax\n\t"   \
+          "popfl\n\t"   \
+          "movl 4(%0), %%eax\n\t"   \
+          "movl 8(%0), %%ebx\n\t"   \
+          "lock; " #_name " %%" #_eax ",(%%ebx)\n\t"   \
+          "pushfl\n\t"   \
+          "popl %%eax\n\t"   \
+          "movl %%eax, 12(%0)\n\t"   \
+          : : "r"(&block[0]) : "eax","ebx","cc","memory"   \
+      );   \
+      \
+      send( \
+         sprintf(outBuf,                                        \
+                 "%s G=%08x E=%08x CCIN=%08x -> E=%08x CCOUT=%08x\n", \
+                 #_name, g_val, e_val_before, flags_in,   \
+                 e_val, block[3] & CC_MASK) );            \
+      \
+    }}}}}}   \
+    \
+    }}   \
+  }
+
+GEN_do_locked_G_E(addb,al)
+GEN_do_locked_G_E(addw,ax)
+GEN_do_locked_G_E(addl,eax)
+
+GEN_do_locked_G_E(orb, al)
+GEN_do_locked_G_E(orw, ax)
+GEN_do_locked_G_E(orl, eax)
+
+GEN_do_locked_G_E(adcb,al)
+GEN_do_locked_G_E(adcw,ax)
+GEN_do_locked_G_E(adcl,eax)
+
+GEN_do_locked_G_E(sbbb,al)
+GEN_do_locked_G_E(sbbw,ax)
+GEN_do_locked_G_E(sbbl,eax)
+
+GEN_do_locked_G_E(andb,al)
+GEN_do_locked_G_E(andw,ax)
+GEN_do_locked_G_E(andl,eax)
+
+GEN_do_locked_G_E(subb,al)
+GEN_do_locked_G_E(subw,ax)
+GEN_do_locked_G_E(subl,eax)
+
+GEN_do_locked_G_E(xorb,al)
+GEN_do_locked_G_E(xorw,ax)
+GEN_do_locked_G_E(xorl,eax)
+
+
+
+
+#define GEN_do_locked_imm_E(_name,_eax,_imm)        \
+  \
+  __attribute__((noinline)) void do_locked_imm_E_##_name##_##_imm ( void )  \
+  {   \
+    volatile int e_val, e_val_before;   \
+    int o, s, z, a, c, p, v2, flags_in;   \
+    int block[3];   \
+    \
+    for (v2 = 0; v2 < NVALS; v2++) {   \
+    \
+    for (o = 0; o < 2; o++) {   \
+    for (s = 0; s < 2; s++) {   \
+    for (z = 0; z < 2; z++) {   \
+    for (a = 0; a < 2; a++) {   \
+    for (c = 0; c < 2; c++) {   \
+    for (p = 0; p < 2; p++) {   \
+      \
+      flags_in = (o ? CC_O : 0)   \
+               | (s ? CC_S : 0)   \
+               | (z ? CC_Z : 0)   \
+               | (a ? CC_A : 0)   \
+               | (c ? CC_C : 0)   \
+               | (p ? CC_P : 0);   \
+      \
+      e_val = val[v2];   \
+      e_val_before = e_val;   \
+      \
+      block[0] = flags_in;   \
+      block[1] = (int)(long)&e_val;   \
+      block[2] = 0;   \
+      __asm__ __volatile__(   \
+          "movl 0(%0), %%eax\n\t"   \
+          "pushl %%eax\n\t"   \
+          "popfl\n\t"   \
+          "movl 4(%0), %%ebx\n\t"   \
+          "lock; " #_name " $" #_imm ",(%%ebx)\n\t"   \
+          "pushfl\n\t"   \
+          "popl %%eax\n\t"   \
+          "movl %%eax, 8(%0)\n\t"   \
+          : : "r"(&block[0]) : "eax","ebx","cc","memory"   \
+      );   \
+      \
+      send( \
+        sprintf(outBuf, \
+             "%s I=%s E=%08x CCIN=%08x -> E=%08x CCOUT=%08x\n",       \
+             #_name, #_imm, e_val_before, flags_in,         \
+                e_val, block[2] & CC_MASK) );               \
+      \
+    }}}}}}   \
+    \
+    }   \
+  }
+
+GEN_do_locked_imm_E(addb,al,0x7F)
+GEN_do_locked_imm_E(addb,al,0xF1)
+GEN_do_locked_imm_E(addw,ax,0x7E)
+GEN_do_locked_imm_E(addw,ax,0x9325)
+GEN_do_locked_imm_E(addl,eax,0x7D)
+GEN_do_locked_imm_E(addl,eax,0x31415927)
+
+GEN_do_locked_imm_E(orb,al,0x7F)
+GEN_do_locked_imm_E(orb,al,0xF1)
+GEN_do_locked_imm_E(orw,ax,0x7E)
+GEN_do_locked_imm_E(orw,ax,0x9325)
+GEN_do_locked_imm_E(orl,eax,0x7D)
+GEN_do_locked_imm_E(orl,eax,0x31415927)
+
+GEN_do_locked_imm_E(adcb,al,0x7F)
+GEN_do_locked_imm_E(adcb,al,0xF1)
+GEN_do_locked_imm_E(adcw,ax,0x7E)
+GEN_do_locked_imm_E(adcw,ax,0x9325)
+GEN_do_locked_imm_E(adcl,eax,0x7D)
+GEN_do_locked_imm_E(adcl,eax,0x31415927)
+
+GEN_do_locked_imm_E(sbbb,al,0x7F)
+GEN_do_locked_imm_E(sbbb,al,0xF1)
+GEN_do_locked_imm_E(sbbw,ax,0x7E)
+GEN_do_locked_imm_E(sbbw,ax,0x9325)
+GEN_do_locked_imm_E(sbbl,eax,0x7D)
+GEN_do_locked_imm_E(sbbl,eax,0x31415927)
+
+GEN_do_locked_imm_E(andb,al,0x7F)
+GEN_do_locked_imm_E(andb,al,0xF1)
+GEN_do_locked_imm_E(andw,ax,0x7E)
+GEN_do_locked_imm_E(andw,ax,0x9325)
+GEN_do_locked_imm_E(andl,eax,0x7D)
+GEN_do_locked_imm_E(andl,eax,0x31415927)
+
+GEN_do_locked_imm_E(subb,al,0x7F)
+GEN_do_locked_imm_E(subb,al,0xF1)
+GEN_do_locked_imm_E(subw,ax,0x7E)
+GEN_do_locked_imm_E(subw,ax,0x9325)
+GEN_do_locked_imm_E(subl,eax,0x7D)
+GEN_do_locked_imm_E(subl,eax,0x31415927)
+
+GEN_do_locked_imm_E(xorb,al,0x7F)
+GEN_do_locked_imm_E(xorb,al,0xF1)
+GEN_do_locked_imm_E(xorw,ax,0x7E)
+GEN_do_locked_imm_E(xorw,ax,0x9325)
+GEN_do_locked_imm_E(xorl,eax,0x7D)
+GEN_do_locked_imm_E(xorl,eax,0x31415927)
+
+#define GEN_do_locked_unary_E(_name,_eax)        \
+  \
+  __attribute__((noinline)) void do_locked_unary_E_##_name ( void )  \
+  {   \
+    volatile int e_val, e_val_before;   \
+    int o, s, z, a, c, p, v2, flags_in;   \
+    int block[3];   \
+    \
+    for (v2 = 0; v2 < NVALS; v2++) {   \
+    \
+    for (o = 0; o < 2; o++) {   \
+    for (s = 0; s < 2; s++) {   \
+    for (z = 0; z < 2; z++) {   \
+    for (a = 0; a < 2; a++) {   \
+    for (c = 0; c < 2; c++) {   \
+    for (p = 0; p < 2; p++) {   \
+      \
+      flags_in = (o ? CC_O : 0)   \
+               | (s ? CC_S : 0)   \
+               | (z ? CC_Z : 0)   \
+               | (a ? CC_A : 0)   \
+               | (c ? CC_C : 0)   \
+               | (p ? CC_P : 0);   \
+      \
+      e_val = val[v2];   \
+      e_val_before = e_val;   \
+      \
+      block[0] = flags_in;   \
+      block[1] = (int)(long)&e_val;   \
+      block[2] = 0;   \
+      __asm__ __volatile__(   \
+          "movl 0(%0), %%eax\n\t"   \
+          "pushl %%eax\n\t"   \
+          "popfl\n\t"   \
+          "movl 4(%0), %%ebx\n\t"   \
+          "lock; " #_name " (%%ebx)\n\t"   \
+          "pushfl\n\t"   \
+          "popl %%eax\n\t"   \
+          "movl %%eax, 8(%0)\n\t"   \
+          : : "r"(&block[0]) : "eax","ebx","cc","memory"   \
+      );   \
+      \
+      send( \
+         sprintf(outBuf, \
+                "%s E=%08x CCIN=%08x -> E=%08x CCOUT=%08x\n",   \
+             #_name, e_val_before, flags_in,         \
+                e_val, block[2] & CC_MASK));         \
+      \
+    }}}}}}   \
+    \
+    }   \
+  }
+
+GEN_do_locked_unary_E(decb,al)
+GEN_do_locked_unary_E(decw,ax)
+GEN_do_locked_unary_E(decl,eax)
+
+GEN_do_locked_unary_E(incb,al)
+GEN_do_locked_unary_E(incw,ax)
+GEN_do_locked_unary_E(incl,eax)
+
+GEN_do_locked_unary_E(negb,al)
+GEN_do_locked_unary_E(negw,ax)
+GEN_do_locked_unary_E(negl,eax)
+
+GEN_do_locked_unary_E(notb,al)
+GEN_do_locked_unary_E(notw,ax)
+GEN_do_locked_unary_E(notl,eax)
+
+
+/////////////////////////////////////////////////////////////////
+
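+/* Memory-form bit-test wrappers.  Each one applies bts/btr/btc/bt
+   (lock-prefixed except for the plain bt forms) to the bit at offset
+   'bitno' relative to 'base', and returns the previous value of that
+   bit as captured from the carry flag by setc. */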
+unsigned int btsl_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("lock; btsl\t%2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   /* Dereferencing base here looks pointless, but it is needed to
+      get a btsl insn which refers to memory starting at base.  The
+      same applies to the other bt*_mem wrappers below. */
+   return res;
+}
+unsigned int btsw_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("lock; btsw\t%w2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+
+unsigned int btrl_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("lock; btrl\t%2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+unsigned int btrw_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("lock; btrw\t%w2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+
+unsigned int btcl_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("lock; btcl\t%2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+unsigned int btcw_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("lock; btcw\t%w2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+
+unsigned int btl_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("btl\t%2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+unsigned int btw_mem ( UChar* base, int bitno )
+{
+   unsigned char res;
+   __asm__ 
+   __volatile__("btw\t%w2, %0\n\t"
+                "setc\t%1" 
+                : "=m" (*base), "=q" (res)
+                : "r" (bitno)
+                : "cc", "memory");
+   return res;
+}
+
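+/* Rotate a 64-bit value left by one bit. */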
+ULong rol1 ( ULong x )
+{
+  return (x << 1) | (x >> 63);
+}
+
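+/* Randomised test of the register-operand bit-test forms on memory.
+   A 200-byte block is allocated and 'block' is pointed at its middle,
+   so bit offsets in -800..799 stay inside the allocation while making
+   the bt* instructions address memory both below and above the
+   operand address.  10000 random ops are applied, then two checksums
+   are reported: one over the final block contents and one that
+   depends on the carry flags observed along the way. */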
+void do_bt_G_E_tests ( void )
+{
+   UInt   n, bitoff, op;
+   UInt   c;
+   UChar* block;
+   ULong  carrydep, res;
+
+   /*------------------------ MEM-L -----------------------*/
+
+   carrydep = 0;
+   block = calloc(200,1);
+   block += 100;
+   /* Valid bit offsets are -800 .. 799 inclusive. */
+
+   for (n = 0; n < 10000; n++) {
+      bitoff = (random() % 1600) - 800;
+      op = random() % 4;
+      c = 2;
+      switch (op) {
+         case 0: c = btsl_mem(block, bitoff); break;
+         case 1: c = btrl_mem(block, bitoff); break;
+         case 2: c = btcl_mem(block, bitoff); break;
+         case 3: c = btl_mem(block, bitoff); break;
+      }
+      c &= 255;
+      assert(c == 0 || c == 1);
+      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
+   }
+
+   /* Compute final result */
+   block -= 100;   
+   res = 0;
+   for (n = 0; n < 200; n++) {
+      UChar ch = block[n];
+      /* printf("%d ", (int)block[n]); */
+      res = rol1(res) ^ (ULong)ch;
+   }
+
+   send( sprintf(outBuf, 
+                 "bt{s,r,c}l: final res 0x%llx, carrydep 0x%llx\n",
+                 res, carrydep ));
+   free(block);
+
+   /*------------------------ MEM-W -----------------------*/
+
+   carrydep = 0;
+   block = calloc(200,1);
+   block += 100;
+   /* Valid bit offsets are -800 .. 799 inclusive. */
+
+   for (n = 0; n < 10000; n++) {
+      bitoff = (random() % 1600) - 800;
+      op = random() % 4;
+      c = 2;
+      switch (op) {
+         case 0: c = btsw_mem(block, bitoff); break;
+         case 1: c = btrw_mem(block, bitoff); break;
+         case 2: c = btcw_mem(block, bitoff); break;
+         case 3: c = btw_mem(block, bitoff); break;
+      }
+      c &= 255;
+      assert(c == 0 || c == 1);
+      carrydep = c ? (rol1(carrydep) ^ (Long)(Int)bitoff) : carrydep;
+   }
+
+   /* Compute final result */
+   block -= 100;   
+   res = 0;
+   for (n = 0; n < 200; n++) {
+      UChar ch = block[n];
+      /* printf("%d ", (int)block[n]); */
+      res = rol1(res) ^ (ULong)ch;
+   }
+
+   send( sprintf(outBuf,
+                 "bt{s,r,c}w: final res 0x%llx, carrydep 0x%llx\n",
+                 res, carrydep ));
+   free(block);
+}
+
+
+/////////////////////////////////////////////////////////////////
+
+/* Given a word, do bt/bts/btr/btc on bits 0, 1, 2 and 3 of it, and
+   also reconstruct the original bits 0, 1, 2 and 3 by looking at the
+   carry flag after each op.  The returned result has the mashed bits
+   0-3 at the bottom and the reconstructed original bits 0-3 as bits
+   4-7. */
+
+UInt mash_mem_L ( UInt* origp )
+{
+  UInt reconstructed, mashed;
+  __asm__ __volatile__ (
+     "movl %2, %%edx\n\t"
+     ""
+     "movl $0, %%eax\n\t"
+     "\n\t"
+     "btl  $0, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "lock; btsl $1, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "shll $1, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "lock; btrl $2, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "shll $2, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "lock; btcl $3, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "shll $3, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "movl %%eax, %0\n\t"
+     "movl (%%edx), %1"
+
+     : "=r" (reconstructed), "=r" (mashed)
+     : "r" (origp)
+     : "eax", "ecx", "edx", "cc");
+  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
+}
+
+UInt mash_mem_W ( UShort* origp )
+{
+  UInt reconstructed, mashed;
+  __asm__ __volatile__ (
+     "movl %2, %%edx\n\t"
+     ""
+     "movl $0, %%eax\n\t"
+     "\n\t"
+     "btw  $0, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "lock; btsw $1, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "shll $1, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "lock; btrw $2, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "shll $2, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "lock; btcw $3, (%%edx)\n\t"
+     "setb %%cl\n\t"
+     "movzbl %%cl, %%ecx\n\t"
+     "shll $3, %%ecx\n\t"
+     "orl %%ecx, %%eax\n\t"
+     "\n\t"
+     "movl %%eax, %0\n\t"
+     "movzwl (%%edx), %1"
+
+     : "=r" (reconstructed), "=r" (mashed)
+     : "r" (origp)
+     : "eax", "ecx", "edx", "cc");
+  return (mashed & 0xF) | ((reconstructed & 0xF) << 4);
+}
+
+
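+/* Test the immediate-operand bit-test forms: for each 4-bit value,
+   run the long and word mash sequences above and report the mashed
+   and reconstructed bit patterns. */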
+void do_bt_imm_E_tests( void )
+{
+  int i;
+  UInt*   iil = malloc(sizeof(UInt));
+  UShort* iiw = malloc(sizeof(UShort));
+  for (i = 0; i < 0x10; i++) {
+    *iil = i;
+    *iiw = i;
+    send( sprintf(outBuf, "0x%x -> 0x%02x 0x%02x\n", i, 
+                  mash_mem_L(iil), mash_mem_W(iiw)));
+  }
+  free(iil);
+  free(iiw);
+}
+
+
+
+/////////////////////////////////////////////////////////////////
+
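+/* Run every generated test.  Each routine reports its results via
+   send(), which (in the support code earlier in this file) is
+   presumed to fold the text into a running CRC rather than printing
+   it unless VERBOSE is set; at the end the accumulated CRC is
+   compared against a precomputed expected value, so the reference
+   output stays at a single PASS/FAIL line. */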
+int main ( void )
+{
+  do_locked_G_E_addb();
+  do_locked_G_E_addw();
+  do_locked_G_E_addl();
+
+  do_locked_G_E_orb();
+  do_locked_G_E_orw();
+  do_locked_G_E_orl();
+
+  do_locked_G_E_adcb();
+  do_locked_G_E_adcw();
+  do_locked_G_E_adcl();
+
+  do_locked_G_E_sbbb();
+  do_locked_G_E_sbbw();
+  do_locked_G_E_sbbl();
+
+  do_locked_G_E_andb();
+  do_locked_G_E_andw();
+  do_locked_G_E_andl();
+
+  do_locked_G_E_subb();
+  do_locked_G_E_subw();
+  do_locked_G_E_subl();
+
+  do_locked_G_E_xorb();
+  do_locked_G_E_xorw();
+  do_locked_G_E_xorl();
+  // 21
+  do_locked_imm_E_addb_0x7F();
+  do_locked_imm_E_addb_0xF1();
+  do_locked_imm_E_addw_0x7E();
+  do_locked_imm_E_addw_0x9325();
+  do_locked_imm_E_addl_0x7D();
+  do_locked_imm_E_addl_0x31415927();
+
+  do_locked_imm_E_orb_0x7F();
+  do_locked_imm_E_orb_0xF1();
+  do_locked_imm_E_orw_0x7E();
+  do_locked_imm_E_orw_0x9325();
+  do_locked_imm_E_orl_0x7D();
+  do_locked_imm_E_orl_0x31415927();
+
+  do_locked_imm_E_adcb_0x7F();
+  do_locked_imm_E_adcb_0xF1();
+  do_locked_imm_E_adcw_0x7E();
+  do_locked_imm_E_adcw_0x9325();
+  do_locked_imm_E_adcl_0x7D();
+  do_locked_imm_E_adcl_0x31415927();
+
+  do_locked_imm_E_sbbb_0x7F();
+  do_locked_imm_E_sbbb_0xF1();
+  do_locked_imm_E_sbbw_0x7E();
+  do_locked_imm_E_sbbw_0x9325();
+  do_locked_imm_E_sbbl_0x7D();
+  do_locked_imm_E_sbbl_0x31415927();
+
+  do_locked_imm_E_andb_0x7F();
+  do_locked_imm_E_andb_0xF1();
+  do_locked_imm_E_andw_0x7E();
+  do_locked_imm_E_andw_0x9325();
+  do_locked_imm_E_andl_0x7D();
+  do_locked_imm_E_andl_0x31415927();
+
+  do_locked_imm_E_subb_0x7F();
+  do_locked_imm_E_subb_0xF1();
+  do_locked_imm_E_subw_0x7E();
+  do_locked_imm_E_subw_0x9325();
+  do_locked_imm_E_subl_0x7D();
+  do_locked_imm_E_subl_0x31415927();
+
+  do_locked_imm_E_xorb_0x7F();
+  do_locked_imm_E_xorb_0xF1();
+  do_locked_imm_E_xorw_0x7E();
+  do_locked_imm_E_xorw_0x9325();
+  do_locked_imm_E_xorl_0x7D();
+  do_locked_imm_E_xorl_0x31415927();
+  // 63
+  do_locked_unary_E_decb();
+  do_locked_unary_E_decw();
+  do_locked_unary_E_decl();
+
+  do_locked_unary_E_incb();
+  do_locked_unary_E_incw();
+  do_locked_unary_E_incl();
+
+  do_locked_unary_E_negb();
+  do_locked_unary_E_negw();
+  do_locked_unary_E_negl();
+
+  do_locked_unary_E_notb();
+  do_locked_unary_E_notw();
+  do_locked_unary_E_notl();
+  // 75
+  do_bt_G_E_tests();
+  // 81
+  do_bt_imm_E_tests();
+  // 87
+  // So there should be 87 lock-prefixed instructions in the
+  // disassembly of this compilation unit.
+  // Confirm with:
+  // objdump -d ./x86locked | grep lock | grep -v do_lock | grep -v elf32 | wc
+
+  { UInt crcExpd = 0x8235DC9C;
+    theCRC = crcFinalise( theCRC );
+    if (theCRC == crcExpd) {
+       printf("x86locked: PASS: CRCs actual 0x%08X expected 0x%08X\n",
+              theCRC, crcExpd);
+    } else {
+       printf("x86locked: FAIL: CRCs actual 0x%08X expected 0x%08X\n",
+              theCRC, crcExpd);
+       printf("x86locked: set #define VERBOSE 1 to diagnose\n");
+    }
+  }
+
+  return 0;
+}
diff --git a/none/tests/x86/x86locked.stderr.exp b/none/tests/x86/x86locked.stderr.exp
new file mode 100644 (file)
index 0000000..139597f
--- /dev/null
@@ -0,0 +1,2 @@
+
+
diff --git a/none/tests/x86/x86locked.stdout.exp b/none/tests/x86/x86locked.stdout.exp
new file mode 100644 (file)
index 0000000..e5145cc
--- /dev/null
@@ -0,0 +1 @@
+x86locked: PASS: CRCs actual 0x8235DC9C expected 0x8235DC9C
diff --git a/none/tests/x86/x86locked.vgtest b/none/tests/x86/x86locked.vgtest
new file mode 100644 (file)
index 0000000..5fd4721
--- /dev/null
@@ -0,0 +1 @@
+prog: x86locked