From 418902218c2e214f25f40810a30f955c17ded097 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Tue, 23 Apr 2002 16:52:51 +0000 Subject: [PATCH] Various upgrades, with the effect that mozilla now runs, although it has tremendous performance problems. * Implement pthread_key_{create,delete} and pthread_{set,get}specific. * Implement pthread_cond_timedwait. A nuisance. * New timer infrastructure, based on the RDTSC instruction. This allows fast, accurate time measurement without swamping the host with gettimeofday() syscalls. There's something definitely screwy about the scheduler, making opera run slowly and mozilla run unbelievably slowly. To be investigated. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@119 --- coregrind/arch/x86-linux/vg_libpthread.c | 166 +++++++----- coregrind/vg_include.h | 48 +++- coregrind/vg_libpthread.c | 166 +++++++----- coregrind/vg_main.c | 7 + coregrind/vg_mylibc.c | 137 +++++++++- coregrind/vg_scheduler.c | 310 +++++++++++++++++++++-- tests/pth_specific.c | 104 ++++++++ vg_include.h | 48 +++- vg_libpthread.c | 166 +++++++----- vg_main.c | 7 + vg_mylibc.c | 137 +++++++++- vg_scheduler.c | 310 +++++++++++++++++++++-- 12 files changed, 1344 insertions(+), 262 deletions(-) create mode 100644 tests/pth_specific.c diff --git a/coregrind/arch/x86-linux/vg_libpthread.c b/coregrind/arch/x86-linux/vg_libpthread.c index fb70ec6ea8..76923a3c77 100644 --- a/coregrind/arch/x86-linux/vg_libpthread.c +++ b/coregrind/arch/x86-linux/vg_libpthread.c @@ -134,6 +134,8 @@ static void kludged ( char* msg ) #include #include #include +#include +#include /* gettimeofday */ /* --------------------------------------------------- THREAD ATTRIBUTES @@ -421,6 +423,40 @@ int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) return res; } +int pthread_cond_timedwait ( pthread_cond_t *cond, + pthread_mutex_t *mutex, + const struct timespec *abstime ) +{ + int res; + unsigned int ms_now, ms_end; + struct timeval timeval_now; + unsigned long long int ull_ms_now_after_1970; + unsigned long long int ull_ms_end_after_1970; + + ensure_valgrind("pthread_cond_timedwait"); + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + res = gettimeofday(&timeval_now, NULL); + assert(res == 0); + + ull_ms_now_after_1970 + = 1000ULL * ((unsigned long long int)(timeval_now.tv_sec)) + + ((unsigned long long int)(timeval_now.tv_usec / 1000000)); + ull_ms_end_after_1970 + = 1000ULL * ((unsigned long long int)(abstime->tv_sec)) + + ((unsigned long long int)(abstime->tv_nsec / 1000000)); + assert(ull_ms_end_after_1970 >= ull_ms_now_after_1970); + ms_end + = ms_now + (unsigned int)(ull_ms_end_after_1970 - ull_ms_now_after_1970); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_COND_TIMEDWAIT, + cond, mutex, ms_end, 0); + return res; +} + + int pthread_cond_signal(pthread_cond_t *cond) { int res; @@ -471,8 +507,12 @@ int pthread_cancel(pthread_t thread) int pthread_key_create(pthread_key_t *key, void (*destr_function) (void *)) { - ignored("pthread_key_create"); - return 0; + int res; + ensure_valgrind("pthread_key_create"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_KEY_CREATE, + key, destr_function, 0, 0); + return res; } int pthread_key_delete(pthread_key_t key) @@ -483,14 +523,22 @@ int pthread_key_delete(pthread_key_t key) int pthread_setspecific(pthread_key_t key, const void *pointer) { - ignored("pthread_setspecific"); - return 0; + int res; + 
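/* Aside: a condensed view of the absolute-to-relative time conversion
   that the pthread_cond_timedwait wrapper above performs, with now_ms()
   standing in for the VG_USERREQ__READ_MILLISECOND_TIMER request
   (now_ms() is a hypothetical helper for illustration, not part of
   this patch):

      ms_now_1970 = 1000ULL * tv.tv_sec + tv.tv_usec / 1000;
      ms_end_1970 = 1000ULL * abstime->tv_sec + abstime->tv_nsec / 1000000;
      ms_end      = now_ms() + (unsigned int)(ms_end_1970 - ms_now_1970);

   Note on units: tv_usec holds microseconds, so the microsecond-to-
   millisecond conversion is a division by 1000; dividing tv_usec by
   1000000, as the wrapper above does, yields zero and discards the
   sub-second part of "now".  tv_nsec holds nanoseconds, for which
   dividing by 1000000 is correct. */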
ensure_valgrind("pthread_setspecific"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_SETSPECIFIC, + key, pointer, 0, 0); + return res; } void * pthread_getspecific(pthread_key_t key) { - ignored("pthread_setspecific"); - return NULL; + int res; + ensure_valgrind("pthread_getspecific"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_GETSPECIFIC, + key, 0 , 0, 0); + return (void*)res; } @@ -784,7 +832,6 @@ int do_syscall_select( int n, * (unchecked) libc error numbers (EINTR etc) are the negation of the kernel's error numbers (VKI_EINTR etc). */ -#include int select ( int n, @@ -793,16 +840,19 @@ int select ( int n, fd_set *xfds, struct timeval *timeout ) { + unsigned int ms_now, ms_end; int res; fd_set rfds_copy; fd_set wfds_copy; fd_set xfds_copy; struct vki_timeval t_now; - struct vki_timeval t_end; struct vki_timeval zero_timeout; struct vki_timespec nanosleep_interval; - ensure_valgrind("select"); + /* gcc's complains about ms_end being used uninitialised -- classic + case it can't understand, where ms_end is both defined and used + only if timeout != NULL. Hence ... */ + ms_end = 0; /* We assume that the kernel and libc data layouts are identical for the following types. These asserts provide a crude @@ -811,8 +861,17 @@ int select ( int n, || sizeof(struct timeval) != sizeof(struct vki_timeval)) barf("valgrind's hacky non-blocking select(): data sizes error"); - /* If a zero timeout specified, this call is harmless. */ - if (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) { + /* Detect the current time and simultaneously find out if we are + running on Valgrind. */ + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + + /* If a zero timeout specified, this call is harmless. Also go + this route if we're not running on Valgrind, for whatever + reason. */ + if ( (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) + || (ms_now == 0xFFFFFFFF) ) { res = do_syscall_select( n, (vki_fd_set*)rfds, (vki_fd_set*)wfds, (vki_fd_set*)xfds, @@ -825,35 +884,29 @@ int select ( int n, } } - /* If a timeout was specified, set t_end to be the end wallclock - time. */ + /* If a timeout was specified, set ms_end to be the end millisecond + counter [wallclock] time. */ if (timeout) { res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); assert(res == 0); - t_end = t_now; - t_end.tv_usec += timeout->tv_usec; - t_end.tv_sec += timeout->tv_sec; - if (t_end.tv_usec >= 1000000) { - t_end.tv_usec -= 1000000; - t_end.tv_sec += 1; - } + ms_end = ms_now; + ms_end += (timeout->tv_usec / 1000); + ms_end += (timeout->tv_sec * 1000); /* Stay sane ... */ - assert (t_end.tv_sec > t_now.tv_sec - || (t_end.tv_sec == t_now.tv_sec - && t_end.tv_usec >= t_now.tv_usec)); + assert (ms_end >= ms_now); } /* fprintf(stderr, "MY_SELECT: before loop\n"); */ /* Either timeout == NULL, meaning wait indefinitely, or timeout != - NULL, in which case t_end holds the end time. */ + NULL, in which case ms_end holds the end time. */ while (1) { if (timeout) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - if (t_now.tv_sec > t_end.tv_sec - || (t_now.tv_sec == t_end.tv_sec - && t_now.tv_usec > t_end.tv_usec)) { + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + if (ms_now >= ms_end) { /* timeout; nothing interesting happened. 
*/ if (rfds) FD_ZERO(rfds); if (wfds) FD_ZERO(wfds); @@ -892,7 +945,7 @@ int select ( int n, /* fprintf(stderr, "MY_SELECT: nanosleep\n"); */ /* nanosleep and go round again */ nanosleep_interval.tv_sec = 0; - nanosleep_interval.tv_nsec = 75 * 1000 * 1000; /* 75 milliseconds */ + nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */ /* It's critical here that valgrind's nanosleep implementation is nonblocking. */ (void)my_do_syscall2(__NR_nanosleep, @@ -907,19 +960,28 @@ int select ( int n, int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) { + unsigned int ms_now, ms_end; int res, i; - struct vki_timeval t_now; - struct vki_timeval t_end; struct vki_timespec nanosleep_interval; ensure_valgrind("poll"); + /* Detect the current time and simultaneously find out if we are + running on Valgrind. */ + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + if (/* CHECK SIZES FOR struct pollfd */ sizeof(struct timeval) != sizeof(struct vki_timeval)) barf("valgrind's hacky non-blocking poll(): data sizes error"); - /* If a zero timeout specified, this call is harmless. */ - if (__timeout == 0) { + /* dummy initialisation to keep gcc -Wall happy */ + ms_end = 0; + + /* If a zero timeout specified, this call is harmless. Also do + this if not running on Valgrind. */ + if (__timeout == 0 || ms_now == 0xFFFFFFFF) { res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout); if (is_kerror(res)) { * (__errno_location()) = -res; @@ -929,36 +991,25 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) } } - /* If a timeout was specified, set t_end to be the end wallclock - time. */ + /* If a timeout was specified, set ms_end to be the end wallclock + time. Easy considering that __timeout is in milliseconds. */ if (__timeout > 0) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - t_end = t_now; - t_end.tv_usec += 1000 * (__timeout % 1000); - t_end.tv_sec += (__timeout / 1000); - if (t_end.tv_usec >= 1000000) { - t_end.tv_usec -= 1000000; - t_end.tv_sec += 1; - } - /* Stay sane ... */ - assert (t_end.tv_sec > t_now.tv_sec - || (t_end.tv_sec == t_now.tv_sec - && t_end.tv_usec >= t_now.tv_usec)); + ms_end += (unsigned int)__timeout; } /* fprintf(stderr, "MY_POLL: before loop\n"); */ /* Either timeout < 0, meaning wait indefinitely, or timeout > 0, in which case t_end holds the end time. */ + assert(__timeout != 0); + while (1) { - assert(__timeout != 0); if (__timeout > 0) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - if (t_now.tv_sec > t_end.tv_sec - || (t_now.tv_sec == t_end.tv_sec - && t_now.tv_usec > t_end.tv_usec)) { + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + if (ms_now >= ms_end) { /* timeout; nothing interesting happened. */ for (i = 0; i < __nfds; i++) __fds[i].revents = 0; @@ -966,8 +1017,7 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) } } - /* These could be trashed each time round the loop, so restore - them each time. */ + /* Do a return-immediately poll. */ res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, 0 ); if (is_kerror(res)) { /* Some kind of error. Set errno and return. 
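(Stepping back: the rewritten select() and poll() share one shape -- turn
the caller's timeout into a deadline on Valgrind's millisecond counter,
then alternate zero-timeout probes with short non-blocking sleeps.  A
condensed sketch of that loop, where now_ms() and probe_zero_timeout()
are hypothetical stand-ins for the timer request and the zero-timeout
syscall:

      if (have_timeout)
         ms_end = now_ms() + timeout_ms;    -- a deadline: now plus timeout
      while (1) {
         if (have_timeout && now_ms() >= ms_end)
            return 0;                       -- timed out, nothing happened
         res = probe_zero_timeout();
         if (res != 0)
            return res;                     -- ready fds, or a kernel error
         nanosleep_briefly();               -- must not block the scheduler
      }
)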
*/ @@ -981,7 +1031,7 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) /* fprintf(stderr, "MY_POLL: nanosleep\n"); */ /* nanosleep and go round again */ nanosleep_interval.tv_sec = 0; - nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */ + nanosleep_interval.tv_nsec = 99 * 1000 * 1000; /* 99 milliseconds */ /* It's critical here that valgrind's nanosleep implementation is nonblocking. */ (void)my_do_syscall2(__NR_nanosleep, diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h index 13e3f01ff2..4ae38505f7 100644 --- a/coregrind/vg_include.h +++ b/coregrind/vg_include.h @@ -102,7 +102,7 @@ /* These many bytes below %ESP are considered addressible if we're doing the --workaround-gcc296-bugs hack. */ -#define VG_GCC296_BUG_STACK_SLOP 256 +#define VG_GCC296_BUG_STACK_SLOP /*256*/ 1024 /* The maximum number of calls we're prepared to save in a backtrace. */ @@ -123,10 +123,14 @@ /* The maximum number of pthreads that we support. This is deliberately not very high since our implementation of some of the - scheduler algorithms is surely O(N^2) in the number of threads, - since that's simple, at least. And (in practice) we hope that most + scheduler algorithms is surely O(N) in the number of threads, since + that's simple, at least. And (in practice) we hope that most programs do not need many threads. */ -#define VG_N_THREADS 20 +#define VG_N_THREADS 100 + +/* Maximum number of pthread keys available. Again, we start low until + the need for a higher number presents itself. */ +#define VG_N_THREAD_KEYS 10 /* Number of file descriptors that can simultaneously be waited on for I/O to complete. Perhaps this should be the same as VG_N_THREADS @@ -403,8 +407,15 @@ extern Bool VG_(is_empty_arena) ( ArenaId aid ); #define VG_USERREQ__PTHREAD_CANCEL 0x3007 #define VG_USERREQ__PTHREAD_EXIT 0x3008 #define VG_USERREQ__PTHREAD_COND_WAIT 0x3009 -#define VG_USERREQ__PTHREAD_COND_SIGNAL 0x300A -#define VG_USERREQ__PTHREAD_COND_BROADCAST 0x300B +#define VG_USERREQ__PTHREAD_COND_TIMEDWAIT 0x300A +#define VG_USERREQ__PTHREAD_COND_SIGNAL 0x300B +#define VG_USERREQ__PTHREAD_COND_BROADCAST 0x300C +#define VG_USERREQ__PTHREAD_KEY_CREATE 0x300D +#define VG_USERREQ__PTHREAD_KEY_DELETE 0x300E +#define VG_USERREQ__PTHREAD_SETSPECIFIC 0x300F +#define VG_USERREQ__PTHREAD_GETSPECIFIC 0x3010 + +#define VG_USERREQ__READ_MILLISECOND_TIMER 0x4001 /* Cosmetic ... */ #define VG_USERREQ__GET_PTHREAD_TRACE_LEVEL 0x3101 @@ -466,7 +477,12 @@ typedef ALWAYS == the index in vg_threads[]. */ ThreadId tid; - /* Current scheduling status. */ + /* Current scheduling status. + + Complications: whenever this is set to VgTs_WaitMX, you + should also set .m_edx to whatever the required return value + is for pthread_mutex_lock / pthread_cond_timedwait for when + the mutex finally gets unblocked. */ ThreadStatus status; /* Identity of joiner (thread who called join on me), or @@ -483,12 +499,21 @@ typedef waiting for. In all other cases, should be NULL. */ void* /* pthread_cond_t* */ associated_cv; - /* If VgTs_Sleeping, this is when we should wake up. */ - ULong awaken_at; + /* If VgTs_Sleeping, this is when we should wake up, measured in + milliseconds as supplied by VG_(read_millisecond_counter). + + If VgTs_WaitCV, this indicates the time at which + pthread_cond_timedwait should wake up. If == 0xFFFFFFFF, + this means infinitely far in the future, viz, + pthread_cond_wait. 
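(The 0xFFFFFFFF sentinel also does double duty on the client side:
VALGRIND_MAGIC_SEQUENCE deposits the caller-supplied default into the
result when the program is not running under Valgrind, so a wrapper can
read the timer and detect Valgrind's absence in a single step:

      unsigned int ms;
      VALGRIND_MAGIC_SEQUENCE(ms, 0xFFFFFFFF /* default */,
                              VG_USERREQ__READ_MILLISECOND_TIMER,
                              0, 0, 0, 0);
      if (ms == 0xFFFFFFFF)
         ... fall back to the plain syscall ...

which is exactly the pattern the select() and poll() wrappers above use.)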
*/ + UInt awaken_at; /* return value */ void* retval; + /* thread-specific data */ + void* specifics[VG_N_THREAD_KEYS]; + /* Stacks. When a thread slot is freed, we don't deallocate its stack; we just leave it lying around for the next use of the slot. If the next use of the slot requires a larger stack, @@ -662,7 +687,10 @@ extern Char* VG_(strdup) ( ArenaId aid, const Char* s); extern Char* VG_(getenv) ( Char* name ); extern Int VG_(getpid) ( void ); -extern ULong VG_(read_microsecond_timer)( void ); + +extern void VG_(start_rdtsc_calibration) ( void ); +extern void VG_(end_rdtsc_calibration) ( void ); +extern UInt VG_(read_millisecond_timer) ( void ); extern Char VG_(toupper) ( Char c ); diff --git a/coregrind/vg_libpthread.c b/coregrind/vg_libpthread.c index fb70ec6ea8..76923a3c77 100644 --- a/coregrind/vg_libpthread.c +++ b/coregrind/vg_libpthread.c @@ -134,6 +134,8 @@ static void kludged ( char* msg ) #include #include #include +#include +#include /* gettimeofday */ /* --------------------------------------------------- THREAD ATTRIBUTES @@ -421,6 +423,40 @@ int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) return res; } +int pthread_cond_timedwait ( pthread_cond_t *cond, + pthread_mutex_t *mutex, + const struct timespec *abstime ) +{ + int res; + unsigned int ms_now, ms_end; + struct timeval timeval_now; + unsigned long long int ull_ms_now_after_1970; + unsigned long long int ull_ms_end_after_1970; + + ensure_valgrind("pthread_cond_timedwait"); + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + res = gettimeofday(&timeval_now, NULL); + assert(res == 0); + + ull_ms_now_after_1970 + = 1000ULL * ((unsigned long long int)(timeval_now.tv_sec)) + + ((unsigned long long int)(timeval_now.tv_usec / 1000000)); + ull_ms_end_after_1970 + = 1000ULL * ((unsigned long long int)(abstime->tv_sec)) + + ((unsigned long long int)(abstime->tv_nsec / 1000000)); + assert(ull_ms_end_after_1970 >= ull_ms_now_after_1970); + ms_end + = ms_now + (unsigned int)(ull_ms_end_after_1970 - ull_ms_now_after_1970); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_COND_TIMEDWAIT, + cond, mutex, ms_end, 0); + return res; +} + + int pthread_cond_signal(pthread_cond_t *cond) { int res; @@ -471,8 +507,12 @@ int pthread_cancel(pthread_t thread) int pthread_key_create(pthread_key_t *key, void (*destr_function) (void *)) { - ignored("pthread_key_create"); - return 0; + int res; + ensure_valgrind("pthread_key_create"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_KEY_CREATE, + key, destr_function, 0, 0); + return res; } int pthread_key_delete(pthread_key_t key) @@ -483,14 +523,22 @@ int pthread_key_delete(pthread_key_t key) int pthread_setspecific(pthread_key_t key, const void *pointer) { - ignored("pthread_setspecific"); - return 0; + int res; + ensure_valgrind("pthread_setspecific"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_SETSPECIFIC, + key, pointer, 0, 0); + return res; } void * pthread_getspecific(pthread_key_t key) { - ignored("pthread_setspecific"); - return NULL; + int res; + ensure_valgrind("pthread_getspecific"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_GETSPECIFIC, + key, 0 , 0, 0); + return (void*)res; } @@ -784,7 +832,6 @@ int do_syscall_select( int n, * (unchecked) libc error numbers (EINTR etc) are the negation of the kernel's error numbers (VKI_EINTR etc). 
*/ -#include int select ( int n, @@ -793,16 +840,19 @@ int select ( int n, fd_set *xfds, struct timeval *timeout ) { + unsigned int ms_now, ms_end; int res; fd_set rfds_copy; fd_set wfds_copy; fd_set xfds_copy; struct vki_timeval t_now; - struct vki_timeval t_end; struct vki_timeval zero_timeout; struct vki_timespec nanosleep_interval; - ensure_valgrind("select"); + /* gcc's complains about ms_end being used uninitialised -- classic + case it can't understand, where ms_end is both defined and used + only if timeout != NULL. Hence ... */ + ms_end = 0; /* We assume that the kernel and libc data layouts are identical for the following types. These asserts provide a crude @@ -811,8 +861,17 @@ int select ( int n, || sizeof(struct timeval) != sizeof(struct vki_timeval)) barf("valgrind's hacky non-blocking select(): data sizes error"); - /* If a zero timeout specified, this call is harmless. */ - if (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) { + /* Detect the current time and simultaneously find out if we are + running on Valgrind. */ + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + + /* If a zero timeout specified, this call is harmless. Also go + this route if we're not running on Valgrind, for whatever + reason. */ + if ( (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) + || (ms_now == 0xFFFFFFFF) ) { res = do_syscall_select( n, (vki_fd_set*)rfds, (vki_fd_set*)wfds, (vki_fd_set*)xfds, @@ -825,35 +884,29 @@ int select ( int n, } } - /* If a timeout was specified, set t_end to be the end wallclock - time. */ + /* If a timeout was specified, set ms_end to be the end millisecond + counter [wallclock] time. */ if (timeout) { res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); assert(res == 0); - t_end = t_now; - t_end.tv_usec += timeout->tv_usec; - t_end.tv_sec += timeout->tv_sec; - if (t_end.tv_usec >= 1000000) { - t_end.tv_usec -= 1000000; - t_end.tv_sec += 1; - } + ms_end = ms_now; + ms_end += (timeout->tv_usec / 1000); + ms_end += (timeout->tv_sec * 1000); /* Stay sane ... */ - assert (t_end.tv_sec > t_now.tv_sec - || (t_end.tv_sec == t_now.tv_sec - && t_end.tv_usec >= t_now.tv_usec)); + assert (ms_end >= ms_now); } /* fprintf(stderr, "MY_SELECT: before loop\n"); */ /* Either timeout == NULL, meaning wait indefinitely, or timeout != - NULL, in which case t_end holds the end time. */ + NULL, in which case ms_end holds the end time. */ while (1) { if (timeout) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - if (t_now.tv_sec > t_end.tv_sec - || (t_now.tv_sec == t_end.tv_sec - && t_now.tv_usec > t_end.tv_usec)) { + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + if (ms_now >= ms_end) { /* timeout; nothing interesting happened. */ if (rfds) FD_ZERO(rfds); if (wfds) FD_ZERO(wfds); @@ -892,7 +945,7 @@ int select ( int n, /* fprintf(stderr, "MY_SELECT: nanosleep\n"); */ /* nanosleep and go round again */ nanosleep_interval.tv_sec = 0; - nanosleep_interval.tv_nsec = 75 * 1000 * 1000; /* 75 milliseconds */ + nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */ /* It's critical here that valgrind's nanosleep implementation is nonblocking. 
*/ (void)my_do_syscall2(__NR_nanosleep, @@ -907,19 +960,28 @@ int select ( int n, int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) { + unsigned int ms_now, ms_end; int res, i; - struct vki_timeval t_now; - struct vki_timeval t_end; struct vki_timespec nanosleep_interval; ensure_valgrind("poll"); + /* Detect the current time and simultaneously find out if we are + running on Valgrind. */ + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + if (/* CHECK SIZES FOR struct pollfd */ sizeof(struct timeval) != sizeof(struct vki_timeval)) barf("valgrind's hacky non-blocking poll(): data sizes error"); - /* If a zero timeout specified, this call is harmless. */ - if (__timeout == 0) { + /* dummy initialisation to keep gcc -Wall happy */ + ms_end = 0; + + /* If a zero timeout specified, this call is harmless. Also do + this if not running on Valgrind. */ + if (__timeout == 0 || ms_now == 0xFFFFFFFF) { res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout); if (is_kerror(res)) { * (__errno_location()) = -res; @@ -929,36 +991,25 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) } } - /* If a timeout was specified, set t_end to be the end wallclock - time. */ + /* If a timeout was specified, set ms_end to be the end wallclock + time. Easy considering that __timeout is in milliseconds. */ if (__timeout > 0) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - t_end = t_now; - t_end.tv_usec += 1000 * (__timeout % 1000); - t_end.tv_sec += (__timeout / 1000); - if (t_end.tv_usec >= 1000000) { - t_end.tv_usec -= 1000000; - t_end.tv_sec += 1; - } - /* Stay sane ... */ - assert (t_end.tv_sec > t_now.tv_sec - || (t_end.tv_sec == t_now.tv_sec - && t_end.tv_usec >= t_now.tv_usec)); + ms_end += (unsigned int)__timeout; } /* fprintf(stderr, "MY_POLL: before loop\n"); */ /* Either timeout < 0, meaning wait indefinitely, or timeout > 0, in which case t_end holds the end time. */ + assert(__timeout != 0); + while (1) { - assert(__timeout != 0); if (__timeout > 0) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - if (t_now.tv_sec > t_end.tv_sec - || (t_now.tv_sec == t_end.tv_sec - && t_now.tv_usec > t_end.tv_usec)) { + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + if (ms_now >= ms_end) { /* timeout; nothing interesting happened. */ for (i = 0; i < __nfds; i++) __fds[i].revents = 0; @@ -966,8 +1017,7 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) } } - /* These could be trashed each time round the loop, so restore - them each time. */ + /* Do a return-immediately poll. */ res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, 0 ); if (is_kerror(res)) { /* Some kind of error. Set errno and return. */ @@ -981,7 +1031,7 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) /* fprintf(stderr, "MY_POLL: nanosleep\n"); */ /* nanosleep and go round again */ nanosleep_interval.tv_sec = 0; - nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */ + nanosleep_interval.tv_nsec = 99 * 1000 * 1000; /* 99 milliseconds */ /* It's critical here that valgrind's nanosleep implementation is nonblocking. 
*/ (void)my_do_syscall2(__NR_nanosleep, diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c index 47ea5be073..5f7fe59c34 100644 --- a/coregrind/vg_main.c +++ b/coregrind/vg_main.c @@ -971,6 +971,9 @@ void VG_(main) ( void ) VGP_(init_profiling)(); # endif + /* Start calibration of our RDTSC-based clock. */ + VG_(start_rdtsc_calibration)(); + /* Hook to delay things long enough so we can get the pid and attach GDB in another shell. */ /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */ @@ -984,6 +987,10 @@ void VG_(main) ( void ) VGP_POPCC; } + /* End calibration of our RDTSC-based clock, leaving it as long as + we can. */ + VG_(end_rdtsc_calibration)(); + /* This should come after init_memory_audit; otherwise the latter carefully sets up the permissions maps to cover the anonymous mmaps for the translation table and translation cache, which diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c index a728f42399..740b21e3d1 100644 --- a/coregrind/vg_mylibc.c +++ b/coregrind/vg_mylibc.c @@ -291,13 +291,14 @@ Int VG_(select)( Int n, return res; } -/* Returns -1 on error, but 0 if ok or interrupted. */ +/* Returns -1 on error, 0 if ok, 1 if interrupted. */ Int VG_(nanosleep)( const struct vki_timespec *req, struct vki_timespec *rem ) { Int res; res = vg_do_syscall2(__NR_nanosleep, (UInt)req, (UInt)rem); if (res == -VKI_EINVAL) return -1; + if (res == -VKI_EINTR) return 1; return 0; } @@ -936,17 +937,6 @@ Int VG_(getpid) ( void ) return res; } -/* Read a notional elapsed (wallclock-time) timer, giving a 64-bit - microseconds count. */ -ULong VG_(read_microsecond_timer)( void ) -{ - Int res; - struct vki_timeval tv; - res = vg_do_syscall2(__NR_gettimeofday, (UInt)&tv, (UInt)NULL); - vg_assert(!VG_(is_kerror)(res)); - return (1000000ULL * (ULong)(tv.tv_sec)) + (ULong)(tv.tv_usec); -} - /* Return -1 if error, else 0. NOTE does not indicate return code of child! */ Int VG_(system) ( Char* cmd ) @@ -981,6 +971,129 @@ Int VG_(system) ( Char* cmd ) } +/* --------------------------------------------------------------------- + Support for a millisecond-granularity counter using RDTSC. 
+ ------------------------------------------------------------------ */ + +static __inline__ ULong do_rdtsc_insn ( void ) +{ + ULong x; + __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); + return x; +} + +/* 0 = pre-calibration, 1 = calibration, 2 = running */ +static Int rdtsc_calibration_state = 0; +static ULong rdtsc_ticks_per_millisecond = 0; /* invalid value */ + +static struct vki_timeval rdtsc_cal_start_timeval; +static struct vki_timeval rdtsc_cal_end_timeval; + +static ULong rdtsc_cal_start_raw; +static ULong rdtsc_cal_end_raw; + +UInt VG_(read_millisecond_timer) ( void ) +{ + ULong rdtsc_now; + vg_assert(rdtsc_calibration_state == 2); + rdtsc_now = do_rdtsc_insn(); + vg_assert(rdtsc_now > rdtsc_cal_end_raw); + rdtsc_now -= rdtsc_cal_end_raw; + rdtsc_now /= rdtsc_ticks_per_millisecond; + return (UInt)rdtsc_now; +} + + +void VG_(start_rdtsc_calibration) ( void ) +{ + Int res; + vg_assert(rdtsc_calibration_state == 0); + rdtsc_calibration_state = 1; + rdtsc_cal_start_raw = do_rdtsc_insn(); + res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_start_timeval, + (UInt)NULL); + vg_assert(!VG_(is_kerror)(res)); +} + +void VG_(end_rdtsc_calibration) ( void ) +{ + Int res, loops; + ULong cpu_clock_MHZ; + ULong cal_clock_ticks; + ULong cal_wallclock_microseconds; + ULong wallclock_start_microseconds; + ULong wallclock_end_microseconds; + struct vki_timespec req; + struct vki_timespec rem; + + vg_assert(rdtsc_calibration_state == 1); + rdtsc_calibration_state = 2; + + /* Try and delay for 20 milliseconds, so that we can at least have + some minimum level of accuracy. */ + req.tv_sec = 0; + req.tv_nsec = 20 * 1000 * 1000; + loops = 0; + while (True) { + res = VG_(nanosleep)(&req, &rem); + vg_assert(res == 0 /*ok*/ || res == 1 /*interrupted*/); + if (res == 0) + break; + if (rem.tv_sec == 0 && rem.tv_nsec == 0) + break; + req = rem; + loops++; + if (loops > 100) + VG_(panic)("calibration nanosleep loop failed?!"); + } + + /* Now read both timers, and do the Math. */ + rdtsc_cal_end_raw = do_rdtsc_insn(); + res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_end_timeval, + (UInt)NULL); + + vg_assert(rdtsc_cal_end_raw > rdtsc_cal_start_raw); + cal_clock_ticks = rdtsc_cal_end_raw - rdtsc_cal_start_raw; + + wallclock_start_microseconds + = (1000000ULL * (ULong)(rdtsc_cal_start_timeval.tv_sec)) + + (ULong)(rdtsc_cal_start_timeval.tv_usec); + wallclock_end_microseconds + = (1000000ULL * (ULong)(rdtsc_cal_end_timeval.tv_sec)) + + (ULong)(rdtsc_cal_end_timeval.tv_usec); + vg_assert(wallclock_end_microseconds > wallclock_start_microseconds); + cal_wallclock_microseconds + = wallclock_end_microseconds - wallclock_start_microseconds; + + /* Since we just nanoslept for 20 ms ... */ + vg_assert(cal_wallclock_microseconds >= 20000); + + /* Now we know (roughly) that cal_clock_ticks on RDTSC take + cal_wallclock_microseconds elapsed time. Calculate the RDTSC + ticks-per-millisecond value. */ + if (0) + VG_(printf)("%lld ticks in %lld microseconds\n", + cal_clock_ticks, cal_wallclock_microseconds ); + + rdtsc_ticks_per_millisecond + = cal_clock_ticks / (cal_wallclock_microseconds / 1000ULL); + cpu_clock_MHZ + = (1000ULL * rdtsc_ticks_per_millisecond) / 1000000ULL; + if (VG_(clo_verbosity) >= 1) + VG_(message)(Vg_UserMsg, "Estimated CPU clock rate is %d MHz", + (UInt)cpu_clock_MHZ); + if (cpu_clock_MHZ < 100 || cpu_clock_MHZ > 10000) + VG_(panic)("end_rdtsc_calibration: " + "estimated CPU MHz outside range 100 .. 10000"); + /* Paranoia about division by zero later. 
*/ + vg_assert(rdtsc_ticks_per_millisecond != 0); + if (0) + VG_(printf)("ticks per millisecond %llu\n", + rdtsc_ticks_per_millisecond); +} + + + /* --------------------------------------------------------------------- Primitive support for bagging memory via mmap. ------------------------------------------------------------------ */ diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c index 68dbf19a4f..32201b9383 100644 --- a/coregrind/vg_scheduler.c +++ b/coregrind/vg_scheduler.c @@ -119,7 +119,26 @@ typedef static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS]; +/* Keeping track of keys. */ +typedef + struct { + /* Has this key been allocated ? */ + Bool inuse; + /* If .inuse==True, records the address of the associated + destructor, or NULL if none. */ + void (*destructor)(void*); + } + ThreadKeyState; + +/* And our array of thread keys. */ +static ThreadKeyState vg_thread_keys[VG_N_THREAD_KEYS]; + +typedef UInt ThreadKey; + + /* Forwards */ +static void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid ); + static void do_nontrivial_clientreq ( ThreadId tid ); static void scheduler_sanity ( void ); @@ -511,6 +530,11 @@ void VG_(scheduler_init) ( void ) for (i = 0; i < VG_N_WAITING_FDS; i++) vg_waiting_fds[i].fd = -1; /* not in use */ + for (i = 0; i < VG_N_THREAD_KEYS; i++) { + vg_thread_keys[i].inuse = False; + vg_thread_keys[i].destructor = NULL; + } + /* Assert this is thread zero, which has certain magic properties. */ tid_main = vg_alloc_ThreadState(); @@ -523,6 +547,8 @@ void VG_(scheduler_init) ( void ) vg_threads[tid_main].retval = NULL; /* not important */ vg_threads[tid_main].stack_highest_word = vg_threads[tid_main].m_esp /* -4 ??? */; + for (i = 0; i < VG_N_THREAD_KEYS; i++) + vg_threads[tid_main].specifics[i] = NULL; /* Copy VG_(baseBlock) state to tid_main's slot. */ vg_tid_currently_in_baseBlock = tid_main; @@ -618,13 +644,16 @@ Bool maybe_do_trivial_clientreq ( ThreadId tid ) (UInt)VG_(client_memalign) ( tst, arg[1], arg[2] ) ); - /* These are heavily used. */ + /* These are heavily used -- or at least we want them to be + cheap. */ case VG_USERREQ__PTHREAD_GET_THREADID: SIMPLE_RETURN(tid); case VG_USERREQ__RUNNING_ON_VALGRIND: SIMPLE_RETURN(1); case VG_USERREQ__GET_PTHREAD_TRACE_LEVEL: SIMPLE_RETURN(VG_(clo_trace_pthread_level)); + case VG_USERREQ__READ_MILLISECOND_TIMER: + SIMPLE_RETURN(VG_(read_millisecond_timer)()); default: /* Too hard; wimp out. */ @@ -692,18 +721,18 @@ void sched_do_syscall ( ThreadId tid ) syscall_no = vg_threads[tid].m_eax; /* syscall number */ if (syscall_no == __NR_nanosleep) { - ULong t_now, t_awaken; + UInt t_now, t_awaken; struct vki_timespec* req; req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */ - t_now = VG_(read_microsecond_timer)(); + t_now = VG_(read_millisecond_timer)(); t_awaken = t_now - + (ULong)1000000ULL * (ULong)(req->tv_sec) - + (ULong)( (UInt)(req->tv_nsec) / 1000 ); + + (UInt)1000ULL * (UInt)(req->tv_sec) + + (UInt)(req->tv_nsec) / 1000000; vg_threads[tid].status = VgTs_Sleeping; vg_threads[tid].awaken_at = t_awaken; if (VG_(clo_trace_sched)) { - VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu", + VG_(sprintf)(msg_buf, "at %d: nanosleep for %d", t_now, t_awaken-t_now); print_sched_event(tid, msg_buf); } @@ -820,16 +849,16 @@ void poll_for_ready_fds ( void ) Char msg_buf[100]; struct vki_timespec* rem; - ULong t_now; + UInt t_now; /* Awaken any sleeping threads whose sleep has expired. 
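(For example, a thread that entered nanosleep with tv_sec == 1 and
tv_nsec == 500000000 when the millisecond timer read 40210 -- an
illustrative value -- was given awaken_at = 40210 + 1*1000 +
500000000/1000000 = 41710, and is made Runnable again once the timer
reaches that point.)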
*/ for (tid = 1; tid < VG_N_THREADS; tid++) if (vg_threads[tid].status == VgTs_Sleeping) break; - /* Avoid pointless calls to VG_(read_microsecond_timer). */ + /* Avoid pointless calls to VG_(read_millisecond_timer). */ if (tid < VG_N_THREADS) { - t_now = VG_(read_microsecond_timer)(); + t_now = VG_(read_millisecond_timer)(); for (tid = 1; tid < VG_N_THREADS; tid++) { if (vg_threads[tid].status != VgTs_Sleeping) continue; @@ -848,7 +877,7 @@ void poll_for_ready_fds ( void ) /* Reschedule this thread. */ vg_threads[tid].status = VgTs_Runnable; if (VG_(clo_trace_sched)) { - VG_(sprintf)(msg_buf, "at %lu: nanosleep done", + VG_(sprintf)(msg_buf, "at %d: nanosleep done", t_now); print_sched_event(tid, msg_buf); } @@ -1004,6 +1033,21 @@ void complete_blocked_syscalls ( void ) } +static +void check_for_pthread_cond_timedwait ( void ) +{ + Int i; + for (i = 1; i < VG_N_THREADS; i++) { + if (vg_threads[i].status != VgTs_WaitCV) + continue; + if (vg_threads[i].awaken_at == 0xFFFFFFFF /* no timeout */) + continue; + if (VG_(read_millisecond_timer)() >= vg_threads[i].awaken_at) + do_pthread_cond_timedwait_TIMEOUT(i); + } +} + + static void nanosleep_for_a_while ( void ) { @@ -1011,10 +1055,9 @@ void nanosleep_for_a_while ( void ) struct vki_timespec req; struct vki_timespec rem; req.tv_sec = 0; - req.tv_nsec = 20 * 1000 * 1000; + req.tv_nsec = 50 * 1000 * 1000; res = VG_(nanosleep)( &req, &rem ); - /* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */ - vg_assert(res == 0); + vg_assert(res == 0 /* ok */ || res == 1 /* interrupted by signal */); } @@ -1079,6 +1122,7 @@ VgSchedReturnCode VG_(scheduler) ( void ) threads. */ poll_for_ready_fds(); complete_blocked_syscalls(); + check_for_pthread_cond_timedwait(); /* See if there are any signals which need to be delivered. If so, choose thread(s) to deliver them to, and build signal @@ -1527,6 +1571,7 @@ void do_pthread_create ( ThreadId parent_tid, void* (*start_routine)(void *), void* arg ) { + Int i; Addr new_stack; UInt new_stk_szb; ThreadId tid; @@ -1607,6 +1652,9 @@ void do_pthread_create ( ThreadId parent_tid, vg_threads[tid].joiner = VG_INVALID_THREADID; vg_threads[tid].status = VgTs_Runnable; + for (i = 0; i < VG_N_THREAD_KEYS; i++) + vg_threads[tid].specifics[i] = NULL; + /* return zero */ vg_threads[tid].m_edx = 0; /* success */ } @@ -1691,7 +1739,7 @@ void release_one_thread_waiting_on_mutex ( pthread_mutex_t* mutex, mutex->__m_owner = (_pthread_descr)i; vg_threads[i].status = VgTs_Runnable; vg_threads[i].associated_mx = NULL; - vg_threads[i].m_edx = 0; /* pth_lock() success */ + /* m_edx already holds pth_mx_lock() success (0) */ if (VG_(clo_trace_pthread_level) >= 1) { VG_(sprintf)(msg_buf, "%s mx %p: RESUME", @@ -1773,7 +1821,7 @@ void do_pthread_mutex_lock( ThreadId tid, } else { vg_threads[tid].status = VgTs_WaitMX; vg_threads[tid].associated_mx = mutex; - /* No assignment to %EDX, since we're blocking. */ + vg_threads[tid].m_edx = 0; /* pth_mx_lock success value */ if (VG_(clo_trace_pthread_level) >= 1) { VG_(sprintf)(msg_buf, "%s mx %p: BLOCK", caller, mutex ); @@ -1890,6 +1938,56 @@ void do_pthread_mutex_unlock ( ThreadId tid, don't need to think too hard there. 
*/
+static
+void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid )
+{
+   Char msg_buf[100];
+   pthread_mutex_t* mx;
+   pthread_cond_t* cv;
+
+   vg_assert(is_valid_tid(tid)
+             && vg_threads[tid].status == VgTs_WaitCV
+             && vg_threads[tid].awaken_at != 0xFFFFFFFF);
+   mx = vg_threads[tid].associated_mx;
+   vg_assert(mx != NULL);
+   cv = vg_threads[tid].associated_cv;
+   vg_assert(cv != NULL);
+
+   if (mx->__m_owner == VG_INVALID_THREADID) {
+      /* Currently unheld; hand it out to thread tid. */
+      vg_assert(mx->__m_count == 0);
+      vg_threads[tid].status = VgTs_Runnable;
+      vg_threads[tid].m_edx = ETIMEDOUT;
+      /* pthread_cond_timedwait return value */
+      vg_threads[tid].associated_cv = NULL;
+      vg_threads[tid].associated_mx = NULL;
+      mx->__m_owner = (_pthread_descr)tid;
+      mx->__m_count = 1;
+
+      if (VG_(clo_trace_pthread_level) >= 1) {
+         VG_(sprintf)(msg_buf, "pthread_cond_timedwait cv %p: TIMEOUT with mx %p",
+                      cv, mx );
+         print_pthread_event(tid, msg_buf);
+      }
+   } else {
+      /* Currently held. Make thread tid be blocked on it. */
+      vg_assert(mx->__m_count > 0);
+      vg_threads[tid].status = VgTs_WaitMX;
+      vg_threads[tid].m_edx = ETIMEDOUT;
+      /* pthread_cond_timedwait return value */
+      vg_threads[tid].associated_cv = NULL;
+      vg_threads[tid].associated_mx = mx;
+      if (VG_(clo_trace_pthread_level) >= 1) {
+         VG_(sprintf)(msg_buf,
+                      "pthread_cond_timedwait cv %p: TIMEOUT -> BLOCK for mx %p",
+                      cv, mx );
+         print_pthread_event(tid, msg_buf);
+      }
+
+   }
+}
+
+
 static
 void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
                                          Int n_to_release,
@@ -1920,8 +2018,6 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
      mx = vg_threads[i].associated_mx;
      vg_assert(mx != NULL);
-     vg_assert(mx->__m_count > 0);
-     vg_assert(is_valid_tid((ThreadId)mx->__m_owner));
      if (mx->__m_owner == VG_INVALID_THREADID) {
         /* Currently unheld; hand it out to thread i. */
@@ -1931,7 +2027,7 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
         vg_threads[i].associated_mx = NULL;
         mx->__m_owner = (_pthread_descr)i;
         mx->__m_count = 1;
-        vg_threads[i].m_edx = 0; /* pthread_cond_wait returns success */
+        /* .m_edx already holds pth_cond_wait success value (0) */
         if (VG_(clo_trace_pthread_level) >= 1) {
            VG_(sprintf)(msg_buf, "%s cv %p: RESUME with mx %p",
@@ -1941,9 +2037,11 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
      } else {
         /* Currently held. Make thread i be blocked on it. */
+        vg_assert(mx->__m_count > 0);
         vg_threads[i].status = VgTs_WaitMX;
         vg_threads[i].associated_cv = NULL;
         vg_threads[i].associated_mx = mx;
+        vg_threads[i].m_edx = 0; /* pth_cond_wait success value */
         if (VG_(clo_trace_pthread_level) >= 1) {
            VG_(sprintf)(msg_buf, "%s cv %p: BLOCK for mx %p",
@@ -1961,14 +2059,18 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
 static
 void do_pthread_cond_wait ( ThreadId tid,
                             pthread_cond_t *cond,
-                            pthread_mutex_t *mutex )
+                            pthread_mutex_t *mutex,
+                            UInt ms_end )
 {
    Char msg_buf[100];
+   /* If ms_end == 0xFFFFFFFF, wait forever (no timeout). Otherwise,
+      ms_end is the ending millisecond. */
+
   /* pre: mutex should be a valid mutex and owned by tid. 
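(Client-visible effect of the TIMEOUT handler above: pthread_cond_timedwait
returns ETIMEDOUT with the mutex reacquired, either immediately or after
a further VgTs_WaitMX wait if the mutex was held when the timeout fired.
So the standard calling pattern behaves as POSIX requires -- a usage
sketch, where predicate, mx, cv and ts stand for the client's own
variables:

      pthread_mutex_lock(&mx);
      while (!predicate) {
         int rc = pthread_cond_timedwait(&cv, &mx, &ts);
         if (rc == ETIMEDOUT)
            break;               -- mutex is held again at this point
      }
      pthread_mutex_unlock(&mx);
)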
*/ if (VG_(clo_trace_pthread_level) >= 2) { - VG_(sprintf)(msg_buf, "pthread_cond_wait cv %p, mx %p ...", - cond, mutex ); + VG_(sprintf)(msg_buf, "pthread_cond_wait cv %p, mx %p, end %d ...", + cond, mutex, ms_end ); print_pthread_event(tid, msg_buf); } @@ -2007,6 +2109,7 @@ void do_pthread_cond_wait ( ThreadId tid, vg_threads[tid].status = VgTs_WaitCV; vg_threads[tid].associated_cv = cond; vg_threads[tid].associated_mx = mutex; + vg_threads[tid].awaken_at = ms_end; if (VG_(clo_trace_pthread_level) >= 1) { VG_(sprintf)(msg_buf, @@ -2055,6 +2158,133 @@ void do_pthread_cond_signal_or_broadcast ( ThreadId tid, } +/* ----------------------------------------------------------- + THREAD SPECIFIC DATA + -------------------------------------------------------- */ + +static __inline__ +Bool is_valid_key ( ThreadKey k ) +{ + /* k unsigned; hence no < 0 check */ + if (k >= VG_N_THREAD_KEYS) return False; + if (!vg_thread_keys[k].inuse) return False; + return True; +} + +static +void do_pthread_key_create ( ThreadId tid, + pthread_key_t* key, + void (*destructor)(void*) ) +{ + Int i; + Char msg_buf[100]; + + if (VG_(clo_trace_pthread_level) >= 1) { + VG_(sprintf)(msg_buf, "pthread_key_create *key %p, destr %p", + key, destructor ); + print_pthread_event(tid, msg_buf); + } + + vg_assert(sizeof(pthread_key_t) == sizeof(ThreadKey)); + vg_assert(is_valid_tid(tid) + && vg_threads[tid].status == VgTs_Runnable); + + for (i = 0; i < VG_N_THREAD_KEYS; i++) + if (!vg_thread_keys[i].inuse) + break; + + if (i == VG_N_THREAD_KEYS) { + /* vg_threads[tid].m_edx = EAGAIN; + return; + */ + VG_(panic)("pthread_key_create: VG_N_THREAD_KEYS is too low;" + " increase and recompile"); + } + + vg_thread_keys[i].inuse = True; + /* TODO: check key for addressibility */ + *key = i; + vg_threads[tid].m_edx = 0; +} + + +static +void do_pthread_key_delete ( ThreadId tid, pthread_key_t key ) +{ + Char msg_buf[100]; + if (VG_(clo_trace_pthread_level) >= 1) { + VG_(sprintf)(msg_buf, "pthread_key_delete key %d", + key ); + print_pthread_event(tid, msg_buf); + } + + vg_assert(is_valid_tid(tid) + && vg_threads[tid].status == VgTs_Runnable); + + if (!is_valid_key(key)) { + vg_threads[tid].m_edx = EINVAL; + return; + } + + vg_thread_keys[key].inuse = False; + + /* Optional. We're not required to do this, although it shouldn't + make any difference to programs which use the key/specifics + functions correctly. 
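(One loose end worth noting: vg_thread_keys records each key's
destructor, but nothing in the code shown here invokes destructors when
a thread exits; the test program below registers free_time as such a
destructor.  A hypothetical sketch of a destructor pass at thread exit,
using only the structures defined above -- not code from this patch:

      for (k = 0; k < VG_N_THREAD_KEYS; k++) {
         void* v = vg_threads[tid].specifics[k];
         if (vg_thread_keys[k].inuse
             && vg_thread_keys[k].destructor != NULL
             && v != NULL) {
            vg_threads[tid].specifics[k] = NULL;
            vg_thread_keys[k].destructor(v);
         }
      }
)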
*/ + for (tid = 1; tid < VG_N_THREADS; tid++) { + if (vg_threads[tid].status != VgTs_Empty) + vg_threads[tid].specifics[key] = NULL; + } +} + + +static +void do_pthread_getspecific ( ThreadId tid, pthread_key_t key ) +{ + Char msg_buf[100]; + if (VG_(clo_trace_pthread_level) >= 1) { + VG_(sprintf)(msg_buf, "pthread_getspecific key %d", + key ); + print_pthread_event(tid, msg_buf); + } + + vg_assert(is_valid_tid(tid) + && vg_threads[tid].status == VgTs_Runnable); + + if (!is_valid_key(key)) { + vg_threads[tid].m_edx = (UInt)NULL; + return; + } + + vg_threads[tid].m_edx = (UInt)vg_threads[tid].specifics[key]; +} + + +static +void do_pthread_setspecific ( ThreadId tid, + pthread_key_t key, + void *pointer ) +{ + Char msg_buf[100]; + if (VG_(clo_trace_pthread_level) >= 1) { + VG_(sprintf)(msg_buf, "pthread_setspecific key %d, ptr %p", + key, pointer ); + print_pthread_event(tid, msg_buf); + } + + vg_assert(is_valid_tid(tid) + && vg_threads[tid].status == VgTs_Runnable); + + if (!is_valid_key(key)) { + vg_threads[tid].m_edx = EINVAL; + return; + } + + vg_threads[tid].specifics[key] = pointer; + vg_threads[tid].m_edx = 0; +} + + /* --------------------------------------------------------------------- Handle non-trivial client requests. ------------------------------------------------------------------ */ @@ -2105,7 +2335,15 @@ void do_nontrivial_clientreq ( ThreadId tid ) case VG_USERREQ__PTHREAD_COND_WAIT: do_pthread_cond_wait( tid, (pthread_cond_t *)(arg[1]), - (pthread_mutex_t *)(arg[2]) ); + (pthread_mutex_t *)(arg[2]), + 0xFFFFFFFF /* no timeout */ ); + break; + + case VG_USERREQ__PTHREAD_COND_TIMEDWAIT: + do_pthread_cond_wait( tid, + (pthread_cond_t *)(arg[1]), + (pthread_mutex_t *)(arg[2]), + arg[3] /* timeout millisecond point */ ); break; case VG_USERREQ__PTHREAD_COND_SIGNAL: @@ -2122,6 +2360,28 @@ void do_nontrivial_clientreq ( ThreadId tid ) (pthread_cond_t *)(arg[1]) ); break; + case VG_USERREQ__PTHREAD_KEY_CREATE: + do_pthread_key_create ( tid, + (pthread_key_t*)(arg[1]), + (void(*)(void*))(arg[2]) ); + break; + + case VG_USERREQ__PTHREAD_KEY_DELETE: + do_pthread_key_delete ( tid, + (pthread_key_t)(arg[1]) ); + break; + + case VG_USERREQ__PTHREAD_GETSPECIFIC: + do_pthread_getspecific ( tid, + (pthread_key_t)(arg[1]) ); + break; + + case VG_USERREQ__PTHREAD_SETSPECIFIC: + do_pthread_setspecific ( tid, + (pthread_key_t)(arg[1]), + (void*)(arg[2]) ); + break; + case VG_USERREQ__MAKE_NOACCESS: case VG_USERREQ__MAKE_WRITABLE: case VG_USERREQ__MAKE_READABLE: @@ -2160,6 +2420,7 @@ void scheduler_sanity ( void ) pthread_mutex_t* mx; pthread_cond_t* cv; Int i; + /* VG_(printf)("scheduler_sanity\n"); */ for (i = 1; i < VG_N_THREADS; i++) { mx = vg_threads[i].associated_mx; @@ -2190,6 +2451,11 @@ void scheduler_sanity ( void ) /* vg_assert(mx == NULL); */ } } + + for (i = 0; i < VG_N_THREAD_KEYS; i++) { + if (!vg_thread_keys[i].inuse) + vg_assert(vg_thread_keys[i].destructor == NULL); + } } diff --git a/tests/pth_specific.c b/tests/pth_specific.c new file mode 100644 index 0000000000..d71a6b2d79 --- /dev/null +++ b/tests/pth_specific.c @@ -0,0 +1,104 @@ +/******************************************************** + * An example source module to accompany... + * + * "Using POSIX Threads: Programming with Pthreads" + * by Brad nichols, Dick Buttlar, Jackie Farrell + * O'Reilly & Associates, Inc. 
+ * + ******************************************************** + * specific.c + * + */ +#include +#include +#include + +#include + +#include + +#define NUM_THREADS 3 +pthread_key_t saved_time_key; + + +void free_time(void *arg ) +{ + struct timeval *timev=(struct timeval *)arg; + printf("free_time:\n"); + free(timev); +} + +void save_the_time(void) +{ + struct timeval *timev; + + timev = (struct timeval *)malloc(sizeof(struct timeval)); + + gettimeofday(timev, NULL); + + printf("save_the_time: \t\t%ld %ld\n",timev->tv_sec, timev->tv_usec); + + + pthread_setspecific(saved_time_key, (void *)timev); + +} + +void what_time_did_i_save(void) +{ + struct timeval *timev; + + timev = pthread_getspecific(saved_time_key); + + printf("what_time_did_i_save: \t%ld %ld\n",timev->tv_sec, timev->tv_usec); + +} + +void *thread_routine(void *arg) +{ + int *my_id=(int *)arg; + + printf("thread_routine %d\n", *my_id); + + save_the_time(); + + what_time_did_i_save(); + + return(NULL); +} + +extern int +main(void) +{ + int i, *id_arg; + pthread_t threads[NUM_THREADS]; + + id_arg = (int *)malloc(NUM_THREADS*sizeof(int)); + + printf("main : initializing the key\n"); + pthread_key_create(&saved_time_key, free_time); + + printf("main : spawing the threads\n"); + for (i = 0; i < NUM_THREADS; i++) { + + id_arg[i] = i; + + pthread_create(&(threads[i]), + NULL, + thread_routine, + (void *) &(id_arg[i])); + } + + + for (i = 0; i < NUM_THREADS; i++) { + pthread_join(threads[i], NULL); + printf("main : thread %d has finished. \n", i); + } + + printf("main : goodbye\n"); + + return 0; +} + + + + diff --git a/vg_include.h b/vg_include.h index 13e3f01ff2..4ae38505f7 100644 --- a/vg_include.h +++ b/vg_include.h @@ -102,7 +102,7 @@ /* These many bytes below %ESP are considered addressible if we're doing the --workaround-gcc296-bugs hack. */ -#define VG_GCC296_BUG_STACK_SLOP 256 +#define VG_GCC296_BUG_STACK_SLOP /*256*/ 1024 /* The maximum number of calls we're prepared to save in a backtrace. */ @@ -123,10 +123,14 @@ /* The maximum number of pthreads that we support. This is deliberately not very high since our implementation of some of the - scheduler algorithms is surely O(N^2) in the number of threads, - since that's simple, at least. And (in practice) we hope that most + scheduler algorithms is surely O(N) in the number of threads, since + that's simple, at least. And (in practice) we hope that most programs do not need many threads. */ -#define VG_N_THREADS 20 +#define VG_N_THREADS 100 + +/* Maximum number of pthread keys available. Again, we start low until + the need for a higher number presents itself. */ +#define VG_N_THREAD_KEYS 10 /* Number of file descriptors that can simultaneously be waited on for I/O to complete. Perhaps this should be the same as VG_N_THREADS @@ -403,8 +407,15 @@ extern Bool VG_(is_empty_arena) ( ArenaId aid ); #define VG_USERREQ__PTHREAD_CANCEL 0x3007 #define VG_USERREQ__PTHREAD_EXIT 0x3008 #define VG_USERREQ__PTHREAD_COND_WAIT 0x3009 -#define VG_USERREQ__PTHREAD_COND_SIGNAL 0x300A -#define VG_USERREQ__PTHREAD_COND_BROADCAST 0x300B +#define VG_USERREQ__PTHREAD_COND_TIMEDWAIT 0x300A +#define VG_USERREQ__PTHREAD_COND_SIGNAL 0x300B +#define VG_USERREQ__PTHREAD_COND_BROADCAST 0x300C +#define VG_USERREQ__PTHREAD_KEY_CREATE 0x300D +#define VG_USERREQ__PTHREAD_KEY_DELETE 0x300E +#define VG_USERREQ__PTHREAD_SETSPECIFIC 0x300F +#define VG_USERREQ__PTHREAD_GETSPECIFIC 0x3010 + +#define VG_USERREQ__READ_MILLISECOND_TIMER 0x4001 /* Cosmetic ... 
*/ #define VG_USERREQ__GET_PTHREAD_TRACE_LEVEL 0x3101 @@ -466,7 +477,12 @@ typedef ALWAYS == the index in vg_threads[]. */ ThreadId tid; - /* Current scheduling status. */ + /* Current scheduling status. + + Complications: whenever this is set to VgTs_WaitMX, you + should also set .m_edx to whatever the required return value + is for pthread_mutex_lock / pthread_cond_timedwait for when + the mutex finally gets unblocked. */ ThreadStatus status; /* Identity of joiner (thread who called join on me), or @@ -483,12 +499,21 @@ typedef waiting for. In all other cases, should be NULL. */ void* /* pthread_cond_t* */ associated_cv; - /* If VgTs_Sleeping, this is when we should wake up. */ - ULong awaken_at; + /* If VgTs_Sleeping, this is when we should wake up, measured in + milliseconds as supplied by VG_(read_millisecond_counter). + + If VgTs_WaitCV, this indicates the time at which + pthread_cond_timedwait should wake up. If == 0xFFFFFFFF, + this means infinitely far in the future, viz, + pthread_cond_wait. */ + UInt awaken_at; /* return value */ void* retval; + /* thread-specific data */ + void* specifics[VG_N_THREAD_KEYS]; + /* Stacks. When a thread slot is freed, we don't deallocate its stack; we just leave it lying around for the next use of the slot. If the next use of the slot requires a larger stack, @@ -662,7 +687,10 @@ extern Char* VG_(strdup) ( ArenaId aid, const Char* s); extern Char* VG_(getenv) ( Char* name ); extern Int VG_(getpid) ( void ); -extern ULong VG_(read_microsecond_timer)( void ); + +extern void VG_(start_rdtsc_calibration) ( void ); +extern void VG_(end_rdtsc_calibration) ( void ); +extern UInt VG_(read_millisecond_timer) ( void ); extern Char VG_(toupper) ( Char c ); diff --git a/vg_libpthread.c b/vg_libpthread.c index fb70ec6ea8..76923a3c77 100644 --- a/vg_libpthread.c +++ b/vg_libpthread.c @@ -134,6 +134,8 @@ static void kludged ( char* msg ) #include #include #include +#include +#include /* gettimeofday */ /* --------------------------------------------------- THREAD ATTRIBUTES @@ -421,6 +423,40 @@ int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) return res; } +int pthread_cond_timedwait ( pthread_cond_t *cond, + pthread_mutex_t *mutex, + const struct timespec *abstime ) +{ + int res; + unsigned int ms_now, ms_end; + struct timeval timeval_now; + unsigned long long int ull_ms_now_after_1970; + unsigned long long int ull_ms_end_after_1970; + + ensure_valgrind("pthread_cond_timedwait"); + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + res = gettimeofday(&timeval_now, NULL); + assert(res == 0); + + ull_ms_now_after_1970 + = 1000ULL * ((unsigned long long int)(timeval_now.tv_sec)) + + ((unsigned long long int)(timeval_now.tv_usec / 1000000)); + ull_ms_end_after_1970 + = 1000ULL * ((unsigned long long int)(abstime->tv_sec)) + + ((unsigned long long int)(abstime->tv_nsec / 1000000)); + assert(ull_ms_end_after_1970 >= ull_ms_now_after_1970); + ms_end + = ms_now + (unsigned int)(ull_ms_end_after_1970 - ull_ms_now_after_1970); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_COND_TIMEDWAIT, + cond, mutex, ms_end, 0); + return res; +} + + int pthread_cond_signal(pthread_cond_t *cond) { int res; @@ -471,8 +507,12 @@ int pthread_cancel(pthread_t thread) int pthread_key_create(pthread_key_t *key, void (*destr_function) (void *)) { - ignored("pthread_key_create"); - return 0; + int res; + ensure_valgrind("pthread_key_create"); + 
VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_KEY_CREATE, + key, destr_function, 0, 0); + return res; } int pthread_key_delete(pthread_key_t key) @@ -483,14 +523,22 @@ int pthread_key_delete(pthread_key_t key) int pthread_setspecific(pthread_key_t key, const void *pointer) { - ignored("pthread_setspecific"); - return 0; + int res; + ensure_valgrind("pthread_setspecific"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_SETSPECIFIC, + key, pointer, 0, 0); + return res; } void * pthread_getspecific(pthread_key_t key) { - ignored("pthread_setspecific"); - return NULL; + int res; + ensure_valgrind("pthread_getspecific"); + VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, + VG_USERREQ__PTHREAD_GETSPECIFIC, + key, 0 , 0, 0); + return (void*)res; } @@ -784,7 +832,6 @@ int do_syscall_select( int n, * (unchecked) libc error numbers (EINTR etc) are the negation of the kernel's error numbers (VKI_EINTR etc). */ -#include int select ( int n, @@ -793,16 +840,19 @@ int select ( int n, fd_set *xfds, struct timeval *timeout ) { + unsigned int ms_now, ms_end; int res; fd_set rfds_copy; fd_set wfds_copy; fd_set xfds_copy; struct vki_timeval t_now; - struct vki_timeval t_end; struct vki_timeval zero_timeout; struct vki_timespec nanosleep_interval; - ensure_valgrind("select"); + /* gcc's complains about ms_end being used uninitialised -- classic + case it can't understand, where ms_end is both defined and used + only if timeout != NULL. Hence ... */ + ms_end = 0; /* We assume that the kernel and libc data layouts are identical for the following types. These asserts provide a crude @@ -811,8 +861,17 @@ int select ( int n, || sizeof(struct timeval) != sizeof(struct vki_timeval)) barf("valgrind's hacky non-blocking select(): data sizes error"); - /* If a zero timeout specified, this call is harmless. */ - if (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) { + /* Detect the current time and simultaneously find out if we are + running on Valgrind. */ + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + + /* If a zero timeout specified, this call is harmless. Also go + this route if we're not running on Valgrind, for whatever + reason. */ + if ( (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) + || (ms_now == 0xFFFFFFFF) ) { res = do_syscall_select( n, (vki_fd_set*)rfds, (vki_fd_set*)wfds, (vki_fd_set*)xfds, @@ -825,35 +884,29 @@ int select ( int n, } } - /* If a timeout was specified, set t_end to be the end wallclock - time. */ + /* If a timeout was specified, set ms_end to be the end millisecond + counter [wallclock] time. */ if (timeout) { res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); assert(res == 0); - t_end = t_now; - t_end.tv_usec += timeout->tv_usec; - t_end.tv_sec += timeout->tv_sec; - if (t_end.tv_usec >= 1000000) { - t_end.tv_usec -= 1000000; - t_end.tv_sec += 1; - } + ms_end = ms_now; + ms_end += (timeout->tv_usec / 1000); + ms_end += (timeout->tv_sec * 1000); /* Stay sane ... */ - assert (t_end.tv_sec > t_now.tv_sec - || (t_end.tv_sec == t_now.tv_sec - && t_end.tv_usec >= t_now.tv_usec)); + assert (ms_end >= ms_now); } /* fprintf(stderr, "MY_SELECT: before loop\n"); */ /* Either timeout == NULL, meaning wait indefinitely, or timeout != - NULL, in which case t_end holds the end time. */ + NULL, in which case ms_end holds the end time. 
*/ while (1) { if (timeout) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - if (t_now.tv_sec > t_end.tv_sec - || (t_now.tv_sec == t_end.tv_sec - && t_now.tv_usec > t_end.tv_usec)) { + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + if (ms_now >= ms_end) { /* timeout; nothing interesting happened. */ if (rfds) FD_ZERO(rfds); if (wfds) FD_ZERO(wfds); @@ -892,7 +945,7 @@ int select ( int n, /* fprintf(stderr, "MY_SELECT: nanosleep\n"); */ /* nanosleep and go round again */ nanosleep_interval.tv_sec = 0; - nanosleep_interval.tv_nsec = 75 * 1000 * 1000; /* 75 milliseconds */ + nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */ /* It's critical here that valgrind's nanosleep implementation is nonblocking. */ (void)my_do_syscall2(__NR_nanosleep, @@ -907,19 +960,28 @@ int select ( int n, int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) { + unsigned int ms_now, ms_end; int res, i; - struct vki_timeval t_now; - struct vki_timeval t_end; struct vki_timespec nanosleep_interval; ensure_valgrind("poll"); + /* Detect the current time and simultaneously find out if we are + running on Valgrind. */ + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + if (/* CHECK SIZES FOR struct pollfd */ sizeof(struct timeval) != sizeof(struct vki_timeval)) barf("valgrind's hacky non-blocking poll(): data sizes error"); - /* If a zero timeout specified, this call is harmless. */ - if (__timeout == 0) { + /* dummy initialisation to keep gcc -Wall happy */ + ms_end = 0; + + /* If a zero timeout specified, this call is harmless. Also do + this if not running on Valgrind. */ + if (__timeout == 0 || ms_now == 0xFFFFFFFF) { res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout); if (is_kerror(res)) { * (__errno_location()) = -res; @@ -929,36 +991,25 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout) } } - /* If a timeout was specified, set t_end to be the end wallclock - time. */ + /* If a timeout was specified, set ms_end to be the end wallclock + time. Easy considering that __timeout is in milliseconds. */ if (__timeout > 0) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - t_end = t_now; - t_end.tv_usec += 1000 * (__timeout % 1000); - t_end.tv_sec += (__timeout / 1000); - if (t_end.tv_usec >= 1000000) { - t_end.tv_usec -= 1000000; - t_end.tv_sec += 1; - } - /* Stay sane ... */ - assert (t_end.tv_sec > t_now.tv_sec - || (t_end.tv_sec == t_now.tv_sec - && t_end.tv_usec >= t_now.tv_usec)); + ms_end += (unsigned int)__timeout; } /* fprintf(stderr, "MY_POLL: before loop\n"); */ /* Either timeout < 0, meaning wait indefinitely, or timeout > 0, in which case t_end holds the end time. */ + assert(__timeout != 0); + while (1) { - assert(__timeout != 0); if (__timeout > 0) { - res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL); - assert(res == 0); - if (t_now.tv_sec > t_end.tv_sec - || (t_now.tv_sec == t_end.tv_sec - && t_now.tv_usec > t_end.tv_usec)) { + VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */, + VG_USERREQ__READ_MILLISECOND_TIMER, + 0, 0, 0, 0); + assert(ms_now != 0xFFFFFFFF); + if (ms_now >= ms_end) { /* timeout; nothing interesting happened. 
@@ -907,19 +960,28 @@ int select ( int n,
 
 int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
 {
+   unsigned int ms_now, ms_end;
    int    res, i;
-   struct vki_timeval t_now;
-   struct vki_timeval t_end;
    struct vki_timespec nanosleep_interval;
 
    ensure_valgrind("poll");
 
+   /* Detect the current time and simultaneously find out if we are
+      running on Valgrind. */
+   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                           VG_USERREQ__READ_MILLISECOND_TIMER,
+                           0, 0, 0, 0);
+
    if (/* CHECK SIZES FOR struct pollfd */
        sizeof(struct timeval) != sizeof(struct vki_timeval))
       barf("valgrind's hacky non-blocking poll(): data sizes error");
 
-   /* If a zero timeout specified, this call is harmless. */
-   if (__timeout == 0) {
+   /* dummy initialisation to keep gcc -Wall happy */
+   ms_end = 0;
+
+   /* If a zero timeout is specified, this call is harmless.  Also do
+      this if not running on Valgrind. */
+   if (__timeout == 0 || ms_now == 0xFFFFFFFF) {
       res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout);
       if (is_kerror(res)) {
          * (__errno_location()) = -res;
@@ -929,36 +991,25 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
       }
    }
 
-   /* If a timeout was specified, set t_end to be the end wallclock
-      time. */
+   /* If a timeout was specified, set ms_end to be the end wallclock
+      time.  Easy considering that __timeout is in milliseconds. */
    if (__timeout > 0) {
-      res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
-      assert(res == 0);
-      t_end = t_now;
-      t_end.tv_usec += 1000 * (__timeout % 1000);
-      t_end.tv_sec  += (__timeout / 1000);
-      if (t_end.tv_usec >= 1000000) {
-         t_end.tv_usec -= 1000000;
-         t_end.tv_sec  += 1;
-      }
-      /* Stay sane ... */
-      assert (t_end.tv_sec > t_now.tv_sec
-              || (t_end.tv_sec == t_now.tv_sec
-                  && t_end.tv_usec >= t_now.tv_usec));
+      ms_end = ms_now + (unsigned int)__timeout;
    }
 
    /* fprintf(stderr, "MY_POLL: before loop\n"); */
 
    /* Either timeout < 0, meaning wait indefinitely, or timeout > 0, in
      which case t_end holds the end time. */
+   assert(__timeout != 0);
+
    while (1) {
-     assert(__timeout != 0);
      if (__timeout > 0) {
-        res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
-        assert(res == 0);
-        if (t_now.tv_sec > t_end.tv_sec
-            || (t_now.tv_sec == t_end.tv_sec
-                && t_now.tv_usec > t_end.tv_usec)) {
+        VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                                VG_USERREQ__READ_MILLISECOND_TIMER,
+                                0, 0, 0, 0);
+        assert(ms_now != 0xFFFFFFFF);
+        if (ms_now >= ms_end) {
           /* timeout; nothing interesting happened. */
           for (i = 0; i < __nfds; i++) 
              __fds[i].revents = 0;
@@ -966,8 +1017,7 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
        }
      }
 
-     /* These could be trashed each time round the loop, so restore
-        them each time. */
+     /* Do a return-immediately poll. */
      res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, 0 );
      if (is_kerror(res)) {
        /* Some kind of error.  Set errno and return.  */
@@ -981,7 +1031,7 @@ int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
      /* fprintf(stderr, "MY_POLL: nanosleep\n"); */
      /* nanosleep and go round again */
      nanosleep_interval.tv_sec  = 0;
-     nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */
+     nanosleep_interval.tv_nsec = 99 * 1000 * 1000;  /* 99 milliseconds */
      /* It's critical here that valgrind's nanosleep implementation 
        is nonblocking. */
      (void)my_do_syscall2(__NR_nanosleep, 
diff --git a/vg_main.c b/vg_main.c
index 47ea5be073..5f7fe59c34 100644
--- a/vg_main.c
+++ b/vg_main.c
@@ -971,6 +971,9 @@ void VG_(main) ( void )
    VGP_(init_profiling)();
 #  endif
 
+   /* Start calibration of our RDTSC-based clock. */
+   VG_(start_rdtsc_calibration)();
+
    /* Hook to delay things long enough so we can get the pid and
       attach GDB in another shell. */
    /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */
@@ -984,6 +987,10 @@ void VG_(main) ( void )
       VGP_POPCC;
    }
 
+   /* End calibration of our RDTSC-based clock, leaving it as long as
+      we can. */
+   VG_(end_rdtsc_calibration)();
+
    /* This should come after init_memory_audit; otherwise the latter
       carefully sets up the permissions maps to cover the anonymous
       mmaps for the translation table and translation cache, which
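
The point of placing VG_(start_rdtsc_calibration) and
VG_(end_rdtsc_calibration) at opposite ends of startup is to stretch the
calibration interval: the tick-rate estimate gets more accurate as the
interval gets longer.  The same three-phase protocol in miniature, as a
standalone sketch (32-bit x86 only, like the patch; all names here are
mine, and usleep stands in for the startup work):

   #include <assert.h>
   #include <stdio.h>
   #include <sys/time.h>
   #include <unistd.h>

   static unsigned long long rdtsc32 ( void )
   {
      unsigned long long x;
      /* same instruction the patch emits as .byte 0x0f, 0x31 */
      __asm__ volatile ("rdtsc" : "=A" (x));
      return x;
   }

   int main ( void )
   {
      struct timeval t0, t1;
      unsigned long long ticks0, ticks1, elapsed_us, ticks_per_ms;

      /* phase 1: start calibration */
      ticks0 = rdtsc32();
      gettimeofday(&t0, NULL);

      usleep(50 * 1000);   /* stands in for the rest of startup */

      /* phase 2: end calibration; derive the tick rate */
      ticks1 = rdtsc32();
      gettimeofday(&t1, NULL);
      elapsed_us = (unsigned long long)
                   ( (t1.tv_sec - t0.tv_sec) * 1000000LL
                     + (t1.tv_usec - t0.tv_usec) );
      assert(elapsed_us >= 50000);
      ticks_per_ms = (ticks1 - ticks0) / (elapsed_us / 1000);
      assert(ticks_per_ms != 0);

      /* phase 3: cheap reads, no syscalls */
      printf("ms since calibration ended: %llu\n",
             (rdtsc32() - ticks1) / ticks_per_ms);
      return 0;
   }
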
diff --git a/vg_mylibc.c b/vg_mylibc.c
index a728f42399..740b21e3d1 100644
--- a/vg_mylibc.c
+++ b/vg_mylibc.c
@@ -291,13 +291,14 @@ Int VG_(select)( Int n,
    return res;
 }
 
-/* Returns -1 on error, but 0 if ok or interrupted. */
+/* Returns -1 on error, 0 if ok, 1 if interrupted. */
 Int VG_(nanosleep)( const struct vki_timespec *req, 
                     struct vki_timespec *rem )
 {
    Int res;
    res = vg_do_syscall2(__NR_nanosleep, (UInt)req, (UInt)rem);
    if (res == -VKI_EINVAL) return -1;
+   if (res == -VKI_EINTR)  return 1;
    return 0;
 }
 
@@ -936,17 +937,6 @@ Int VG_(getpid) ( void )
    return res;
 }
 
-/* Read a notional elapsed (wallclock-time) timer, giving a 64-bit
-   microseconds count. */
-ULong VG_(read_microsecond_timer)( void )
-{
-   Int res;
-   struct vki_timeval tv;
-   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&tv, (UInt)NULL);
-   vg_assert(!VG_(is_kerror)(res));
-   return (1000000ULL * (ULong)(tv.tv_sec)) + (ULong)(tv.tv_usec);
-}
-
 /* Return -1 if error, else 0.  NOTE does not indicate return code of
    child! */
 Int VG_(system) ( Char* cmd )
@@ -981,6 +971,129 @@ Int VG_(system) ( Char* cmd )
 }
 
 
+/* ---------------------------------------------------------------------
+   Support for a millisecond-granularity counter using RDTSC.
+   ------------------------------------------------------------------ */
+
+static __inline__ ULong do_rdtsc_insn ( void )
+{
+   ULong x;
+   __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
+   return x;
+}
+
+/* 0 = pre-calibration, 1 = calibration, 2 = running */
+static Int   rdtsc_calibration_state = 0;
+static ULong rdtsc_ticks_per_millisecond = 0; /* invalid value */
+
+static struct vki_timeval rdtsc_cal_start_timeval;
+static struct vki_timeval rdtsc_cal_end_timeval;
+
+static ULong rdtsc_cal_start_raw;
+static ULong rdtsc_cal_end_raw;
+
+UInt VG_(read_millisecond_timer) ( void )
+{
+   ULong rdtsc_now;
+   vg_assert(rdtsc_calibration_state == 2);
+   rdtsc_now = do_rdtsc_insn();
+   vg_assert(rdtsc_now > rdtsc_cal_end_raw);
+   rdtsc_now -= rdtsc_cal_end_raw;
+   rdtsc_now /= rdtsc_ticks_per_millisecond;
+   return (UInt)rdtsc_now;
+}
+
+
+void VG_(start_rdtsc_calibration) ( void )
+{
+   Int res;
+   vg_assert(rdtsc_calibration_state == 0);
+   rdtsc_calibration_state = 1;
+   rdtsc_cal_start_raw = do_rdtsc_insn();
+   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_start_timeval,
+                                           (UInt)NULL);
+   vg_assert(!VG_(is_kerror)(res));
+}
+
+void VG_(end_rdtsc_calibration) ( void )
+{
+   Int   res, loops;
+   ULong cpu_clock_MHZ;
+   ULong cal_clock_ticks;
+   ULong cal_wallclock_microseconds;
+   ULong wallclock_start_microseconds;
+   ULong wallclock_end_microseconds;
+   struct vki_timespec req;
+   struct vki_timespec rem;
+
+   vg_assert(rdtsc_calibration_state == 1);
+   rdtsc_calibration_state = 2;
+
+   /* Try and delay for 20 milliseconds, so that we can at least have
+      some minimum level of accuracy. */
+   req.tv_sec = 0;
+   req.tv_nsec = 20 * 1000 * 1000;
+   loops = 0;
+   while (True) {
+      res = VG_(nanosleep)(&req, &rem);
+      vg_assert(res == 0 /*ok*/ || res == 1 /*interrupted*/);
+      if (res == 0)
+         break;
+      if (rem.tv_sec == 0 && rem.tv_nsec == 0) 
+         break;
+      req = rem;
+      loops++;
+      if (loops > 100) 
+         VG_(panic)("calibration nanosleep loop failed?!");
+   }
+
+   /* Now read both timers, and do the Math. */
+   rdtsc_cal_end_raw = do_rdtsc_insn();
+   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_end_timeval,
+                                           (UInt)NULL);
+
+   vg_assert(rdtsc_cal_end_raw > rdtsc_cal_start_raw);
+   cal_clock_ticks = rdtsc_cal_end_raw - rdtsc_cal_start_raw;
+
+   wallclock_start_microseconds
+      = (1000000ULL * (ULong)(rdtsc_cal_start_timeval.tv_sec))
+        + (ULong)(rdtsc_cal_start_timeval.tv_usec);
+   wallclock_end_microseconds
+      = (1000000ULL * (ULong)(rdtsc_cal_end_timeval.tv_sec))
+        + (ULong)(rdtsc_cal_end_timeval.tv_usec);
+   vg_assert(wallclock_end_microseconds > wallclock_start_microseconds);
+   cal_wallclock_microseconds 
+      = wallclock_end_microseconds - wallclock_start_microseconds;
+
+   /* Since we just nanoslept for 20 ms ... */
+   vg_assert(cal_wallclock_microseconds >= 20000);
+
+   /* Now we know (roughly) that cal_clock_ticks on RDTSC take
+      cal_wallclock_microseconds elapsed time.  Calculate the RDTSC
+      ticks-per-millisecond value. */
+   if (0)
+      VG_(printf)("%lld ticks in %lld microseconds\n", 
+                  cal_clock_ticks,  cal_wallclock_microseconds );
+
+   rdtsc_ticks_per_millisecond   
+      = cal_clock_ticks / (cal_wallclock_microseconds / 1000ULL);
+   cpu_clock_MHZ
+      = (1000ULL * rdtsc_ticks_per_millisecond) / 1000000ULL;
+   if (VG_(clo_verbosity) >= 1)
+      VG_(message)(Vg_UserMsg, "Estimated CPU clock rate is %d MHz",
+                               (UInt)cpu_clock_MHZ);
+   if (cpu_clock_MHZ < 100 || cpu_clock_MHZ > 10000)
+      VG_(panic)("end_rdtsc_calibration: "
+                 "estimated CPU MHz outside range 100 .. 10000");
+   /* Paranoia about division by zero later. */
+   vg_assert(rdtsc_ticks_per_millisecond != 0);
+   if (0)
+      VG_(printf)("ticks per millisecond %llu\n",
+                  rdtsc_ticks_per_millisecond);
+}
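
To put illustrative numbers (mine, not measured) on the calibration: if
35,000,000 ticks elapse across a 25,000-microsecond interval, then
rdtsc_ticks_per_millisecond = 35,000,000 / 25 = 1,400,000, and the
estimated clock rate is (1000 * 1,400,000) / 1,000,000 = 1400 MHz --
comfortably inside the 100..10000 sanity range.  Thereafter each call to
VG_(read_millisecond_timer) costs one RDTSC and one 64-bit divide rather
than a gettimeofday syscall, which is the whole point of the exercise.
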
+
+
 /* ---------------------------------------------------------------------
    Primitive support for bagging memory via mmap.
    ------------------------------------------------------------------ */
diff --git a/vg_scheduler.c b/vg_scheduler.c
index 68dbf19a4f..32201b9383 100644
--- a/vg_scheduler.c
+++ b/vg_scheduler.c
@@ -119,7 +119,26 @@ typedef
 static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
 
 
+/* Keeping track of keys. */
+typedef
+   struct {
+      /* Has this key been allocated ? */
+      Bool inuse;
+      /* If .inuse==True, records the address of the associated
+         destructor, or NULL if none. */
+      void (*destructor)(void*);
+   }
+   ThreadKeyState;
+
+/* And our array of thread keys. */
+static ThreadKeyState vg_thread_keys[VG_N_THREAD_KEYS];
+
+typedef UInt ThreadKey;
+
+
 /* Forwards */
+static void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid );
+
 static void do_nontrivial_clientreq ( ThreadId tid );
 
 static void scheduler_sanity ( void );
@@ -511,6 +530,11 @@ void VG_(scheduler_init) ( void )
    for (i = 0; i < VG_N_WAITING_FDS; i++)
       vg_waiting_fds[i].fd = -1; /* not in use */
 
+   for (i = 0; i < VG_N_THREAD_KEYS; i++) {
+      vg_thread_keys[i].inuse      = False;
+      vg_thread_keys[i].destructor = NULL;
+   }
+
    /* Assert this is thread zero, which has certain magic
       properties. */
    tid_main = vg_alloc_ThreadState();
@@ -523,6 +547,8 @@ void VG_(scheduler_init) ( void )
    vg_threads[tid_main].retval = NULL; /* not important */
    vg_threads[tid_main].stack_highest_word 
       = vg_threads[tid_main].m_esp /* -4  ??? */;
+   for (i = 0; i < VG_N_THREAD_KEYS; i++)
+      vg_threads[tid_main].specifics[i] = NULL;
 
    /* Copy VG_(baseBlock) state to tid_main's slot. */
    vg_tid_currently_in_baseBlock = tid_main;
@@ -618,13 +644,16 @@ Bool maybe_do_trivial_clientreq ( ThreadId tid )
            (UInt)VG_(client_memalign) ( tst, arg[1], arg[2] )
         );
 
-      /* These are heavily used. */
+      /* These are heavily used -- or at least we want them to be
+         cheap. */
       case VG_USERREQ__PTHREAD_GET_THREADID:
          SIMPLE_RETURN(tid);
       case VG_USERREQ__RUNNING_ON_VALGRIND:
          SIMPLE_RETURN(1);
       case VG_USERREQ__GET_PTHREAD_TRACE_LEVEL:
         SIMPLE_RETURN(VG_(clo_trace_pthread_level));
+      case VG_USERREQ__READ_MILLISECOND_TIMER:
+        SIMPLE_RETURN(VG_(read_millisecond_timer)());
 
       default:
          /* Too hard; wimp out. */
@@ -692,18 +721,18 @@ void sched_do_syscall ( ThreadId tid )
    syscall_no = vg_threads[tid].m_eax; /* syscall number */
 
    if (syscall_no == __NR_nanosleep) {
-      ULong t_now, t_awaken;
+      UInt t_now, t_awaken;
       struct vki_timespec* req;
       req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */
-      t_now = VG_(read_microsecond_timer)();     
+      t_now = VG_(read_millisecond_timer)();     
       t_awaken 
         = t_now
-          + (ULong)1000000ULL * (ULong)(req->tv_sec) 
-          + (ULong)( (UInt)(req->tv_nsec) / 1000 );
+          + (UInt)1000ULL * (UInt)(req->tv_sec) 
+          + (UInt)(req->tv_nsec) / 1000000;
       vg_threads[tid].status  = VgTs_Sleeping;
       vg_threads[tid].awaken_at = t_awaken;
       if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu", 
+         VG_(sprintf)(msg_buf, "at %d: nanosleep for %d", 
                       t_now, t_awaken-t_now);
         print_sched_event(tid, msg_buf);
      }
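
Note that the t_awaken computation above drops sub-millisecond
remainders, which is the right trade given the timer's granularity.  The
same conversion as a self-contained sketch (deadline_ms is my name for
it, not a function in this patch):

   #include <assert.h>
   #include <stdio.h>
   #include <time.h>

   /* Turn "now" (in ms) plus a relative timespec into an absolute
      millisecond deadline, exactly as sched_do_syscall does for
      nanosleep.  Sub-millisecond remainders are dropped. */
   static unsigned int deadline_ms ( unsigned int now_ms,
                                     const struct timespec* req )
   {
      return now_ms
             + 1000 * (unsigned int)req->tv_sec
             + (unsigned int)req->tv_nsec / 1000000;
   }

   int main ( void )
   {
      struct timespec req = { 1, 500000000 }; /* 1.5 s */
      assert(deadline_ms(10000, &req) == 11500);
      printf("ok\n");
      return 0;
   }
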
@@ -820,16 +849,16 @@ void poll_for_ready_fds ( void )
    Char      msg_buf[100];
    struct vki_timespec* rem;
 
-   ULong t_now;
+   UInt t_now;
 
    /* Awaken any sleeping threads whose sleep has expired. */
    for (tid = 1; tid < VG_N_THREADS; tid++)
       if (vg_threads[tid].status == VgTs_Sleeping)
         break;
 
-   /* Avoid pointless calls to VG_(read_microsecond_timer). */
+   /* Avoid pointless calls to VG_(read_millisecond_timer). */
    if (tid < VG_N_THREADS) {
-      t_now = VG_(read_microsecond_timer)();
+      t_now = VG_(read_millisecond_timer)();
      for (tid = 1; tid < VG_N_THREADS; tid++) {
        if (vg_threads[tid].status != VgTs_Sleeping)
           continue;
@@ -848,7 +877,7 @@ void poll_for_ready_fds ( void )
           /* Reschedule this thread. */
           vg_threads[tid].status = VgTs_Runnable;
           if (VG_(clo_trace_sched)) {
-             VG_(sprintf)(msg_buf, "at %lu: nanosleep done", 
+             VG_(sprintf)(msg_buf, "at %d: nanosleep done", 
                           t_now);
             print_sched_event(tid, msg_buf);
           }
@@ -1004,6 +1033,21 @@ void complete_blocked_syscalls ( void )
 }
 
 
+static
+void check_for_pthread_cond_timedwait ( void )
+{
+   Int i;
+   for (i = 1; i < VG_N_THREADS; i++) {
+      if (vg_threads[i].status != VgTs_WaitCV)
+         continue;
+      if (vg_threads[i].awaken_at == 0xFFFFFFFF /* no timeout */)
+         continue;
+      if (VG_(read_millisecond_timer)() >= vg_threads[i].awaken_at)
+         do_pthread_cond_timedwait_TIMEOUT(i);
+   }
+}
+
+
 static
 void nanosleep_for_a_while ( void )
 {
@@ -1011,10 +1055,9 @@ void nanosleep_for_a_while ( void )
    struct vki_timespec req;
    struct vki_timespec rem;
    req.tv_sec = 0;
-   req.tv_nsec = 20 * 1000 * 1000;
+   req.tv_nsec = 50 * 1000 * 1000;
    res = VG_(nanosleep)( &req, &rem );
-   /* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */
-   vg_assert(res == 0);
+   vg_assert(res == 0 /* ok */ || res == 1 /* interrupted by signal */);
 }
 
 
@@ -1079,6 +1122,7 @@ VgSchedReturnCode VG_(scheduler) ( void )
         threads. */
      poll_for_ready_fds();
      complete_blocked_syscalls();
+      check_for_pthread_cond_timedwait();
 
      /* See if there are any signals which need to be delivered.  If
        so, choose thread(s) to deliver them to, and build signal
@@ -1527,6 +1571,7 @@ void do_pthread_create ( ThreadId parent_tid,
                          void* (*start_routine)(void *), 
                          void* arg )
 {
+   Int      i;
    Addr     new_stack;
    UInt     new_stk_szb;
    ThreadId tid;
@@ -1607,6 +1652,9 @@ void do_pthread_create ( ThreadId parent_tid,
    vg_threads[tid].joiner        = VG_INVALID_THREADID;
    vg_threads[tid].status        = VgTs_Runnable;
 
+   for (i = 0; i < VG_N_THREAD_KEYS; i++)
+      vg_threads[tid].specifics[i] = NULL;
+
    /* return zero */
    vg_threads[tid].m_edx  = 0; /* success */
 }
@@ -1691,7 +1739,7 @@ void release_one_thread_waiting_on_mutex ( pthread_mutex_t* mutex, 
    mutex->__m_owner = (_pthread_descr)i;
    vg_threads[i].status        = VgTs_Runnable;
    vg_threads[i].associated_mx = NULL;
-   vg_threads[i].m_edx         = 0; /* pth_lock() success */
+   /* m_edx already holds pth_mx_lock() success (0) */
 
    if (VG_(clo_trace_pthread_level) >= 1) {
      VG_(sprintf)(msg_buf, "%s       mx %p: RESUME", 
@@ -1773,7 +1821,7 @@ void do_pthread_mutex_lock( ThreadId tid,
    } else {
      vg_threads[tid].status        = VgTs_WaitMX;
      vg_threads[tid].associated_mx = mutex;
-      /* No assignment to %EDX, since we're blocking. */
+      vg_threads[tid].m_edx = 0; /* pth_mx_lock success value */
      if (VG_(clo_trace_pthread_level) >= 1) {
        VG_(sprintf)(msg_buf, "%s    mx %p: BLOCK", 
                     caller, mutex );
@@ -1890,6 +1938,56 @@ void do_pthread_mutex_unlock ( ThreadId tid,
    don't need to think too hard there.  */
 
 
+static
+void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid )
+{
+   Char             msg_buf[100];
+   pthread_mutex_t* mx;
+   pthread_cond_t*  cv;
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_WaitCV
+             && vg_threads[tid].awaken_at != 0xFFFFFFFF);
+   mx = vg_threads[tid].associated_mx;
+   vg_assert(mx != NULL);
+   cv = vg_threads[tid].associated_cv;
+   vg_assert(cv != NULL);
+
+   if (mx->__m_owner == VG_INVALID_THREADID) {
+      /* Currently unheld; hand it out to thread tid. */
+      vg_assert(mx->__m_count == 0);
+      vg_threads[tid].status        = VgTs_Runnable;
+      vg_threads[tid].m_edx         = ETIMEDOUT;
+                      /* pthread_cond_timedwait return value */
+      vg_threads[tid].associated_cv = NULL;
+      vg_threads[tid].associated_mx = NULL;
+      mx->__m_owner = (_pthread_descr)tid;
+      mx->__m_count = 1;
+
+      if (VG_(clo_trace_pthread_level) >= 1) {
+         VG_(sprintf)(msg_buf, 
+            "pthread_cond_timedwait cv %p: TIMEOUT with mx %p", 
+            cv, mx );
+         print_pthread_event(tid, msg_buf);
+      }
+   } else {
+      /* Currently held.  Make thread tid be blocked on it. */
+      vg_assert(mx->__m_count > 0);
+      vg_threads[tid].status        = VgTs_WaitMX;
+      vg_threads[tid].m_edx         = ETIMEDOUT;
+                      /* pthread_cond_timedwait return value */
+      vg_threads[tid].associated_cv = NULL;
+      vg_threads[tid].associated_mx = mx;
+      if (VG_(clo_trace_pthread_level) >= 1) {
+         VG_(sprintf)(msg_buf, 
+            "pthread_cond_timedwait cv %p: TIMEOUT -> BLOCK for mx %p", 
+            cv, mx );
+         print_pthread_event(tid, msg_buf);
+      }
+
+   }
+}
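
So a timed-out waiter always reports ETIMEDOUT, whether it gets the mutex
at once or has to queue for it; the ETIMEDOUT planted in m_edx survives
the later mutex hand-off precisely because
release_one_thread_waiting_on_mutex no longer overwrites m_edx (see the
change above).  From the client's side this is just the standard POSIX
contract, as in this minimal sketch (the names and the 200 ms figure are
mine):

   #include <errno.h>
   #include <pthread.h>
   #include <stdio.h>
   #include <sys/time.h>

   static pthread_mutex_t mx = PTHREAD_MUTEX_INITIALIZER;
   static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
   static int             ready = 0;

   int main ( void )
   {
      struct timeval  now;
      struct timespec abstime;
      int res = 0;

      /* Absolute deadline 200 ms from now, as the wrapper expects. */
      gettimeofday(&now, NULL);
      abstime.tv_sec  = now.tv_sec;
      abstime.tv_nsec = (now.tv_usec + 200 * 1000) * 1000;
      if (abstime.tv_nsec >= 1000000000) {
         abstime.tv_nsec -= 1000000000;
         abstime.tv_sec  += 1;
      }

      pthread_mutex_lock(&mx);
      /* Nobody ever sets ready, so this must time out, with the
         mutex held again on return. */
      while (!ready && res != ETIMEDOUT)
         res = pthread_cond_timedwait(&cv, &mx, &abstime);
      pthread_mutex_unlock(&mx);

      printf("res == ETIMEDOUT? %s\n", res == ETIMEDOUT ? "yes" : "no");
      return 0;
   }
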
+
+
 static
 void release_N_threads_waiting_on_cond ( pthread_cond_t* cond, 
                                          Int n_to_release, 
@@ -1920,8 +2018,6 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
 
       mx = vg_threads[i].associated_mx;
       vg_assert(mx != NULL);
-      vg_assert(mx->__m_count > 0);
-      vg_assert(is_valid_tid((ThreadId)mx->__m_owner));
 
       if (mx->__m_owner == VG_INVALID_THREADID) {
          /* Currently unheld; hand it out to thread i. */
@@ -1931,7 +2027,7 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
          vg_threads[i].associated_mx = NULL;
          mx->__m_owner = (_pthread_descr)i;
          mx->__m_count = 1;
-         vg_threads[i].m_edx = 0; /* pthread_cond_wait returns success */
+         /* .m_edx already holds pth_cond_wait success value (0) */
 
          if (VG_(clo_trace_pthread_level) >= 1) {
            VG_(sprintf)(msg_buf, "%s   cv %p: RESUME with mx %p", 
@@ -1941,9 +2037,11 @@ void release_N_threads_waiting_on_cond ( pthread_cond_t* cond,
 
       } else {
          /* Currently held.  Make thread i be blocked on it. */
+         vg_assert(mx->__m_count > 0);
         vg_threads[i].status        = VgTs_WaitMX;
         vg_threads[i].associated_cv = NULL;
         vg_threads[i].associated_mx = mx;
+         vg_threads[i].m_edx = 0; /* pth_cond_wait success value */
 
        if (VG_(clo_trace_pthread_level) >= 1) {
            VG_(sprintf)(msg_buf, "%s   cv %p: BLOCK for mx %p", 
@@ -1961,14 +2059,18 @@
 static
 void do_pthread_cond_wait ( ThreadId tid,
                             pthread_cond_t *cond, 
-                            pthread_mutex_t *mutex )
+                            pthread_mutex_t *mutex,
+                            UInt ms_end )
 {
    Char msg_buf[100];
 
+   /* If ms_end == 0xFFFFFFFF, wait forever (no timeout).  Otherwise,
+      ms_end is the ending millisecond. */
+
    /* pre: mutex should be a valid mutex and owned by tid. */
    if (VG_(clo_trace_pthread_level) >= 2) {
-      VG_(sprintf)(msg_buf, "pthread_cond_wait        cv %p, mx %p ...", 
-                            cond, mutex );
+      VG_(sprintf)(msg_buf, "pthread_cond_wait        cv %p, mx %p, end %d ...", 
+                            cond, mutex, ms_end );
      print_pthread_event(tid, msg_buf);
    }
 
@@ -2007,6 +2109,7 @@ void do_pthread_cond_wait ( ThreadId tid,
    vg_threads[tid].status        = VgTs_WaitCV;
    vg_threads[tid].associated_cv = cond;
    vg_threads[tid].associated_mx = mutex;
+   vg_threads[tid].awaken_at     = ms_end;
 
    if (VG_(clo_trace_pthread_level) >= 1) {
      VG_(sprintf)(msg_buf, 
@@ -2055,6 +2158,133 @@ void do_pthread_cond_signal_or_broadcast ( ThreadId tid,
 }
 
 
+/* -----------------------------------------------------------
+   THREAD SPECIFIC DATA
+   -------------------------------------------------------- */
+
+static __inline__
+Bool is_valid_key ( ThreadKey k )
+{
+   /* k unsigned; hence no < 0 check */
+   if (k >= VG_N_THREAD_KEYS) return False;
+   if (!vg_thread_keys[k].inuse) return False;
+   return True;
+}
+
+static
+void do_pthread_key_create ( ThreadId tid,
+                             pthread_key_t* key,
+                             void (*destructor)(void*) )
+{
+   Int  i;
+   Char msg_buf[100];
+
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_key_create   *key %p, destr %p", 
+                            key, destructor );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(sizeof(pthread_key_t) == sizeof(ThreadKey));
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   for (i = 0; i < VG_N_THREAD_KEYS; i++)
+      if (!vg_thread_keys[i].inuse)   
+         break;
+
+   if (i == VG_N_THREAD_KEYS) {
+      /* vg_threads[tid].m_edx = EAGAIN; 
+         return; 
+      */
+      VG_(panic)("pthread_key_create: VG_N_THREAD_KEYS is too low;"
+                 " increase and recompile");
+   }
+
+   vg_thread_keys[i].inuse = True;
+   /* TODO: check key for addressability */
+   *key = i;
+   vg_threads[tid].m_edx = 0;
+}
+
+
+static
+void do_pthread_key_delete ( ThreadId tid, pthread_key_t key )
+{
+   Char msg_buf[100];
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_key_delete    key %d", 
+                            key );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   if (!is_valid_key(key)) {
+      vg_threads[tid].m_edx = EINVAL;
+      return;
+   }
+
+   vg_thread_keys[key].inuse = False;
+
+   /* Optional.  We're not required to do this, although it shouldn't
+      make any difference to programs which use the key/specifics
+      functions correctly. */
+   for (tid = 1; tid < VG_N_THREADS; tid++) {
+      if (vg_threads[tid].status != VgTs_Empty)
+         vg_threads[tid].specifics[key] = NULL;
+   }
+}
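
One caveat worth recording: the destructor registered by
do_pthread_key_create is stored in vg_thread_keys[], but nothing in the
code shown here ever invokes it when a thread exits.  For comparison,
POSIX asks for roughly the following at thread termination -- this is a
sketch of the spec written against this patch's data structures, purely
illustrative and not code that exists in the patch (so it will not
compile standalone):

   /* Pseudo-POSIX: run TSD destructors when thread tid exits.
      PTHREAD_DESTRUCTOR_ITERATIONS bounds the number of rounds. */
   void run_tsd_destructors_at_exit ( ThreadId tid )
   {
      Int  i, round;
      Bool called;
      for (round = 0; round < PTHREAD_DESTRUCTOR_ITERATIONS; round++) {
         called = False;
         for (i = 0; i < VG_N_THREAD_KEYS; i++) {
            void* v = vg_threads[tid].specifics[i];
            if (vg_thread_keys[i].inuse 
                && vg_thread_keys[i].destructor != NULL
                && v != NULL) {
               vg_threads[tid].specifics[i] = NULL; /* before the call */
               vg_thread_keys[i].destructor(v);
               called = True;
            }
         }
         if (!called) break; /* all specifics now NULL */
      }
   }
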
+
+
+static 
+void do_pthread_getspecific ( ThreadId tid, pthread_key_t key )
+{
+   Char msg_buf[100];
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_getspecific   key %d", 
+                            key );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   if (!is_valid_key(key)) {
+      vg_threads[tid].m_edx = (UInt)NULL;
+      return;
+   }
+
+   vg_threads[tid].m_edx = (UInt)vg_threads[tid].specifics[key];
+}
+
+
+static
+void do_pthread_setspecific ( ThreadId tid, 
+                              pthread_key_t key, 
+                              void *pointer )
+{
+   Char msg_buf[100];
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_setspecific   key %d, ptr %p", 
+                            key, pointer );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   if (!is_valid_key(key)) {
+      vg_threads[tid].m_edx = EINVAL;
+      return;
+   }
+
+   vg_threads[tid].specifics[key] = pointer;
+   vg_threads[tid].m_edx = 0;
+}
+
+
 /* ---------------------------------------------------------------------
    Handle non-trivial client requests.
    ------------------------------------------------------------------ */
@@ -2105,7 +2335,15 @@ void do_nontrivial_clientreq ( ThreadId tid )
       case VG_USERREQ__PTHREAD_COND_WAIT:
          do_pthread_cond_wait( tid, 
                                (pthread_cond_t *)(arg[1]),
-                               (pthread_mutex_t *)(arg[2]) );
+                               (pthread_mutex_t *)(arg[2]),
+                               0xFFFFFFFF /* no timeout */ );
+         break;
+
+      case VG_USERREQ__PTHREAD_COND_TIMEDWAIT:
+         do_pthread_cond_wait( tid, 
+                               (pthread_cond_t *)(arg[1]),
+                               (pthread_mutex_t *)(arg[2]),
+                               arg[3] /* timeout millisecond point */ );
          break;
 
       case VG_USERREQ__PTHREAD_COND_SIGNAL:
@@ -2122,6 +2360,28 @@ void do_nontrivial_clientreq ( ThreadId tid )
                  (pthread_cond_t *)(arg[1]) );
          break;
 
+      case VG_USERREQ__PTHREAD_KEY_CREATE:
+         do_pthread_key_create ( tid, 
+                                 (pthread_key_t*)(arg[1]),
+                                 (void(*)(void*))(arg[2]) );
+         break;
+
+      case VG_USERREQ__PTHREAD_KEY_DELETE:
+         do_pthread_key_delete ( tid, 
+                                 (pthread_key_t)(arg[1]) );
+         break;
+
+      case VG_USERREQ__PTHREAD_GETSPECIFIC:
+         do_pthread_getspecific ( tid, 
+                                  (pthread_key_t)(arg[1]) );
+         break;
+
+      case VG_USERREQ__PTHREAD_SETSPECIFIC:
+         do_pthread_setspecific ( tid, 
+                                  (pthread_key_t)(arg[1]),
+                                  (void*)(arg[2]) );
+         break;
+
       case VG_USERREQ__MAKE_NOACCESS:
       case VG_USERREQ__MAKE_WRITABLE:
       case VG_USERREQ__MAKE_READABLE:
@@ -2160,6 +2420,7 @@ void scheduler_sanity ( void )
    pthread_mutex_t* mx;
    pthread_cond_t*  cv;
    Int              i;
+   /* VG_(printf)("scheduler_sanity\n"); */
 
    for (i = 1; i < VG_N_THREADS; i++) {
       mx = vg_threads[i].associated_mx;
@@ -2190,6 +2451,11 @@ void scheduler_sanity ( void )
          /* vg_assert(mx == NULL); */
       }
    }
+
+   for (i = 0; i < VG_N_THREAD_KEYS; i++) {
+      if (!vg_thread_keys[i].inuse)
+         vg_assert(vg_thread_keys[i].destructor == NULL);
+   }
 }
-- 
2.47.2