author     Julian Seward <jseward@acm.org>
           Mon, 26 Dec 2005 17:58:58 +0000 (17:58 +0000)
committer  Julian Seward <jseward@acm.org>
           Mon, 26 Dec 2005 17:58:58 +0000 (17:58 +0000)

More dispatcher tuning for ppc32/64.  Makes a big difference for
perf/tinycc.

- run_thread_for_a_while: just clear this thread's reservation when
  starting, not all of them.

- use a different fast-cache hashing function for ppc32/64 than for
  x86/amd64.  This allows the former to use all the fast-cache entries
  rather than just 1/4 of them (illustrated by the sketch just below).

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5441
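
The effect in the second bullet is easy to see with a small standalone C
program (not Valgrind code; the FAST_* constants mirror the VG_TT_FAST_*
definitions in pub_core_transtab_asm.h below):

#include <stdio.h>
#include <stdlib.h>

#define FAST_BITS 15                 /* mirrors VG_TT_FAST_BITS */
#define FAST_SIZE (1 << FAST_BITS)
#define FAST_MASK (FAST_SIZE - 1)

int main ( void )
{
   char* hit_mask  = calloc(FAST_SIZE, 1);  /* x86/amd64 scheme: mask only  */
   char* hit_shift = calloc(FAST_SIZE, 1);  /* ppc scheme: shift, then mask */
   unsigned long a;
   int i, n_mask = 0, n_shift = 0;

   /* ppc instructions are 4-byte aligned, so guest addresses step by 4 */
   for (a = 0x10000000UL; a < 0x10000000UL + 4UL * FAST_SIZE; a += 4) {
      hit_mask [ a       & FAST_MASK] = 1;
      hit_shift[(a >> 2) & FAST_MASK] = 1;
   }
   for (i = 0; i < FAST_SIZE; i++) {
      n_mask  += hit_mask[i];
      n_shift += hit_shift[i];
   }
   /* prints 8192 and 32768: mask-only reaches only every 4th entry */
   printf("mask-only: %d of %d entries; shift-then-mask: %d\n",
          n_mask, FAST_SIZE, n_shift);
   free(hit_mask);  free(hit_shift);
   return 0;
}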

coregrind/m_dispatch/dispatch-ppc32-linux.S
coregrind/m_scheduler/scheduler.c
coregrind/m_transtab.c
coregrind/pub_core_transtab_asm.h

diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S
index 643a781fc9dbeb2b0850577848a4e4e58eb3c929..5103675b65cbb755692eb11142e40d75cd3a4c6d 100644
@@ -254,14 +254,14 @@ VG_(run_innerloop__dispatch_unprofiled):
         stw     3,OFFSET_ppc32_CIA(31)
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
-//     subic.  29,29,1
        subi    29,29,1
        cmplwi  29,0
         beq    counter_is_zero
 
         /* try a fast lookup in the translation cache */
-        /* r4=((r3<<2) & (VG_TT_FAST_MASK<<2)) */
-        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2  
+        /* r4 = VG_TT_FAST_HASH(addr)           * sizeof(ULong*)
+              = ((r3 >>u 2) & VG_TT_FAST_MASK)  << 2 */
+        rlwinm  4,3, 0, 32-2-VG_TT_FAST_BITS, 31-2  
         addis   5,4,VG_(tt_fast)@ha
         lwz     5,VG_(tt_fast)@l(5)
         lwz     6,4(5)   /* big-endian, so comparing 2nd 32bit word */
@@ -310,12 +310,14 @@ VG_(run_innerloop__dispatch_profiled):
         stw     3,OFFSET_ppc32_CIA(31)
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
-       addic.  29,29,-1
+       subi    29,29,1
+       cmplwi  29,0
         beq    counter_is_zero
 
         /* try a fast lookup in the translation cache */
-        /* r4=((r3<<2) & (VG_TT_FAST_MASK<<2)) */
-        rlwinm  4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2  
+        /* r4 = VG_TT_FAST_HASH(addr)           * sizeof(ULong*)
+              = ((r3 >>u 2) & VG_TT_FAST_MASK)  << 2 */
+        rlwinm  4,3, 0, 32-2-VG_TT_FAST_BITS, 31-2 
         addis   5,4,VG_(tt_fast)@ha
         lwz     5,VG_(tt_fast)@l(5)
         lwz     6,4(5)   /* big-endian, so comparing 2nd 32bit word */
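
For readers who don't speak ppc assembly, the fast-lookup sequence in
both hunks corresponds roughly to the C below.  This is a sketch, not
code from the patch: it assumes each VG_(tt_fast) entry points at a
translation whose leading ULong records the guest address, with the
translated code following it -- which is what the big-endian
'lwz 6,4(5)' comparison implies, but the full entry layout is a guess.

typedef unsigned int       UInt;
typedef unsigned long long ULong;

#define TT_FAST_MASK 0x7FFF   /* VG_TT_FAST_MASK for VG_TT_FAST_BITS = 15 */
extern ULong* tt_fast[];      /* stands in for VG_(tt_fast)               */

/* Returns the code to enter, or NULL on a fast-cache miss. */
static ULong* fast_lookup ( UInt cia )
{
   /* rlwinm 4,3,0,32-2-VG_TT_FAST_BITS,31-2 keeps bits [16:2] of the
      guest address: ((cia >>u 2) & TT_FAST_MASK) << 2, a byte offset
      into a table of 4-byte entry pointers */
   UInt offs = cia & ((UInt)TT_FAST_MASK << 2);

   /* addis 5,4,VG_(tt_fast)@ha / lwz 5,VG_(tt_fast)@l(5): fetch the
      entry pointer at that offset */
   ULong* tce = tt_fast[offs >> 2];

   /* lwz 6,4(5): on big-endian ppc32, word 1 of the leading ULong is
      its low 32 bits; compare it with the address we want to run */
   if (tce == 0 || (UInt)tce[0] != cia)
      return 0;          /* miss: fall back to the full table lookup */

   return &tce[1];       /* hit: enter the code after the header     */
}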
diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c
index 1e131199f318e162a11c9cc30e3f8f6fb01b6823..d922e143647da0fc9c89ffe201fa735a3e2a9bba 100644
@@ -331,8 +331,8 @@ static void block_signals(ThreadId tid)
    VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, NULL);
 }
 
-/* Use libc setjmp/longjmp.  longjmp must not restore signal mask
-   state, but does need to pass "val" through. */
+/* Use gcc's built-in setjmp/longjmp.  longjmp must not restore signal
+   mask state, but does need to pass "val" through. */
 #define SCHEDSETJMP(tid, jumped, stmt)                                 \
    do {                                                                        \
       ThreadState * volatile _qq_tst = VG_(get_ThreadState)(tid);      \
@@ -343,7 +343,8 @@ static void block_signals(ThreadId tid)
         _qq_tst->sched_jmpbuf_valid = True;                            \
         stmt;                                                          \
       }        else if (VG_(clo_trace_sched))                                  \
-        VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%d\n", __LINE__, tid, jumped); \
+        VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%d\n",        \
+                     __LINE__, tid, jumped);                            \
       vg_assert(_qq_tst->sched_jmpbuf_valid);                          \
       _qq_tst->sched_jmpbuf_valid = False;                             \
    } while(0)
@@ -370,7 +371,6 @@ UInt run_thread_for_a_while ( ThreadId tid )
 
    /* Paranoia */
    vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(is_valid_tid)(tid));
    vg_assert(VG_(is_running_thread)(tid));
    vg_assert(!VG_(is_exiting)(tid));
 
@@ -408,11 +408,9 @@ UInt run_thread_for_a_while ( ThreadId tid )
 
       This should be abstractified and lifted out.
    */
-   { Int i;
-     /* Clear any existing reservation.  Be paranoid and clear them all. */
-     for (i = 0; i < VG_N_THREADS; i++)
-        VG_(threads)[i].arch.vex.guest_RESVN = 0;
-   }
+   /* Clear any existing reservation that this thread might have made
+      last time it was running. */
+   VG_(threads)[tid].arch.vex.guest_RESVN = 0;
 
    /* ppc guest_state vector regs must be 16byte aligned for loads/stores */
    vg_assert(VG_IS_16_ALIGNED(VG_(threads)[tid].arch.vex.guest_VR0));
@@ -422,7 +420,8 @@ UInt run_thread_for_a_while ( ThreadId tid )
    /* there should be no undealt-with signals */
    //vg_assert(VG_(threads)[tid].siginfo.si_signo == 0);
 
-   //VG_(printf)("running EIP = %p ESP=%p\n", VG_(threads)[tid].arch.m_eip, VG_(threads)[tid].arch.m_esp);
+   //VG_(printf)("running EIP = %p ESP=%p\n",
+   //VG_(threads)[tid].arch.m_eip, VG_(threads)[tid].arch.m_esp);
 
    vg_assert(VG_(my_fault));
    VG_(my_fault) = False;
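
The reservation in question is the guest's lwarx/stwcx. state.  A
hedged model of the hazard (an illustration only, not VEX's actual
implementation; it assumes guest_RESVN holds the reserved address,
with zero meaning no reservation):

typedef unsigned long UWord;

typedef struct {
   UWord guest_RESVN;   /* reserved address; 0 means none */
} GuestPPCState;

/* lwarx: load a word and take a reservation on its address */
static UWord model_lwarx ( GuestPPCState* gst, UWord* addr )
{
   gst->guest_RESVN = (UWord)addr;
   return *addr;
}

/* stwcx.: store only if the reservation is still held */
static int model_stwcx ( GuestPPCState* gst, UWord* addr, UWord val )
{
   if (gst->guest_RESVN != (UWord)addr)
      return 0;              /* reservation lost: store fails  */
   *addr = val;
   gst->guest_RESVN = 0;     /* a successful store consumes it */
   return 1;
}

If a thread does lwarx, is descheduled, and another thread updates the
word, real hardware would kill the first thread's reservation and fail
its eventual stwcx.; in the emulation nothing does, so the stale
stwcx. would wrongly succeed -- unless the scheduler zeroes guest_RESVN
whenever it resumes a thread.  Only the resuming thread's reservation
can be stale, which is why clearing all VG_N_THREADS of them, as the
old loop did, was unnecessary.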
diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c
index 31705a7674503cf74aecf18e5a10216b8db4b191..f04a2104012381963c891246f51b11f2bedf6581 100644
@@ -606,7 +606,7 @@ static inline UInt HASH_TT ( Addr64 key )
 
 static void setFastCacheEntry ( Addr64 key, ULong* tce, UInt* count )
 {
-   UInt cno = ((UInt)key) & VG_TT_FAST_MASK;
+   UInt cno = (UInt)VG_TT_FAST_HASH(key);
    VG_(tt_fast)[cno]  = tce;
    VG_(tt_fastN)[cno] = count;
    n_fast_updates++;
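
The point of routing the C side through VG_TT_FAST_HASH is that
setFastCacheEntry and the hand-written dispatchers must compute
identical indices, or the fast cache silently never hits.  A sketch of
a self-check one could run on ppc32 (check_hash_agreement is a
hypothetical helper, not in the patch; it assumes Valgrind's UWord and
the VG_TT_FAST_* definitions are in scope):

#include <assert.h>

/* For a range of 4-byte-aligned guest addresses, confirm the C macro
   matches what 'rlwinm 4,3,0,32-2-VG_TT_FAST_BITS,31-2' leaves in r4
   (a byte offset, i.e. the index scaled by sizeof(ULong*)). */
static void check_hash_agreement ( void )
{
   UWord a;
   for (a = 0x10000000UL; a < 0x10010000UL; a += 4) {
      UWord c_index  = VG_TT_FAST_HASH(a);
      UWord asm_offs = a & ((UWord)VG_TT_FAST_MASK << 2);
      assert((c_index << 2) == asm_offs);
   }
}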
diff --git a/coregrind/pub_core_transtab_asm.h b/coregrind/pub_core_transtab_asm.h
index 6041dcda3989f1dd07f66719aa696260d2db5847..24626151ae7cc3339e683525d97ae68d35c170df 100644
 #ifndef __PUB_CORE_TRANSTAB_ASM_H
 #define __PUB_CORE_TRANSTAB_ASM_H
 
-/* Constants for the fast translation lookup cache. */
+/* Constants for the fast translation lookup cache.  It is a direct
+   mapped cache, with 2^VG_TT_FAST_BITS entries.
+
+   On x86/amd64, the cache index is computed as
+   'address[VG_TT_FAST_BITS-1 : 0]'.
+
+   On ppc32/ppc64, the bottom two bits of instruction addresses are
+   zero, which means that function causes only 1/4 of the entries to
+   ever be used.  So instead the function is '(address >>u
+   2)[VG_TT_FAST_BITS-1 : 0]' on those targets. */
+
 #define VG_TT_FAST_BITS 15
 #define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS)
 #define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1)
 
+/* This macro isn't usable in asm land; nevertheless this seems
+   like a good place to put it. */
+#if defined(VGA_x86) || defined(VGA_amd64)
+#  define VG_TT_FAST_HASH(_addr)  ((((UWord)(_addr))     ) & VG_TT_FAST_MASK)
+#elif defined(VGA_ppc32) || defined(VGA_ppc64)
+#  define VG_TT_FAST_HASH(_addr)  ((((UWord)(_addr)) >> 2) & VG_TT_FAST_MASK)
+#else
+#  error "VG_TT_FAST_HASH: unknown platform"
+#endif
+
 #endif   // __PUB_CORE_TRANSTAB_ASM_H
 
 /*--------------------------------------------------------------------*/
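
As a concrete illustration of the two index schemes described in the
new header comment (a standalone check; the address 0x10004abc is just
an arbitrary 4-byte-aligned example):

#include <assert.h>

#define VG_TT_FAST_BITS 15
#define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS)
#define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1)

int main ( void )
{
   unsigned long addr = 0x10004abcUL;
   /* x86/amd64 scheme: the low VG_TT_FAST_BITS bits of the address */
   assert(( addr       & VG_TT_FAST_MASK) == 0x4abc);
   /* ppc32/64 scheme: discard the two always-zero bits first */
   assert(((addr >> 2) & VG_TT_FAST_MASK) == 0x12af);
   return 0;
}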