Subject: Optimise smp_{r,w}mb and mutex
From: Nick Piggin <npiggin@suse.de>
References: 471222 - LTC51356

powerpc: Optimise smp_wmb

Change 2d1b2027626d5151fff8ef7c06ca8e7876a1a510 ("powerpc: Fixup
lwsync at runtime") removed __SUBARCH_HAS_LWSYNC, causing smp_wmb to
revert to eieio for all CPUs. This restores the behaviour
introduced in 74f0609526afddd88bef40b651da24f3167b10b2 ("powerpc:
Optimise smp_wmb on 64-bit processors").
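
For reference, a condensed sketch of how smp_wmb() ends up being selected
once __SUBARCH_HAS_LWSYNC is back (macro names taken from the asm/synch.h
and asm/system.h hunks below; the surrounding CONFIG_SMP plumbing is
omitted, so this is illustrative rather than a literal copy of the hunks):

    #if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
    #define __SUBARCH_HAS_LWSYNC
    #endif

    #ifdef __SUBARCH_HAS_LWSYNC
    # define SMPWMB  LWSYNC   /* runtime-fixed-up lwsync from asm/synch.h */
    #else
    # define SMPWMB  eieio    /* lighter-weight store barrier otherwise */
    #endif

    #define smp_wmb()  __asm__ __volatile__ (stringify_in_c(SMPWMB) : : : "memory")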

powerpc: Optimise smp_rmb

After commit 598056d5af8fef1dbe8f96f5c2b641a528184e5a ("[POWERPC] Fix
rmb to order cacheable vs. noncacheable"), rmb() becomes a sync
instruction, which is needed to order cacheable vs. noncacheable loads.
However smp_rmb() is #defined to rmb(), and smp_rmb() can be an
lwsync.

This restores smp_rmb() performance by using lwsync there and updates
the comments.
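
To illustrate the distinction (hypothetical consumer and driver snippets,
not part of the patch; shared, regs and desc are made-up structures):
smp_rmb() only has to order loads from cacheable system memory against
each other, which lwsync handles, while rmb() also has to order
non-cacheable (device) loads and therefore must remain a full sync.

    /* SMP consumer of data published by another CPU: cacheable loads
     * only, so the lwsync-based smp_rmb() is sufficient. */
    if (shared->ready) {
            smp_rmb();      /* order load of ->ready before load of ->data */
            val = shared->data;
    }

    /* Driver reading a device register and then DMA'd memory:
     * non-cacheable vs cacheable loads, so the sync in rmb() is needed. */
    status = in_be32(&regs->status);
    rmb();                  /* full sync orders the MMIO load first */
    len = desc->len;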

powerpc: Optimise mutex

This implements an optimised mutex fastpath for powerpc, making use of
acquire and release barrier semantics. This takes the mutex
lock+unlock benchmark from 203 to 173 cycles on a G5.
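
In outline, the fastpaths follow the include/asm-generic/mutex-dec.h
algorithm named in the header comment below; a simplified sketch of that
generic version is shown here. The patch replaces these atomic_*_return()
calls (which on powerpc carry heavier barriers than needed) with
lwarx/stwcx. loops providing only the acquire or release semantics
actually required:

    static inline void
    __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
    {
            /* 1 -> 0 takes the lock; a negative result means contention */
            if (unlikely(atomic_dec_return(count) < 0))
                    fail_fn(count);         /* slow path: block */
    }

    static inline void
    __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
    {
            /* 0 -> 1 releases the lock; <= 0 means waiters may need waking */
            if (unlikely(atomic_inc_return(count) <= 0))
                    fail_fn(count);         /* slow path: wake a waiter */
    }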

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Olaf Hering <olh@suse.de>
---
 arch/powerpc/include/asm/mutex.h  |  135 ++++++++++++++++++++++++++++++++++++--
 arch/powerpc/include/asm/synch.h  |    4 +
 arch/powerpc/include/asm/system.h |   24 +++---
 3 files changed, 147 insertions(+), 16 deletions(-)

--- a/arch/powerpc/include/asm/mutex.h
+++ b/arch/powerpc/include/asm/mutex.h
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%1		# mutex trylock\n\
+	: "r" (&v->counter), "r" (old), "r" (new)
+static inline int __mutex_dec_return_lock(atomic_t *v)
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex lock\n\
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex unlock\n\
+	: "r" (&v->counter)
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		return fail_fn(count);
+ * __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+	if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+#define __mutex_slowpath_needs_to_unlock()	1
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to 0, and return 1 (success), or if the count
+ * was not 1, then return 0 (failure).
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
-#include <asm-generic/mutex-dec.h>
+#endif
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
 #include <linux/stringify.h>
 #include <asm/feature-fixups.h>
+#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
+#define __SUBARCH_HAS_LWSYNC
+#endif
 extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
 extern void do_lwsync_fixups(unsigned long value, void *fixup_start,
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
  * read_barrier_depends() prevents data-dependent loads being reordered
  * across this point (nop on PPC).
- * We have to use the sync instructions for mb(), since lwsync doesn't
- * order loads with respect to previous stores. Lwsync is fine for
- * rmb(), though. Note that rmb() actually uses a sync on 32-bit
- * architectures.
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
- * For wmb(), we use sync since wmb is used in drivers to order
- * stores to system memory with respect to writes to the device.
- * However, smp_wmb() can be a lighter-weight lwsync or eieio barrier
- * on SMP since it is only used to order updates to system memory.
+ * For the smp_ barriers, ordering is for cacheable memory operations
+ * only. We have to use the sync instruction for smp_mb(), since lwsync
+ * doesn't order loads with respect to previous stores. Lwsync can be
+ * used for smp_rmb() and smp_wmb().
+ *
+ * However, on CPUs that don't support lwsync, lwsync actually maps to a
+ * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
 #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #ifdef __SUBARCH_HAS_LWSYNC
-# define SMPWMB lwsync
+# define SMPWMB LWSYNC
 #else
 # define SMPWMB eieio
 #endif
 #define smp_mb()	mb()
-#define smp_rmb()	rmb()
-#define smp_wmb()	__asm__ __volatile__ (__stringify(SMPWMB) : : :"memory")
+#define smp_rmb()	__asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
+#define smp_wmb()	__asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
 #define smp_read_barrier_depends()	read_barrier_depends()
 #else
 #define smp_mb()	barrier()
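
As a closing illustration (a user-space C11 analogue, not kernel code), this
is the message-passing pattern that the smp_wmb()/smp_rmb() pair orders; on
the CPUs targeted above both kernel barriers can be a single lwsync:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int payload;                 /* ordinary (non-atomic) data */
    static atomic_int ready;            /* publication flag */

    static void *producer(void *arg)
    {
            (void)arg;
            payload = 42;                                   /* write the data */
            atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
            atomic_store_explicit(&ready, 1, memory_order_relaxed);
            return NULL;
    }

    static void *consumer(void *arg)
    {
            (void)arg;
            while (!atomic_load_explicit(&ready, memory_order_relaxed))
                    ;                                       /* wait for the flag */
            atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
            printf("%d\n", payload);                        /* guaranteed to see 42 */
            return NULL;
    }

    int main(void)
    {
            pthread_t p, c;
            pthread_create(&c, NULL, consumer, NULL);
            pthread_create(&p, NULL, producer, NULL);
            pthread_join(p, NULL);
            pthread_join(c, NULL);
            return 0;
    }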