Subject: Optimise smp_{r,w}mb and mutex
From: Nick Piggin <npiggin@suse.de>
References: 471222 - LTC51356

powerpc: Optimise smp_wmb

Change 2d1b2027626d5151fff8ef7c06ca8e7876a1a510 ("powerpc: Fixup
lwsync at runtime") removed __SUBARCH_HAS_LWSYNC, causing smp_wmb to
revert to eieio for all CPUs. This restores the behaviour
introduced in 74f0609526afddd88bef40b651da24f3167b10b2 ("powerpc:
Optimise smp_wmb on 64-bit processors").
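
As an illustration only (not part of this patch), smp_wmb() covers the
producer side of ordinary cacheable producer/consumer code, which is
why the lighter-weight lwsync is sufficient where the CPU supports it;
the data/flag names below are hypothetical:

  static int data;
  static int flag;

  static void publish(int value)
  {
          data = value;
          smp_wmb();      /* order the data store before the flag store */
          flag = 1;       /* a consumer may now read data */
  }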

powerpc: Optimise smp_rmb

After commit 598056d5af8fef1dbe8f96f5c2b641a528184e5a ("[POWERPC] Fix
rmb to order cacheable vs. noncacheable"), rmb() becomes a sync
instruction, which is needed to order cacheable vs noncacheable loads.
However smp_rmb() is #defined to rmb(), and smp_rmb() can be an
lwsync.

This restores smp_rmb() performance by using lwsync there and updates
the comments.
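
Again purely as an illustration (not part of the patch), the consumer
side pairs smp_rmb() with the producer's smp_wmb() from the sketch
above; both accesses are cacheable, so lwsync is enough here, whereas
rmb() ordering against non-cacheable (device) memory still needs sync:

  static int read_published(void)
  {
          if (flag) {
                  smp_rmb();      /* order the flag load before the data load */
                  return data;
          }
          return -1;              /* nothing published yet */
  }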

powerpc: Optimise mutex

This implements an optimised mutex fastpath for powerpc, making use of
acquire and release barrier semantics. This takes the mutex
lock+unlock benchmark from 203 to 173 cycles on a G5.
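
The fastpath follows the asm-generic/mutex-dec.h counter protocol: a
count of 1 means unlocked, 0 locked, and a negative value locked with
possible waiters. Roughly, in plain C (the slowpath helper names here
are illustrative, not part of the patch):

  void mutex_lock_fastpath(struct mutex *lock)
  {
          /* acquire: the real code issues isync after the decrement */
          if (__mutex_dec_return_lock(&lock->count) < 0)
                  mutex_lock_slowpath(lock);      /* count was not 1 */
  }

  void mutex_unlock_fastpath(struct mutex *lock)
  {
          /* release: the real code issues lwsync before the increment */
          if (__mutex_inc_return_unlock(&lock->count) <= 0)
                  mutex_unlock_slowpath(lock);    /* waiters may need waking */
  }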

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Olaf Hering <olh@suse.de>

---
 arch/powerpc/include/asm/mutex.h  | 135 ++++++++++++++++++++++++++++++++++++--
 arch/powerpc/include/asm/synch.h  |   4 +
 arch/powerpc/include/asm/system.h |  24 +++---
 3 files changed, 147 insertions(+), 16 deletions(-)

--- a/arch/powerpc/include/asm/mutex.h
+++ b/arch/powerpc/include/asm/mutex.h
@@ -1,9 +1,134 @@
 /*
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+ */
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+{
+	int t;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%1		# mutex trylock\n\
+	cmpw	0,%0,%2\n\
+	bne-	2f\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%3,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	"\n\
+2:"
+	: "=&r" (t)
+	: "r" (&v->counter), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_dec_return_lock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex lock\n\
+	addic	%0,%0,-1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+	LWSYNC_ON_SMP
+"1:	lwarx	%0,0,%1		# mutex unlock\n\
+	addic	%0,%0,1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1 \n\
+	bne-	1b"
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		fail_fn(count);
+}
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		return fail_fn(count);
+	return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+		fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock() 1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
  *
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to 0, and return 1 (success), or if the count
+ * was not 1, then return 0 (failure).
  */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+		return 1;
+	return 0;
+}
 
-#include <asm-generic/mutex-dec.h>
+#endif
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -5,6 +5,10 @@
 #include <linux/stringify.h>
 #include <asm/feature-fixups.h>
 
+#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
+#define __SUBARCH_HAS_LWSYNC
+#endif
+
 #ifndef __ASSEMBLY__
 extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
 extern void do_lwsync_fixups(unsigned long value, void *fixup_start,
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -23,15 +23,17 @@
  * read_barrier_depends() prevents data-dependent loads being reordered
  * across this point (nop on PPC).
  *
- * We have to use the sync instructions for mb(), since lwsync doesn't
- * order loads with respect to previous stores. Lwsync is fine for
- * rmb(), though. Note that rmb() actually uses a sync on 32-bit
- * architectures.
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
  *
- * For wmb(), we use sync since wmb is used in drivers to order
- * stores to system memory with respect to writes to the device.
- * However, smp_wmb() can be a lighter-weight lwsync or eieio barrier
- * on SMP since it is only used to order updates to system memory.
+ * For the smp_ barriers, ordering is for cacheable memory operations
+ * only. We have to use the sync instruction for smp_mb(), since lwsync
+ * doesn't order loads with respect to previous stores. Lwsync can be
+ * used for smp_rmb() and smp_wmb().
+ *
+ * However, on CPUs that don't support lwsync, lwsync actually maps to a
+ * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
 #define mb() __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb() __asm__ __volatile__ ("sync" : : : "memory")
@@ -45,14 +47,14 @@
 #ifdef CONFIG_SMP
 
 #ifdef __SUBARCH_HAS_LWSYNC
-# define SMPWMB lwsync
+# define SMPWMB LWSYNC
 #else
 # define SMPWMB eieio
 #endif
 
 #define smp_mb() mb()
-#define smp_rmb() rmb()
-#define smp_wmb() __asm__ __volatile__ (__stringify(SMPWMB) : : :"memory")
+#define smp_rmb() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
+#define smp_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
 #define smp_read_barrier_depends() read_barrier_depends()
 #else
 #define smp_mb() barrier()