Subject: Optimise smp_{r,w}mb and mutex
From: Nick Piggin <npiggin@suse.de>
References: 471222 - LTC51356

powerpc: Optimise smp_wmb

Commit 2d1b2027626d5151fff8ef7c06ca8e7876a1a510 ("powerpc: Fixup
lwsync at runtime") removed __SUBARCH_HAS_LWSYNC, causing smp_wmb to
revert to eieio for all CPUs. This restores the behaviour introduced
in 74f0609526afddd88bef40b651da24f3167b10b2 ("powerpc: Optimise
smp_wmb on 64-bit processors").

powerpc: Optimise smp_rmb

After commit 598056d5af8fef1dbe8f96f5c2b641a528184e5a ("[POWERPC] Fix
rmb to order cacheable vs. noncacheable"), rmb() became a sync
instruction, which is needed to order cacheable against noncacheable
loads. However, smp_rmb() is #defined to rmb(), even though smp_rmb()
only needs to order cacheable accesses and can therefore be an lwsync.

This restores smp_rmb() performance by using lwsync there and updates
the comments.

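As a rough illustration of the distinction (an editorial sketch, not
part of the patch; variable and helper names are made up): the smp_
barriers only have to order accesses to cacheable system memory, as in
the usual flag/data handshake between two CPUs, so lwsync is enough.
rmb() and wmb() must additionally order non-cacheable (device) accesses
against cacheable ones, which only the full sync provides.

	/* CPU 0: publish the data, then set the flag */
	shared_data = compute_value();
	smp_wmb();	/* order the data store before the flag store */
	data_ready = 1;

	/* CPU 1: wait for the flag, then read the data */
	while (!data_ready)
		cpu_relax();
	smp_rmb();	/* order the flag load before the data load */
	use_value(shared_data);

A driver that reads a device status register and then cacheable memory,
by contrast, still needs the sync-based rmb().
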
powerpc: Optimise mutex

This implements an optimised mutex fastpath for powerpc, making use of
acquire and release barrier semantics. This takes the mutex
lock+unlock benchmark from 203 to 173 cycles on a G5.

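For context, this is roughly how the generic mutex core consumes these
fastpath hooks in kernels of this vintage (a paraphrased sketch of
kernel/mutex.c, not part of this patch). The lock fastpath is the 1 -> 0
transition of the counter with acquire ordering (ISYNC_ON_SMP after the
stwcx.), and the unlock fastpath is the 0 -> 1 transition with release
ordering (LWSYNC_ON_SMP before the lwarx), so the uncontended case never
executes a full sync:

	void __sched mutex_lock(struct mutex *lock)
	{
		might_sleep();
		/* 1 -> 0: lock taken; any other value takes the slowpath */
		__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
	}

	void __sched mutex_unlock(struct mutex *lock)
	{
		/* 0 -> 1: lock released; a non-positive result means waiters */
		__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
	}
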
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Olaf Hering <olh@suse.de>

---
 arch/powerpc/include/asm/mutex.h  |  135 ++++++++++++++++++++++++++++++++++++--
 arch/powerpc/include/asm/synch.h  |    4 +
 arch/powerpc/include/asm/system.h |   24 +++---
 3 files changed, 147 insertions(+), 16 deletions(-)

--- a/arch/powerpc/include/asm/mutex.h
+++ b/arch/powerpc/include/asm/mutex.h
@@ -1,9 +1,134 @@
 /*
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+ */
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+{
+	int t;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%1		# mutex trylock\n\
+	cmpw	0,%0,%2\n\
+	bne-	2f\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%3,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	"\n\
+2:"
+	: "=&r" (t)
+	: "r" (&v->counter), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_dec_return_lock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex lock\n\
+	addic	%0,%0,-1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+	LWSYNC_ON_SMP
+"1:	lwarx	%0,0,%1		# mutex unlock\n\
+	addic	%0,%0,1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1 \n\
+	bne-	1b"
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ *                         from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		fail_fn(count);
+}
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		return fail_fn(count);
+	return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+		fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock()		1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
  *
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to 0, and return 1 (success), or if the count
+ * was not 1, then return 0 (failure).
  */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+		return 1;
+	return 0;
+}
 
-#include <asm-generic/mutex-dec.h>
+#endif
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -5,6 +5,10 @@
 #include <linux/stringify.h>
 #include <asm/feature-fixups.h>
 
+#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
+#define __SUBARCH_HAS_LWSYNC
+#endif
+
 #ifndef __ASSEMBLY__
 extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
 extern void do_lwsync_fixups(unsigned long value, void *fixup_start,
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -23,15 +23,17 @@
  * read_barrier_depends() prevents data-dependent loads being reordered
  * across this point (nop on PPC).
  *
- * We have to use the sync instructions for mb(), since lwsync doesn't
- * order loads with respect to previous stores. Lwsync is fine for
- * rmb(), though. Note that rmb() actually uses a sync on 32-bit
- * architectures.
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
  *
- * For wmb(), we use sync since wmb is used in drivers to order
- * stores to system memory with respect to writes to the device.
- * However, smp_wmb() can be a lighter-weight lwsync or eieio barrier
- * on SMP since it is only used to order updates to system memory.
+ * For the smp_ barriers, ordering is for cacheable memory operations
+ * only. We have to use the sync instruction for smp_mb(), since lwsync
+ * doesn't order loads with respect to previous stores. Lwsync can be
+ * used for smp_rmb() and smp_wmb().
+ *
+ * However, on CPUs that don't support lwsync, lwsync actually maps to a
+ * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
 #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
@@ -45,14 +47,14 @@
 #ifdef CONFIG_SMP
 
 #ifdef __SUBARCH_HAS_LWSYNC
-# define SMPWMB lwsync
+# define SMPWMB LWSYNC
 #else
 # define SMPWMB eieio
 #endif
 
 #define smp_mb()	mb()
-#define smp_rmb()	rmb()
-#define smp_wmb()	__asm__ __volatile__ (__stringify(SMPWMB) : : :"memory")
+#define smp_rmb()	__asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
+#define smp_wmb()	__asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
 #define smp_read_barrier_depends()	read_barrier_depends()
 #else
 #define smp_mb()	barrier()