Subject: Optimise smp_{r,w}mb and mutex
From: Nick Piggin <npiggin@suse.de>
References: 471222 - LTC51356

powerpc: Optimise smp_wmb

Commit 2d1b2027626d5151fff8ef7c06ca8e7876a1a510 ("powerpc: Fixup
lwsync at runtime") removed __SUBARCH_HAS_LWSYNC, causing smp_wmb to
revert to eieio for all CPUs. This restores the behaviour introduced
in 74f0609526afddd88bef40b651da24f3167b10b2 ("powerpc: Optimise
smp_wmb on 64-bit processors").

powerpc: Optimise smp_rmb

After commit 598056d5af8fef1dbe8f96f5c2b641a528184e5a ("[POWERPC] Fix
rmb to order cacheable vs. noncacheable"), rmb() became a sync
instruction, which is needed to order cacheable against noncacheable
loads. However, smp_rmb() is #defined to rmb(), even though smp_rmb()
only needs to order cacheable accesses and can therefore be an lwsync.

This restores smp_rmb() performance by using lwsync there and updates
the comments.

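As a rough illustration of the distinction (an editorial sketch, not
part of the patch; variable and helper names are made up): the smp_
barriers only have to order accesses to cacheable system memory, as in
the usual flag/data handshake between two CPUs, so lwsync is enough.
rmb() and wmb() must additionally order non-cacheable (device) accesses
against cacheable ones, which only the full sync provides.

	/* CPU 0: publish the data, then set the flag */
	shared_data = compute_value();
	smp_wmb();	/* order the data store before the flag store */
	data_ready = 1;

	/* CPU 1: wait for the flag, then read the data */
	while (!data_ready)
		cpu_relax();
	smp_rmb();	/* order the flag load before the data load */
	use_value(shared_data);

A driver that reads a device status register and then cacheable memory,
by contrast, still needs the sync-based rmb().
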
powerpc: Optimise mutex

This implements an optimised mutex fastpath for powerpc, making use of
acquire and release barrier semantics. This takes the mutex
lock+unlock benchmark from 203 to 173 cycles on a G5.

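For context, this is roughly how the generic mutex core consumes these
fastpath hooks in kernels of this vintage (a paraphrased sketch of
kernel/mutex.c, not part of this patch). The lock fastpath is the 1 -> 0
transition of the counter with acquire ordering (ISYNC_ON_SMP after the
stwcx.), and the unlock fastpath is the 0 -> 1 transition with release
ordering (LWSYNC_ON_SMP before the lwarx), so the uncontended case never
executes a full sync:

	void __sched mutex_lock(struct mutex *lock)
	{
		might_sleep();
		/* 1 -> 0: lock taken; any other value takes the slowpath */
		__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
	}

	void __sched mutex_unlock(struct mutex *lock)
	{
		/* 0 -> 1: lock released; a non-positive result means waiters */
		__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
	}
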
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Olaf Hering <olh@suse.de>

---
 arch/powerpc/include/asm/mutex.h  |  135 ++++++++++++++++++++++++++++++++++++--
 arch/powerpc/include/asm/synch.h  |    4 +
 arch/powerpc/include/asm/system.h |   24 +++---
 3 files changed, 147 insertions(+), 16 deletions(-)

--- a/arch/powerpc/include/asm/mutex.h
+++ b/arch/powerpc/include/asm/mutex.h
@@ -1,9 +1,134 @@
 /*
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+ */
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+{
+	int t;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%1		# mutex trylock\n\
+	cmpw	0,%0,%2\n\
+	bne-	2f\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%3,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	"\n\
+2:"
+	: "=&r" (t)
+	: "r" (&v->counter), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_dec_return_lock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex lock\n\
+	addic	%0,%0,-1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+	LWSYNC_ON_SMP
+"1:	lwarx	%0,0,%1		# mutex unlock\n\
+	addic	%0,%0,1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1 \n\
+	bne-	1b"
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ *                         from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		fail_fn(count);
+}
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		return fail_fn(count);
+	return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+		fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock()		1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
  *
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to 0, and return 1 (success), or if the count
+ * was not 1, then return 0 (failure).
  */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+		return 1;
+	return 0;
+}
 
-#include <asm-generic/mutex-dec.h>
+#endif
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -5,6 +5,10 @@
 #include <linux/stringify.h>
 #include <asm/feature-fixups.h>
 
+#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
+#define __SUBARCH_HAS_LWSYNC
+#endif
+
 #ifndef __ASSEMBLY__
 extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
 extern void do_lwsync_fixups(unsigned long value, void *fixup_start,
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -23,15 +23,17 @@
  * read_barrier_depends() prevents data-dependent loads being reordered
  * across this point (nop on PPC).
  *
- * We have to use the sync instructions for mb(), since lwsync doesn't
- * order loads with respect to previous stores. Lwsync is fine for
- * rmb(), though. Note that rmb() actually uses a sync on 32-bit
- * architectures.
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
  *
- * For wmb(), we use sync since wmb is used in drivers to order
- * stores to system memory with respect to writes to the device.
- * However, smp_wmb() can be a lighter-weight lwsync or eieio barrier
- * on SMP since it is only used to order updates to system memory.
+ * For the smp_ barriers, ordering is for cacheable memory operations
+ * only. We have to use the sync instruction for smp_mb(), since lwsync
+ * doesn't order loads with respect to previous stores. Lwsync can be
+ * used for smp_rmb() and smp_wmb().
+ *
+ * However, on CPUs that don't support lwsync, lwsync actually maps to a
+ * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
 #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
@@ -45,14 +47,14 @@
 #ifdef CONFIG_SMP
 
 #ifdef __SUBARCH_HAS_LWSYNC
-# define SMPWMB lwsync
+# define SMPWMB LWSYNC
 #else
 # define SMPWMB eieio
 #endif
 
 #define smp_mb()	mb()
-#define smp_rmb()	rmb()
-#define smp_wmb()	__asm__ __volatile__ (__stringify(SMPWMB) : : :"memory")
+#define smp_rmb()	__asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
+#define smp_wmb()	__asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
 #define smp_read_barrier_depends()	read_barrier_depends()
 #else
 #define smp_mb()	barrier()