--- /dev/null
+From 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 Mon Sep 17 00:00:00 2001
+From: H. Peter Anvin <hpa@zytor.com>
+Date: Tue, 27 Jul 2010 17:01:49 -0700
+Subject: x86: Add memory modify constraints to xchg() and cmpxchg()
+
+From: H. Peter Anvin <hpa@zytor.com>
+
+commit 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 upstream.
+
+[ Backport to .32 by Tomáš Janoušek <tomi@nomi.cz> ]
+
+xchg() and cmpxchg() modify their memory operands, not merely read
+them, so their asm constraints should say so. For some versions of
+gcc the "memory" clobber alone has apparently been enough to make the
+compiler treat the operands as modified, but not for all.
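+
+Purely as an illustration (not part of this patch, and using a made-up
+helper name), the constraint change can be sketched like this: "+m"
+tells gcc that the asm both reads and writes the memory operand,
+instead of relying on the bare "memory" clobber alone:
+
+  static inline unsigned long example_xchg(volatile unsigned long *ptr,
+                                           unsigned long val)
+  {
+          /*
+           * "+m" marks *ptr as both read and written, so gcc may not
+           * cache its old contents across the asm; "+r" hands the new
+           * value in and gets the previous contents of *ptr back out.
+           * (xchg with a memory operand is implicitly locked.)
+           */
+          asm volatile("xchg %1, %0"
+                       : "+m" (*ptr), "+r" (val)
+                       : /* no separate inputs */
+                       : "memory");
+          return val;
+  }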
+
+Originally-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: H. Peter Anvin <hpa@zytor.com>
+Cc: Glauber Costa <glommer@redhat.com>
+Cc: Avi Kivity <avi@redhat.com>
+Cc: Peter Palfrader <peter@palfrader.org>
+Cc: Greg KH <gregkh@suse.de>
+Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
+Cc: Zachary Amsden <zamsden@redhat.com>
+Cc: Marcelo Tosatti <mtosatti@redhat.com>
+LKML-Reference: <4C4F7277.8050306@zytor.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
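+For reference only, the LOCK CMPXCHG8B retry loop that the rewritten
+set_64bit() below implements could be written in plain C roughly as
+follows (a sketch, not part of the patch; it assumes the kernel u64
+type and a gcc __sync builtin rather than the raw instruction):
+
+  static inline void set_64bit_sketch(volatile u64 *ptr, u64 value)
+  {
+          u64 prev = *ptr;        /* preload EDX:EAX; usually correct */
+          u64 old;
+
+          do {
+                  old = prev;
+                  /* Atomically: if (*ptr == old) *ptr = value; either
+                   * way, get back what *ptr held before the attempt. */
+                  prev = __sync_val_compare_and_swap(ptr, old, value);
+          } while (prev != old);  /* lost a race: retry with the
+                                     "new previous" value */
+  }
+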
+---
+ arch/x86/include/asm/cmpxchg_32.h | 65 +++++++++++---------------------------
+ arch/x86/include/asm/cmpxchg_64.h | 4 --
+ 2 files changed, 20 insertions(+), 49 deletions(-)
+
+--- a/arch/x86/include/asm/cmpxchg_32.h
++++ b/arch/x86/include/asm/cmpxchg_32.h
+@@ -17,60 +17,33 @@ struct __xchg_dummy {
+ #define __xg(x) ((struct __xchg_dummy *)(x))
+
+ /*
+- * The semantics of XCHGCMP8B are a bit strange, this is why
+- * there is a loop and the loading of %%eax and %%edx has to
+- * be inside. This inlines well in most cases, the cached
+- * cost is around ~38 cycles. (in the future we might want
+- * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
+- * might have an implicit FPU-save as a cost, so it's not
+- * clear which path to go.)
++ * CMPXCHG8B only writes to the target if we had the previous
++ * value in registers, otherwise it acts as a read and gives us the
++ * "new previous" value. That is why there is a loop. Preloading
++ * EDX:EAX is a performance optimization: in the common case it means
++ * we need only one locked operation.
+ *
+- * cmpxchg8b must be used with the lock prefix here to allow
+- * the instruction to be executed atomically, see page 3-102
+- * of the instruction set reference 24319102.pdf. We need
+- * the reader side to see the coherent 64bit value.
++ * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
++ * least an FPU save and/or %cr0.ts manipulation.
++ *
++ * cmpxchg8b must be used with the lock prefix here to allow the
++ * instruction to be executed atomically. We need the reader side
++ * to see the coherent 64bit value.
+ */
+-static inline void __set_64bit(unsigned long long *ptr,
+- unsigned int low, unsigned int high)
++static inline void set_64bit(volatile u64 *ptr, u64 value)
+ {
++ u32 low = value;
++ u32 high = value >> 32;
++ u64 prev = *ptr;
++
+ asm volatile("\n1:\t"
+- "movl (%1), %%eax\n\t"
+- "movl 4(%1), %%edx\n\t"
+ LOCK_PREFIX "cmpxchg8b %0\n\t"
+ "jnz 1b"
+- : "=m"(*ptr)
+- : "D" (ptr),
+- "b"(low),
+- "c"(high)
+- : "ax", "dx", "memory");
+-}
+-
+-static inline void __set_64bit_constant(unsigned long long *ptr,
+- unsigned long long value)
+-{
+- __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
++ : "=m" (*ptr), "+A" (prev)
++ : "b" (low), "c" (high)
++ : "memory");
+ }
+
+-#define ll_low(x) *(((unsigned int *)&(x)) + 0)
+-#define ll_high(x) *(((unsigned int *)&(x)) + 1)
+-
+-static inline void __set_64bit_var(unsigned long long *ptr,
+- unsigned long long value)
+-{
+- __set_64bit(ptr, ll_low(value), ll_high(value));
+-}
+-
+-#define set_64bit(ptr, value) \
+- (__builtin_constant_p((value)) \
+- ? __set_64bit_constant((ptr), (value)) \
+- : __set_64bit_var((ptr), (value)))
+-
+-#define _set_64bit(ptr, value) \
+- (__builtin_constant_p(value) \
+- ? __set_64bit(ptr, (unsigned int)(value), \
+- (unsigned int)((value) >> 32)) \
+- : __set_64bit(ptr, ll_low((value)), ll_high((value))))
+-
+ /*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+--- a/arch/x86/include/asm/cmpxchg_64.h
++++ b/arch/x86/include/asm/cmpxchg_64.h
+@@ -8,13 +8,11 @@
+
+ #define __xg(x) ((volatile long *)(x))
+
+-static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
++static inline void set_64bit(volatile u64 *ptr, u64 val)
+ {
+ *ptr = val;
+ }
+
+-#define _set_64bit set_64bit
+-
+ /*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,