#endif
/*
- * In the Atomic_* definitions below, memory ordering and atomicity are somewhat
- * conflated in an inconsistent manner. First, we have Atomic_{Read,Write},
- * which only guarantees single copy atomicity, i.e. that the read/write occurs
- * in an atomic fashion, but have no implication on memory ordering. The second
- * class of Atomics are all the non-unfenced operations excluding
- * Atomic_{Read,Write}*, which both imply atomicity and act as a memory barrier,
- * implying sequentially consistent ordering of the atomic operation with all
- * loads/stores prior to and after it.
- *
- * Since on x86, the second class of operations are associated with LOCK
- * semantics, assumptions have been made about the ordering these operations
- * imply on surrounding code (see for example the vmkernel's RefCount
- * implementation). As a result, on arm64 we have to provide these same
- * guarantees. We do this by making use of DMB barriers both before and after
- * the atomic ldrx/strx sequences. A barrier before and after is required to
- * avoid having part of the atomic operation reordered with surrounding code,
- * e.g. a store-load reordering of the strx with a following load outside the
- * Atomic_ op. For the first class of operations, Atomic_{Read,Write}, we do not
- * implement a barrier.
- *
- * This implementation of Atomic operations is suboptimal on arm64, since
- * both atomicity and memory ordering are fused together. Ideally the Atomic
- * operations would only imply atomicity, and an explicit memory barrier in the
- * surrounding code used to enforce ordering where necessary. This would eschew
- * the need for the DMBs. A middle ground can be implemented where we use the
- * arm64 load-acquire/store-release exclusive instructions to implement Atomics.
- * This would imply sequential consistency of the Atomic operations (but not
- * with any of the surrounding non-atomic operations) without the need for a
- * DMB. Using these without a DMB today can still result in problematic
- * reordering by the processor with surrounding non-atomic operations, e.g. a
- * store-load reordering with a stlxr. Future optimization for arm64 should
- * consider the wider change required at the call sites to minimize DMBs.
+ * There are two concepts involved when dealing with atomic accesses:
+ * 1. Atomicity of the access itself
+ * 2. Ordering of the access with respect to other reads and writes (as seen
+ *    by other processors/devices).
+ *
+ * Two examples help to clarify #2:
+ * a. Inc: A caller implementing a simple independent global event counter
+ * might not care if the compiler or processor visibly reorders the
+ * increment around other memory accesses.
+ *   b. Dec: A caller implementing a reference count absolutely *doesn't* want
+ *      the compiler or processor to visibly reorder writes past that
+ *      decrement: if that happened, the program could end up writing to
+ *      memory that had already been freed by another processor (the C11
+ *      sketch below spells out both cases).
+ *
+ * C11 has standardized a good model for expressing these orderings when doing
+ * atomics. It defines three *tiers* of ordering:
+ * 1. Sequential Consistency (every processor sees the same total order of
+ * events)
+ *
+ * 2. Acquire/Release ordering (roughly, everybody can agree previous events
+ * have completed, but they might disagree on the ordering of previous
+ * independent events).
+ *
+ * The relative ordering provided by this tier is sufficient for common
+ * locking and initialization activities, but is insufficient for unusual
+ *    synchronization schemes (e.g. IRIW, aka Independent Reads of Independent
+ *    Writes, designs such as Dekker's algorithm, Peterson's algorithm, etc.)
+ *
+ * In other words, this tier is close in behavior to Sequential Consistency
+ * in much the same way a General-Relativity universe is close to a
+ * Newtonian universe.
+ * 3. Relaxed (i.e. unordered/unfenced)
+ *
+ * In the C11 standard's terminology for atomic memory ordering,
+ * - in case (a) we want "relaxed" ordering for performance, and
+ * - in case (b) we want "sequentially consistent" ordering (or perhaps the
+ * only slightly weaker "release" ordering) for correctness.
+ *
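+ * As a minimal illustrative sketch (written against C11 <stdatomic.h>
+ * directly rather than this file's wrappers; the object and function names
+ * are hypothetical), the two cases above look like:
+ *
+ *    #include <stdatomic.h>
+ *
+ *    static atomic_uint eventCount;   // case (a): statistics only
+ *    static atomic_uint refCount;     // case (b): guards an object lifetime
+ *
+ *    void
+ *    CountEvent(void)
+ *    {
+ *       // Relaxed: atomic, but imposes no ordering on surrounding accesses.
+ *       atomic_fetch_add_explicit(&eventCount, 1, memory_order_relaxed);
+ *    }
+ *
+ *    void
+ *    DropReference(Object *obj)
+ *    {
+ *       // Sequentially consistent: earlier writes to *obj cannot appear to
+ *       // move past the decrement that releases the last reference.
+ *       if (atomic_fetch_sub_explicit(&refCount, 1,
+ *                                     memory_order_seq_cst) == 1) {
+ *          ObjectFree(obj);            // hypothetical destructor
+ *       }
+ *    }
+ *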
+ * There are standardized mappings of operations to orderings for every
+ * processor architecture. See
+ * - https://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+ * - http://preshing.com/20120913/acquire-and-release-semantics/
+ *
+ * In this file:
+ * 1. all RMW (Read/Modify/Write) operations are sequentially consistent.
+ * This includes operations like Atomic_IncN, Atomic_ReadIfEqualWriteN,
+ * Atomic_ReadWriteN, etc.
+ * 2. all R and W operations are relaxed. This includes operations like
+ * Atomic_WriteN, Atomic_ReadN, Atomic_TestBitN, etc.
+ *
+ * The routines below ensure that both the CPU and the compiler honor these
+ * ordering constraints.
+ *
+ * Notes:
+ * 1. Since R-only and W-only operations do not provide ordering, callers
+ *    using them for synchronizing operations like double-checked
+ *    initialization or releasing spinlocks must provide extra barriers (see
+ *    the sketch after these notes).
+ * 2. This implementation of Atomic operations is suboptimal. On x86, simple
+ *    reads and writes already have acquire/release semantics at the hardware
+ *    level. On arm64, there are dedicated instructions for sequentially
+ *    consistent reads and writes (the same instructions are used for
+ *    acquire/release). Neither capability is exposed to R-only or W-only
+ *    callers here.
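+ *
+ * As a sketch of note 1 (obj, objReady and InitObject are hypothetical, and
+ * the release fence is shown with C11's atomic_thread_fence; any equivalent
+ * barrier available at the call site works), publishing an initialized
+ * object through a W-only operation needs its own barrier:
+ *
+ *    InitObject(&obj);                           // plain stores
+ *    atomic_thread_fence(memory_order_release);  // order init before flag
+ *    Atomic_Write32(&objReady, 1);               // relaxed W-only write
+ *
+ * whereas an RMW of the first class, e.g. Atomic_ReadWrite32(&objReady, 1),
+ * would not require the explicit fence.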
*
* For further details on x86 and ARM memory ordering see
* https://wiki.eng.vmware.com/ARM/MemoryOrdering.
/*
- * All the assembly code is tricky and written conservatively.
- * For example, to make sure gcc won't introduce copies,
- * we force the addressing mode like this:
- *
- * "xchgl %0, (%1)"
- * : "=r" (val)
- * : "r" (&var->value),
- * "0" (val)
- * : "memory"
- *
- * - edward
- *
- * Actually - turns out that gcc never generates memory aliases (it
- * still does generate register aliases though), so we can be a bit
- * more agressive with the memory constraints. The code above can be
- * modified like this:
- *
- * "xchgl %0, %1"
- * : "=r" (val),
- * "=m" (var->value),
- * : "0" (val),
- * "1" (var->value)
- *
- * The advantages are that gcc can use whatever addressing mode it
- * likes to access the memory value, and that we dont have to use a
- * way-too-generic "memory" clobber as there is now an explicit
- * declaration that var->value is modified.
- *
- * see also /usr/include/asm/atomic.h to convince yourself this is a
- * valid optimization.
- *
- * - walken
+ * The Read/Modify/Write operations on x86/x64 are all written with a
+ * "memory" clobber. This ensures the compiler treats the operation as a
+ * full compiler barrier, flushing any pending/cached state currently
+ * residing in registers.
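+ *
+ * For illustration, an empty asm carrying only that clobber,
+ *
+ *    __asm__ __volatile__("" ::: "memory");
+ *
+ * is the classic compiler-only barrier. Adding "memory" to the clobber list
+ * of the LOCKed sequences below has the same effect on the compiler, on top
+ * of the hardware ordering the LOCK prefix already provides.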
*/
#if defined _MSC_VER && _MSC_VER < 1600 && defined __x86_64__
: "=q" (val),
"+m" (var->value)
: "0" (val)
+ : "memory"
);
return val;
#elif defined _MSC_VER && _MSC_VER >= 1600
"+m" (var->value)
: "q" (newVal),
"0" (oldVal)
- : "cc"
+ : "cc", "memory"
);
return val;
__asm__ __volatile__(
"xchgl %0, %1"
: "=r" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "0" (val)
+ : "memory"
);
return val;
#endif /* VM_X86_ANY */
__asm__ __volatile__(
"lock; cmpxchgl %2, %1"
: "=a" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "r" (newVal),
- "0" (oldVal)
- : "cc"
+ "0" (oldVal)
+ : "cc", "memory"
);
return val;
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
#if _MSC_VER >= 1310
return _InterlockedCompareExchange((long *)&var->value,
- (long)newVal,
- (long)oldVal);
+ (long)newVal,
+ (long)oldVal);
#else
#pragma warning(push)
#pragma warning(disable : 4035) // disable no-return warning
__asm__ __volatile__(
"lock; cmpxchgq %2, %1"
: "=a" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "r" (newVal),
- "0" (oldVal)
- : "cc"
+ "0" (oldVal)
+ : "cc", "memory"
);
return val;
#endif //VM_ARM_V7
#elif defined _MSC_VER
return _InterlockedCompareExchange64((__int64 *)&var->value,
- (__int64)newVal,
- (__int64)oldVal);
+ (__int64)newVal,
+ (__int64)oldVal);
#else
#error No compiler defined for Atomic_ReadIfEqualWrite64
#endif
"lock; andl %1, %0"
: "+m" (var->value)
: "ri" (val)
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
"lock; orl %1, %0"
: "+m" (var->value)
: "ri" (val)
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
"lock; xorl %1, %0"
: "+m" (var->value)
: "ri" (val)
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
"lock; xorq %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#endif
#elif defined _MSC_VER
"lock; addl %1, %0"
: "+m" (var->value)
: "ri" (val)
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
"lock; subl %1, %0"
: "+m" (var->value)
: "ri" (val)
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
"lock; incl %0"
: "+m" (var->value)
:
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
"lock; decl %0"
: "+m" (var->value)
:
- : "cc"
+ : "cc", "memory"
);
#endif /* VM_X86_ANY */
#elif defined _MSC_VER
__asm__ __volatile__(
"lock; xaddl %0, %1"
: "=r" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "0" (val)
- : "cc"
+ : "cc", "memory"
);
return val;
#endif /* VM_X86_ANY */
"lock; cmpxchgq %3, %0" "\n\t"
"sete %1"
: "+m" (*var),
- "=qm" (equal),
- "=a" (dummy)
+ "=qm" (equal),
+ "=a" (dummy)
: "r" (newVal),
"2" (oldVal)
- : "cc"
+ : "cc", "memory"
);
#else /* 32-bit version for non-ARM */
typedef struct {
"lock; cmpxchg8b (%3)" "\n\t"
"xchgl %%ebx, %6" "\n\t"
"sete %0"
- : "=qm" (equal),
- "=a" (dummy1),
- "=d" (dummy2)
+ : "=qm" (equal),
+ "=a" (dummy1),
+ "=d" (dummy2)
: /*
* See the "Rules for __asm__ statements in __PIC__ code" above: %3
* must use a register class which does not contain %ebx.
"lock; cmpxchg8b %0" "\n\t"
"sete %1"
: "+m" (*var),
- "=qm" (equal),
- "=a" (dummy1),
- "=d" (dummy2)
+ "=qm" (equal),
+ "=a" (dummy1),
+ "=d" (dummy2)
: "2" (((S_uint64 *)&oldVal)->lowValue),
"3" (((S_uint64 *)&oldVal)->highValue),
"b" (((S_uint64 *)&newVal)->lowValue),
"c" (((S_uint64 *)&newVal)->highValue)
- : "cc"
+ : "cc", "memory"
);
# endif
#endif
"lock; cmpxchgl %3, %0" "\n\t"
"sete %1"
: "+m" (*var),
- "=qm" (equal),
- "=a" (dummy)
+ "=qm" (equal),
+ "=a" (dummy)
: "r" (newVal),
"2" (oldVal)
- : "cc"
+ : "cc", "memory"
);
return equal;
#endif /* VM_X86_ANY */
return _InterlockedAdd64((__int64 *)&var->value, 0);
#elif defined _MSC_VER && defined __i386__
# pragma warning(push)
-# pragma warning(disable : 4035) // disable no-return warning
+# pragma warning(disable : 4035) // disable no-return warning
{
__asm mov ecx, var
__asm mov edx, ecx
__asm__ __volatile__(
"lock; xaddq %0, %1"
: "=r" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "0" (val)
- : "cc"
+ : "cc", "memory"
);
return val;
#elif defined _MSC_VER
"lock; addq %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#endif
#elif defined _MSC_VER
"lock; subq %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#endif
#elif defined _MSC_VER
"lock; incq %0"
: "+m" (var->value)
:
- : "cc"
+ : "cc", "memory"
);
#elif defined _MSC_VER
_InterlockedIncrement64((__int64 *)&var->value);
"lock; decq %0"
: "+m" (var->value)
:
- : "cc"
+ : "cc", "memory"
);
#elif defined _MSC_VER
_InterlockedDecrement64((__int64 *)&var->value);
__asm__ __volatile__(
"xchgq %0, %1"
: "=r" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "0" (val)
+ : "memory"
);
return val;
#elif defined _MSC_VER
"lock; orq %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined _MSC_VER
_InterlockedOr64((__int64 *)&var->value, (__int64)val);
"lock; andq %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined _MSC_VER
_InterlockedAnd64((__int64 *)&var->value, (__int64)val);
"lock; btsq %1, %0"
: "+m" (var->value)
: "ri" ((uint64)bit)
- : "cc"
+ : "cc", "memory"
);
#else
uint64 oldVal;
"lock; btrq %1, %0"
: "+m" (var->value)
: "ri" ((uint64)bit)
- : "cc"
+ : "cc", "memory"
);
#else
uint64 oldVal;
"lock; btsq %2, %1; setc %0"
: "=rm" (out), "+m" (var->value)
: "rJ" ((uint64)bit)
- : "cc"
+ : "cc", "memory"
);
return out;
#else
__asm__ __volatile__(
"xchgw %0, %1"
: "=r" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "0" (val)
+ : "memory"
);
return val;
#elif defined VM_ARM_V7
__asm__ __volatile__(
"lock; cmpxchgw %2, %1"
: "=a" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "r" (newVal),
- "0" (oldVal)
- : "cc"
+ "0" (oldVal)
+ : "cc", "memory"
);
return val;
#elif defined VM_ARM_V7
"lock; andw %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_V7
register volatile uint16 res;
"lock; orw %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_V7
register volatile uint16 res;
"lock; xorw %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_V7
register volatile uint16 res;
"lock; addw %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_V7
register volatile uint16 res;
"lock; subw %1, %0"
: "+m" (var->value)
: "re" (val)
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_V7
register volatile uint16 res;
"lock; incw %0"
: "+m" (var->value)
:
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_ANY
Atomic_Add16(var, 1);
"lock; decw %0"
: "+m" (var->value)
:
- : "cc"
+ : "cc", "memory"
);
#elif defined VM_ARM_ANY
Atomic_Sub16(var, 1);
__asm__ __volatile__(
"lock; xaddw %0, %1"
: "=r" (val),
- "+m" (var->value)
+ "+m" (var->value)
: "0" (val)
- : "cc"
+ : "cc", "memory"
);
return val;
#elif defined VM_ARM_V7
in val) \
{ \
return (out)(cast)Atomic_ReadWrite ## size(var, \
- (uint ## size)(cast)val); \
+ (uint ## size)(cast)val); \
} \
\
\