Add missing memory constraints for vm_atomic RMW (Read/Modify/Write) instructions

author Oliver Kurth <okurth@vmware.com>

Fri, 23 Mar 2018 22:05:35 +0000 (15:05 -0700)

committer Oliver Kurth <okurth@vmware.com>

Fri, 23 Mar 2018 22:05:35 +0000 (15:05 -0700)
author Oliver Kurth <okurth@vmware.com>
Fri, 23 Mar 2018 22:05:35 +0000 (15:05 -0700)
committer Oliver Kurth <okurth@vmware.com>
Fri, 23 Mar 2018 22:05:35 +0000 (15:05 -0700)
diff --git a/open-vm-tools/lib/include/vm_atomic.h b/open-vm-tools/lib/include/vm_atomic.h

index 887d042d02a3f2608e4164149e6fd1a3aa12d7f3..d0ff097cca18f4f9065b81d2d34dc26ea97c9549 100644 (file)
--- a/open-vm-tools/lib/include/vm_atomic.h
+++ b/open-vm-tools/lib/include/vm_atomic.h
@@ -49,38 +49,68 @@ extern "C" {
  #endif
  
  /*
- * In the Atomic_* definitions below, memory ordering and atomicity are somewhat
- * conflated in an inconsistent manner. First, we have Atomic_{Read,Write},
- * which only guarantees single copy atomicity, i.e. that the read/write occurs
- * in an atomic fashion, but have no implication on memory ordering. The second
- * class of Atomics are all the non-unfenced operations excluding
- * Atomic_{Read,Write}*, which both imply atomicity and act as a memory barrier,
- * implying sequentially consistent ordering of the atomic operation with all
- * loads/stores prior to and after it.
- *
- * Since on x86, the second class of operations are associated with LOCK
- * semantics, assumptions have been made about the ordering these operations
- * imply on surrounding code (see for example the vmkernel's RefCount
- * implementation). As a result, on arm64 we have to provide these same
- * guarantees. We do this by making use of DMB barriers both before and after
- * the atomic ldrx/strx sequences. A barrier before and after is required to
- * avoid having part of the atomic operation reordered with surrounding code,
- * e.g. a store-load reordering of the strx with a following load outside the
- * Atomic_ op. For the first class of operations, Atomic_{Read,Write}, we do not
- * implement a barrier.
- *
- * This implementation of Atomic operations is suboptimal on arm64, since
- * both atomicity and memory ordering are fused together. Ideally the Atomic
- * operations would only imply atomicity, and an explicit memory barrier in the
- * surrounding code used to enforce ordering where necessary. This would eschew
- * the need for the DMBs. A middle ground can be implemented where we use the
- * arm64 load-acquire/store-release exclusive instructions to implement Atomics.
- * This would imply sequential consistency of the Atomic operations (but not
- * with any of the surrounding non-atomic operations) without the need for a
- * DMB. Using these without a DMB today can still result in problematic
- * reordering by the processor with surrounding non-atomic operations, e.g. a
- * store-load reordering with a stlxr. Future optimization for arm64 should
- * consider the wider change required at the call sites to minimize DMBs.
+ * There are two concepts involved when dealing with atomic accesses:
+ * 1. Atomicity of the access itself
+ * 2. Ordering of the access with respect to other reads&writes (from the view
+ *    of other processors/devices).
+ *
+ * Two examples help to clarify #2:
+ * a. Inc: A caller implementing a simple independent global event counter
+ *         might not care if the compiler or processor visibly reorders the
+ *         increment around other memory accesses.
+ * b. Dec: A caller implementing a reference count absolutely *doesn't* want
+ *         the compiler or processor to visibly reordering writes after that
+ *         decrement: if that happened, the program could then end up writing
+ *         to memory that was freed by another processor.
+ *
+ * C11 has standardized a good model for expressing these orderings when doing
+ * atomics. It defines three *tiers* of ordering:
+ * 1. Sequential Consistency (every processor sees the same total order of
+ *    events)
+ *
+ * 2. Acquire/Release ordering (roughly, everybody can agree previous events
+ *    have completed, but they might disagree on the ordering of previous
+ *    independent events).
+ *
+ *    The relative ordering provided by this tier is sufficient for common
+ *    locking and initialization activities, but is insufficient for unusual
+ *    synchronization schemes (e.g. IRIW aka Independent Read Independent
+ *    Write designs such Dekker's algorithm, Peterson's algorithm, etc.)
+ *
+ *    In other words, this tier is close in behavior to Sequential Consistency
+ *    in much the same way a General-Relativity universe is close to a
+ *    Newtonian universe.
+ * 3. Relaxed (i.e unordered/unfenced)
+ *
+ * In C11 standard's terminology for atomic memory ordering,
+ * - in case (a) we want "relaxed" ordering for perf and,
+ * - in case (b) we want "sequentially consistent" ordering (or perhaps the
+ *   only slightly weaker "release" ordering) for correctness.
+ *
+ * There are standardized mappings of operations to orderings for every
+ * processor architecture. See
+ * - https://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+ * - http://preshing.com/20120913/acquire-and-release-semantics/
+ *
+ * In this file:
+ * 1. all RMW (Read/Modify/Write) operations are sequentially consistent.
+ *    This includes operations like Atomic_IncN, Atomic_ReadIfEqualWriteN,
+ *    Atomic_ReadWriteN, etc.
+ * 2. all R and W operations are relaxed. This includes operations like
+ *    Atomic_WriteN, Atomic_ReadN, Atomic_TestBitN, etc.
+ *
+ * The below routines of course ensure both the CPU and compiler honor the
+ * ordering constraint.
+ *
+ * Notes:
+ * 1. Since R-only and W-only operations do not provide ordering, callers
+ *    using them for synchronizing operations like double-checked
+ *    initialization or releasing spinlocks must provide extra barriers.
+ * 2. This implementation of Atomic operations is suboptimal. On x86,simple
+ *    reads and writes have acquire/release semantics at the hardware level.
+ *    On arm64, we have separate instructions for sequentially consistent
+ *    reads and writes (the same instructions are used for acquire/release).
+ *    Neither of these are exposed for R-only or W-only callers.
   *
   * For further details on x86 and ARM memory ordering see
   * https://wiki.eng.vmware.com/ARM/MemoryOrdering.
@@ -222,38 +252,10 @@ Atomic_VolatileToAtomic64(volatile uint64 *var)  // IN:
  
  
  /*
- * All the assembly code is tricky and written conservatively.
- * For example, to make sure gcc won't introduce copies,
- * we force the addressing mode like this:
- *
- *    "xchgl %0, (%1)"
- *    : "=r" (val)
- *    : "r" (&var->value),
- *      "0" (val)
- *    : "memory"
- *
- * - edward
- *
- * Actually - turns out that gcc never generates memory aliases (it
- * still does generate register aliases though), so we can be a bit
- * more agressive with the memory constraints. The code above can be
- * modified like this:
- *
- *    "xchgl %0, %1"
- *    : "=r" (val),
- *      "=m" (var->value),
- *    : "0" (val),
- *      "1" (var->value)
- *
- * The advantages are that gcc can use whatever addressing mode it
- * likes to access the memory value, and that we dont have to use a
- * way-too-generic "memory" clobber as there is now an explicit
- * declaration that var->value is modified.
- *
- * see also /usr/include/asm/atomic.h to convince yourself this is a
- * valid optimization.
- *
- * - walken
+ * The Read/Modify/Write operations on x86/x64 are all written using the
+ * "memory" constraint. This is to ensure the compiler treats the operation as
+ * a full barrier, flushing any pending/cached state currently residing in
+ * registers.
   */
  
  #if defined _MSC_VER && _MSC_VER < 1600 && defined __x86_64__
@@ -403,6 +405,7 @@ Atomic_ReadWrite8(Atomic_uint8 *var,  // IN/OUT:
        : "=q" (val),
          "+m" (var->value)
        : "0" (val)
+      : "memory"
     );
     return val;
  #elif defined _MSC_VER && _MSC_VER >= 1600
@@ -496,7 +499,7 @@ Atomic_ReadIfEqualWrite8(Atomic_uint8 *var,  // IN/OUT:
          "+m" (var->value)
        : "q" (newVal),
          "0" (oldVal)
-      : "cc"
+      : "cc", "memory"
     );
  
     return val;
@@ -1002,8 +1005,9 @@ Atomic_ReadWrite32(Atomic_uint32 *var, // IN/OUT
     __asm__ __volatile__(
        "xchgl %0, %1"
        : "=r" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "0" (val)
+      : "memory"
     );
     return val;
  #endif /* VM_X86_ANY */
@@ -1162,18 +1166,18 @@ Atomic_ReadIfEqualWrite32(Atomic_uint32 *var, // IN/OUT
     __asm__ __volatile__(
        "lock; cmpxchgl %2, %1"
        : "=a" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "r" (newVal),
-       "0" (oldVal)
-      : "cc"
+        "0" (oldVal)
+      : "cc", "memory"
     );
     return val;
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
  #if _MSC_VER >= 1310
     return _InterlockedCompareExchange((long *)&var->value,
-                                     (long)newVal,
-                                     (long)oldVal);
+                                      (long)newVal,
+                                      (long)oldVal);
  #else
  #pragma warning(push)
  #pragma warning(disable : 4035)         // disable no-return warning
@@ -1267,17 +1271,17 @@ Atomic_ReadIfEqualWrite64(Atomic_uint64 *var, // IN/OUT
     __asm__ __volatile__(
        "lock; cmpxchgq %2, %1"
        : "=a" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "r" (newVal),
-       "0" (oldVal)
-      : "cc"
+        "0" (oldVal)
+      : "cc", "memory"
     );
     return val;
  #endif //VM_ARM_V7
  #elif defined _MSC_VER
     return _InterlockedCompareExchange64((__int64 *)&var->value,
-                                       (__int64)newVal,
-                                       (__int64)oldVal);
+                                        (__int64)newVal,
+                                        (__int64)oldVal);
  #else
  #error No compiler defined for Atomic_ReadIfEqualWrite64
  #endif
@@ -1332,7 +1336,7 @@ Atomic_And32(Atomic_uint32 *var, // IN/OUT
        "lock; andl %1, %0"
        : "+m" (var->value)
        : "ri" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1397,7 +1401,7 @@ Atomic_Or32(Atomic_uint32 *var, // IN/OUT
        "lock; orl %1, %0"
        : "+m" (var->value)
        : "ri" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1462,7 +1466,7 @@ Atomic_Xor32(Atomic_uint32 *var, // IN/OUT
        "lock; xorl %1, %0"
        : "+m" (var->value)
        : "ri" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1510,7 +1514,7 @@ Atomic_Xor64(Atomic_uint64 *var, // IN/OUT
        "lock; xorq %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif
  #elif defined _MSC_VER
@@ -1569,7 +1573,7 @@ Atomic_Add32(Atomic_uint32 *var, // IN/OUT
        "lock; addl %1, %0"
        : "+m" (var->value)
        : "ri" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1634,7 +1638,7 @@ Atomic_Sub32(Atomic_uint32 *var, // IN/OUT
        "lock; subl %1, %0"
        : "+m" (var->value)
        : "ri" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1680,7 +1684,7 @@ Atomic_Inc32(Atomic_uint32 *var) // IN/OUT
        "lock; incl %0"
        : "+m" (var->value)
        :
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1725,7 +1729,7 @@ Atomic_Dec32(Atomic_uint32 *var) // IN/OUT
        "lock; decl %0"
        : "+m" (var->value)
        :
-      : "cc"
+      : "cc", "memory"
     );
  #endif /* VM_X86_ANY */
  #elif defined _MSC_VER
@@ -1939,9 +1943,9 @@ Atomic_ReadAdd32(Atomic_uint32 *var, // IN/OUT
     __asm__ __volatile__(
        "lock; xaddl %0, %1"
        : "=r" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "0" (val)
-      : "cc"
+      : "cc", "memory"
     );
     return val;
  #endif /* VM_X86_ANY */
@@ -2044,11 +2048,11 @@ Atomic_CMPXCHG64(Atomic_uint64 *var,   // IN/OUT
        "lock; cmpxchgq %3, %0" "\n\t"
        "sete %1"
        : "+m" (*var),
-       "=qm" (equal),
-       "=a" (dummy)
+        "=qm" (equal),
+        "=a" (dummy)
        : "r" (newVal),
          "2" (oldVal)
-      : "cc"
+      : "cc", "memory"
     );
  #else /* 32-bit version for non-ARM */
     typedef struct {
@@ -2088,9 +2092,9 @@ Atomic_CMPXCHG64(Atomic_uint64 *var,   // IN/OUT
        "lock; cmpxchg8b (%3)" "\n\t"
        "xchgl %%ebx, %6"      "\n\t"
        "sete %0"
-      :        "=qm" (equal),
-       "=a" (dummy1),
-       "=d" (dummy2)
+      : "=qm" (equal),
+        "=a" (dummy1),
+        "=d" (dummy2)
        : /*
           * See the "Rules for __asm__ statements in __PIC__ code" above: %3
           * must use a register class which does not contain %ebx.
@@ -2111,14 +2115,14 @@ Atomic_CMPXCHG64(Atomic_uint64 *var,   // IN/OUT
        "lock; cmpxchg8b %0" "\n\t"
        "sete %1"
        : "+m" (*var),
-       "=qm" (equal),
-       "=a" (dummy1),
-       "=d" (dummy2)
+        "=qm" (equal),
+        "=a" (dummy1),
+        "=d" (dummy2)
        : "2" (((S_uint64 *)&oldVal)->lowValue),
          "3" (((S_uint64 *)&oldVal)->highValue),
          "b" (((S_uint64 *)&newVal)->lowValue),
          "c" (((S_uint64 *)&newVal)->highValue)
-      : "cc"
+      : "cc", "memory"
     );
  #   endif
  #endif
@@ -2166,11 +2170,11 @@ Atomic_CMPXCHG32(Atomic_uint32 *var,   // IN/OUT
        "lock; cmpxchgl %3, %0" "\n\t"
        "sete %1"
        : "+m" (*var),
-       "=qm" (equal),
-       "=a" (dummy)
+        "=qm" (equal),
+        "=a" (dummy)
        : "r" (newVal),
          "2" (oldVal)
-      : "cc"
+      : "cc", "memory"
     );
     return equal;
  #endif /* VM_X86_ANY */
@@ -2247,7 +2251,7 @@ Atomic_Read64(Atomic_uint64 const *var) // IN
     return _InterlockedAdd64((__int64 *)&var->value, 0);
  #elif defined _MSC_VER && defined __i386__
  #   pragma warning(push)
-#   pragma warning(disable : 4035)             // disable no-return warning
+#   pragma warning(disable : 4035)      // disable no-return warning
     {
        __asm mov ecx, var
        __asm mov edx, ecx
@@ -2330,9 +2334,9 @@ Atomic_ReadAdd64(Atomic_uint64 *var, // IN/OUT
     __asm__ __volatile__(
        "lock; xaddq %0, %1"
        : "=r" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "0" (val)
-      : "cc"
+      : "cc", "memory"
     );
     return val;
  #elif defined _MSC_VER
@@ -2460,7 +2464,7 @@ Atomic_Add64(Atomic_uint64 *var, // IN/OUT
        "lock; addq %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif
  #elif defined _MSC_VER
@@ -2502,7 +2506,7 @@ Atomic_Sub64(Atomic_uint64 *var, // IN/OUT
        "lock; subq %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #endif
  #elif defined _MSC_VER
@@ -2542,7 +2546,7 @@ Atomic_Inc64(Atomic_uint64 *var) // IN/OUT
        "lock; incq %0"
        : "+m" (var->value)
        :
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined _MSC_VER
     _InterlockedIncrement64((__int64 *)&var->value);
@@ -2581,7 +2585,7 @@ Atomic_Dec64(Atomic_uint64 *var) // IN/OUT
        "lock; decq %0"
        : "+m" (var->value)
        :
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined _MSC_VER
     _InterlockedDecrement64((__int64 *)&var->value);
@@ -2617,8 +2621,9 @@ Atomic_ReadWrite64(Atomic_uint64 *var, // IN/OUT
     __asm__ __volatile__(
        "xchgq %0, %1"
        : "=r" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "0" (val)
+      : "memory"
     );
     return val;
  #elif defined _MSC_VER
@@ -2726,7 +2731,7 @@ Atomic_Or64(Atomic_uint64 *var, // IN/OUT
        "lock; orq %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined _MSC_VER
     _InterlockedOr64((__int64 *)&var->value, (__int64)val);
@@ -2773,7 +2778,7 @@ Atomic_And64(Atomic_uint64 *var, // IN/OUT
        "lock; andq %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined _MSC_VER
     _InterlockedAnd64((__int64 *)&var->value, (__int64)val);
@@ -2819,7 +2824,7 @@ Atomic_SetBit64(Atomic_uint64 *var, // IN/OUT
        "lock; btsq %1, %0"
        : "+m" (var->value)
        : "ri" ((uint64)bit)
-      : "cc"
+      : "cc", "memory"
     );
  #else
     uint64 oldVal;
@@ -2859,7 +2864,7 @@ Atomic_ClearBit64(Atomic_uint64 *var, // IN/OUT
        "lock; btrq %1, %0"
        : "+m" (var->value)
        : "ri" ((uint64)bit)
-      : "cc"
+      : "cc", "memory"
     );
  #else
     uint64 oldVal;
@@ -2938,7 +2943,7 @@ Atomic_TestSetBit64(Atomic_uint64 *var, // IN/OUT
        "lock; btsq %2, %1; setc %0"
        : "=rm" (out), "+m" (var->value)
        : "rJ" ((uint64)bit)
-      : "cc"
+      : "cc", "memory"
     );
     return out;
  #else
@@ -3032,8 +3037,9 @@ Atomic_ReadWrite16(Atomic_uint16 *var,  // IN/OUT:
     __asm__ __volatile__(
        "xchgw %0, %1"
        : "=r" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "0" (val)
+      : "memory"
     );
     return val;
  #elif defined VM_ARM_V7
@@ -3140,10 +3146,10 @@ Atomic_ReadIfEqualWrite16(Atomic_uint16 *var,   // IN/OUT
     __asm__ __volatile__(
        "lock; cmpxchgw %2, %1"
        : "=a" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "r" (newVal),
-       "0" (oldVal)
-      : "cc"
+        "0" (oldVal)
+      : "cc", "memory"
     );
     return val;
  #elif defined VM_ARM_V7
@@ -3204,7 +3210,7 @@ Atomic_And16(Atomic_uint16 *var, // IN/OUT
        "lock; andw %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_V7
     register volatile uint16 res;
@@ -3261,7 +3267,7 @@ Atomic_Or16(Atomic_uint16 *var, // IN/OUT
        "lock; orw %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_V7
     register volatile uint16 res;
@@ -3318,7 +3324,7 @@ Atomic_Xor16(Atomic_uint16 *var, // IN/OUT
        "lock; xorw %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_V7
     register volatile uint16 res;
@@ -3375,7 +3381,7 @@ Atomic_Add16(Atomic_uint16 *var, // IN/OUT
        "lock; addw %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_V7
     register volatile uint16 res;
@@ -3432,7 +3438,7 @@ Atomic_Sub16(Atomic_uint16 *var, // IN/OUT
        "lock; subw %1, %0"
        : "+m" (var->value)
        : "re" (val)
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_V7
     register volatile uint16 res;
@@ -3488,7 +3494,7 @@ Atomic_Inc16(Atomic_uint16 *var) // IN/OUT
        "lock; incw %0"
        : "+m" (var->value)
        :
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_ANY
     Atomic_Add16(var, 1);
@@ -3524,7 +3530,7 @@ Atomic_Dec16(Atomic_uint16 *var) // IN/OUT
        "lock; decw %0"
        : "+m" (var->value)
        :
-      : "cc"
+      : "cc", "memory"
     );
  #elif defined VM_ARM_ANY
     Atomic_Sub16(var, 1);
@@ -3594,9 +3600,9 @@ Atomic_ReadAdd16(Atomic_uint16 *var,  // IN/OUT
     __asm__ __volatile__(
        "lock; xaddw %0, %1"
        : "=r" (val),
-       "+m" (var->value)
+        "+m" (var->value)
        : "0" (val)
-      : "cc"
+      : "cc", "memory"
     );
     return val;
  #elif defined VM_ARM_V7
@@ -3727,7 +3733,7 @@ Atomic_ReadDec16(Atomic_uint16 *var) // IN/OUT
                              in val)                                           \
     {                                                                          \
        return (out)(cast)Atomic_ReadWrite ## size(var,                         \
-               (uint ## size)(cast)val);                                     \
+                (uint ## size)(cast)val);                                     \
     }                                                                          \
                                                                                \
                                                                                \
author	Oliver Kurth <okurth@vmware.com>
	Fri, 23 Mar 2018 22:05:35 +0000 (15:05 -0700)
committer	Oliver Kurth <okurth@vmware.com>
	Fri, 23 Mar 2018 22:05:35 +0000 (15:05 -0700)