From 40ac04d8f3df4ea7eff9b890055fa565b368f4e3 Mon Sep 17 00:00:00 2001
From: Oliver Kurth
Date: Fri, 15 Sep 2017 11:23:31 -0700
Subject: [PATCH] Implement ST_LD_MEM_BARRIER on x86 with a locked xor

Microbenchmarks on modern Intel architectures show that a memory barrier
implemented with a locked xor operation performs about 30% better than a
barrier implemented with mfence, while providing the same memory ordering
guarantees.

This patch changes the implementation of ST_LD_MEM_BARRIER on x86
architectures to use the faster locked xor operation. Additionally,
support for Microsoft's compiler is added.
---
 .../lib/include/vm_basic_asm_x86_common.h | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/open-vm-tools/lib/include/vm_basic_asm_x86_common.h b/open-vm-tools/lib/include/vm_basic_asm_x86_common.h
index 9b73f5217..bbcc35fce 100644
--- a/open-vm-tools/lib/include/vm_basic_asm_x86_common.h
+++ b/open-vm-tools/lib/include/vm_basic_asm_x86_common.h
@@ -1,5 +1,5 @@
 /*********************************************************
- * Copyright (C) 2013 VMware, Inc. All rights reserved.
+ * Copyright (C) 2013,2017 VMware, Inc. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published
@@ -103,6 +103,9 @@ void _mm_mfence(void);
 void _mm_lfence(void);
 #pragma intrinsic(_mm_mfence, _mm_lfence)
 
+long _InterlockedXor(long volatile *, long);
+#pragma intrinsic(_InterlockedXor)
+
 unsigned int __getcallerseflags(void);
 #pragma intrinsic(__getcallerseflags)
 
@@ -377,22 +380,43 @@ RDTSC_BARRIER(void)
  *
  * Thanks for pasting this whole comment into every architecture header.
  *
- * On x86, we only need to care specifically about store-load
- * reordering on normal memory types and mfence, otherwise only a compiler
- * barrier is needed.
+ * On x86, we only need to care specifically about store-load reordering on
+ * normal memory types. In other cases, only a compiler barrier is needed. The
+ * ST_LD barrier is implemented with a locked xor operation (instead of the
+ * mfence instruction) for performance reasons. See PR 1674199 for more
+ * details.
  *
  * On x64, special instructions are only provided for load-load (lfence) and
  * store-store (sfence) ordering, and they don't apply to normal memory.
  */
+
+static INLINE void
+ST_LD_MEM_BARRIER(void)
+{
+   volatile long temp;
+
+   COMPILER_MEM_BARRIER();
+#if defined __GNUC__
+   __asm__ __volatile__ (
+      "lock xorl $1, %0\n"
+      : "+m" (temp)
+      : /* no additional inputs */
+      : "cc");
+#elif defined _MSC_VER
+   _InterlockedXor(&temp, 1);
+#else
+#error ST_LD_MEM_BARRIER not defined for this compiler
+#endif
+   COMPILER_MEM_BARRIER();
+}
+
 #define LD_LD_MEM_BARRIER()     COMPILER_READ_BARRIER()
 #define LD_ST_MEM_BARRIER()     COMPILER_MEM_BARRIER()
 #define LD_LDST_MEM_BARRIER()   COMPILER_MEM_BARRIER()
-#define ST_LD_MEM_BARRIER()     __asm__ __volatile__("mfence" ::: "memory")
 #define ST_ST_MEM_BARRIER()     COMPILER_WRITE_BARRIER()
 #define ST_LDST_MEM_BARRIER()   ST_LD_MEM_BARRIER()
 #define LDST_LD_MEM_BARRIER()   ST_LD_MEM_BARRIER()
 #define LDST_ST_MEM_BARRIER()   COMPILER_MEM_BARRIER()
 #define LDST_LDST_MEM_BARRIER() ST_LD_MEM_BARRIER()
 
-
 #endif // _VM_BASIC_ASM_X86_COMMON_H_
-- 
2.47.3
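
Not part of the patch above: a minimal, self-contained sketch of the
store-load hazard the barrier addresses and of the locked-xor idiom it adopts.
Only the inline asm mirrors the GCC branch of the new ST_LD_MEM_BARRIER(); the
Dekker-style flag protocol, the store_load_barrier() helper name, and the
thread setup are illustrative assumptions, not code from open-vm-tools.
Assumes GCC or Clang on x86/x86-64, built with -pthread.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static volatile int flag[2];          /* "I want in" flag, one per thread */
static int winners;                   /* threads that entered the section */

/* StoreLoad barrier using the locked-xor idiom from the patch (GCC, x86).
 * The "memory" clobber doubles as the compiler barrier that the patch
 * expresses with COMPILER_MEM_BARRIER() around the asm. */
static void
store_load_barrier(void)
{
   volatile long temp = 0;

   __asm__ __volatile__("lock xorl $1, %0"
                        : "+m" (temp)
                        : /* no additional inputs */
                        : "cc", "memory");
}

static void *
worker(void *arg)
{
   int me = (int)(intptr_t)arg;
   int other = 1 - me;

   flag[me] = 1;                      /* store: announce intent */
   store_load_barrier();              /* without this, the load below may be
                                         satisfied before the store above is
                                         globally visible, and both threads
                                         can observe the other's flag as 0 */
   if (flag[other] == 0) {            /* load: did the other thread race us? */
      winners++;                      /* at most one thread can get here */
   }
   flag[me] = 0;
   return NULL;
}

int
main(void)
{
   pthread_t t[2];

   pthread_create(&t[0], NULL, worker, (void *)(intptr_t)0);
   pthread_create(&t[1], NULL, worker, (void *)(intptr_t)1);
   pthread_join(t[0], NULL);
   pthread_join(t[1], NULL);
   printf("threads in the critical section: %d (never 2)\n", winners);
   return 0;
}

Unlike the patch, the sketch folds the compiler barrier into the asm's
"memory" clobber instead of bracketing the asm with COMPILER_MEM_BARRIER();
the ordering guarantee is the same because a LOCK-prefixed read-modify-write
acts as a full barrier on x86, which is what makes the dummy xor a drop-in
replacement for mfence here.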