/*********************************************************
- * Copyright (C) 1998-2021 VMware, Inc. All rights reserved.
+ * Copyright (C) 1998-2022 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
uint128 oldVal, // IN
uint128 newVal) // IN
{
-#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
+#if defined VM_ARM_64
+ /*
+ * Don't use __sync_val_compare_and_swap, as it cannot magically
+ * pick the right (LL/SC vs LSE) atomics without -moutline-atomics.
+ */
+#if __GNUC__ >= 9
+ if (Atomic_HaveLSE) {
+ SMP_RW_BARRIER_RW();
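+ /*
+ * casp compares and swaps the full 16-byte quantity using an even/odd
+ * register pair: %0/%H0 and %2/%H2 name the low/high halves of
+ * oldVal/newVal. The .arch directive lets the assembler accept LSE
+ * instructions even when the compile target is baseline armv8-a.
+ */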
+ __asm__ __volatile__(
+ ".arch armv8.2-a \n\t"
+ "casp %0, %H0, %2, %H2, %1 \n\t"
+ : "+r" (oldVal),
+ "+Q" (ptr->value)
+ : "r" (newVal)
+ );
+ SMP_RW_BARRIER_RW();
+ return oldVal;
+ } else
+#endif /* __GNUC__ >= 9 */
+ {
+ union {
+ uint128 raw;
+ struct {
+ uint64 lo;
+ uint64 hi;
+ };
+ } res, _old = { oldVal }, _new = { newVal };
+ uint32 failed;
+
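+ /*
+ * LL/SC fallback: retry the ldxp/stxp exclusive pair until the store
+ * succeeds, bailing out early if the 128-bit comparison fails.
+ */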
+ SMP_RW_BARRIER_RW();
+ __asm__ __volatile__(
+ "1: ldxp %x0, %x1, %3 \n\t"
+ " cmp %x0, %x4 \n\t"
+ " ccmp %x1, %x5, #0, eq \n\t"
+ " b.ne 2f \n\t"
+ " stxp %w2, %x6, %x7, %3 \n\t"
+ " cbnz %w2, 1b \n\t"
+ "2: \n\t"
+ : "=&r" (res.lo),
+ "=&r" (res.hi),
+ "=&r" (failed),
+ "+Q" (ptr->value)
+ : "r" (_old.lo),
+ "r" (_old.hi),
+ "r" (_new.lo),
+ "r" (_new.hi)
+ : "cc"
+ );
+ SMP_RW_BARRIER_RW();
+ return res.raw;
+ }
+#elif defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
return __sync_val_compare_and_swap(&ptr->value, oldVal, newVal);
-#elif defined VM_ARM_64
- union {
- uint128 raw;
- struct {
- uint64 lo;
- uint64 hi;
- };
- } res, _old = { oldVal }, _new = { newVal };
- uint32 failed;
-
- SMP_RW_BARRIER_RW();
- __asm__ __volatile__(
- "1: ldxp %x0, %x1, %3 \n\t"
- " cmp %x0, %x4 \n\t"
- " ccmp %x1, %x5, #0, eq \n\t"
- " b.ne 2f \n\t"
- " stxp %w2, %x6, %x7, %3 \n\t"
- " cbnz %w2, 1b \n\t"
- "2: \n\t"
- : "=&r" (res.lo),
- "=&r" (res.hi),
- "=&r" (failed),
- "+Q" (ptr->value)
- : "r" (_old.lo),
- "r" (_old.hi),
- "r" (_new.lo),
- "r" (_new.hi)
- : "cc"
- );
- SMP_RW_BARRIER_RW();
-
- return res.raw;
#endif
}
#endif
/*********************************************************
- * Copyright (C) 2017-2018 VMware, Inc. All rights reserved.
+ * Copyright (C) 2017-2018,2022 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
#include "vm_basic_asm_arm64.h"
+/*
+ * Today these are defines, but long-term these will be patchable globals
+ * for ESXi kernel-mode code (and something similar for ESXi userspace code).
+ *
+ * Atomic_HaveLSE should be set to 1 for CPUs that have the LSE extension
+ * and where the atomic instructions are known to have a performance benefit.
+ * On some low-end cores (e.g. Cortex-A55) there is seemingly no benefit.
+ *
+ * Not every operation can be performed with a single atomic instruction -
+ * LSE doesn't cover every logical/arithmetic operation. For example, there
+ * is an ldeor instruction, but no ldorr. Where there is no combined
+ * instruction that atomically performs the load/store and the ALU operation,
+ * we fall back to CAS or to LL/SC. On some microarchitectures - e.g.
+ * Neoverse N1 - CAS behaves better under heavy contention than LL/SC, but
+ * LL/SC remains the safest option. Atomic_PreferCasForOps selects between them.
+ */
+
+#ifdef VMK_ARM_LSE
+#define Atomic_HaveLSE 1
+#else
+#define Atomic_HaveLSE 0
+#endif
+#define Atomic_PreferCasForOps 1
+
+#define _VMATOM_LSE_HAVE(x) _VMATOM_LSE_HAVE_##x
+#define _VMATOM_LSE_HAVE_add 1
+#define _VMATOM_LSE_HAVE_sub 0
+#define _VMATOM_LSE_HAVE_eor 1
+#define _VMATOM_LSE_HAVE_orr 0
+#define _VMATOM_LSE_HAVE_and 0
+
+#define Atomic_PreferLSE(op) (Atomic_HaveLSE && \
+ (_VMATOM_LSE_HAVE(op) || Atomic_PreferCasForOps))
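+
+/*
+ * For example, Atomic_PreferLSE(add) is true whenever LSE is available,
+ * since a dedicated stadd/ldadd instruction exists, while
+ * Atomic_PreferLSE(orr) additionally requires Atomic_PreferCasForOps
+ * because there is no storr/ldorr and the CAS loop is used instead.
+ */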
+
/* bit size, instruction suffix, register prefix, extend suffix */
#define _VMATOM_SIZE_8 8, b, w, b
#define _VMATOM_SIZE_16 16, h, w, h
/* Read (not returned), op with modval, write. */
#define _VMATOM_SNIPPET_OP(bs, is, rp, es, fenced, atm, op, modval) ({ \
- uint32 _failed; \
- uint##bs _sample; \
+ uint##bs _newval; \
\
_VMATOM_FENCE(fenced); \
- __asm__ __volatile__( \
- "1: ldxr"#is" %"#rp"0, %2 \n\t"\
- " "#op" %"#rp"0, %"#rp"0, %"#rp"3 \n\t"\
- " stxr"#is" %w1 , %"#rp"0, %2 \n\t"\
- " cbnz %w1 , 1b \n\t"\
- : "=&r" (_sample), \
- "=&r" (_failed), \
- "+Q" (*atm) \
- : "r" (modval) \
- ); \
+ if (Atomic_PreferLSE(op)) { \
+ if (_VMATOM_LSE_HAVE(op)) { \
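+ /* Single LSE store-op instruction, e.g. stadd/steor. */ \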
+ __asm__ __volatile__( \
+ ".arch armv8.2-a \n\t"\
+ "st" #op #is" %"#rp"1, %0 \n\t"\
+ : "+Q" (*atm) \
+ : "r" (modval) \
+ ); \
+ } else { \
+ uint##bs _oldval; \
+ uint##bs _clobberedval; \
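+ /* CAS loop: compute old OP modval and cas it in; the uxt'ed compare \
+ * detects an intervening write, in which case we retry with the \
+ * freshly observed value. */ \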
+ __asm__ __volatile__( \
+ ".arch armv8.2-a \n\t"\
+ " ldr"#is" %"#rp"1, %3 \n\t"\
+ "1: mov %"#rp"0, %"#rp"1 \n\t"\
+ " "#op" %"#rp"2, %"#rp"0, %"#rp"4 \n\t"\
+ " cas"#is" %"#rp"1, %"#rp"2, %3 \n\t"\
+ " cmp %"#rp"0, %"#rp"1, uxt"#es" \n\t"\
+ " b.ne 1b \n\t"\
+ : "=&r" (_oldval), \
+ "=&r" (_clobberedval), \
+ "=&r" (_newval), \
+ "+Q" (*atm) \
+ : "r" (modval) \
+ : "cc" \
+ ); \
+ } \
+ } else { \
+ uint32 _failed; \
+ __asm__ __volatile__( \
+ "1: ldxr"#is" %"#rp"0, %2 \n\t"\
+ " "#op" %"#rp"0, %"#rp"0, %"#rp"3 \n\t"\
+ " stxr"#is" %w1 , %"#rp"0, %2 \n\t"\
+ " cbnz %w1 , 1b \n\t"\
+ : "=&r" (_newval), \
+ "=&r" (_failed), \
+ "+Q" (*atm) \
+ : "r" (modval) \
+ ); \
+ } \
_VMATOM_FENCE(fenced); \
})
/* Read (returned), op with modval, write. */
#define _VMATOM_SNIPPET_ROP(bs, is, rp, es, fenced, atm, op, modval) ({ \
- uint32 _failed; \
uint##bs _newval; \
uint##bs _oldval; \
\
_VMATOM_FENCE(fenced); \
- __asm__ __volatile__( \
- "1: ldxr"#is" %"#rp"0, %3 \n\t"\
- " "#op" %"#rp"1, %"#rp"0, %"#rp"4 \n\t"\
- " stxr"#is" %w2 , %"#rp"1, %3 \n\t"\
- " cbnz %w2 , 1b \n\t"\
- : "=&r" (_oldval), \
- "=&r" (_newval), \
- "=&r" (_failed), \
- "+Q" (*atm) \
- : "r" (modval) \
- ); \
+ if (Atomic_PreferLSE(op)) { \
+ if (_VMATOM_LSE_HAVE(op)) { \
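+ /* ld<op> (e.g. ldadd/ldeor) updates memory and returns the \
+ * previous value in _oldval. */ \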
+ __asm__ __volatile__( \
+ ".arch armv8.2-a \n\t"\
+ "ld" #op #is" %"#rp"2, %"#rp"0, %1 \n\t"\
+ : "=r" (_oldval), \
+ "+Q" (*atm) \
+ : "r" (modval) \
+ ); \
+ } else { \
+ uint##bs _clobberedval; \
+ __asm__ __volatile__( \
+ ".arch armv8.2-a \n\t"\
+ " ldr"#is" %"#rp"1, %3 \n\t"\
+ "1: mov %"#rp"0, %"#rp"1 \n\t"\
+ " "#op" %"#rp"2, %"#rp"0, %"#rp"4 \n\t"\
+ " cas"#is" %"#rp"1, %"#rp"2, %3 \n\t"\
+ " cmp %"#rp"0, %"#rp"1, uxt"#es" \n\t"\
+ " b.ne 1b \n\t"\
+ : "=&r" (_oldval), \
+ "=&r" (_clobberedval), \
+ "=&r" (_newval), \
+ "+Q" (*atm) \
+ : "r" (modval) \
+ : "cc" \
+ ); \
+ } \
+ } else { \
+ uint32 _failed; \
+ __asm__ __volatile__( \
+ "1: ldxr"#is" %"#rp"0, %3 \n\t"\
+ " "#op" %"#rp"1, %"#rp"0, %"#rp"4 \n\t"\
+ " stxr"#is" %w2 , %"#rp"1, %3 \n\t"\
+ " cbnz %w2 , 1b \n\t"\
+ : "=&r" (_oldval), \
+ "=&r" (_newval), \
+ "=&r" (_failed), \
+ "+Q" (*atm) \
+ : "r" (modval) \
+ ); \
+ } \
_VMATOM_FENCE(fenced); \
\
_oldval; \
/* Read (returned), write. */
#define _VMATOM_SNIPPET_RW(bs, is, rp, es, fenced, atm, val) ({ \
- uint32 _failed; \
uint##bs _oldval; \
\
_VMATOM_FENCE(fenced); \
- __asm__ __volatile__( \
- "1: ldxr"#is" %"#rp"0, %2 \n\t"\
- " stxr"#is" %w1 , %"#rp"3, %2 \n\t"\
- " cbnz %w1 , 1b \n\t"\
- : "=&r" (_oldval), \
- "=&r" (_failed), \
- "+Q" (*atm) \
- : "r" (val) \
- ); \
+ if (Atomic_HaveLSE) { \
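+ /* swp atomically exchanges val into memory, returning the old value. */ \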
+ __asm__ __volatile__( \
+ ".arch armv8.2-a \n\t"\
+ "swp"#is" %"#rp"2, %"#rp"0, %1 \n\t"\
+ : "=r" (_oldval), \
+ "+Q" (*atm) \
+ : "r" (val) \
+ ); \
+ } else { \
+ uint32 _failed; \
+ __asm__ __volatile__( \
+ "1: ldxr"#is" %"#rp"0, %2 \n\t"\
+ " stxr"#is" %w1 , %"#rp"3, %2 \n\t"\
+ " cbnz %w1 , 1b \n\t"\
+ : "=&r" (_oldval), \
+ "=&r" (_failed), \
+ "+Q" (*atm) \
+ : "r" (val) \
+ ); \
+ } \
_VMATOM_FENCE(fenced); \
\
_oldval; \
/* Read (returned), if equal to old then write new. */
#define _VMATOM_SNIPPET_RIFEQW(bs, is, rp, es, fenced, atm, old, new) ({ \
- uint32 _failed; \
uint##bs _oldval; \
\
_VMATOM_FENCE(fenced); \
- __asm__ __volatile__( \
- "1: ldxr"#is" %"#rp"0, %2 \n\t"\
- " cmp %"#rp"0, %"#rp"3, uxt"#es" \n\t"\
- " b.ne 2f \n\t"\
- " stxr"#is" %w1 , %"#rp"4, %2 \n\t"\
- " cbnz %w1 , 1b \n\t"\
- "2: \n\t"\
- : "=&r" (_oldval), \
- "=&r" (_failed), \
- "+Q" (*atm) \
- : "r" (old), \
- "r" (new) \
- : "cc" \
- ); \
+ if (Atomic_HaveLSE) { \
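+ /* cas compares memory against old (tied to %0), stores new on a \
+ * match, and always leaves the observed value in _oldval. */ \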
+ __asm__ __volatile__( \
+ ".arch armv8.2-a \n\t"\
+ "cas"#is" %"#rp"0, %"#rp"2, %1 \n\t"\
+ : "=r" (_oldval), \
+ "+Q" (*atm) \
+ : "r" (new), "0" (old) \
+ ); \
+ } else { \
+ uint32 _failed; \
+ __asm__ __volatile__( \
+ "1: ldxr"#is" %"#rp"0, %2 \n\t"\
+ " cmp %"#rp"0, %"#rp"3, uxt"#es" \n\t"\
+ " b.ne 2f \n\t"\
+ " stxr"#is" %w1 , %"#rp"4, %2 \n\t"\
+ " cbnz %w1 , 1b \n\t"\
+ "2: \n\t"\
+ : "=&r" (_oldval), \
+ "=&r" (_failed), \
+ "+Q" (*atm) \
+ : "r" (old), \
+ "r" (new) \
+ : "cc" \
+ ); \
+ } \
_VMATOM_FENCE(fenced); \
\
_oldval; \