Architectures relying on CAS benefit from a read prior to FETCH_OR, so
it's not just x86 that benefits from this. Let's just change the condition
to only exclude __ARM_FEATURE_ATOMICS, which is the only case that is
faster without the preliminary read.
goto wait_for_flush;
__ha_cpu_relax_for_read();
-#if defined(__x86_64__)
- /* x86 prefers a read first */
- if ((tail_ofs = HA_ATOMIC_LOAD(tail_ptr)) & RING_TAIL_LOCK)
+#if !defined(__ARM_FEATURE_ATOMICS)
+ /* ARMv8.1-a has a true atomic OR and doesn't need the preliminary read */
+ if ((tail_ofs = HA_ATOMIC_LOAD(tail_ptr)) & RING_TAIL_LOCK) {
+ __ha_cpu_relax_for_read();
continue;
+ }
#endif
/* OK the queue is locked, let's attempt to get the tail lock */
tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);