x86_64 doesn't have a native atomic FETCH_OR(); it's implemented using
a CAS, which always causes a write cycle even when the lock is already
held. Here we know we can simply wait as long as the lock bit is held,
so it's better to spin on a plain load and only attempt the CAS once
the load shows the lock bit is clear. This requires a tiny ifdef and
brings nice benefits: it takes performance back from 3.33M to 3.75M
at 24C48T, while changing nothing at 3C6T.
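
To see why the FETCH_OR writes unconditionally, here is roughly how it
has to be emulated on x86_64 when the previous value is needed. This is
an illustrative sketch using the GCC/Clang __atomic builtins, not
HAProxy code; the name fetch_or_via_cas() is made up:

    #include <stdint.h>

    /* sketch: emulating fetch_or(p, bits) with a CAS, the way compilers
     * lower it on x86_64 when the old value is needed
     */
    static uint32_t fetch_or_via_cas(uint32_t *p, uint32_t bits)
    {
            uint32_t old = __atomic_load_n(p, __ATOMIC_RELAXED);

            /* CMPXCHG loop: even a failing attempt requests the cache
             * line for ownership, hence the unconditional write cycle
             */
            while (!__atomic_compare_exchange_n(p, &old, old | bits, 0,
                                                __ATOMIC_ACQUIRE,
                                                __ATOMIC_RELAXED))
                    ;
            return old;
    }
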
if (next_cell != &cell)
goto wait_for_flush; // FIXME: another thread arrived, we should go to wait now
__ha_cpu_relax_for_read();
-
- tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);
- if (!(tail_ofs & RING_TAIL_LOCK))
- break;
-
+#if defined(__x86_64__)
+		/* x86 prefers a read first: FETCH_OR is a CAS underneath
+		 * and always performs a write, even when the lock is held */
+ if (!(HA_ATOMIC_LOAD(tail_ptr) & RING_TAIL_LOCK))
+#endif
+ {
+ tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);
+ if (!(tail_ofs & RING_TAIL_LOCK))
+ break;
+ }
__ha_cpu_relax_for_read();
}
/* OK the queue is locked, let's attempt to get the tail lock */
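
The pattern generalizes to any lock word (it's the classic
test-and-test-and-set). Below is a minimal self-contained sketch with
made-up names: LOCK_BIT stands in for RING_TAIL_LOCK, cpu_relax() for
__ha_cpu_relax_for_read(), and lock_acquire() is not an HAProxy
function:

    #include <stdint.h>

    #define LOCK_BIT 0x1U   /* hypothetical, plays the role of RING_TAIL_LOCK */

    /* pause hint while spinning */
    static inline void cpu_relax(void)
    {
    #if defined(__x86_64__)
            __asm__ volatile("pause");
    #endif
    }

    /* spin until LOCK_BIT is acquired; returns the previous word value */
    static uint32_t lock_acquire(uint32_t *word)
    {
            uint32_t old;

            while (1) {
    #if defined(__x86_64__)
                    /* read-only wait while the lock is visibly held */
                    if (__atomic_load_n(word, __ATOMIC_RELAXED) & LOCK_BIT) {
                            cpu_relax();
                            continue;
                    }
    #endif
                    old = __atomic_fetch_or(word, LOCK_BIT, __ATOMIC_ACQUIRE);
                    if (!(old & LOCK_BIT))
                            return old;     /* lock grabbed */
                    cpu_relax();
            }
    }

While waiting on the plain load, the cache line can stay in shared
state in every spinner's cache; a failed CAS would instead claim it
exclusively on each attempt and bounce it between cores. With the load
first, only a thread that actually sees the bit clear pays for a write.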