MEDIUM: ring: improve speed in the queue waiting loop on x86_64
author Willy Tarreau <w@1wt.eu>
Sun, 17 Mar 2024 09:20:56 +0000 (10:20 +0100)
committer Willy Tarreau <w@1wt.eu>
Mon, 25 Mar 2024 17:34:19 +0000 (17:34 +0000)
x86_64 doesn't have a native atomic FETCH_OR(); it's implemented using
a CAS, which will always cause a write cycle. Here we know we can just
wait as long as the lock bit is held, so it's better to loop on a load and
only attempt the CAS once the load shows the lock bit is clear. This
requires a tiny ifdef and brings nice benefits: it brings the performance
back from 3.33M to 3.75M at 24C48T, while showing no change at 3C6T.
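
For readers less familiar with the pattern, below is a minimal sketch of the
same "load first, then FETCH_OR" idea written with plain C11 atomics. The
function name, the lock-bit constant and the relaxed load are illustrative
assumptions only; HAProxy itself uses its HA_ATOMIC_* macros and
RING_TAIL_LOCK as shown in the diff below.

    #include <stdatomic.h>
    #include <stdint.h>

    #define TAIL_LOCK_BIT ((uint64_t)1 << 63)  /* hypothetical lock bit */

    /* Spin until we manage to set the lock bit; return the previous value. */
    static uint64_t take_tail_lock(_Atomic uint64_t *tail_ptr)
    {
            uint64_t tail_ofs;

            for (;;) {
    #if defined(__x86_64__)
                    /* x86 prefers a read first: spin on a cheap load while
                     * the lock bit is held instead of hammering the word
                     * with locked read-modify-write cycles.
                     */
                    if (!(atomic_load_explicit(tail_ptr, memory_order_relaxed) & TAIL_LOCK_BIT))
    #endif
                    {
                            /* only now attempt the atomic OR (a CAS loop on x86_64) */
                            tail_ofs = atomic_fetch_or(tail_ptr, TAIL_LOCK_BIT);
                            if (!(tail_ofs & TAIL_LOCK_BIT))
                                    return tail_ofs; /* bit was clear, we now own it */
                    }
                    /* a cpu pause/relax hint would normally go here */
            }
    }

The point of the guarded form is that waiting threads keep re-reading the word,
which can stay in a shared cache state, and only issue the write-causing
FETCH_OR once the lock bit has been observed clear; this is what the hunk below
does with HA_ATOMIC_LOAD() ahead of HA_ATOMIC_FETCH_OR().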

src/ring.c

index 74772314a42ab9221327ae4ee4c1f71215328588..0393a269b34611c143662e06ec49af54c15a2d2d 100644 (file)
@@ -281,11 +281,15 @@ ssize_t ring_write(struct ring *ring, size_t maxlen, const struct ist pfx[], siz
                        if (next_cell != &cell)
                                goto wait_for_flush; // FIXME: another thread arrived, we should go to wait now
                        __ha_cpu_relax_for_read();
-
-                       tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);
-                       if (!(tail_ofs & RING_TAIL_LOCK))
-                               break;
-
+#if defined(__x86_64__)
+                       /* x86 prefers a read first */
+                       if (!(HA_ATOMIC_LOAD(tail_ptr) & RING_TAIL_LOCK))
+#endif
+                       {
+                               tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);
+                               if (!(tail_ofs & RING_TAIL_LOCK))
+                                       break;
+                       }
                        __ha_cpu_relax_for_read();
                }
                /* OK the queue is locked, let's attempt to get the tail lock */