From 0bae075928250ba036cb1d96485a6e72bdb6283c Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 2 Mar 2021 20:05:09 +0100
Subject: [PATCH] MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and
 CONFIG_HAP_GLOBAL_POOLS

We've reached a point where the global pools represent a significant
bottleneck with threads. On a 64-core machine, the performance was
divided by 8 between 32 and 64 H2 connections only because there were
not enough entries in the local caches to avoid picking from the global
pools, and the contention on the list there was very high. It becomes
obvious that we need to have an array of lists, but that will require
more changes.

In parallel, standard memory allocators have improved, with tcmalloc
and jemalloc finding their ways through mainstream systems, and glibc
having upgraded to a thread-aware ptmalloc variant, keeping this level
of contention here isn't justified anymore when we have both the local
per-thread pool caches and a fast process-wide allocator.

For these reasons, this patch introduces a new compile time setting
CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are
enabled with thread local pool caches, and we know we have a fast
thread-aware memory allocator (currently set for glibc>=2.26). In this
case we entirely bypass the global pool and directly use the standard
memory allocator when missing objects from the local pools. It is also
possible to force it at compile time when a good allocator is used with
another setup.

It is still possible to re-enable the global pools using
CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the
operating system's default allocator, or when building with a recent
libc but a different allocator which provides other benefits but does
not scale well with threads.
---
 include/haproxy/compat.h | 6 +++++
 include/haproxy/pool-t.h | 10 +++++++
 include/haproxy/pool.h | 28 +++++++++++++++----
 src/pool.c | 58 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 6 deletions(-)

diff --git a/include/haproxy/compat.h b/include/haproxy/compat.h
index 76cf350e44..0422b90760 100644
--- a/include/haproxy/compat.h
+++ b/include/haproxy/compat.h
@@ -249,6 +249,12 @@ typedef struct { } empty_t;
 #define HA_HAVE_MALLOC_TRIM
 #endif
 
+/* glibc 2.26 includes a thread-local cache which makes it fast enough in threads */
+#if (defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 26))
+#include <malloc.h>
+#define HA_HAVE_FAST_MALLOC
+#endif
+
 /* Max number of file descriptors we send in one sendmsg(). Linux seems to be
  * able to send 253 fds per sendmsg(), not sure about the other OSes.
  */
diff --git a/include/haproxy/pool-t.h b/include/haproxy/pool-t.h
index ac8f17e252..f1cba044eb 100644
--- a/include/haproxy/pool-t.h
+++ b/include/haproxy/pool-t.h
@@ -40,6 +40,16 @@
 #define CONFIG_HAP_LOCAL_POOLS
 #endif
 
+/* On modern architectures with many threads, a fast memory allocator, and
+ * local pools, the global pools with their single list can be way slower than
+ * the standard allocator which already has its own per-thread arenas. In this
+ * case we disable global pools. The global pools may still be enforced
+ * using CONFIG_HAP_GLOBAL_POOLS though.
+ */
+#if defined(USE_THREAD) && defined(HA_HAVE_FAST_MALLOC) && defined(CONFIG_HAP_LOCAL_POOLS) && !defined(CONFIG_HAP_GLOBAL_POOLS)
+#define CONFIG_HAP_NO_GLOBAL_POOLS
+#endif
+
 /* Pools of very similar size are shared by default, unless macro
  * DEBUG_DONT_SHARE_POOLS is set.
 */
diff --git a/include/haproxy/pool.h b/include/haproxy/pool.h
index c647bbdd28..8f31553152 100644
--- a/include/haproxy/pool.h
+++ b/include/haproxy/pool.h
@@ -150,7 +150,25 @@ static inline void pool_put_to_cache(struct pool_head *pool, void *ptr, ssize_t
 
 #endif // CONFIG_HAP_LOCAL_POOLS
 
-#ifdef CONFIG_HAP_LOCKLESS_POOLS
+#if defined(CONFIG_HAP_NO_GLOBAL_POOLS)
+
+/* this is essentially used with local caches and a fast malloc library,
+ * which may sometimes be faster than the local shared pools because it
+ * will maintain its own per-thread arenas.
+ */
+static inline void *__pool_get_first(struct pool_head *pool)
+{
+	return NULL;
+}
+
+static inline void __pool_free(struct pool_head *pool, void *ptr)
+{
+	_HA_ATOMIC_SUB(&pool->used, 1);
+	_HA_ATOMIC_SUB(&pool->allocated, 1);
+	pool_free_area(ptr, pool->size + POOL_EXTRA);
+}
+
+#elif defined(CONFIG_HAP_LOCKLESS_POOLS)
 
 /****************** Lockless pools implementation ******************/
 
@@ -274,11 +292,11 @@ static inline void *pool_get_first(struct pool_head *pool)
 		return p;
 #endif
 
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_LOCK(POOL_LOCK, &pool->lock);
 #endif
 	p = __pool_get_first(pool);
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_UNLOCK(POOL_LOCK, &pool->lock);
 #endif
 	return p;
@@ -298,12 +316,12 @@ static inline void *pool_alloc_dirty(struct pool_head *pool)
 		return p;
 #endif
 
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_LOCK(POOL_LOCK, &pool->lock);
 #endif
 	if ((p = __pool_get_first(pool)) == NULL)
 		p = __pool_refill_alloc(pool, 0);
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_UNLOCK(POOL_LOCK, &pool->lock);
 #endif
 	return p;
diff --git a/src/pool.c b/src/pool.c
index 73082e8cc1..03cb2ec5da
100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -170,7 +170,63 @@ void pool_evict_from_cache()
 }
 #endif
 
-#ifdef CONFIG_HAP_LOCKLESS_POOLS
+#if defined(CONFIG_HAP_NO_GLOBAL_POOLS)
+
+/* simply fall back on the default OS' allocator */
+
+void *__pool_refill_alloc(struct pool_head *pool, unsigned int avail)
+{
+	int allocated = pool->allocated;
+	int limit = pool->limit;
+	void *ptr = NULL;
+
+	if (limit && allocated >= limit) {
+		activity[tid].pool_fail++;
+		return NULL;
+	}
+
+	ptr = pool_alloc_area(pool->size + POOL_EXTRA);
+	if (!ptr) {
+		_HA_ATOMIC_ADD(&pool->failed, 1);
+		activity[tid].pool_fail++;
+		return NULL;
+	}
+
+	_HA_ATOMIC_ADD(&pool->allocated, 1);
+	_HA_ATOMIC_ADD(&pool->used, 1);
+
+#ifdef DEBUG_MEMORY_POOLS
+	/* keep track of where the element was allocated from */
+	*POOL_LINK(pool, ptr) = (void *)pool;
+#endif
+	return ptr;
+}
+
+/* legacy stuff */
+void *pool_refill_alloc(struct pool_head *pool, unsigned int avail)
+{
+	void *ptr;
+
+	ptr = __pool_refill_alloc(pool, avail);
+	return ptr;
+}
+
+/* legacy stuff */
+void pool_flush(struct pool_head *pool)
+{
+}
+
+/* This function might ask the malloc library to trim its buffers. */
+void pool_gc(struct pool_head *pool_ctx)
+{
+#if defined(HA_HAVE_MALLOC_TRIM)
+	malloc_trim(0);
+#endif
+}
+
+#elif defined(CONFIG_HAP_LOCKLESS_POOLS)
+
 /* Allocates new entries for pool <pool> until there are at least <avail> + 1
  * available, then returns the last one for immediate use, so that at least
  * <avail> are left available in the pool upon return. NULL is returned if the
-- 
2.39.5