From 0bae075928250ba036cb1d96485a6e72bdb6283c Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 2 Mar 2021 20:05:09 +0100
Subject: [PATCH] MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and
 CONFIG_HAP_GLOBAL_POOLS

We've reached a point where the global pools represent a significant
bottleneck with threads. On a 64-core machine, the performance was
divided by 8 between 32 and 64 H2 connections only because there were
not enough entries in the local caches to avoid picking from the global
pools, and the contention on the list there was very high. It becomes
obvious that we need to have an array of lists, but that will require
more changes.

In parallel, standard memory allocators have improved, with tcmalloc
and jemalloc finding their ways through mainstream systems, and glibc
having upgraded to a thread-aware ptmalloc variant, keeping this level
of contention here isn't justified anymore when we have both the local
per-thread pool caches and a fast process-wide allocator.

For these reasons, this patch introduces a new compile time setting
CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are
enabled with thread local pool caches, and we know we have a fast
thread-aware memory allocator (currently set for glibc>=2.26). In this
case we entirely bypass the global pool and directly use the standard
memory allocator when missing objects from the local pools. It is also
possible to force it at compile time when a good allocator is used with
another setup.

It is still possible to re-enable the global pools using
CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the
operating system's default allocator, or when building with a recent
libc but a different allocator which provides other benefits but does
not scale well with threads.
---
 include/haproxy/compat.h | 6 +++++
 include/haproxy/pool-t.h | 10 +++++++
 include/haproxy/pool.h | 28 +++++++++++++++----
 src/pool.c | 58 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 6 deletions(-)

diff --git a/include/haproxy/compat.h b/include/haproxy/compat.h
index 76cf350e44..0422b90760 100644
--- a/include/haproxy/compat.h
+++ b/include/haproxy/compat.h
@@ -249,6 +249,12 @@ typedef struct { } empty_t;
 #define HA_HAVE_MALLOC_TRIM
 #endif
 
+/* glibc 2.26 includes a thread-local cache which makes it fast enough in threads */
+#if (defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 26))
+#include <malloc.h>
+#define HA_HAVE_FAST_MALLOC
+#endif
+
 /* Max number of file descriptors we send in one sendmsg(). Linux seems to be
  * able to send 253 fds per sendmsg(), not sure about the other OSes.
  */
diff --git a/include/haproxy/pool-t.h b/include/haproxy/pool-t.h
index ac8f17e252..f1cba044eb 100644
--- a/include/haproxy/pool-t.h
+++ b/include/haproxy/pool-t.h
@@ -40,6 +40,16 @@
 #define CONFIG_HAP_LOCAL_POOLS
 #endif
 
+/* On modern architectures with many threads, a fast memory allocator, and
+ * local pools, the global pools with their single list can be way slower than
+ * the standard allocator which already has its own per-thread arenas. In this
+ * case we disable global pools. The global pools may still be enforced
+ * using CONFIG_HAP_GLOBAL_POOLS though.
+ */
+#if defined(USE_THREAD) && defined(HA_HAVE_FAST_MALLOC) && defined(CONFIG_HAP_LOCAL_POOLS) && !defined(CONFIG_HAP_GLOBAL_POOLS)
+#define CONFIG_HAP_NO_GLOBAL_POOLS
+#endif
+
 /* Pools of very similar size are shared by default, unless macro
  * DEBUG_DONT_SHARE_POOLS is set.
 */
diff --git a/include/haproxy/pool.h b/include/haproxy/pool.h
index c647bbdd28..8f31553152 100644
--- a/include/haproxy/pool.h
+++ b/include/haproxy/pool.h
@@ -150,7 +150,25 @@ static inline void pool_put_to_cache(struct pool_head *pool, void *ptr, ssize_t
 
 #endif // CONFIG_HAP_LOCAL_POOLS
 
-#ifdef CONFIG_HAP_LOCKLESS_POOLS
+#if defined(CONFIG_HAP_NO_GLOBAL_POOLS)
+
+/* this is essentially used with local caches and a fast malloc library,
+ * which may sometimes be faster than the local shared pools because it
+ * will maintain its own per-thread arenas.
+ */
+static inline void *__pool_get_first(struct pool_head *pool)
+{
+	return NULL;
+}
+
+static inline void __pool_free(struct pool_head *pool, void *ptr)
+{
+	_HA_ATOMIC_SUB(&pool->used, 1);
+	_HA_ATOMIC_SUB(&pool->allocated, 1);
+	pool_free_area(ptr, pool->size + POOL_EXTRA);
+}
+
+#elif defined(CONFIG_HAP_LOCKLESS_POOLS)
 
 /****************** Lockless pools implementation ******************/
 
@@ -274,11 +292,11 @@ static inline void *pool_get_first(struct pool_head *pool)
 		return p;
 #endif
 
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_LOCK(POOL_LOCK, &pool->lock);
 #endif
 	p = __pool_get_first(pool);
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_UNLOCK(POOL_LOCK, &pool->lock);
 #endif
 	return p;
@@ -298,12 +316,12 @@ static inline void *pool_alloc_dirty(struct pool_head *pool)
 		return p;
 #endif
 
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_LOCK(POOL_LOCK, &pool->lock);
 #endif
 	if ((p = __pool_get_first(pool)) == NULL)
 		p = __pool_refill_alloc(pool, 0);
-#ifndef CONFIG_HAP_LOCKLESS_POOLS
+#if !defined(CONFIG_HAP_LOCKLESS_POOLS) && !defined(CONFIG_HAP_NO_GLOBAL_POOLS)
 	HA_SPIN_UNLOCK(POOL_LOCK, &pool->lock);
 #endif
 	return p;
diff --git a/src/pool.c b/src/pool.c
index 73082e8cc1..03cb2ec5da
100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -170,7 +170,63 @@ void pool_evict_from_cache()
 }
 #endif
 
-#ifdef CONFIG_HAP_LOCKLESS_POOLS
+#if defined(CONFIG_HAP_NO_GLOBAL_POOLS)
+
+/* simply fall back on the default OS' allocator */
+
+void *__pool_refill_alloc(struct pool_head *pool, unsigned int avail)
+{
+	int allocated = pool->allocated;
+	int limit = pool->limit;
+	void *ptr = NULL;
+
+	if (limit && allocated >= limit) {
+		activity[tid].pool_fail++;
+		return NULL;
+	}
+
+	ptr = pool_alloc_area(pool->size + POOL_EXTRA);
+	if (!ptr) {
+		_HA_ATOMIC_ADD(&pool->failed, 1);
+		activity[tid].pool_fail++;
+		return NULL;
+	}
+
+	_HA_ATOMIC_ADD(&pool->allocated, 1);
+	_HA_ATOMIC_ADD(&pool->used, 1);
+
+#ifdef DEBUG_MEMORY_POOLS
+	/* keep track of where the element was allocated from */
+	*POOL_LINK(pool, ptr) = (void *)pool;
+#endif
+	return ptr;
+}
+
+/* legacy stuff */
+void *pool_refill_alloc(struct pool_head *pool, unsigned int avail)
+{
+	void *ptr;
+
+	ptr = __pool_refill_alloc(pool, avail);
+	return ptr;
+}
+
+/* legacy stuff */
+void pool_flush(struct pool_head *pool)
+{
+}
+
+/* This function might ask the malloc library to trim its buffers. */
+void pool_gc(struct pool_head *pool_ctx)
+{
+#if defined(HA_HAVE_MALLOC_TRIM)
+	malloc_trim(0);
+#endif
+}
+
+#elif defined(CONFIG_HAP_LOCKLESS_POOLS)
+
 /* Allocates new entries for pool <pool> until there are at least <avail> + 1
  * available, then returns the last one for immediate use, so that at least
  * <avail> are left available in the pool upon return. NULL is returned if the
-- 
2.39.5