MINOR: pool: make the thread-local hot cache size configurable

author Willy Tarreau <w@1wt.eu>

Mon, 19 Dec 2022 07:15:57 +0000 (08:15 +0100)

committer Willy Tarreau <w@1wt.eu>

Tue, 20 Dec 2022 13:51:12 +0000 (14:51 +0100)
author Willy Tarreau <w@1wt.eu>
Mon, 19 Dec 2022 07:15:57 +0000 (08:15 +0100)
committer Willy Tarreau <w@1wt.eu>
Tue, 20 Dec 2022 13:51:12 +0000 (14:51 +0100)
diff --git a/doc/configuration.txt b/doc/configuration.txt

index 0cc2bdee3b6084513807832b5a63fb5f608428a1..b66f75dd477e74419572a8058fba3272302604cc 100644 (file)
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -1126,6 +1126,7 @@ The following keywords are supported in the "global" section :
     - tune.maxaccept
     - tune.maxpollevents
     - tune.maxrewrite
+   - tune.memory.hot-size
     - tune.pattern.cache-size
     - tune.peers.max-updates-at-once
     - tune.pipesize
@@ -2983,6 +2984,22 @@ tune.maxrewrite <number>
    larger than that. This means you don't have to worry about it when changing
    bufsize.
  
+tune.memory.hot-size <number>
+  Sets the per-thread amount of memory that will be kept hot in the local cache
+  and will never be recoverable by other threads. Access to this memory is very
+  fast (lockless), and having enough is critical to maintain a good performance
+  level under extreme thread contention. The value is expressed in bytes, and
+  the default value is configured at build time via CONFIG_HAP_POOL_CACHE_SIZE
+  which defaults to 524288 (512 kB). A larger value may increase performance in
+  some usage scenarios, especially when performance profiles show that memory
+  allocation is stressed a lot. Experience shows that a good value sits between
+  one and two times the per-CPU-core L2 cache size. Too large values will have a
+  negative impact on performance by making inefficient use of the L3 caches in
+  the CPUs, and will consume larger amounts of memory. It is recommended not to
+  change this value, or to proceed in small increments. In order to completely
+  disable the per-thread memory caches, using a very small value could work, but
+  it is better to use "-dMno-cache" on the command-line.
+
  tune.pattern.cache-size <number>
    Sets the size of the pattern lookup cache to <number> entries. This is an LRU
    cache which reminds previous lookups and their results. It is used by ACLs
diff --git a/doc/internals/api/pools.txt b/doc/internals/api/pools.txt

index 4023dc316812316c96628ccf3e47e267546fcdfd..480cf24e512e3f642c939eb5a440d5e25db783b8 100644 (file)
--- a/doc/internals/api/pools.txt
+++ b/doc/internals/api/pools.txt
@@ -124,13 +124,17 @@ properly handle allocation failures. It may also be enabled at boot time using
  "-dMfail". In this case the desired average rate of allocation failures can be
  fixed by global setting "tune.fail-alloc" expressed in percent.
  
-The thread-local caches contain the freshest objects whose total size amounts
-to CONFIG_HAP_POOL_CACHE_SIZE bytes, which is typically was 1MB before 2.6 and
-is 512kB after. The aim is to keep hot objects that still fit in the CPU core's
-private L2 cache. Once these objects do not fit into the cache anymore, there's
-no benefit keeping them local to the thread, so they'd rather be returned to
-the shared pool or the main allocator so that any other thread may make use of
-them.
+The thread-local caches contain the freshest objects. Their total size amounts
+to the number of bytes set in global.tune.pool_cache_size, which may be adjusted
+by the "tune.memory.hot-size" global option, which itself defaults to build
+time setting CONFIG_HAP_POOL_CACHE_SIZE, which was 1MB before 2.6 and 512kB
+after. The aim is to keep hot objects that still fit in the CPU core's private
+L2 cache. Once these objects do not fit into the cache anymore, there's no
+benefit keeping them local to the thread, so they'd rather be returned to the
+shared pool or the main allocator so that any other thread may make use of
+them. Under extreme thread contention the cost of accessing shared structures
+in the global cache or in malloc() may still be important and it may prove
+useful to increase the thread-local cache size.
  
  
  3. Storage in thread-local caches
@@ -563,14 +567,15 @@ CONFIG_HAP_NO_GLOBAL_POOLS
          boot-time option "-dMno-global".
  
  CONFIG_HAP_POOL_CACHE_SIZE
-        This allows one to define the size of the per-thread cache, in bytes.
-        The default value is 512 kB (524288). Smaller values will use less
-        memory at the expense of a possibly higher CPU usage when using many
-        threads. Higher values will give diminishing returns on performance
-        while using much more memory. Usually there is no benefit in using
-        more than a per-core L2 cache size. It would be better not to set this
-        value lower than a few times the size of a buffer (bufsize, defaults to
-        16 kB).
+        This allows one to define the default size of the per-thread cache, in
+        bytes. The default value is 512 kB (524288). Smaller values will use
+        less memory at the expense of a possibly higher CPU usage when using
+        many threads. Higher values will give diminishing returns on
+        performance while using much more memory. Usually there is no benefit
+        in using more than a per-core L2 cache size. It would be better not to
+        set this value lower than a few times the size of a buffer (bufsize,
+        defaults to 16 kB). In addition, keep in mind that this default may be
+        overridden in the configuration using "tune.memory.hot-size".
  
  CONFIG_HAP_POOL_CLUSTER_SIZE
          This allows one to define the maximum number of objects that will be
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h

index 2e9b61b24e54302fcb2d251146707622343fee9d..11f4b2c0aabbeecaeb8e02e34e916dcf23fa74e7 100644 (file)
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -160,6 +160,7 @@ struct global {
                 int pool_high_ratio;  /* max ratio of FDs used before we start killing idle connections when creating new connections */
                 int pool_low_count;   /* max number of opened fd before we stop using new idle connections */
                 int pool_high_count;  /* max number of opened fd before we start killing idle connections when creating new connections */
+               size_t pool_cache_size;    /* per-thread cache size in bytes, all pools combined (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */
                 unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */
  #ifdef USE_QUIC
                 unsigned int quic_backend_max_idle_timeout;
diff --git a/src/haproxy.c b/src/haproxy.c

index 178f2748487c025f77e2839bdcc1d7594a25fd7e..68c78427d5a53a0b1f5fdf9fbc638d3947f37a0e 100644 (file)
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -2670,6 +2670,14 @@ static void init(int argc, char **argv)
  
         if (!hlua_post_init())
                 exit(1);
+
+       /* Set the per-thread pool cache size to the default value if not set.
+        * This is the right place to decide to automatically adjust it (e.g.
+        * check L2 cache size, thread counts or take into account certain
+        * expensive pools).
+        */
+       if (!global.tune.pool_cache_size)
+               global.tune.pool_cache_size = CONFIG_HAP_POOL_CACHE_SIZE;
  }
  
  void deinit(void)
diff --git a/src/pool.c b/src/pool.c

index e225d2144dc9d356732dbdb1d36ba6581543f0cc..df9d06090700a5ab26fec1cd517f84c5fb4262cc 100644 (file)
--- a/src/pool.c
+++ b/src/pool.c
@@ -517,7 +517,7 @@ void pool_evict_from_local_cache(struct pool_head *pool, int full)
         while ((ph->count && full) ||
                (ph->count >= CONFIG_HAP_POOL_CLUSTER_SIZE &&
                 ph->count >= 16 + pool_cache_count / 8 &&
-               pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+               pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
                 pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
         }
  }
@@ -546,7 +546,7 @@ void pool_evict_from_local_caches()
                 BUG_ON(pool != ph->pool);
  
                 pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
-       } while (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8);
+       } while (pool_cache_bytes > global.tune.pool_cache_size * 7 / 8);
  }
  
  /* Frees an object to the local cache, possibly pushing oldest objects to the
@@ -572,10 +572,10 @@ void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller)
         pool_cache_count++;
         pool_cache_bytes += pool->size;
  
-       if (unlikely(pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+       if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
                 if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
                         pool_evict_from_local_cache(pool, 0);
-               if (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE)
+               if (pool_cache_bytes > global.tune.pool_cache_size)
                         pool_evict_from_local_caches();
         }
  }
@@ -790,7 +790,8 @@ void __pool_free(struct pool_head *pool, void *ptr)
         }
  #endif
  
-       if (unlikely(pool_debugging & POOL_DBG_NO_CACHE)) {
+       if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
+                    global.tune.pool_cache_size < pool->size)) {
                 pool_free_nocache(pool, ptr);
                 return;
         }
@@ -1211,6 +1212,26 @@ static int mem_parse_global_fail_alloc(char **args, int section_type, struct pro
         return 0;
  }
  
+/* Config parser for the global "tune.memory.hot-size" keyword. At most one
+ * argument is accepted: the per-thread hot cache size in bytes, which must be
+ * a strictly positive decimal integer. On success the value is stored into
+ * global.tune.pool_cache_size and 0 is returned. Otherwise <err> is filled
+ * with an error message and -1 is returned.
+ */
+static int mem_parse_global_hot_size(char **args, int section_type, struct proxy *curpx,
+                                       const struct proxy *defpx, const char *file, int line,
+                                       char **err)
+{
+       char *end;
+       long size;
+
+       if (too_many_args(1, args, err, NULL))
+               return -1;
+
+       /* strtol() is used instead of atol() so that an empty value or
+        * trailing garbage (e.g. "512k") is rejected instead of being
+        * silently read as a different number.
+        */
+       size = strtol(args[1], &end, 10);
+       if (end == args[1] || *end != '\0') {
+               memprintf(err, "'%s' expects an integer number of bytes, got '%s'.", args[0], args[1]);
+               return -1;
+       }
+
+       if (size <= 0) {
+               memprintf(err, "'%s' expects a strictly positive value.", args[0]);
+               return -1;
+       }
+
+       global.tune.pool_cache_size = size;
+       return 0;
+}
+
  /* config parser for global "no-memory-trimming" */
  static int mem_parse_global_no_mem_trim(char **args, int section_type, struct proxy *curpx,
                                         const struct proxy *defpx, const char *file, int line,
@@ -1225,6 +1246,7 @@ static int mem_parse_global_no_mem_trim(char **args, int section_type, struct pr
  /* register global config keywords */
  static struct cfg_kw_list mem_cfg_kws = {ILH, {
         { CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc },
+       { CFG_GLOBAL, "tune.memory.hot-size", mem_parse_global_hot_size }, /* per-thread hot cache size */
         { CFG_GLOBAL, "no-memory-trimming", mem_parse_global_no_mem_trim },
         { 0, NULL, NULL }
  }};
author	Willy Tarreau <w@1wt.eu>
	Mon, 19 Dec 2022 07:15:57 +0000 (08:15 +0100)
committer	Willy Tarreau <w@1wt.eu>
	Tue, 20 Dec 2022 13:51:12 +0000 (14:51 +0100)
doc/configuration.txt		patch \| blob \| blame \| history
doc/internals/api/pools.txt		patch \| blob \| blame \| history
include/haproxy/global-t.h		patch \| blob \| blame \| history
src/haproxy.c		patch \| blob \| blame \| history
src/pool.c		patch \| blob \| blame \| history