- tune.maxaccept
- tune.maxpollevents
- tune.maxrewrite
+ - tune.memory.hot-size
- tune.pattern.cache-size
- tune.peers.max-updates-at-once
- tune.pipesize
larger than that. This means you don't have to worry about it when changing
bufsize.
+tune.memory.hot-size <number>
+ Sets the per-thread amount of memory that will be kept hot in the local cache
+ and will never be recoverable by other threads. Access to this memory is very
+ fast (lockless), and having enough is critical to maintain a good performance
+ level under extreme thread contention. The value is expressed in bytes, and
+ the default value is configured at build time via CONFIG_HAP_POOL_CACHE_SIZE,
+ which defaults to 524288 (512 kB). A larger value may increase performance in
+ some usage scenarios, especially when performance profiles show that memory
+ allocation is stressed a lot. Experience shows that a good value sits between
+ one and two times the per-core L2 cache size. Values that are too large will
+ have a negative impact on performance by making inefficient use of the L3
+ caches in the CPUs, and will consume larger amounts of memory. It is
+ recommended not to change this value, or to proceed in small increments. To
+ completely disable the per-thread pool caches, a very small value could work,
+ but it is better to use "-dMno-cache" on the command line.
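+
+ Example: (hypothetical value, assuming CPU cores with 1MB of private L2
+          cache each, so as to keep roughly one L2's worth of objects hot
+          per thread)
+     global
+         tune.memory.hot-size 1048576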
+
tune.pattern.cache-size <number>
Sets the size of the pattern lookup cache to <number> entries. This is an LRU
cache which remembers previous lookups and their results. It is used by ACLs
"-dMfail". In this case the desired average rate of allocation failures can be
fixed by global setting "tune.fail-alloc" expressed in percent.
-The thread-local caches contain the freshest objects whose total size amounts
-to CONFIG_HAP_POOL_CACHE_SIZE bytes, which is typically was 1MB before 2.6 and
-is 512kB after. The aim is to keep hot objects that still fit in the CPU core's
-private L2 cache. Once these objects do not fit into the cache anymore, there's
-no benefit keeping them local to the thread, so they'd rather be returned to
-the shared pool or the main allocator so that any other thread may make use of
-them.
+The thread-local caches contain the freshest objects. Their total size amounts
+to the number of bytes set in global.tune.pool_cache_size, which may be
+adjusted with the "tune.memory.hot-size" global option and which defaults to
+the build-time setting CONFIG_HAP_POOL_CACHE_SIZE (1MB before 2.6, 512kB
+since). The aim is to keep hot objects that still fit in the CPU core's
+private L2 cache. Once these objects no longer fit into the cache, there is no
+benefit in keeping them local to the thread, so they'd rather be returned to
+the shared pool or the main allocator so that any other thread may make use of
+them. Under extreme thread contention, the cost of accessing shared structures
+in the global cache or in malloc() may still be significant, and it may prove
+useful to increase the thread-local cache size.
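+
+As a rough sketch (a deliberately simplified model, not haproxy's actual
+implementation, which uses per-pool LRU lists, watermarks at 3/4 and 7/8 of
+the budget, and batched eviction), the principle can be pictured like this,
+with release_to_shared() standing for the shared pool or malloc() path:
+
+    #include <stdlib.h>
+
+    /* one freed object kept hot in the local cache */
+    struct cached_obj {
+        struct cached_obj *next;
+        size_t size;
+    };
+
+    static __thread struct cached_obj *cache_head; /* most recently freed */
+    static __thread size_t cache_bytes;            /* bytes currently hot */
+    static size_t hot_size = 524288;               /* cf tune.memory.hot-size */
+
+    /* stands for a release to the shared pool or the main allocator */
+    static void release_to_shared(struct cached_obj *obj)
+    {
+        free(obj);
+    }
+
+    /* free to the local cache, then return the coldest (least recently
+     * freed) entries to the shared pool once the budget is exceeded
+     */
+    static void cache_free(struct cached_obj *obj)
+    {
+        obj->next = cache_head;
+        cache_head = obj;
+        cache_bytes += obj->size;
+
+        while (cache_bytes > hot_size && cache_head) {
+            struct cached_obj **prev = &cache_head;
+            struct cached_obj *cold;
+
+            /* walk to the last (coldest) entry and detach it */
+            while ((*prev)->next)
+                prev = &(*prev)->next;
+            cold = *prev;
+            *prev = NULL;
+            cache_bytes -= cold->size;
+            release_to_shared(cold);
+        }
+    }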
3. Storage in thread-local caches
boot-time option "-dMno-global".
CONFIG_HAP_POOL_CACHE_SIZE
- This allows one to define the size of the per-thread cache, in bytes.
- The default value is 512 kB (524288). Smaller values will use less
- memory at the expense of a possibly higher CPU usage when using many
- threads. Higher values will give diminishing returns on performance
- while using much more memory. Usually there is no benefit in using
- more than a per-core L2 cache size. It would be better not to set this
- value lower than a few times the size of a buffer (bufsize, defaults to
- 16 kB).
+ This allows one to define the default size of the per-thread cache, in
+ bytes. The default value is 512 kB (524288). Smaller values will use
+ less memory at the expense of a possibly higher CPU usage when using
+ many threads. Higher values will give diminishing returns on
+ performance while using much more memory. Usually there is no benefit
+ in using more than a per-core L2 cache size. It would be better not to
+ set this value lower than a few times the size of a buffer (bufsize,
+ defaults to 16 kB). In addition, keep in mind that this default may be
+ changed without rebuilding, using the global "tune.memory.hot-size"
+ directive.
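+
+ For example, assuming the standard build procedure, the default could be
+ doubled at build time with something like:
+
+     $ make TARGET=linux-glibc DEFINE=-DCONFIG_HAP_POOL_CACHE_SIZE=1048576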
CONFIG_HAP_POOL_CLUSTER_SIZE
This allows one to define the maximum number of objects that will be
int pool_high_ratio; /* max ratio of FDs used before we start killing idle connections when creating new connections */
int pool_low_count; /* max number of opened fd before we stop using new idle connections */
int pool_high_count; /* max number of opened fd before we start killing idle connections when creating new connections */
+ size_t pool_cache_size; /* per-thread cache size per pool (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */
unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */
#ifdef USE_QUIC
unsigned int quic_backend_max_idle_timeout;
if (!hlua_post_init())
exit(1);
+
+ /* Set the per-thread pool cache size to the default value if not set.
+ * This is the right place to decide to automatically adjust it (e.g.
+ * check L2 cache size, thread counts or take into account certain
+ * expensive pools).
+ */
+ if (!global.tune.pool_cache_size)
+ global.tune.pool_cache_size = CONFIG_HAP_POOL_CACHE_SIZE;
}
void deinit(void)
while ((ph->count && full) ||
(ph->count >= CONFIG_HAP_POOL_CLUSTER_SIZE &&
ph->count >= 16 + pool_cache_count / 8 &&
- pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+ pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
}
}
BUG_ON(pool != ph->pool);
pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
- } while (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8);
+ } while (pool_cache_bytes > global.tune.pool_cache_size * 7 / 8);
}
/* Frees an object to the local cache, possibly pushing oldest objects to the
pool_cache_count++;
pool_cache_bytes += pool->size;
- if (unlikely(pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+ if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
pool_evict_from_local_cache(pool, 0);
- if (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE)
+ if (pool_cache_bytes > global.tune.pool_cache_size)
pool_evict_from_local_caches();
}
}
}
#endif
- if (unlikely(pool_debugging & POOL_DBG_NO_CACHE)) {
+ if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
+ global.tune.pool_cache_size < pool->size)) {
pool_free_nocache(pool, ptr);
return;
}
return 0;
}
+/* config parser for global "tune.memory.hot-size" */
+static int mem_parse_global_hot_size(char **args, int section_type, struct proxy *curpx,
+ const struct proxy *defpx, const char *file, int line,
+ char **err)
+{
+ long size;
+
+ if (too_many_args(1, args, err, NULL))
+ return -1;
+
+ size = atol(args[1]);
+ if (size <= 0) {
+ memprintf(err, "'%s' expects a strictly positive value.", args[0]);
+ return -1;
+ }
+
+ global.tune.pool_cache_size = size;
+ return 0;
+}
+
/* config parser for global "no-memory-trimming" */
static int mem_parse_global_no_mem_trim(char **args, int section_type, struct proxy *curpx,
const struct proxy *defpx, const char *file, int line,
/* register global config keywords */
static struct cfg_kw_list mem_cfg_kws = {ILH, {
{ CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc },
+ { CFG_GLOBAL, "tune.memory.hot-size", mem_parse_global_hot_size },
{ CFG_GLOBAL, "no-memory-trimming", mem_parse_global_no_mem_trim },
{ 0, NULL, NULL }
}};