From: Willy Tarreau
Date: Thu, 11 Sep 2025 12:52:00 +0000 (+0200)
Subject: MINOR: activity: collect CPU time spent on memory allocations for each task
X-Git-Tag: v3.3-dev9~126
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9d8c2a888bab14102c2b78d8e0b6f54c9877ae9b;p=thirdparty%2Fhaproxy.git

MINOR: activity: collect CPU time spent on memory allocations for each task

When task profiling is enabled, the pool alloc/free code will measure the
time it takes to perform a memory allocation after a cache miss, or to
release memory to the shared cache or to the OS. The time spent in the
thread-local cache is never measured, as measuring it would be very
expensive compared to the pool access itself. Measuring the slow paths
costs around 2% of performance at 2M req/s, and only when task profiling
is enabled, so this remains reasonable.

The scheduler takes care of collecting that time and of updating the
sched_activity entry corresponding to the current task when task
profiling is enabled. The goal is clearly to track places that waste CPU
time by allocating and releasing too often, or by causing large
evictions. This appears as follows in "show profiling tasks aggr":

  Tasks activity over 11.428 sec till 0.000 sec ago:
    function                      calls   cpu_tot   cpu_avg   lkw_avg   lkd_avg   mem_avg   lat_avg
    process_stream             44183891    16.47m   22.36us   491.0ns   1.154us   1.000ns   101.1us
    h1_io_cb                   57386064    4.011m   4.193us   20.00ns   16.00ns         -   29.47us
    sc_conn_io_cb              42088024    49.04s   1.165us         -         -         -   54.67us
    h1_timeout_task              438171   196.5ms   448.0ns         -         -         -   100.1us
    srv_cleanup_toremove_conns       65   1.468ms   22.58us   184.0ns   87.00ns         -   101.3us
    task_process_applet               3   508.0us   169.3us         -   107.0us   1.847us   29.67us
    srv_cleanup_idle_conns            6   225.3us   37.55us   15.74us   36.84us         -   49.47us
    accept_queue_process              2   45.62us   22.81us         -         -   4.949us   54.33us
---

diff --git a/include/haproxy/tinfo-t.h b/include/haproxy/tinfo-t.h
index 7d550e7e6..8968cfa09 100644
--- a/include/haproxy/tinfo-t.h
+++ b/include/haproxy/tinfo-t.h
@@ -161,6 +161,7 @@ struct thread_ctx {
 	uint32_t sched_wake_date;   /* current task/tasklet's wake date in 32-bit ns or 0 if not supported */
 	uint64_t sched_call_date;   /* current task/tasklet's call date in ns */
 	uint64_t lock_wait_total;   /* total time in ns spent waiting for a lock (task prof) */
+	uint64_t mem_wait_total;    /* total time in ns spent allocating/freeing memory (task prof) */
 	uint64_t lock_start_date;   /* date when first locked was obtained (task prof) */
 	uint64_t locked_total;      /* total time in ns spent with at least one lock held (task prof) */
diff --git a/src/pool.c b/src/pool.c
index d7928c37c..d824b2b93 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -787,10 +788,18 @@ void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller)
 	pool_cache_bytes += pool->size;
 
 	if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
+		uint64_t mem_wait_start = 0;
+
+		if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+			mem_wait_start = now_mono_time();
+
 		if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
 			pool_evict_from_local_cache(pool, 0);
 		if (pool_cache_bytes > global.tune.pool_cache_size)
 			pool_evict_from_local_caches();
+
+		if (unlikely(mem_wait_start))
+			th_ctx->mem_wait_total += now_mono_time() - mem_wait_start;
 	}
 }
 
@@ -941,8 +950,12 @@ void pool_flush(struct pool_head *pool)
 void pool_gc(struct pool_head *pool_ctx)
 {
 	struct pool_head *entry;
+	uint64_t mem_wait_start = 0;
 	int isolated = thread_isolated();
 
+	if (th_ctx->flags & TH_FL_TASK_PROFILING)
+		mem_wait_start = now_mono_time();
+
 	if (!isolated)
 		thread_isolate();
 
@@ -975,6 +988,9 @@ void pool_gc(struct pool_head *pool_ctx)
 
 	if (!isolated)
 		thread_release();
+
+	if (mem_wait_start)
+		th_ctx->mem_wait_total += now_mono_time() - mem_wait_start;
 }
 
 /*
@@ -995,9 +1011,19 @@ void *__pool_alloc(struct pool_head *pool, unsigned int flags)
 	if (likely(!(pool_debugging & POOL_DBG_NO_CACHE)) && !p)
 		p = pool_get_from_cache(pool, caller);
 
-	if (unlikely(!p))
+	if (unlikely(!p)) {
+		/* count allocation time only for cache misses */
+		uint64_t mem_wait_start = 0;
+
+		if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+			mem_wait_start = now_mono_time();
+
 		p = pool_alloc_nocache(pool, caller);
 
+		if (unlikely(mem_wait_start))
+			th_ctx->mem_wait_total += now_mono_time() - mem_wait_start;
+	}
+
 	if (likely(p)) {
 #ifdef USE_MEMORY_PROFILING
 		if (unlikely(profiling & HA_PROF_MEMORY)) {
@@ -1061,11 +1087,20 @@ void __pool_free(struct pool_head *pool, void *ptr)
 	if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
 		     (pool->flags & MEM_F_UAF) ||
 		     global.tune.pool_cache_size < pool->size)) {
+		uint64_t mem_wait_start = 0;
+
+		if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+			mem_wait_start = now_mono_time();
+
 		pool_free_nocache(pool, ptr);
-		return;
-	}
 
-	pool_put_to_cache(pool, ptr, caller);
+		if (unlikely(mem_wait_start))
+			th_ctx->mem_wait_total += now_mono_time() - mem_wait_start;
+	}
+	else {
+		/* this one will count its own time itself */
+		pool_put_to_cache(pool, ptr, caller);
+	}
 }
 
 /*
diff --git a/src/task.c b/src/task.c
index 0de5a588d..4f7ef2bb2 100644
--- a/src/task.c
+++ b/src/task.c
@@ -570,6 +570,7 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 
 		t->calls++;
 		th_ctx->lock_wait_total = 0;
+		th_ctx->mem_wait_total = 0;
 		th_ctx->locked_total = 0;
 		th_ctx->sched_wake_date = t->wake_date;
 		if (th_ctx->sched_wake_date || (t->state & TASK_F_WANTS_TIME)) {
@@ -684,6 +685,8 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 			HA_ATOMIC_ADD(&profile_entry->cpu_time, (uint32_t)(now_mono_time() - th_ctx->sched_call_date));
 			if (th_ctx->lock_wait_total)
 				HA_ATOMIC_ADD(&profile_entry->lkw_time, th_ctx->lock_wait_total);
+			if (th_ctx->mem_wait_total)
+				HA_ATOMIC_ADD(&profile_entry->mem_time, th_ctx->mem_wait_total);
 			if (th_ctx->locked_total)
 				HA_ATOMIC_ADD(&profile_entry->lkd_time, th_ctx->locked_total);
 		}
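
For readers who want the pattern in isolation, the following stand-alone
sketch mirrors what each slow-path site in the patch does: take a monotonic
timestamp only when profiling is enabled, do the expensive work, then add
the elapsed nanoseconds to a per-thread counter that the scheduler later
folds into the current task's profiling entry. The names mono_time_ns(),
slow_path_alloc() and timed_alloc(), and the plain boolean flag, are
illustrative stand-ins for HAProxy's now_mono_time(), pool_alloc_nocache()
and the TH_FL_TASK_PROFILING check on th_ctx->flags; this is not part of
the patch itself.

    /* stand-alone sketch of the conditional timing pattern (not HAProxy code) */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    static _Thread_local uint64_t mem_wait_total; /* mirrors th_ctx->mem_wait_total */
    static int task_profiling_enabled = 1;        /* stand-in for TH_FL_TASK_PROFILING */

    /* stand-in for now_mono_time(): monotonic clock in nanoseconds */
    static uint64_t mono_time_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    /* stand-in for the slow path (pool_alloc_nocache() in the patch) */
    static void *slow_path_alloc(size_t size)
    {
        return malloc(size);
    }

    static void *timed_alloc(size_t size)
    {
        uint64_t mem_wait_start = 0;
        void *p;

        /* only pay for the timestamp when profiling is enabled */
        if (task_profiling_enabled)
            mem_wait_start = mono_time_ns();

        p = slow_path_alloc(size);

        /* accumulate the time spent in the slow path */
        if (mem_wait_start)
            mem_wait_total += mono_time_ns() - mem_wait_start;
        return p;
    }

    int main(void)
    {
        free(timed_alloc(4096));
        printf("ns spent in slow-path allocation: %llu\n",
               (unsigned long long)mem_wait_total);
        return 0;
    }

Taking the timestamp only when the profiling flag is set keeps the fast
path free of clock reads, which is why the roughly 2% overhead mentioned
in the commit message only appears when task profiling is enabled.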