From 388539faa35cd11019d81619cce86ac34a554d33 Mon Sep 17 00:00:00 2001
From: Olivier Houchard
Date: Fri, 2 May 2025 11:46:54 +0000
Subject: [PATCH] MEDIUM: stick-tables: defer adding updates to a tasklet

There is a lot of contention when trying to add updates to the tree. So
instead of adding the updates to the tree right away, just add them to
an mt-list (with one mt-list per thread group, so that the mt-list
itself does not become the new point of contention), and create a
tasklet dedicated to adding the updates to the tree, in batches, to
avoid holding the update lock for too long. This helps stick tables
perform better under heavy load.
---
 include/haproxy/defaults.h      |   4 +
 include/haproxy/stick_table-t.h |   4 +
 src/stick_table.c               | 166 ++++++++++++++++++--------------
 3 files changed, 104 insertions(+), 70 deletions(-)

diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index 462793eff2..c18b2f0793 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -654,4 +654,8 @@
 #define QUIC_MAX_TX_MEM 0
 #endif
 
+#ifndef STKTABLE_MAX_UPDATES_AT_ONCE
+#define STKTABLE_MAX_UPDATES_AT_ONCE 100
+#endif /* STKTABLE_MAX_UPDATES_AT_ONCE */
+
 #endif /* _HAPROXY_DEFAULTS_H */
diff --git a/include/haproxy/stick_table-t.h b/include/haproxy/stick_table-t.h
index 2340cdf59f..454a35c28c 100644
--- a/include/haproxy/stick_table-t.h
+++ b/include/haproxy/stick_table-t.h
@@ -151,6 +151,8 @@ struct stksess {
 	int seen;                 /* 0 only when no peer has seen this entry yet */
 	struct eb32_node exp;     /* ebtree node used to hold the session in expiration tree */
 	struct eb32_node upd;     /* ebtree node used to hold the update sequence tree */
+	struct mt_list pend_updts;/* list of entries to be inserted/moved in the update sequence tree */
+	int updt_is_local;        /* is the update a local one ? */
 	struct ebmb_node key;     /* ebtree node used to hold the session in table */
 	/* WARNING! do not put anything after <key>, it's used by the key */
 };
@@ -220,9 +222,11 @@ struct stktable {
 
 	THREAD_ALIGN(64);
 	struct eb_root updates;   /* head of sticky updates sequence tree, uses updt_lock */
+	struct mt_list *pend_updts; /* list of updates to be added to the update sequence tree, one per thread-group */
 	unsigned int update;      /* uses updt_lock */
 	unsigned int localupdate; /* uses updt_lock */
 	unsigned int commitupdate;/* used to identify the latest local updates pending for sync, uses updt_lock */
+	struct tasklet *updt_task;/* tasklet responsible for pushing the pending updates into the tree */
 
 	THREAD_ALIGN(64);
 	/* this lock is heavily used and must be on its own cache line */
diff --git a/src/stick_table.c b/src/stick_table.c
index 8b5192f9d7..3be38d8278 100644
--- a/src/stick_table.c
+++ b/src/stick_table.c
@@ -144,12 +144,13 @@ int __stksess_kill(struct stktable *t, struct stksess *ts)
 	if (HA_ATOMIC_LOAD(&ts->ref_cnt))
 		return 0;
 
-	if (ts->upd.node.leaf_p) {
+	if (ts->upd.node.leaf_p || !MT_LIST_ISEMPTY(&ts->pend_updts)) {
 		updt_locked = 1;
 		HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 		if (HA_ATOMIC_LOAD(&ts->ref_cnt))
 			goto out_unlock;
 	}
+	MT_LIST_DELETE(&ts->pend_updts);
 	eb32_delete(&ts->exp);
 	eb32_delete(&ts->upd);
 	ebmb_delete(&ts->key);
@@ -271,6 +272,7 @@ static struct stksess *__stksess_init(struct stktable *t, struct stksess * ts)
 	ts->key.node.leaf_p = NULL;
 	ts->exp.node.leaf_p = NULL;
 	ts->upd.node.leaf_p = NULL;
+	MT_LIST_INIT(&ts->pend_updts);
 	ts->expire = tick_add(now_ms, MS_TO_TICKS(t->expire));
 	HA_RWLOCK_INIT(&ts->lock);
 	return ts;
@@ -362,20 +364,19 @@ int stktable_trash_oldest(struct stktable *t, int to_batch)
 		 * with that lock held, will grab a ref_cnt before releasing the
 		 * lock. So we must take this lock as well and check the ref_cnt.
 		 */
-		if (ts->upd.node.leaf_p) {
-			if (!updt_locked) {
-				updt_locked = 1;
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-			}
-			/* now we're locked, new peers can't grab it anymore,
-			 * existing ones already have the ref_cnt.
-			 */
-			if (HA_ATOMIC_LOAD(&ts->ref_cnt))
-				continue;
+		if (!updt_locked) {
+			updt_locked = 1;
+			HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 		}
+		/* now we're locked, new peers can't grab it anymore,
+		 * existing ones already have the ref_cnt.
+		 */
+		if (HA_ATOMIC_LOAD(&ts->ref_cnt))
+			continue;
 
 		/* session expired, trash it */
 		ebmb_delete(&ts->key);
+		MT_LIST_DELETE(&ts->pend_updts);
 		eb32_delete(&ts->upd);
 		__stksess_free(t, ts);
 		batched++;
@@ -585,9 +586,7 @@ struct stksess *stktable_lookup(struct stktable *t, struct stksess *ts)
  */
 void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int local, int expire, int decrefcnt)
 {
-	struct eb32_node * eb;
-	int use_wrlock = 0;
-	int do_wakeup = 0;
+	int did_append = 0;
 
 	if (expire != HA_ATOMIC_LOAD(&ts->expire)) {
 		/* we'll need to set the expiration and to wake up the expiration timer .*/
@@ -602,63 +601,24 @@ void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int local,
 			 * scheduled for at least one peer.
 			 */
 			if (!ts->upd.node.leaf_p || _HA_ATOMIC_LOAD(&ts->seen)) {
-				/* Time to upgrade the read lock to write lock */
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-				use_wrlock = 1;
-
-				/* here we're write-locked */
-
-				ts->seen = 0;
-				ts->upd.key = ++t->update;
-				t->localupdate = t->update;
-				eb32_delete(&ts->upd);
-				eb = eb32_insert(&t->updates, &ts->upd);
-				if (eb != &ts->upd) {
-					eb32_delete(eb);
-					eb32_insert(&t->updates, &ts->upd);
-				}
+				_HA_ATOMIC_STORE(&ts->updt_is_local, 1);
+				did_append = MT_LIST_TRY_APPEND(&t->pend_updts[tgid - 1], &ts->pend_updts);
 			}
-			do_wakeup = 1;
 		}
 		else {
-			/* Note: we land here when learning new entries from
-			 * remote peers. We hold one ref_cnt so the entry
-			 * cannot vanish under us, however if two peers create
-			 * the same key at the exact same time, we must be
-			 * careful not to perform two parallel inserts! Hence
-			 * we need to first check leaf_p to know if the entry
-			 * is new, then lock the tree and check the entry again
-			 * (since another thread could have created it in the
-			 * mean time).
-			 */
 			if (!ts->upd.node.leaf_p) {
-				/* Time to upgrade the read lock to write lock if needed */
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-				use_wrlock = 1;
-
-				/* here we're write-locked */
-				if (!ts->upd.node.leaf_p) {
-					ts->seen = 0;
-					ts->upd.key= (++t->update)+(2147483648U);
-					eb = eb32_insert(&t->updates, &ts->upd);
-					if (eb != &ts->upd) {
-						eb32_delete(eb);
-						eb32_insert(&t->updates, &ts->upd);
-					}
-				}
+				_HA_ATOMIC_STORE(&ts->updt_is_local, 0);
+				did_append = MT_LIST_TRY_APPEND(&t->pend_updts[tgid - 1], &ts->pend_updts);
 			}
 		}
-
-		/* drop the lock now */
-		if (use_wrlock)
-			HA_RWLOCK_WRUNLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 	}
 
+	if (did_append)
+		tasklet_wakeup(t->updt_task);
+
 	if (decrefcnt)
 		HA_ATOMIC_DEC(&ts->ref_cnt);
-
-	if (do_wakeup)
-		task_wakeup(t->sync_task, TASK_WOKEN_MSG);
 }
 
 /* Update the expiration timer for <ts> but do not touch its expiration node.
@@ -809,6 +769,60 @@ struct stksess *stktable_get_entry(struct stktable *table, struct stktable_key *
 	return ts;
 }
 
+static struct task *stktable_add_pend_updates(struct task *t, void *ctx, unsigned int state)
+{
+	struct stktable *table = ctx;
+	struct eb32_node *eb;
+	int i, is_local, cur_tgid = tgid - 1, empty_tgid = 0;
+
+	HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &table->updt_lock);
+	for (i = 0; i < STKTABLE_MAX_UPDATES_AT_ONCE; i++) {
+		struct stksess *stksess = MT_LIST_POP(&table->pend_updts[cur_tgid], typeof(stksess), pend_updts);
+
+		if (!stksess) {
+			empty_tgid++;
+			cur_tgid++;
+			if (cur_tgid == global.nbtgroups)
+				cur_tgid = 0;
+
+			if (empty_tgid == global.nbtgroups)
+				break;
+			continue;
+		}
+		cur_tgid++;
+		empty_tgid = 0;
+		if (cur_tgid == global.nbtgroups)
+			cur_tgid = 0;
+		is_local = stksess->updt_is_local;
+		stksess->seen = 0;
+		if (is_local) {
+			stksess->upd.key = ++table->update;
+			table->localupdate = table->update;
+			eb32_delete(&stksess->upd);
+		} else {
+			stksess->upd.key = (++table->update) + (2147483648U);
+		}
+		eb = eb32_insert(&table->updates, &stksess->upd);
+		if (eb != &stksess->upd) {
+			BUG_ON(1);
+			eb32_delete(eb);
+			eb32_insert(&table->updates, &stksess->upd);
+		}
+	}
+
+	HA_RWLOCK_WRUNLOCK(STK_TABLE_UPDT_LOCK, &table->updt_lock);
+
+	/* There's more to do, let's schedule another session */
+	if (empty_tgid < global.nbtgroups)
+		tasklet_wakeup(table->updt_task);
+
+	if (i > 0) {
+		/* We did at least one update, let's wake the sync task */
+		task_wakeup(table->sync_task, TASK_WOKEN_MSG);
+	}
+	return t;
+}
+
 /* Lookup for an entry with the same key and store the submitted
  * stksess if not found. This function locks the table either shared or
  * exclusively, and the refcount of the entry is increased.
@@ -938,20 +952,19 @@ struct task *process_table_expire(struct task *task, void *context, unsigned int
 		 * with that lock held, will grab a ref_cnt before releasing the
 		 * lock. So we must take this lock as well and check the ref_cnt.
 		 */
-		if (ts->upd.node.leaf_p) {
-			if (!updt_locked) {
-				updt_locked = 1;
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-			}
-			/* now we're locked, new peers can't grab it anymore,
-			 * existing ones already have the ref_cnt.
-			 */
-			if (HA_ATOMIC_LOAD(&ts->ref_cnt))
-				continue;
+		if (!updt_locked) {
+			updt_locked = 1;
+			HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 		}
+		/* now we're locked, new peers can't grab it anymore,
+		 * existing ones already have the ref_cnt.
+		 */
+		if (HA_ATOMIC_LOAD(&ts->ref_cnt))
+			continue;
 
 		/* session expired, trash it */
 		ebmb_delete(&ts->key);
+		MT_LIST_DELETE(&ts->pend_updts);
 		eb32_delete(&ts->upd);
 		__stksess_free(t, ts);
 	}
@@ -988,6 +1001,7 @@ int stktable_init(struct stktable *t, char **err_msg)
 {
 	int peers_retval = 0;
 	int shard;
+	int i;
 
 	t->hash_seed = XXH64(t->id, t->idlen, 0);
 
@@ -1047,6 +1061,16 @@ int stktable_init(struct stktable *t, char **err_msg)
 		t->write_to.t = table;
 	}
 
+	t->pend_updts = calloc(global.nbtgroups, sizeof(*t->pend_updts));
+	if (!t->pend_updts)
+		goto mem_error;
+	for (i = 0; i < global.nbtgroups; i++)
+		MT_LIST_INIT(&t->pend_updts[i]);
+	t->updt_task = tasklet_new();
+	if (!t->updt_task)
+		goto mem_error;
+	t->updt_task->context = t;
+	t->updt_task->process = stktable_add_pend_updates;
 	return 1;
 
 mem_error:
@@ -1065,6 +1089,8 @@ void stktable_deinit(struct stktable *t)
 	if (!t)
 		return;
 	task_destroy(t->exp_task);
+	tasklet_free(t->updt_task);
+	ha_free(&t->pend_updts);
 	pool_destroy(t->pool);
 }
 
-- 
2.39.5
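
Illustration (not part of the patch to apply): below is a minimal, self-contained sketch of the pattern this commit describes, using plain pthread mutexes and a toy linked list in place of HAProxy's lock-free MT_LIST and tasklet APIs; all names here (pending_list, queue_update, flush_pending, NB_GROUPS, MAX_BATCH) are illustrative only. Producers append updates to a per-thread-group pending list without touching the contended tree lock, and a single deferred consumer drains the lists round-robin in bounded batches while holding that lock, which is roughly what stktable_add_pend_updates() does with STKTABLE_MAX_UPDATES_AT_ONCE.

/* Standalone sketch, NOT HAProxy code: simplified stand-ins for the
 * per-thread-group mt-lists and the flushing tasklet added by the patch.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NB_GROUPS 4     /* stands in for global.nbtgroups */
#define MAX_BATCH 100   /* mirrors STKTABLE_MAX_UPDATES_AT_ONCE */

struct update {
	int key;
	struct update *next;
};

struct pending_list {
	pthread_mutex_t lk;   /* the real code uses a lock-free mt-list instead */
	struct update *head;
};

static struct pending_list pending[NB_GROUPS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; /* plays the role of updt_lock */

/* producer side: cheap per-group append, the contended tree_lock is not taken */
static void queue_update(int grp, struct update *u)
{
	pthread_mutex_lock(&pending[grp].lk);
	u->next = pending[grp].head;
	pending[grp].head = u;
	pthread_mutex_unlock(&pending[grp].lk);
	/* the real code would now tasklet_wakeup() the flushing tasklet */
}

/* consumer side: what the flushing tasklet does, bounded to MAX_BATCH entries
 * so tree_lock is never held for too long. Returns the number flushed; the
 * real tasklet reschedules itself when some lists are still non-empty.
 */
static int flush_pending(void)
{
	int done = 0, grp = 0, empty = 0;

	pthread_mutex_lock(&tree_lock);
	while (done < MAX_BATCH && empty < NB_GROUPS) {
		struct update *u;

		pthread_mutex_lock(&pending[grp].lk);
		u = pending[grp].head;
		if (u)
			pending[grp].head = u->next;
		pthread_mutex_unlock(&pending[grp].lk);

		if (!u) {
			empty++;          /* nothing in this group, try the next one */
		} else {
			empty = 0;
			/* here the real code inserts the entry into the update tree */
			printf("inserting key %d into the tree\n", u->key);
			free(u);
			done++;
		}
		grp = (grp + 1) % NB_GROUPS;
	}
	pthread_mutex_unlock(&tree_lock);
	return done;              /* >0 would also wake the peers sync task */
}

int main(void)
{
	for (int g = 0; g < NB_GROUPS; g++)
		pthread_mutex_init(&pending[g].lk, NULL);

	for (int i = 0; i < 10; i++) {
		struct update *u = malloc(sizeof(*u));
		u->key = i;
		queue_update(i % NB_GROUPS, u);
	}
	printf("flushed %d updates\n", flush_pending());
	return 0;
}

Bounding each batch keeps the flusher from monopolizing the update lock under load; in the patch the tasklet simply wakes itself up again whenever at least one per-group list still has entries left.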