From 388539faa35cd11019d81619cce86ac34a554d33 Mon Sep 17 00:00:00 2001
From: Olivier Houchard
Date: Fri, 2 May 2025 11:46:54 +0000
Subject: [PATCH] MEDIUM: stick-tables: defer adding updates to a tasklet

There is a lot of contention when trying to add updates to the tree. So
instead of adding the updates to the tree right away, just add them to
an mt-list (with one mt-list per thread group, so that the mt-list
itself does not become the new point of contention), and create a
tasklet dedicated to adding the updates to the tree, in batches, to
avoid holding the update lock for too long. This helps stick tables
perform better under heavy load.
---
 include/haproxy/defaults.h      |   4 +
 include/haproxy/stick_table-t.h |   4 +
 src/stick_table.c               | 166 ++++++++++++++++++--------------
 3 files changed, 104 insertions(+), 70 deletions(-)

diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index 462793eff2..c18b2f0793 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -654,4 +654,8 @@
 #define QUIC_MAX_TX_MEM 0
 #endif
 
+#ifndef STKTABLE_MAX_UPDATES_AT_ONCE
+#define STKTABLE_MAX_UPDATES_AT_ONCE 100
+#endif /* STKTABLE_MAX_UPDATES_AT_ONCE */
+
 #endif /* _HAPROXY_DEFAULTS_H */
diff --git a/include/haproxy/stick_table-t.h b/include/haproxy/stick_table-t.h
index 2340cdf59f..454a35c28c 100644
--- a/include/haproxy/stick_table-t.h
+++ b/include/haproxy/stick_table-t.h
@@ -151,6 +151,8 @@ struct stksess {
 	int seen;                 /* 0 only when no peer has seen this entry yet */
 	struct eb32_node exp;     /* ebtree node used to hold the session in expiration tree */
 	struct eb32_node upd;     /* ebtree node used to hold the update sequence tree */
+	struct mt_list pend_updts;/* list of entries to be inserted/moved in the update sequence tree */
+	int updt_is_local;        /* is the update a local one ? */
 	struct ebmb_node key;     /* ebtree node used to hold the session in table */
 	/* WARNING! do not put anything after <key>, it's used by the key */
 };
@@ -220,9 +222,11 @@ struct stktable {
 
 	THREAD_ALIGN(64);
 	struct eb_root updates;   /* head of sticky updates sequence tree, uses updt_lock */
+	struct mt_list *pend_updts; /* list of updates to be added to the update sequence tree, one per thread-group */
 	unsigned int update;      /* uses updt_lock */
 	unsigned int localupdate; /* uses updt_lock */
 	unsigned int commitupdate;/* used to identify the latest local updates pending for sync, uses updt_lock */
+	struct tasklet *updt_task;/* tasklet responsible for pushing the pending updates into the tree */
 
 	THREAD_ALIGN(64);
 	/* this lock is heavily used and must be on its own cache line */
diff --git a/src/stick_table.c b/src/stick_table.c
index 8b5192f9d7..3be38d8278 100644
--- a/src/stick_table.c
+++ b/src/stick_table.c
@@ -144,12 +144,13 @@ int __stksess_kill(struct stktable *t, struct stksess *ts)
 	if (HA_ATOMIC_LOAD(&ts->ref_cnt))
 		return 0;
 
-	if (ts->upd.node.leaf_p) {
+	if (ts->upd.node.leaf_p || !MT_LIST_ISEMPTY(&ts->pend_updts)) {
 		updt_locked = 1;
 		HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 		if (HA_ATOMIC_LOAD(&ts->ref_cnt))
 			goto out_unlock;
 	}
+	MT_LIST_DELETE(&ts->pend_updts);
 	eb32_delete(&ts->exp);
 	eb32_delete(&ts->upd);
 	ebmb_delete(&ts->key);
@@ -271,6 +272,7 @@ static struct stksess *__stksess_init(struct stktable *t, struct stksess * ts)
 	ts->key.node.leaf_p = NULL;
 	ts->exp.node.leaf_p = NULL;
 	ts->upd.node.leaf_p = NULL;
+	MT_LIST_INIT(&ts->pend_updts);
 	ts->expire = tick_add(now_ms, MS_TO_TICKS(t->expire));
 	HA_RWLOCK_INIT(&ts->lock);
 	return ts;
@@ -362,20 +364,19 @@ int stktable_trash_oldest(struct stktable *t, int to_batch)
 		 * with that lock held, will grab a ref_cnt before releasing the
 		 * lock. So we must take this lock as well and check the ref_cnt.
 		 */
-		if (ts->upd.node.leaf_p) {
-			if (!updt_locked) {
-				updt_locked = 1;
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-			}
-			/* now we're locked, new peers can't grab it anymore,
-			 * existing ones already have the ref_cnt.
-			 */
-			if (HA_ATOMIC_LOAD(&ts->ref_cnt))
-				continue;
+		if (!updt_locked) {
+			updt_locked = 1;
+			HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 		}
+		/* now we're locked, new peers can't grab it anymore,
+		 * existing ones already have the ref_cnt.
+		 */
+		if (HA_ATOMIC_LOAD(&ts->ref_cnt))
+			continue;
 
 		/* session expired, trash it */
 		ebmb_delete(&ts->key);
+		MT_LIST_DELETE(&ts->pend_updts);
 		eb32_delete(&ts->upd);
 		__stksess_free(t, ts);
 		batched++;
@@ -585,9 +586,7 @@ struct stksess *stktable_lookup(struct stktable *t, struct stksess *ts)
  */
 void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int local, int expire, int decrefcnt)
 {
-	struct eb32_node * eb;
-	int use_wrlock = 0;
-	int do_wakeup = 0;
+	int did_append = 0;
 
 	if (expire != HA_ATOMIC_LOAD(&ts->expire)) {
 		/* we'll need to set the expiration and to wake up the expiration timer .*/
@@ -602,63 +601,24 @@ void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int local,
 			 * scheduled for at least one peer.
 			 */
 			if (!ts->upd.node.leaf_p || _HA_ATOMIC_LOAD(&ts->seen)) {
-				/* Time to upgrade the read lock to write lock */
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-				use_wrlock = 1;
-
-				/* here we're write-locked */
-
-				ts->seen = 0;
-				ts->upd.key = ++t->update;
-				t->localupdate = t->update;
-				eb32_delete(&ts->upd);
-				eb = eb32_insert(&t->updates, &ts->upd);
-				if (eb != &ts->upd) {
-					eb32_delete(eb);
-					eb32_insert(&t->updates, &ts->upd);
-				}
+				_HA_ATOMIC_STORE(&ts->updt_is_local, 1);
+				did_append = MT_LIST_TRY_APPEND(&t->pend_updts[tgid - 1], &ts->pend_updts);
 			}
-			do_wakeup = 1;
 		}
 		else {
-			/* Note: we land here when learning new entries from
-			 * remote peers. We hold one ref_cnt so the entry
-			 * cannot vanish under us, however if two peers create
-			 * the same key at the exact same time, we must be
-			 * careful not to perform two parallel inserts! Hence
-			 * we need to first check leaf_p to know if the entry
-			 * is new, then lock the tree and check the entry again
-			 * (since another thread could have created it in the
-			 * mean time).
-			 */
 			if (!ts->upd.node.leaf_p) {
-				/* Time to upgrade the read lock to write lock if needed */
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-				use_wrlock = 1;
-
-				/* here we're write-locked */
-				if (!ts->upd.node.leaf_p) {
-					ts->seen = 0;
-					ts->upd.key= (++t->update)+(2147483648U);
-					eb = eb32_insert(&t->updates, &ts->upd);
-					if (eb != &ts->upd) {
-						eb32_delete(eb);
-						eb32_insert(&t->updates, &ts->upd);
-					}
-				}
+				_HA_ATOMIC_STORE(&ts->updt_is_local, 0);
+				did_append = MT_LIST_TRY_APPEND(&t->pend_updts[tgid - 1], &ts->pend_updts);
 			}
 		}
-
-		/* drop the lock now */
-		if (use_wrlock)
-			HA_RWLOCK_WRUNLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 	}
 
+	if (did_append)
+		tasklet_wakeup(t->updt_task);
+
 	if (decrefcnt)
 		HA_ATOMIC_DEC(&ts->ref_cnt);
-
-	if (do_wakeup)
-		task_wakeup(t->sync_task, TASK_WOKEN_MSG);
 }
 
 /* Update the expiration timer for <ts> but do not touch its expiration node.
@@ -809,6 +769,60 @@ struct stksess *stktable_get_entry(struct stktable *table, struct stktable_key *
 	return ts;
 }
 
+static struct task *stktable_add_pend_updates(struct task *t, void *ctx, unsigned int state)
+{
+	struct stktable *table = ctx;
+	struct eb32_node *eb;
+	int i, is_local, cur_tgid = tgid - 1, empty_tgid = 0;
+
+	HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &table->updt_lock);
+	for (i = 0; i < STKTABLE_MAX_UPDATES_AT_ONCE; i++) {
+		struct stksess *stksess = MT_LIST_POP(&table->pend_updts[cur_tgid], typeof(stksess), pend_updts);
+
+		if (!stksess) {
+			empty_tgid++;
+			cur_tgid++;
+			if (cur_tgid == global.nbtgroups)
+				cur_tgid = 0;
+
+			if (empty_tgid == global.nbtgroups)
+				break;
+			continue;
+		}
+		cur_tgid++;
+		empty_tgid = 0;
+		if (cur_tgid == global.nbtgroups)
+			cur_tgid = 0;
+		is_local = stksess->updt_is_local;
+		stksess->seen = 0;
+		if (is_local) {
+			stksess->upd.key = ++table->update;
+			table->localupdate = table->update;
+			eb32_delete(&stksess->upd);
+		} else {
+			stksess->upd.key = (++table->update) + (2147483648U);
+		}
+		eb = eb32_insert(&table->updates, &stksess->upd);
+		if (eb != &stksess->upd) {
+			BUG_ON(1);
+			eb32_delete(eb);
+			eb32_insert(&table->updates, &stksess->upd);
+		}
+	}
+
+	HA_RWLOCK_WRUNLOCK(STK_TABLE_UPDT_LOCK, &table->updt_lock);
+
+	/* There's more to do, let's schedule another session */
+	if (empty_tgid < global.nbtgroups)
+		tasklet_wakeup(table->updt_task);
+
+	if (i > 0) {
+		/* We did at least one update, let's wake the sync task */
+		task_wakeup(table->sync_task, TASK_WOKEN_MSG);
+	}
+	return t;
+}
+
 /* Lookup for an entry with the same key and store the submitted
  * stksess if not found. This function locks the table either shared or
  * exclusively, and the refcount of the entry is increased.
@@ -938,20 +952,19 @@ struct task *process_table_expire(struct task *task, void *context, unsigned int
 		 * with that lock held, will grab a ref_cnt before releasing the
 		 * lock. So we must take this lock as well and check the ref_cnt.
 		 */
-		if (ts->upd.node.leaf_p) {
-			if (!updt_locked) {
-				updt_locked = 1;
-				HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
-			}
-			/* now we're locked, new peers can't grab it anymore,
-			 * existing ones already have the ref_cnt.
-			 */
-			if (HA_ATOMIC_LOAD(&ts->ref_cnt))
-				continue;
+		if (!updt_locked) {
+			updt_locked = 1;
+			HA_RWLOCK_WRLOCK(STK_TABLE_UPDT_LOCK, &t->updt_lock);
 		}
+		/* now we're locked, new peers can't grab it anymore,
+		 * existing ones already have the ref_cnt.
+		 */
+		if (HA_ATOMIC_LOAD(&ts->ref_cnt))
+			continue;
 
 		/* session expired, trash it */
 		ebmb_delete(&ts->key);
+		MT_LIST_DELETE(&ts->pend_updts);
 		eb32_delete(&ts->upd);
 		__stksess_free(t, ts);
 	}
@@ -988,6 +1001,7 @@ int stktable_init(struct stktable *t, char **err_msg)
 {
 	int peers_retval = 0;
 	int shard;
+	int i;
 
 	t->hash_seed = XXH64(t->id, t->idlen, 0);
 
@@ -1047,6 +1061,16 @@ int stktable_init(struct stktable *t, char **err_msg)
 		t->write_to.t = table;
 	}
 
+	t->pend_updts = calloc(global.nbtgroups, sizeof(*t->pend_updts));
+	if (!t->pend_updts)
+		goto mem_error;
+	for (i = 0; i < global.nbtgroups; i++)
+		MT_LIST_INIT(&t->pend_updts[i]);
+	t->updt_task = tasklet_new();
+	if (!t->updt_task)
+		goto mem_error;
+	t->updt_task->context = t;
+	t->updt_task->process = stktable_add_pend_updates;
 	return 1;
 
 mem_error:
@@ -1065,6 +1089,8 @@ void stktable_deinit(struct stktable *t)
 	if (!t)
 		return;
 	task_destroy(t->exp_task);
+	tasklet_free(t->updt_task);
+	ha_free(&t->pend_updts);
 	pool_destroy(t->pool);
 }
 
-- 
2.39.5
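
Illustration (not part of the patch to apply): below is a minimal, self-contained sketch of the pattern this commit describes, using plain pthread mutexes and a toy linked list in place of HAProxy's lock-free MT_LIST and tasklet APIs; all names here (pending_list, queue_update, flush_pending, NB_GROUPS, MAX_BATCH) are illustrative only. Producers append updates to a per-thread-group pending list without touching the contended tree lock, and a single deferred consumer drains the lists round-robin in bounded batches while holding that lock, which is roughly what stktable_add_pend_updates() does with STKTABLE_MAX_UPDATES_AT_ONCE.

/* Standalone sketch, NOT HAProxy code: simplified stand-ins for the
 * per-thread-group mt-lists and the flushing tasklet added by the patch.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NB_GROUPS 4     /* stands in for global.nbtgroups */
#define MAX_BATCH 100   /* mirrors STKTABLE_MAX_UPDATES_AT_ONCE */

struct update {
	int key;
	struct update *next;
};

struct pending_list {
	pthread_mutex_t lk;   /* the real code uses a lock-free mt-list instead */
	struct update *head;
};

static struct pending_list pending[NB_GROUPS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; /* plays the role of updt_lock */

/* producer side: cheap per-group append, the contended tree_lock is not taken */
static void queue_update(int grp, struct update *u)
{
	pthread_mutex_lock(&pending[grp].lk);
	u->next = pending[grp].head;
	pending[grp].head = u;
	pthread_mutex_unlock(&pending[grp].lk);
	/* the real code would now tasklet_wakeup() the flushing tasklet */
}

/* consumer side: what the flushing tasklet does, bounded to MAX_BATCH entries
 * so tree_lock is never held for too long. Returns the number flushed; the
 * real tasklet reschedules itself when some lists are still non-empty.
 */
static int flush_pending(void)
{
	int done = 0, grp = 0, empty = 0;

	pthread_mutex_lock(&tree_lock);
	while (done < MAX_BATCH && empty < NB_GROUPS) {
		struct update *u;

		pthread_mutex_lock(&pending[grp].lk);
		u = pending[grp].head;
		if (u)
			pending[grp].head = u->next;
		pthread_mutex_unlock(&pending[grp].lk);

		if (!u) {
			empty++;          /* nothing in this group, try the next one */
		} else {
			empty = 0;
			/* here the real code inserts the entry into the update tree */
			printf("inserting key %d into the tree\n", u->key);
			free(u);
			done++;
		}
		grp = (grp + 1) % NB_GROUPS;
	}
	pthread_mutex_unlock(&tree_lock);
	return done;              /* >0 would also wake the peers sync task */
}

int main(void)
{
	for (int g = 0; g < NB_GROUPS; g++)
		pthread_mutex_init(&pending[g].lk, NULL);

	for (int i = 0; i < 10; i++) {
		struct update *u = malloc(sizeof(*u));
		u->key = i;
		queue_update(i % NB_GROUPS, u);
	}
	printf("flushed %d updates\n", flush_pending());
	return 0;
}

Bounding each batch keeps the flusher from monopolizing the update lock under load; in the patch the tasklet simply wakes itself up again whenever at least one per-group list still has entries left.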