From: Olivier Houchard
Date: Wed, 26 Mar 2025 16:16:48 +0000 (+0000)
Subject: MAJOR: leastconn: Revamp the way servers are ordered.
X-Git-Tag: v3.2-dev9~25
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9fe72bba3;p=thirdparty%2Fhaproxy.git

MAJOR: leastconn: Revamp the way servers are ordered.

For leastconn, servers used to be stored in an ebtree, with each server
being one node. Change that so that tree nodes contain multiple mt_lists.
Each list contains servers that share the same key (typically meaning
they have the same number of connections). Using mt_lists means that, as
long as the needed tree elements already exist, moving a server from one
tree element to another no longer requires the lbprm write lock.

We use multiple mt_lists to reduce contention when moving a server from
one tree element to another; the list in the new element is chosen
randomly.

We no longer remove a tree element as soon as it no longer contains any
server. Instead, we keep a list of all elements, and when we need a new
element, we reuse one from that list only if it already holds enough
entries (at least FWLC_MIN_FREE_ENTRIES); otherwise we allocate a new
one. Keeping nodes in the tree ensures that we very rarely have to take
the lbprm write lock (it is only needed when we move a server to a
position for which no element is currently in the tree).

The number of mt_lists used is defined as FWLC_LISTS_NB, and the number
of tree elements we want to keep is defined as FWLC_MIN_FREE_ENTRIES,
both in defaults.h. The values used were picked after experimentation
and seem to be the best trade-off between performance and memory usage.

Doing this gives a good boost in performance when a lot of servers are
used. With a configuration using 500 servers, about 830000 requests per
second could be processed before this patch; with it, about 1550000
requests per second are processed, on a 64-core AMD machine, using 1200
concurrent connections.

(See the two short illustrative sketches appended after the patch for
the ordering key and for the lb_seq packing.)
---

diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h
index d7e57a772..f0a9bf354 100644
--- a/include/haproxy/backend-t.h
+++ b/include/haproxy/backend-t.h
@@ -162,12 +162,15 @@ struct lbprm {
     int wmult;                    /* ratio between user weight and effective weight */
     int wdiv;                     /* ratio between effective weight and user weight */
     int hash_balance_factor;      /* load balancing factor * 100, 0 if disabled */
+    unsigned int lb_free_list_nb; /* Number of elements in the free list */
     struct sample_expr *expr;     /* sample expression for "balance (log-)hash" */
     char *arg_str;                /* name of the URL parameter/header/cookie used for hashing */
     int arg_len;                  /* strlen(arg_str), computed only once */
     int arg_opt1;                 /* extra option 1 for the LB algo (algo-specific) */
     int arg_opt2;                 /* extra option 2 for the LB algo (algo-specific) */
     int arg_opt3;                 /* extra option 3 for the LB algo (algo-specific) */
+    uint64_t lb_seq;              /* sequence number for algos who need it */
+    struct mt_list lb_free_list;  /* LB tree elements available */
     __decl_thread(HA_RWLOCK_T lock);
     struct server *fbck;          /* first backup server when !PR_O_USE_ALL_BK, or NULL */
diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index dbb39f4c3..eec78132b 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -602,4 +602,33 @@
 #define MAX_SELF_USE_QUEUE 9
 #endif
 
+/*
+ * FWLC defines
+ */
+
+/*
+ * How many mt_lists we use per tree element.
+ * The more lists we have, the less likely it
+ * will be that we'll have contention when
+ * inserting/removing an element, but the more
+ * costly it will be to look up servers.
+ */
+#ifndef FWLC_LISTS_NB
+#define FWLC_LISTS_NB 4
+#endif /* FWLC_LISTS_NB */
+
+/*
+ * How many entries we want to keep in the
+ * free list, before trying to use some.
+ * We want to keep some nodes in the tree,
+ * to avoid having to re-allocate one and
+ * modify the tree, which requires the
+ * write lock and is costly, but we
+ * don't want to keep too many, to save
+ * memory.
+ */
+#ifndef FWLC_MIN_FREE_ENTRIES
+#define FWLC_MIN_FREE_ENTRIES 500
+#endif /* FWLC_MIN_FREE_ENTRIES */
+
 #endif /* _HAPROXY_DEFAULTS_H */
diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h
index fe13318f8..716ac79fb 100644
--- a/include/haproxy/server-t.h
+++ b/include/haproxy/server-t.h
@@ -383,6 +383,12 @@ struct server {
     union {
         struct eb32_node lb_node;   /* node used for tree-based load balancing */
         struct list lb_list;        /* elem used for list-based load balancing */
+        struct {
+            struct fwlc_tree_elt *tree_elt; /* pointer to the element stored in tree, protected by lb_lock */
+            struct fwlc_tree_elt *free_elt; /* A free element, so that we don't have to allocate one, protected by lb_lock */
+            struct mt_list lb_mt_list;      /* elem used for mt list-based load balancing, protected by lb_lock */
+            int lb_lock;                    /* make sure we are the only one updating the server */
+        };
     };
     struct server *next_full;       /* next server in the temporary full list */
diff --git a/src/lb_fwlc.c b/src/lb_fwlc.c
index bb7e8979c..c647336c5 100644
--- a/src/lb_fwlc.c
+++ b/src/lb_fwlc.c
@@ -16,7 +16,47 @@
 #include
 #include
 #include
+#include
 
+struct fwlc_tree_elt {
+    struct mt_list srv_list[FWLC_LISTS_NB];
+    struct mt_list free_list;
+    struct eb32_node lb_node;
+    unsigned int elements;
+};
+
+DECLARE_STATIC_POOL(pool_head_fwlc_elt, "fwlc_tree_elt", sizeof(struct fwlc_tree_elt));
+
+#define FWLC_LBPRM_SEQ(lbprm)       ((lbprm) & 0xffffffff)
+#define FWLC_LBPRM_SMALLEST(lbprm)  ((lbprm) >> 32)
+
+/*
+ * Atomically try to update the sequence number, and the smallest key for which there is at least one server.
+ * Returns 1 on success, and 0 on failure.
+ */
+static int fwlc_set_seq_and_smallest(struct lbprm *lbprm, uint64_t current, unsigned int seq, unsigned int smallest)
+{
+#if !defined(HA_CAS_IS_8B) && !defined(HA_HAVE_CAS_DW)
+    __decl_thread(static HA_SPINLOCK_T seq_lock);
+#endif
+    uint64_t dst_nb = seq | ((uint64_t)smallest << 32);
+    int ret;
+#if defined(HA_CAS_IS_8B)
+    ret = _HA_ATOMIC_CAS(&lbprm->lb_seq, &current, dst_nb);
+#elif defined(HA_HAVE_CAS_DW)
+    ret = _HA_ATOMIC_DWCAS(&lbprm->lb_seq, &current, &dst_nb);
+#else
+    HA_SPIN_LOCK(OTHER_LOCK, &seq_lock);
+    if (lbprm->lb_seq == current) {
+        lbprm->lb_seq = dst_nb;
+        ret = 1;
+    } else
+        ret = 0;
+    HA_SPIN_UNLOCK(OTHER_LOCK, &seq_lock);
+#endif
+    return ret;
+
+}
 
 /* Remove a server from a tree. It must have previously been dequeued. This
  * function is meant to be called when a server is going down or has its
@@ -29,13 +69,137 @@ static inline void fwlc_remove_from_tree(struct server *s)
     s->lb_tree = NULL;
 }
 
+/*
+ * Remove anything allocated by the proxy
+ */
+static void fwlc_proxy_deinit(struct proxy *p)
+{
+    struct fwlc_tree_elt *tree_elt;
+
+    while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+        pool_free(pool_head_fwlc_elt, tree_elt);
+    }
+}
+
+/*
+ * Remove anything allocated by the server
+ */
+static void fwlc_server_deinit(struct server *s)
+{
+    if (s->free_elt) {
+        pool_free(pool_head_fwlc_elt, s->free_elt);
+        s->free_elt = NULL;
+    }
+}
+
 /* simply removes a server from a tree.
  *
  * The lbprm's lock must be held.
  */
 static inline void fwlc_dequeue_srv(struct server *s)
 {
-    eb32_delete(&s->lb_node);
+    struct fwlc_tree_elt *tree_elt = s->tree_elt;
+    unsigned int elts;
+
+    MT_LIST_DELETE(&s->lb_mt_list);
+    if (tree_elt) {
+        elts = _HA_ATOMIC_FETCH_SUB(&tree_elt->elements, 1);
+        /* We are the last element, we can nuke the node */
+        if (elts == 1) {
+            if (FWLC_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) == tree_elt->lb_node.key) {
+                /*
+                 * We were the smallest one, and now we're
+                 * gone, reset it
+                 */
+                /*
+                 * We're holding the lbprm lock so this should never fail,
+                 * as nobody should be around to modify it
+                 */
+                do {
+                } while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, FWLC_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, 0) == 0 && __ha_cpu_relax());
+
+            }
+            eb32_delete(&tree_elt->lb_node);
+        }
+    }
+    s->tree_elt = NULL;
+    if (s->free_elt) {
+        pool_free(pool_head_fwlc_elt, s->free_elt);
+        s->free_elt = NULL;
+    }
+}
+
+/*
+ * Allocate a tree element, either from the free list, from an element provided, or
+ * from allocation.
+ * Must be called with the wrlock
+ */
+static struct fwlc_tree_elt *fwlc_alloc_tree_elt(struct proxy *p, struct fwlc_tree_elt *allocated_elt)
+{
+    struct fwlc_tree_elt *tree_elt = NULL;
+    int i = 0;
+
+    if (p->lbprm.lb_free_list_nb >= FWLC_MIN_FREE_ENTRIES) {
+        while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+            MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+            if (tree_elt->elements == 0) {
+                eb32_delete(&tree_elt->lb_node);
+                if (i == 0) {
+                    struct fwlc_tree_elt *tmptree;
+
+                    tmptree = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list);
+                    /*
+                     * Check if the next element still contains servers, and if not,
+                     * just free it, to do some cleanup.
+                     */
+                    if (tmptree && tmptree->elements == 0) {
+                        eb32_delete(&tmptree->lb_node);
+                        pool_free(pool_head_fwlc_elt, tmptree);
+                        p->lbprm.lb_free_list_nb--;
+                    } else if (tmptree)
+                        MT_LIST_APPEND(&p->lbprm.lb_free_list, &tmptree->free_list);
+                }
+                return tree_elt;
+            }
+            i++;
+            if (i > 3)
+                break;
+        }
+    }
+    if (!allocated_elt)
+        tree_elt = pool_alloc(pool_head_fwlc_elt);
+    else
+        tree_elt = allocated_elt;
+
+    for (i = 0; i < FWLC_LISTS_NB; i++) {
+        MT_LIST_INIT(&tree_elt->srv_list[i]);
+    }
+    MT_LIST_INIT(&tree_elt->free_list);
+    MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+    p->lbprm.lb_free_list_nb++;
+    tree_elt->elements = 0;
+    return tree_elt;
+}
+
+/*
+ * Return the tree element for the provided key, allocate it first if needed.
+ * Must be called with the lbprm lock held.
+ */
+static struct fwlc_tree_elt *fwlc_get_tree_elt(struct server *s, u32 key)
+{
+    struct eb32_node *node;
+    struct fwlc_tree_elt *tree_elt = NULL;
+
+    node = eb32_lookup(s->lb_tree, key);
+    if (node)
+        tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+    if (!tree_elt) {
+        /* No element available, we have to allocate one */
+        tree_elt = fwlc_alloc_tree_elt(s->proxy, NULL);
+        tree_elt->lb_node.key = key;
+        eb32_insert(s->lb_tree, &tree_elt->lb_node);
+    }
+    return tree_elt;
+}
 
 /* Queue a server in its associated tree, assuming the <eweight> is >0.
@@ -58,10 +222,77 @@ static inline void fwlc_dequeue_srv(struct server *s)
  */
 static inline void fwlc_queue_srv(struct server *s, unsigned int eweight)
 {
+    struct fwlc_tree_elt *tree_elt;
     unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+    unsigned int list_nb;
+    u32 key;
+
+    key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
+    tree_elt = fwlc_get_tree_elt(s, key);
+    list_nb = statistical_prng_range(FWLC_LISTS_NB);
+    MT_LIST_APPEND(&tree_elt->srv_list[list_nb], &s->lb_mt_list);
+    s->tree_elt = tree_elt;
+    _HA_ATOMIC_INC(&tree_elt->elements);
+    if (FWLC_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) > key) {
+        /*
+         * We're holding the lbprm lock so this should never fail,
+         * as nobody should be around to modify it
+         */
+        do {
+        } while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, FWLC_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, key) == 0);
+    }
+}
+
+/*
+ * Loop across the different lists until we find an unlocked one, and lock it.
+ */
+static __inline struct mt_list fwlc_lock_target_list(struct fwlc_tree_elt *tree_elt)
+{
+    struct mt_list list = {NULL, NULL};
+    int i;
+    int dst_list;
+
+
+    dst_list = statistical_prng_range(FWLC_LISTS_NB);
+
+    while (list.next == NULL) {
+        for (i = 0; i < FWLC_LISTS_NB; i++) {
+            list = mt_list_try_lock_prev(&tree_elt->srv_list[(dst_list + i) % FWLC_LISTS_NB]);
+            if (list.next != NULL)
+                break;
+        }
+    }
+    return list;
+}
+
+/*
+ * Calculate the key to be used for a given server
+ */
+static inline unsigned int fwlc_get_key(struct server *s)
+{
+    unsigned int inflight;
+    unsigned int eweight;
+    unsigned int new_key;
+
+    inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+    eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+    new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
+
+    return new_key;
+}
+
+/*
+ * Only one thread will try to update a server position at a given time,
+ * thanks to the lb_lock. However that means that by the time we are done
+ * with the update, a new one might be needed, so check for that and
+ * schedule the tasklet if needed, once we dropped the lock.
+ */
+static inline void fwlc_check_srv_key(struct server *s, unsigned int expected)
+{
+    unsigned int key = fwlc_get_key(s);
 
-    s->lb_node.key = inflight ?
-        (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
-    eb32_insert(s->lb_tree, &s->lb_node);
+    if (key != expected && s->requeue_tasklet)
+        tasklet_wakeup(s->requeue_tasklet);
 }
 
 /* Re-position the server in the FWLC tree after it has been assigned one
@@ -71,45 +302,204 @@ static inline void fwlc_queue_srv(struct server *s, unsigned int eweight)
  */
 static void fwlc_srv_reposition(struct server *s)
 {
-    unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+    struct mt_list to_unlock;
+    struct fwlc_tree_elt *tree_elt = NULL, *allocated_elt = NULL;
+    struct eb32_node *node;
+    struct mt_list list;
+    uint64_t cur_seq = 0;
     unsigned int eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
-    unsigned int new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
+    unsigned int new_key;
+    unsigned int smallest;
+    int srv_lock;
 
+    HA_RWLOCK_RDLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+    new_key = fwlc_get_key(s);
     /* some calls will be made for no change (e.g connect_server() after
      * assign_server(). Let's check that first.
      */
-    if (s->lb_node.node.leaf_p && eweight && s->lb_node.key == new_key)
+    if ((s->tree_elt && s->tree_elt->lb_node.node.leaf_p && eweight &&
+        s->tree_elt->lb_node.key == new_key) || !s->lb_tree) {
+        HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+        return;
+    }
+
+    srv_lock = HA_ATOMIC_XCHG(&s->lb_lock, 1);
+    /* Somebody else is updating that server, give up */
+    if (srv_lock == 1) {
+        HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
         return;
+    }
 
-    if (HA_RWLOCK_TRYWRLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
-        /* there's already some contention on the tree's lock, there's
-         * no point insisting. Better wake up the server's tasklet that
-         * will let this or another thread retry later. For the time
-         * being, the server's apparent load is slightly inaccurate but
-         * we don't care, if there is contention, it will self-regulate.
+    node = eb32_lookup(s->lb_tree, new_key);
+    if (node)
+        tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+    /*
+     * It is possible that s->tree_elt was changed since we checked.
+     * As s->tree_elt is only changed while holding s->lb_lock,
+     * check again now that we acquired it, and if we're using
+     * the right element, do nothing.
      */
-        if (s->requeue_tasklet)
-            tasklet_wakeup(s->requeue_tasklet);
+    if (tree_elt == s->tree_elt) {
+        HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+        _HA_ATOMIC_STORE(&s->lb_lock, 0);
+        fwlc_check_srv_key(s, new_key);
         return;
     }
+    /*
+     * We have to allocate a new tree element, and/or remove the
+     * previous element, we will modify the tree, so let's get the write
+     * lock.
+     */
+    if (!tree_elt) {
+        unsigned int new_new_key;
+
+        /*
+         * We don't want to allocate something while holding the lock,
+         * so make sure we have something allocated before.
+         */
+        if (s->free_elt != NULL) {
+            allocated_elt = s->free_elt;
+            s->free_elt = NULL;
+        } else
+            allocated_elt = pool_alloc(pool_head_fwlc_elt);
+        if (HA_RWLOCK_TRYRDTOWR(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
+            /* there's already some contention on the tree's lock, there's
+             * no point insisting. Better wake up the server's tasklet that
+             * will let this or another thread retry later. For the time
+             * being, the server's apparent load is slightly inaccurate but
+             * we don't care, if there is contention, it will self-regulate.
+             */
+            if (s->requeue_tasklet)
+                tasklet_wakeup(s->requeue_tasklet);
+            HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+            s->free_elt = allocated_elt;
+            _HA_ATOMIC_STORE(&s->lb_lock, 0);
+            return;
+        }
 
-    /* below we've got the lock */
-    if (s->lb_tree) {
         /* we might have been waiting for a while on the lock above
          * so it's worth testing again because other threads are very
          * likely to have released a connection or taken one leading
          * to our target value (50% of the case in measurements).
          */
-        inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
-        eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
-        new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
-        if (!s->lb_node.node.leaf_p || s->lb_node.key != new_key) {
-            eb32_delete(&s->lb_node);
-            s->lb_node.key = new_key;
-            eb32_insert(s->lb_tree, &s->lb_node);
+
+        new_new_key = fwlc_get_key(s);
+        if (new_new_key != new_key) {
+            if (s->tree_elt &&
+                s->tree_elt->lb_node.node.leaf_p &&
+                eweight && s->tree_elt->lb_node.key == new_new_key) {
+                /* Okay after all we have nothing to do */
+                HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+                s->free_elt = allocated_elt;
+                _HA_ATOMIC_STORE(&s->lb_lock, 0);
+                fwlc_check_srv_key(s, new_new_key);
+                return;
+            }
+            node = eb32_lookup(s->lb_tree, new_new_key);
+            if (node) {
+                tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+                HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+                s->free_elt = allocated_elt;
+                allocated_elt = NULL;
+            } else
+                tree_elt = NULL;
+            new_key = new_new_key;
         }
     }
-    HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+    /*
+     * Now we increment the number of elements in the new tree_elt,
+     * we change our sequence number and smallest, and we then
+     * decrement the number of elements in the old tree_elt.
+     * It is important to keep this sequencing, as fwlc_get_next_server()
+     * uses the number of elements to know if there is something to look for,
+     * and we want to make sure we do not miss a server.
+     */
+    if (!tree_elt) {
+        /*
+         * There was no tree element matching our key,
+         * allocate one and insert it into the tree
+         */
+        tree_elt = fwlc_alloc_tree_elt(s->proxy, allocated_elt);
+        if (tree_elt == allocated_elt)
+            allocated_elt = NULL;
+        tree_elt->lb_node.key = new_key;
+        tree_elt->elements = 1;
+        __ha_barrier_store();
+        /* If we allocated, then we hold the write lock */
+        eb32_insert(s->lb_tree, &tree_elt->lb_node);
+        HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+    } else {
+        _HA_ATOMIC_INC(&tree_elt->elements);
+    }
+
+    __ha_barrier_store();
+    /*
+     * Update the sequence number, and the smallest if needed.
+     * We always have to do it, even if we're not actually
+     * updating the smallest one, otherwise we'll get an
+     * ABA problem and a server may be missed when looked up.
+     * The only time we don't have to do it is if another thread
+     * increased it, and the new smallest element is not
+     * higher than our new key.
+     */
+    do {
+        unsigned int tmpsmallest;
+        uint64_t newcurseq = _HA_ATOMIC_LOAD(&s->proxy->lbprm.lb_seq);
+
+        if (cur_seq != 0 && FWLC_LBPRM_SEQ(newcurseq) >
+            FWLC_LBPRM_SEQ(cur_seq) && new_key >= FWLC_LBPRM_SMALLEST(newcurseq))
+            break;
+
+        cur_seq = newcurseq;
+        tmpsmallest = FWLC_LBPRM_SMALLEST(cur_seq);
+        if (new_key > tmpsmallest)
+            smallest = tmpsmallest;
+        else
+            smallest = new_key;
+
+    } while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, cur_seq, FWLC_LBPRM_SEQ(cur_seq) + 1, smallest) == 0 && __ha_cpu_relax());
+
+    __ha_barrier_store();
+
+    if (s->tree_elt)
+        _HA_ATOMIC_DEC(&s->tree_elt->elements);
+
+    /*
+     * Now lock the existing element, and its target list.
+     * To prevent a deadlock, we always lock the one
+     * with the lowest key first.
+     */
+    if (new_key < s->tree_elt->lb_node.key) {
+        to_unlock = mt_list_lock_full(&s->lb_mt_list);
+        list = fwlc_lock_target_list(tree_elt);
+    } else {
+        list = fwlc_lock_target_list(tree_elt);
+        to_unlock = mt_list_lock_full(&s->lb_mt_list);
+    }
+
+    /*
+     * Unlock the old list, the element is now
+     * no longer in it.
+     */
+    mt_list_unlock_link(to_unlock);
+
+    /*
+     * Add the element to the new list, and unlock it.
+     */
+    mt_list_unlock_full(&s->lb_mt_list, list);
+
+    HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+    s->tree_elt = tree_elt;
+
+    if (allocated_elt)
+        s->free_elt = allocated_elt;
+
+    __ha_barrier_store();
+    _HA_ATOMIC_STORE(&s->lb_lock, 0);
+
+    fwlc_check_srv_key(s, new_key);
 }
 
 /* This function updates the server trees according to server <srv>'s new
@@ -306,6 +696,8 @@ void fwlc_init_server_tree(struct proxy *p)
     p->lbprm.server_take_conn = fwlc_srv_reposition;
     p->lbprm.server_drop_conn = fwlc_srv_reposition;
     p->lbprm.server_requeue = fwlc_srv_reposition;
+    p->lbprm.server_deinit = fwlc_server_deinit;
+    p->lbprm.proxy_deinit = fwlc_proxy_deinit;
     p->lbprm.wdiv = BE_WEIGHT_SCALE;
 
     for (srv = p->srv; srv; srv = srv->next) {
@@ -313,6 +705,8 @@ void fwlc_init_server_tree(struct proxy *p)
         srv_lb_commit_status(srv);
     }
 
+    p->lbprm.lb_seq = 0;
+
     recount_servers(p);
     update_backend_weight(p);
@@ -337,46 +731,128 @@ struct server *fwlc_get_next_server(struct proxy *p, struct server *srvtoavoid)
 {
     struct server *srv, *avoided;
     struct eb32_node *node;
+    uint64_t curseq;
+    int found = 0;
 
     srv = avoided = NULL;
     HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+    curseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+redo:
     if (p->srv_act)
-        node = eb32_first(&p->lbprm.fwlc.act);
+        node = eb32_lookup_ge(&p->lbprm.fwlc.act, FWLC_LBPRM_SMALLEST(curseq));
     else if (p->lbprm.fbck) {
         srv = p->lbprm.fbck;
         goto out;
     } else if (p->srv_bck)
-        node = eb32_first(&p->lbprm.fwlc.bck);
+        node = eb32_lookup_ge(&p->lbprm.fwlc.bck, FWLC_LBPRM_SMALLEST(curseq));
     else {
         srv = NULL;
        goto out;
     }
 
     while (node) {
-        /* OK, we have a server. However, it may be saturated, in which
-         * case we don't want to reconsider it for now, so we'll simply
-         * skip it. Same if it's the server we try to avoid, in which
-         * case we simply remember it for later use if needed.
-         */
+        struct fwlc_tree_elt *tree_elt;
         struct server *s;
+        int orig_nb;
+        int i = 0;
+
+        tree_elt = eb32_entry(node, struct fwlc_tree_elt, lb_node);
+        orig_nb = statistical_prng_range(FWLC_LISTS_NB);
+
+        while (_HA_ATOMIC_LOAD(&tree_elt->elements) > 0) {
+            struct mt_list mt_list;
+
+            mt_list.next = _HA_ATOMIC_LOAD(&tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB].next);
+
+            if (mt_list.next != &tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB] && mt_list.next != MT_LIST_BUSY) {
+                unsigned int eweight;
+                unsigned int planned_inflight;
+
+                s = container_of(mt_list.next, struct server, lb_mt_list);
+                eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+
+                planned_inflight = tree_elt->lb_node.key * eweight / SRV_EWGHT_MAX;
+                if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
+                    if (_HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength) > planned_inflight + 2) {
+                        /*
+                         * The server has more requests than expected,
+                         * let's try to reposition it, to avoid too
+                         * many threads using the same server at the
+                         * same time.
+                         */
+                        if (i >= FWLC_LISTS_NB) {
+                            HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+                            fwlc_srv_reposition(s);
+                            HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+                        }
+                        i++;
+                        continue;
+                    }
+                    if (s != srvtoavoid) {
+                        srv = s;
+                        found = 1;
+                        break;
+                    }
+                    avoided = s;
+                }
+                i++;
+            } else if (mt_list.next == &tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB]) {
+                i++;
+                continue;
+            } else {
+                i++;
+                continue;
+            }
+        }
+        if (found)
+            break;
 
-        s = eb32_entry(node, struct server, lb_node);
-        if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
-            if (s != srvtoavoid) {
-                srv = s;
-                break;
+        do {
+            node = eb32_next(node);
+        } while (node && node->key < FWLC_LBPRM_SMALLEST(curseq));
+
+        if (node) {
+            uint64_t newcurseq = HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+            /*
+             * If we have a bigger element than the smallest recorded, and we're up to date,
+             * update the smallest one.
+             */
+            if (likely(newcurseq == curseq && FWLC_LBPRM_SMALLEST(newcurseq) < node->key)) {
+                if (fwlc_set_seq_and_smallest(&p->lbprm, curseq, FWLC_LBPRM_SEQ(curseq), node->key) != 0) {
+                    curseq = FWLC_LBPRM_SEQ(curseq) | ((uint64_t)node->key << 32);
+                    __ha_barrier_store();
+                    continue;
+                }
+
+            }
+            /*
+             * Somebody added a new server in a node we already skipped, so retry from the beginning.
+             */
+            if (unlikely(FWLC_LBPRM_SMALLEST(newcurseq) < node->key && FWLC_LBPRM_SEQ(newcurseq) != FWLC_LBPRM_SEQ(curseq))) {
+                curseq = newcurseq;
+                goto redo;
+            }
+            curseq = newcurseq;
+        } else {
+            uint64_t newcurseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+            /*
+             * No more node, but somebody changed the tree, so it's
+             * worth trying again.
+             */
+            if (FWLC_LBPRM_SEQ(newcurseq) != FWLC_LBPRM_SEQ(curseq)) {
+                curseq = newcurseq;
+                goto redo;
             }
-            avoided = s;
         }
-        node = eb32_next(node);
     }
     if (!srv)
         srv = avoided;
 out:
     HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
     return srv;
 }
diff --git a/src/proxy.c b/src/proxy.c
index 9b786b33a..67807d7e7 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -1478,6 +1478,8 @@ void init_new_proxy(struct proxy *p)
     LIST_INIT(&p->filter_configs);
     LIST_INIT(&p->tcpcheck_rules.preset_vars);
 
+    MT_LIST_INIT(&p->lbprm.lb_free_list);
+
     p->defsrv.id = "default-server";
     p->conf.used_listener_id = EB_ROOT;
     p->conf.used_server_id = EB_ROOT;
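
Illustrative sketch 1 (editor-added, not HAProxy code): a minimal model of the
ordering key and of the random bucket choice described in the commit message.
toy_server, toy_key(), toy_pick_bucket(), LISTS_NB and EWGHT_MAX are made-up
stand-ins for struct server, fwlc_get_key(), statistical_prng_range(),
FWLC_LISTS_NB and SRV_EWGHT_MAX; only the formula
(inflight + 1) * SRV_EWGHT_MAX / eweight and the idea of spreading same-key
servers across several lists are taken from the patch.

#include <stdio.h>
#include <stdlib.h>

#define LISTS_NB   4        /* stands in for FWLC_LISTS_NB */
#define EWGHT_MAX  256      /* stands in for SRV_EWGHT_MAX */

struct toy_server {
	unsigned int served;       /* in-flight connections */
	unsigned int queueslength; /* queued requests */
	unsigned int eweight;      /* effective weight */
};

/* Same formula as fwlc_get_key(): servers are ordered by
 * (inflight + 1) * EWGHT_MAX / eweight, and idle servers get key 0.
 */
static unsigned int toy_key(const struct toy_server *s)
{
	unsigned int inflight = s->served + s->queueslength;

	return inflight ? (inflight + 1) * EWGHT_MAX / (s->eweight ? s->eweight : 1) : 0;
}

/* Picking one of the per-key buckets at random spreads concurrent insertions
 * of servers that share a key, which is what keeps list contention low.
 */
static unsigned int toy_pick_bucket(void)
{
	return (unsigned int)rand() % LISTS_NB;
}

int main(void)
{
	struct toy_server a = { .served = 10, .queueslength = 0, .eweight = 256 };
	struct toy_server b = { .served = 10, .queueslength = 0, .eweight = 128 };

	printf("key(a)=%u bucket=%u\n", toy_key(&a), toy_pick_bucket());
	printf("key(b)=%u bucket=%u\n", toy_key(&b), toy_pick_bucket());
	return 0;
}

With these numbers, server a (weight 256, 10 in flight) gets key 11 while
server b (weight 128, 10 in flight) gets key 22, so the server that is less
loaded relative to its weight sorts first in the tree, and two servers sharing
a key may still land in different buckets and be manipulated without
contending on the same list head.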
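
Illustrative sketch 2 (editor-added, not HAProxy code): the patch packs a
32-bit sequence number and the smallest populated key into the single 64-bit
lbprm->lb_seq field so that both can be read and updated atomically. The
sketch below shows that packing with C11 atomics instead of HAProxy's
_HA_ATOMIC_CAS()/_HA_ATOMIC_DWCAS() wrappers; SEQ(), SMALLEST() and
set_seq_and_smallest() are local stand-ins for FWLC_LBPRM_SEQ(),
FWLC_LBPRM_SMALLEST() and fwlc_set_seq_and_smallest(), and the spinlock
fallback for platforms without a 64-bit CAS is left out.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SEQ(v)      ((unsigned int)((v) & 0xffffffff)) /* low 32 bits: sequence number */
#define SMALLEST(v) ((unsigned int)((v) >> 32))        /* high 32 bits: smallest populated key */

static _Atomic uint64_t lb_seq; /* starts at seq=0, smallest=0 */

/* Try to move from the observed value <cur> to (<seq>, <smallest>).
 * Returns 1 on success, 0 if another thread won the race, in which case the
 * caller re-reads lb_seq and retries, like the do/while loops in the patch.
 */
static int set_seq_and_smallest(uint64_t cur, unsigned int seq, unsigned int smallest)
{
	uint64_t next = (uint64_t)seq | ((uint64_t)smallest << 32);

	return atomic_compare_exchange_strong(&lb_seq, &cur, next);
}

int main(void)
{
	uint64_t cur = atomic_load(&lb_seq);

	/* A server was queued under key 3, lower than the recorded smallest:
	 * bump the sequence and publish the new smallest key.
	 */
	while (!set_seq_and_smallest(cur, SEQ(cur) + 1, 3))
		cur = atomic_load(&lb_seq);

	cur = atomic_load(&lb_seq);
	printf("seq=%u smallest=%u\n", SEQ(cur), SMALLEST(cur));
	return 0;
}

Bumping the sequence number on every update, even when the smallest key itself
does not change, is what lets fwlc_get_next_server() notice that a server may
have been inserted below the point it already scanned and restart its lookup,
as explained in the comments of the patch.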