MAJOR: leastconn: Revamp the way servers are ordered.
author Olivier Houchard <ohouchard@haproxy.com>
Wed, 26 Mar 2025 16:16:48 +0000 (16:16 +0000)
committer Olivier Houchard <cognet@ci0.org>
Tue, 1 Apr 2025 16:05:30 +0000 (18:05 +0200)
For leastconn, servers used to be stored directly in an ebtree,
with each server being one node.
Change that so that tree nodes contain multiple mt_lists. Each list
holds servers that share the same key (typically meaning they have
the same number of connections). Using mt_lists means that, as long
as the tree element already exists, moving a server from one tree
element to another no longer requires the lbprm write lock.
Multiple mt_lists are used to reduce contention when moving a
server from one tree element to another; the list in the new
element is chosen randomly.
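To illustrate, here is a minimal standalone sketch of how a
server's key is computed and how a list is picked inside a tree
element. The constant values, the PRNG and the function name are
illustrative stand-ins; the real code uses SRV_EWGHT_MAX,
statistical_prng_range() and mt_lists, as shown in the lb_fwlc.c
diff below:

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative values; the real ones come from HAProxy's headers. */
    #define FWLC_LISTS_NB 4
    #define SRV_EWGHT_MAX 4096

    /* Key ordering servers in the tree: roughly in-flight load over weight. */
    static unsigned int fwlc_key(unsigned int inflight, unsigned int eweight)
    {
            return inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
    }

    int main(void)
    {
            /* Servers with the same key share one tree element, but each is
             * appended to a randomly chosen list of that element, which
             * spreads contention between threads. */
            unsigned int key = fwlc_key(10, 256);
            unsigned int list_nb = rand() % FWLC_LISTS_NB; /* stand-in for statistical_prng_range() */

            printf("key=%u, list=%u\n", key, list_nb);
            return 0;
    }
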
Tree elements are no longer removed as soon as they no longer
contain any server. Instead, we keep a list of all elements, and
when we need a new element, we reuse one from that list only if it
already holds enough entries; otherwise we allocate a new one.
Keeping nodes in the tree ensures that we very rarely have to take
the lbprm write lock (it is only needed when moving a server to a
position for which no element currently exists in the tree).

The number of mt_lists used per tree element is defined as
FWLC_LISTS_NB, and the number of tree elements to keep in the free
list as FWLC_MIN_FREE_ENTRIES, both in defaults.h.
The values used were picked after experimentation and seem to offer
the best trade-off between performance and memory usage.
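As a rough sketch of the reuse policy described above (standalone C
with simplified, hypothetical type and function names; the real
fwlc_alloc_tree_elt() works on mt_lists under the lbprm lock and
only probes a few candidates before giving up):

    #include <stdlib.h>

    #define FWLC_MIN_FREE_ENTRIES 500   /* same default as in defaults.h */

    struct elt {
            struct elt *next;           /* stand-in for the mt_list linkage */
            unsigned int elements;      /* servers still attached to this element */
    };

    struct free_pool {
            struct elt *head;
            unsigned int nb;
    };

    /* Reuse an idle element only once the pool is large enough; otherwise
     * allocate a fresh one, which also joins the pool. */
    static struct elt *get_elt(struct free_pool *fp)
    {
            struct elt *e;

            if (fp->nb >= FWLC_MIN_FREE_ENTRIES) {
                    for (e = fp->head; e; e = e->next)
                            if (e->elements == 0)
                                    return e; /* empty: safe to rekey and reuse */
            }
            e = calloc(1, sizeof(*e));
            e->next = fp->head;
            fp->head = e;
            fp->nb++;
            return e;
    }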

Doing that gives a good performance boost when many servers are
used.
With a configuration using 500 servers, on a 64-core AMD machine
with 1200 concurrent connections, about 830000 requests per second
could be processed before this patch, versus about 1550000 requests
per second with it.

include/haproxy/backend-t.h
include/haproxy/defaults.h
include/haproxy/server-t.h
src/lb_fwlc.c
src/proxy.c

diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h
index d7e57a7726051c91d4d67755c36321cb882a19eb..f0a9bf354d88c9089f36a299d69bb2afca5adcd7 100644
@@ -162,12 +162,15 @@ struct lbprm {
        int wmult;                      /* ratio between user weight and effective weight */
        int wdiv;                       /* ratio between effective weight and user weight */
        int hash_balance_factor;        /* load balancing factor * 100, 0 if disabled */
+       unsigned int lb_free_list_nb;   /* Number of elements in the free list */
        struct sample_expr *expr;       /* sample expression for "balance (log-)hash" */
        char *arg_str;                  /* name of the URL parameter/header/cookie used for hashing */
        int   arg_len;                  /* strlen(arg_str), computed only once */
        int   arg_opt1;                 /* extra option 1 for the LB algo (algo-specific) */
        int   arg_opt2;                 /* extra option 2 for the LB algo (algo-specific) */
        int   arg_opt3;                 /* extra option 3 for the LB algo (algo-specific) */
+       uint64_t lb_seq;                /* sequence number for algos who need it */
+       struct mt_list lb_free_list;    /* LB tree elements available */
        __decl_thread(HA_RWLOCK_T lock);
        struct server *fbck;            /* first backup server when !PR_O_USE_ALL_BK, or NULL */
 
diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index dbb39f4c3b57d2b39770da547c6741b1b2631864..eec78132be96a9726c4cab3c1e8fe3d084d6f649 100644
 #define MAX_SELF_USE_QUEUE 9
 #endif
 
+/*
+ * FWLC defines
+ */
+
+/*
+ * How many mt_lists we use per tree element.
+ * The more lists we have, the less likely
+ * we are to suffer contention when
+ * inserting/removing an element, but the more
+ * costly it will be to look up servers.
+ */
+#ifndef FWLC_LISTS_NB
+#define FWLC_LISTS_NB   4
+#endif /* FWLC_LISTS_NB */
+
+/*
+ * How many entries we want to keep in the
+ * free list before we start reusing them.
+ * We want to keep some nodes in the tree,
+ * to avoid having to re-allocate one and
+ * modify the tree, which requires the
+ * write lock and is costly, but we
+ * don't want to keep too many, to save
+ * memory.
+ */
+#ifndef FWLC_MIN_FREE_ENTRIES
+#define FWLC_MIN_FREE_ENTRIES 500
+#endif /* FWLC_MIN_FREE_ENTRIES */
+
 #endif /* _HAPROXY_DEFAULTS_H */
diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h
index fe13318f862115ea63dd5ea8646e55475c993dd5..716ac79fb11d5c4c9fcc37c3687b5fef7eae784e 100644
@@ -383,6 +383,12 @@ struct server {
        union {
                struct eb32_node lb_node;       /* node used for tree-based load balancing */
                struct list lb_list;            /* elem used for list-based load balancing */
+               struct {
+                       struct fwlc_tree_elt *tree_elt; /* pointer to the element stored in tree, protected by lb_lock  */
+                       struct fwlc_tree_elt *free_elt; /* A free element, so that we don't have to allocate one, protected by lb_lock */
+                       struct mt_list lb_mt_list;      /* elem used for mt list-based load balancing, protected by lb_lock */
+                       int lb_lock;                    /* make sure we are the only one updating the server */
+               };
        };
        struct server *next_full;               /* next server in the temporary full list */
 
diff --git a/src/lb_fwlc.c b/src/lb_fwlc.c
index bb7e8979c1cb9a8c69b0e011bcb5340fbd767fdb..c647336c538466e8f59b4b12ff98790bac00846b 100644
 #include <haproxy/queue.h>
 #include <haproxy/server-t.h>
 #include <haproxy/task.h>
+#include <haproxy/tools.h>
 
+struct fwlc_tree_elt {
+       struct mt_list srv_list[FWLC_LISTS_NB]; /* lists of servers sharing this node's key */
+       struct mt_list free_list;               /* linkage into the proxy's lb_free_list */
+       struct eb32_node lb_node;               /* node in the fwlc ebtree, keyed by the servers' key */
+       unsigned int elements;                  /* number of servers attached to this element */
+};
+
+DECLARE_STATIC_POOL(pool_head_fwlc_elt, "fwlc_tree_elt", sizeof(struct fwlc_tree_elt));
+
+/* lb_seq packs a sequence number (low 32 bits) and the smallest key
+ * holding at least one server (high 32 bits) into one 64-bit word.
+ */
+#define FWLC_LBPRM_SEQ(lbprm)          ((lbprm) & 0xffffffff)
+#define FWLC_LBPRM_SMALLEST(lbprm)     ((lbprm) >> 32)
+
+/*
+ * Atomically try to update the sequence number, and the smallest key for which there is at least one server.
+ * Returns 1 on success, and 0 on failure.
+ */
+static int fwlc_set_seq_and_smallest(struct lbprm *lbprm, uint64_t current, unsigned int seq, unsigned int smallest)
+{
+#if !defined(HA_CAS_IS_8B) && !defined(HA_HAVE_CAS_DW)
+       __decl_thread(static HA_SPINLOCK_T seq_lock);
+#endif
+       uint64_t dst_nb = seq | ((uint64_t)smallest << 32);
+       int ret;
+#if defined(HA_CAS_IS_8B)
+       ret =  _HA_ATOMIC_CAS(&lbprm->lb_seq, &current, dst_nb);
+#elif defined(HA_HAVE_CAS_DW)
+       ret = _HA_ATOMIC_DWCAS(&lbprm->lb_seq, &current, &dst_nb);
+#else
+       HA_SPIN_LOCK(OTHER_LOCK, &seq_lock);
+       if (lbprm->lb_seq == current) {
+               lbprm->lb_seq = dst_nb;
+               ret = 1;
+       } else
+               ret = 0;
+       HA_SPIN_UNLOCK(OTHER_LOCK, &seq_lock);
+#endif
+       return ret;
+
+}
 
 /* Remove a server from a tree. It must have previously been dequeued. This
  * function is meant to be called when a server is going down or has its
@@ -29,13 +69,137 @@ static inline void fwlc_remove_from_tree(struct server *s)
        s->lb_tree = NULL;
 }
 
+/*
+ * Remove anything allocated by the proxy
+ */
+static void fwlc_proxy_deinit(struct proxy *p)
+{
+       struct fwlc_tree_elt *tree_elt;
+
+       while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+               pool_free(pool_head_fwlc_elt, tree_elt);
+       }
+}
+
+/*
+ * Remove anything allocated by the server
+ */
+static void fwlc_server_deinit(struct server *s)
+{
+       if (s->free_elt) {
+               pool_free(pool_head_fwlc_elt, s->free_elt);
+               s->free_elt = NULL;
+       }
+}
+
 /* simply removes a server from a tree.
  *
  * The lbprm's lock must be held.
  */
 static inline void fwlc_dequeue_srv(struct server *s)
 {
-       eb32_delete(&s->lb_node);
+       struct fwlc_tree_elt *tree_elt = s->tree_elt;
+       unsigned int elts;
+
+       MT_LIST_DELETE(&s->lb_mt_list);
+       if (tree_elt) {
+               elts = _HA_ATOMIC_FETCH_SUB(&tree_elt->elements, 1);
+               /* We are the last element, we can nuke the node */
+               if (elts == 1) {
+                       if (FWLC_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) == tree_elt->lb_node.key) {
+                               /*
+                                * We were the smallest one, and now we're
+                                * gone, reset it
+                                */
+                               /*
+                                * We're holding the lbprm lock so this should never fail,
+                                * as nobody should be around to modify it
+                                */
+                               do {
+                               } while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, FWLC_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, 0) == 0 && __ha_cpu_relax());
+
+                       }
+                       eb32_delete(&tree_elt->lb_node);
+               }
+       }
+       s->tree_elt = NULL;
+       if (s->free_elt) {
+               pool_free(pool_head_fwlc_elt, s->free_elt);
+               s->free_elt = NULL;
+       }
+}
+
+/*
+ * Allocate a tree element: take one from the free list, use the
+ * element provided, or allocate a new one.
+ * Must be called with the wrlock held.
+ */
+static struct fwlc_tree_elt *fwlc_alloc_tree_elt(struct proxy *p, struct fwlc_tree_elt *allocated_elt)
+{
+       struct fwlc_tree_elt *tree_elt = NULL;
+       int i = 0;
+
+       if (p->lbprm.lb_free_list_nb >= FWLC_MIN_FREE_ENTRIES) {
+               while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+                       MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+                       if (tree_elt->elements == 0) {
+                               eb32_delete(&tree_elt->lb_node);
+                               if (i == 0) {
+                                       struct fwlc_tree_elt *tmptree;
+
+                                       tmptree = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list);
+                                       /*
+                                        * Check if the next element still contains servers, and if not,
+                                        * just free it, to do some cleanup.
+                                        */
+                                       if (tmptree && tmptree->elements == 0) {
+                                               eb32_delete(&tmptree->lb_node);
+                                               pool_free(pool_head_fwlc_elt, tmptree);
+                                               p->lbprm.lb_free_list_nb--;
+                                       } else if (tmptree)
+                                               MT_LIST_APPEND(&p->lbprm.lb_free_list, &tmptree->free_list);
+                               }
+                               return tree_elt;
+                       }
+                       i++;
+                       if (i > 3)
+                               break;
+               }
+       }
+       if (!allocated_elt)
+               tree_elt = pool_alloc(pool_head_fwlc_elt);
+       else
+               tree_elt = allocated_elt;
+
+       for (i = 0; i < FWLC_LISTS_NB; i++) {
+               MT_LIST_INIT(&tree_elt->srv_list[i]);
+       }
+       MT_LIST_INIT(&tree_elt->free_list);
+       MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+       p->lbprm.lb_free_list_nb++;
+       tree_elt->elements = 0;
+       return tree_elt;
+}
+
+/*
+ * Return the tree element for the provided key, allocating it first if needed.
+ * Must be called with the lbprm lock held.
+ */
+static struct fwlc_tree_elt *fwlc_get_tree_elt(struct server *s, u32 key)
+{
+       struct eb32_node *node;
+       struct fwlc_tree_elt *tree_elt = NULL;
+
+       node = eb32_lookup(s->lb_tree, key);
+       if (node)
+               tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+       if (!tree_elt) {
+               /* No element available, we have to allocate one */
+               tree_elt = fwlc_alloc_tree_elt(s->proxy, NULL);
+               tree_elt->lb_node.key = key;
+               eb32_insert(s->lb_tree, &tree_elt->lb_node);
+       }
+       return tree_elt;
 }
 
 /* Queue a server in its associated tree, assuming the <eweight> is >0.
@@ -58,10 +222,77 @@ static inline void fwlc_dequeue_srv(struct server *s)
  */
 static inline void fwlc_queue_srv(struct server *s, unsigned int eweight)
 {
+       struct fwlc_tree_elt *tree_elt;
        unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+       unsigned int list_nb;
+       u32 key;
+
+       key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
+       tree_elt = fwlc_get_tree_elt(s, key);
+       list_nb = statistical_prng_range(FWLC_LISTS_NB);
+       MT_LIST_APPEND(&tree_elt->srv_list[list_nb], &s->lb_mt_list);
+       s->tree_elt = tree_elt;
+       _HA_ATOMIC_INC(&tree_elt->elements);
+       if (FWLC_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) > key) {
+               /*
+                * We're holding the lbprm lock so this should never fail,
+                * as nobody should be around to modify it
+                */
+               do {
+               } while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, FWLC_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, key) == 0);
+       }
+}
+
+/*
+ * Loop across the different lists until we find an unlocked one, and lock it.
+ */
+static __inline struct mt_list fwlc_lock_target_list(struct fwlc_tree_elt *tree_elt)
+{
+       struct mt_list list = {NULL, NULL};
+       int i;
+       int dst_list;
+
+
+       dst_list = statistical_prng_range(FWLC_LISTS_NB);
+
+       while (list.next == NULL) {
+               for (i = 0; i < FWLC_LISTS_NB; i++) {
+                       list = mt_list_try_lock_prev(&tree_elt->srv_list[(dst_list + i) % FWLC_LISTS_NB]);
+                       if (list.next != NULL)
+                               break;
+               }
+       }
+       return list;
+}
+
+/*
+ * Calculate the key to be used for a given server
+ */
+static inline unsigned int fwlc_get_key(struct server *s)
+{
+       unsigned int inflight;
+       unsigned int eweight;
+       unsigned int new_key;
+
+       inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+       eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+       new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
+
+       return new_key;
+}
+
+/*
+ * Only one thread will try to update a server position at a given time,
+ * thanks to the lb_lock. However, that means that by the time we are done
+ * with the update, a new one might be needed, so check for that and
+ * schedule the tasklet if needed, once we have dropped the lock.
+ */
+static inline void fwlc_check_srv_key(struct server *s, unsigned int expected)
+{
+       unsigned int key = fwlc_get_key(s);
 
-       s->lb_node.key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
-       eb32_insert(s->lb_tree, &s->lb_node);
+       if (key != expected && s->requeue_tasklet)
+               tasklet_wakeup(s->requeue_tasklet);
 }
 
 /* Re-position the server in the FWLC tree after it has been assigned one
@@ -71,45 +302,204 @@ static inline void fwlc_queue_srv(struct server *s, unsigned int eweight)
  */
 static void fwlc_srv_reposition(struct server *s)
 {
-       unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+       struct mt_list to_unlock;
+       struct fwlc_tree_elt *tree_elt = NULL, *allocated_elt = NULL;
+       struct eb32_node *node;
+       struct mt_list list;
+       uint64_t cur_seq = 0;
        unsigned int eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
-       unsigned int new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
+       unsigned int new_key;
+       unsigned int smallest;
+       int srv_lock;
 
+       HA_RWLOCK_RDLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+       new_key = fwlc_get_key(s);
        /* some calls will be made for no change (e.g connect_server() after
         * assign_server(). Let's check that first.
         */
-       if (s->lb_node.node.leaf_p && eweight && s->lb_node.key == new_key)
+       if ((s->tree_elt && s->tree_elt->lb_node.node.leaf_p && eweight &&
+           s->tree_elt->lb_node.key == new_key) || !s->lb_tree) {
+               HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+               return;
+       }
+
+       srv_lock = HA_ATOMIC_XCHG(&s->lb_lock, 1);
+       /* Somebody else is updating that server, give up */
+       if (srv_lock == 1) {
+               HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
                return;
+       }
 
-       if (HA_RWLOCK_TRYWRLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
-               /* there's already some contention on the tree's lock, there's
-                * no point insisting. Better wake up the server's tasklet that
-                * will let this or another thread retry later. For the time
-                * being, the server's apparent load is slightly inaccurate but
-                * we don't care, if there is contention, it will self-regulate.
+       node = eb32_lookup(s->lb_tree, new_key);
+       if (node)
+               tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+               /*
+                * It is possible that s->tree_elt was changed since we checked
+                * As s->tree_elt is only changed while holding s->lb_lock,
+                * check again now that we acquired it, and if we're using
+                * the right element, do nothing.
                 */
-               if (s->requeue_tasklet)
-                       tasklet_wakeup(s->requeue_tasklet);
+       if (tree_elt == s->tree_elt) {
+               HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+               _HA_ATOMIC_STORE(&s->lb_lock, 0);
+               fwlc_check_srv_key(s, new_key);
                return;
        }
+       /*
+        * We have to allocate a new tree element and/or remove the
+        * previous one; we will modify the tree, so let's get the write
+        * lock.
+        */
+       if (!tree_elt) {
+               unsigned int new_new_key;
+
+               /*
+                * We don't want to allocate something while holding the lock,
+                * so make sure we have something allocated before.
+                */
+               if (s->free_elt != NULL) {
+                       allocated_elt = s->free_elt;
+                       s->free_elt = NULL;
+               } else
+                       allocated_elt = pool_alloc(pool_head_fwlc_elt);
+               if (HA_RWLOCK_TRYRDTOWR(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
+                       /* there's already some contention on the tree's lock, there's
+                        * no point insisting. Better wake up the server's tasklet that
+                        * will let this or another thread retry later. For the time
+                        * being, the server's apparent load is slightly inaccurate but
+                        * we don't care, if there is contention, it will self-regulate.
+                        */
+                       if (s->requeue_tasklet)
+                               tasklet_wakeup(s->requeue_tasklet);
+                       HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+                       s->free_elt = allocated_elt;
+                       _HA_ATOMIC_STORE(&s->lb_lock, 0);
+                       return;
+               }
 
-       /* below we've got the lock */
-       if (s->lb_tree) {
                /* we might have been waiting for a while on the lock above
                 * so it's worth testing again because other threads are very
                 * likely to have released a connection or taken one leading
                 * to our target value (50% of the case in measurements).
                 */
-               inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
-               eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
-               new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
-               if (!s->lb_node.node.leaf_p || s->lb_node.key != new_key) {
-                       eb32_delete(&s->lb_node);
-                       s->lb_node.key = new_key;
-                       eb32_insert(s->lb_tree, &s->lb_node);
+
+               new_new_key = fwlc_get_key(s);
+               if (new_new_key != new_key) {
+                       if (s->tree_elt &&
+                           s->tree_elt->lb_node.node.leaf_p &&
+                           eweight && s->tree_elt->lb_node.key == new_new_key) {
+                               /* Okay after all we have nothing to do */
+                               HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+                               s->free_elt = allocated_elt;
+                               _HA_ATOMIC_STORE(&s->lb_lock, 0);
+                               fwlc_check_srv_key(s, new_new_key);
+                               return;
+                       }
+                       node = eb32_lookup(s->lb_tree, new_new_key);
+                       if (node) {
+                               tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+                               HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+                               s->free_elt = allocated_elt;
+                               allocated_elt = NULL;
+                       } else
+                               tree_elt = NULL;
+                       new_key = new_new_key;
                }
        }
-       HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+       /*
+        * Now we increment the number of elements in the new tree_elt,
+        * we change our sequence number and smallest, and we then
+        * decrement the number of elements in the old tree_elt.
+        * It is important to keep this sequencing, as fwlc_get_next_server()
+        * uses the number of elements to know if there is something to look for,
+        * and we want to make sure we do not miss a server.
+        */
+       if (!tree_elt) {
+               /*
+                * There was no tree element matching our key,
+                * allocate one and insert it into the tree.
+                */
+               tree_elt = fwlc_alloc_tree_elt(s->proxy, allocated_elt);
+               if (tree_elt == allocated_elt)
+                       allocated_elt = NULL;
+               tree_elt->lb_node.key = new_key;
+               tree_elt->elements = 1;
+               __ha_barrier_store();
+               /* If we allocated, then we hold the write lock */
+               eb32_insert(s->lb_tree, &tree_elt->lb_node);
+               HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+       } else {
+               _HA_ATOMIC_INC(&tree_elt->elements);
+       }
+
+       __ha_barrier_store();
+       /*
+        * Update the sequence number, and the smallest key if needed.
+        * We always have to do it, even if we're not actually
+        * updating the smallest one, otherwise we'll get an
+        * ABA problem and a server may be missed when looked up.
+        * The only time we don't have to do it is if another thread
+        * increased it and the new smallest element is not
+        * higher than our new key.
+        */
+       do {
+               unsigned int tmpsmallest;
+               uint64_t newcurseq = _HA_ATOMIC_LOAD(&s->proxy->lbprm.lb_seq);
+
+               if (cur_seq != 0 && FWLC_LBPRM_SEQ(newcurseq) >
+                   FWLC_LBPRM_SEQ(cur_seq) && new_key >= FWLC_LBPRM_SMALLEST(newcurseq))
+                       break;
+
+               cur_seq = newcurseq;
+               tmpsmallest = FWLC_LBPRM_SMALLEST(cur_seq);
+               if (new_key > tmpsmallest)
+                       smallest = tmpsmallest;
+               else
+                       smallest = new_key;
+
+       } while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, cur_seq, FWLC_LBPRM_SEQ(cur_seq) + 1, smallest) == 0 && __ha_cpu_relax());
+
+       __ha_barrier_store();
+
+       if (s->tree_elt)
+                _HA_ATOMIC_DEC(&s->tree_elt->elements);
+
+       /*
+        * Now lock the existing element, and its target list.
+        * To prevent a deadlock, we always lock the one
+        * with the lowest key first.
+        */
+       if (new_key < s->tree_elt->lb_node.key) {
+               to_unlock = mt_list_lock_full(&s->lb_mt_list);
+               list = fwlc_lock_target_list(tree_elt);
+       } else {
+               list = fwlc_lock_target_list(tree_elt);
+               to_unlock = mt_list_lock_full(&s->lb_mt_list);
+       }
+
+       /*
+        * Unlock the old list, the element is now
+        * no longer in it.
+        */
+       mt_list_unlock_link(to_unlock);
+
+       /*
+        * Add the element to the new list, and unlock it.
+        */
+       mt_list_unlock_full(&s->lb_mt_list, list);
+
+       HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+       s->tree_elt = tree_elt;
+
+       if (allocated_elt)
+               s->free_elt = allocated_elt;
+
+       __ha_barrier_store();
+       _HA_ATOMIC_STORE(&s->lb_lock, 0);
+
+       fwlc_check_srv_key(s, new_key);
 }
 
 /* This function updates the server trees according to server <srv>'s new
@@ -306,6 +696,8 @@ void fwlc_init_server_tree(struct proxy *p)
        p->lbprm.server_take_conn = fwlc_srv_reposition;
        p->lbprm.server_drop_conn = fwlc_srv_reposition;
        p->lbprm.server_requeue   = fwlc_srv_reposition;
+       p->lbprm.server_deinit    = fwlc_server_deinit;
+       p->lbprm.proxy_deinit     = fwlc_proxy_deinit;
 
        p->lbprm.wdiv = BE_WEIGHT_SCALE;
        for (srv = p->srv; srv; srv = srv->next) {
@@ -313,6 +705,8 @@ void fwlc_init_server_tree(struct proxy *p)
                srv_lb_commit_status(srv);
        }
 
+       p->lbprm.lb_seq = 0;
+
        recount_servers(p);
        update_backend_weight(p);
 
@@ -337,46 +731,128 @@ struct server *fwlc_get_next_server(struct proxy *p, struct server *srvtoavoid)
 {
        struct server *srv, *avoided;
        struct eb32_node *node;
+       uint64_t curseq;
+       int found = 0;
 
        srv = avoided = NULL;
 
        HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+       curseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+redo:
        if (p->srv_act)
-               node = eb32_first(&p->lbprm.fwlc.act);
+               node = eb32_lookup_ge(&p->lbprm.fwlc.act, FWLC_LBPRM_SMALLEST(curseq));
        else if (p->lbprm.fbck) {
                srv = p->lbprm.fbck;
                goto out;
        }
        else if (p->srv_bck)
-               node = eb32_first(&p->lbprm.fwlc.bck);
+               node = eb32_lookup_ge(&p->lbprm.fwlc.bck, FWLC_LBPRM_SMALLEST(curseq));
        else {
                srv = NULL;
                goto out;
        }
 
        while (node) {
-               /* OK, we have a server. However, it may be saturated, in which
-                * case we don't want to reconsider it for now, so we'll simply
-                * skip it. Same if it's the server we try to avoid, in which
-                * case we simply remember it for later use if needed.
-                */
+               struct fwlc_tree_elt *tree_elt;
                struct server *s;
+               int orig_nb;
+               int i = 0;
+
+               tree_elt = eb32_entry(node, struct fwlc_tree_elt, lb_node);
+               orig_nb = statistical_prng_range(FWLC_LISTS_NB);
+
+               while (_HA_ATOMIC_LOAD(&tree_elt->elements) > 0) {
+                       struct mt_list mt_list;
+                       mt_list.next = _HA_ATOMIC_LOAD(&tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB].next);
+
+                       if (mt_list.next != &tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB] && mt_list.next != MT_LIST_BUSY) {
+                               unsigned int eweight;
+                               unsigned int planned_inflight;
+                               s = container_of(mt_list.next, struct server, lb_mt_list);
+                               eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+
+                               planned_inflight = tree_elt->lb_node.key * eweight / SRV_EWGHT_MAX;
+                               if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
+                                       if (_HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength) > planned_inflight + 2) {
+                                               /*
+                                                * The server has more requests than expected,
+                                                * let's try to reposition it, to avoid too
+                                                * many threads using the same server at the
+                                                * same time.
+                                                */
+                                               if (i >= FWLC_LISTS_NB) {
+                                                       HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+                                                       fwlc_srv_reposition(s);
+                                                       HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+                                               }
+                                               i++;
+                                               continue;
+                                       }
+                                        if (s != srvtoavoid) {
+                                                srv = s;
+                                                found = 1;
+                                                break;
+                                        }
+                                       avoided = s;
+                               }
+                               i++;
+                       } else if (mt_list.next == &tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB]) {
+                               /* The list is empty, move on to the next one */
+                               i++;
+                               continue;
+                       } else {
+                               /* The list is locked (MT_LIST_BUSY), move on to the next one */
+                               i++;
+                               continue;
+                       }
+               }
+               if (found)
+                       break;
 
-               s = eb32_entry(node, struct server, lb_node);
-               if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
-                       if (s != srvtoavoid) {
-                               srv = s;
-                               break;
+               do {
+                       node = eb32_next(node);
+               } while (node && node->key < FWLC_LBPRM_SMALLEST(curseq));
+
+               if (node) {
+                       uint64_t newcurseq = HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+                       /*
+                        * If we have a bigger element than the smallest recorded, and we're up to date,
+                        * update the smallest one.
+                        */
+                       if (likely(newcurseq == curseq && FWLC_LBPRM_SMALLEST(newcurseq) < node->key)) {
+                               if (fwlc_set_seq_and_smallest(&p->lbprm, curseq, FWLC_LBPRM_SEQ(curseq), node->key) != 0) {
+                                       curseq = FWLC_LBPRM_SEQ(curseq) | ((uint64_t)node->key << 32);
+                                       __ha_barrier_store();
+                                       continue;
+                               }
+
+                       }
+                       /*
+                        * Somebody added a new server in a node we already skipped, so retry from the beginning.
+                        */
+                       if (unlikely(FWLC_LBPRM_SMALLEST(newcurseq) < node->key && FWLC_LBPRM_SEQ(newcurseq) != FWLC_LBPRM_SEQ(curseq))) {
+                               curseq = newcurseq;
+                               goto redo;
+                       }
+                       curseq = newcurseq;
+               } else {
+                       uint64_t newcurseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+                       /*
+                        * No more nodes, but somebody changed the tree, so it's
+                        * worth trying again.
+                        */
+                       if (FWLC_LBPRM_SEQ(newcurseq) != FWLC_LBPRM_SEQ(curseq)) {
+                               curseq = newcurseq;
+                               goto redo;
                        }
-                       avoided = s;
                }
-               node = eb32_next(node);
        }
 
        if (!srv)
                srv = avoided;
  out:
        HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
        return srv;
 }
 
diff --git a/src/proxy.c b/src/proxy.c
index 9b786b33a3d5c14bf676a5852361ad7c23352c35..67807d7e70231491193b985542c07a3e6b56787d 100644
@@ -1478,6 +1478,8 @@ void init_new_proxy(struct proxy *p)
        LIST_INIT(&p->filter_configs);
        LIST_INIT(&p->tcpcheck_rules.preset_vars);
 
+       MT_LIST_INIT(&p->lbprm.lb_free_list);
+
        p->defsrv.id = "default-server";
        p->conf.used_listener_id = EB_ROOT;
        p->conf.used_server_id   = EB_ROOT;