]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
BUG/MAJOR: dns: attempt to lock globaly for msg waiter list instead of use barrier
authorEmeric Brun <ebrun@haproxy.com>
Wed, 20 Oct 2021 08:49:53 +0000 (10:49 +0200)
committerWilly Tarreau <w@1wt.eu>
Wed, 20 Oct 2021 15:52:07 +0000 (17:52 +0200)
The barrier is insufficient here to protect the waiters list as we can
definitely catch situations where ds->waiter shows an inconsistency
whereby the element is not attached when entering the "if" block and
is already attached when attaching it later.

This patch uses a larger lock to maintain consistency. Without it the
code would crash in 30-180 minutes under heavy stress, always showing
the same problem (ds->waiter->n->p != &ds->waiter). Now it seems to
always resist, suggesting that this was indeed the problem.

This will have to be backported to 2.4.

src/dns.c

index 433b554ad72e391d9843aafc23dff712d31ad7d0..63bb52f25f1d03a82d42727f4dd0f2f6102419b9 100644 (file)
--- a/src/dns.c
+++ b/src/dns.c
@@ -175,18 +175,9 @@ ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size)
 
                        ds->rx_msg.len = 0;
 
-                       /* This barrier is here to ensure that all data is
-                        * stored if the appctx detect the elem is out of the list */
-                       __ha_barrier_store();
-
                        LIST_DEL_INIT(&ds->waiter);
 
                        if (ds->appctx) {
-                               /* This second barrier is here to ensure that
-                                * the waked up appctx won't miss that the
-                                * elem is removed from the list */
-                               __ha_barrier_store();
-
                                /* awake appctx because it may have other
                                 * message to receive
                                 */
@@ -625,7 +616,10 @@ read:
         * Note: we need a load barrier here to not miss the
         * delete from the list
         */
-       __ha_barrier_load();
+
+       /* lock the dns_stream_server containing lists heads */
+       HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock);
+
        if (!LIST_INLIST(&ds->waiter)) {
                while (1) {
                        uint16_t query_id;
@@ -701,18 +695,12 @@ read:
                        pool_free(dns_query_pool, query);
                        ds->onfly_queries--;
 
-                       /* lock the dns_stream_server containing lists heads */
-                       HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock);
-
                        /* the dns_session is also added in queue of the
                         * wait_sess list where the task processing
                         * response will pop available responses
                         */
                        LIST_APPEND(&ds->dss->wait_sess, &ds->waiter);
 
-                       /* lock the dns_stream_server containing lists heads */
-                       HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
-
                        /* awake the task processing the responses */
                        task_wakeup(ds->dss->task_rsp, TASK_WOKEN_INIT);
 
@@ -722,13 +710,14 @@ read:
                if (!LIST_INLIST(&ds->waiter)) {
                        /* there is no more pending data to read and the con was closed by the server side */
                        if (!co_data(si_oc(si)) && (si_oc(si)->flags & CF_SHUTW)) {
+                               HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
                                goto close;
                        }
                }
 
        }
 
-
+       HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
        return;
 close:
        si_shutw(si);