]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
Fix the thread safety in the dns_dispatch unit
authorOndřej Surý <ondrej@isc.org>
Wed, 30 Nov 2022 16:58:35 +0000 (17:58 +0100)
committerOndřej Surý <ondrej@isc.org>
Wed, 21 Dec 2022 12:41:15 +0000 (12:41 +0000)
The dispatches are not thread-bound, and used freely between various
threads (see the dns_resolver and dns_request units for details).

This refactoring make sure that all non-const dns_dispatch_t and
dns_dispentry_t members are accessed under a lock, and both object now
track their internal state (NONE, CONNECTING, CONNECTED, CANCELED)
instead of guessing the state from the state of various struct members.

During the refactoring, the artificial limit DNS_DISPATCH_SOCKSQUOTA on
UDP sockets per dispatch was removed as the limiting needs to happen and
happens on in dns_resolver and limiting the number of UDP sockets
artificially in dispatch could lead to unpredictable behaviour in case
one dispatch has the limit exhausted by others are idle.

The TCP artificial limit of DNS_DISPATCH_MAXREQUESTS makes even less
sense as the TCP connections are only reused in the dns_request API
that's not a heavy user of the outgoing connections.

As a side note, the fact that UDP and TCP dispatch pretends to be same
thing, but in fact the connected UDP is handled from dns_dispentry_t and
dns_dispatch_t acts as a broker, but connected TCP is handled from
dns_dispatch_t and dns_dispatchmgr_t acts as a broker doesn't really
help the clarity of this unit.

This refactoring kept to API almost same - only dns_dispatch_cancel()
and dns_dispatch_done() were merged into dns_dispatch_done() as we need
to cancel active netmgr handles in any case to not leave dangling
connections around.  The functions handling UDP and TCP have been mostly
split to their matching counterparts and the dns_dispatch_<function>
functions are now thing wrappers that call <udp|tcp>_dispatch_<function>
based on the socket type.

More debugging-level logging was added to the unit to accomodate for
this fact.

(cherry picked from commit 6f317f27ea57a96cfb67d7b02fcb8aa21be37371)

lib/dns/dispatch.c
lib/dns/include/dns/dispatch.h
lib/dns/request.c
lib/dns/resolver.c

index 115cc784763cc18dc1c94bdb21959474fb3c532d..faa52fdc2b9ea5eab6a5890230bb3f1b34f30904 100644 (file)
@@ -70,11 +70,19 @@ struct dns_dispatchmgr {
        unsigned int nv6ports; /*%< # of available ports for IPv4 */
 };
 
+typedef enum {
+       DNS_DISPATCHSTATE_NONE = 0UL,
+       DNS_DISPATCHSTATE_CONNECTING,
+       DNS_DISPATCHSTATE_CONNECTED,
+       DNS_DISPATCHSTATE_CANCELED,
+} dns_dispatchstate_t;
+
 struct dns_dispentry {
        unsigned int magic;
        isc_refcount_t references;
        dns_dispatch_t *disp;
        isc_nmhandle_t *handle; /*%< netmgr handle for UDP connection */
+       dns_dispatchstate_t state;
        unsigned int bucket;
        unsigned int retries;
        unsigned int timeout;
@@ -87,26 +95,14 @@ struct dns_dispentry {
        dispatch_cb_t sent;
        dispatch_cb_t response;
        void *arg;
-       bool canceled;
+       bool reading;
+       isc_result_t result;
        ISC_LINK(dns_dispentry_t) link;
        ISC_LINK(dns_dispentry_t) alink;
        ISC_LINK(dns_dispentry_t) plink;
        ISC_LINK(dns_dispentry_t) rlink;
 };
 
-/*%
- * Fixed UDP buffer size.
- */
-#ifndef DNS_DISPATCH_UDPBUFSIZE
-#define DNS_DISPATCH_UDPBUFSIZE 4096
-#endif /* ifndef DNS_DISPATCH_UDPBUFSIZE */
-
-typedef enum {
-       DNS_DISPATCHSTATE_NONE = 0UL,
-       DNS_DISPATCHSTATE_CONNECTING,
-       DNS_DISPATCHSTATE_CONNECTED
-} dns_dispatchstate_t;
-
 struct dns_dispatch {
        /* Unlocked. */
        unsigned int magic;     /*%< magic */
@@ -122,17 +118,15 @@ struct dns_dispatch {
        /* Locked by "lock". */
        isc_mutex_t lock; /*%< locks all below */
        isc_socktype_t socktype;
-       atomic_uint_fast32_t tcpstate;
-       atomic_bool tcpreading;
+       dns_dispatchstate_t state;
        isc_refcount_t references;
-       unsigned int shutdown_out : 1;
+
+       bool reading;
 
        dns_displist_t pending;
        dns_displist_t active;
-       unsigned int nsockets;
 
-       unsigned int requests;   /*%< how many requests we have */
-       unsigned int tcpbuffers; /*%< allocated buffers */
+       unsigned int requests; /*%< how many requests we have */
 
        unsigned int timedout;
 };
@@ -152,26 +146,6 @@ struct dns_dispatch {
 #define DNS_DISPATCHMGR_MAGIC ISC_MAGIC('D', 'M', 'g', 'r')
 #define VALID_DISPATCHMGR(e)  ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)
 
-/*%
- * Quota to control the number of UDP dispatch sockets.  If a dispatch has
- * more than the quota of sockets, new queries will purge oldest ones, so
- * that a massive number of outstanding queries won't prevent subsequent
- * queries (especially if the older ones take longer time and result in
- * timeout).
- */
-#ifndef DNS_DISPATCH_SOCKSQUOTA
-#define DNS_DISPATCH_SOCKSQUOTA 3072
-#endif /* ifndef DNS_DISPATCH_SOCKSQUOTA */
-
-/*%
- * Quota to control the number of concurrent requests that can be handled
- * by each TCP dispatch. (UDP dispatches do not currently support socket
- * sharing.)
- */
-#ifndef DNS_DISPATCH_MAXREQUESTS
-#define DNS_DISPATCH_MAXREQUESTS 32768
-#endif /* ifndef DNS_DISPATCH_MAXREQUESTS */
-
 /*%
  * Number of buckets in the QID hash table, and the value to
  * increment the QID by when attempting to avoid collisions.
@@ -185,6 +159,20 @@ struct dns_dispatch {
 #define DNS_QID_INCREMENT 16433
 #endif /* ifndef DNS_QID_INCREMENT */
 
+#if DNS_DISPATCH_TRACE
+#define dns_dispentry_ref(ptr) \
+       dns_dispentry__ref(ptr, __func__, __FILE__, __LINE__)
+#define dns_dispentry_unref(ptr) \
+       dns_dispentry__unref(ptr, __func__, __FILE__, __LINE__)
+#define dns_dispentry_attach(ptr, ptrp) \
+       dns_dispentry__attach(ptr, ptrp, __func__, __FILE__, __LINE__)
+#define dns_dispentry_detach(ptrp) \
+       dns_dispentry__detach(ptrp, __func__, __FILE__, __LINE__)
+ISC_REFCOUNT_TRACE_DECL(dns_dispentry);
+#else
+ISC_REFCOUNT_DECL(dns_dispentry);
+#endif
+
 /*
  * Statics.
  */
@@ -200,10 +188,13 @@ udp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
 static void
 tcp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
         void *arg);
+static void
+tcp_recv_done(dns_dispentry_t *resp, isc_result_t eresult,
+             isc_region_t *region);
 static uint32_t
 dns_hash(dns_qid_t *, const isc_sockaddr_t *, dns_messageid_t, in_port_t);
 static void
-dispatch_free(dns_dispatch_t **dispp);
+dispentry_cancel(dns_dispentry_t *resp, isc_result_t result);
 static isc_result_t
 dispatch_createudp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
                   dns_dispatch_t **dispp);
@@ -212,12 +203,48 @@ qid_allocate(dns_dispatchmgr_t *mgr, dns_qid_t **qidp);
 static void
 qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
 static void
-startrecv(isc_nmhandle_t *handle, dns_dispatch_t *disp, dns_dispentry_t *resp);
-void
-dispatch_getnext(dns_dispatch_t *disp, dns_dispentry_t *resp, int32_t timeout);
+udp_startrecv(isc_nmhandle_t *handle, dns_dispentry_t *resp);
+static void
+tcp_startrecv(isc_nmhandle_t *handle, dns_dispatch_t *disp,
+             dns_dispentry_t *resp);
+static void
+tcp_dispatch_getnext(dns_dispatch_t *disp, dns_dispentry_t *resp,
+                    int32_t timeout);
+static void
+udp_dispatch_getnext(dns_dispentry_t *resp, int32_t timeout);
 
 #define LVL(x) ISC_LOG_DEBUG(x)
 
+static const char *
+socktype2str(dns_dispentry_t *resp) {
+       dns_dispatch_t *disp = resp->disp;
+
+       switch (disp->socktype) {
+       case isc_socktype_udp:
+               return ("UDP");
+       case isc_socktype_tcp:
+               return ("TCP");
+       default:
+               return ("<unexpected>");
+       }
+}
+
+static const char *
+state2str(dns_dispatchstate_t state) {
+       switch (state) {
+       case DNS_DISPATCHSTATE_NONE:
+               return ("none");
+       case DNS_DISPATCHSTATE_CONNECTING:
+               return ("connecting");
+       case DNS_DISPATCHSTATE_CONNECTED:
+               return ("connected");
+       case DNS_DISPATCHSTATE_CANCELED:
+               return ("canceled");
+       default:
+               return ("<unexpected>");
+       }
+}
+
 static void
 mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
        ISC_FORMAT_PRINTF(3, 4);
@@ -262,13 +289,20 @@ static void
 dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
        char msgbuf[2048];
        va_list ap;
+       int r;
 
        if (!isc_log_wouldlog(dns_lctx, level)) {
                return;
        }
 
        va_start(ap, fmt);
-       vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
+       r = vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
+       if (r < 0) {
+               msgbuf[0] = '\0';
+       } else if ((unsigned int)r >= sizeof(msgbuf)) {
+               /* Truncated */
+               msgbuf[sizeof(msgbuf) - 1] = '\0';
+       }
        va_end(ap);
 
        isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
@@ -276,6 +310,34 @@ dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
                      msgbuf);
 }
 
+static void
+dispentry_log(dns_dispentry_t *resp, int level, const char *fmt, ...)
+       ISC_FORMAT_PRINTF(3, 4);
+
+static void
+dispentry_log(dns_dispentry_t *resp, int level, const char *fmt, ...) {
+       char msgbuf[2048];
+       va_list ap;
+       int r;
+
+       if (!isc_log_wouldlog(dns_lctx, level)) {
+               return;
+       }
+
+       va_start(ap, fmt);
+       r = vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
+       if (r < 0) {
+               msgbuf[0] = '\0';
+       } else if ((unsigned int)r >= sizeof(msgbuf)) {
+               /* Truncated */
+               msgbuf[sizeof(msgbuf) - 1] = '\0';
+       }
+       va_end(ap);
+
+       dispatch_log(resp->disp, level, "%s response %p: %s",
+                    socktype2str(resp), resp, msgbuf);
+}
+
 /*
  * Return a hash of the destination and message id.
  */
@@ -320,8 +382,6 @@ setup_socket(dns_dispatch_t *disp, dns_dispentry_t *resp,
                return (ISC_R_ADDRNOTAVAIL);
        }
 
-       disp->nsockets++;
-
        resp->local = disp->local;
        resp->peer = *dest;
 
@@ -334,26 +394,6 @@ setup_socket(dns_dispatch_t *disp, dns_dispentry_t *resp,
        return (ISC_R_SUCCESS);
 }
 
-/*%
- * Deactivate the socket for a dispatch entry.
- * The dispatch must be locked.
- */
-static void
-deactivate_dispentry(dns_dispatch_t *disp, dns_dispentry_t *resp) {
-       if (ISC_LINK_LINKED(resp, alink)) {
-               ISC_LIST_UNLINK(disp->active, resp, alink);
-       }
-
-       if (resp->handle != NULL) {
-               INSIST(disp->socktype == isc_socktype_udp);
-
-               isc_nm_cancelread(resp->handle);
-               isc_nmhandle_detach(&resp->handle);
-       }
-
-       disp->nsockets--;
-}
-
 /*
  * Find an entry for query ID 'id', socket address 'dest', and port number
  * 'port'.
@@ -382,54 +422,47 @@ entry_search(dns_qid_t *qid, const isc_sockaddr_t *dest, dns_messageid_t id,
 }
 
 static void
-dispentry_attach(dns_dispentry_t *resp, dns_dispentry_t **respp) {
-       REQUIRE(VALID_RESPONSE(resp));
-       REQUIRE(respp != NULL && *respp == NULL);
+dispentry_destroy(dns_dispentry_t *resp) {
+       dns_dispatch_t *disp = resp->disp;
 
-       isc_refcount_increment(&resp->references);
+       /*
+        * We need to call this from here in case there's an external event that
+        * shuts down our dispatch (like ISC_R_SHUTTINGDOWN).
+        */
+       dispentry_cancel(resp, ISC_R_CANCELED);
 
-       *respp = resp;
-}
+       LOCK(&disp->lock);
+       INSIST(disp->requests > 0);
+       disp->requests--;
+       UNLOCK(&disp->lock);
 
-static void
-dispentry_destroy(dns_dispentry_t *resp) {
-       dns_dispatch_t *disp = resp->disp;
+       isc_refcount_destroy(&resp->references);
 
        resp->magic = 0;
 
-       if (ISC_LINK_LINKED(resp, plink)) {
-               ISC_LIST_UNLINK(disp->pending, resp, plink);
-       }
-
+       INSIST(!ISC_LINK_LINKED(resp, link));
+       INSIST(!ISC_LINK_LINKED(resp, plink));
        INSIST(!ISC_LINK_LINKED(resp, alink));
        INSIST(!ISC_LINK_LINKED(resp, rlink));
 
+       dispentry_log(resp, LVL(90), "destroying");
+
        if (resp->handle != NULL) {
+               dispentry_log(resp, LVL(90), "detaching handle %p from %p",
+                             resp->handle, &resp->handle);
                isc_nmhandle_detach(&resp->handle);
        }
 
-       isc_refcount_destroy(&resp->references);
-
        isc_mem_put(disp->mgr->mctx, resp, sizeof(*resp));
 
-       dns_dispatch_detach(&disp);
+       dns_dispatch_detach(&disp); /* DISPATCH001 */
 }
 
-static void
-dispentry_detach(dns_dispentry_t **respp) {
-       dns_dispentry_t *resp = NULL;
-       uint_fast32_t ref;
-
-       REQUIRE(respp != NULL && VALID_RESPONSE(*respp));
-
-       resp = *respp;
-       *respp = NULL;
-
-       ref = isc_refcount_decrement(&resp->references);
-       if (ref == 1) {
-               dispentry_destroy(resp);
-       }
-}
+#if DNS_DISPATCH_TRACE
+ISC_REFCOUNT_TRACE_IMPL(dns_dispentry, dispentry_destroy);
+#else
+ISC_REFCOUNT_IMPL(dns_dispentry, dispentry_destroy);
+#endif
 
 /*
  * How long in milliseconds has it been since this dispentry
@@ -473,7 +506,7 @@ udp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
        unsigned int flags;
        isc_sockaddr_t peer;
        isc_netaddr_t netaddr;
-       int match, timeout;
+       int match, timeout = 0;
        dispatch_cb_t response = NULL;
 
        REQUIRE(VALID_RESPONSE(resp));
@@ -482,20 +515,24 @@ udp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
        disp = resp->disp;
 
        LOCK(&disp->lock);
+       INSIST(resp->reading);
+       resp->reading = false;
 
-       dispatch_log(disp, LVL(90), "UDP response %p:%s:requests %d", resp,
-                    isc_result_totext(eresult), disp->requests);
+       response = resp->response;
 
-       /*
-        * The resp may have been deactivated by shutdown; if
-        * so, we can skip the response callback.
-        */
-       if (ISC_LINK_LINKED(resp, alink)) {
-               response = resp->response;
-       } else if (eresult == ISC_R_SUCCESS) {
+       if (resp->state == DNS_DISPATCHSTATE_CANCELED) {
+               /*
+                * Nobody is interested in the callback if the response
+                * has been canceled already.  Detach from the response
+                * and the handle.
+                */
+               response = NULL;
                eresult = ISC_R_CANCELED;
        }
 
+       dispentry_log(resp, LVL(90), "read callback:%s, requests %d",
+                     isc_result_totext(eresult), disp->requests);
+
        if (eresult != ISC_R_SUCCESS) {
                /*
                 * This is most likely a network error on a connected
@@ -521,8 +558,8 @@ udp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
                        char netaddrstr[ISC_NETADDR_FORMATSIZE];
                        isc_netaddr_format(&netaddr, netaddrstr,
                                           sizeof(netaddrstr));
-                       dispatch_log(disp, LVL(10), "blackholed packet from %s",
-                                    netaddrstr);
+                       dispentry_log(resp, LVL(10),
+                                     "blackholed packet from %s", netaddrstr);
                }
                goto next;
        }
@@ -535,13 +572,16 @@ udp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
        isc_buffer_add(&source, region->length);
        dres = dns_message_peekheader(&source, &id, &flags);
        if (dres != ISC_R_SUCCESS) {
-               dispatch_log(disp, LVL(10), "got garbage packet");
+               char netaddrstr[ISC_NETADDR_FORMATSIZE];
+               isc_netaddr_format(&netaddr, netaddrstr, sizeof(netaddrstr));
+               dispentry_log(resp, LVL(10), "got garbage packet from %s",
+                             netaddrstr);
                goto next;
        }
 
-       dispatch_log(disp, LVL(92),
-                    "got valid DNS message header, /QR %c, id %u",
-                    (((flags & DNS_MESSAGEFLAG_QR) != 0) ? '1' : '0'), id);
+       dispentry_log(resp, LVL(92),
+                     "got valid DNS message header, /QR %c, id %u",
+                     (((flags & DNS_MESSAGEFLAG_QR) != 0) ? '1' : '0'), id);
 
        /*
         * Look at the message flags.  If it's a query, ignore it.
@@ -554,7 +594,7 @@ udp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
         * The QID and the address must match the expected ones.
         */
        if (resp->id != id || !isc_sockaddr_equal(&peer, &resp->peer)) {
-               dispatch_log(disp, LVL(90), "response doesn't match");
+               dispentry_log(resp, LVL(90), "response doesn't match");
                inc_stats(disp->mgr, dns_resstatscounter_mismatch);
                goto next;
        }
@@ -585,26 +625,25 @@ next:
         * proper response to arrive until the original timeout fires.
         */
        response = NULL;
-       dispatch_getnext(disp, resp, resp->timeout - dispentry_runtime(resp));
+       udp_dispatch_getnext(resp, timeout);
 
 done:
        UNLOCK(&disp->lock);
 
        if (response != NULL) {
+               dispentry_log(resp, LVL(90), "UDP read callback on %p: %s",
+                             handle, isc_result_totext(eresult));
                response(eresult, region, resp->arg);
        }
 
-       dispentry_detach(&resp);
+       dns_dispentry_detach(&resp); /* DISPENTRY003 */
 }
 
 static isc_result_t
-tcp_recv_timeout(dns_dispatch_t *disp, dns_dispentry_t **respp) {
-       dns_dispentry_t *resp = ISC_LIST_HEAD(disp->active);
+tcp_recv_oldest(dns_dispatch_t *disp, dns_dispentry_t **respp) {
+       dns_dispentry_t *resp = NULL;
+       resp = ISC_LIST_HEAD(disp->active);
        if (resp != NULL) {
-               dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
-               ISC_LIST_UNLINK(disp->active, resp, alink);
-               ISC_LIST_APPEND(disp->active, resp, alink);
-
                disp->timedout++;
 
                *respp = resp;
@@ -624,7 +663,7 @@ tcp_recv_success(dns_dispatch_t *disp, isc_region_t *region, dns_qid_t *qid,
        isc_result_t result = ISC_R_SUCCESS;
        dns_dispentry_t *resp = NULL;
 
-       dispatch_log(disp, LVL(90), "success, length == %d, addr = %p",
+       dispatch_log(disp, LVL(90), "TCP read success, length == %d, addr = %p",
                     region->length, region->base);
 
        /*
@@ -659,30 +698,15 @@ tcp_recv_success(dns_dispatch_t *disp, isc_region_t *region, dns_qid_t *qid,
        LOCK(&qid->lock);
        resp = entry_search(qid, peer, id, disp->localport, bucket);
        if (resp != NULL) {
-               dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
-               *respp = resp;
-       } else if (disp->timedout > 0) {
-               /* There was active query that timed-out before */
-               disp->timedout--;
-
-               resp = ISC_LIST_HEAD(disp->active);
-               if (resp != NULL) {
-                       /*
-                        * It's a DNS response, but didn't match any outstanding
-                        * queries. It's possible we would have timed out by
-                        * now, but non-matching responses prevented it, so we
-                        * check the age of the oldest active resp.
-                        */
-                       int timeout = resp->timeout - dispentry_runtime(resp);
-                       if (timeout <= 0) {
-                               result = tcp_recv_timeout(disp, respp);
-                       }
+               if (resp->reading) {
+                       *respp = resp;
                } else {
-                       result = ISC_R_NOTFOUND;
+                       /* We already got our DNS message. */
+                       result = ISC_R_UNEXPECTED;
                }
        } else {
                /* We are not expecting this DNS message */
-               result = ISC_R_UNEXPECTED;
+               result = ISC_R_NOTFOUND;
        }
        dispatch_log(disp, LVL(90), "search for response in bucket %d: %s",
                     bucket, isc_result_totext(result));
@@ -692,7 +716,19 @@ tcp_recv_success(dns_dispatch_t *disp, isc_region_t *region, dns_qid_t *qid,
 }
 
 static void
-tcp_recv_shutdown(dns_dispatch_t *disp, dns_displist_t *resps) {
+tcp_recv_add(dns_displist_t *resps, dns_dispentry_t *resp,
+            isc_result_t result) {
+       dns_dispentry_ref(resp); /* DISPENTRY009 */
+       ISC_LIST_UNLINK(resp->disp->active, resp, alink);
+       ISC_LIST_APPEND(*resps, resp, rlink);
+       INSIST(resp->reading);
+       resp->reading = false;
+       resp->result = result;
+}
+
+static void
+tcp_recv_shutdown(dns_dispatch_t *disp, dns_displist_t *resps,
+                 isc_result_t result) {
        dns_dispentry_t *resp = NULL, *next = NULL;
 
        /*
@@ -700,29 +736,28 @@ tcp_recv_shutdown(dns_dispatch_t *disp, dns_displist_t *resps) {
         */
        for (resp = ISC_LIST_HEAD(disp->active); resp != NULL; resp = next) {
                next = ISC_LIST_NEXT(resp, alink);
-               dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
-               ISC_LIST_UNLINK(disp->active, resp, alink);
-               ISC_LIST_APPEND(*resps, resp, rlink);
+               tcp_recv_add(resps, resp, result);
        }
 }
 
 static void
 tcp_recv_done(dns_dispentry_t *resp, isc_result_t eresult,
              isc_region_t *region) {
+       dispentry_log(resp, LVL(90), "read callback: %s",
+                     isc_result_totext(eresult));
+
        resp->response(eresult, region, resp->arg);
-       dispentry_detach(&resp);
+       dns_dispentry_detach(&resp); /* DISPENTRY009 */
 }
 
 static void
-tcp_recv_cancelall(dns_displist_t *resps, isc_region_t *region,
-                  isc_result_t result) {
+tcp_recv_processall(dns_displist_t *resps, isc_region_t *region) {
        dns_dispentry_t *resp = NULL, *next = NULL;
 
        for (resp = ISC_LIST_HEAD(*resps); resp != NULL; resp = next) {
                next = ISC_LIST_NEXT(resp, rlink);
                ISC_LIST_UNLINK(*resps, resp, rlink);
-               resp->response(result, region, resp->arg);
-               dispentry_detach(&resp);
+               tcp_recv_done(resp, resp->result, region);
        }
 }
 
@@ -732,8 +767,6 @@ tcp_recv_cancelall(dns_displist_t *resps, isc_region_t *region,
  * If I/O result == CANCELED, EOF, or error, notify everyone as the
  * various queues drain.
  *
- * If query, restart.
- *
  * If response:
  *     Allocate event, fill in details.
  *             If cannot allocate, restart.
@@ -749,94 +782,116 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
        dns_qid_t *qid = NULL;
        char buf[ISC_SOCKADDR_FORMATSIZE];
        isc_sockaddr_t peer;
-       dns_displist_t resps;
+       dns_displist_t resps = ISC_LIST_INITIALIZER;
 
        REQUIRE(VALID_DISPATCH(disp));
 
-       atomic_store(&disp->tcpreading, false);
-
        qid = disp->mgr->qid;
 
-       ISC_LIST_INIT(resps);
-
        LOCK(&disp->lock);
+       INSIST(disp->reading);
+       disp->reading = false;
 
-       dispatch_log(disp, LVL(90), "TCP read:%s:requests %d, buffers %d",
-                    isc_result_totext(result), disp->requests,
-                    disp->tcpbuffers);
+       dispatch_log(disp, LVL(90), "TCP read:%s:requests %u",
+                    isc_result_totext(result), disp->requests);
 
        peer = isc_nmhandle_peeraddr(handle);
 
+       /*
+        * Phase 1: Process timeout and success.
+        */
        switch (result) {
-       case ISC_R_SHUTTINGDOWN:
-       case ISC_R_CANCELED:
-       case ISC_R_EOF:
-       case ISC_R_CONNECTIONRESET:
-               isc_sockaddr_format(&peer, buf, sizeof(buf));
-               dispatch_log(disp, LVL(90), "shutting down TCP: %s: %s", buf,
-                            isc_result_totext(result));
-               tcp_recv_shutdown(disp, &resps);
-               break;
-
        case ISC_R_TIMEDOUT:
                /*
-                * Time out the oldest response in the active queue,
-                * and move it to the end. (We don't remove it from the
-                * active queue immediately, though, because the callback
-                * might decide to keep waiting and leave it active.)
+                * Time out the oldest response in the active queue.
                 */
-               result = tcp_recv_timeout(disp, &resp);
+               result = tcp_recv_oldest(disp, &resp);
                break;
-
        case ISC_R_SUCCESS:
                /* We got an answer */
                result = tcp_recv_success(disp, region, qid, &peer, &resp);
-               if (result != ISC_R_UNEXPECTED) {
-                       /*
-                        * It's a valid DNS response, which may or may not
-                        * have matched an outstanding query.
-                        */
+               break;
 
-                       dispatch_getnext(disp, NULL, -1);
-                       break;
+       default:
+               break;
+       }
+
+       if (resp != NULL) {
+               tcp_recv_add(&resps, resp, result);
+       }
+
+       /*
+        * Phase 2: Look if we timed out before.
+        */
+
+       if (result == ISC_R_NOTFOUND) {
+               if (disp->timedout > 0) {
+                       /* There was active query that timed-out before */
+                       disp->timedout--;
+               } else {
+                       result = ISC_R_UNEXPECTED;
+               }
+       }
+
+       /*
+        * Phase 3: Trigger timeouts.  It's possible that the responses would
+        * have been timedout out already, but non-matching TCP reads have
+        * prevented this.
+        */
+       dns_dispentry_t *next = NULL;
+       for (resp = ISC_LIST_HEAD(disp->active); resp != NULL; resp = next) {
+               next = ISC_LIST_NEXT(resp, alink);
+
+               /* FIXME: dispentry_runtime is always 0 for TCP */
+               int timeout = resp->timeout - dispentry_runtime(resp);
+               if (timeout <= 0) {
+                       tcp_recv_add(&resps, resp, ISC_R_TIMEDOUT);
                }
+       }
+
+       /*
+        * Phase 4: log if we errored out.
+        */
+       switch (result) {
+       case ISC_R_SUCCESS:
+       case ISC_R_TIMEDOUT:
+       case ISC_R_NOTFOUND:
+               break;
 
-               /* Got an invalid DNS response, terminate the connection */
-               FALLTHROUGH;
+       case ISC_R_SHUTTINGDOWN:
+       case ISC_R_CANCELED:
+       case ISC_R_EOF:
+       case ISC_R_CONNECTIONRESET:
+               isc_sockaddr_format(&peer, buf, sizeof(buf));
+               dispatch_log(disp, LVL(90), "shutting down TCP: %s: %s", buf,
+                            isc_result_totext(result));
+               tcp_recv_shutdown(disp, &resps, result);
+               break;
        default:
                isc_sockaddr_format(&peer, buf, sizeof(buf));
                dispatch_log(disp, ISC_LOG_ERROR,
                             "shutting down due to TCP "
                             "receive error: %s: %s",
                             buf, isc_result_totext(result));
-               tcp_recv_shutdown(disp, &resps);
+               tcp_recv_shutdown(disp, &resps, result);
                break;
        }
 
+       /*
+        * Phase 5: Resume reading if there are still active responses
+        */
+       if (!ISC_LIST_EMPTY(disp->active)) {
+               tcp_startrecv(NULL, disp, ISC_LIST_HEAD(disp->active));
+       }
+
        UNLOCK(&disp->lock);
 
-       switch (result) {
-       case ISC_R_SUCCESS:
-       case ISC_R_TIMEDOUT:
-               /*
-                * Either we found a matching response, or we timed out
-                * and canceled the oldest resp.
-                */
-               INSIST(resp != NULL);
-               tcp_recv_done(resp, result, region);
-               break;
-       case ISC_R_NOTFOUND:
-               /*
-                * Either we got a response that didn't match any active
-                * resps, or we timed out but there *were* no active resps.
-                */
-               break;
-       default:
-               /* We're being shut down; cancel all outstanding resps. */
-               tcp_recv_cancelall(&resps, region, result);
-       }
+       /*
+        * Phase 6: Process all scheduled callbacks.
+        */
+       tcp_recv_processall(&resps, region);
 
-       dns_dispatch_detach(&disp);
+       dns_dispatch_detach(&disp); /* DISPATCH002 */
 }
 
 /*%
@@ -919,6 +974,10 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_nm_t *nm,
        mgr = isc_mem_get(mctx, sizeof(dns_dispatchmgr_t));
        *mgr = (dns_dispatchmgr_t){ .magic = 0 };
 
+#if DNS_DISPATCH_TRACE
+       fprintf(stderr, "dns_dispatchmgr__init:%s:%s:%d:%p->references = 1\n",
+               __func__, __FILE__, __LINE__, mgr);
+#endif
        isc_refcount_init(&mgr->references, 1);
 
        isc_mem_attach(mctx, &mgr->mctx);
@@ -943,31 +1002,11 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_nm_t *nm,
        return (ISC_R_SUCCESS);
 }
 
-void
-dns_dispatchmgr_attach(dns_dispatchmgr_t *mgr, dns_dispatchmgr_t **mgrp) {
-       REQUIRE(VALID_DISPATCHMGR(mgr));
-       REQUIRE(mgrp != NULL && *mgrp == NULL);
-
-       isc_refcount_increment(&mgr->references);
-
-       *mgrp = mgr;
-}
-
-void
-dns_dispatchmgr_detach(dns_dispatchmgr_t **mgrp) {
-       dns_dispatchmgr_t *mgr = NULL;
-       uint_fast32_t ref;
-
-       REQUIRE(mgrp != NULL && VALID_DISPATCHMGR(*mgrp));
-
-       mgr = *mgrp;
-       *mgrp = NULL;
-
-       ref = isc_refcount_decrement(&mgr->references);
-       if (ref == 1) {
-               dispatchmgr_destroy(mgr);
-       }
-}
+#if DNS_DISPATCH_TRACE
+ISC_REFCOUNT_TRACE_IMPL(dns_dispatchmgr, dispatchmgr_destroy);
+#else
+ISC_REFCOUNT_IMPL(dns_dispatchmgr, dispatchmgr_destroy);
+#endif
 
 void
 dns_dispatchmgr_setblackhole(dns_dispatchmgr_t *mgr, dns_acl_t *blackhole) {
@@ -1089,46 +1128,25 @@ dispatch_allocate(dns_dispatchmgr_t *mgr, isc_socktype_t type,
         */
 
        disp = isc_mem_get(mgr->mctx, sizeof(*disp));
-       *disp = (dns_dispatch_t){ .socktype = type };
+       *disp = (dns_dispatch_t){
+               .socktype = type,
+               .link = ISC_LINK_INITIALIZER,
+               .active = ISC_LIST_INITIALIZER,
+               .pending = ISC_LIST_INITIALIZER,
+               .magic = DISPATCH_MAGIC,
+       };
 
        dns_dispatchmgr_attach(mgr, &disp->mgr);
-       isc_refcount_init(&disp->references, 1);
-       ISC_LINK_INIT(disp, link);
-       ISC_LIST_INIT(disp->active);
-       ISC_LIST_INIT(disp->pending);
-
+#if DNS_DISPATCH_TRACE
+       fprintf(stderr, "dns_dispatch__init:%s:%s:%d:%p->references = 1\n",
+               __func__, __FILE__, __LINE__, disp);
+#endif
+       isc_refcount_init(&disp->references, 1); /* DISPATCH000 */
        isc_mutex_init(&disp->lock);
 
-       disp->magic = DISPATCH_MAGIC;
-
        *dispp = disp;
 }
 
-/*
- * MUST be unlocked, and not used by anything.
- */
-static void
-dispatch_free(dns_dispatch_t **dispp) {
-       dns_dispatch_t *disp = NULL;
-       dns_dispatchmgr_t *mgr = NULL;
-
-       REQUIRE(VALID_DISPATCH(*dispp));
-       disp = *dispp;
-       *dispp = NULL;
-
-       disp->magic = 0;
-
-       mgr = disp->mgr;
-       REQUIRE(VALID_DISPATCHMGR(mgr));
-
-       INSIST(disp->requests == 0);
-       INSIST(ISC_LIST_EMPTY(disp->active));
-
-       isc_mutex_destroy(&disp->lock);
-
-       isc_mem_put(mgr->mctx, disp, sizeof(*disp));
-}
-
 isc_result_t
 dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
                       const isc_sockaddr_t *destaddr, isc_dscp_t dscp,
@@ -1163,8 +1181,17 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
        ISC_LIST_APPEND(mgr->list, disp, link);
        UNLOCK(&mgr->lock);
 
-       mgr_log(mgr, LVL(90), "dns_dispatch_createtcp: created TCP dispatch %p",
-               disp);
+       if (isc_log_wouldlog(dns_lctx, 90)) {
+               char addrbuf[ISC_SOCKADDR_FORMATSIZE];
+
+               isc_sockaddr_format(localaddr, addrbuf,
+                                   ISC_SOCKADDR_FORMATSIZE);
+
+               mgr_log(mgr, LVL(90),
+                       "dns_dispatch_createtcp: created TCP dispatch %p for "
+                       "%s",
+                       disp, addrbuf);
+       }
        *dispp = disp;
 
        return (ISC_R_SUCCESS);
@@ -1172,15 +1199,13 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
 
 isc_result_t
 dns_dispatch_gettcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *destaddr,
-                   const isc_sockaddr_t *localaddr, bool *connected,
-                   dns_dispatch_t **dispp) {
+                   const isc_sockaddr_t *localaddr, dns_dispatch_t **dispp) {
        dns_dispatch_t *disp_connected = NULL;
        dns_dispatch_t *disp_fallback = NULL;
        isc_result_t result = ISC_R_NOTFOUND;
 
        REQUIRE(VALID_DISPATCHMGR(mgr));
        REQUIRE(destaddr != NULL);
-       REQUIRE(connected != NULL);
        REQUIRE(dispp != NULL && *dispp == NULL);
 
        LOCK(&mgr->lock);
@@ -1212,40 +1237,52 @@ dns_dispatch_gettcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *destaddr,
                    (localaddr == NULL ||
                     isc_sockaddr_eqaddr(localaddr, &sockname)))
                {
-                       if (atomic_load(&disp->tcpstate) ==
-                           DNS_DISPATCHSTATE_CONNECTED)
-                       {
-                               /* We found connected dispatch */
-                               disp_connected = disp;
-                               UNLOCK(&disp->lock);
+                       switch (disp->state) {
+                       case DNS_DISPATCHSTATE_CONNECTED:
+                               /* We found a connected dispatch */
+                               dns_dispatch_attach(disp, &disp_connected);
+                               break;
+                       case DNS_DISPATCHSTATE_NONE:
+                       case DNS_DISPATCHSTATE_CONNECTING:
+                               /* We found "a" dispatch, store it for later */
+                               if (disp_fallback == NULL) {
+                                       dns_dispatch_attach(disp,
+                                                           &disp_fallback);
+                               }
+                               break;
+                       case DNS_DISPATCHSTATE_CANCELED:
+                               /*
+                                * We found a canceled dispatch, help its
+                                * removal from the list, and skip it.
+                                */
+                               ISC_LIST_UNLINK(disp->mgr->list, disp, link);
                                break;
+                       default:
+                               UNREACHABLE();
                        }
-
-                       /* We found "a" dispatch, store it for later */
-                       if (disp_fallback == NULL) {
-                               disp_fallback = disp;
-                       }
-
-                       UNLOCK(&disp->lock);
-                       continue;
                }
 
                UNLOCK(&disp->lock);
+
+               if (disp_connected != NULL) {
+                       break;
+               }
        }
 
        if (disp_connected != NULL) {
                /* We found connected dispatch */
                INSIST(disp_connected->handle != NULL);
 
-               *connected = true;
-               dns_dispatch_attach(disp_connected, dispp);
+               *dispp = disp_connected;
+               disp_connected = NULL;
 
                result = ISC_R_SUCCESS;
-       } else if (disp_fallback != NULL) {
-               /* We found matching dispatch */
-               *connected = false;
 
-               dns_dispatch_attach(disp_fallback, dispp);
+               if (disp_fallback != NULL) {
+                       dns_dispatch_detach(&disp_fallback);
+               }
+       } else if (disp_fallback != NULL) {
+               *dispp = disp_fallback;
 
                result = ISC_R_SUCCESS;
        }
@@ -1282,8 +1319,6 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
        dns_dispatch_t *disp = NULL;
        isc_sockaddr_t sa_any;
 
-       dispatch_allocate(mgr, isc_socktype_udp, &disp);
-
        /*
         * Check whether this address/port is available locally.
         */
@@ -1291,58 +1326,66 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
        if (!isc_sockaddr_eqaddr(&sa_any, localaddr)) {
                result = isc_nm_checkaddr(localaddr, isc_socktype_udp);
                if (result != ISC_R_SUCCESS) {
-                       goto cleanup;
+                       return (result);
                }
        }
 
+       dispatch_allocate(mgr, isc_socktype_udp, &disp);
+
        if (isc_log_wouldlog(dns_lctx, 90)) {
                char addrbuf[ISC_SOCKADDR_FORMATSIZE];
 
                isc_sockaddr_format(localaddr, addrbuf,
                                    ISC_SOCKADDR_FORMATSIZE);
                mgr_log(mgr, LVL(90),
-                       "dispatch_createudp: created UDP dispatch for %s",
-                       addrbuf);
+                       "dispatch_createudp: created UDP dispatch %p for %s",
+                       disp, addrbuf);
        }
 
        disp->local = *localaddr;
 
        /*
-        * Append it to the dispatcher list.
+        * Don't append it to the dispatcher list, we don't care about UDP, only
+        * TCP should be searched
+        *
+        * ISC_LIST_APPEND(mgr->list, disp, link);
         */
-       ISC_LIST_APPEND(mgr->list, disp, link);
-
-       mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
 
        *dispp = disp;
 
        return (result);
-
-       /*
-        * Error returns.
-        */
-cleanup:
-       dispatch_free(&disp);
-
-       return (result);
 }
 
 static void
 dispatch_destroy(dns_dispatch_t *disp) {
        dns_dispatchmgr_t *mgr = disp->mgr;
 
+       isc_refcount_destroy(&disp->references);
+       disp->magic = 0;
+
        LOCK(&mgr->lock);
-       ISC_LIST_UNLINK(mgr->list, disp, link);
+       if (ISC_LINK_LINKED(disp, link)) {
+               ISC_LIST_UNLINK(disp->mgr->list, disp, link);
+       }
        UNLOCK(&mgr->lock);
 
-       dispatch_log(disp, LVL(90), "shutting down; detaching from handle %p",
-                    disp->handle);
+       INSIST(disp->requests == 0);
+       INSIST(ISC_LIST_EMPTY(disp->pending));
+       INSIST(ISC_LIST_EMPTY(disp->active));
+
+       INSIST(!ISC_LINK_LINKED(disp, link));
 
-       if (disp->handle != NULL) {
+       dispatch_log(disp, LVL(90), "destroying dispatch %p", disp);
+
+       if (disp->handle) {
+               dispatch_log(disp, LVL(90), "detaching TCP handle %p from %p",
+                            disp->handle, &disp->handle);
                isc_nmhandle_detach(&disp->handle);
        }
 
-       dispatch_free(&disp);
+       isc_mutex_destroy(&disp->lock);
+
+       isc_mem_put(mgr->mctx, disp, sizeof(*disp));
 
        /*
         * Because dispatch uses mgr->mctx, we must detach after freeing
@@ -1351,106 +1394,73 @@ dispatch_destroy(dns_dispatch_t *disp) {
        dns_dispatchmgr_detach(&mgr);
 }
 
-void
-dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) {
-       REQUIRE(VALID_DISPATCH(disp));
-       REQUIRE(dispp != NULL && *dispp == NULL);
-
-       isc_refcount_increment(&disp->references);
-
-       *dispp = disp;
-}
-
-void
-dns_dispatch_detach(dns_dispatch_t **dispp) {
-       dns_dispatch_t *disp = NULL;
-       uint_fast32_t ref;
-
-       REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
-
-       disp = *dispp;
-       *dispp = NULL;
-
-       ref = isc_refcount_decrement(&disp->references);
-       dispatch_log(disp, LVL(90), "detach: refcount %" PRIuFAST32, ref - 1);
-       if (ref == 1) {
-               LOCK(&disp->lock);
-               INSIST(ISC_LIST_EMPTY(disp->pending));
-               INSIST(ISC_LIST_EMPTY(disp->active));
-               UNLOCK(&disp->lock);
-
-               dispatch_destroy(disp);
-       }
-}
+#if DNS_DISPATCH_TRACE
+ISC_REFCOUNT_TRACE_IMPL(dns_dispatch, dispatch_destroy);
+#else
+ISC_REFCOUNT_IMPL(dns_dispatch, dispatch_destroy);
+#endif
 
 isc_result_t
 dns_dispatch_add(dns_dispatch_t *disp, unsigned int options,
                 unsigned int timeout, const isc_sockaddr_t *dest,
                 dispatch_cb_t connected, dispatch_cb_t sent,
                 dispatch_cb_t response, void *arg, dns_messageid_t *idp,
-                dns_dispentry_t **resp) {
-       dns_dispentry_t *res = NULL;
+                dns_dispentry_t **respp) {
+       dns_dispentry_t *resp = NULL;
        dns_qid_t *qid = NULL;
        in_port_t localport = 0;
        dns_messageid_t id;
        unsigned int bucket;
        bool ok = false;
        int i = 0;
-       dispatch_cb_t oldest_response = NULL;
 
        REQUIRE(VALID_DISPATCH(disp));
        REQUIRE(dest != NULL);
-       REQUIRE(resp != NULL && *resp == NULL);
+       REQUIRE(respp != NULL && *respp == NULL);
        REQUIRE(idp != NULL);
        REQUIRE(disp->socktype == isc_socktype_tcp ||
                disp->socktype == isc_socktype_udp);
+       REQUIRE(connected != NULL);
+       REQUIRE(response != NULL);
+       REQUIRE(sent != NULL);
 
        LOCK(&disp->lock);
 
-       if (disp->requests >= DNS_DISPATCH_MAXREQUESTS) {
+       if (disp->state == DNS_DISPATCHSTATE_CANCELED) {
                UNLOCK(&disp->lock);
-               return (ISC_R_QUOTA);
+               return (ISC_R_CANCELED);
        }
 
        qid = disp->mgr->qid;
 
-       if (disp->socktype == isc_socktype_udp &&
-           disp->nsockets > DNS_DISPATCH_SOCKSQUOTA)
-       {
-               dns_dispentry_t *oldest = NULL;
-
-               /*
-                * Kill oldest outstanding query if the number of sockets
-                * exceeds the quota to keep the room for new queries.
-                */
-               oldest = ISC_LIST_HEAD(disp->active);
-               if (oldest != NULL) {
-                       oldest_response = oldest->response;
-                       inc_stats(disp->mgr, dns_resstatscounter_dispabort);
-               }
-       }
-
-       res = isc_mem_get(disp->mgr->mctx, sizeof(*res));
-
-       *res = (dns_dispentry_t){ .port = localport,
-                                 .timeout = timeout,
-                                 .peer = *dest,
-                                 .connected = connected,
-                                 .sent = sent,
-                                 .response = response,
-                                 .arg = arg };
-
-       isc_refcount_init(&res->references, 1);
-
-       ISC_LINK_INIT(res, link);
-       ISC_LINK_INIT(res, alink);
-       ISC_LINK_INIT(res, plink);
-       ISC_LINK_INIT(res, rlink);
+       resp = isc_mem_get(disp->mgr->mctx, sizeof(*resp));
+
+       *resp = (dns_dispentry_t){
+               .port = localport,
+               .timeout = timeout,
+               .peer = *dest,
+               .connected = connected,
+               .sent = sent,
+               .response = response,
+               .arg = arg,
+               .link = ISC_LINK_INITIALIZER,
+               .alink = ISC_LINK_INITIALIZER,
+               .plink = ISC_LINK_INITIALIZER,
+               .rlink = ISC_LINK_INITIALIZER,
+               .magic = RESPONSE_MAGIC,
+       };
+
+#if DNS_DISPATCH_TRACE
+       fprintf(stderr, "dns_dispentry__init:%s:%s:%d:%p->references = 1\n",
+               __func__, __FILE__, __LINE__, res);
+#endif
+       isc_refcount_init(&resp->references, 1); /* DISPENTRY000 */
 
        if (disp->socktype == isc_socktype_udp) {
-               isc_result_t result = setup_socket(disp, res, dest, &localport);
+               isc_result_t result = setup_socket(disp, resp, dest,
+                                                  &localport);
                if (result != ISC_R_SUCCESS) {
-                       isc_mem_put(disp->mgr->mctx, res, sizeof(*res));
+                       isc_mem_put(disp->mgr->mctx, resp, sizeof(*resp));
                        UNLOCK(&disp->lock);
                        inc_stats(disp->mgr, dns_resstatscounter_dispsockfail);
                        return (result);
@@ -1477,421 +1487,655 @@ dns_dispatch_add(dns_dispatch_t *disp, unsigned int options,
                        ok = true;
                        break;
                }
+               if ((options & DNS_DISPATCHOPT_FIXEDID) != 0) {
+                       /* When using fixed ID, we either must use it or fail */
+                       break;
+               }
                id += qid->qid_increment;
                id &= 0x0000ffff;
        } while (i++ < 64);
+
+       if (ok) {
+               resp->id = id;
+               resp->bucket = bucket;
+               ISC_LIST_APPEND(qid->qid_table[bucket], resp, link);
+       }
        UNLOCK(&qid->lock);
 
        if (!ok) {
-               isc_mem_put(disp->mgr->mctx, res, sizeof(*res));
+               isc_mem_put(disp->mgr->mctx, resp, sizeof(*resp));
                UNLOCK(&disp->lock);
                return (ISC_R_NOMORE);
        }
 
-       dns_dispatch_attach(disp, &res->disp);
-
-       res->id = id;
-       res->bucket = bucket;
-       res->magic = RESPONSE_MAGIC;
+       dns_dispatch_attach(disp, &resp->disp); /* DISPATCH001 */
 
        disp->requests++;
 
-       LOCK(&qid->lock);
-       ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
-       UNLOCK(&qid->lock);
-
        inc_stats(disp->mgr, (disp->socktype == isc_socktype_udp)
                                     ? dns_resstatscounter_disprequdp
                                     : dns_resstatscounter_dispreqtcp);
 
-       ISC_LIST_APPEND(disp->active, res, alink);
-
        UNLOCK(&disp->lock);
 
-       if (oldest_response != NULL) {
-               oldest_response(ISC_R_CANCELED, NULL, res->arg);
-       }
-
        *idp = id;
-       *resp = res;
+       *respp = resp;
 
        return (ISC_R_SUCCESS);
 }
 
-void
-dispatch_getnext(dns_dispatch_t *disp, dns_dispentry_t *resp, int32_t timeout) {
-       REQUIRE(timeout <= (int32_t)UINT16_MAX);
+isc_result_t
+dns_dispatch_getnext(dns_dispentry_t *resp) {
+       REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
 
-       switch (disp->socktype) {
-       case isc_socktype_udp:
-               REQUIRE(resp != NULL);
+       dns_dispatch_t *disp = resp->disp;
+       isc_result_t result = ISC_R_SUCCESS;
+       int32_t timeout = -1;
 
-               dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
-               if (timeout > 0) {
-                       isc_nmhandle_settimeout(resp->handle, timeout);
+       LOCK(&disp->lock);
+       switch (disp->socktype) {
+       case isc_socktype_udp: {
+               timeout = resp->timeout - dispentry_runtime(resp);
+               if (timeout <= 0) {
+                       result = ISC_R_TIMEDOUT;
+                       break;
                }
-               isc_nm_read(resp->handle, udp_recv, resp);
+               udp_dispatch_getnext(resp, timeout);
                break;
-
+       }
        case isc_socktype_tcp:
-               if (atomic_compare_exchange_strong(&disp->tcpreading,
-                                                  &(bool){ false }, true))
-               {
-                       dns_dispatch_attach(disp, &(dns_dispatch_t *){ NULL });
-                       if (timeout > 0) {
-                               isc_nmhandle_settimeout(disp->handle, timeout);
-                       }
-                       isc_nm_read(disp->handle, tcp_recv, disp);
-               }
+               tcp_dispatch_getnext(disp, resp, timeout);
                break;
-
        default:
                UNREACHABLE();
        }
-}
 
-isc_result_t
-dns_dispatch_getnext(dns_dispentry_t *resp) {
-       dns_dispatch_t *disp = NULL;
-       int32_t timeout;
+       UNLOCK(&disp->lock);
 
+       return (result);
+}
+
+static void
+udp_dispentry_cancel(dns_dispentry_t *resp, isc_result_t result) {
        REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
+       REQUIRE(VALID_DISPATCHMGR(resp->disp->mgr));
 
-       disp = resp->disp;
+       dns_dispatch_t *disp = resp->disp;
+       dns_dispatchmgr_t *mgr = disp->mgr;
+       dns_qid_t *qid = mgr->qid;
+       dispatch_cb_t connected = NULL;
+       dispatch_cb_t response = NULL;
 
-       REQUIRE(VALID_DISPATCH(disp));
+       LOCK(&disp->lock);
+       dispentry_log(resp, LVL(90),
+                     "canceling response: %s, %s/%s (%s/%s), "
+                     "requests %u",
+                     isc_result_totext(result), state2str(resp->state),
+                     resp->reading ? "reading" : "not reading",
+                     state2str(disp->state),
+                     disp->reading ? "reading" : "not reading",
+                     disp->requests);
 
-       if (disp->socktype == isc_socktype_udp) {
-               timeout = resp->timeout - dispentry_runtime(resp);
-               if (timeout <= 0) {
-                       return (ISC_R_TIMEDOUT);
-               }
-       } else {
-               timeout = -1;
+       if (ISC_LINK_LINKED(resp, alink)) {
+               ISC_LIST_UNLINK(disp->active, resp, alink);
        }
 
-       LOCK(&disp->lock);
-       dispatch_getnext(disp, resp, timeout);
-       UNLOCK(&disp->lock);
-       return (ISC_R_SUCCESS);
-}
-
-void
-dns_dispatch_cancel(dns_dispentry_t **respp) {
-       dns_dispentry_t *resp = NULL;
-       dns_dispatch_t *disp = NULL;
+       switch (resp->state) {
+       case DNS_DISPATCHSTATE_NONE:
+               break;
 
-       REQUIRE(respp != NULL);
+       case DNS_DISPATCHSTATE_CONNECTING:
+               dns_dispentry_ref(resp); /* DISPENTRY008 */
+               ISC_LIST_UNLINK(disp->pending, resp, plink);
+               connected = resp->connected;
+               break;
 
-       resp = *respp;
-       *respp = NULL;
+       case DNS_DISPATCHSTATE_CONNECTED:
+               if (resp->reading) {
+                       dns_dispentry_ref(resp); /* DISPENTRY003 */
+                       response = resp->response;
 
-       REQUIRE(VALID_RESPONSE(resp));
+                       dispentry_log(resp, LVL(90), "canceling read on %p",
+                                     resp->handle);
+                       isc_nm_cancelread(resp->handle);
+               }
+               break;
 
-       disp = resp->disp;
-       resp->canceled = true;
+       case DNS_DISPATCHSTATE_CANCELED:
+               goto unlock;
 
-       /* Connected UDP. */
-       if (resp->handle != NULL) {
-               isc_nm_cancelread(resp->handle);
-               goto done;
+       default:
+               UNREACHABLE();
        }
 
-       LOCK(&disp->lock);
-       /* TCP pending connection. */
-       if (ISC_LINK_LINKED(resp, plink)) {
-               dns_dispentry_t *copy = resp;
-
-               ISC_LIST_UNLINK(disp->pending, resp, plink);
-               if (resp->connected != NULL) {
-                       resp->connected(ISC_R_CANCELED, NULL, resp->arg);
-               }
+       dec_stats(disp->mgr, dns_resstatscounter_disprequdp);
 
-               /*
-                * We need to detach twice if we were pending
-                * connection - once to take the place of the
-                * detach in tcp_connected() or udp_connected()
-                * that we won't reach, and again later in
-                * dns_dispatch_done().
-                */
-               dispentry_detach(&copy);
-               UNLOCK(&disp->lock);
-               goto done;
-       }
+       LOCK(&qid->lock);
+       ISC_LIST_UNLINK(qid->qid_table[resp->bucket], resp, link);
+       UNLOCK(&qid->lock);
+       resp->state = DNS_DISPATCHSTATE_CANCELED;
 
-       /*
-        * Connected TCP, or unconnected UDP.
-        *
-        * If TCP, we don't want to cancel the dispatch
-        * unless this is the last resp waiting.
-        */
-       if (ISC_LINK_LINKED(resp, alink)) {
-               ISC_LIST_UNLINK(disp->active, resp, alink);
-               if (ISC_LIST_EMPTY(disp->active) && disp->handle != NULL) {
-                       isc_nm_cancelread(disp->handle);
-               } else if (resp->response != NULL) {
-                       resp->response(ISC_R_CANCELED, NULL, resp->arg);
-               }
-       }
+unlock:
        UNLOCK(&disp->lock);
 
-done:
-       dns_dispatch_done(&resp);
+       if (connected) {
+               dispentry_log(resp, LVL(90), "connect callback: %s",
+                             isc_result_totext(result));
+               connected(result, NULL, resp->arg);
+               dns_dispentry_detach(&resp); /* DISPENTRY008 */
+       }
+       if (response) {
+               dispentry_log(resp, LVL(90), "read callback: %s",
+                             isc_result_totext(result));
+               response(result, NULL, resp->arg);
+               dns_dispentry_detach(&resp); /* DISPENTRY003 */
+       }
 }
 
-void
-dns_dispatch_done(dns_dispentry_t **respp) {
-       dns_dispatchmgr_t *mgr = NULL;
-       dns_dispatch_t *disp = NULL;
-       dns_dispentry_t *resp = NULL;
-       dns_qid_t *qid = NULL;
-
-       REQUIRE(respp != NULL);
+static void
+tcp_dispentry_cancel(dns_dispentry_t *resp, isc_result_t result) {
+       REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
+       REQUIRE(VALID_DISPATCHMGR(resp->disp->mgr));
 
-       resp = *respp;
+       dns_dispatch_t *disp = resp->disp;
+       dns_dispatchmgr_t *mgr = disp->mgr;
+       dns_qid_t *qid = mgr->qid;
+       dispatch_cb_t connected = NULL;
+       dns_displist_t resps = ISC_LIST_INITIALIZER;
 
-       REQUIRE(VALID_RESPONSE(resp));
+       LOCK(&disp->lock);
+       dispentry_log(resp, LVL(90),
+                     "canceling response: %s, %s/%s (%s/%s), "
+                     "requests %u",
+                     isc_result_totext(result), state2str(resp->state),
+                     resp->reading ? "reading" : "not reading",
+                     state2str(disp->state),
+                     disp->reading ? "reading" : "not reading",
+                     disp->requests);
+
+       switch (resp->state) {
+       case DNS_DISPATCHSTATE_NONE:
+               break;
 
-       disp = resp->disp;
+       case DNS_DISPATCHSTATE_CONNECTING:
+               ISC_LIST_UNLINK(disp->pending, resp, plink);
+               connected = resp->connected;
+               break;
 
-       REQUIRE(VALID_DISPATCH(disp));
+       case DNS_DISPATCHSTATE_CONNECTED:
+               if (resp->reading) {
+                       tcp_recv_add(&resps, resp, ISC_R_CANCELED);
+               }
 
-       mgr = disp->mgr;
+               INSIST(!ISC_LINK_LINKED(resp, alink));
 
-       REQUIRE(VALID_DISPATCHMGR(mgr));
+               if (ISC_LIST_EMPTY(disp->active)) {
+                       INSIST(disp->handle != NULL);
 
-       qid = mgr->qid;
+#if DISPATCH_TCP_KEEPALIVE
+                       /*
+                        * This is an experimental code that keeps the TCP
+                        * connection open for 1 second before it is finally
+                        * closed.  By keeping the TCP connection open, it can
+                        * be reused by dns_request that uses
+                        * dns_dispatch_gettcp() to join existing TCP
+                        * connections.
+                        *
+                        * It is disabled for now, because it changes the
+                        * behaviour, but I am keeping the code here for future
+                        * reference when we improve the dns_dispatch to reuse
+                        * the TCP connections also in the resolver.
+                        *
+                        * The TCP connection reuse should be seamless and not
+                        * require any extra handling on the client side though.
+                        */
+                       isc_nmhandle_cleartimeout(disp->handle);
+                       isc_nmhandle_settimeout(disp->handle, 1000);
+
+                       if (!disp->reading) {
+                               dispentry_log(resp, LVL(90),
+                                             "final 1 second timeout on %p",
+                                             disp->handle);
+                               tcp_startrecv(NULL, disp, NULL);
+                       }
+#else
+                       if (disp->reading) {
+                               dispentry_log(resp, LVL(90),
+                                             "canceling read on %p",
+                                             disp->handle);
+                               isc_nm_cancelread(disp->handle);
+                       }
+#endif
+               }
+               break;
 
-       LOCK(&disp->lock);
-       INSIST(disp->requests > 0);
-       disp->requests--;
+       case DNS_DISPATCHSTATE_CANCELED:
+               goto unlock;
 
-       dec_stats(disp->mgr, (disp->socktype == isc_socktype_udp)
-                                    ? dns_resstatscounter_disprequdp
-                                    : dns_resstatscounter_dispreqtcp);
+       default:
+               UNREACHABLE();
+       }
 
-       deactivate_dispentry(disp, resp);
+       dec_stats(disp->mgr, dns_resstatscounter_dispreqtcp);
 
        LOCK(&qid->lock);
        ISC_LIST_UNLINK(qid->qid_table[resp->bucket], resp, link);
        UNLOCK(&qid->lock);
+       resp->state = DNS_DISPATCHSTATE_CANCELED;
+
+unlock:
        UNLOCK(&disp->lock);
 
-       dispentry_detach(respp);
+       /*
+        * NOTE: Calling the connected and response callbacks directly from here
+        * should be done asynchronously, as the dns_dispatch_done() is usually
+        * called directly from the response callback, so there's a slight
+        * chance that the call stack will get higher here, but it's mitigated
+        * by the ".reading" flag, so we don't ever go into a loop.
+        */
+
+       if (connected) {
+               dispentry_log(resp, LVL(90), "connect callback: %s",
+                             isc_result_totext(result));
+               connected(result, NULL, resp->arg);
+               dns_dispentry_detach(&resp); /* DISPENTRY005 */
+       }
+
+       tcp_recv_processall(&resps, NULL);
 }
 
-/*
- * disp must be locked.
- */
 static void
-startrecv(isc_nmhandle_t *handle, dns_dispatch_t *disp, dns_dispentry_t *resp) {
+dispentry_cancel(dns_dispentry_t *resp, isc_result_t result) {
+       REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
+
+       dns_dispatch_t *disp = resp->disp;
+
        switch (disp->socktype) {
        case isc_socktype_udp:
-               REQUIRE(resp != NULL && resp->handle == NULL);
-
-               TIME_NOW(&resp->start);
-               isc_nmhandle_attach(handle, &resp->handle);
-               dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
-               isc_nm_read(resp->handle, udp_recv, resp);
+               udp_dispentry_cancel(resp, result);
                break;
-
        case isc_socktype_tcp:
-               REQUIRE(disp != NULL);
+               tcp_dispentry_cancel(resp, result);
+               break;
+       default:
+               UNREACHABLE();
+       }
+}
 
-               LOCK(&disp->lock);
-               REQUIRE(disp->handle == NULL);
-               atomic_compare_exchange_enforced(
-                       &disp->tcpstate,
-                       &(uint_fast32_t){ DNS_DISPATCHSTATE_CONNECTING },
-                       DNS_DISPATCHSTATE_CONNECTED);
+void
+dns_dispatch_done(dns_dispentry_t **respp) {
+       REQUIRE(VALID_RESPONSE(*respp));
 
-               isc_nmhandle_attach(handle, &disp->handle);
-               dns_dispatch_attach(disp, &(dns_dispatch_t *){ NULL });
-               isc_nm_read(disp->handle, tcp_recv, disp);
-               UNLOCK(&disp->lock);
+       dns_dispentry_t *resp = *respp;
+       *respp = NULL;
 
-               break;
+       dispentry_cancel(resp, ISC_R_CANCELED);
+       dns_dispentry_detach(&resp); /* DISPENTRY000 */
+}
 
-       default:
-               UNREACHABLE();
+static void
+udp_startrecv(isc_nmhandle_t *handle, dns_dispentry_t *resp) {
+       REQUIRE(VALID_RESPONSE(resp));
+
+       TIME_NOW(&resp->start);
+       dispentry_log(resp, LVL(90), "attaching handle %p to %p", handle,
+                     &resp->handle);
+       isc_nmhandle_attach(handle, &resp->handle);
+       dns_dispentry_ref(resp); /* DISPENTRY003 */
+       dispentry_log(resp, LVL(90), "reading");
+       isc_nm_read(resp->handle, udp_recv, resp);
+       resp->reading = true;
+}
+
+static void
+tcp_startrecv(isc_nmhandle_t *handle, dns_dispatch_t *disp,
+             dns_dispentry_t *resp) {
+       REQUIRE(VALID_DISPATCH(disp));
+       REQUIRE(disp->socktype == isc_socktype_tcp);
+
+       if (handle != NULL) {
+               isc_nmhandle_attach(handle, &disp->handle);
+       }
+       dns_dispatch_ref(disp); /* DISPATCH002 */
+       if (resp != NULL) {
+               dispentry_log(resp, LVL(90), "reading from %p", disp->handle);
+       } else {
+               dispatch_log(disp, LVL(90),
+                            "TCP reading without response from %p",
+                            disp->handle);
        }
+       isc_nm_read(disp->handle, tcp_recv, disp);
+       disp->reading = true;
 }
 
 static void
 tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
        dns_dispatch_t *disp = (dns_dispatch_t *)arg;
-       dns_dispentry_t *resp = NULL, *next = NULL;
-       dns_displist_t resps;
+       dns_dispentry_t *resp = NULL;
+       dns_dispentry_t *next = NULL;
+       dns_displist_t resps = ISC_LIST_INITIALIZER;
+
+       if (isc_log_wouldlog(dns_lctx, 90)) {
+               char localbuf[ISC_SOCKADDR_FORMATSIZE];
+               char peerbuf[ISC_SOCKADDR_FORMATSIZE];
+               if (handle != NULL) {
+                       isc_sockaddr_t local = isc_nmhandle_localaddr(handle);
+                       isc_sockaddr_t peer = isc_nmhandle_peeraddr(handle);
+
+                       isc_sockaddr_format(&local, localbuf,
+                                           ISC_SOCKADDR_FORMATSIZE);
+                       isc_sockaddr_format(&peer, peerbuf,
+                                           ISC_SOCKADDR_FORMATSIZE);
+               } else {
+                       isc_sockaddr_format(&disp->local, localbuf,
+                                           ISC_SOCKADDR_FORMATSIZE);
+                       isc_sockaddr_format(&disp->peer, peerbuf,
+                                           ISC_SOCKADDR_FORMATSIZE);
+               }
 
-       dispatch_log(disp, LVL(90), "TCP connected (%p): %s", disp,
-                    isc_result_totext(eresult));
+               dispatch_log(disp, LVL(90), "connected from %s to %s: %s",
+                            localbuf, peerbuf, isc_result_totext(eresult));
+       }
 
-       ISC_LIST_INIT(resps);
+       LOCK(&disp->lock);
+       INSIST(disp->state = DNS_DISPATCHSTATE_CONNECTING);
 
        if (eresult == ISC_R_SUCCESS) {
-               startrecv(handle, disp, NULL);
+               disp->state = DNS_DISPATCHSTATE_CONNECTED;
+               tcp_startrecv(handle, disp, resp);
+       } else {
+               disp->state = DNS_DISPATCHSTATE_NONE;
        }
 
        /*
         * If there are pending responses, call the connect
         * callbacks for all of them.
         */
-       LOCK(&disp->lock);
        for (resp = ISC_LIST_HEAD(disp->pending); resp != NULL; resp = next) {
                next = ISC_LIST_NEXT(resp, plink);
                ISC_LIST_UNLINK(disp->pending, resp, plink);
-               ISC_LIST_APPEND(resps, resp, plink);
+               ISC_LIST_APPEND(resps, resp, rlink);
+
+               if (eresult == ISC_R_SUCCESS) {
+                       resp->state = DNS_DISPATCHSTATE_CONNECTED;
+                       ISC_LIST_APPEND(disp->active, resp, alink);
+                       resp->reading = true;
+                       dispentry_log(resp, LVL(90), "start reading");
+               } else {
+                       resp->state = DNS_DISPATCHSTATE_NONE;
+               }
        }
        UNLOCK(&disp->lock);
 
        for (resp = ISC_LIST_HEAD(resps); resp != NULL; resp = next) {
-               next = ISC_LIST_NEXT(resp, plink);
-               ISC_LIST_UNLINK(resps, resp, plink);
+               next = ISC_LIST_NEXT(resp, rlink);
+               ISC_LIST_UNLINK(resps, resp, rlink);
 
-               if (resp->connected != NULL) {
-                       resp->connected(eresult, NULL, resp->arg);
-               }
-               dispentry_detach(&resp);
+               dispentry_log(resp, LVL(90), "connect callback: %s",
+                             isc_result_totext(eresult));
+               resp->connected(eresult, NULL, resp->arg);
+               dns_dispentry_detach(&resp); /* DISPENTRY005 */
        }
 
-       dns_dispatch_detach(&disp);
+       dns_dispatch_detach(&disp); /* DISPATCH003 */
 }
 
+static void
+udp_dispatch_connect(dns_dispatch_t *disp, dns_dispentry_t *resp);
+
 static void
 udp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
        dns_dispentry_t *resp = (dns_dispentry_t *)arg;
        dns_dispatch_t *disp = resp->disp;
 
-       dispatch_log(disp, LVL(90), "UDP connected (%p): %s", resp,
-                    isc_result_totext(eresult));
+       dispentry_log(resp, LVL(90), "connected: %s",
+                     isc_result_totext(eresult));
 
-       if (eresult == ISC_R_SUCCESS && resp->canceled) {
-               eresult = ISC_R_CANCELED;
-       } else if (eresult == ISC_R_SUCCESS) {
-               startrecv(handle, disp, resp);
-       } else if (eresult == ISC_R_ADDRINUSE) {
+       LOCK(&disp->lock);
+
+       switch (resp->state) {
+       case DNS_DISPATCHSTATE_CANCELED:
+               UNLOCK(&disp->lock);
+               goto detach;
+       case DNS_DISPATCHSTATE_CONNECTING:
+               ISC_LIST_UNLINK(disp->pending, resp, plink);
+               break;
+       default:
+               UNREACHABLE();
+       }
+
+       switch (eresult) {
+       case ISC_R_SUCCESS:
+               resp->state = DNS_DISPATCHSTATE_CONNECTED;
+               udp_startrecv(handle, resp);
+               break;
+       case ISC_R_ADDRINUSE: {
                in_port_t localport = 0;
                isc_result_t result;
 
                /* probably a port collision; try a different one */
-               disp->nsockets--;
                result = setup_socket(disp, resp, &resp->peer, &localport);
                if (result == ISC_R_SUCCESS) {
-                       dns_dispatch_connect(resp);
+                       UNLOCK(&disp->lock);
+                       udp_dispatch_connect(disp, resp);
                        goto detach;
                }
+               resp->state = DNS_DISPATCHSTATE_NONE;
+               break;
        }
-
-       if (resp->connected != NULL) {
-               resp->connected(eresult, NULL, resp->arg);
+       default:
+               resp->state = DNS_DISPATCHSTATE_NONE;
+               break;
        }
 
+       UNLOCK(&disp->lock);
+
+       dispentry_log(resp, LVL(90), "connect callback: %s",
+                     isc_result_totext(eresult));
+       resp->connected(eresult, NULL, resp->arg);
+
 detach:
-       dispentry_detach(&resp);
+       dns_dispentry_detach(&resp); /* DISPENTRY004 */
 }
 
-isc_result_t
-dns_dispatch_connect(dns_dispentry_t *resp) {
-       dns_dispatch_t *disp = NULL;
-       uint_fast32_t tcpstate = DNS_DISPATCHSTATE_NONE;
+static void
+udp_dispatch_connect(dns_dispatch_t *disp, dns_dispentry_t *resp) {
+       LOCK(&disp->lock);
+       resp->state = DNS_DISPATCHSTATE_CONNECTING;
+       dns_dispentry_ref(resp); /* DISPENTRY004 */
+       ISC_LIST_APPEND(disp->pending, resp, plink);
+       UNLOCK(&disp->lock);
+       isc_nm_udpconnect(disp->mgr->nm, &resp->local, &resp->peer,
+                         udp_connected, resp, resp->timeout, 0);
+}
 
-       REQUIRE(VALID_RESPONSE(resp));
+static isc_result_t
+tcp_dispatch_connect(dns_dispatch_t *disp, dns_dispentry_t *resp) {
+       /* Check whether the dispatch is already connecting or connected. */
+       LOCK(&disp->lock);
+       switch (disp->state) {
+       case DNS_DISPATCHSTATE_NONE:
+               /* First connection, continue with connecting */
+               disp->state = DNS_DISPATCHSTATE_CONNECTING;
+               resp->state = DNS_DISPATCHSTATE_CONNECTING;
+               dns_dispentry_ref(resp); /* DISPENTRY005 */
+               ISC_LIST_APPEND(disp->pending, resp, plink);
+               UNLOCK(&disp->lock);
 
-       disp = resp->disp;
+               char localbuf[ISC_SOCKADDR_FORMATSIZE];
+               char peerbuf[ISC_SOCKADDR_FORMATSIZE];
 
-       /* This will be detached once we've connected. */
-       dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
+               isc_sockaddr_format(&disp->local, localbuf,
+                                   ISC_SOCKADDR_FORMATSIZE);
+               isc_sockaddr_format(&disp->peer, peerbuf,
+                                   ISC_SOCKADDR_FORMATSIZE);
 
-       switch (disp->socktype) {
-       case isc_socktype_tcp:
-               /*
-                * Check whether the dispatch is already connecting
-                * or connected.
-                */
-               atomic_compare_exchange_strong(&disp->tcpstate,
-                                              (uint_fast32_t *)&tcpstate,
-                                              DNS_DISPATCHSTATE_CONNECTING);
-               switch (tcpstate) {
-               case DNS_DISPATCHSTATE_NONE:
-                       /* First connection, continue with connecting */
-                       LOCK(&disp->lock);
-                       ISC_LIST_APPEND(disp->pending, resp, plink);
-                       UNLOCK(&disp->lock);
-                       dns_dispatch_attach(disp, &(dns_dispatch_t *){ NULL });
-                       isc_nm_tcpdnsconnect(disp->mgr->nm, &disp->local,
-                                            &disp->peer, tcp_connected, disp,
-                                            resp->timeout, 0);
-                       break;
+               dns_dispatch_ref(disp); /* DISPATCH003 */
+               dispentry_log(resp, LVL(90),
+                             "connecting from %s to %s, timeout %u", localbuf,
+                             peerbuf, resp->timeout);
 
-               case DNS_DISPATCHSTATE_CONNECTING:
-                       /* Connection pending; add resp to the list */
-                       LOCK(&disp->lock);
-                       ISC_LIST_APPEND(disp->pending, resp, plink);
-                       UNLOCK(&disp->lock);
-                       break;
+               isc_nm_tcpdnsconnect(disp->mgr->nm, &disp->local, &disp->peer,
+                                    tcp_connected, disp, resp->timeout, 0);
+               break;
 
-               case DNS_DISPATCHSTATE_CONNECTED:
-                       /* We are already connected; call the connected cb */
-                       if (resp->connected != NULL) {
-                               resp->connected(ISC_R_SUCCESS, NULL, resp->arg);
-                       }
-                       dispentry_detach(&resp);
-                       break;
+       case DNS_DISPATCHSTATE_CONNECTING:
+               /* Connection pending; add resp to the list */
+               resp->state = DNS_DISPATCHSTATE_CONNECTING;
+               dns_dispentry_ref(resp); /* DISPENTRY005 */
+               ISC_LIST_APPEND(disp->pending, resp, plink);
+               UNLOCK(&disp->lock);
+               break;
 
-               default:
-                       UNREACHABLE();
-               }
+       case DNS_DISPATCHSTATE_CONNECTED:
+               resp->state = DNS_DISPATCHSTATE_CONNECTED;
 
-               break;
+               /* Add the resp to the reading list */
+               ISC_LIST_APPEND(disp->active, resp, alink);
+               dispentry_log(resp, LVL(90), "already connected; attaching");
+               resp->reading = true;
 
-       case isc_socktype_udp:
-               isc_nm_udpconnect(disp->mgr->nm, &resp->local, &resp->peer,
-                                 udp_connected, resp, resp->timeout, 0);
+               if (!disp->reading) {
+                       /* Restart the reading */
+                       tcp_startrecv(NULL, disp, resp);
+               }
+
+               UNLOCK(&disp->lock);
+               /* We are already connected; call the connected cb */
+               dispentry_log(resp, LVL(90), "connect callback: %s",
+                             isc_result_totext(ISC_R_SUCCESS));
+               resp->connected(ISC_R_SUCCESS, NULL, resp->arg);
                break;
 
        default:
-               return (ISC_R_NOTIMPLEMENTED);
+               UNREACHABLE();
        }
 
        return (ISC_R_SUCCESS);
 }
 
+isc_result_t
+dns_dispatch_connect(dns_dispentry_t *resp) {
+       REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
+
+       dns_dispatch_t *disp = resp->disp;
+
+       switch (disp->socktype) {
+       case isc_socktype_tcp:
+               return (tcp_dispatch_connect(disp, resp));
+
+       case isc_socktype_udp:
+               udp_dispatch_connect(disp, resp);
+               return (ISC_R_SUCCESS);
+
+       default:
+               UNREACHABLE();
+       }
+}
+
 static void
 send_done(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) {
        dns_dispentry_t *resp = (dns_dispentry_t *)cbarg;
 
        REQUIRE(VALID_RESPONSE(resp));
 
+       dns_dispatch_t *disp = resp->disp;
+
+       REQUIRE(VALID_DISPATCH(disp));
+
+       dispentry_log(resp, LVL(90), "sent: %s", isc_result_totext(result));
+
        resp->sent(result, NULL, resp->arg);
 
        if (result != ISC_R_SUCCESS) {
-               isc_nm_cancelread(handle);
+               dispentry_cancel(resp, result);
        }
 
-       dispentry_detach(&resp);
+       dns_dispentry_detach(&resp); /* DISPENTRY007 */
+       isc_nmhandle_detach(&handle);
+}
+
+static void
+tcp_dispatch_getnext(dns_dispatch_t *disp, dns_dispentry_t *resp,
+                    int32_t timeout) {
+       REQUIRE(timeout <= INT16_MAX);
+
+       if (disp->reading) {
+               return;
+       }
+
+       if (timeout > 0) {
+               isc_nmhandle_settimeout(disp->handle, timeout);
+       }
+
+       dispentry_log(resp, LVL(90), "continue reading");
+
+       dns_dispatch_ref(disp); /* DISPATCH002 */
+       isc_nm_read(disp->handle, tcp_recv, disp);
+       disp->reading = true;
+
+       ISC_LIST_APPEND(disp->active, resp, alink);
+       resp->reading = true;
+}
+
+static void
+udp_dispatch_getnext(dns_dispentry_t *resp, int32_t timeout) {
+       REQUIRE(timeout <= INT16_MAX);
+
+       if (resp->reading) {
+               return;
+       }
+
+       if (timeout > 0) {
+               isc_nmhandle_settimeout(resp->handle, timeout);
+       }
+
+       dispentry_log(resp, LVL(90), "continue reading");
+
+       dns_dispentry_ref(resp); /* DISPENTRY003 */
+       isc_nm_read(resp->handle, udp_recv, resp);
+       resp->reading = true;
 }
 
 void
 dns_dispatch_resume(dns_dispentry_t *resp, uint16_t timeout) {
-       dns_dispatch_t *disp = NULL;
-
        REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
 
-       disp = resp->disp;
+       dns_dispatch_t *disp = resp->disp;
 
-       REQUIRE(VALID_DISPATCH(disp));
+       LOCK(&disp->lock);
+       switch (disp->socktype) {
+       case isc_socktype_udp: {
+               udp_dispatch_getnext(resp, timeout);
+               break;
+       }
+       case isc_socktype_tcp:
+               INSIST(disp->timedout > 0);
+               disp->timedout--;
+               tcp_dispatch_getnext(disp, resp, timeout);
+               break;
+       default:
+               UNREACHABLE();
+       }
 
-       dispatch_getnext(disp, resp, timeout);
+       UNLOCK(&disp->lock);
 }
 
 void
 dns_dispatch_send(dns_dispentry_t *resp, isc_region_t *r, isc_dscp_t dscp) {
-       isc_nmhandle_t *handle = NULL;
-
        REQUIRE(VALID_RESPONSE(resp));
-
+       REQUIRE(VALID_DISPATCH(resp->disp));
        UNUSED(dscp);
 
+       dns_dispatch_t *disp = resp->disp;
+       isc_nmhandle_t *sendhandle = NULL;
+
 #if 0
        /* XXX: no DSCP support */
        if (dscp == -1) {
@@ -1906,14 +2150,19 @@ dns_dispatch_send(dns_dispentry_t *resp, isc_region_t *r, isc_dscp_t dscp) {
        }
 #endif
 
-       if (resp->disp->socktype == isc_socktype_tcp) {
-               handle = resp->disp->handle;
-       } else {
-               handle = resp->handle;
+       dispentry_log(resp, LVL(90), "sending");
+       switch (disp->socktype) {
+       case isc_socktype_udp:
+               isc_nmhandle_attach(resp->handle, &sendhandle);
+               break;
+       case isc_socktype_tcp:
+               isc_nmhandle_attach(disp->handle, &sendhandle);
+               break;
+       default:
+               UNREACHABLE();
        }
-
-       dispentry_attach(resp, &(dns_dispentry_t *){ NULL });
-       isc_nm_send(handle, r, send_done, resp);
+       dns_dispentry_ref(resp); /* DISPENTRY007 */
+       isc_nm_send(sendhandle, r, send_done, resp);
 }
 
 isc_result_t
@@ -1931,19 +2180,21 @@ dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
 isc_result_t
 dns_dispentry_getlocaladdress(dns_dispentry_t *resp, isc_sockaddr_t *addrp) {
        REQUIRE(VALID_RESPONSE(resp));
+       REQUIRE(VALID_DISPATCH(resp->disp));
        REQUIRE(addrp != NULL);
 
-       if (resp->disp->socktype == isc_socktype_tcp) {
-               *addrp = resp->disp->local;
-               return (ISC_R_SUCCESS);
-       }
+       dns_dispatch_t *disp = resp->disp;
 
-       if (resp->handle != NULL) {
+       switch (disp->socktype) {
+       case isc_socktype_tcp:
+               *addrp = disp->local;
+               return (ISC_R_SUCCESS);
+       case isc_socktype_udp:
                *addrp = isc_nmhandle_localaddr(resp->handle);
                return (ISC_R_SUCCESS);
+       default:
+               UNREACHABLE();
        }
-
-       return (ISC_R_NOTIMPLEMENTED);
 }
 
 dns_dispatch_t *
@@ -1990,7 +2241,7 @@ dns_dispatchset_create(isc_mem_t *mctx, dns_dispatch_t *source,
        isc_mem_attach(mctx, &dset->mctx);
 
        dset->dispatches[0] = NULL;
-       dns_dispatch_attach(source, &dset->dispatches[0]);
+       dns_dispatch_attach(source, &dset->dispatches[0]); /* DISPATCH004 */
 
        LOCK(&mgr->lock);
        for (i = 1; i < n; i++) {
@@ -2011,7 +2262,7 @@ fail:
        UNLOCK(&mgr->lock);
 
        for (j = 0; j < i; j++) {
-               dns_dispatch_detach(&(dset->dispatches[j]));
+               dns_dispatch_detach(&(dset->dispatches[j])); /* DISPATCH004 */
        }
        isc_mem_put(mctx, dset->dispatches, sizeof(dns_dispatch_t *) * n);
        if (dset->mctx == mctx) {
@@ -2033,7 +2284,7 @@ dns_dispatchset_destroy(dns_dispatchset_t **dsetp) {
        dset = *dsetp;
        *dsetp = NULL;
        for (i = 0; i < dset->ndisp; i++) {
-               dns_dispatch_detach(&(dset->dispatches[i]));
+               dns_dispatch_detach(&(dset->dispatches[i])); /* DISPATCH004 */
        }
        isc_mem_put(dset->mctx, dset->dispatches,
                    sizeof(dns_dispatch_t *) * dset->ndisp);
index b82a46c5a99fc98490808754448ed602c2b60d03..bcf5b97db483127666d173045a9830ef52e01899 100644 (file)
 #include <isc/lang.h>
 #include <isc/mutex.h>
 #include <isc/netmgr.h>
+#include <isc/refcount.h>
 #include <isc/types.h>
 
 #include <dns/types.h>
 
+#undef DNS_DISPATCH_TRACE
+
 ISC_LANG_BEGINDECLS
 
 /*%
@@ -94,25 +97,22 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_nm_t *nm, dns_dispatchmgr_t **mgrp);
  *\li  anything else   -- failure
  */
 
-void
-dns_dispatchmgr_attach(dns_dispatchmgr_t *mgr, dns_dispatchmgr_t **mgrp);
-/*%<
- * Attach to a dispatch manger.
- *
- * Requires:
- *\li   is valid.
- *
- *\li  mgrp != NULL && *mgrp == NULL
- */
+#if DNS_DISPATCH_TRACE
+#define dns_dispatchmgr_ref(ptr) \
+       dns_dispatchmgr__ref(ptr, __func__, __FILE__, __LINE__)
+#define dns_dispatchmgr_unref(ptr) \
+       dns_dispatchmgr__unref(ptr, __func__, __FILE__, __LINE__)
+#define dns_dispatchmgr_attach(ptr, ptrp) \
+       dns_dispatchmgr__attach(ptr, ptrp, __func__, __FILE__, __LINE__)
+#define dns_dispatchmgr_detach(ptrp) \
+       dns_dispatchmgr__detach(ptrp, __func__, __FILE__, __LINE__)
+ISC_REFCOUNT_TRACE_DECL(dns_dispatchmgr);
+#else
+ISC_REFCOUNT_DECL(dns_dispatchmgr);
+#endif
 
-void
-dns_dispatchmgr_detach(dns_dispatchmgr_t **mgrp);
 /*%<
- * Detach from the dispatch manager, and destroy it if no references
- * remain.
- *
- * Requires:
- *\li  mgrp != NULL && *mgrp is a valid dispatchmgr.
+ * Attach/Detach to a dispatch manager.
  */
 
 void
@@ -201,10 +201,21 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *localaddr,
  *\li  Anything else   -- failure.
  */
 
-void
-dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp);
+#if DNS_DISPATCH_TRACE
+#define dns_dispatch_ref(ptr) \
+       dns_dispatch__ref(ptr, __func__, __FILE__, __LINE__)
+#define dns_dispatch_unref(ptr) \
+       dns_dispatch__unref(ptr, __func__, __FILE__, __LINE__)
+#define dns_dispatch_attach(ptr, ptrp) \
+       dns_dispatch__attach(ptr, ptrp, __func__, __FILE__, __LINE__)
+#define dns_dispatch_detach(ptrp) \
+       dns_dispatch__detach(ptrp, __func__, __FILE__, __LINE__)
+ISC_REFCOUNT_TRACE_DECL(dns_dispatch);
+#else
+ISC_REFCOUNT_DECL(dns_dispatch);
+#endif
 /*%<
- * Attach to a dispatch handle.
+ * Attach/Detach to a dispatch handle.
  *
  * Requires:
  *\li  disp is valid.
@@ -212,15 +223,6 @@ dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp);
  *\li  dispp != NULL && *dispp == NULL
  */
 
-void
-dns_dispatch_detach(dns_dispatch_t **dispp);
-/*%<
- * Detaches from the dispatch.
- *
- * Requires:
- *\li  dispp != NULL and *dispp be a valid dispatch.
- */
-
 isc_result_t
 dns_dispatch_connect(dns_dispentry_t *resp);
 /*%<
@@ -253,11 +255,9 @@ dns_dispatch_resume(dns_dispentry_t *resp, uint16_t timeout);
 
 isc_result_t
 dns_dispatch_gettcp(dns_dispatchmgr_t *mgr, const isc_sockaddr_t *destaddr,
-                   const isc_sockaddr_t *localaddr, bool *connected,
-                   dns_dispatch_t **dispp);
+                   const isc_sockaddr_t *localaddr, dns_dispatch_t **dispp);
 /*
- * Attempt to connect to a existing TCP connection (connection completed
- * if connected == NULL).
+ * Attempt to connect to a existing TCP connection.
  */
 
 typedef void (*dispatch_cb_t)(isc_result_t eresult, isc_region_t *region,
@@ -307,22 +307,10 @@ dns_dispatch_add(dns_dispatch_t *disp, unsigned int options,
 
 void
 dns_dispatch_done(dns_dispentry_t **respp);
-/*%<
- * Disconnects a dispatch response entry from its dispatch and shuts it
- * down. This is called when the dispatch is complete; use
- * dns_dispatch_cancel() if it is still pending.
- *
- * Requires:
- *\li  "resp" != NULL and "*resp" contain a value previously allocated
- *     by dns_dispatch_add();
- */
+/*<
+ * Disconnect a dispatch response entry from its dispatch, cancel all
+ * pending connects and reads in a dispatch entry and shut it down.
 
-void
-dns_dispatch_cancel(dns_dispentry_t **respp);
-/*%<
- * Cancel all pending connects and reads in a dispatch entry,
- * then call dns_dispatch_done(). This is used when the caller
- * cancels a dispatch response before it has completed.
  *
  * Requires:
  *\li  "resp" != NULL and "*resp" contain a value previously allocated
index 075ac10d81ea6ea0ed2f23661ccfba012fed0f0c..e52039107a83fb58247bc49039a963a8bd05c4de 100644 (file)
@@ -400,20 +400,18 @@ isblackholed(dns_dispatchmgr_t *dispatchmgr, const isc_sockaddr_t *destaddr) {
 static isc_result_t
 tcp_dispatch(bool newtcp, dns_requestmgr_t *requestmgr,
             const isc_sockaddr_t *srcaddr, const isc_sockaddr_t *destaddr,
-            isc_dscp_t dscp, bool *connected, dns_dispatch_t **dispatchp) {
+            isc_dscp_t dscp, dns_dispatch_t **dispatchp) {
        isc_result_t result;
 
        if (!newtcp) {
                result = dns_dispatch_gettcp(requestmgr->dispatchmgr, destaddr,
-                                            srcaddr, connected, dispatchp);
+                                            srcaddr, dispatchp);
                if (result == ISC_R_SUCCESS) {
                        char peer[ISC_SOCKADDR_FORMATSIZE];
 
                        isc_sockaddr_format(destaddr, peer, sizeof(peer));
                        req_log(ISC_LOG_DEBUG(1),
-                               "attached to %s TCP "
-                               "connection to %s",
-                               *connected ? "existing" : "pending", peer);
+                               "attached to TCP connection to %s", peer);
                        return (result);
                }
        }
@@ -455,12 +453,12 @@ udp_dispatch(dns_requestmgr_t *requestmgr, const isc_sockaddr_t *srcaddr,
 static isc_result_t
 get_dispatch(bool tcp, bool newtcp, dns_requestmgr_t *requestmgr,
             const isc_sockaddr_t *srcaddr, const isc_sockaddr_t *destaddr,
-            isc_dscp_t dscp, bool *connected, dns_dispatch_t **dispatchp) {
+            isc_dscp_t dscp, dns_dispatch_t **dispatchp) {
        isc_result_t result;
 
        if (tcp) {
                result = tcp_dispatch(newtcp, requestmgr, srcaddr, destaddr,
-                                     dscp, connected, dispatchp);
+                                     dscp, dispatchp);
        } else {
                result = udp_dispatch(requestmgr, srcaddr, destaddr, dispatchp);
        }
@@ -482,7 +480,6 @@ dns_request_createraw(dns_requestmgr_t *requestmgr, isc_buffer_t *msgbuf,
        bool tcp = false;
        bool newtcp = false;
        isc_region_t r;
-       bool connected = false;
        unsigned int dispopt = 0;
 
        REQUIRE(VALID_REQUESTMGR(requestmgr));
@@ -556,7 +553,7 @@ dns_request_createraw(dns_requestmgr_t *requestmgr, isc_buffer_t *msgbuf,
 again:
 
        result = get_dispatch(tcp, newtcp, requestmgr, srcaddr, destaddr, dscp,
-                             &connected, &request->dispatch);
+                             &request->dispatch);
        if (result != ISC_R_SUCCESS) {
                goto detach;
        }
@@ -573,7 +570,6 @@ again:
        if (result != ISC_R_SUCCESS) {
                if ((options & DNS_REQUESTOPT_FIXEDID) != 0 && !newtcp) {
                        newtcp = true;
-                       connected = false;
                        dns_dispatch_detach(&request->dispatch);
                        goto again;
                }
@@ -593,21 +589,15 @@ again:
        UNLOCK(&requestmgr->lock);
 
        request->destaddr = *destaddr;
-       if (tcp && connected) {
-               req_send(request);
 
-               /* no need to call req_connected(), detach here */
-               req_detach(&(dns_request_t *){ request });
-       } else {
-               request->flags |= DNS_REQUEST_F_CONNECTING;
-               if (tcp) {
-                       request->flags |= DNS_REQUEST_F_TCP;
-               }
+       request->flags |= DNS_REQUEST_F_CONNECTING;
+       if (tcp) {
+               request->flags |= DNS_REQUEST_F_TCP;
+       }
 
-               result = dns_dispatch_connect(request->dispentry);
-               if (result != ISC_R_SUCCESS) {
-                       goto unlink;
-               }
+       result = dns_dispatch_connect(request->dispentry);
+       if (result != ISC_R_SUCCESS) {
+               goto unlink;
        }
 
        req_log(ISC_LOG_DEBUG(3), "dns_request_createraw: request %p", request);
@@ -718,7 +708,7 @@ dns_request_create(dns_requestmgr_t *requestmgr, dns_message_t *message,
 
 again:
        result = get_dispatch(tcp, false, requestmgr, srcaddr, destaddr, dscp,
-                             &connected, &request->dispatch);
+                             &request->dispatch);
        if (result != ISC_R_SUCCESS) {
                goto detach;
        }
@@ -903,7 +893,7 @@ request_cancel(dns_request_t *request) {
                request->flags &= ~DNS_REQUEST_F_CONNECTING;
 
                if (request->dispentry != NULL) {
-                       dns_dispatch_cancel(&request->dispentry);
+                       dns_dispatch_done(&request->dispentry);
                }
 
                dns_dispatch_detach(&request->dispatch);
@@ -1061,13 +1051,13 @@ static void
 req_response(isc_result_t result, isc_region_t *region, void *arg) {
        dns_request_t *request = (dns_request_t *)arg;
 
-       req_log(ISC_LOG_DEBUG(3), "req_response: request %p: %s", request,
-               isc_result_totext(result));
-
        if (result == ISC_R_CANCELED) {
                return;
        }
 
+       req_log(ISC_LOG_DEBUG(3), "req_response: request %p: %s", request,
+               isc_result_totext(result));
+
        if (result == ISC_R_TIMEDOUT) {
                LOCK(&request->requestmgr->locks[request->hash]);
                if (request->udpcount != 0) {
index 7ab22e78699cccbd4f5cd178fbbeedbfda39fd95..6f95acd9b2ff58707d6109e57ca4772be366caa5 100644 (file)
@@ -1499,7 +1499,7 @@ fctx_cancelquery(resquery_t **queryp, isc_time_t *finish, bool no_response,
         * exist, cancel them.
         */
        if (query->dispentry != NULL) {
-               dns_dispatch_cancel(&query->dispentry);
+               dns_dispatch_done(&query->dispentry);
        }
 
        LOCK(&fctx->res->buckets[fctx->bucketnum].lock);