]> git.ipfire.org Git - thirdparty/bind9.git/commitdiff
2384. [security] Additional support for query port randomization (change
authorTatuya JINMEI 神明達哉 <jinmei@isc.org>
Tue, 24 Jun 2008 00:09:12 +0000 (00:09 +0000)
committerTatuya JINMEI 神明達哉 <jinmei@isc.org>
Tue, 24 Jun 2008 00:09:12 +0000 (00:09 +0000)
#2375) including performance improvement and port range
specification.  [RT #17949, #18098]

25 files changed:
CHANGES
bin/named/bind9.xsl
bin/named/bind9.xsl.h
bin/named/server.c
configure.in
doc/arm/Bv9ARM-book.xml
lib/dns/dispatch.c
lib/dns/include/dns/dispatch.h
lib/dns/include/dns/resolver.h
lib/dns/request.c
lib/dns/resolver.c
lib/isc/Makefile.in
lib/isc/include/isc/platform.h.in
lib/isc/include/isc/portset.h
lib/isc/include/isc/socket.h
lib/isc/include/isc/timer.h
lib/isc/include/isc/types.h
lib/isc/portset.c
lib/isc/timer.c
lib/isc/unix/app.c
lib/isc/unix/include/isc/net.h
lib/isc/unix/net.c
lib/isc/unix/socket.c
lib/isc/unix/socket_p.h
lib/isccfg/namedconf.c

diff --git a/CHANGES b/CHANGES
index 9b6e02df4f3c7835279cee76e1ebacfe75ecc76b..c21b7ebc1a8063196c35241080fca2f4a23c3227 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+2384.  [security]      Additional support for query port randomization (change
+                       #2375) including performance improvement and port range
+                       specification.  [RT #17949, #18098]
+
 2383.  [bug]           named could double queries when they resulted in
                        SERVFAIL due to overkilling EDNS0 failure detection.
                        [RT #18182]
index edc10ece54882998276365ef7c4f1e1e2453cafa..cf921ae03d2637f5d5cf4ffaaf2b37a94fa454c7 100644 (file)
@@ -15,7 +15,7 @@
  - PERFORMANCE OF THIS SOFTWARE.
 -->
 
-<!-- $Id: bind9.xsl,v 1.13.130.4 2008/04/09 22:49:37 jinmei Exp $ -->
+<!-- $Id: bind9.xsl,v 1.13.130.5 2008/06/24 00:09:10 jinmei Exp $ -->
 
 <xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
@@ -86,7 +86,6 @@ td, th {
       </head>
       <body>
         <div class="header">Bind 9 Configuration and Statistics</div>
-
        <br/>
 
        <table>
index 80788f8c9d87c72ffc47581b69a51c06bd090b16..cca9791599e0bf5715b0ea4fb6c6579f41c2126f 100644 (file)
@@ -91,7 +91,6 @@ static char xslmsg[] =
        " </head>\n"
        " <body>\n"
        " <div class=\"header\">Bind 9 Configuration and Statistics</div>\n"
-       "\n"
        " <br/>\n"
        "\n"
        " <table>\n"
index e45759d64beb351fd2757d89f3b628b338b3b266..ef21da86cdb8041b590b4adf8419b172d36bb45c 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: server.c,v 1.495.10.14 2008/06/23 23:30:59 jinmei Exp $ */
+/* $Id: server.c,v 1.495.10.15 2008/06/24 00:09:10 jinmei Exp $ */
 
 /*! \file */
 
@@ -33,6 +33,7 @@
 #include <isc/httpd.h>
 #include <isc/lex.h>
 #include <isc/parseint.h>
+#include <isc/portset.h>
 #include <isc/print.h>
 #include <isc/resource.h>
 #include <isc/stdio.h>
@@ -538,13 +539,15 @@ mustbesecure(const cfg_obj_t *mbs, dns_resolver_t *resolver)
  */
 static isc_result_t
 get_view_querysource_dispatch(const cfg_obj_t **maps,
-                             int af, dns_dispatch_t **dispatchp)
+                             int af, dns_dispatch_t **dispatchp,
+                             isc_boolean_t is_firstview)
 {
        isc_result_t result;
        dns_dispatch_t *disp;
        isc_sockaddr_t sa;
        unsigned int attrs, attrmask;
        const cfg_obj_t *obj = NULL;
+       unsigned int maxdispatchbuffers;
 
        /*
         * Make compiler happy.
@@ -596,12 +599,18 @@ get_view_querysource_dispatch(const cfg_obj_t **maps,
                attrs |= DNS_DISPATCHATTR_IPV6;
                break;
        }
-
-       if (isc_sockaddr_getport(&sa) != 0) {
+       if (isc_sockaddr_getport(&sa) == 0) {
+               attrs |= DNS_DISPATCHATTR_EXCLUSIVE;
+               maxdispatchbuffers = 4096;
+       } else {
                INSIST(obj != NULL);
-               cfg_obj_log(obj, ns_g_lctx, ISC_LOG_INFO,
-                           "using specific query-source port suppresses port "
-                           "randomization and can be insecure.");
+               if (is_firstview) {
+                       cfg_obj_log(obj, ns_g_lctx, ISC_LOG_INFO,
+                                   "using specific query-source port "
+                                   "suppresses port randomization and can be "
+                                   "insecure.");
+               }
+               maxdispatchbuffers = 1000;
        }
 
        attrmask = 0;
@@ -613,7 +622,7 @@ get_view_querysource_dispatch(const cfg_obj_t **maps,
        disp = NULL;
        result = dns_dispatch_getudp(ns_g_dispatchmgr, ns_g_socketmgr,
                                     ns_g_taskmgr, &sa, 4096,
-                                    1024, 32768, 16411, 16433,
+                                    maxdispatchbuffers, 32768, 16411, 16433,
                                     attrs, attrmask, &disp);
        if (result != ISC_R_SUCCESS) {
                isc_sockaddr_t any;
@@ -1279,8 +1288,12 @@ configure_view(dns_view_t *view, const cfg_obj_t *config,
         *
         * XXXRTH  Hardwired number of tasks.
         */
-       CHECK(get_view_querysource_dispatch(maps, AF_INET, &dispatch4));
-       CHECK(get_view_querysource_dispatch(maps, AF_INET6, &dispatch6));
+       CHECK(get_view_querysource_dispatch(maps, AF_INET, &dispatch4,
+                                           ISC_TF(ISC_LIST_PREV(view, link)
+                                                  == NULL)));
+       CHECK(get_view_querysource_dispatch(maps, AF_INET6, &dispatch6,
+                                           ISC_TF(ISC_LIST_PREV(view, link)
+                                                  == NULL)));
        if (dispatch4 == NULL && dispatch6 == NULL) {
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "unable to obtain neither an IPv4 nor"
@@ -1288,50 +1301,6 @@ configure_view(dns_view_t *view, const cfg_obj_t *config,
                result = ISC_R_UNEXPECTED;
                goto cleanup;
        }
-
-       obj = NULL;
-       (void)ns_config_get(maps, "use-queryport-pool", &obj);
-       if (obj == NULL || cfg_obj_asboolean(obj)) {
-               isc_sockaddr_t sa;
-               isc_boolean_t logit4 = ISC_FALSE, logit6 = ISC_FALSE;
-
-               resopts |= (DNS_RESOLVER_USEDISPATCHPOOL4 |
-                           DNS_RESOLVER_USEDISPATCHPOOL6);
-
-               /* Check consistency with query-source(-v6) */
-               if (dispatch4 == NULL)
-                       resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL4;
-               else {
-                       result = dns_dispatch_getlocaladdress(dispatch4, &sa);
-                       INSIST(result == ISC_R_SUCCESS);
-                       if (isc_sockaddr_getport(&sa) != 0) {
-                               logit4 = ISC_TRUE;
-                               resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL4;
-                       }
-               }
-
-               if (dispatch6 == NULL)
-                       resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL6;
-               else {
-                       result = dns_dispatch_getlocaladdress(dispatch6, &sa);
-                       INSIST(result == ISC_R_SUCCESS);
-                       if (isc_sockaddr_getport(&sa) != 0) {
-                               logit6 = ISC_TRUE;
-                               resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL6;
-                       }
-               }
-               if (logit4 && obj != NULL)
-                       cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR,
-                                   "specific query-source port "
-                                   "cannot coexist with queryport-pool. "
-                                   "(Pool disabled)");
-               if (logit6 && obj != NULL)
-                       cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR,
-                                   "specific query-source-v6 port "
-                                   "cannot coexist with queryport-pool. "
-                                   "(Pool disabled)");
-       }
-
        CHECK(dns_view_createresolver(view, ns_g_taskmgr, 31,
                                      ns_g_socketmgr, ns_g_timermgr,
                                      resopts, ns_g_dispatchmgr,
@@ -2848,24 +2817,41 @@ set_limits(const cfg_obj_t **maps) {
        SETLIMIT("files", openfiles, "open files");
 }
 
-static isc_result_t
-portlist_fromconf(dns_portlist_t *portlist, unsigned int family,
-                 const cfg_obj_t *ports)
+static void
+portset_fromconf(isc_portset_t *portset, const cfg_obj_t *ports,
+                isc_boolean_t positive)
 {
        const cfg_listelt_t *element;
-       isc_result_t result = ISC_R_SUCCESS;
 
        for (element = cfg_list_first(ports);
             element != NULL;
             element = cfg_list_next(element)) {
                const cfg_obj_t *obj = cfg_listelt_value(element);
-               in_port_t port = (in_port_t)cfg_obj_asuint32(obj);
 
-               result = dns_portlist_add(portlist, family, port);
-               if (result != ISC_R_SUCCESS)
-                       break;
+               if (cfg_obj_isuint32(obj)) {
+                       in_port_t port = (in_port_t)cfg_obj_asuint32(obj);
+
+                       if (positive)
+                               isc_portset_add(portset, port);
+                       else
+                               isc_portset_remove(portset, port);
+               } else {
+                       const cfg_obj_t *obj_loport, *obj_hiport;
+                       in_port_t loport, hiport;
+
+                       obj_loport = cfg_tuple_get(obj, "loport");
+                       loport = (in_port_t)cfg_obj_asuint32(obj_loport);
+                       obj_hiport = cfg_tuple_get(obj, "hiport");
+                       hiport = (in_port_t)cfg_obj_asuint32(obj_hiport);
+
+                       if (positive)
+                               isc_portset_addrange(portset, loport, hiport);
+                       else {
+                               isc_portset_removerange(portset, loport,
+                                                       hiport);
+                       }
+               }
        }
-       return (result);
 }
 
 static isc_result_t
@@ -2904,7 +2890,7 @@ load_configuration(const char *filename, ns_server_t *server,
        const cfg_obj_t *options;
        const cfg_obj_t *views;
        const cfg_obj_t *obj;
-       const cfg_obj_t *v4ports, *v6ports;
+       const cfg_obj_t *usev4ports, *avoidv4ports, *usev6ports, *avoidv6ports;
        const cfg_obj_t *maps[3];
        const cfg_obj_t *builtin_views;
        const cfg_listelt_t *element;
@@ -2916,7 +2902,9 @@ load_configuration(const char *filename, ns_server_t *server,
        isc_uint32_t interface_interval;
        isc_uint32_t heartbeat_interval;
        isc_uint32_t udpsize;
-       in_port_t listen_port;
+       in_port_t listen_port, udpport_low, udpport_high;
+       isc_portset_t *v4portset = NULL;
+       isc_portset_t *v6portset = NULL;
        int i;
 
        cfg_aclconfctx_init(&aclconfctx);
@@ -3033,24 +3021,64 @@ load_configuration(const char *filename, ns_server_t *server,
        CHECKM(ns_statschannels_configure(ns_g_server, config, &aclconfctx),
               "configuring statistics server(s)");
 
-       v4ports = NULL;
-       v6ports = NULL;
-       (void)ns_config_get(maps, "avoid-v4-udp-ports", &v4ports);
-       (void)ns_config_get(maps, "avoid-v6-udp-ports", &v6ports);
-       if (v4ports != NULL || v6ports != NULL) {
-               dns_portlist_t *portlist = NULL;
-               result = dns_portlist_create(ns_g_mctx, &portlist);
-               if (result == ISC_R_SUCCESS && v4ports != NULL)
-                       result = portlist_fromconf(portlist, AF_INET, v4ports);
-               if (result == ISC_R_SUCCESS && v6ports != NULL)
-                       portlist_fromconf(portlist, AF_INET6, v6ports);
-               if (result == ISC_R_SUCCESS)
-                       dns_dispatchmgr_setblackportlist(ns_g_dispatchmgr, portlist);
-               if (portlist != NULL)
-                       dns_portlist_detach(&portlist);
-               CHECK(result);
-       } else
-               dns_dispatchmgr_setblackportlist(ns_g_dispatchmgr, NULL);
+       /*
+        * Configure sets of UDP query source ports.
+        */
+       CHECKM(isc_portset_create(ns_g_mctx, &v4portset),
+              "creating UDP port set");
+       CHECKM(isc_portset_create(ns_g_mctx, &v6portset),
+              "creating UDP port set");
+
+       usev4ports = NULL;
+       usev6ports = NULL;
+       avoidv4ports = NULL;
+       avoidv6ports = NULL;
+
+       (void)ns_config_get(maps, "use-v4-udp-ports", &usev4ports);
+       if (usev4ports != NULL)
+               portset_fromconf(v4portset, usev4ports, ISC_TRUE);
+       else {
+               CHECKM(isc_net_getudpportrange(AF_INET, &udpport_low,
+                                              &udpport_high),
+                      "get the default UDP/IPv4 port range");
+               if (udpport_low == udpport_high)
+                       isc_portset_add(v4portset, udpport_low);
+               else {
+                       isc_portset_addrange(v4portset, udpport_low,
+                                            udpport_high);
+               }
+               isc_log_write(ns_g_lctx, NS_LOGCATEGORY_GENERAL,
+                             NS_LOGMODULE_SERVER, ISC_LOG_INFO,
+                             "using default UDP/IPv4 port range: [%d, %d]",
+                             udpport_low, udpport_high);
+       }
+       (void)ns_config_get(maps, "avoid-v4-udp-ports", &avoidv4ports);
+       if (avoidv4ports != NULL)
+               portset_fromconf(v4portset, avoidv4ports, ISC_FALSE);
+
+       (void)ns_config_get(maps, "use-v6-udp-ports", &usev6ports);
+       if (usev6ports != NULL)
+               portset_fromconf(v6portset, usev6ports, ISC_TRUE);
+       else {
+               CHECKM(isc_net_getudpportrange(AF_INET6, &udpport_low,
+                                              &udpport_high),
+                      "get the default UDP/IPv6 port range");
+               if (udpport_low == udpport_high)
+                       isc_portset_add(v6portset, udpport_low);
+               else {
+                       isc_portset_addrange(v6portset, udpport_low,
+                                            udpport_high);
+               }
+               isc_log_write(ns_g_lctx, NS_LOGCATEGORY_GENERAL,
+                             NS_LOGMODULE_SERVER, ISC_LOG_INFO,
+                             "using default UDP/IPv6 port range: [%d, %d]",
+                             udpport_low, udpport_high);
+       }
+       (void)ns_config_get(maps, "avoid-v6-udp-ports", &avoidv6ports);
+       if (avoidv6ports != NULL)
+               portset_fromconf(v6portset, avoidv6ports, ISC_FALSE);
+
+       dns_dispatchmgr_setavailports(ns_g_dispatchmgr, v4portset, v6portset);
 
        /*
         * Set the EDNS UDP size when we don't match a view.
@@ -3531,6 +3559,12 @@ load_configuration(const char *filename, ns_server_t *server,
        result = ISC_R_SUCCESS;
 
  cleanup:
+       if (v4portset != NULL)
+               isc_portset_destroy(ns_g_mctx, &v4portset);
+
+       if (v6portset != NULL)
+               isc_portset_destroy(ns_g_mctx, &v6portset);
+
        cfg_aclconfctx_destroy(&aclconfctx);
 
        if (parser != NULL) {
index 4ffa8d7bbd63f8e108952b460d10ea23377a7e10..5ee7a599a4d47f76db9cd3817f66553c4bc971ce 100644 (file)
@@ -18,7 +18,7 @@ AC_DIVERT_PUSH(1)dnl
 esyscmd([sed "s/^/# /" COPYRIGHT])dnl
 AC_DIVERT_POP()dnl
 
-AC_REVISION($Revision: 1.432.60.9 $)
+AC_REVISION($Revision: 1.432.60.10 $)
 
 AC_INIT(lib/dns/name.c)
 AC_PREREQ(2.59)
@@ -317,6 +317,43 @@ lifconf.lifc_len = 0;
                ISC_PLATFORM_HAVELIFCONF="#undef ISC_PLATFORM_HAVELIFCONF"])
 AC_SUBST(ISC_PLATFORM_HAVELIFCONF)
 
+#
+# check if we have kqueue
+#
+AC_CHECK_FUNC(kqueue, ac_cv_have_kqueue=yes, ac_cv_have_kqueue=no)
+case $ac_cv_have_kqueue in
+yes)
+       ISC_PLATFORM_HAVEKQUEUE="#define ISC_PLATFORM_HAVEKQUEUE 1"
+       ;;
+*)
+       ISC_PLATFORM_HAVEKQUEUE="#undef ISC_PLATFORM_HAVEKQUEUE"
+       ;;
+esac
+AC_SUBST(ISC_PLATFORM_HAVEKQUEUE)
+
+#
+# check if we have epoll
+#
+AC_CHECK_FUNC(epoll_create, ac_cv_have_epoll=yes, ac_cv_have_epoll=no)
+case $ac_cv_have_epoll in
+yes)
+       ISC_PLATFORM_HAVEEPOLL="#define ISC_PLATFORM_HAVEEPOLL 1"
+       ;;
+*)
+       ISC_PLATFORM_HAVEEPOLL="#undef ISC_PLATFORM_HAVEEPOLL"
+       ;;
+esac
+AC_SUBST(ISC_PLATFORM_HAVEEPOLL)
+
+#
+# check if we support /dev/poll
+#
+AC_CHECK_HEADERS(sys/devpoll.h,
+ISC_PLATFORM_HAVEDEVPOLL="#define ISC_PLATFORM_HAVEDEVPOLL 1"
+,
+ISC_PLATFORM_HAVEDEVPOLL="#undef ISC_PLATFORM_HAVEDEVPOLL"
+)
+AC_SUBST(ISC_PLATFORM_HAVEDEVPOLL)
 
 #
 # check if we need to #include sys/select.h explicitly
index 99a18a430296539e9f48d8514c42ebe73c2ee541..c7e754845cb6563377826abf20f55745fdacc443 100644 (file)
@@ -18,7 +18,7 @@
  - PERFORMANCE OF THIS SOFTWARE.
 -->
 
-<!-- File: $Id: Bv9ARM-book.xml,v 1.340.24.12 2008/06/17 06:44:09 marka Exp $ -->
+<!-- File: $Id: Bv9ARM-book.xml,v 1.340.24.13 2008/06/24 00:09:11 jinmei Exp $ -->
 <book xmlns:xi="http://www.w3.org/2001/XInclude">
   <title>BIND 9 Administrator Reference Manual</title>
 
@@ -2934,6 +2934,33 @@ $ORIGIN 0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa.
                 </para>
               </entry>
             </row>
+            <row rowsep="0">
+              <entry colname="1">
+                <para>
+                  <varname>port_list</varname>
+                </para>
+              </entry>
+              <entry colname="2">
+                <para>
+                 A list of <varname>ip_port</varname>s and/or port
+                 ranges.
+                 A port range is specified in the form of
+                 <userinput>range</userinput> followed by
+                 two <varname>ip_port</varname>s,
+                 <varname>port_low</varname> and
+                 <varname>port_high</varname>, which represents
+                 port numbers from <varname>port_low</varname> through
+                 <varname>port_high</varname>, inclusive.
+                 <varname>port_low</varname> must not be larger than
+                 <varname>port_high</varname>.
+                 For example,
+                 <userinput>range 1024 65535</userinput> represents
+                 ports from 1024 through 65535.
+                 In either case an asterisk (`*') character is not
+                 allowed as a valid <varname>ip_port</varname>.
+                </para>
+              </entry>
+            </row>
             <row rowsep="0">
               <entry colname="1">
                 <para>
@@ -4492,7 +4519,9 @@ category notify { null; };
     <optional> try-tcp-refresh <replaceable>yes_or_no</replaceable>; </optional>
     <optional> allow-v6-synthesis { <replaceable>address_match_list</replaceable> }; </optional>
     <optional> blackhole { <replaceable>address_match_list</replaceable> }; </optional>
+    <optional> use-v4-udp-ports { <replaceable>port_list</replaceable> }; </optional>
     <optional> avoid-v4-udp-ports { <replaceable>port_list</replaceable> }; </optional>
+    <optional> use-v6-udp-ports { <replaceable>port_list</replaceable> }; </optional>
     <optional> avoid-v6-udp-ports { <replaceable>port_list</replaceable> }; </optional>
     <optional> listen-on <optional> port <replaceable>ip_port</replaceable> </optional> { <replaceable>address_match_list</replaceable> }; </optional>
     <optional> listen-on-v6 <optional> port <replaceable>ip_port</replaceable> </optional> { <replaceable>address_match_list</replaceable> }; </optional>
@@ -6242,29 +6271,98 @@ listen-on-v6 port 1234 { !2001:db8::/32; any; };
             If <command>address</command> is <command>*</command> (asterisk) or is omitted,
             a wildcard IP address (<command>INADDR_ANY</command>)
             will be used.
+         </para>
+
+         <para>
             If <command>port</command> is <command>*</command> or is omitted,
-           a random unprivileged port number is picked up and will be
-            used for each query.
-           Previously, the <command>use-queryport-pool</command> was provided
-           to support a pool of such random ports, but this option is now
-           obsolete because reusing the same ports in the pool is not
-            sufficiently secure.
+           a random port number from a pre-configured
+           range is picked up and will be used for each query.
+           The port range(s) are those specified in
+           the <command>use-v4-udp-ports</command> (for IPv4)
+            and <command>use-v6-udp-ports</command> (for IPv6)
+           options, excluding the ranges specified in
+           the <command>avoid-v4-udp-ports</command>
+            and <command>avoid-v6-udp-ports</command> options, respectively.
+         </para>
+
+          <para>
+           The defaults of the <command>query-source</command> and
+           <command>query-source-v6</command> options
+           are:
+          </para>
+
+<programlisting>query-source address * port *;
+query-source-v6 address * port *;
+</programlisting>
+
+          <para>
+           If <command>use-v4-udp-ports</command> or
+            <command>use-v6-udp-ports</command> is unspecified,
+           <command>named</command> will check if the operating
+           system provides a programming interface to retrieve the
+           system's default range for ephemeral ports.
+           If such an interface is available,
+           <command>named</command> will use the corresponding system
+           default range; otherwise, it will use its own defaults:
+         </para>
+
+<programlisting>use-v4-udp-ports { range 1024 65535; };
+use-v6-udp-ports { range 1024 65535; };
+</programlisting>
+
+          <para>
+           Note: make sure the ranges are sufficiently large for
+           security.  A desirable size depends on various parameters,
+           but we generally recommend it contain at least 16384 ports
+           (14 bits of entropy).
+           Note also that the system's default range when used may be
+           too small for this purpose, and that the range may even be
+           changed while <command>named</command> is running; the new
+           range will automatically be applied when <command>named</command>
+           is reloaded.
+           It is encouraged to
+           configure <command>use-v4-udp-ports</command> and
+            <command>use-v6-udp-ports</command> explicitly so that the
+            ranges are sufficiently large and are reasonably
+            independent from the ranges used by other applications.
+          </para>
+
+         <para>
+           Note: the operational configuration
+           where <command>named</command> runs may prohibit the use
+           of some ports.  For example, UNIX systems will not allow
+           <command>named</command> running without a root privilege
+           to use ports less than 1024.
+           If such ports are included in the specified (or detected)
+           set of query ports, the corresponding query attempts will
+           fail, resulting in resolution failures or delay.
+           It is therefore important to configure the set of ports
+           that can be safely used in the expected operational environment.
+         </para>
+
+          <para>
+           The defaults of the <command>avoid-v4-udp-ports</command> and
+           <command>avoid-v6-udp-ports</command> options
+           are:
+          </para>
+
+<programlisting>avoid-v4-udp-ports {};
+avoid-v6-udp-ports {};
+</programlisting>
+
+         <para>
+           Note: BIND 9.5.0 introduced
+           the <command>use-queryport-pool</command> 
+           option to support a pool of such random ports, but this
+           option is now obsolete because reusing the same ports in
+           the pool may not be sufficiently secure.
            For the same reason, it is generally strongly discouraged to
             specify a particular port for the
            <command>query-source</command> or
            <command>query-source-v6</command> options;
            it implicitly disables the use of randomized port numbers.
-           The <command>avoid-v4-udp-ports</command>
-            and <command>avoid-v6-udp-ports</command> options can be used
-            to prevent named
-            from selecting certain ports.
-           The defaults are:
           </para>
 
-<programlisting>query-source address * port *;
-query-source-v6 address * port *;
-</programlisting>
-
           <variablelist>
             <varlistentry>
               <term><command>use-queryport-pool</command></term>
@@ -6638,17 +6736,48 @@ query-source-v6 address * port *;
         </sect3>
 
         <sect3>
-          <title>Bad UDP Port Lists</title>
-          <para><command>avoid-v4-udp-ports</command>
-           and <command>avoid-v6-udp-ports</command> specify a list
-            of IPv4 and IPv6 UDP ports that will not be used as system
-            assigned source ports for UDP sockets.  These lists
-            prevent named from choosing as its random source port a
-            port that is blocked by your firewall.  If a query went
-            out with such a source port, the answer would not get by
-            the firewall and the name server would have to query
-            again.
+          <title>UDP Port Lists</title>
+          <para>
+           <command>use-v4-udp-ports</command>,
+           <command>avoid-v4-udp-ports</command>,
+           <command>use-v6-udp-ports</command>, and
+           <command>avoid-v6-udp-ports</command>
+           specify a list of IPv4 and IPv6 UDP ports that will be
+           used or not used as source ports for UDP messages.
+           See <xref linkend="query_address"/> about how the
+           available ports are determined.
+           For example, with the following configuration
           </para>
+
+<programlisting>
+use-v6-udp-ports { range 32768 65535; };
+avoid-v6-udp-ports { 40000; range 50000 60000; };
+</programlisting>
+
+          <para>
+            UDP ports of IPv6 messages sent
+            from <command>named</command> will be in one
+            of the following ranges: 32768 to 39999, 40001 to 49999,
+            and 60001 to 65535.
+          </para>
+
+          <para>
+            <command>avoid-v4-udp-ports</command> and
+            <command>avoid-v6-udp-ports</command> can be used
+             to prevent <command>named</command> from choosing as its random source port a
+             port that is blocked by your firewall or a port that is
+             used by other applications;
+            if a query went out with a source port blocked by a
+             firewall, the
+            answer would not get by the firewall and the name server would
+             have to query again.
+            Note: the desired range can also be represented only with
+            <command>use-v4-udp-ports</command> and
+            <command>use-v6-udp-ports</command>, and the
+            <command>avoid-</command> options are redundant in that
+            sense; they are provided for backward compatibility and
+            to possibly simplify the port specification.
+          </para>
         </sect3>
 
         <sect3>
index 7077f0d9a3714291ddf55351f220220cf0004fdc..bbde762dde45435b4759b593863b6532348627c9 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: dispatch.c,v 1.137.128.4 2008/05/27 22:36:11 each Exp $ */
+/* $Id: dispatch.c,v 1.137.128.5 2008/06/24 00:09:11 jinmei Exp $ */
 
 /*! \file */
 
 #include <stdlib.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <stdlib.h>
 
 #include <isc/entropy.h>
 #include <isc/mem.h>
 #include <isc/mutex.h>
+#include <isc/portset.h>
 #include <isc/print.h>
 #include <isc/random.h>
 #include <isc/string.h>
 
 typedef ISC_LIST(dns_dispentry_t)      dns_displist_t;
 
-/* transaction ID */
-typedef struct dns_qid {
-       unsigned int    magic;
-       unsigned int    qid_nbuckets;   /*%< hash table size */
-       unsigned int    qid_increment;  /*%< id increment on collision */
-       isc_mutex_t     lock;
-       dns_displist_t  *qid_table;     /*%< the table itself */
-} dns_qid_t;
-
 /* ARC4 Random generator state */
 typedef struct arc4ctx {
        isc_uint8_t     i;
        isc_uint8_t     j;
        isc_uint8_t     s[256];
        int             count;
+       isc_entropy_t   *entropy;       /*%< entropy source for ARC4 */
+       isc_mutex_t     *lock;
 } arc4ctx_t;
 
+typedef struct dns_qid {
+       unsigned int    magic;
+       unsigned int    qid_nbuckets;   /*%< hash table size */
+       unsigned int    qid_increment;  /*%< id increment on collision */
+       isc_mutex_t     lock;
+       dns_displist_t  *qid_table;     /*%< the table itself */
+       dns_displist_t  *addr_table;    /*%< address/port table */
+} dns_qid_t;
+
 struct dns_dispatchmgr {
        /* Unlocked. */
        unsigned int                    magic;
@@ -71,6 +75,7 @@ struct dns_dispatchmgr {
        dns_acl_t                      *blackhole;
        dns_portlist_t                 *portlist;
        dns_stats_t                    *stats;
+       isc_entropy_t                  *entropy; /*%< entropy source */
 
        /* Locked by "lock". */
        isc_mutex_t                     lock;
@@ -94,8 +99,27 @@ struct dns_dispatchmgr {
        isc_mempool_t                  *rpool;  /*%< memory pool for replies */
        isc_mempool_t                  *dpool;  /*%< dispatch allocations */
        isc_mempool_t                  *bpool;  /*%< memory pool for buffers */
-
-       isc_entropy_t                  *entropy; /*%< entropy source */
+       isc_mempool_t                  *spool;  /*%< memory pool for dispsocs */
+
+       /*%
+        * Locked by qid->lock if qid exists; otherwise, can be used without
+        * being locked.
+        * Memory footprint considerations: this is a simple implementation of
+        * available ports, i.e., an ordered array of the actual port numbers.
+        * This will require about 256KB of memory in the worst case (128KB for
+        * each of IPv4 and IPv6).  We could reduce it by representing it in a
+        * more sophisticated way, such as a list (or array) of ranges that are
+        * searched to identify a specific port.  Our decision here is the saved
+        * memory isn't worth the implementation complexity, considering the
+        * fact that the whole BIND9 process (which is mainly named) already
+        * requires a pretty large memory footprint.  We may, however, have to
+        * revisit the decision when we want to use it as a separate module for
+        * an environment where memory requirements are more severe.
+        */
+       in_port_t       *v4ports;       /*%< available ports for IPv4 */
+       unsigned int    nv4ports;       /*%< # of available ports for IPv4 */
+       in_port_t       *v6ports;       /*%< available ports for IPv6 */
+       unsigned int    nv6ports;       /*%< # of available ports for IPv6 */
 };
 
 #define MGR_SHUTTINGDOWN               0x00000001U
@@ -103,28 +127,76 @@ struct dns_dispatchmgr {
 
 #define IS_PRIVATE(d)  (((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
 
+typedef struct dispsocket dispsocket_t;
+
 struct dns_dispentry {
        unsigned int                    magic;
        dns_dispatch_t                 *disp;
        dns_messageid_t                 id;
        in_port_t                       port;
        unsigned int                    bucket;
+       unsigned int                    abucket;
        isc_sockaddr_t                  host;
        isc_task_t                     *task;
        isc_taskaction_t                action;
        void                           *arg;
        isc_boolean_t                   item_out;
+       dispsocket_t                    *dispsocket;
        ISC_LIST(dns_dispatchevent_t)   items;
        ISC_LINK(dns_dispentry_t)       link;
+       ISC_LINK(dns_dispentry_t)       alink;
+};
+
+/*%
+ * Maximum number of dispatch sockets that can be pooled for reuse.  The
+ * appropriate value may vary, but experiments have shown a busy caching server
+ * may need more than 1000 sockets concurrently opened.  The maximum allowable
+ * number of dispatch sockets (per manager) will be set to twice this
+ * value.
+ */
+#ifndef DNS_DISPATCH_POOLSOCKS
+#define DNS_DISPATCH_POOLSOCKS                 2048
+#endif
+
+/*%
+ * Quota to control the number of dispatch sockets.  If a dispatch has more
+ * than the quota of sockets, new queries will purge oldest ones, so that
+ * a massive number of outstanding queries won't prevent subsequent queries
+ * (especially if the older ones take longer time and result in timeout).
+ */
+#ifndef DNS_DISPATCH_SOCKSQUOTA
+#define DNS_DISPATCH_SOCKSQUOTA                        3072
+#endif
+
+struct dispsocket {
+       unsigned int                    magic;
+       isc_socket_t                    *socket;
+       dns_dispatch_t                  *disp;
+       dns_dispentry_t                 *resp;
+       isc_task_t                      *task;
+       ISC_LINK(dispsocket_t)          link;
 };
 
 #define INVALID_BUCKET         (0xffffdead)
 
+/*%
+ * Number of tasks for each dispatch that use separate sockets for different
+ * transactions.  This must be a power of 2 as it will divide 32-bit numbers
+ * to get a uniformly random task selection.  See get_dispsocket().
+ */
+#define MAX_INTERNAL_TASKS     64
+
 struct dns_dispatch {
        /* Unlocked. */
        unsigned int            magic;          /*%< magic */
        dns_dispatchmgr_t      *mgr;            /*%< dispatch manager */
-       isc_task_t             *task;           /*%< internal task */
+       int                     ntasks;
+       /*%
+        * internal task buckets.  We use multiple tasks to distribute various
+        * socket events well when using separate dispatch sockets.  We use the
+        * 1st task (task[0]) for internal control events.
+        */
+       isc_task_t             *task[MAX_INTERNAL_TASKS];
        isc_socket_t           *socket;         /*%< isc socket attached to */
        isc_sockaddr_t          local;          /*%< local address */
        in_port_t               localport;      /*%< local UDP port */
@@ -146,10 +218,14 @@ struct dns_dispatch {
                                tcpmsg_valid : 1,
                                recv_pending : 1; /*%< is a recv() pending? */
        isc_result_t            shutdown_why;
+       ISC_LIST(dispsocket_t)  activesockets;
+       ISC_LIST(dispsocket_t)  inactivesockets;
+       unsigned int            nsockets;
        unsigned int            requests;       /*%< how many requests we have */
        unsigned int            tcpbuffers;     /*%< allocated buffers */
        dns_tcpmsg_t            tcpmsg;         /*%< for tcp streams */
        dns_qid_t               *qid;
+       arc4ctx_t               arc4ctx;        /*%< for QID/UDP port num */
 };
 
 #define QID_MAGIC              ISC_MAGIC('Q', 'i', 'd', ' ')
@@ -158,6 +234,9 @@ struct dns_dispatch {
 #define RESPONSE_MAGIC         ISC_MAGIC('D', 'r', 's', 'p')
 #define VALID_RESPONSE(e)      ISC_MAGIC_VALID((e), RESPONSE_MAGIC)
 
+#define DISPSOCK_MAGIC         ISC_MAGIC('D', 's', 'o', 'c')
+#define VALID_DISPSOCK(e)      ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)
+
 #define DISPATCH_MAGIC         ISC_MAGIC('D', 'i', 's', 'p')
 #define VALID_DISPATCH(e)      ISC_MAGIC_VALID((e), DISPATCH_MAGIC)
 
@@ -166,16 +245,34 @@ struct dns_dispatch {
 
 #define DNS_QID(disp) ((disp)->socktype == isc_sockettype_tcp) ? \
                       (disp)->qid : (disp)->mgr->qid
+#define DISP_ARC4CTX(disp) ((disp)->socktype == isc_sockettype_udp) ? \
+                       (&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx)
+
+/*%
+ * Locking a query port buffer is a bit tricky.  We access the buffer without
+ * locking until qid is created.  Technically, there is a possibility of race
+ * between the creation of qid and access to the port buffer; in practice,
+ * however, this should be safe because qid isn't created until the first
+ * dispatch is created and there should be no contending situation until then.
+ */
+#define PORTBUFLOCK(mgr) if ((mgr)->qid != NULL) LOCK(&((mgr)->qid->lock))
+#define PORTBUFUNLOCK(mgr) if ((mgr)->qid != NULL) UNLOCK((&(mgr)->qid->lock))
+
 /*
  * Statics.
  */
-static dns_dispentry_t *bucket_search(dns_qid_t *, isc_sockaddr_t *,
-                                     dns_messageid_t, in_port_t, unsigned int);
+static dns_dispentry_t *bucket_search(dns_qid_t *, dns_displist_t *,
+                                     isc_sockaddr_t *, dns_messageid_t,
+                                     in_port_t, unsigned int, isc_boolean_t);
 static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
 static void destroy_disp(isc_task_t *task, isc_event_t *event);
-static void udp_recv(isc_task_t *, isc_event_t *);
+static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
+static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
+static void udp_exrecv(isc_task_t *, isc_event_t *);
+static void udp_shrecv(isc_task_t *, isc_event_t *);
+static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
 static void tcp_recv(isc_task_t *, isc_event_t *);
-static void startrecv(dns_dispatch_t *);
+static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
 static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
                             in_port_t);
 static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
@@ -187,6 +284,12 @@ static dns_dispentry_t *linear_first(dns_qid_t *disp);
 static dns_dispentry_t *linear_next(dns_qid_t *disp,
                                    dns_dispentry_t *resp);
 static void dispatch_free(dns_dispatch_t **dispp);
+static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
+                                 dns_dispatch_t *disp,
+                                 isc_socketmgr_t *sockmgr,
+                                 isc_sockaddr_t *localaddr,
+                                 isc_socket_t **sockp,
+                                 unsigned int maxtry);
 static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
                                       isc_socketmgr_t *sockmgr,
                                       isc_taskmgr_t *taskmgr,
@@ -197,8 +300,13 @@ static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
 static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
 static void destroy_mgr(dns_dispatchmgr_t **mgrp);
 static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
-                                unsigned int increment, dns_qid_t **qidp);
+                                unsigned int increment, dns_qid_t **qidp,
+                                isc_boolean_t needaddrtable);
 static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
+static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
+                               isc_socket_t **sockp);
+static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
+                                  isc_sockaddr_t *sockaddrp);
 
 #define LVL(x) ISC_LOG_DEBUG(x)
 
@@ -278,17 +386,38 @@ request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
        }
 }
 
-/*
- * ARC4 random number generator obtained from OpenBSD
+/*%
+ * ARC4 random number generator derived from OpenBSD.
+ * Only dispatch_arc4random() and dispatch_arc4uniformrandom() are expected
+ * to be called from general dispatch routines; the rest of them are subroutines
+ * for these two.
+ *
+ * The original copyright follows:
+ * Copyright (c) 1996, David Mazieres <dm@uun.org>
+ * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 static void
-dispatch_arc4init(arc4ctx_t *actx) {
+dispatch_arc4init(arc4ctx_t *actx, isc_entropy_t *entropy, isc_mutex_t *lock) {
        int n;
        for (n = 0; n < 256; n++)
                actx->s[n] = n;
        actx->i = 0;
        actx->j = 0;
        actx->count = 0;
+       actx->entropy = entropy; /* don't have to attach */
+       actx->lock = lock;
 }
 
 static void
@@ -332,7 +461,7 @@ dispatch_arc4get16(arc4ctx_t *actx) {
 }
 
 static void
-dispatch_arc4stir(dns_dispatchmgr_t *mgr) {
+dispatch_arc4stir(arc4ctx_t *actx) {
        int i;
        union {
                unsigned char rnd[128];
@@ -340,51 +469,55 @@ dispatch_arc4stir(dns_dispatchmgr_t *mgr) {
        } rnd;
        isc_result_t result;
 
-       if (mgr->entropy != NULL) {
+       if (actx->entropy != NULL) {
                /*
                 * We accept any quality of random data to avoid blocking.
                 */
-               result = isc_entropy_getdata(mgr->entropy, rnd.rnd,
+               result = isc_entropy_getdata(actx->entropy, rnd.rnd,
                                             sizeof(rnd), NULL, 0);
                RUNTIME_CHECK(result == ISC_R_SUCCESS);
        } else {
                for (i = 0; i < 32; i++)
                        isc_random_get(&rnd.rnd32[i]);
        }
-       dispatch_arc4addrandom(&mgr->arc4ctx, rnd.rnd, sizeof(rnd.rnd));
+       dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));
 
        /*
         * Discard early keystream, as per recommendations in:
         * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
         */
        for (i = 0; i < 256; i++)
-               (void)dispatch_arc4get8(&mgr->arc4ctx);
+               (void)dispatch_arc4get8(actx);
 
        /*
         * Derived from OpenBSD's implementation.  The rationale is not clear,
         * but should be conservative enough in safety, and reasonably large
         * for efficiency.
         */
-       mgr->arc4ctx.count = 1600000;
+       actx->count = 1600000;
 }
 
 static isc_uint16_t
-dispatch_arc4random(dns_dispatchmgr_t *mgr) {
+dispatch_arc4random(arc4ctx_t *actx) {
        isc_uint16_t result;
 
-       LOCK(&mgr->arc4_lock);
-       mgr->arc4ctx.count -= sizeof(isc_uint16_t);
-       if (mgr->arc4ctx.count <= 0)
-               dispatch_arc4stir(mgr);
-       result = dispatch_arc4get16(&mgr->arc4ctx);
-       UNLOCK(&mgr->arc4_lock);
+       if (actx->lock != NULL)
+               LOCK(actx->lock);
+
+       actx->count -= sizeof(isc_uint16_t);
+       if (actx->count <= 0)
+               dispatch_arc4stir(actx);
+       result = dispatch_arc4get16(actx);
+
+       if (actx->lock != NULL)
+               UNLOCK(actx->lock);
+
        return (result);
 }
 
 static isc_uint16_t
-dispatch_arc4uniformrandom(dns_dispatchmgr_t *mgr, isc_uint16_t upper_bound) {
+dispatch_arc4uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
        isc_uint16_t min, r;
-       /* The caller must hold the manager lock. */
 
        if (upper_bound < 2)
                return (0);
@@ -406,7 +539,7 @@ dispatch_arc4uniformrandom(dns_dispatchmgr_t *mgr, isc_uint16_t upper_bound) {
         * to re-roll.
         */
        for (;;) {
-               r = dispatch_arc4random(mgr);
+               r = dispatch_arc4random(actx);
                if (r >= min)
                        break;
        }
@@ -489,13 +622,15 @@ destroy_disp_ok(dns_dispatch_t *disp)
        if (disp->recv_pending != 0)
                return (ISC_FALSE);
 
+       if (!ISC_LIST_EMPTY(disp->activesockets))
+               return (ISC_FALSE);
+
        if (disp->shutting_down == 0)
                return (ISC_FALSE);
 
        return (ISC_TRUE);
 }
 
-
 /*
  * Called when refcount reaches 0 (and safe to destroy).
  *
@@ -507,6 +642,8 @@ destroy_disp(isc_task_t *task, isc_event_t *event) {
        dns_dispatch_t *disp;
        dns_dispatchmgr_t *mgr;
        isc_boolean_t killmgr;
+       dispsocket_t *dispsocket;
+       int i;
 
        INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);
 
@@ -520,10 +657,16 @@ destroy_disp(isc_task_t *task, isc_event_t *event) {
 
        dispatch_log(disp, LVL(90),
                     "shutting down; detaching from sock %p, task %p",
-                    disp->socket, disp->task);
+                    disp->socket, disp->task[0]); /* XXXX */
 
-       isc_socket_detach(&disp->socket);
-       isc_task_detach(&disp->task);
+       if (disp->socket != NULL)
+               isc_socket_detach(&disp->socket);
+       while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
+               ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
+               destroy_dispsocket(disp, &dispsocket);
+       }
+       for (i = 0; i < disp->ntasks; i++)
+               isc_task_detach(&disp->task[i]);
        isc_event_free(&event);
 
        dispatch_free(&disp);
@@ -534,23 +677,168 @@ destroy_disp(isc_task_t *task, isc_event_t *event) {
                destroy_mgr(&mgr);
 }
 
+/*%
+ * Make a new socket for a single dispatch with a random port number.
+ * The caller must hold the disp->lock and qid->lock.
+ */
+static isc_result_t
+get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+              isc_socketmgr_t *sockmgr, dns_qid_t *qid,
+              dispsocket_t **dispsockp, unsigned int *abucketp,
+              in_port_t *portp)
+{
+       int i;
+       isc_uint32_t r;
+       dns_dispatchmgr_t *mgr = disp->mgr;
+       isc_socket_t *sock = NULL;
+       isc_result_t result = ISC_R_FAILURE;
+       in_port_t port;
+       isc_sockaddr_t localaddr;
+       unsigned int abucket = 0;
+       dispsocket_t *dispsock;
+       unsigned int nports;
+       in_port_t *ports;
+
+       if (isc_sockaddr_pf(&disp->local) == AF_INET) {
+               nports = disp->mgr->nv4ports;
+               ports = disp->mgr->v4ports;
+       } else {
+               nports = disp->mgr->nv6ports;
+               ports = disp->mgr->v6ports;
+       }
+       if (nports == 0)
+               return (ISC_R_ADDRNOTAVAIL);
+
+       dispsock = ISC_LIST_HEAD(disp->inactivesockets);
+       if (dispsock != NULL) {
+               ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
+               sock = dispsock->socket;
+               dispsock->socket = NULL;
+       } else {
+               dispsock = isc_mempool_get(mgr->spool);
+               if (dispsock == NULL)
+                       return (ISC_R_NOMEMORY);
+
+               disp->nsockets++;
+               dispsock->socket = NULL;
+               dispsock->disp = disp;
+               dispsock->resp = NULL;
+               isc_random_get(&r);
+               dispsock->task = NULL;
+               isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
+               ISC_LINK_INIT(dispsock, link);
+               dispsock->magic = DISPSOCK_MAGIC;
+       }
+
+       /*
+        * Pick up a random UDP port and open a new socket with it.  Avoid
+        * choosing ports that share the same destination because it will be
+        * very likely to fail in bind(2) or connect(2).
+        */
+       localaddr = disp->local;
+       for (i = 0; i < 64; i++) {
+               port = ports[dispatch_arc4uniformrandom(DISP_ARC4CTX(disp),
+                                                       nports)];
+               isc_sockaddr_setport(&localaddr, port);
+
+               abucket = dns_hash(qid, dest, 0, port);
+               if (bucket_search(qid, qid->addr_table, dest, 0, port, abucket,
+                                 ISC_TRUE) != NULL) {
+                       continue;
+               }
+
+               result = open_socket(sockmgr, &localaddr, &sock);
+               if (result == ISC_R_SUCCESS || result != ISC_R_ADDRINUSE)
+                       break;
+       }
+
+       if (result == ISC_R_SUCCESS) {
+               dispsock->socket = sock;
+               *dispsockp = dispsock;
+               *abucketp = abucket;
+               *portp = port;
+       } else {
+               /*
+                * We could keep it in the inactive list, but since this should
+                * be an exceptional case and might be resource shortage, we'd
+                * rather destroy it.
+                */
+               if (sock != NULL)
+                       isc_socket_detach(&sock);
+               destroy_dispsocket(disp, &dispsock);
+       }
+
+       return (result);
+}
+
+/*%
+ * Destroy a dedicated dispatch socket.
+ */
+static void
+destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
+       dispsocket_t *dispsock;
+
+       /*
+        * The dispatch must be locked.
+        */
+
+       REQUIRE(dispsockp != NULL && *dispsockp != NULL);
+       dispsock = *dispsockp;
+       REQUIRE(!ISC_LINK_LINKED(dispsock, link));
+
+       disp->nsockets--;
+       dispsock->magic = 0;
+       if (dispsock->socket != NULL)
+               isc_socket_detach(&dispsock->socket);
+       if (dispsock->task != NULL)
+               isc_task_detach(&dispsock->task);
+       isc_mempool_put(disp->mgr->spool, dispsock);
+
+       *dispsockp = NULL;
+}
+
+/*%
+ * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
+ * future reuse unless the number of sockets exceeds DNS_DISPATCH_POOLSOCKS.
+ */
+static void
+deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
+       /*
+        * The dispatch must be locked.
+        */
+       ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
+       if (dispsock->resp != NULL) {
+               INSIST(dispsock->resp->dispsocket == dispsock);
+               dispsock->resp->dispsocket = NULL;
+       }
+
+       if (disp->nsockets > DNS_DISPATCH_POOLSOCKS)
+               destroy_dispsocket(disp, &dispsock);
+       else {
+               isc_socket_close(dispsock->socket);
+               ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
+       }
+}
 
 /*
- * Find an entry for query ID 'id' and socket address 'dest' in 'qid'.
+ * Find an entry for query ID 'id', socket address 'dest', and port number
+ * 'port' in 'table'.
  * Return NULL if no such entry exists.
  */
 static dns_dispentry_t *
-bucket_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
-             in_port_t port, unsigned int bucket)
+bucket_search(dns_qid_t *qid, dns_displist_t *table, isc_sockaddr_t *dest,
+             dns_messageid_t id, in_port_t port, unsigned int bucket,
+             isc_boolean_t ignoreid)
 {
        dns_dispentry_t *res;
 
        REQUIRE(bucket < qid->qid_nbuckets);
 
-       res = ISC_LIST_HEAD(qid->qid_table[bucket]);
+       res = ISC_LIST_HEAD(table[bucket]);
 
        while (res != NULL) {
-               if ((res->id == id) && isc_sockaddr_equal(dest, &res->host) &&
+               if ((ignoreid || res->id == id) &&
+                   isc_sockaddr_equal(dest, &res->host) &&
                    res->port == port) {
                        return (res);
                }
@@ -624,6 +912,26 @@ allocate_event(dns_dispatch_t *disp) {
        return (ev);
 }
 
+static void
+udp_exrecv(isc_task_t *task, isc_event_t *ev) {
+       dispsocket_t *dispsock = ev->ev_arg;
+
+       UNUSED(task);
+
+       REQUIRE(VALID_DISPSOCK(dispsock));
+       udp_recv(ev, dispsock->disp, dispsock);
+}
+
+static void
+udp_shrecv(isc_task_t *task, isc_event_t *ev) {
+       dns_dispatch_t *disp = ev->ev_arg;
+
+       UNUSED(task);
+
+       REQUIRE(VALID_DISPATCH(disp));
+       udp_recv(ev, disp, NULL);
+}
+
 /*
  * General flow:
  *
@@ -639,14 +947,13 @@ allocate_event(dns_dispatch_t *disp) {
  *     restart.
  */
 static void
-udp_recv(isc_task_t *task, isc_event_t *ev_in) {
+udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) {
        isc_socketevent_t *ev = (isc_socketevent_t *)ev_in;
-       dns_dispatch_t *disp = ev_in->ev_arg;
        dns_messageid_t id;
        isc_result_t dres;
        isc_buffer_t source;
        unsigned int flags;
-       dns_dispentry_t *resp;
+       dns_dispentry_t *resp = NULL;
        dns_dispatchevent_t *rev;
        unsigned int bucket;
        isc_boolean_t killit;
@@ -655,8 +962,8 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
        dns_qid_t *qid;
        isc_netaddr_t netaddr;
        int match;
-
-       UNUSED(task);
+       int result;
+       isc_boolean_t qidlocked = ISC_FALSE;
 
        LOCK(&disp->lock);
 
@@ -667,7 +974,7 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
                     "got packet: requests %d, buffers %d, recvs %d",
                     disp->requests, disp->mgr->buffers, disp->recv_pending);
 
-       if (ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
+       if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
                /*
                 * Unless the receive event was imported from a listening
                 * interface, in which case the event type is
@@ -677,6 +984,19 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
                disp->recv_pending = 0;
        }
 
+       if (dispsock != NULL &&
+           (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) {
+               /*
+                * dispsock->resp can be NULL if this transaction was canceled
+                * just after receiving a response.  Since this socket is
+                * exclusively used and there should be at most one receive
+                * event, the canceled event should have had no effect.  So
+                * we can (and should) deactivate the socket right now.
+                */
+               deactivate_dispsocket(disp, dispsock);
+               dispsock = NULL;
+       }
+
        if (disp->shutting_down) {
                /*
                 * This dispatcher is shutting down.
@@ -689,12 +1009,25 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
                killit = destroy_disp_ok(disp);
                UNLOCK(&disp->lock);
                if (killit)
-                       isc_task_send(disp->task, &disp->ctlevent);
+                       isc_task_send(disp->task[0], &disp->ctlevent);
 
                return;
        }
 
-       if (ev->result != ISC_R_SUCCESS) {
+       if (dispsock != NULL &&
+           (disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+               resp = dispsock->resp;
+               id = resp->id;
+               if (ev->result != ISC_R_SUCCESS) {
+                       /*
+                        * This is most likely a network error on a connected
+                        * socket.  It makes no sense to check the address or
+                        * parse the packet, but it will help to return the
+                        * error to the caller.
+                        */
+                       goto sendresponse;
+               }
+       } else if (ev->result != ISC_R_SUCCESS) {
                free_buffer(disp, ev->region.base, ev->region.length);
 
                if (ev->result != ISC_R_CANCELED)
@@ -755,15 +1088,31 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
                goto restart;
        }
 
-       /* response */
-       bucket = dns_hash(qid, &ev->address, id, disp->localport);
-       LOCK(&qid->lock);
-       resp = bucket_search(qid, &ev->address, id, disp->localport, bucket);
-       dispatch_log(disp, LVL(90),
-                    "search for response in bucket %d: %s",
-                    bucket, (resp == NULL ? "not found" : "found"));
-
+       /*
+        * Search for the corresponding response.  If we are using an exclusive
+        * socket, we've already identified it and we can skip the search; but
+        * the ID and the address must match the expected ones.
+        */
        if (resp == NULL) {
+               bucket = dns_hash(qid, &ev->address, id, disp->localport);
+               LOCK(&qid->lock);
+               qidlocked = ISC_TRUE;
+               resp = bucket_search(qid, qid->qid_table, &ev->address, id,
+                                    disp->localport, bucket, ISC_FALSE);
+               dispatch_log(disp, LVL(90),
+                            "search for response in bucket %d: %s",
+                            bucket, (resp == NULL ? "not found" : "found"));
+
+               if (resp == NULL) {
+                       dns_generalstats_increment(mgr->stats,
+                                                dns_resstatscounter_mismatch);
+                       free_buffer(disp, ev->region.base, ev->region.length);
+                       goto unlock;
+               }
+       } else if (resp->id != id || !isc_sockaddr_equal(&ev->address,
+                                                        &resp->host)) {
+               dispatch_log(disp, LVL(90),
+                            "response to an exclusive socket doesn't match");
                dns_generalstats_increment(mgr->stats,
                                           dns_resstatscounter_mismatch);
                free_buffer(disp, ev->region.base, ev->region.length);
@@ -813,6 +1162,7 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
                }
        }
 
+  sendresponse:
        queue_response = resp->item_out;
        rev = allocate_event(resp->disp);
        if (rev == NULL) {
@@ -827,7 +1177,7 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
         */
        isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length);
        isc_buffer_add(&rev->buffer, ev->n);
-       rev->result = ISC_R_SUCCESS;
+       rev->result = ev->result;
        rev->id = id;
        rev->addr = ev->address;
        rev->pktinfo = ev->pktinfo;
@@ -846,14 +1196,23 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) {
                isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
        }
  unlock:
-       UNLOCK(&qid->lock);
+       if (qidlocked)
+               UNLOCK(&qid->lock);
 
        /*
         * Restart recv() to get the next packet.
         */
  restart:
-       startrecv(disp);
-
+       result = startrecv(disp, dispsock);
+       if (result != ISC_R_SUCCESS && dispsock != NULL) {
+               /*
+                * XXX: weird. There seems to be no recovery process other than
+                * deactivating this socket anyway (since we cannot start
+                * receiving, we won't be able to receive a cancel event
+                * from the user).
+                */
+               deactivate_dispsocket(disp, dispsock);
+       }
        UNLOCK(&disp->lock);
 
        isc_event_free(&ev_in);
@@ -953,7 +1312,7 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
                killit = destroy_disp_ok(disp);
                UNLOCK(&disp->lock);
                if (killit)
-                       isc_task_send(disp->task, &disp->ctlevent);
+                       isc_task_send(disp->task[0], &disp->ctlevent);
                return;
        }
 
@@ -996,8 +1355,8 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
         */
        bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport);
        LOCK(&qid->lock);
-       resp = bucket_search(qid, &tcpmsg->address, id, disp->localport,
-                            bucket);
+       resp = bucket_search(qid, qid->qid_table, &tcpmsg->address, id,
+                            disp->localport, bucket, ISC_FALSE);
        dispatch_log(disp, LVL(90),
                     "search for response in bucket %d: %s",
                     bucket, (resp == NULL ? "not found" : "found"));
@@ -1038,7 +1397,7 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
         * Restart recv() to get the next packet.
         */
  restart:
-       startrecv(disp);
+       (void)startrecv(disp, NULL);
 
        UNLOCK(&disp->lock);
 
@@ -1048,22 +1407,33 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) {
 /*
  * disp must be locked.
  */
-static void
-startrecv(dns_dispatch_t *disp) {
+static isc_result_t
+startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) {
        isc_result_t res;
        isc_region_t region;
+       isc_socket_t *socket;
 
        if (disp->shutting_down == 1)
-               return;
+               return (ISC_R_SUCCESS);
 
        if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
-               return;
+               return (ISC_R_SUCCESS);
 
-       if (disp->recv_pending != 0)
-               return;
+       if (disp->recv_pending != 0 && dispsock == NULL)
+               return (ISC_R_SUCCESS);
 
        if (disp->mgr->buffers >= disp->mgr->maxbuffers)
-               return;
+               return (ISC_R_NOMEMORY);
+
+       if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
+           dispsock == NULL)
+               return (ISC_R_SUCCESS);
+
+       if (dispsock != NULL)
+               socket = dispsock->socket;
+       else
+               socket = disp->socket;
+       INSIST(socket != NULL);
 
        switch (disp->socktype) {
                /*
@@ -1073,28 +1443,38 @@ startrecv(dns_dispatch_t *disp) {
                region.length = disp->mgr->buffersize;
                region.base = allocate_udp_buffer(disp);
                if (region.base == NULL)
-                       return;
-               res = isc_socket_recv(disp->socket, &region, 1,
-                                     disp->task, udp_recv, disp);
-               if (res != ISC_R_SUCCESS) {
-                       free_buffer(disp, region.base, region.length);
-                       disp->shutdown_why = res;
-                       disp->shutting_down = 1;
-                       do_cancel(disp);
-                       return;
+                       return (ISC_R_NOMEMORY);
+               if (dispsock != NULL) {
+                       res = isc_socket_recv(socket, &region, 1,
+                                             dispsock->task, udp_exrecv,
+                                             dispsock);
+                       if (res != ISC_R_SUCCESS) {
+                               free_buffer(disp, region.base, region.length);
+                               return (res);
+                       }
+               } else {
+                       res = isc_socket_recv(socket, &region, 1,
+                                             disp->task[0], udp_shrecv, disp);
+                       if (res != ISC_R_SUCCESS) {
+                               free_buffer(disp, region.base, region.length);
+                               disp->shutdown_why = res;
+                               disp->shutting_down = 1;
+                               do_cancel(disp);
+                               return (ISC_R_SUCCESS); /* recover by cancel */
+                       }
+                       INSIST(disp->recv_pending == 0);
+                       disp->recv_pending = 1;
                }
-               INSIST(disp->recv_pending == 0);
-               disp->recv_pending = 1;
                break;
 
        case isc_sockettype_tcp:
-               res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task,
+               res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0],
                                             tcp_recv, disp);
                if (res != ISC_R_SUCCESS) {
                        disp->shutdown_why = res;
                        disp->shutting_down = 1;
                        do_cancel(disp);
-                       return;
+                       return (ISC_R_SUCCESS); /* recover by cancel */
                }
                INSIST(disp->recv_pending == 0);
                disp->recv_pending = 1;
@@ -1103,6 +1483,8 @@ startrecv(dns_dispatch_t *disp) {
                INSIST(0);
                break;
        }
+
+       return (ISC_R_SUCCESS);
 }
 
 /*
@@ -1155,6 +1537,7 @@ destroy_mgr(dns_dispatchmgr_t **mgrp) {
        isc_mempool_destroy(&mgr->rpool);
        isc_mempool_destroy(&mgr->dpool);
        isc_mempool_destroy(&mgr->bpool);
+       isc_mempool_destroy(&mgr->spool);
 
        DESTROYLOCK(&mgr->pool_lock);
 
@@ -1168,36 +1551,50 @@ destroy_mgr(dns_dispatchmgr_t **mgrp) {
        if (mgr->blackhole != NULL)
                dns_acl_detach(&mgr->blackhole);
 
-       if (mgr->portlist != NULL)
-               dns_portlist_detach(&mgr->portlist);
-
        if (mgr->stats != NULL)
                dns_stats_detach(&mgr->stats);
 
+       if (mgr->v4ports != NULL) {
+               isc_mem_put(mctx, mgr->v4ports,
+                           mgr->nv4ports * sizeof(in_port_t));
+       }
+       if (mgr->v6ports != NULL) {
+               isc_mem_put(mctx, mgr->v6ports,
+                           mgr->nv6ports * sizeof(in_port_t));
+       }
        isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
        isc_mem_detach(&mctx);
 }
 
 static isc_result_t
-create_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
-             isc_socket_t **sockp)
+open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
+           isc_socket_t **sockp)
 {
        isc_socket_t *sock;
        isc_result_t result;
 
-       sock = NULL;
-       result = isc_socket_create(mgr, isc_sockaddr_pf(local),
-                                  isc_sockettype_udp, &sock);
-       if (result != ISC_R_SUCCESS)
-               return (result);
-       isc_socket_setname(sock, "dispatcher", NULL);
+       sock = *sockp;
+       if (sock == NULL) {
+               result = isc_socket_create(mgr, isc_sockaddr_pf(local),
+                                          isc_sockettype_udp, &sock);
+               if (result != ISC_R_SUCCESS)
+                       return (result);
+               isc_socket_setname(sock, "dispatcher", NULL);
+       } else {
+               result = isc_socket_open(sock);
+               if (result != ISC_R_SUCCESS)
+                       return (result);
+       }
 
 #ifndef ISC_ALLOW_MAPPED
        isc_socket_ipv6only(sock, ISC_TRUE);
 #endif
        result = isc_socket_bind(sock, local);
        if (result != ISC_R_SUCCESS) {
-               isc_socket_detach(&sock);
+               if (*sockp == NULL)
+                       isc_socket_detach(&sock);
+               else
+                       isc_socket_close(sock);
                return (result);
        }
 
@@ -1205,6 +1602,24 @@ create_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
        return (ISC_R_SUCCESS);
 }
 
+/*%
+ * Build the fallback set of dispatch ports, [1024, 65535], in a freshly
+ * created port set stored through 'portsetp'.  Applications normally
+ * configure the ports explicitly, so this only covers the corner case
+ * where they do not.  Returns the isc_portset_create() result; the range
+ * is added only when creation succeeded.
+ */
+static isc_result_t
+create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) {
+       isc_result_t status = isc_portset_create(mctx, portsetp);
+
+       /* Populate the set only if it was actually created. */
+       if (status == ISC_R_SUCCESS)
+               isc_portset_addrange(*portsetp, 1024, 65535);
+
+       return (status);
+}
+
 /*
  * Publics.
  */
@@ -1215,6 +1630,8 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
 {
        dns_dispatchmgr_t *mgr;
        isc_result_t result;
+       isc_portset_t *v4portset = NULL;
+       isc_portset_t *v6portset = NULL;
 
        REQUIRE(mctx != NULL);
        REQUIRE(mgrp != NULL && *mgrp == NULL);
@@ -1227,7 +1644,6 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
        isc_mem_attach(mctx, &mgr->mctx);
 
        mgr->blackhole = NULL;
-       mgr->portlist = NULL;
        mgr->stats = NULL;
 
        result = isc_mutex_init(&mgr->lock);
@@ -1283,20 +1699,43 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy,
        mgr->buffersize = 0;
        mgr->maxbuffers = 0;
        mgr->bpool = NULL;
+       mgr->spool = NULL;
        mgr->entropy = NULL;
        mgr->qid = NULL;
        mgr->state = 0;
        ISC_LIST_INIT(mgr->list);
+       mgr->v4ports = NULL;
+       mgr->v6ports = NULL;
+       mgr->nv4ports = 0;
+       mgr->nv6ports = 0;
        mgr->magic = DNS_DISPATCHMGR_MAGIC;
 
+       result = create_default_portset(mctx, &v4portset);
+       if (result == ISC_R_SUCCESS) {
+               result = create_default_portset(mctx, &v6portset);
+               if (result == ISC_R_SUCCESS) {
+                       result = dns_dispatchmgr_setavailports(mgr,
+                                                              v4portset,
+                                                              v6portset);
+               }
+       }
+       if (v4portset != NULL)
+               isc_portset_destroy(mctx, &v4portset);
+       if (v6portset != NULL)
+               isc_portset_destroy(mctx, &v6portset);
+       if (result != ISC_R_SUCCESS)
+               goto kill_dpool;
+
        if (entropy != NULL)
                isc_entropy_attach(entropy, &mgr->entropy);
 
-       dispatch_arc4init(&mgr->arc4ctx);
+       dispatch_arc4init(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock);
 
        *mgrp = mgr;
        return (ISC_R_SUCCESS);
 
+ kill_dpool:
+       isc_mempool_destroy(&mgr->dpool);
  kill_rpool:
        isc_mempool_destroy(&mgr->rpool);
  kill_epool:
@@ -1335,22 +1774,88 @@ dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
                                 dns_portlist_t *portlist)
 {
        REQUIRE(VALID_DISPATCHMGR(mgr));
-       if (mgr->portlist != NULL)
-               dns_portlist_detach(&mgr->portlist);
-       if (portlist != NULL)
-               dns_portlist_attach(portlist, &mgr->portlist);
+       UNUSED(portlist);
+
+       /* This function is deprecated: use dns_dispatchmgr_setavailports(). */
+       return;
 }
 
 dns_portlist_t *
 dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) {
        REQUIRE(VALID_DISPATCHMGR(mgr));
-       return (mgr->portlist);
+       return (NULL);          /* this function is deprecated; it always returns NULL */
+}
+
+/*
+ * Replace the manager's lists of usable IPv4/IPv6 dispatch ports with the
+ * contents of 'v4portset' and 'v6portset'.  Each set is flattened into a
+ * newly allocated in_port_t array; the previous arrays, if any, are freed.
+ * Returns ISC_R_NOMEMORY if an array cannot be allocated (the manager is
+ * left untouched in that case) and ISC_R_SUCCESS otherwise.
+ */
+isc_result_t
+dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
+                             isc_portset_t *v6portset)
+{
+       in_port_t *newv4 = NULL, *newv6 = NULL;
+       unsigned int nv4, nv6;
+       unsigned int used4 = 0, used6 = 0;
+       unsigned int port;
+
+       REQUIRE(VALID_DISPATCHMGR(mgr));
+
+       nv4 = isc_portset_nports(v4portset);
+       nv6 = isc_portset_nports(v6portset);
+
+       /* Allocate flat arrays sized to hold every port of each set. */
+       if (nv4 != 0) {
+               newv4 = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4);
+               if (newv4 == NULL)
+                       return (ISC_R_NOMEMORY);
+       }
+       if (nv6 != 0) {
+               newv6 = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6);
+               if (newv6 == NULL) {
+                       if (newv4 != NULL) {
+                               isc_mem_put(mgr->mctx, newv4,
+                                           sizeof(in_port_t) * nv4);
+                       }
+                       return (ISC_R_NOMEMORY);
+               }
+       }
+
+       /*
+        * Walk the whole 16-bit port space in ascending order, so both
+        * arrays come out sorted by port number.
+        */
+       for (port = 0; port <= 65535; port++) {
+               in_port_t p = (in_port_t)port;
+
+               if (isc_portset_isset(v4portset, p)) {
+                       INSIST(used4 < nv4);
+                       newv4[used4++] = p;
+               }
+               if (isc_portset_isset(v6portset, p)) {
+                       INSIST(used6 < nv6);
+                       newv6[used6++] = p;
+               }
+       }
+       INSIST(used4 == nv4 && used6 == nv6);
+
+       /* Swap the new arrays in under the port buffer lock. */
+       PORTBUFLOCK(mgr);
+       if (mgr->v4ports != NULL) {
+               isc_mem_put(mgr->mctx, mgr->v4ports,
+                           mgr->nv4ports * sizeof(in_port_t));
+       }
+       if (mgr->v6ports != NULL) {
+               isc_mem_put(mgr->mctx, mgr->v6ports,
+                           mgr->nv6ports * sizeof(in_port_t));
+       }
+       mgr->v4ports = newv4;
+       mgr->nv4ports = nv4;
+       mgr->v6ports = newv6;
+       mgr->nv6ports = nv6;
+       PORTBUFUNLOCK(mgr);
+
+       return (ISC_R_SUCCESS);
 }
 
+/*
+ * Set up, or re-tune, the manager's UDP resources while holding
+ * mgr->buffer_lock: the receive-buffer pool (bpool), the dispatch-socket
+ * pool (spool) and, when the socket pool is first created, the QID table.
+ *
+ * If the socket pool already exists only the pool limits are adjusted and
+ * ISC_R_SUCCESS is returned.  When socket-pool creation or qid_allocate()
+ * fails, control reaches 'cleanup', which destroys the pools, releases
+ * buffer_lock exactly once and returns the failing result.
+ */
 static isc_result_t
 dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
-                       unsigned int buffersize, unsigned int maxbuffers,
-                       unsigned int buckets, unsigned int increment)
+                      unsigned int buffersize, unsigned int maxbuffers,
+                      unsigned int maxrequests, unsigned int buckets,
+                      unsigned int increment)
 {
        isc_result_t result;
 
@@ -1377,24 +1882,39 @@ dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
                maxbuffers = 8;
 
        LOCK(&mgr->buffer_lock);
+
+       /* Create or adjust buffer pool */
        if (mgr->bpool != NULL) {
                isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
                mgr->maxbuffers = maxbuffers;
+       } else {
+               result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool);
+               if (result != ISC_R_SUCCESS) {
+                       UNLOCK(&mgr->buffer_lock);
+                       return (result);
+               }
+               isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
+               isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
+               isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
+       }
+
+       /* Create or adjust socket pool */
+       if (mgr->spool != NULL) {
+               isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
                UNLOCK(&mgr->buffer_lock);
                return (ISC_R_SUCCESS);
        }
-
-       if (isc_mempool_create(mgr->mctx, buffersize,
-                              &mgr->bpool) != ISC_R_SUCCESS) {
+       result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t),
+                                   &mgr->spool);
+       if (result != ISC_R_SUCCESS) {
-               UNLOCK(&mgr->buffer_lock);
-               return (ISC_R_NOMEMORY);
+               /* Do not unlock here: 'cleanup' releases buffer_lock. */
+               goto cleanup;
        }
+       isc_mempool_setname(mgr->spool, "dispmgr_spool");
+       isc_mempool_setmaxalloc(mgr->spool, maxrequests);
+       isc_mempool_associatelock(mgr->spool, &mgr->pool_lock);
 
-       isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
-       isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
-       isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
-
-       result = qid_allocate(mgr, buckets, increment, &mgr->qid);
+       result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE);
        if (result != ISC_R_SUCCESS)
                goto cleanup;
 
@@ -1405,8 +1925,10 @@ dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
 
  cleanup:
        isc_mempool_destroy(&mgr->bpool);
+       if (mgr->spool != NULL)
+               isc_mempool_destroy(&mgr->spool);
        UNLOCK(&mgr->buffer_lock);
-       return (ISC_R_NOMEMORY);
+       return (result);
 }
 
 void
@@ -1441,29 +1963,56 @@ dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, dns_stats_t *stats) {
        dns_stats_attach(stats, &mgr->stats);
 }
 
+/*
+ * bsearch() comparator for in_port_t values: yields -1, 0 or 1 when the
+ * key port is below, equal to or above the array entry.
+ */
+static int
+port_cmp(const void *key, const void *ent) {
+       const in_port_t wanted = *(const in_port_t *)key;
+       const in_port_t entry = *(const in_port_t *)ent;
+
+       return ((wanted > entry) - (wanted < entry));
+}
+
+/*
+ * Report whether a UDP port may be used under the manager's current
+ * available-port lists.  The port is taken from 'sock' (through
+ * isc_socket_getsockname()) when 'sock' is non-NULL, otherwise from
+ * 'sockaddrp'; at least one of the two must be given.  AF_INET addresses
+ * are checked against the IPv4 list, everything else against the IPv6
+ * list.  Returns ISC_FALSE when the socket name cannot be obtained or the
+ * selected list is NULL.
+ */
 static isc_boolean_t
-blacklisted(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
-           isc_sockaddr_t *sockaddrp)
+portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
+             isc_sockaddr_t *sockaddrp)
 {
        isc_sockaddr_t sockaddr;
        isc_result_t result;
+       in_port_t *ports, port;
+       unsigned int nports;
+       isc_boolean_t available = ISC_FALSE;
 
        REQUIRE(sock != NULL || sockaddrp != NULL);
 
-       if (mgr->portlist == NULL)
-               return (ISC_FALSE);
-
+       /* The port arrays are only read while holding the port buffer lock. */
+       PORTBUFLOCK(mgr);
        if (sock != NULL) {
                sockaddrp = &sockaddr;
                result = isc_socket_getsockname(sock, sockaddrp);
                if (result != ISC_R_SUCCESS)
-                       return (ISC_FALSE);
+                       goto unlock;
        }
 
-       if (dns_portlist_match(mgr->portlist, isc_sockaddr_pf(sockaddrp),
-                              isc_sockaddr_getport(sockaddrp)))
-               return (ISC_TRUE);
-       return (ISC_FALSE);
+       if (isc_sockaddr_pf(sockaddrp) == AF_INET) {
+               ports = mgr->v4ports;
+               nports = mgr->nv4ports;
+       } else {
+               ports = mgr->v6ports;
+               nports = mgr->nv6ports;
+       }
+       if (ports == NULL)
+               goto unlock;
+
+       /* Binary search: relies on the array being sorted in port_cmp() order. */
+       port = isc_sockaddr_getport(sockaddrp);
+       if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL)
+               available = ISC_TRUE;
+
+unlock:
+       PORTBUFUNLOCK(mgr);
+       return (available);
 }
 
 #define ATTRMATCH(_a1, _a2, _mask) (((_a1) & (_mask)) == ((_a2) & (_mask)))
@@ -1477,13 +2026,17 @@ local_addr_match(dns_dispatch_t *disp, isc_sockaddr_t *addr) {
                return (ISC_TRUE);
 
        /*
-        * Don't match wildcard ports against newly blacklisted ports.
+        * Don't match wildcard ports unless the port is available in the
+        * current configuration.  We can skip this check when disp->socket is
+        * NULL because such a dispatcher will choose ports on-demand from
+        * the available set.
         */
-       if (disp->mgr->portlist != NULL &&
-           isc_sockaddr_getport(addr) == 0 &&
+       if (isc_sockaddr_getport(addr) == 0 &&
            isc_sockaddr_getport(&disp->local) == 0 &&
-           blacklisted(disp->mgr, disp->socket, NULL))
+           disp->socket != NULL &&
+           !portavailable(disp->mgr, disp->socket, NULL)) {
                return (ISC_FALSE);
+       }
 
        /*
         * Check if we match the binding <address,port>.
@@ -1555,7 +2108,8 @@ dispatch_find(dns_dispatchmgr_t *mgr, isc_sockaddr_t *local,
 
+/*
+ * Allocate and initialize a QID structure with 'buckets' hash buckets.
+ * When 'needaddrtable' is true an address table with the same number of
+ * buckets is allocated as well; otherwise qid->addr_table stays NULL.
+ * On an allocation or mutex-initialization failure everything obtained so
+ * far is released and an error result is returned.
+ */
 static isc_result_t
 qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
-            unsigned int increment, dns_qid_t **qidp)
+            unsigned int increment, dns_qid_t **qidp,
+            isc_boolean_t needaddrtable)
 {
        dns_qid_t *qid;
        unsigned int i;
@@ -1577,16 +2131,35 @@ qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
                return (ISC_R_NOMEMORY);
        }
 
+       qid->addr_table = NULL;
+       if (needaddrtable) {
+               qid->addr_table = isc_mem_get(mgr->mctx,
+                                             buckets * sizeof(dns_displist_t));
+               if (qid->addr_table == NULL) {
+                       /*
+                        * Release the table first: it is reached through
+                        * 'qid', which must still be valid at that point.
+                        */
+                       isc_mem_put(mgr->mctx, qid->qid_table,
+                                   buckets * sizeof(dns_displist_t));
+                       isc_mem_put(mgr->mctx, qid, sizeof(*qid));
+                       return (ISC_R_NOMEMORY);
+               }
+       }
+
        result = isc_mutex_init(&qid->lock);
        if (result != ISC_R_SUCCESS) {
+               if (qid->addr_table != NULL) {
+                       isc_mem_put(mgr->mctx, qid->addr_table,
+                                   buckets * sizeof(dns_displist_t));
+               }
                isc_mem_put(mgr->mctx, qid->qid_table,
                            buckets * sizeof(dns_displist_t));
                isc_mem_put(mgr->mctx, qid, sizeof(*qid));
                return (result);
        }
 
-       for (i = 0; i < buckets; i++)
+       for (i = 0; i < buckets; i++) {
                ISC_LIST_INIT(qid->qid_table[i]);
+               if (qid->addr_table != NULL)
+                       ISC_LIST_INIT(qid->addr_table[i]);
+       }
 
        qid->qid_nbuckets = buckets;
        qid->qid_increment = increment;
@@ -1608,6 +2181,10 @@ qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp) {
        qid->magic = 0;
        isc_mem_put(mctx, qid->qid_table,
                    qid->qid_nbuckets * sizeof(dns_displist_t));
+       if (qid->addr_table != NULL) {
+               isc_mem_put(mctx, qid->addr_table,
+                           qid->qid_nbuckets * sizeof(dns_displist_t));
+       }
        DESTROYLOCK(&qid->lock);
        isc_mem_put(mctx, qid, sizeof(*qid));
 }
@@ -1651,6 +2228,10 @@ dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests,
        disp->requests = 0;
        disp->tcpbuffers = 0;
        disp->qid = NULL;
+       ISC_LIST_INIT(disp->activesockets);
+       ISC_LIST_INIT(disp->inactivesockets);
+       disp->nsockets = 0;
+       dispatch_arc4init(&disp->arc4ctx, mgr->entropy, NULL);
 
        result = isc_mutex_init(&disp->lock);
        if (result != ISC_R_SUCCESS)
@@ -1703,6 +2284,8 @@ dispatch_free(dns_dispatch_t **dispp)
        INSIST(disp->tcpbuffers == 0);
        INSIST(disp->requests == 0);
        INSIST(disp->recv_pending == 0);
+       INSIST(ISC_LIST_EMPTY(disp->activesockets));
+       INSIST(ISC_LIST_EMPTY(disp->inactivesockets));
 
        isc_mempool_put(mgr->epool, disp->failsafe_ev);
        disp->failsafe_ev = NULL;
@@ -1748,7 +2331,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
                return (result);
        }
 
-       result = qid_allocate(mgr, buckets, increment, &disp->qid);
+       result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE);
        if (result != ISC_R_SUCCESS)
                goto deallocate_dispatch;
 
@@ -1756,8 +2339,9 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
        disp->socket = NULL;
        isc_socket_attach(sock, &disp->socket);
 
-       disp->task = NULL;
-       result = isc_task_create(taskmgr, 0, &disp->task);
+       disp->ntasks = 1;
+       disp->task[0] = NULL;
+       result = isc_task_create(taskmgr, 0, &disp->task[0]);
        if (result != ISC_R_SUCCESS)
                goto kill_socket;
 
@@ -1770,7 +2354,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
                goto kill_task;
        }
 
-       isc_task_setname(disp->task, "tcpdispatch", disp);
+       isc_task_setname(disp->task[0], "tcpdispatch", disp);
 
        dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg);
        disp->tcpmsg_valid = 1;
@@ -1784,7 +2368,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
        UNLOCK(&mgr->lock);
 
        mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp);
-       dispatch_log(disp, LVL(90), "created task %p", disp->task);
+       dispatch_log(disp, LVL(90), "created task %p", disp->task[0]);
 
        *dispp = disp;
 
@@ -1794,7 +2378,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
         * Error returns.
         */
  kill_task:
-       isc_task_detach(&disp->task);
+       isc_task_detach(&disp->task[0]);
  kill_socket:
        isc_socket_detach(&disp->socket);
  deallocate_dispatch:
@@ -1829,13 +2413,13 @@ dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
        REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0);
 
        result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers,
-                                       buckets, increment);
+                                       maxrequests, buckets, increment);
        if (result != ISC_R_SUCCESS)
                return (result);
 
        LOCK(&mgr->lock);
 
-       if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) != 0) {
+       if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
                REQUIRE(isc_sockaddr_getport(localaddr) == 0);
                goto createudp;
        }
@@ -1855,7 +2439,7 @@ dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
                {
                        disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
                        if (disp->recv_pending != 0)
-                               isc_socket_cancel(disp->socket, disp->task,
+                               isc_socket_cancel(disp->socket, disp->task[0],
                                                  ISC_SOCKCANCEL_RECV);
                }
 
@@ -1891,6 +2475,100 @@ dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
 #define DNS_DISPATCH_HELD 20U
 #endif
 
+/*
+ * Obtain a UDP socket bound to 'localaddr', opening it through 'sockmgr'
+ * and storing it in '*sockp' (which must be NULL on entry).
+ *
+ * When 'localaddr' carries port 0, up to 1024 ports are drawn at random
+ * from the manager's available-port list for the address family.  The
+ * first attempt that does not fail with ISC_R_ADDRINUSE ends the search:
+ * its result is returned and the drawn port is recorded in
+ * disp->localport.  If every draw is in use, the kernel is asked to pick a
+ * port, retrying up to 'maxtry' times until portavailable() accepts the
+ * result.  Rejected sockets are held open (at most DNS_DISPATCH_HELD at a
+ * time) so that the OS does not hand the same port back too quickly; they
+ * are released before returning.
+ *
+ * Returns ISC_R_ADDRNOTAVAIL if the available-port list is empty and
+ * ISC_R_FAILURE if no acceptable port is found within 'maxtry' attempts.
+ */
+static isc_result_t
+get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp,
+             isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr,
+             isc_socket_t **sockp, unsigned int maxtry)
+{
+       unsigned int i, j;
+       isc_socket_t *held[DNS_DISPATCH_HELD];
+       isc_sockaddr_t localaddr_bound;
+       isc_socket_t *sock = NULL;
+       isc_result_t result = ISC_R_SUCCESS;
+       isc_boolean_t anyport;
+
+       INSIST(sockp != NULL && *sockp == NULL);
+
+       localaddr_bound = *localaddr;
+       anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0);
+
+       if (anyport) {
+               unsigned int nports;
+               in_port_t *ports;
+
+               /*
+                * If no port is specified, we first try to pick a random
+                * port ourselves.  The address family is taken from
+                * 'localaddr': dispatch_createudp() calls this function
+                * before it assigns disp->local.
+                *
+                * NOTE(review): the port arrays are read here without
+                * PORTBUFLOCK, unlike in portavailable(); confirm this
+                * cannot race with dns_dispatchmgr_setavailports().
+                */
+               if (isc_sockaddr_pf(localaddr) == AF_INET) {
+                       nports = disp->mgr->nv4ports;
+                       ports = disp->mgr->v4ports;
+               } else {
+                       nports = disp->mgr->nv6ports;
+                       ports = disp->mgr->v6ports;
+               }
+               if (nports == 0)
+                       return (ISC_R_ADDRNOTAVAIL);
+
+               for (i = 0; i < 1024; i++) {
+                       in_port_t prt;
+
+                       prt = ports[dispatch_arc4uniformrandom(
+                                       DISP_ARC4CTX(disp),
+                                       nports)];
+                       isc_sockaddr_setport(&localaddr_bound, prt);
+                       result = open_socket(sockmgr, &localaddr_bound, &sock);
+                       /*
+                        * Only "address in use" is worth another draw;
+                        * success and every other error end the search.
+                        */
+                       if (result != ISC_R_ADDRINUSE) {
+                               disp->localport = prt;
+                               *sockp = sock;
+                               return (result);
+                       }
+               }
+
+               /*
+                * If all 1024 attempts failed, fall through and ask the
+                * kernel to choose a port.
+                */
+       }
+
+       memset(held, 0, sizeof(held));
+       i = 0;
+
+       for (j = 0; j < maxtry; j++) {
+               result = open_socket(sockmgr, localaddr, &sock);
+               if (result != ISC_R_SUCCESS)
+                       goto end;
+               else if (!anyport)
+                       break;
+               else if (portavailable(mgr, sock, NULL))
+                       break;
+               /* Unusable port: park the socket in the ring of held ones. */
+               if (held[i] != NULL)
+                       isc_socket_detach(&held[i]);
+               held[i++] = sock;
+               sock = NULL;
+               if (i == DNS_DISPATCH_HELD)
+                       i = 0;
+       }
+       if (j == maxtry) {
+               mgr_log(mgr, ISC_LOG_ERROR,
+                       "avoid-v%s-udp-ports: unable to allocate "
+                       "an available port",
+                       isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6");
+               result = ISC_R_FAILURE;
+               goto end;
+       }
+       *sockp = sock;
+
+end:
+       /* Release every socket that was only held to occupy its port. */
+       for (i = 0; i < DNS_DISPATCH_HELD; i++) {
+               if (held[i] != NULL)
+                       isc_socket_detach(&held[i]);
+       }
+
+       return (result);
+}
+
 static isc_result_t
 dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
                   isc_taskmgr_t *taskmgr,
@@ -1902,10 +2580,7 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
        isc_result_t result;
        dns_dispatch_t *disp;
        isc_socket_t *sock = NULL;
-       isc_socket_t *held[DNS_DISPATCH_HELD];
-       unsigned int i = 0, j = 0, k = 0;
-       isc_sockaddr_t localaddr_bound;
-       in_port_t localport = 0;
+       int i = 0;
 
        /*
         * dispatch_allocate() checks mgr for us.
@@ -1915,66 +2590,30 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
        if (result != ISC_R_SUCCESS)
                return (result);
 
-       /*
-        * Try to allocate a socket that is not on the blacklist.
-        * Hold up to DNS_DISPATCH_HELD sockets to prevent the OS
-        * from returning the same port to us too quickly.
-        */
-       memset(held, 0, sizeof(held));
-       localaddr_bound = *localaddr;
- getsocket:
-       if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) != 0) {
-               in_port_t prt;
-
-               /* XXX: should the range be configurable? */
-               prt = 1024 + dispatch_arc4uniformrandom(mgr, 65535 - 1023);
-               isc_sockaddr_setport(&localaddr_bound, prt);
-               if (blacklisted(mgr, NULL, &localaddr_bound)) {
-                       if (++k == 1024)
-                               attributes &= ~DNS_DISPATCHATTR_RANDOMPORT;
-                       goto getsocket;
-               }
-               result = create_socket(sockmgr, &localaddr_bound, &sock);
-               if (result == ISC_R_ADDRINUSE) {
-                       if (++k == 1024)
-                               attributes &= ~DNS_DISPATCHATTR_RANDOMPORT;
-                       goto getsocket;
-               }
-               localport = prt;
-       } else
-               result = create_socket(sockmgr, localaddr, &sock);
-       if (result != ISC_R_SUCCESS)
-               goto deallocate_dispatch;
-       if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) == 0 &&
-           isc_sockaddr_getport(localaddr) == 0 &&
-           blacklisted(mgr, sock, NULL))
-       {
-               if (held[i] != NULL)
-                       isc_socket_detach(&held[i]);
-               held[i++] = sock;
-               sock = NULL;
-               if (i == DNS_DISPATCH_HELD)
-                       i = 0;
-               if (j++ == 0xffffU) {
-                       mgr_log(mgr, ISC_LOG_ERROR, "avoid-v%s-udp-ports: "
-                               "unable to allocate a non-blacklisted port",
-                               isc_sockaddr_pf(localaddr) == AF_INET ?
-                                       "4" : "6");
-                       result = ISC_R_FAILURE;
+       if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) {
+               result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock,
+                                      0xffffU);
+               if (result != ISC_R_SUCCESS)
                        goto deallocate_dispatch;
-               }
-               goto getsocket;
        }
-
        disp->socktype = isc_sockettype_udp;
        disp->socket = sock;
        disp->local = *localaddr;
-       disp->localport = localport;
 
-       disp->task = NULL;
-       result = isc_task_create(taskmgr, 0, &disp->task);
-       if (result != ISC_R_SUCCESS)
-               goto kill_socket;
+       if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
+               disp->ntasks = MAX_INTERNAL_TASKS;
+       else
+               disp->ntasks = 1;
+       for (i = 0; i < disp->ntasks; i++) {
+               disp->task[i] = NULL;
+               result = isc_task_create(taskmgr, 0, &disp->task[i]);
+               if (result != ISC_R_SUCCESS) {
+                       while (--i >= 0)
+                               isc_task_destroy(&disp->task[i]);
+                       goto kill_socket;
+               }
+               isc_task_setname(disp->task[i], "udpdispatch", disp);
+       }
 
        disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
                                            DNS_EVENT_DISPATCHCONTROL,
@@ -1985,8 +2624,6 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
                goto kill_task;
        }
 
-       isc_task_setname(disp->task, "udpdispatch", disp);
-
        attributes &= ~DNS_DISPATCHATTR_TCP;
        attributes |= DNS_DISPATCHATTR_UDP;
        disp->attributes = attributes;
@@ -1997,26 +2634,25 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
        ISC_LIST_APPEND(mgr->list, disp, link);
 
        mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
-       dispatch_log(disp, LVL(90), "created task %p", disp->task);
-       dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
+       dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */
+       if (disp->socket != NULL)
+               dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
 
        *dispp = disp;
-
-       goto cleanheld;
+       return (result);
 
        /*
         * Error returns.
         */
  kill_task:
-       isc_task_detach(&disp->task);
+       for (i = 0; i < disp->ntasks; i++)
+               isc_task_detach(&disp->task[i]);
  kill_socket:
-       isc_socket_detach(&disp->socket);
+       if (disp->socket != NULL)
+               isc_socket_detach(&disp->socket);
  deallocate_dispatch:
        dispatch_free(&disp);
- cleanheld:
-       for (i = 0; i < DNS_DISPATCH_HELD; i++)
-               if (held[i] != NULL)
-                       isc_socket_detach(&held[i]);
+
        return (result);
 }
 
@@ -2042,6 +2678,7 @@ dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) {
 void
 dns_dispatch_detach(dns_dispatch_t **dispp) {
        dns_dispatch_t *disp;
+       dispsocket_t *dispsock;
        isc_boolean_t killit;
 
        REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
@@ -2056,8 +2693,14 @@ dns_dispatch_detach(dns_dispatch_t **dispp) {
        killit = ISC_FALSE;
        if (disp->refcount == 0) {
                if (disp->recv_pending > 0)
-                       isc_socket_cancel(disp->socket, disp->task,
+                       isc_socket_cancel(disp->socket, disp->task[0],
+                                         ISC_SOCKCANCEL_RECV);
+               for (dispsock = ISC_LIST_HEAD(disp->activesockets);
+                    dispsock != NULL;
+                    dispsock = ISC_LIST_NEXT(dispsock, link)) {
+                       isc_socket_cancel(dispsock->socket, dispsock->task,
                                          ISC_SOCKCANCEL_RECV);
+               }
                disp->shutting_down = 1;
        }
 
@@ -2066,26 +2709,33 @@ dns_dispatch_detach(dns_dispatch_t **dispp) {
        killit = destroy_disp_ok(disp);
        UNLOCK(&disp->lock);
        if (killit)
-               isc_task_send(disp->task, &disp->ctlevent);
+               isc_task_send(disp->task[0], &disp->ctlevent);
 }
 
 isc_result_t
-dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
-                        isc_task_t *task, isc_taskaction_t action, void *arg,
-                        dns_messageid_t *idp, dns_dispentry_t **resp)
+dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+                         isc_task_t *task, isc_taskaction_t action, void *arg,
+                         dns_messageid_t *idp, dns_dispentry_t **resp,
+                         isc_socketmgr_t *sockmgr)
 {
        dns_dispentry_t *res;
        unsigned int bucket;
+       unsigned int abucket;
+       in_port_t localport = 0;
        dns_messageid_t id;
        int i;
        isc_boolean_t ok;
        dns_qid_t *qid;
+       dispsocket_t *dispsocket = NULL;
+       isc_result_t result;
 
        REQUIRE(VALID_DISPATCH(disp));
        REQUIRE(task != NULL);
        REQUIRE(dest != NULL);
        REQUIRE(resp != NULL && *resp == NULL);
        REQUIRE(idp != NULL);
+       if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
+               REQUIRE(sockmgr != NULL);
 
        LOCK(&disp->lock);
 
@@ -2099,23 +2749,77 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
                return (ISC_R_QUOTA);
        }
 
+       if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
+           disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) {
+               dispsocket_t *oldestsocket;
+               dns_dispentry_t *oldestresp;
+               dns_dispatchevent_t *rev;
+
+               /*
+                * Kill oldest outstanding query if the number of sockets
+                * exceeds the quota to keep the room for new queries.
+                */
+               oldestsocket = ISC_LIST_HEAD(disp->activesockets);
+               oldestresp = oldestsocket->resp;
+               if (oldestresp != NULL && !oldestresp->item_out) {
+                       rev = allocate_event(oldestresp->disp);
+                       if (rev != NULL) {
+                               rev->buffer.base = NULL;
+                               rev->result = ISC_R_CANCELED;
+                               rev->id = oldestresp->id;
+                               ISC_EVENT_INIT(rev, sizeof(*rev), 0,
+                                              NULL, DNS_EVENT_DISPATCH,
+                                              oldestresp->action,
+                                              oldestresp->arg, oldestresp,
+                                              NULL, NULL);
+                               oldestresp->item_out = ISC_TRUE;
+                               isc_task_send(oldestresp->task,
+                                             ISC_EVENT_PTR(&rev));
+                       }
+               }
+
+               /*
+                * Move this entry to the tail so that it won't (easily) be
+                * examined before actually being canceled.
+                */
+               ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link);
+               ISC_LIST_APPEND(disp->activesockets, oldestsocket, link);
+       }
+
+       qid = DNS_QID(disp);
+       LOCK(&qid->lock);
+
+       if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+               /*
+                * Get a separate UDP socket with a random port number.
+                */
+               result = get_dispsocket(disp, dest, sockmgr, qid, &dispsocket,
+                                       &abucket, &localport);
+               if (result != ISC_R_SUCCESS) {
+                       UNLOCK(&qid->lock);
+                       UNLOCK(&disp->lock);
+                       return (result);
+               }
+       } else {
+               abucket = 0;    /* meaningless, but set explicitly */
+               localport = disp->localport;
+       }
+
        /*
         * Try somewhat hard to find an unique ID.
         */
-       id = (dns_messageid_t)dispatch_arc4random(disp->mgr);
-       qid = DNS_QID(disp);
-       LOCK(&qid->lock);
-       bucket = dns_hash(qid, dest, id, disp->localport);
+       id = (dns_messageid_t)dispatch_arc4random(DISP_ARC4CTX(disp));
+       bucket = dns_hash(qid, dest, id, localport);
        ok = ISC_FALSE;
        for (i = 0; i < 64; i++) {
-               if (bucket_search(qid, dest, id, disp->localport, bucket) ==
-                   NULL) {
+               if (bucket_search(qid, qid->qid_table, dest, id, localport,
+                                 bucket, ISC_FALSE) == NULL) {
                        ok = ISC_TRUE;
                        break;
                }
                id += qid->qid_increment;
                id &= 0x0000ffff;
-               bucket = dns_hash(qid, dest, id, disp->localport);
+               bucket = dns_hash(qid, dest, id, localport);
        }
 
        if (!ok) {
@@ -2128,6 +2832,8 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
        if (res == NULL) {
                UNLOCK(&qid->lock);
                UNLOCK(&disp->lock);
+               if (dispsocket != NULL)
+                       destroy_dispsocket(disp, &dispsocket);
                return (ISC_R_NOMEMORY);
        }
 
@@ -2137,43 +2843,89 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
        isc_task_attach(task, &res->task);
        res->disp = disp;
        res->id = id;
-       res->port = disp->localport;
+       res->port = localport;
        res->bucket = bucket;
+       res->abucket = abucket;
        res->host = *dest;
        res->action = action;
        res->arg = arg;
+       res->dispsocket = dispsocket;
+       if (dispsocket != NULL)
+               dispsocket->resp = res;
        res->item_out = ISC_FALSE;
        ISC_LIST_INIT(res->items);
        ISC_LINK_INIT(res, link);
+       ISC_LINK_INIT(res, alink);
        res->magic = RESPONSE_MAGIC;
        ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
+       if (dispsocket != NULL)
+               ISC_LIST_APPEND(qid->addr_table[abucket], res, alink);
        UNLOCK(&qid->lock);
 
        request_log(disp, res, LVL(90),
                    "attached to task %p", res->task);
 
        if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) ||
-           ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0))
-               startrecv(disp);
+           ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) {
+               result = startrecv(disp, dispsocket);
+               if (result != ISC_R_SUCCESS) {
+                       LOCK(&qid->lock);
+                       ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
+                       if (ISC_LINK_LINKED(res, alink)) {
+                               ISC_LIST_UNLINK(qid->addr_table[abucket], res,
+                                               alink);
+                       }
+                       UNLOCK(&qid->lock);
+
+                       if (dispsocket != NULL)
+                               destroy_dispsocket(disp, &dispsocket);
+
+                       disp->refcount--;
+                       disp->requests--;
+
+                       UNLOCK(&disp->lock);
+                       isc_task_detach(&res->task);
+                       isc_mempool_put(disp->mgr->rpool, res);
+                       return (result);
+               }
+       }
+
+       if (dispsocket != NULL)
+               ISC_LIST_APPEND(disp->activesockets, dispsocket, link);
 
        UNLOCK(&disp->lock);
 
        *idp = id;
        *resp = res;
 
+       if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
+               INSIST(res->dispsocket != NULL);
+
        return (ISC_R_SUCCESS);
 }
 
+isc_result_t
+dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+                        isc_task_t *task, isc_taskaction_t action, void *arg,
+                        dns_messageid_t *idp, dns_dispentry_t **resp)
+{
+        REQUIRE(VALID_DISPATCH(disp));
+        REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0); 
+
+       return (dns_dispatch_addresponse2(disp, dest, task, action, arg,
+                                         idp, resp, NULL));
+}
+
 void
 dns_dispatch_starttcp(dns_dispatch_t *disp) {
 
        REQUIRE(VALID_DISPATCH(disp));
 
-       dispatch_log(disp, LVL(90), "starttcp %p", disp->task);
+       dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]);
 
        LOCK(&disp->lock);
        disp->attributes |= DNS_DISPATCHATTR_CONNECTED;
-       startrecv(disp);
+       (void)startrecv(disp, NULL);
        UNLOCK(&disp->lock);
 }
 
@@ -2184,6 +2936,7 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp,
        dns_dispatchmgr_t *mgr;
        dns_dispatch_t *disp;
        dns_dispentry_t *res;
+       dispsocket_t *dispsock;
        dns_dispatchevent_t *ev;
        unsigned int bucket;
        isc_boolean_t killit;
@@ -2221,8 +2974,14 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp,
        killit = ISC_FALSE;
        if (disp->refcount == 0) {
                if (disp->recv_pending > 0)
-                       isc_socket_cancel(disp->socket, disp->task,
+                       isc_socket_cancel(disp->socket, disp->task[0],
                                          ISC_SOCKCANCEL_RECV);
+               for (dispsock = ISC_LIST_HEAD(disp->activesockets);
+                    dispsock != NULL;
+                    dispsock = ISC_LIST_NEXT(dispsock, link)) {
+                       isc_socket_cancel(dispsock->socket, dispsock->task,
+                                         ISC_SOCKCANCEL_RECV);
+               }
                disp->shutting_down = 1;
        }
 
@@ -2230,6 +2989,8 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp,
 
        LOCK(&qid->lock);
        ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
+       if (ISC_LINK_LINKED(res, alink))
+               ISC_LIST_UNLINK(qid->addr_table[res->abucket], res, alink);
        UNLOCK(&qid->lock);
 
        if (ev == NULL && res->item_out) {
@@ -2258,6 +3019,12 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp,
        request_log(disp, res, LVL(90), "detaching from task %p", res->task);
        isc_task_detach(&res->task);
 
+       if (res->dispsocket != NULL) {
+               isc_socket_cancel(res->dispsocket->socket,
+                                 res->dispsocket->task, ISC_SOCKCANCEL_RECV);
+               res->dispsocket->resp = NULL;
+       }
+
        /*
         * Free any buffered requests as well
         */
@@ -2274,12 +3041,12 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp,
        if (disp->shutting_down == 1)
                do_cancel(disp);
        else
-               startrecv(disp);
+               (void)startrecv(disp, NULL);
 
        killit = destroy_disp_ok(disp);
        UNLOCK(&disp->lock);
        if (killit)
-               isc_task_send(disp->task, &disp->ctlevent);
+               isc_task_send(disp->task[0], &disp->ctlevent);
 }
 
 static void
@@ -2294,13 +3061,15 @@ do_cancel(dns_dispatch_t *disp) {
        qid = DNS_QID(disp);
 
        /*
-        * Search for the first response handler without packets outstanding.
+        * Search for the first response handler without packets outstanding
+        * unless a specific hander is given.
         */
        LOCK(&qid->lock);
        for (resp = linear_first(qid);
-            resp != NULL && resp->item_out != ISC_FALSE;
+            resp != NULL && !resp->item_out;
             /* Empty. */)
                resp = linear_next(qid, resp);
+
        /*
         * No one to send the cancel event to, so nothing to do.
         */
@@ -2333,6 +3102,16 @@ dns_dispatch_getsocket(dns_dispatch_t *disp) {
        return (disp->socket);
 }
 
+isc_socket_t *
+dns_dispatch_getentrysocket(dns_dispentry_t *resp) {
+       REQUIRE(VALID_RESPONSE(resp));
+
+       if (resp->dispsocket != NULL)
+               return (resp->dispsocket->socket);
+       else
+               return (NULL);
+}
+
 isc_result_t
 dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
 
@@ -2366,11 +3145,27 @@ dns_dispatch_cancel(dns_dispatch_t *disp) {
        return;
 }
 
+unsigned int
+dns_dispatch_getattributes(dns_dispatch_t *disp) {
+       REQUIRE(VALID_DISPATCH(disp));
+
+       /*
+        * We don't bother locking disp here; it's the caller's responsibility
+        * to use only non volatile flags.
+        */
+       return (disp->attributes);
+}
+
 void
 dns_dispatch_changeattributes(dns_dispatch_t *disp,
                              unsigned int attributes, unsigned int mask)
 {
        REQUIRE(VALID_DISPATCH(disp));
+       /* Exclusive attribute can only be set on creation */
+       REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
+       /* Also, an exclusive dispatch cannot start listening */
+       REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 ||
+               (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0);
 
        /* XXXMLG
         * Should check for valid attributes here!
@@ -2382,13 +3177,13 @@ dns_dispatch_changeattributes(dns_dispatch_t *disp,
                if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 &&
                    (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) {
                        disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN;
-                       startrecv(disp);
+                       (void)startrecv(disp, NULL);
                } else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN)
                           == 0 &&
                           (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) {
                        disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
                        if (disp->recv_pending != 0)
-                               isc_socket_cancel(disp->socket, disp->task,
+                               isc_socket_cancel(disp->socket, disp->task[0],
                                                  ISC_SOCKCANCEL_RECV);
                }
        }
@@ -2412,7 +3207,7 @@ dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) {
        INSIST(sevent->n <= disp->mgr->buffersize);
        newsevent = (isc_socketevent_t *)
                    isc_event_allocate(disp->mgr->mctx, NULL,
-                                     DNS_EVENT_IMPORTRECVDONE, udp_recv,
+                                     DNS_EVENT_IMPORTRECVDONE, udp_shrecv,
                                      disp, sizeof(isc_socketevent_t));
        if (newsevent == NULL)
                return;
@@ -2432,7 +3227,7 @@ dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) {
        newsevent->pktinfo = sevent->pktinfo;
        newsevent->attributes = sevent->attributes;
 
-       isc_task_send(disp->task, ISC_EVENT_PTR(&newsevent));
+       isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent));
 }
 
 #if 0
index d1531ba1ca7b6fca9d8e6b6e22b4cf98a5022953..afd0a5ee2c2c9699040e5b9a933ee91cb1d7f27e 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: dispatch.h,v 1.56.128.3 2008/05/27 22:36:11 each Exp $ */
+/* $Id: dispatch.h,v 1.56.128.4 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef DNS_DISPATCH_H
 #define DNS_DISPATCH_H 1
@@ -105,7 +105,7 @@ struct dns_dispatchevent {
  *     The dispatcher is a TCP or UDP socket.
  *
  * _IPV4, _IPV6
- *     The dispatcher uses an ipv4 or ipv6 socket.
+ *     The dispatcher uses an IPv4 or IPv6 socket.
  *
  * _NOLISTEN
  *     The dispatcher should not listen on the socket.
@@ -115,7 +115,12 @@ struct dns_dispatchevent {
  *     accept replies from them.
  *
  * _RANDOMPORT
- *     Allocate UDP port randomly.
+ *     Previously used to indicate that the port of a dispatch UDP must be
+ *     chosen randomly.  This behavior now always applies and the attribute
+ *     is obsoleted.
+ *
+ * _EXCLUSIVE
+ *     A separate socket will be used on-demand for each transaction. 
  */
 #define DNS_DISPATCHATTR_PRIVATE       0x00000001U
 #define DNS_DISPATCHATTR_TCP           0x00000002U
@@ -125,7 +130,8 @@ struct dns_dispatchevent {
 #define DNS_DISPATCHATTR_NOLISTEN      0x00000020U
 #define DNS_DISPATCHATTR_MAKEQUERY     0x00000040U
 #define DNS_DISPATCHATTR_CONNECTED     0x00000080U
-#define DNS_DISPATCHATTR_RANDOMPORT    0x00000100U
+/*#define DNS_DISPATCHATTR_RANDOMPORT  0x00000100U*/
+#define DNS_DISPATCHATTR_EXCLUSIVE     0x00000200U
 /*@}*/
 
 isc_result_t
@@ -189,23 +195,33 @@ void
 dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
                                 dns_portlist_t *portlist);
 /*%<
- * Sets a list of UDP ports that won't be used when creating a udp
- * dispatch with a wildcard port.
+ * This function is deprecated.  Use dns_dispatchmgr_setavailports() instead.
  *
  * Requires:
  *\li  mgr is a valid dispatchmgr
- *\li  portlist to be NULL or a valid port list.
  */
 
 dns_portlist_t *
 dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr);
 /*%<
- * Return the current port list.
+ * This function is deprecated and always returns NULL.
  *
  * Requires:
  *\li  mgr is a valid dispatchmgr
  */
 
+isc_result_t
+dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
+                             isc_portset_t *v6portset);
+/*%<
+ * Sets a list of UDP ports that can be used for outgoing UDP messages.
+ *
+ * Requires:
+ *\li  mgr is a valid dispatchmgr
+ *\li  v4portset is NULL or a valid port set
+ *\li  v6portset is NULL or a valid port set
+ */
+
 void
 dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, dns_stats_t *stats);
 /*%<
@@ -331,6 +347,12 @@ dns_dispatch_starttcp(dns_dispatch_t *disp);
  *\li  'disp' is valid.
  */
 
+isc_result_t
+dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+                         isc_task_t *task, isc_taskaction_t action, void *arg,
+                         isc_uint16_t *idp, dns_dispentry_t **resp,
+                         isc_socketmgr_t *sockmgr);
+
 isc_result_t
 dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
                         isc_task_t *task, isc_taskaction_t action, void *arg,
@@ -354,6 +376,10 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
  *
  *\li  "resp" be non-NULL and *resp be NULL
  *
+ *\li  "sockmgr" be NULL or a valid socket manager.  If 'disp' has
+ *     the DNS_DISPATCHATTR_EXCLUSIVE attribute, this must not be NULL,
+ *     which also means dns_dispatch_addresponse() cannot be used.
+ *
  * Ensures:
  *
  *\li  &lt;id, dest> is a unique tuple.  That means incoming messages
@@ -384,6 +410,8 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp,
  *     argument to dns_dispatch_addresponse() when allocating '*resp'.
  */
 
+isc_socket_t *
+dns_dispatch_getentrysocket(dns_dispentry_t *resp);
 
 isc_socket_t *
 dns_dispatch_getsocket(dns_dispatch_t *disp);
@@ -421,6 +449,16 @@ dns_dispatch_cancel(dns_dispatch_t *disp);
  *\li  disp is valid.
  */
 
+unsigned int
+dns_dispatch_getattributes(dns_dispatch_t *disp);
+/*%<
+ * Return the attributes (DNS_DISPATCHATTR_xxx) of this dispatch.  Only the
+ * non-changeable attributes are expected to be referenced by the caller. 
+ *
+ * Requires:
+ *\li  disp is valid.
+ */
+
 void
 dns_dispatch_changeattributes(dns_dispatch_t *disp,
                              unsigned int attributes, unsigned int mask);
index 8c5c7afde6947e4b7ec7d1f010832c9a2d4b1f06..729cecc3ce7d81eb4dde54a33c13b74ad797f6a6 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: resolver.h,v 1.56.128.2 2008/04/03 06:08:27 tbox Exp $ */
+/* $Id: resolver.h,v 1.56.128.3 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef DNS_RESOLVER_H
 #define DNS_RESOLVER_H 1
@@ -107,8 +107,6 @@ typedef struct dns_fetchevent {
 
 #define DNS_RESOLVER_CHECKNAMES                0x01
 #define DNS_RESOLVER_CHECKNAMESFAIL    0x02
-#define DNS_RESOLVER_USEDISPATCHPOOL4  0x04
-#define DNS_RESOLVER_USEDISPATCHPOOL6  0x08
 
 isc_result_t
 dns_resolver_create(dns_view_t *view,
index dd3e670020302f6b10ff8ebe24e7017891ee4e60..9cf269c7fb12a488924dc01a748f7f4099668eed 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: request.c,v 1.79 2007/06/19 23:47:16 tbox Exp $ */
+/* $Id: request.c,v 1.79.128.1 2008/06/24 00:09:11 jinmei Exp $ */
 
 /*! \file */
 
@@ -121,6 +121,7 @@ static isc_result_t req_render(dns_message_t *message, isc_buffer_t **buffer,
 static void req_senddone(isc_task_t *task, isc_event_t *event);
 static void req_response(isc_task_t *task, isc_event_t *event);
 static void req_timeout(isc_task_t *task, isc_event_t *event);
+static isc_socket_t * req_getsocket(dns_request_t *request);
 static void req_connected(isc_task_t *task, isc_event_t *event);
 static void req_sendevent(dns_request_t *request, isc_result_t result);
 static void req_cancel(dns_request_t *request);
@@ -146,6 +147,7 @@ dns_requestmgr_create(isc_mem_t *mctx,
        isc_socket_t *socket;
        isc_result_t result;
        int i;
+       unsigned int dispattr;
 
        req_log(ISC_LOG_DEBUG(3), "dns_requestmgr_create");
 
@@ -154,13 +156,14 @@ dns_requestmgr_create(isc_mem_t *mctx,
        REQUIRE(socketmgr != NULL);
        REQUIRE(taskmgr != NULL);
        REQUIRE(dispatchmgr != NULL);
+       UNUSED(socket);
        if (dispatchv4 != NULL) {
-               socket = dns_dispatch_getsocket(dispatchv4);
-               REQUIRE(isc_socket_gettype(socket) == isc_sockettype_udp);
+               dispattr = dns_dispatch_getattributes(dispatchv4);
+               REQUIRE((dispattr & DNS_DISPATCHATTR_UDP) != 0);
        }
        if (dispatchv6 != NULL) {
-               socket = dns_dispatch_getsocket(dispatchv6);
-               REQUIRE(isc_socket_gettype(socket) == isc_sockettype_udp);
+               dispattr = dns_dispatch_getattributes(dispatchv6);
+               REQUIRE((dispattr & DNS_DISPATCHATTR_UDP) != 0);
        }
 
        requestmgr = isc_mem_get(mctx, sizeof(*requestmgr));
@@ -425,12 +428,19 @@ req_send(dns_request_t *request, isc_task_t *task, isc_sockaddr_t *address) {
        isc_region_t r;
        isc_socket_t *socket;
        isc_result_t result;
+       unsigned int dispattr;
 
        req_log(ISC_LOG_DEBUG(3), "req_send: request %p", request);
 
        REQUIRE(VALID_REQUEST(request));
-       socket = dns_dispatch_getsocket(request->dispatch);
+       dispattr = dns_dispatch_getattributes(request->dispatch);
+       socket = req_getsocket(request);
        isc_buffer_usedregion(request->query, &r);
+       /*
+        * We could connect the socket when we are using an exclusive dispatch
+        * as we do in resolver.c, but we prefer implementation simplicity
+        * at this moment.
+        */
        result = isc_socket_sendto(socket, &r, task, req_senddone,
                                  request, address, NULL);
        if (result == ISC_R_SUCCESS)
@@ -742,14 +752,16 @@ dns_request_createraw3(dns_requestmgr_t *requestmgr, isc_buffer_t *msgbuf,
        if (result != ISC_R_SUCCESS)
                goto cleanup;
 
-       socket = dns_dispatch_getsocket(request->dispatch);
-       INSIST(socket != NULL);
-       result = dns_dispatch_addresponse(request->dispatch, destaddr, task,
-                                         req_response, request, &id,
-                                         &request->dispentry);
+       result = dns_dispatch_addresponse2(request->dispatch, destaddr, task,
+                                          req_response, request, &id,
+                                          &request->dispentry,
+                                          requestmgr->socketmgr);
        if (result != ISC_R_SUCCESS)
                goto cleanup;
 
+       socket = req_getsocket(request);
+       INSIST(socket != NULL);
+
        result = isc_buffer_allocate(mctx, &request->query,
                                     r.length + (tcp ? 2 : 0));
        if (result != ISC_R_SUCCESS)
@@ -935,13 +947,14 @@ dns_request_createvia3(dns_requestmgr_t *requestmgr, dns_message_t *message,
        if (result != ISC_R_SUCCESS)
                goto cleanup;
 
-       socket = dns_dispatch_getsocket(request->dispatch);
-       INSIST(socket != NULL);
-       result = dns_dispatch_addresponse(request->dispatch, destaddr, task,
-                                         req_response, request, &id,
-                                         &request->dispentry);
+       result = dns_dispatch_addresponse2(request->dispatch, destaddr, task,
+                                          req_response, request, &id,
+                                          &request->dispentry,
+                                          requestmgr->socketmgr);
        if (result != ISC_R_SUCCESS)
                goto cleanup;
+       socket = req_getsocket(request);
+       INSIST(socket != NULL);
 
        message->id = id;
        if (setkey) {
@@ -1226,6 +1239,21 @@ dns_request_destroy(dns_request_t **requestp) {
  *** Private: request.
  ***/
 
+static isc_socket_t *
+req_getsocket(dns_request_t *request) {
+       unsigned int dispattr;
+       isc_socket_t *socket;
+
+       dispattr = dns_dispatch_getattributes(request->dispatch);
+       if ((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+               INSIST(request->dispentry != NULL);
+               socket = dns_dispatch_getentrysocket(request->dispentry);
+       } else
+               socket = dns_dispatch_getsocket(request->dispatch);
+
+       return (socket);
+}
+
 static void
 req_connected(isc_task_t *task, isc_event_t *event) {
        isc_socketevent_t *sevent = (isc_socketevent_t *)event;
@@ -1425,6 +1453,7 @@ req_destroy(dns_request_t *request) {
 static void
 req_cancel(dns_request_t *request) {
        isc_socket_t *socket;
+       unsigned int dispattr;
 
        REQUIRE(VALID_REQUEST(request));
 
@@ -1437,16 +1466,23 @@ req_cancel(dns_request_t *request) {
 
        if (request->timer != NULL)
                isc_timer_detach(&request->timer);
+       dispattr = dns_dispatch_getattributes(request->dispatch);
+       socket = NULL;
+       if (DNS_REQUEST_CONNECTING(request) || DNS_REQUEST_SENDING(request)) {
+               if ((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+                       if (request->dispentry != NULL) {
+                               socket = dns_dispatch_getentrysocket(
+                                       request->dispentry);
+                       }
+               } else
+                       socket = dns_dispatch_getsocket(request->dispatch);
+               if (DNS_REQUEST_CONNECTING(request) && socket != NULL)
+                       isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_CONNECT);
+               if (DNS_REQUEST_SENDING(request) && socket != NULL)
+                       isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND);
+       }
        if (request->dispentry != NULL)
                dns_dispatch_removeresponse(&request->dispentry, NULL);
-       if (DNS_REQUEST_CONNECTING(request)) {
-               socket = dns_dispatch_getsocket(request->dispatch);
-               isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_CONNECT);
-       }
-       if (DNS_REQUEST_SENDING(request)) {
-               socket = dns_dispatch_getsocket(request->dispatch);
-               isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND);
-       }
        dns_dispatch_detach(&request->dispatch);
 }
 
index 419dda5148be3c8b42b390e71d5d44140eadb344..acc908fedcf82e3b203f0036c0ae884f06ab2a79 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: resolver.c,v 1.355.12.18 2008/06/17 22:36:03 jinmei Exp $ */
+/* $Id: resolver.c,v 1.355.12.19 2008/06/24 00:09:11 jinmei Exp $ */
 
 /*! \file */
 
@@ -124,6 +124,7 @@ typedef struct query {
        isc_mem_t *                     mctx;
        dns_dispatchmgr_t *             dispatchmgr;
        dns_dispatch_t *                dispatch;
+       isc_boolean_t                   exclusivesocket;
        dns_adbaddrinfo_t *             addrinfo;
        isc_socket_t *                  tcpsocket;
        isc_time_t                      start;
@@ -294,24 +295,6 @@ typedef struct alternate {
        ISC_LINK(struct alternate)      link;
 } alternate_t;
 
-#ifdef ISC_RWLOCK_USEATOMIC
-#define DNS_RESOLVER_USERWLOCK 1
-#else
-#define DNS_RESOLVER_USERWLOCK 0
-#endif
-
-#if DNS_RESOLVER_USERWLOCK
-#define RES_INITLOCK(l)                isc_rwlock_init((l), 0, 0)
-#define RES_DESTROYLOCK(l)      isc_rwlock_destroy(l)
-#define RES_LOCK(l, t)         RWLOCK((l), (t))
-#define RES_UNLOCK(l, t)       RWUNLOCK((l), (t))
-#else
-#define RES_INITLOCK(l)                isc_mutex_init(l)
-#define RES_DESTROYLOCK(l)      DESTROYLOCK(l)
-#define RES_LOCK(l, t)         LOCK(l)
-#define RES_UNLOCK(l, t)       UNLOCK(l)
-#endif
-
 struct dns_resolver {
        /* Unlocked. */
        unsigned int                    magic;
@@ -319,11 +302,6 @@ struct dns_resolver {
        isc_mutex_t                     lock;
        isc_mutex_t                     nlock;
        isc_mutex_t                     primelock;
-#if DNS_RESOLVER_USERWLOCK
-       isc_rwlock_t                    poollock;
-#else
-       isc_mutex_t                     poollock;
-#endif
        dns_rdataclass_t                rdclass;
        isc_socketmgr_t *               socketmgr;
        isc_timermgr_t *                timermgr;
@@ -333,7 +311,9 @@ struct dns_resolver {
        unsigned int                    options;
        dns_dispatchmgr_t *             dispatchmgr;
        dns_dispatch_t *                dispatchv4;
+       isc_boolean_t                   exclusivev4;
        dns_dispatch_t *                dispatchv6;
+       isc_boolean_t                   exclusivev6;
        unsigned int                    ndisps;
        unsigned int                    nbuckets;
        fctxbucket_t *                  buckets;
@@ -352,7 +332,6 @@ struct dns_resolver {
        unsigned int                    spillatmin;
        isc_timer_t *                   spillattimer;
        isc_boolean_t                   zero_no_soa_ttl;
-       isc_timer_t *                   disppooltimer;
 
        /* Locked by lock. */
        unsigned int                    references;
@@ -366,9 +345,6 @@ struct dns_resolver {
        dns_fetch_t *                   primefetch;
        /* Locked by nlock. */
        unsigned int                    nfctx;
-       /* Locked by poollock. */
-       dns_dispatch_t **               dispatchv4pool;
-       dns_dispatch_t **               dispatchv6pool;
 };
 
 #define RES_MAGIC                      ISC_MAGIC('R', 'e', 's', '!')
@@ -603,6 +579,7 @@ fctx_cancelquery(resquery_t **queryp, dns_dispatchevent_t **deventp,
        unsigned int factor;
        dns_adbfind_t *find;
        dns_adbaddrinfo_t *addrinfo;
+       isc_socket_t *socket;
 
        query = *queryp;
        fctx = query->fctx;
@@ -684,35 +661,48 @@ fctx_cancelquery(resquery_t **queryp, dns_dispatchevent_t **deventp,
                                                           0, factor);
        }
 
-       if (query->dispentry != NULL)
-               dns_dispatch_removeresponse(&query->dispentry, deventp);
-
-       ISC_LIST_UNLINK(fctx->queries, query, link);
-
-       if (query->tsig != NULL)
-               isc_buffer_free(&query->tsig);
-
-       if (query->tsigkey != NULL)
-               dns_tsigkey_detach(&query->tsigkey);
-
        /*
         * Check for any outstanding socket events.  If they exist, cancel
         * them and let the event handlers finish the cleanup.  The resolver
         * only needs to worry about managing the connect and send events;
         * the dispatcher manages the recv events.
         */
-       if (RESQUERY_CONNECTING(query))
+       if (RESQUERY_CONNECTING(query)) {
                /*
                 * Cancel the connect.
                 */
-               isc_socket_cancel(query->tcpsocket, NULL,
-                                 ISC_SOCKCANCEL_CONNECT);
-       else if (RESQUERY_SENDING(query))
+               if (query->tcpsocket != NULL) {
+                       isc_socket_cancel(query->tcpsocket, NULL,
+                                         ISC_SOCKCANCEL_CONNECT);
+               } else if (query->dispentry != NULL) {
+                       INSIST(query->exclusivesocket);
+                       socket = dns_dispatch_getentrysocket(query->dispentry);
+                       if (socket != NULL)
+                               isc_socket_cancel(socket, NULL,
+                                                 ISC_SOCKCANCEL_CONNECT);
+               }
+       } else if (RESQUERY_SENDING(query)) {
                /*
                 * Cancel the pending send.
                 */
-               isc_socket_cancel(dns_dispatch_getsocket(query->dispatch),
-                                 NULL, ISC_SOCKCANCEL_SEND);
+               if (query->exclusivesocket && query->dispentry != NULL)
+                       socket = dns_dispatch_getentrysocket(query->dispentry);
+               else
+                       socket = dns_dispatch_getsocket(query->dispatch);
+               if (socket != NULL)
+                       isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND);
+       }
+
+       if (query->dispentry != NULL)
+               dns_dispatch_removeresponse(&query->dispentry, deventp);
+
+       ISC_LIST_UNLINK(fctx->queries, query, link);
+
+       if (query->tsig != NULL)
+               isc_buffer_free(&query->tsig);
+
+       if (query->tsigkey != NULL)
+               dns_tsigkey_detach(&query->tsigkey);
 
        if (query->dispatch != NULL)
                dns_dispatch_detach(&query->dispatch);
@@ -912,43 +902,25 @@ fctx_done(fetchctx_t *fctx, isc_result_t result) {
 }
 
 static void
-resquery_senddone(isc_task_t *task, isc_event_t *event) {
+process_sendevent(resquery_t *query, isc_event_t *event) {
        isc_socketevent_t *sevent = (isc_socketevent_t *)event;
-       resquery_t *query = event->ev_arg;
        isc_boolean_t retry = ISC_FALSE;
        isc_result_t result;
        fetchctx_t *fctx;
 
-       REQUIRE(event->ev_type == ISC_SOCKEVENT_SENDDONE);
-
-       QTRACE("senddone");
-
-       /*
-        * XXXRTH
-        *
-        * Currently we don't wait for the senddone event before retrying
-        * a query.  This means that if we get really behind, we may end
-        * up doing extra work!
-        */
-
-       UNUSED(task);
-
-       INSIST(RESQUERY_SENDING(query));
-
-       query->sends--;
        fctx = query->fctx;
 
        if (RESQUERY_CANCELED(query)) {
-               if (query->sends == 0) {
+               if (query->sends == 0 && query->connects == 0) {
                        /*
                         * This query was canceled while the
-                        * isc_socket_sendto() was in progress.
+                        * isc_socket_sendto/connect() was in progress.
                         */
                        if (query->tcpsocket != NULL)
                                isc_socket_detach(&query->tcpsocket);
                        resquery_destroy(&query);
                }
-       } else
+       } else {
                switch (sevent->result) {
                case ISC_R_SUCCESS:
                        break;
@@ -970,6 +942,7 @@ resquery_senddone(isc_task_t *task, isc_event_t *event) {
                        fctx_cancelquery(&query, NULL, NULL, ISC_FALSE);
                        break;
                }
+       }
 
        isc_event_free(&event);
 
@@ -987,6 +960,48 @@ resquery_senddone(isc_task_t *task, isc_event_t *event) {
        }
 }
 
+static void
+resquery_udpconnected(isc_task_t *task, isc_event_t *event) {
+       resquery_t *query = event->ev_arg;
+
+       REQUIRE(event->ev_type == ISC_SOCKEVENT_CONNECT);
+
+       QTRACE("udpconnected");
+
+       UNUSED(task);
+
+       INSIST(RESQUERY_CONNECTING(query));
+
+       query->connects--;
+
+       process_sendevent(query, event);
+}
+
+static void
+resquery_senddone(isc_task_t *task, isc_event_t *event) {
+       resquery_t *query = event->ev_arg;
+
+       REQUIRE(event->ev_type == ISC_SOCKEVENT_SENDDONE);
+
+       QTRACE("senddone");
+
+       /*
+        * XXXRTH
+        *
+        * Currently we don't wait for the senddone event before retrying
+        * a query.  This means that if we get really behind, we may end
+        * up doing extra work!
+        */
+
+       UNUSED(task);
+
+       INSIST(RESQUERY_SENDING(query));
+
+       query->sends--;
+
+       process_sendevent(query, event);
+}
+
 static inline isc_result_t
 fctx_addopt(dns_message_t *message, unsigned int version,
            isc_uint16_t udpsize, isc_boolean_t request_nsid)
@@ -1139,6 +1154,7 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo,
         */
        query->dispatchmgr = res->dispatchmgr;
        query->dispatch = NULL;
+       query->exclusivesocket = ISC_FALSE;
        query->tcpsocket = NULL;
        if (res->view->peers != NULL) {
                dns_peer_t *peer = NULL;
@@ -1221,53 +1237,21 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo,
                        if (result != ISC_R_SUCCESS)
                                goto cleanup_query;
                } else {
-                       isc_sockaddr_t localaddr;
-                       unsigned int attrs, attrmask;
-                       dns_dispatch_t *disp_base;
-
-                       attrs = 0;
-                       attrs |= DNS_DISPATCHATTR_UDP;
-                       attrs |= DNS_DISPATCHATTR_RANDOMPORT;
-
-                       attrmask = 0;
-                       attrmask |= DNS_DISPATCHATTR_UDP;
-                       attrmask |= DNS_DISPATCHATTR_TCP;
-                       attrmask |= DNS_DISPATCHATTR_IPV4;
-                       attrmask |= DNS_DISPATCHATTR_IPV6;
-
                        switch (isc_sockaddr_pf(&addrinfo->sockaddr)) {
-                       case AF_INET:
-                               disp_base = res->dispatchv4;
-                               attrs |= DNS_DISPATCHATTR_IPV4;
+                       case PF_INET:
+                               dns_dispatch_attach(res->dispatchv4,
+                                                   &query->dispatch);
+                               query->exclusivesocket = res->exclusivev4;
                                break;
-                       case AF_INET6:
-                               disp_base = res->dispatchv6;
-                               attrs |= DNS_DISPATCHATTR_IPV6;
+                       case PF_INET6:
+                               dns_dispatch_attach(res->dispatchv6,
+                                                   &query->dispatch);
+                               query->exclusivesocket = res->exclusivev6;
                                break;
                        default:
                                result = ISC_R_NOTIMPLEMENTED;
                                goto cleanup_query;
                        }
-
-                       result = dns_dispatch_getlocaladdress(disp_base,
-                                                             &localaddr);
-                       if (result != ISC_R_SUCCESS)
-                               goto cleanup_query;
-                       if (isc_sockaddr_getport(&localaddr) == 0) {
-                               result = dns_dispatch_getudp(res->dispatchmgr,
-                                                            res->socketmgr,
-                                                            res->taskmgr,
-                                                            &localaddr,
-                                                            4096, 1000, 32768,
-                                                            16411, 16433,
-                                                            attrs, attrmask,
-                                                            &query->dispatch);
-                               if (result != ISC_R_SUCCESS)
-                                       goto cleanup_query;
-                       } else {
-                               dns_dispatch_attach(disp_base,
-                                                   &query->dispatch);
-                       }
                }
                /*
                 * We should always have a valid dispatcher here.  If we
@@ -1458,13 +1442,14 @@ resquery_send(resquery_t *query) {
        /*
         * Get a query id from the dispatch.
         */
-       result = dns_dispatch_addresponse(query->dispatch,
-                                         &query->addrinfo->sockaddr,
-                                         task,
-                                         resquery_response,
-                                         query,
-                                         &query->id,
-                                         &query->dispentry);
+       result = dns_dispatch_addresponse2(query->dispatch,
+                                          &query->addrinfo->sockaddr,
+                                          task,
+                                          resquery_response,
+                                          query,
+                                          &query->id,
+                                          &query->dispentry,
+                                          res->socketmgr);
        if (result != ISC_R_SUCCESS)
                goto cleanup_temps;
 
@@ -1681,12 +1666,24 @@ resquery_send(resquery_t *query) {
         */
        dns_message_reset(fctx->qmessage, DNS_MESSAGE_INTENTRENDER);
 
-       socket = dns_dispatch_getsocket(query->dispatch);
+       if (query->exclusivesocket)
+               socket = dns_dispatch_getentrysocket(query->dispentry);
+       else
+               socket = dns_dispatch_getsocket(query->dispatch);
        /*
         * Send the query!
         */
-       if ((query->options & DNS_FETCHOPT_TCP) == 0)
+       if ((query->options & DNS_FETCHOPT_TCP) == 0) {
                address = &query->addrinfo->sockaddr;
+               if (query->exclusivesocket) {
+                       result = isc_socket_connect(socket, address, task,
+                                                   resquery_udpconnected,
+                                                   query);
+                       if (result != ISC_R_SUCCESS)
+                               goto cleanup_message;
+                       query->connects++;
+               }
+       }
        isc_buffer_usedregion(buffer, &r);
 
        /*
@@ -2788,6 +2785,8 @@ fctx_destroy(fetchctx_t *fctx) {
 static void
 fctx_timeout(isc_task_t *task, isc_event_t *event) {
        fetchctx_t *fctx = event->ev_arg;
+       isc_timerevent_t *tevent = (isc_timerevent_t *)event;
+       resquery_t *query;
 
        REQUIRE(VALID_FCTX(fctx));
 
@@ -2803,8 +2802,18 @@ fctx_timeout(isc_task_t *task, isc_event_t *event) {
                fctx->timeouts++;
                /*
                 * We could cancel the running queries here, or we could let
-                * them keep going.  Right now we choose the latter...
+                * them keep going.  Since we normally use separate sockets for
+                * different queries, we adopt the former approach to reduce
+                * the number of open sockets: cancel the oldest query if it
+                * expired after the query had started (this is usually the
+                * case but is not always so, depending on the task schedule
+                * timing).
                 */
+               query = ISC_LIST_HEAD(fctx->queries);
+               if (query != NULL &&
+                   isc_time_compare(&tevent->due, &query->start) >= 0) {
+                       fctx_cancelquery(&query, NULL, NULL, ISC_TRUE);
+               }
                fctx->attributes &= ~FCTX_ATTR_ADDRWAIT;
                /*
                 * Our timer has triggered.  Reestablish the fctx lifetime
@@ -5668,6 +5677,19 @@ resquery_response(isc_task_t *task, isc_event_t *event) {
                         * There's no hope for this query.
                         */
                        keep_trying = ISC_TRUE;
+
+                       /*
+                        * If this is a network error on an exclusive query
+                        * socket, mark the server as bad so that we won't try
+                        * it for this fetch again.
+                        */
+                       if (query->exclusivesocket &&
+                           (devent->result == ISC_R_HOSTUNREACH ||
+                            devent->result == ISC_R_NETUNREACH ||
+                            devent->result == ISC_R_CONNREFUSED ||
+                            devent->result == ISC_R_CANCELED)) {
+                                   broken_server = devent->result;
+                       }
                }
                goto done;
        }
@@ -6271,7 +6293,6 @@ destroy(dns_resolver_t *res) {
 
        INSIST(res->nfctx == 0);
 
-       RES_DESTROYLOCK(&res->poollock);
        DESTROYLOCK(&res->primelock);
        DESTROYLOCK(&res->nlock);
        DESTROYLOCK(&res->lock);
@@ -6288,26 +6309,12 @@ destroy(dns_resolver_t *res) {
                dns_dispatch_detach(&res->dispatchv4);
        if (res->dispatchv6 != NULL)
                dns_dispatch_detach(&res->dispatchv6);
-       if (res->dispatchv4pool != NULL) {
-               for (i = 0; i < res->ndisps; i++)
-                       dns_dispatch_detach(&res->dispatchv4pool[i]);
-               isc_mem_put(res->mctx, res->dispatchv4pool,
-                           res->ndisps * sizeof(dns_dispatch_t *));
-       }
-       if (res->dispatchv6pool != NULL) {
-               for (i = 0; i < res->ndisps; i++)
-                       dns_dispatch_detach(&res->dispatchv6pool[i]);
-               isc_mem_put(res->mctx, res->dispatchv6pool,
-                           res->ndisps * sizeof(dns_dispatch_t *));
-       }
        while ((a = ISC_LIST_HEAD(res->alternates)) != NULL) {
                ISC_LIST_UNLINK(res->alternates, a, link);
                if (!a->isaddress)
                        dns_name_free(&a->_u._n.name, res->mctx);
                isc_mem_put(res->mctx, a, sizeof(*a));
        }
-       if (res->disppooltimer != NULL)
-               isc_timer_detach(&res->disppooltimer);
        dns_resolver_reset_algorithms(res);
        dns_resolver_resetmustbesecure(res);
 #if USE_ALGLOCK
@@ -6404,6 +6411,7 @@ dns_resolver_create(dns_view_t *view,
        unsigned int i, buckets_created = 0;
        isc_task_t *task = NULL;
        char name[16];
+       unsigned dispattr;
 
        /*
         * Create a resolver.
@@ -6438,9 +6446,6 @@ dns_resolver_create(dns_view_t *view,
        res->zero_no_soa_ttl = ISC_FALSE;
        res->ndisps = 0;
        res->nextdisp = 0; /* meaningless at this point, but init it */
-       res->dispatchv4pool = NULL;
-       res->dispatchv6pool = NULL;
-       res->disppooltimer = NULL;
        res->nbuckets = ntasks;
        res->activebuckets = ntasks;
        res->buckets = isc_mem_get(view->mctx,
@@ -6484,12 +6489,20 @@ dns_resolver_create(dns_view_t *view,
        }
 
        res->dispatchv4 = NULL;
-       if (dispatchv4 != NULL)
-                       dns_dispatch_attach(dispatchv4, &res->dispatchv4);
+       if (dispatchv4 != NULL) {
+               dns_dispatch_attach(dispatchv4, &res->dispatchv4);
+               dispattr = dns_dispatch_getattributes(dispatchv4);
+               res->exclusivev4 =
+                       ISC_TF((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0);
+       }
 
        res->dispatchv6 = NULL;
-       if (dispatchv6 != NULL)
+       if (dispatchv6 != NULL) {
                dns_dispatch_attach(dispatchv6, &res->dispatchv6);
+               dispattr = dns_dispatch_getattributes(dispatchv6);
+               res->exclusivev6 =
+                       ISC_TF((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0);
+       }
 
        res->references = 1;
        res->exiting = ISC_FALSE;
@@ -6511,21 +6524,17 @@ dns_resolver_create(dns_view_t *view,
        if (result != ISC_R_SUCCESS)
                goto cleanup_nlock;
 
-       result = RES_INITLOCK(&res->poollock);
-       if (result != ISC_R_SUCCESS)
-               goto cleanup_primelock;
-
        task = NULL;
        result = isc_task_create(taskmgr, 0, &task);
        if (result != ISC_R_SUCCESS)
-               goto cleanup_poollock;
+               goto cleanup_primelock;
 
        result = isc_timer_create(timermgr, isc_timertype_inactive, NULL, NULL,
                                  task, spillattimer_countdown, res,
                                  &res->spillattimer);
        isc_task_detach(&task);
        if (result != ISC_R_SUCCESS)
-               goto cleanup_poollock;
+               goto cleanup_primelock;
 
 #if USE_ALGLOCK
        result = isc_rwlock_init(&res->alglock, 0, 0);
@@ -6555,9 +6564,6 @@ dns_resolver_create(dns_view_t *view,
        isc_timer_detach(&res->spillattimer);
 #endif
 
- cleanup_poollock:
-       RES_DESTROYLOCK(&res->poollock);
-
  cleanup_primelock:
        DESTROYLOCK(&res->primelock);
 
@@ -6779,12 +6785,12 @@ dns_resolver_shutdown(dns_resolver_t *res) {
                             fctx != NULL;
                             fctx = ISC_LIST_NEXT(fctx, link))
                                fctx_shutdown(fctx);
-                       if (res->dispatchv4 != NULL) {
+                       if (res->dispatchv4 != NULL && !res->exclusivev4) {
                                sock = dns_dispatch_getsocket(res->dispatchv4);
                                isc_socket_cancel(sock, res->buckets[i].task,
                                                  ISC_SOCKCANCEL_ALL);
                        }
-                       if (res->dispatchv6 != NULL) {
+                       if (res->dispatchv6 != NULL && !res->exclusivev6) {
                                sock = dns_dispatch_getsocket(res->dispatchv6);
                                isc_socket_cancel(sock, res->buckets[i].task,
                                                  ISC_SOCKCANCEL_ALL);
@@ -7468,245 +7474,3 @@ dns_resolver_getoptions(dns_resolver_t *resolver) {
 
        return (resolver->options);
 }
-
-static void
-disppooltimer_update(isc_task_t *task, isc_event_t *event) {
-       dns_resolver_t *res = event->ev_arg;
-       isc_sockaddr_t addr4, addr6;
-       dns_dispatch_t *disp4 = NULL, *disp6 = NULL;
-       isc_result_t result;
-       unsigned int nxt;
-       unsigned int attrs_base, attrs, attrmask;
-
-       REQUIRE(VALID_RESOLVER(res));
-       REQUIRE((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0 ||
-               (res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0);
-
-       UNUSED(task);
-       isc_event_free(&event);
-
-       LOCK(&res->lock);
-       nxt = res->nextdisp++;
-       if (res->nextdisp == res->ndisps)
-               res->nextdisp = 0;
-       UNLOCK(&res->lock);
-
-       attrs_base = 0;
-       attrs_base |= DNS_DISPATCHATTR_UDP;
-       attrs_base |= DNS_DISPATCHATTR_RANDOMPORT;
-
-       attrmask = 0;
-       attrmask |= DNS_DISPATCHATTR_UDP;
-       attrmask |= DNS_DISPATCHATTR_TCP;
-       attrmask |= DNS_DISPATCHATTR_IPV4;
-       attrmask |= DNS_DISPATCHATTR_IPV6;
-
-       RES_LOCK(&res->poollock, isc_rwlocktype_read);
-       if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
-               result = dns_dispatch_getlocaladdress(res->dispatchv4pool[nxt],
-                                                     &addr4);
-               INSIST(result == ISC_R_SUCCESS);
-       }
-       if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
-               result = dns_dispatch_getlocaladdress(res->dispatchv6pool[nxt],
-                                                     &addr6);
-               INSIST(result == ISC_R_SUCCESS);
-       }
-       RES_UNLOCK(&res->poollock, isc_rwlocktype_read);
-
-       if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
-               attrs = attrs_base;
-               attrs |= DNS_DISPATCHATTR_IPV4;
-
-               result = dns_dispatch_getudp(res->dispatchmgr,
-                                            res->socketmgr,
-                                            res->taskmgr, &addr4,
-                                            4096, 1000, 32768, 16411,
-                                            16433, attrs, attrmask,
-                                            &disp4);
-               if (result != ISC_R_SUCCESS) {
-                       isc_log_write(dns_lctx, DNS_LOGCATEGORY_RESOLVER,
-                                     DNS_LOGMODULE_RESOLVER, ISC_LOG_ERROR,
-                                     "could not update an IPv4 random query "
-                                     "port: %s",
-                                     isc_result_totext(result));
-                       /* keep the old one */
-               }
-
-               /*
-                * We don't try to ensure the new dispatch is unique (see the
-                * comments in dns_resolver_createdispatchpool()).
-                */
-       }
-       if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
-               attrs = attrs_base;
-               attrs |= DNS_DISPATCHATTR_IPV6;
-
-               result = dns_dispatch_getudp(res->dispatchmgr,
-                                            res->socketmgr,
-                                            res->taskmgr, &addr6,
-                                            4096, 1000, 32768, 16411,
-                                            16433, attrs, attrmask,
-                                            &disp6);
-               if (result != ISC_R_SUCCESS) {
-                       isc_log_write(dns_lctx, DNS_LOGCATEGORY_RESOLVER,
-                                     DNS_LOGMODULE_RESOLVER, ISC_LOG_ERROR,
-                                     "could not update an IPv6 random query "
-                                     "port: %s",
-                                     isc_result_totext(result));
-               }
-       }
-
-       RES_LOCK(&res->poollock, isc_rwlocktype_write);
-       if (disp4 != NULL) {
-               dns_dispatch_detach(&res->dispatchv4pool[nxt]);
-               res->dispatchv4pool[nxt] = disp4;
-       }
-       if (disp6 != NULL) {
-               dns_dispatch_detach(&res->dispatchv6pool[nxt]);
-               res->dispatchv6pool[nxt] = disp6;
-       }
-       RES_UNLOCK(&res->poollock, isc_rwlocktype_write);
-
-       return;
-}
-
-isc_result_t
-dns_resolver_createdispatchpool(dns_resolver_t *res, unsigned int ndisps,
-                               unsigned int tick)
-{
-       unsigned int i;
-       isc_result_t result = ISC_R_SUCCESS;
-       unsigned int attrs_base, attrs, attrmask;
-       isc_sockaddr_t addr4, addr6;
-       dns_dispatch_t *disp;
-       isc_task_t *task;
-       isc_interval_t interval;
-
-       REQUIRE(VALID_RESOLVER(res));
-       REQUIRE(!res->frozen);  /* meaning we don't have to lock res */
-       REQUIRE(ndisps > 0);
-       REQUIRE((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0 ||
-               (res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0);
-
-       attrs_base = 0;
-       attrs_base |= DNS_DISPATCHATTR_UDP;
-       attrs_base |= DNS_DISPATCHATTR_RANDOMPORT;
-
-       attrmask = 0;
-       attrmask |= DNS_DISPATCHATTR_UDP;
-       attrmask |= DNS_DISPATCHATTR_TCP;
-       attrmask |= DNS_DISPATCHATTR_IPV4;
-       attrmask |= DNS_DISPATCHATTR_IPV6;
-
-       if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
-               INSIST(res->dispatchv4 != NULL);
-               result = dns_dispatch_getlocaladdress(res->dispatchv4, &addr4);
-               INSIST(result == ISC_R_SUCCESS &&
-                      isc_sockaddr_getport(&addr4) == 0);
-               res->dispatchv4pool = isc_mem_get(res->mctx,
-                                                 sizeof(dns_dispatch_t *) *
-                                                 ndisps);
-               if (res->dispatchv4pool == NULL)
-                       return (ISC_R_NOMEMORY);
-               for (i = 0; i < ndisps; i++)
-                       res->dispatchv4pool[i] = NULL;
-       }
-       if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
-               INSIST(res->dispatchv6 != NULL);
-               result = dns_dispatch_getlocaladdress(res->dispatchv6, &addr6);
-               INSIST(result == ISC_R_SUCCESS &&
-                      isc_sockaddr_getport(&addr6) == 0);
-               res->dispatchv6pool = isc_mem_get(res->mctx,
-                                                 sizeof(dns_dispatch_t *) *
-                                                 ndisps);
-               if (res->dispatchv6pool == NULL) {
-                       isc_mem_put(res->mctx, res->dispatchv4pool,
-                                   sizeof(dns_dispatch_t *) * ndisps);
-                       res->dispatchv4pool = NULL;
-                       return (ISC_R_NOMEMORY);
-               }
-               for (i = 0; i < ndisps; i++)
-                       res->dispatchv6pool[i] = NULL;
-       }
-
-       for (i = 0; i < ndisps; i++) {
-               if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
-                       attrs = attrs_base;
-                       attrs |= DNS_DISPATCHATTR_IPV4;
-
-                       disp = NULL;
-                       result = dns_dispatch_getudp(res->dispatchmgr,
-                                                    res->socketmgr,
-                                                    res->taskmgr, &addr4,
-                                                    4096, 1000, 32768, 16411,
-                                                    16433, attrs, attrmask,
-                                                    &disp);
-                       if (result != ISC_R_SUCCESS)
-                               goto cleanup;
-                       res->dispatchv4pool[i] = disp;
-
-                       /*
-                        * It might be better to ensure all ports are
-                        * different, but in practice it's probably okay to
-                        * assume dns_dispatch_getudp() made reasonable
-                        * choices.
-                        */
-               }
-               if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
-                       attrs = attrs_base;
-                       attrs |= DNS_DISPATCHATTR_IPV6;
-
-                       disp = NULL;
-                       result = dns_dispatch_getudp(res->dispatchmgr,
-                                                    res->socketmgr,
-                                                    res->taskmgr, &addr6,
-                                                    4096, 1000, 32768, 16411,
-                                                    16433, attrs, attrmask,
-                                                    &disp);
-                       if (result != ISC_R_SUCCESS)
-                               goto cleanup;
-
-                       res->dispatchv6pool[i] = disp;
-               }
-       }
-
-       /* start update timer */
-       if (tick != 0) {
-               task = NULL;
-               result = isc_task_create(res->taskmgr, 0, &task);
-               if (result != ISC_R_SUCCESS)
-                       goto cleanup;
-               isc_interval_set(&interval, tick, 0);
-               result = isc_timer_create(res->timermgr, isc_timertype_ticker,
-                                         NULL, &interval, task,
-                                         disppooltimer_update,
-                                         res, &res->disppooltimer);
-               isc_task_detach(&task);
-               if (result != ISC_R_SUCCESS)
-                       goto cleanup;
-       }
-
-       res->ndisps = ndisps;
-       res->nextdisp = 0;
-
-       return (result);
-
-  cleanup:
-       if (res->dispatchv4pool != NULL) {
-               for (i = 0; i < ndisps; i++)
-                       if (res->dispatchv4pool[i] != NULL)
-                               dns_dispatch_detach(&res->dispatchv4pool[i]);
-               isc_mem_put(res->mctx, res->dispatchv4pool,
-                           sizeof(dns_dispatch_t *) * ndisps);
-       }
-       if (res->dispatchv6pool != NULL) {
-               for (i = 0; i < ndisps; i++)
-                       if (res->dispatchv6pool[i] != NULL)
-                               dns_dispatch_detach(&res->dispatchv6pool[i]);
-               isc_mem_put(res->mctx, res->dispatchv6pool,
-                           sizeof(dns_dispatch_t *) * ndisps);
-       }
-
-       return (result);
-}
index e0a420c7fea072efe0f9ab05afbde1da3d623997..f05084962d080caea6fe7191a84e23faa35b4348 100644 (file)
@@ -13,7 +13,7 @@
 # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 # PERFORMANCE OF THIS SOFTWARE.
 
-# $Id: Makefile.in,v 1.93 2007/09/14 03:39:29 marka Exp $
+# $Id: Makefile.in,v 1.93.58.1 2008/06/24 00:09:12 jinmei Exp $
 
 srcdir =       @srcdir@
 VPATH =                @srcdir@
@@ -58,7 +58,7 @@ OBJS =                @ISC_EXTRA_OBJS@ \
                lex.@O@ lfsr.@O@ lib.@O@ log.@O@ \
                md5.@O@ mem.@O@ mutexblock.@O@ \
                netaddr.@O@ netscope.@O@ ondestroy.@O@ \
-               parseint.@O@ quota.@O@ radix.@O@ random.@O@ \
+               parseint.@O@ portset.@O@ quota.@O@ radix.@O@ random.@O@ \
                ratelimiter.@O@ refcount.@O@ region.@O@ result.@O@ rwlock.@O@ \
                serial.@O@ sha1.@O@ sha2.@O@ sockaddr.@O@ \
                string.@O@ strtoul.@O@ symtab.@O@ task.@O@ taskpool.@O@ \
@@ -73,7 +73,7 @@ SRCS =                @ISC_EXTRA_SRCS@ \
                lex.c lfsr.c lib.c log.c \
                md5.c mem.c mutexblock.c \
                netaddr.c netscope.c ondestroy.c \
-               parseint.c quota.c radix.c random.c \
+               parseint.c portset.c quota.c radix.c random.c \
                ratelimiter.c refcount.c region.c result.c rwlock.c \
                serial.c sha1.c sha2.c sockaddr.c string.c strtoul.c \
                symtab.c task.c taskpool.c timer.c version.c
index 8fd87ab1aac2cfe37b2d3acf1adf77a590c69516..21796b929281d95110b501cf535b9b8c00830705 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: platform.h.in,v 1.45.60.2 2008/01/24 23:46:26 tbox Exp $ */
+/* $Id: platform.h.in,v 1.45.60.3 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef ISC_PLATFORM_H
 #define ISC_PLATFORM_H 1
  */
 @ISC_PLATFORM_FIXIN6ISADDR@
 
+/*! \brief
+ * Define if the system supports kqueue multiplexing
+ */
+@ISC_PLATFORM_HAVEKQUEUE@
+
+/*! \brief
+ * Define if the system supports epoll multiplexing
+ */
+@ISC_PLATFORM_HAVEEPOLL@
+
+/*! \brief
+ * Define if the system supports /dev/poll multiplexing
+ */
+@ISC_PLATFORM_HAVEDEVPOLL@
+
 /*
  *** Printing.
  ***/
index 0d175c33e35f34eb96049fac967f4fddb211007c..818330e66830fc2ec77818e984605cbfac783d3c 100644 (file)
@@ -14,7 +14,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: portset.h,v 1.3 2008/06/23 23:47:11 tbox Exp $ */
+/* $Id: portset.h,v 1.3.2.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 /*! \file isc/portset.h
 * \brief Transport Protocol Port Manipulation Module
@@ -47,7 +47,7 @@ isc_result_t
 isc_portset_create(isc_mem_t *mctx, isc_portset_t **portsetp);
 /*%<
  * Create a port set and initialize it as an empty set.
- *
+ * 
  * Requires:
  *\li  'mctx' to be valid.
  *\li  'portsetp' to be non NULL and '*portsetp' to be NULL;
@@ -61,7 +61,7 @@ void
 isc_portset_destroy(isc_mem_t *mctx, isc_portset_t **portsetp);
 /*%<
  * Destroy a port set.
- *
+ * 
  * Requires:
  *\li  'mctx' to be valid and must be the same context given when the port set
  *       was created.
@@ -72,7 +72,7 @@ isc_boolean_t
 isc_portset_isset(isc_portset_t *portset, in_port_t port);
 /*%<
  * Test whether the given port is stored in the portset.
- *
+ * 
  * Requires:
  *\li  'portset' to be a valid set.
  *
@@ -84,7 +84,7 @@ unsigned int
 isc_portset_nports(isc_portset_t *portset);
 /*%<
  * Provides the number of ports stored in the given portset.
- *
+ * 
  * Requires:
  *\li  'portset' to be a valid set.
  *
index 83da044ceab8ba0c16e45c06e2fb7c8b181d9bb9..58cd1f023632f593967dff7cf8b32e33d30ab47d 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: socket.h,v 1.72.128.2 2008/06/04 23:46:32 tbox Exp $ */
+/* $Id: socket.h,v 1.72.128.3 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef ISC_SOCKET_H
 #define ISC_SOCKET_H 1
@@ -360,6 +360,45 @@ isc_socket_detach(isc_socket_t **socketp);
  *             All resources used by the socket have been freed
  */
 
+isc_result_t
+isc_socket_open(isc_socket_t *sock);
+/*%<
+ * Open a new socket file descriptor of the given socket structure.  It simply
+ * opens a new descriptor; all of the other parameters including the socket
+ * type are inherited from the existing socket.  This function is provided to
+ * avoid overhead of destroying and creating sockets when many short-lived
+ * sockets are frequently opened and closed.  When the efficiency is not an
+ * issue, it should be safer to detach the unused socket and re-create a new
+ * one.
+ *
+ * Requires:
+ *
+ * \li there must be no other reference to this socket.
+ *
+ * \li 'sock' is a valid socket previously closed by isc_socket_close().
+ *
+ * Returns:
+ *     Same as isc_socket_create().
+ */
+
+void
+isc_socket_close(isc_socket_t *sock);
+/*%<
+ * Close a socket file descriptor of the given socket structure.  This function
+ * is provided as an alternative to destroying an unused socket when the
+ * overhead of destroying/re-creating sockets can be significant, and is
+ * expected to be used with isc_socket_open().
+ *
+ * Requires:
+ *
+ * \li The socket must have a valid descriptor.
+ *
+ * \li There must be no other reference to this socket.
+ *
+ * \li There must be no pending I/O requests.
+ *             
+ */
+
 isc_result_t
 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *addressp);
 /*%<
index 69b9d2b097a364f7f3b7f93537f3b9044989c9f3..78b9653b4eb16f702f03c9535fe6f06e71cd91c3 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: timer.h,v 1.38 2007/06/19 23:47:18 tbox Exp $ */
+/* $Id: timer.h,v 1.38.128.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef ISC_TIMER_H
 #define ISC_TIMER_H 1
@@ -76,6 +76,7 @@
 #include <isc/event.h>
 #include <isc/eventclass.h>
 #include <isc/lang.h>
+#include <isc/time.h>
 
 ISC_LANG_BEGINDECLS
 
@@ -93,6 +94,7 @@ typedef enum {
 
 typedef struct isc_timerevent {
        struct isc_event        common;
+       isc_time_t              due;
 } isc_timerevent_t;
 
 #define ISC_TIMEREVENT_FIRSTEVENT      (ISC_EVENTCLASS_TIMER + 0)
index 7dc4f331eb0fc93ae6b8287b8c7dec1fcf4cbd19..9d129f2874b9395868b3b6623e0ecaa8b82698ce 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: types.h,v 1.43.128.2 2008/01/17 23:46:37 tbox Exp $ */
+/* $Id: types.h,v 1.43.128.3 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef ISC_TYPES_H
 #define ISC_TYPES_H 1
@@ -70,6 +70,7 @@ typedef struct isc_mempool            isc_mempool_t;          /*%< Memory Pool */
 typedef struct isc_msgcat              isc_msgcat_t;           /*%< Message Catalog */
 typedef struct isc_ondestroy           isc_ondestroy_t;        /*%< On Destroy */
 typedef struct isc_netaddr             isc_netaddr_t;          /*%< Net Address */
+typedef struct isc_portset             isc_portset_t;          /*%< Port Set */
 typedef struct isc_quota               isc_quota_t;            /*%< Quota */
 typedef struct isc_random              isc_random_t;           /*%< Random */
 typedef struct isc_ratelimiter         isc_ratelimiter_t;      /*%< Rate Limiter */
index e4c5da0f3995231337da64dcd6212894c007abe1..b4d5bc291678862fcf8aa57caa632494d687e51a 100644 (file)
@@ -14,7 +14,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: portset.c,v 1.2 2008/06/23 19:41:19 jinmei Exp $ */
+/* $Id: portset.c,v 1.2.2.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 /*! \file */
 #include <isc/mem.h>
index fb599796d81ab8ecbcb7ffe55beac81df33d3fe2..f7417ca1784130ddd3afd52f25631665c62c68f3 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: timer.c,v 1.81 2007/10/24 00:57:23 marka Exp $ */
+/* $Id: timer.c,v 1.81.32.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 /*! \file */
 
@@ -577,7 +577,7 @@ isc_timer_detach(isc_timer_t **timerp) {
 static void
 dispatch(isc_timermgr_t *manager, isc_time_t *now) {
        isc_boolean_t done = ISC_FALSE, post_event, need_schedule;
-       isc_event_t *event;
+       isc_timerevent_t *event;
        isc_eventtype_t type = 0;
        isc_timer_t *timer;
        isc_result_t result;
@@ -650,16 +650,18 @@ dispatch(isc_timermgr_t *manager, isc_time_t *now) {
                                /*
                                 * XXX We could preallocate this event.
                                 */
-                               event = isc_event_allocate(manager->mctx,
+                               event = (isc_timerevent_t *)isc_event_allocate(manager->mctx,
                                                           timer,
                                                           type,
                                                           timer->action,
                                                           timer->arg,
                                                           sizeof(*event));
 
-                               if (event != NULL)
-                                       isc_task_send(timer->task, &event);
-                               else
+                               if (event != NULL) {
+                                       event->due = timer->due;
+                                       isc_task_send(timer->task,
+                                                     (isc_event_t **)&event);
+                               } else
                                        UNEXPECTED_ERROR(__FILE__, __LINE__,
                                                 isc_msgcat_get(isc_msgcat,
                                                         ISC_MSGSET_TIMER,
index 675fd0006ee600ce65aec1fa59e98de13aeff414..9a8ed6b07ab300fdafecde1b06226957dd459f2a 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: app.c,v 1.54.128.3 2008/01/17 23:46:37 tbox Exp $ */
+/* $Id: app.c,v 1.54.128.4 2008/06/24 00:09:12 jinmei Exp $ */
 
 /*! \file */
 
@@ -30,6 +30,9 @@
 #include <unistd.h>
 #include <signal.h>
 #include <sys/time.h>
+#ifdef HAVE_EPOLL
+#include <sys/epoll.h>
+#endif
 
 #include <isc/app.h>
 #include <isc/boolean.h>
@@ -303,8 +306,7 @@ evloop() {
                int n;
                isc_time_t when, now;
                struct timeval tv, *tvp;
-               fd_set readfds, writefds;
-               int maxfd;
+               isc_socketwait_t *swait;
                isc_boolean_t readytasks;
                isc_boolean_t call_timer_dispatch = ISC_FALSE;
 
@@ -331,8 +333,8 @@ evloop() {
                        }
                }
 
-               isc__socketmgr_getfdsets(&readfds, &writefds, &maxfd);
-               n = select(maxfd, &readfds, &writefds, NULL, tvp);
+               swait = NULL;
+               n = isc__socketmgr_waitevents(tvp, &swait);
 
                if (n == 0 || call_timer_dispatch) {
                        /*
@@ -352,8 +354,7 @@ evloop() {
                        isc__timermgr_dispatch();
                }
                if (n > 0)
-                       (void)isc__socketmgr_dispatch(&readfds, &writefds,
-                                                     maxfd);
+                       (void)isc__socketmgr_dispatch(swait);
                (void)isc__taskmgr_dispatch();
 
                if (want_reload) {
index cb40de2c426346670c4f9850713935d9b98e49ce..08b7e217429d439c4f224688bd9230906b41f40e 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: net.h,v 1.46 2007/06/19 23:47:19 tbox Exp $ */
+/* $Id: net.h,v 1.46.128.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef ISC_NET_H
 #define ISC_NET_H 1
@@ -324,6 +324,23 @@ isc_net_probeunix(void);
  * Returns whether UNIX domain sockets are supported.
  */
 
+isc_result_t
+isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
+/*%<
+ * Returns system's default range of ephemeral UDP ports, if defined.
+ * If the range is not available or unknown, ISC_NET_PORTRANGELOW and
+ * ISC_NET_PORTRANGEHIGH will be returned.
+ *
+ * Requires:
+ *
+ *\li  'low' and 'high' must be non NULL.
+ *
+ * Returns:
+ *
+ *\li  *low and *high will be the ports specifying the low and high ends of
+ *     the range.
+ */
+
 #ifdef ISC_PLATFORM_NEEDNTOP
 const char *
 isc_net_ntop(int af, const void *src, char *dst, size_t size);
index b19ace9c9d5a0e5dd8c94ec53a7e29874fe3ba15..386eae01b30794eeefea382fdc7e81449591e3ba 100644 (file)
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: net.c,v 1.36 2007/09/13 04:45:18 each Exp $ */
+/* $Id: net.c,v 1.36.60.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 #include <config.h>
 
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
 #include <errno.h>
 #include <unistd.h>
 
 #include <isc/string.h>
 #include <isc/util.h>
 
+/*%
+ * Definitions about UDP port range specification.  This is a total mess of
+ * portability variants: some use sysctl (but the sysctl names vary), some use
+ * system-specific interfaces, some have the same interface for IPv4 and IPv6,
+ * some separate them, etc...
+ */
+
+/*%
+ * The last-resort defaults: use the entire non-well-known port space.
+ */
+#ifndef ISC_NET_PORTRANGELOW
+#define ISC_NET_PORTRANGELOW 1024
+#endif /* ISC_NET_PORTRANGELOW */
+#ifndef ISC_NET_PORTRANGEHIGH
+#define ISC_NET_PORTRANGEHIGH 65535
+#endif /* ISC_NET_PORTRANGEHIGH */
+
+/*%
+ * sysctl variants
+ */
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portrange.first"
+#define SYSCTL_V4PORTRANGE_HIGH        "net.inet.ip.portrange.last"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet.ip.portrange.first"
+#define SYSCTL_V6PORTRANGE_HIGH        "net.inet.ip.portrange.last"
+#endif
+
+#ifdef __NetBSD__
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.anonportmin"
+#define SYSCTL_V4PORTRANGE_HIGH        "net.inet.ip.anonportmax"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.portrange.first"
+#define SYSCTL_V6PORTRANGE_HIGH        "net.inet6.ip6.portrange.last"
+#endif
+
+#ifdef __OpenBSD__
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portfirst"
+#define SYSCTL_V4PORTRANGE_HIGH        "net.inet.ip.portlast"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.portrange.first"
+#define SYSCTL_V6PORTRANGE_HIGH        "net.inet6.ip6.portrange.last"
+#endif
+
 #if defined(ISC_PLATFORM_HAVEIPV6)
 # if defined(ISC_PLATFORM_NEEDIN6ADDRANY)
 const struct in6_addr isc_net_in6addrany = IN6ADDR_ANY_INIT;
@@ -338,6 +385,60 @@ isc_net_probe_ipv6pktinfo(void) {
        return (ipv6pktinfo_result);
 }
 
+#ifdef USE_SYSCTL_PORTRANGE
+/*%
+ * Query the system's UDP port range for address family 'af' through
+ * sysctlbyname().  The SYSCTL_V4PORTRANGE_* names are used for AF_INET and
+ * the SYSCTL_V6PORTRANGE_* names for any other family.
+ *
+ * On success the two ends of the range are stored in '*low' and '*high' and
+ * ISC_R_SUCCESS is returned.  ISC_R_FAILURE is returned if either sysctl
+ * lookup fails, and ISC_R_RANGE if a retrieved value does not fit in 16
+ * bits; '*low' and '*high' are left untouched in both error cases.
+ */
+static isc_result_t
+getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
+       int port_low, port_high;
+       size_t portlen;
+       const char *sysctlname_lowport, *sysctlname_hiport;
+
+       if (af == AF_INET) {
+               sysctlname_lowport = SYSCTL_V4PORTRANGE_LOW;
+               sysctlname_hiport = SYSCTL_V4PORTRANGE_HIGH;
+       } else {
+               sysctlname_lowport = SYSCTL_V6PORTRANGE_LOW;
+               sysctlname_hiport = SYSCTL_V6PORTRANGE_HIGH;
+       }
+
+       /*
+        * 'portlen' must describe the size of the output buffer (an int),
+        * not the size of the length variable itself: size_t may be wider
+        * than int, which would overstate the space available to the kernel.
+        * It is reset before each call because it is an in/out argument.
+        */
+       portlen = sizeof(port_low);
+       if (sysctlbyname(sysctlname_lowport, &port_low, &portlen,
+                        NULL, 0) < 0) {
+               return (ISC_R_FAILURE);
+       }
+       portlen = sizeof(port_high);
+       if (sysctlbyname(sysctlname_hiport, &port_high, &portlen,
+                        NULL, 0) < 0) {
+               return (ISC_R_FAILURE);
+       }
+
+       /* Reject anything that cannot be represented as a 16-bit port. */
+       if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0)
+               return (ISC_R_RANGE);
+
+       *low = (in_port_t)port_low;
+       *high = (in_port_t)port_high;
+
+       return (ISC_R_SUCCESS);
+}
+#endif
+
+/*
+ * Return the range of UDP ports to use for address family 'af' in '*low'
+ * and '*high'.  Where a sysctl-based lookup is compiled in
+ * (USE_SYSCTL_PORTRANGE) it is tried first; if it is unavailable or fails,
+ * the compile-time defaults ISC_NET_PORTRANGELOW and ISC_NET_PORTRANGEHIGH
+ * are returned instead.  Always returns ISC_R_SUCCESS.
+ */
+isc_result_t
+isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
+       /* Start from "failure" so the defaults apply when no lookup runs. */
+       isc_result_t result = ISC_R_FAILURE;
+
+       REQUIRE(low != NULL && high != NULL);
+
+#ifdef USE_SYSCTL_PORTRANGE
+       result = getudpportrange_sysctl(af, low, high);
+#else
+       UNUSED(af);
+#endif
+
+       if (result != ISC_R_SUCCESS) {
+               *low = ISC_NET_PORTRANGELOW;
+               *high = ISC_NET_PORTRANGEHIGH;
+       }
+
+       return (ISC_R_SUCCESS); /* we currently never fail in this function */
+}
+
 void
 isc_net_disableipv4(void) {
        initialize();
index aebd825f80d567293d8ad484e36b89118ca50f47..2d8febf70467e0240016ffbe265c9fe7b0ff042a 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: socket.c,v 1.275.10.4 2008/03/27 21:10:24 jinmei Exp $ */
+/* $Id: socket.c,v 1.275.10.5 2008/06/24 00:09:12 jinmei Exp $ */
 
 /*! \file */
 
@@ -25,9 +25,6 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
-#ifdef ISC_PLATFORM_HAVESYSUNH
-#include <sys/un.h>
-#endif
 #include <sys/time.h>
 #include <sys/uio.h>
 
 #include <isc/util.h>
 #include <isc/xml.h>
 
+#ifdef ISC_PLATFORM_HAVESYSUNH
+#include <sys/un.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEKQUEUE
+#include <sys/event.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEEPOLL
+#include <sys/epoll.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEDEVPOLL
+#include <sys/devpoll.h>
+#endif
+
 #include "errno2result.h"
 
 #ifndef ISC_PLATFORM_USETHREADS
 #include <sys/utsname.h>
 #endif
 
+/*%
+ * Choose the most preferable multiplex method. 
+ */
+#ifdef ISC_PLATFORM_HAVEKQUEUE
+#define USE_KQUEUE
+#elif defined (ISC_PLATFORM_HAVEEPOLL)
+#define USE_EPOLL
+#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
+#define USE_DEVPOLL
+typedef struct {
+       unsigned int want_read : 1,
+               want_write : 1;
+} pollinfo_t;
+#else
+#define USE_SELECT
+#endif /* ISC_PLATFORM_HAVEKQUEUE */
+
+#ifndef ISC_PLATFORM_USETHREADS
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+struct isc_socketwait {
+       int nevents;
+};
+#elif defined (USE_SELECT)
+struct isc_socketwait {
+       fd_set readset;
+       fd_set writeset;
+       int nfds;
+       int maxfd;
+};
+#endif /* USE_KQUEUE */
+#endif /* !ISC_PLATFORM_USETHREADS */
+
+/*%
+ * Maximum number of allowable open sockets.  This is also the maximum
+ * allowable socket file descriptor.  This definition is meaningless with
+ * USE_SELECT due to the API limitation of select(2).
+ */
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+#ifndef ISC_SOCKET_MAXSOCKETS
+#define ISC_SOCKET_MAXSOCKETS 4096
+#endif
+#endif /* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */
+
+/*%
+ * Size of per-FD lock buckets.
+ */
+#ifdef ISC_PLATFORM_USETHREADS
+#define FDLOCK_COUNT           1024
+#define FDLOCK_ID(fd)          ((fd) % FDLOCK_COUNT)
+#else
+#define FDLOCK_COUNT           1
+#define FDLOCK_ID(fd)          0
+#endif /* ISC_PLATFORM_USETHREADS */
+
+/*%
+ * Maximum number of events communicated with the kernel.  There should normally
+ * be no need for having a large number.
+ */
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+#ifndef ISC_SOCKET_MAXEVENTS
+#define ISC_SOCKET_MAXEVENTS   64
+#endif
+#endif
+
 /*%
  * Some systems define the socket length argument as an int, some as size_t,
  * some as socklen_t.  This is here so it can be easily changed if needed.
@@ -209,17 +283,44 @@ struct isc_socketmgr {
        unsigned int            magic;
        isc_mem_t              *mctx;
        isc_mutex_t             lock;
+       isc_mutex_t             *fdlock;
+#ifdef USE_KQUEUE
+       int                     kqueue_fd;
+       int                     nevents;
+       struct kevent           *events;
+#endif /* USE_KQUEUE */
+#ifdef USE_EPOLL
+       int                     epoll_fd;
+       int                     nevents;
+       struct epoll_event      *events;
+#endif /* USE_EPOLL */
+#ifdef USE_DEVPOLL
+       int                     devpoll_fd;
+       int                     nevents;
+       struct pollfd           *events;
+#endif /* USE_DEVPOLL */
+       unsigned int            maxsocks;
+#ifdef ISC_PLATFORM_USETHREADS
+       int                     pipe_fds[2];
+#endif
+
+       /* Locked by fdlock. */
+       isc_socket_t           **fds;
+       int                     *fdstate;
+#ifdef USE_DEVPOLL
+       pollinfo_t              *fdpollinfo;
+#endif
+
        /* Locked by manager lock. */
        ISC_LIST(isc_socket_t)  socklist;
+#ifdef USE_SELECT
        fd_set                  read_fds;
        fd_set                  write_fds;
-       isc_socket_t           *fds[FD_SETSIZE];
-       int                     fdstate[FD_SETSIZE];
        int                     maxfd;
+#endif /* USE_SELECT */
 #ifdef ISC_PLATFORM_USETHREADS
        isc_thread_t            watcher;
        isc_condition_t         shutdown_ok;
-       int                     pipe_fds[2];
 #else /* ISC_PLATFORM_USETHREADS */
        unsigned int            refs;
 #endif /* ISC_PLATFORM_USETHREADS */
@@ -261,6 +362,9 @@ static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
                              struct msghdr *, struct iovec *, size_t *);
 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
                              struct msghdr *, struct iovec *, size_t *);
+#ifdef ISC_PLATFORM_USETHREADS
+static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
+#endif
 
 #define SELECT_POKE_SHUTDOWN           (-1)
 #define SELECT_POKE_NOTHING            (-2)
@@ -329,9 +433,164 @@ socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
        }
 }
 
+/*%
+ * Start watching descriptor 'fd' for the I/O direction selected by 'msg',
+ * using whichever multiplexing back end was chosen at compile time
+ * (USE_KQUEUE, USE_EPOLL, USE_DEVPOLL or USE_SELECT).
+ *
+ * 'msg' is SELECT_POKE_READ to watch for readability.  The kqueue, epoll
+ * and /dev/poll branches treat any other value as a request to watch for
+ * writability; the select branch only acts on SELECT_POKE_READ and
+ * SELECT_POKE_WRITE.
+ *
+ * Returns ISC_R_SUCCESS, or the result of isc__errno2result(errno) when
+ * the underlying system call fails.  The select branch cannot fail.
+ */
+static inline isc_result_t
+watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
+       isc_result_t result = ISC_R_SUCCESS;
+
+#ifdef USE_KQUEUE
+       struct kevent evchange;
+
+       /* Register a single read or write filter for 'fd' with the kqueue. */
+       memset(&evchange, 0, sizeof(evchange));
+       if (msg == SELECT_POKE_READ)
+               evchange.filter = EVFILT_READ;
+       else
+               evchange.filter = EVFILT_WRITE;
+       evchange.flags = EV_ADD;
+       evchange.ident = fd;
+       if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
+               result = isc__errno2result(errno);
+
+       return (result);
+#elif defined(USE_EPOLL)
+       struct epoll_event event;
+
+       if (msg == SELECT_POKE_READ)
+               event.events = EPOLLIN;
+       else
+               event.events = EPOLLOUT;
+       event.data.fd = fd;
+       /*
+        * EEXIST is deliberately not treated as an error.
+        * NOTE(review): when 'fd' is already registered for the other
+        * direction, this presumably leaves the existing event mask
+        * unchanged rather than adding the new direction -- confirm that
+        * this is the intended behavior.
+        */
+       if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
+           errno != EEXIST) {
+               result = isc__errno2result(errno);
+       }
+
+       return (result);
+#elif defined(USE_DEVPOLL)
+       struct pollfd pfd;
+       int lockid = FDLOCK_ID(fd);
+
+       memset(&pfd, 0, sizeof(pfd));
+       if (msg == SELECT_POKE_READ)
+               pfd.events = POLLIN;
+       else
+               pfd.events = POLLOUT;
+       pfd.fd = fd;
+       pfd.revents = 0;
+       /*
+        * The per-FD want_read/want_write flags are updated under the fdlock
+        * bucket of 'fd', and only after the write to the /dev/poll
+        * descriptor has succeeded.  unwatch_fd() consults these flags.
+        */
+       LOCK(&manager->fdlock[lockid]);
+       if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
+               result = isc__errno2result(errno);
+       else {
+               if (msg == SELECT_POKE_READ)
+                       manager->fdpollinfo[fd].want_read = 1;
+               else
+                       manager->fdpollinfo[fd].want_write = 1;
+       }
+       UNLOCK(&manager->fdlock[lockid]);
+
+       return (result);
+#elif defined(USE_SELECT)
+       /* The fd_sets are protected by the manager lock. */
+       LOCK(&manager->lock);
+       if (msg == SELECT_POKE_READ)
+               FD_SET(fd, &manager->read_fds);
+       if (msg == SELECT_POKE_WRITE)
+               FD_SET(fd, &manager->write_fds);
+       UNLOCK(&manager->lock);
+
+       return (result);
+#endif
+}
+
+/*%
+ * Stop watching descriptor 'fd' for the I/O direction selected by 'msg';
+ * the counterpart of watch_fd().
+ *
+ * 'msg' is SELECT_POKE_READ to stop watching for readability.  The kqueue,
+ * epoll and /dev/poll branches treat any other value as "write"; the select
+ * branch only acts on SELECT_POKE_READ and SELECT_POKE_WRITE.
+ *
+ * Returns ISC_R_SUCCESS on success.  On failure the kqueue and /dev/poll
+ * branches return isc__errno2result(errno); the epoll branch logs the error
+ * and returns ISC_R_UNEXPECTED.  The select branch cannot fail.
+ */
+static inline isc_result_t
+unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
+       isc_result_t result = ISC_R_SUCCESS;
+
+#ifdef USE_KQUEUE
+       struct kevent evchange;
+
+       /* Delete the single read or write filter for 'fd' from the kqueue. */
+       memset(&evchange, 0, sizeof(evchange));
+       if (msg == SELECT_POKE_READ)
+               evchange.filter = EVFILT_READ;
+       else
+               evchange.filter = EVFILT_WRITE;
+       evchange.flags = EV_DELETE;
+       evchange.ident = fd;
+       if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
+               result = isc__errno2result(errno);
+
+       return (result);
+#elif defined(USE_EPOLL)
+       struct epoll_event event;
+
+       if (msg == SELECT_POKE_READ)
+               event.events = EPOLLIN;
+       else
+               event.events = EPOLLOUT;
+       event.data.fd = fd;
+       /*
+        * ENOENT (the descriptor is not registered) is tolerated; any other
+        * failure is reported.
+        * NOTE(review): EPOLL_CTL_DEL presumably removes 'fd' from the epoll
+        * set altogether, regardless of the direction requested in 'msg' --
+        * confirm against the epoll_ctl documentation.
+        */
+       if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
+           errno != ENOENT) {
+               char strbuf[ISC_STRERRORSIZE];
+               isc__strerror(errno, strbuf, sizeof(strbuf));
+               UNEXPECTED_ERROR(__FILE__, __LINE__,
+                                "epoll_ctl(DEL), %d: %s", fd, strbuf);
+               result = ISC_R_UNEXPECTED;
+       }
+       return (result);
+#elif defined(USE_DEVPOLL)
+       struct pollfd pfds[2];
+       size_t writelen = sizeof(pfds[0]);
+       int lockid = FDLOCK_ID(fd);
+
+       /* pfds[0] always carries the removal request for 'fd'. */
+       memset(pfds, 0, sizeof(pfds));
+       pfds[0].events = POLLREMOVE;
+       pfds[0].fd = fd;
+
+       /*
+        * Canceling read or write polling via /dev/poll is tricky.  Since it
+        * only provides a way of canceling per FD, we may need to re-poll the
+        * socket for the other operation.
+        */
+       LOCK(&manager->fdlock[lockid]);
+       if (msg == SELECT_POKE_READ &&
+           manager->fdpollinfo[fd].want_write == 1) {
+               pfds[1].events = POLLOUT;
+               pfds[1].fd = fd;
+               writelen += sizeof(pfds[1]);
+       }
+       if (msg == SELECT_POKE_WRITE &&
+           manager->fdpollinfo[fd].want_read == 1) {
+               pfds[1].events = POLLIN;
+               pfds[1].fd = fd;
+               writelen += sizeof(pfds[1]);
+       }
+
+       /*
+        * A single write submits the removal and, when needed, the re-poll
+        * entry.  The bookkeeping flag is cleared only on success.
+        */
+       if (write(manager->devpoll_fd, pfds, writelen) == -1) 
+               result = isc__errno2result(errno);
+       else {
+               if (msg == SELECT_POKE_READ)
+                       manager->fdpollinfo[fd].want_read = 0;
+               else
+                       manager->fdpollinfo[fd].want_write = 0;
+       }
+       UNLOCK(&manager->fdlock[lockid]);
+
+       return (result);
+#elif defined(USE_SELECT)
+       /* The fd_sets are protected by the manager lock. */
+       LOCK(&manager->lock);
+       if (msg == SELECT_POKE_READ)
+               FD_CLR(fd, &manager->read_fds);
+       else if (msg == SELECT_POKE_WRITE)
+               FD_CLR(fd, &manager->write_fds);
+       UNLOCK(&manager->lock);
+
+       return (result);
+#endif
+}
+
 static void
 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
-       isc_socket_t *sock;
+       isc_result_t result;
+       isc_boolean_t needclose;
+       int lockid = FDLOCK_ID(fd);
 
        /*
         * This is a wakeup on a socket.  If the socket is not in the
@@ -339,29 +598,50 @@ wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
         * or writes.
         */
 
-       INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
+       INSIST(fd >= 0 && fd < (int)manager->maxsocks);
 
+       LOCK(&manager->fdlock[lockid]);
        if (manager->fdstate[fd] == CLOSE_PENDING
            || manager->fdstate[fd] == MANAGER_CLOSE_PENDING) {
-               FD_CLR(fd, &manager->read_fds);
-               FD_CLR(fd, &manager->write_fds);
-               if (manager->fdstate[fd] == CLOSE_PENDING)
-                       (void)close(fd);
+               needclose = ISC_TF(manager->fdstate[fd] == CLOSE_PENDING);
                manager->fdstate[fd] = CLOSED;
+               UNLOCK(&manager->fdlock[lockid]);
+
+               /*
+                * We accept (and ignore) any error from unwatch_fd() as we are
+                * closing the socket, hoping it doesn't leave dangling state in
+                * the kernel.
+                * Note that unwatch_fd() must be called after releasing the
+                * fdlock; otherwise it could cause deadlock due to a lock order
+                * reversal.
+                */
+               (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+               (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+               if (needclose)
+                       (void)close(fd);
                return;
        }
-       if (manager->fdstate[fd] != MANAGED)
+       if (manager->fdstate[fd] != MANAGED) {
+               UNLOCK(&manager->fdlock[lockid]);
                return;
-
-       sock = manager->fds[fd];
+       }
+       UNLOCK(&manager->fdlock[lockid]);
 
        /*
         * Set requested bit.
         */
-       if (msg == SELECT_POKE_READ)
-               FD_SET(sock->fd, &manager->read_fds);
-       if (msg == SELECT_POKE_WRITE)
-               FD_SET(sock->fd, &manager->write_fds);
+       result = watch_fd(manager, fd, msg);
+       if (result != ISC_R_SUCCESS) {
+               /*
+                * XXXJT: what should we do?  Ignoring the failure of watching
+                * a socket will make the application dysfunctional, but there
+                * seems to be no reasonable recovery process.
+                */
+               isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+                             ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
+                             "failed to start watching FD (%d): %s",
+                             fd, isc_result_totext(result));
+       }
 }
 
 #ifdef ISC_PLATFORM_USETHREADS
@@ -656,7 +936,7 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
 
        memset(msg, 0, sizeof(*msg));
 
-       if (sock->type == isc_sockettype_udp) {
+       if (!sock->connected) {
                msg->msg_name = (void *)&dev->address.type.sa;
                msg->msg_namelen = dev->address.length;
        } else {
@@ -1221,8 +1501,56 @@ doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
  * Caller must ensure that the socket is not locked and no external
  * references exist.
  */
+/*%
+ * Release the manager's bookkeeping for descriptor 'fd' and request that it
+ * be closed.
+ *
+ * The fds[] slot is cleared and the fdstate[] slot is set to
+ * MANAGER_CLOSE_PENDING for isc_sockettype_fdwatch sockets and to
+ * CLOSE_PENDING for every other 'type'; SELECT_POKE_CLOSE is then sent via
+ * select_poke().  This function does not call close() itself;
+ * wakeup_socket() close()s the descriptor only in the CLOSE_PENDING case.
+ *
+ * With USE_SELECT, manager->maxfd is recomputed if 'fd' was the current
+ * maximum.
+ */
+static void
+closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) {
+       int lockid = FDLOCK_ID(fd);
+
+       /*
+        * No one has this socket open, so the watcher doesn't have to be
+        * poked, and the socket doesn't have to be locked.
+        */
+       LOCK(&manager->fdlock[lockid]);
+       manager->fds[fd] = NULL;
+       if (type == isc_sockettype_fdwatch)
+               manager->fdstate[fd] = MANAGER_CLOSE_PENDING;
+       else
+               manager->fdstate[fd] = CLOSE_PENDING;
+       UNLOCK(&manager->fdlock[lockid]);
+       /* The poke is issued only after the fdlock has been released. */
+       select_poke(manager, fd, SELECT_POKE_CLOSE);
+
+       /*
+        * update manager->maxfd here (XXX: this should be implemented more
+        * efficiently)
+        */
+#ifdef USE_SELECT
+       LOCK(&manager->lock);
+       if (manager->maxfd == fd) {
+               int i;
+
+               /*
+                * Scan downwards for the highest descriptor still in the
+                * MANAGED state; maxfd stays 0 if none is found.  Each
+                * fdlock is taken while the manager lock is held.
+                */
+               manager->maxfd = 0;
+               for (i = fd - 1; i >= 0; i--) {
+                       lockid = FDLOCK_ID(i);
+
+                       LOCK(&manager->fdlock[lockid]);
+                       if (manager->fdstate[i] == MANAGED) {
+                               manager->maxfd = i;
+                               UNLOCK(&manager->fdlock[lockid]);
+                               break;
+                       }
+                       UNLOCK(&manager->fdlock[lockid]);
+               }
+#ifdef ISC_PLATFORM_USETHREADS
+               /* Never let maxfd drop below the read end of the pipe. */
+               if (manager->maxfd < manager->pipe_fds[0])
+                       manager->maxfd = manager->pipe_fds[0];
+#endif
+       }
+       UNLOCK(&manager->lock);
+#endif /* USE_SELECT */
+}
+
 static void
 destroy(isc_socket_t **sockp) {
+       int fd;
        isc_socket_t *sock = *sockp;
        isc_socketmgr_t *manager = sock->manager;
 
@@ -1233,20 +1561,16 @@ destroy(isc_socket_t **sockp) {
        INSIST(ISC_LIST_EMPTY(sock->recv_list));
        INSIST(ISC_LIST_EMPTY(sock->send_list));
        INSIST(sock->connect_ev == NULL);
-       REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
+       REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
+
+       if (sock->fd >= 0) {
+               fd = sock->fd;
+               sock->fd = -1;
+               closesocket(manager, sock->type, fd);
+       }
 
        LOCK(&manager->lock);
 
-       /*
-        * No one has this socket open, so the watcher doesn't have to be
-        * poked, and the socket doesn't have to be locked.
-        */
-       manager->fds[sock->fd] = NULL;
-       if (sock->type == isc_sockettype_fdwatch)
-               manager->fdstate[sock->fd] = MANAGER_CLOSE_PENDING;
-       else
-               manager->fdstate[sock->fd] = CLOSE_PENDING;
-       select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
        ISC_LIST_UNLINK(manager->socklist, sock, link);
 
 #ifdef ISC_PLATFORM_USETHREADS
@@ -1254,10 +1578,6 @@ destroy(isc_socket_t **sockp) {
                SIGNAL(&manager->shutdown_ok);
 #endif /* ISC_PLATFORM_USETHREADS */
 
-       /*
-        * XXX should reset manager->maxfd here
-        */
-
        UNLOCK(&manager->lock);
 
        free_socket(sockp);
@@ -1445,18 +1765,11 @@ clear_bsdcompat(void) {
 }
 #endif
 
-/*%
- * Create a new 'type' socket managed by 'manager'.  Events
- * will be posted to 'task' and when dispatched 'action' will be
- * called with 'arg' as the arg value.  The new socket is returned
- * in 'socketp'.
- */
-isc_result_t
-isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
-                 isc_socket_t **socketp)
-{
-       isc_socket_t *sock = NULL;
-       isc_result_t result;
+static isc_result_t
+opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
+       char strbuf[ISC_STRERRORSIZE];
+       const char *err = "socket";
+       int tries = 0;
 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
        int on = 1;
 #endif
@@ -1464,31 +1777,20 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
        ISC_SOCKADDR_LEN_T optlen;
        int size;
 #endif
-       char strbuf[ISC_STRERRORSIZE];
-       const char *err = "socket";
-       int tries = 0;
 
-       REQUIRE(VALID_MANAGER(manager));
-       REQUIRE(socketp != NULL && *socketp == NULL);
-
-       result = allocate_socket(manager, type, &sock);
-       if (result != ISC_R_SUCCESS)
-               return (result);
-
-       sock->pf = pf;
  again:
-       switch (type) {
+       switch (sock->type) {
        case isc_sockettype_udp:
-               sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
+               sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
                break;
        case isc_sockettype_tcp:
-               sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
+               sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
                break;
        case isc_sockettype_unix:
-               sock->fd = socket(pf, SOCK_STREAM, 0);
+               sock->fd = socket(sock->pf, SOCK_STREAM, 0);
                break;
        case isc_sockettype_fdwatch:
-               INSIST(type != isc_sockettype_fdwatch);
+               INSIST(sock->type != isc_sockettype_fdwatch);
                break;
        }
        if (sock->fd == -1 && errno == EINTR && tries++ < 42)
@@ -1509,20 +1811,17 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
        }
 #endif
 
-       if (sock->fd >= (int)FD_SETSIZE) {
+       if (sock->fd >= (int)manager->maxsocks) {
                (void)close(sock->fd);
                isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
                               isc_msgcat, ISC_MSGSET_SOCKET,
                               ISC_MSG_TOOMANYFDS,
                               "%s: too many open file descriptors", "socket");
-               free_socket(&sock);
                return (ISC_R_NORESOURCES);
        }
 
        if (sock->fd < 0) {
-               free_socket(&sock);
-
                switch (errno) {
                case EMFILE:
                case ENFILE:
@@ -1554,14 +1853,13 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
 
        if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
                (void)close(sock->fd);
-               free_socket(&sock);
                return (ISC_R_UNEXPECTED);
        }
 
 #ifdef SO_BSDCOMPAT
        RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
                                  clear_bsdcompat) == ISC_R_SUCCESS);
-       if (type != isc_sockettype_unix && bsdcompat &&
+       if (sock->type != isc_sockettype_unix && bsdcompat &&
            setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
                       (void *)&on, sizeof(on)) < 0) {
                isc__strerror(errno, strbuf, sizeof(strbuf));
@@ -1590,7 +1888,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
 #endif
 
 #if defined(USE_CMSG) || defined(SO_RCVBUF)
-       if (type == isc_sockettype_udp) {
+       if (sock->type == isc_sockettype_udp) {
 
 #if defined(USE_CMSG)
 #if defined(SO_TIMESTAMP)
@@ -1611,7 +1909,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
 #endif /* SO_TIMESTAMP */
 
 #if defined(ISC_PLATFORM_HAVEIPV6)
-               if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
+               if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
                        /*
                         * Warn explicitly because this anomaly can be hidden
                         * in usual operation (and unexpectedly appear later).
@@ -1623,7 +1921,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
 #ifdef IPV6_RECVPKTINFO
                /* RFC 3542 */
-               if ((pf == AF_INET6)
+               if ((sock->pf == AF_INET6)
                    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                                   (void *)&on, sizeof(on)) < 0)) {
                        isc__strerror(errno, strbuf, sizeof(strbuf));
@@ -1638,7 +1936,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
                }
 #else
                /* RFC 2292 */
-               if ((pf == AF_INET6)
+               if ((sock->pf == AF_INET6)
                    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
                                   (void *)&on, sizeof(on)) < 0)) {
                        isc__strerror(errno, strbuf, sizeof(strbuf));
@@ -1655,7 +1953,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
                /* use minimum MTU */
-               if (pf == AF_INET6) {
+               if (sock->pf == AF_INET6) {
                        (void)setsockopt(sock->fd, IPPROTO_IPV6,
                                         IPV6_USE_MIN_MTU,
                                         (void *)&on, sizeof(on));
@@ -1687,25 +1985,64 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
        }
 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
 
+       return (ISC_R_SUCCESS);
+}
+
+/*%
+ * Create a new 'type' socket managed by 'manager'.  Events
+ * will be posted to 'task' and when dispatched 'action' will be
+ * called with 'arg' as the arg value.  The new socket is returned
+ * in 'socketp'.
+ */
+isc_result_t
+isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
+                 isc_socket_t **socketp)
+{
+       isc_socket_t *sock = NULL;
+       isc_result_t result;
+       int lockid;
+
+       REQUIRE(VALID_MANAGER(manager));
+       REQUIRE(socketp != NULL && *socketp == NULL);
+
+       result = allocate_socket(manager, type, &sock);
+       if (result != ISC_R_SUCCESS)
+               return (result);
+
+       sock->pf = pf;
+       result = opensocket(manager, sock);
+       if (result != ISC_R_SUCCESS) {
+               free_socket(&sock);
+               return (result);
+       }
+
        memset(sock->name, 0, sizeof(sock->name));
        sock->tag = NULL;
 
        sock->references = 1;
        *socketp = sock;
 
-       LOCK(&manager->lock);
-
        /*
         * Note we don't have to lock the socket like we normally would because
         * there are no external references to it yet.
         */
 
+       lockid = FDLOCK_ID(sock->fd);
+       LOCK(&manager->fdlock[lockid]);
        manager->fds[sock->fd] = sock;
        manager->fdstate[sock->fd] = MANAGED;
+#ifdef USE_DEVPOLL
+       INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
+              sock->manager->fdpollinfo[sock->fd].want_write == 0);
+#endif
+       UNLOCK(&manager->fdlock[lockid]);
+
+       LOCK(&manager->lock);
        ISC_LIST_APPEND(manager->socklist, sock, link);
+#ifdef USE_SELECT
        if (manager->maxfd < sock->fd)
                manager->maxfd = sock->fd;
-
+#endif
        UNLOCK(&manager->lock);
 
        socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
@@ -1714,6 +2051,48 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
        return (ISC_R_SUCCESS);
 }
 
+/*
+ * Open a fresh descriptor for an existing socket object whose descriptor
+ * is currently closed (sock->fd == -1), and register that descriptor in
+ * the manager's per-fd tables.  The caller must hold the only reference.
+ *
+ * Returns whatever opensocket() returns; on failure sock->fd is left
+ * at -1 and nothing is registered.
+ */
+isc_result_t
+isc_socket_open(isc_socket_t *sock) {
+       isc_socketmgr_t *mgr;
+       isc_result_t result;
+       int lockid;
+
+       REQUIRE(VALID_SOCKET(sock));
+
+       LOCK(&sock->lock);
+       REQUIRE(sock->references == 1);
+       UNLOCK(&sock->lock);
+       /*
+        * With a single reference nobody else can touch this socket, so
+        * the remainder runs without the socket lock.
+        */
+       REQUIRE(sock->fd == -1);
+
+       mgr = sock->manager;
+
+       result = opensocket(mgr, sock);
+       if (result != ISC_R_SUCCESS) {
+               /* Keep the "closed" marker intact for the caller. */
+               sock->fd = -1;
+               return (result);
+       }
+
+       /* Publish the new descriptor under its per-fd lock. */
+       lockid = FDLOCK_ID(sock->fd);
+       LOCK(&mgr->fdlock[lockid]);
+       mgr->fds[sock->fd] = sock;
+       mgr->fdstate[sock->fd] = MANAGED;
+#ifdef USE_DEVPOLL
+       INSIST(mgr->fdpollinfo[sock->fd].want_read == 0 &&
+              mgr->fdpollinfo[sock->fd].want_write == 0);
+#endif
+       UNLOCK(&mgr->fdlock[lockid]);
+
+#ifdef USE_SELECT
+       /* select() needs the highest descriptor number kept up to date. */
+       LOCK(&mgr->lock);
+       if (mgr->maxfd < sock->fd)
+               mgr->maxfd = sock->fd;
+       UNLOCK(&mgr->lock);
+#endif
+
+       return (result);
+}
+
 /*
  * Create a new 'type' socket managed by 'manager'.  Events
  * will be posted to 'task' and when dispatched 'action' will be
@@ -1727,6 +2106,7 @@ isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
 {
        isc_socket_t *sock = NULL;
        isc_result_t result;
+       int lockid;
 
        REQUIRE(VALID_MANAGER(manager));
        REQUIRE(socketp != NULL && *socketp == NULL);
@@ -1744,19 +2124,23 @@ isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
        sock->references = 1;
        *socketp = sock;
 
-       LOCK(&manager->lock);
-
        /*
         * Note we don't have to lock the socket like we normally would because
         * there are no external references to it yet.
         */
 
+       lockid = FDLOCK_ID(sock->fd);
+       LOCK(&manager->fdlock[lockid]);
        manager->fds[sock->fd] = sock;
        manager->fdstate[sock->fd] = MANAGED;
+       UNLOCK(&manager->fdlock[lockid]);
+
+       LOCK(&manager->lock);
        ISC_LIST_APPEND(manager->socklist, sock, link);
+#ifdef USE_SELECT
        if (manager->maxfd < sock->fd)
                manager->maxfd = sock->fd;
-
+#endif
        UNLOCK(&manager->lock);
 
        if (flags & ISC_SOCKFDWATCH_READ)
@@ -1811,6 +2195,42 @@ isc_socket_detach(isc_socket_t **socketp) {
        *socketp = NULL;
 }
 
+/*
+ * Release the descriptor held by 'sock' while keeping the socket object
+ * itself: the per-connection state flags and the peer address are reset,
+ * sock->fd becomes -1, and the old descriptor is handed to closesocket().
+ * The caller must hold the only reference and the socket must be idle
+ * (no pending or queued I/O, no connect in progress).
+ */
+void
+isc_socket_close(isc_socket_t *sock) {
+       int oldfd;
+
+       REQUIRE(VALID_SOCKET(sock));
+
+       LOCK(&sock->lock);
+       REQUIRE(sock->references == 1);
+       UNLOCK(&sock->lock);
+       /*
+        * A single reference means no other party can reach this socket,
+        * so the lock is not needed from here on.
+        */
+
+       REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
+
+       /* The socket must be completely quiescent. */
+       INSIST(!sock->connecting);
+       INSIST(!sock->pending_recv);
+       INSIST(!sock->pending_send);
+       INSIST(!sock->pending_accept);
+       INSIST(ISC_LIST_EMPTY(sock->recv_list));
+       INSIST(ISC_LIST_EMPTY(sock->send_list));
+       INSIST(ISC_LIST_EMPTY(sock->accept_list));
+       INSIST(sock->connect_ev == NULL);
+
+       /* Detach the descriptor from the object before closing it. */
+       oldfd = sock->fd;
+       sock->fd = -1;
+
+       /* Return the object to its unbound, unconnected state. */
+       sock->bound = 0;
+       sock->connecting = 0;
+       sock->connected = 0;
+       sock->listener = 0;
+       isc_sockaddr_any(&sock->peer_address);
+
+       closesocket(sock->manager, sock->type, oldfd);
+}
+
 /*
  * I/O is possible on a given socket.  Schedule an event to this task that
  * will call an internal function to do the I/O.  This will charge the
@@ -1825,7 +2245,15 @@ dispatch_recv(isc_socket_t *sock) {
        isc_socketevent_t *ev;
        isc_task_t *sender;
 
+#if 0
+       /*
+        * XXXJT: this assertion seems too strong, but leave it here for
+        * reference.
+        */
        INSIST(!sock->pending_recv);
+#endif
+       if (sock->pending_recv != 0)
+               return;
 
        if (sock->type != isc_sockettype_fdwatch) {
                ev = ISC_LIST_HEAD(sock->recv_list);
@@ -2132,7 +2560,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
                                         sock->pf);
                        (void)close(fd);
                        goto soft_error;
-               } else if (fd >= (int)FD_SETSIZE) {
+               } else if (fd >= (int)manager->maxsocks) {
                        isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
                                       isc_msgcat, ISC_MSGSET_SOCKET,
@@ -2172,6 +2600,13 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
         * -1 means the new socket didn't happen.
         */
        if (fd != -1) {
+               int lockid = FDLOCK_ID(fd);
+
+               LOCK(&manager->fdlock[lockid]);
+               manager->fds[fd] = dev->newsocket;
+               manager->fdstate[fd] = MANAGED;
+               UNLOCK(&manager->fdlock[lockid]);
+
                LOCK(&manager->lock);
                ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
 
@@ -2184,10 +2619,10 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
                 */
                dev->address = dev->newsocket->peer_address;
 
-               manager->fds[fd] = dev->newsocket;
-               manager->fdstate[fd] = MANAGED;
+#ifdef USE_SELECT
                if (manager->maxfd < fd)
                        manager->maxfd = fd;
+#endif
 
                socket_log(sock, &dev->newsocket->peer_address, CREATION,
                           isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
@@ -2416,80 +2851,238 @@ internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
        UNLOCK(&sock->lock);
 }
 
+/*
+ * Process read/writes on each fd here.  Avoid locking
+ * and unlocking twice if both reads and writes are possible.
+ */
 static void
-process_fds(isc_socketmgr_t *manager, int maxfd,
-           fd_set *readfds, fd_set *writefds)
+process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
+          isc_boolean_t writeable)
 {
-       int i;
        isc_socket_t *sock;
        isc_boolean_t unlock_sock;
-
-       REQUIRE(maxfd <= (int)FD_SETSIZE);
+       isc_boolean_t needclose;
+       int lockid = FDLOCK_ID(fd);
 
        /*
-        * Process read/writes on other fds here.  Avoid locking
-        * and unlocking twice if both reads and writes are possible.
+        * If we need to close the socket, do it now.
         */
-       for (i = 0; i < maxfd; i++) {
-#ifdef ISC_PLATFORM_USETHREADS
-               if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
-                       continue;
-#endif /* ISC_PLATFORM_USETHREADS */
+       LOCK(&manager->fdlock[lockid]);
+       if (manager->fdstate[fd] == CLOSE_PENDING
+           || manager->fdstate[fd] == MANAGER_CLOSE_PENDING) {
+               needclose = ISC_TF(manager->fdstate[fd] == CLOSE_PENDING);
+               manager->fdstate[fd] = CLOSED;
+               UNLOCK(&manager->fdlock[lockid]);
+
+               (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+               (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+               if (needclose)
+                       (void)close(fd);
+               return;
+       }
+
+       sock = manager->fds[fd];
+       UNLOCK(&manager->fdlock[lockid]);
+       unlock_sock = ISC_FALSE;
+       if (readable) {
+               if (sock == NULL) {
+                       (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+                       goto check_write;
+               }
+               unlock_sock = ISC_TRUE;
+               LOCK(&sock->lock);
+               if (!SOCK_DEAD(sock)) {
+                       if (sock->listener)
+                               dispatch_accept(sock);
+                       else
+                               dispatch_recv(sock);
+               }
+               (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+       }
+check_write:
+       if (writeable) {
+               if (sock == NULL) {
+                       (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+                       return;
+               }
+               if (!unlock_sock) {
+                       unlock_sock = ISC_TRUE;
+                       LOCK(&sock->lock);
+               }
+               if (!SOCK_DEAD(sock)) {
+                       if (sock->connecting)
+                               dispatch_connect(sock);
+                       else
+                               dispatch_send(sock);
+               }
+               (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+       }
+       if (unlock_sock)
+               UNLOCK(&sock->lock);
+}
+
+#ifdef USE_KQUEUE
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
+       int i;
+       isc_boolean_t readable, writable;
+       isc_boolean_t done = ISC_FALSE;
 
+       if (nevents == manager->nevents) {
                /*
-                * If we need to close the socket, do it now.
+                * This is not an error, but something unexpected.  If this
+                * happens, it may indicate the need for increasing
+                * ISC_SOCKET_MAXEVENTS.
                 */
-               if (manager->fdstate[i] == CLOSE_PENDING
-                   || manager->fdstate[i] == MANAGER_CLOSE_PENDING) {
-                       FD_CLR(i, &manager->read_fds);
-                       FD_CLR(i, &manager->write_fds);
-                       if (manager->fdstate[i] == CLOSE_PENDING)
-                               (void)close(i);
-                       manager->fdstate[i] = CLOSED;
+               manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+                           ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+                           "maximum number of FD events (%d) received",
+                           nevents);
+       }
+
+       for (i = 0; i < nevents; i++) {
+               REQUIRE(events[i].ident < manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+               if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
+                       done = process_ctlfd(manager);
                        continue;
                }
+#endif
+               readable = ISC_TF(events[i].filter == EVFILT_READ);
+               writable = ISC_TF(events[i].filter == EVFILT_WRITE);
+               process_fd(manager, events[i].ident, readable, writable);
+       }
 
-               sock = manager->fds[i];
-               unlock_sock = ISC_FALSE;
-               if (FD_ISSET(i, readfds)) {
-                       if (sock == NULL) {
-                               FD_CLR(i, &manager->read_fds);
-                               goto check_write;
-                       }
-                       unlock_sock = ISC_TRUE;
-                       LOCK(&sock->lock);
-                       if (!SOCK_DEAD(sock)) {
-                               if (sock->listener)
-                                       dispatch_accept(sock);
-                               else
-                                       dispatch_recv(sock);
-                       }
-                       FD_CLR(i, &manager->read_fds);
+       return (done);
+}
+#elif defined(USE_EPOLL)
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
+       int i;
+       isc_boolean_t done = ISC_FALSE;
+
+       if (nevents == manager->nevents) {
+               manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+                           ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+                           "maximum number of FD events (%d) received",
+                           nevents);
+       }
+
+       for (i = 0; i < nevents; i++) {
+               REQUIRE(events[i].data.fd < (int)manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+               if (events[i].data.fd == manager->pipe_fds[0]) {
+                       done = process_ctlfd(manager);
+                       continue;
                }
-       check_write:
-               if (FD_ISSET(i, writefds)) {
-                       if (sock == NULL) {
-                               FD_CLR(i, &manager->write_fds);
-                               continue;
-                       }
-                       if (!unlock_sock) {
-                               unlock_sock = ISC_TRUE;
-                               LOCK(&sock->lock);
-                       }
-                       if (!SOCK_DEAD(sock)) {
-                               if (sock->connecting)
-                                       dispatch_connect(sock);
-                               else
-                                       dispatch_send(sock);
-                       }
-                       FD_CLR(i, &manager->write_fds);
+#endif
+               if ((events[i].events & EPOLLERR) != 0 ||
+                   (events[i].events & EPOLLHUP) != 0) {
+                       /*
+                        * epoll does not set IN/OUT bits on an erroneous
+                        * condition, so we need to try both anyway.  This is a
+                        * bit inefficient, but should be okay for such rare
+                        * events.  Note also that the read or write attempt
+                        * won't block because we use non-blocking sockets.
+                        */
+                       events[i].events |= (EPOLLIN | EPOLLOUT);
                }
-               if (unlock_sock)
-                       UNLOCK(&sock->lock);
+               process_fd(manager, events[i].data.fd,
+                          (events[i].events & EPOLLIN) != 0,
+                          (events[i].events & EPOLLOUT) != 0);
+       }
+
+       return (done);
+}
+#elif defined(USE_DEVPOLL)
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
+       int i;
+       isc_boolean_t done = ISC_FALSE;
+
+       if (nevents == manager->nevents) {
+               manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+                           ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+                           "maximum number of FD events (%d) received",
+                           nevents);
+       }
+
+       for (i = 0; i < nevents; i++) {
+               REQUIRE(events[i].fd < (int)manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+               if (events[i].fd == manager->pipe_fds[0]) {
+                       done = process_ctlfd(manager);
+                       continue;
+               }
+#endif
+               process_fd(manager, events[i].fd,
+                          (events[i].events & POLLIN) != 0,
+                          (events[i].events & POLLOUT) != 0);
+       }
+
+       return (done);
+}
+#elif defined(USE_SELECT)
+static void
+process_fds(isc_socketmgr_t *manager, int maxfd,
+           fd_set *readfds, fd_set *writefds)
+{
+       int i;
+
+       REQUIRE(maxfd <= (int)manager->maxsocks);
+
+       for (i = 0; i < maxfd; i++) {
+#ifdef ISC_PLATFORM_USETHREADS
+               if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
+                       continue;
+#endif /* ISC_PLATFORM_USETHREADS */
+               process_fd(manager, i, FD_ISSET(i, readfds),
+                          FD_ISSET(i, writefds));
        }
 }
+#endif
 
 #ifdef ISC_PLATFORM_USETHREADS
+/*
+ * Drain the watcher's control channel: read messages with
+ * select_readmsg() until there is nothing left or a shutdown request
+ * is seen, forwarding every other message to wakeup_socket().
+ *
+ * Returns ISC_TRUE if a shutdown message was read, ISC_FALSE if the
+ * channel simply ran dry.
+ */
+static isc_boolean_t
+process_ctlfd(isc_socketmgr_t *manager) {
+       isc_boolean_t shutdown_seen = ISC_FALSE;
+       int fd, msg;
+
+       while (1) {
+               select_readmsg(manager, &fd, &msg);
+
+               manager_log(manager, IOEVENT,
+                           isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
+                                          ISC_MSG_WATCHERMSG,
+                                          "watcher got message %d "
+                                          "for socket %d"), msg, fd);
+
+               /* Channel is empty: we are done for this round. */
+               if (msg == SELECT_POKE_NOTHING)
+                       break;
+
+               /*
+                * Shutdown request: stop reading and tell the caller so
+                * that the watcher loop can terminate.
+                */
+               if (msg == SELECT_POKE_SHUTDOWN) {
+                       shutdown_seen = ISC_TRUE;
+                       break;
+               }
+
+               /*
+                * Anything else is a wakeup for a particular socket; let
+                * wakeup_socket() decide whether the fd must be watched.
+                */
+               wakeup_socket(manager, fd, msg);
+       }
+
+       return (shutdown_seen);
+}
+
 /*
  * This is the thread that will loop forever, always in a select or poll
  * call.
@@ -2503,97 +3096,77 @@ watcher(void *uap) {
        isc_boolean_t done;
        int ctlfd;
        int cc;
+#ifdef USE_KQUEUE
+       const char *fnname = "kevent()";
+#elif defined (USE_EPOLL)
+       const char *fnname = "epoll_wait()";
+#elif defined(USE_DEVPOLL)
+       const char *fnname = "ioctl(DP_POLL)";
+       struct dvpoll dvp;
+#elif defined (USE_SELECT)
+       const char *fnname = "select()";
        fd_set readfds;
        fd_set writefds;
-       int msg, fd;
        int maxfd;
+#endif
        char strbuf[ISC_STRERRORSIZE];
 
        /*
         * Get the control fd here.  This will never change.
         */
-       LOCK(&manager->lock);
        ctlfd = manager->pipe_fds[0];
-
        done = ISC_FALSE;
        while (!done) {
                do {
+#ifdef USE_KQUEUE
+                       cc = kevent(manager->kqueue_fd, NULL, 0,
+                                   manager->events, manager->nevents, NULL);
+#elif defined(USE_EPOLL)
+                       cc = epoll_wait(manager->epoll_fd, manager->events,
+                                       manager->nevents, -1);
+#elif defined(USE_DEVPOLL)
+                       dvp.dp_fds = manager->events;
+                       dvp.dp_nfds = manager->nevents;
+                       dvp.dp_timeout = -1;
+                       cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
+#elif defined(USE_SELECT)
+                       LOCK(&manager->lock);
                        readfds = manager->read_fds;
                        writefds = manager->write_fds;
                        maxfd = manager->maxfd + 1;
-
                        UNLOCK(&manager->lock);
 
                        cc = select(maxfd, &readfds, &writefds, NULL, NULL);
-                       if (cc < 0) {
-                               if (!SOFT_ERROR(errno)) {
-                                       isc__strerror(errno, strbuf,
-                                                     sizeof(strbuf));
-                                       FATAL_ERROR(__FILE__, __LINE__,
-                                                   "select() %s: %s",
-                                                   isc_msgcat_get(isc_msgcat,
-                                                           ISC_MSGSET_GENERAL,
-                                                           ISC_MSG_FAILED,
-                                                           "failed"),
-                                                   strbuf);
-                               }
-                       }
+#endif /* USE_KQUEUE */
 
-                       LOCK(&manager->lock);
+                       if (cc < 0 && !SOFT_ERROR(errno)) {
+                               isc__strerror(errno, strbuf, sizeof(strbuf));
+                               FATAL_ERROR(__FILE__, __LINE__,
+                                           "%s %s: %s", fnname,
+                                           isc_msgcat_get(isc_msgcat,
+                                                          ISC_MSGSET_GENERAL,
+                                                          ISC_MSG_FAILED,
+                                                          "failed"), strbuf);
+                       }
                } while (cc < 0);
 
-
+#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
+               done = process_fds(manager, manager->events, cc);
+#elif defined(USE_SELECT)
                /*
                 * Process reads on internal, control fd.
                 */
-               if (FD_ISSET(ctlfd, &readfds)) {
-                       for (;;) {
-                               select_readmsg(manager, &fd, &msg);
-
-                               manager_log(manager, IOEVENT,
-                                           isc_msgcat_get(isc_msgcat,
-                                                    ISC_MSGSET_SOCKET,
-                                                    ISC_MSG_WATCHERMSG,
-                                                    "watcher got message %d"
-                                                    " for socket %d"),
-                                                    msg, fd);
-
-                               /*
-                                * Nothing to read?
-                                */
-                               if (msg == SELECT_POKE_NOTHING)
-                                       break;
-
-                               /*
-                                * Handle shutdown message.  We really should
-                                * jump out of this loop right away, but
-                                * it doesn't matter if we have to do a little
-                                * more work first.
-                                */
-                               if (msg == SELECT_POKE_SHUTDOWN) {
-                                       done = ISC_TRUE;
-
-                                       break;
-                               }
-
-                               /*
-                                * This is a wakeup on a socket.  Look
-                                * at the event queue for both read and write,
-                                * and decide if we need to watch on it now
-                                * or not.
-                                */
-                               wakeup_socket(manager, fd, msg);
-                       }
-               }
+               if (FD_ISSET(ctlfd, &readfds))
+                       done = process_ctlfd(manager);
 
                process_fds(manager, maxfd, &readfds, &writefds);
+#endif
        }
 
        manager_log(manager, TRACE,
                    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                   ISC_MSG_EXITING, "watcher exiting"));
 
-       UNLOCK(&manager->lock);
        return ((isc_threadresult_t)0);
 }
 #endif /* ISC_PLATFORM_USETHREADS */
@@ -2601,14 +3174,157 @@ watcher(void *uap) {
 /*
  * Create a new socket manager.
  */
+
+/*%
+ * Initialize the event-notification backend chosen at build time
+ * (kqueue, epoll, /dev/poll or select) for 'manager'.
+ *
+ * For kqueue, epoll and /dev/poll this allocates the event array
+ * (ISC_SOCKET_MAXEVENTS entries) from 'mctx', opens the backend
+ * descriptor and, in threaded builds, starts watching the read end of
+ * the control pipe.  For select it only clears the fd sets and seeds
+ * maxfd.
+ *
+ * Returns ISC_R_SUCCESS, ISC_R_NOMEMORY if an allocation fails, the
+ * translated errno if the backend descriptor cannot be opened, or the
+ * watch_fd() result if the control pipe cannot be registered.  On any
+ * failure everything acquired here is released again; every
+ * isc_mem_put() must use exactly the size given to the matching
+ * isc_mem_get().
+ */
+static isc_result_t
+setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
+       isc_result_t result;
+
+#ifdef USE_KQUEUE
+       manager->nevents = ISC_SOCKET_MAXEVENTS;
+       manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
+                                     manager->nevents);
+       if (manager->events == NULL)
+               return (ISC_R_NOMEMORY);
+       manager->kqueue_fd = kqueue();
+       if (manager->kqueue_fd == -1) {
+               result = isc__errno2result(errno);
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct kevent) * manager->nevents);
+               return (result);
+       }
+
+#ifdef ISC_PLATFORM_USETHREADS
+       result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+       if (result != ISC_R_SUCCESS) {
+               close(manager->kqueue_fd);
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct kevent) * manager->nevents);
+               return (result);
+       }
+#endif /* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_EPOLL)
+       manager->nevents = ISC_SOCKET_MAXEVENTS;
+       manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
+                                     manager->nevents);
+       if (manager->events == NULL)
+               return (ISC_R_NOMEMORY);
+       manager->epoll_fd = epoll_create(manager->nevents);
+       if (manager->epoll_fd == -1) {
+               result = isc__errno2result(errno);
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct epoll_event) * manager->nevents);
+               return (result);
+       }
+#ifdef ISC_PLATFORM_USETHREADS
+       result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+       if (result != ISC_R_SUCCESS) {
+               close(manager->epoll_fd);
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct epoll_event) * manager->nevents);
+               return (result);
+       }
+#endif /* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_DEVPOLL)
+       /*
+        * XXXJT: /dev/poll seems to reject large numbers of events,
+        * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
+        */
+       manager->nevents = ISC_SOCKET_MAXEVENTS;
+       manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
+                                     manager->nevents);
+       if (manager->events == NULL)
+               return (ISC_R_NOMEMORY);
+       /*
+        * Note: fdpollinfo should be able to support all possible FDs, so
+        * it must have maxsocks entries (not nevents).
+        */
+       manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
+                                         manager->maxsocks);
+       if (manager->fdpollinfo == NULL) {
+               /*
+                * Release the event array with the size it was allocated
+                * with (pollfd * nevents), not the fdpollinfo size.
+                */
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct pollfd) * manager->nevents);
+               return (ISC_R_NOMEMORY);
+       }
+       memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
+       manager->devpoll_fd = open("/dev/poll", O_RDWR);
+       if (manager->devpoll_fd == -1) {
+               result = isc__errno2result(errno);
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct pollfd) * manager->nevents);
+               isc_mem_put(mctx, manager->fdpollinfo,
+                           sizeof(pollinfo_t) * manager->maxsocks);
+               return (result);
+       }
+#ifdef ISC_PLATFORM_USETHREADS
+       result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+       if (result != ISC_R_SUCCESS) {
+               close(manager->devpoll_fd);
+               isc_mem_put(mctx, manager->events,
+                           sizeof(struct pollfd) * manager->nevents);
+               isc_mem_put(mctx, manager->fdpollinfo,
+                           sizeof(pollinfo_t) * manager->maxsocks);
+               return (result);
+       }
+#endif /* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_SELECT)
+       UNUSED(mctx);
+       UNUSED(result);
+
+       FD_ZERO(&manager->read_fds);
+       FD_ZERO(&manager->write_fds);
+#ifdef ISC_PLATFORM_USETHREADS
+       (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+       manager->maxfd = manager->pipe_fds[0];
+#else /* ISC_PLATFORM_USETHREADS */
+       manager->maxfd = 0;
+#endif /* ISC_PLATFORM_USETHREADS */
+#endif /* USE_KQUEUE */
+
+       return (ISC_R_SUCCESS);
+}
+
+/*%
+ * Undo setup_watcher(): in threaded builds stop watching the read end
+ * of the control pipe, then close the backend descriptor and return the
+ * event array (and, for /dev/poll, the fdpollinfo table) to 'mctx'.
+ * The select backend owns no such resources.
+ *
+ * A failure to unwatch the control pipe is reported but does not stop
+ * the rest of the cleanup.
+ */
+static void
+cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
+#ifdef ISC_PLATFORM_USETHREADS
+       isc_result_t result;
+
+       result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+       if (result != ISC_R_SUCCESS) {
+               /*
+                * This code is shared by all backends, so name the
+                * helper that failed rather than one backend's syscall.
+                */
+               UNEXPECTED_ERROR(__FILE__, __LINE__,
+                                "unwatch_fd() %s",
+                                isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
+                                               ISC_MSG_FAILED, "failed"));
+       }
+#endif /* ISC_PLATFORM_USETHREADS */
+
+#ifdef USE_KQUEUE
+       close(manager->kqueue_fd);
+       isc_mem_put(mctx, manager->events,
+                   sizeof(struct kevent) * manager->nevents);
+#elif defined(USE_EPOLL)
+       close(manager->epoll_fd);
+       isc_mem_put(mctx, manager->events,
+                   sizeof(struct epoll_event) * manager->nevents);
+#elif defined(USE_DEVPOLL)
+       close(manager->devpoll_fd);
+       isc_mem_put(mctx, manager->events,
+                   sizeof(struct pollfd) * manager->nevents);
+       isc_mem_put(mctx, manager->fdpollinfo,
+                   sizeof(pollinfo_t) * manager->maxsocks);
+#elif defined(USE_SELECT)
+       UNUSED(mctx);
+       UNUSED(manager);
+#endif /* USE_KQUEUE */
+}
+
 isc_result_t
 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
+       int i;
        isc_socketmgr_t *manager;
 #ifdef ISC_PLATFORM_USETHREADS
        char strbuf[ISC_STRERRORSIZE];
 #endif
        isc_result_t result;
-
        REQUIRE(managerp != NULL && *managerp == NULL);
 
 #ifndef ISC_PLATFORM_USETHREADS
@@ -2623,24 +3339,59 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
        if (manager == NULL)
                return (ISC_R_NOMEMORY);
 
+       /* zero-clear so that necessary cleanup on failure will be easy */
+       memset(manager, 0, sizeof(*manager));
+
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+       manager->maxsocks = ISC_SOCKET_MAXSOCKETS;
+#elif defined (USE_SELECT)
+       manager->maxsocks = FD_SETSIZE;
+#endif
+       /* Two maxsocks-sized tables: socket pointers (fds) and int states. */
+       manager->fds = isc_mem_get(mctx,
+                                  manager->maxsocks * sizeof(isc_socket_t *));
+       if (manager->fds == NULL) {
+               result = ISC_R_NOMEMORY;
+               goto free_manager;
+       }
+       manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
+       if (manager->fdstate == NULL) {
+               result = ISC_R_NOMEMORY;
+               goto free_manager;
+       }
+
        manager->magic = SOCKET_MANAGER_MAGIC;
        manager->mctx = NULL;
-       memset(manager->fds, 0, sizeof(manager->fds));
+       memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
        ISC_LIST_INIT(manager->socklist);
        result = isc_mutex_init(&manager->lock);
-       if (result != ISC_R_SUCCESS) {
-               isc_mem_put(mctx, manager, sizeof(*manager));
-               return (result);
+       if (result != ISC_R_SUCCESS)
+               goto free_manager;
+       manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
+       if (manager->fdlock == NULL) {
+               result = ISC_R_NOMEMORY;
+               goto cleanup_lock;
        }
+       for (i = 0; i < FDLOCK_COUNT; i++) {
+               result = isc_mutex_init(&manager->fdlock[i]);
+               if (result != ISC_R_SUCCESS) {
+                       while (--i >= 0)
+                               DESTROYLOCK(&manager->fdlock[i]);
+                       isc_mem_put(mctx, manager->fdlock,
+                                   FDLOCK_COUNT * sizeof(isc_mutex_t));
+                       manager->fdlock = NULL;
+                       goto cleanup_lock;
+               }
+       }
+
 #ifdef ISC_PLATFORM_USETHREADS
        if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
-               DESTROYLOCK(&manager->lock);
-               isc_mem_put(mctx, manager, sizeof(*manager));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "isc_condition_init() %s",
                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED, "failed"));
-               return (ISC_R_UNEXPECTED);
+               result = ISC_R_UNEXPECTED;
+               goto cleanup_lock;
        }
 
        /*
@@ -2648,16 +3399,14 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
         * select/poll loop when something internal needs to be done.
         */
        if (pipe(manager->pipe_fds) != 0) {
-               DESTROYLOCK(&manager->lock);
-               isc_mem_put(mctx, manager, sizeof(*manager));
                isc__strerror(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "pipe() %s: %s",
                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED, "failed"),
                                 strbuf);
-
-               return (ISC_R_UNEXPECTED);
+               result = ISC_R_UNEXPECTED;
+               goto cleanup_condition; /* no pipe fds to close yet */
        }
 
        RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
@@ -2671,15 +3420,10 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
        /*
         * Set up initial state for the select loop
         */
-       FD_ZERO(&manager->read_fds);
-       FD_ZERO(&manager->write_fds);
-#ifdef ISC_PLATFORM_USETHREADS
-       FD_SET(manager->pipe_fds[0], &manager->read_fds);
-       manager->maxfd = manager->pipe_fds[0];
-#else /* ISC_PLATFORM_USETHREADS */
-       manager->maxfd = 0;
-#endif /* ISC_PLATFORM_USETHREADS */
-       memset(manager->fdstate, 0, sizeof(manager->fdstate));
+       result = setup_watcher(mctx, manager); /* per-backend init */
+       if (result != ISC_R_SUCCESS)
+               goto cleanup;
+       memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
 
 #ifdef ISC_PLATFORM_USETHREADS
        /*
@@ -2687,15 +3431,13 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
         */
        if (isc_thread_create(watcher, manager, &manager->watcher) !=
            ISC_R_SUCCESS) {
-               (void)close(manager->pipe_fds[0]);
-               (void)close(manager->pipe_fds[1]);
-               DESTROYLOCK(&manager->lock);
-               isc_mem_put(mctx, manager, sizeof(*manager));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "isc_thread_create() %s",
                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED, "failed"));
-               return (ISC_R_UNEXPECTED);
+               cleanup_watcher(mctx, manager); /* undo setup_watcher() */
+               result = ISC_R_UNEXPECTED;
+               goto cleanup;
        }
 #endif /* ISC_PLATFORM_USETHREADS */
        isc_mem_attach(mctx, &manager->mctx);
@@ -2706,6 +3448,42 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
        *managerp = manager;
 
        return (ISC_R_SUCCESS);
+       /* Failure paths: each label below falls through to the next one. */
+cleanup:
+#ifdef ISC_PLATFORM_USETHREADS
+       (void)close(manager->pipe_fds[0]);
+       (void)close(manager->pipe_fds[1]);
+#endif /* ISC_PLATFORM_USETHREADS */
+
+#ifdef ISC_PLATFORM_USETHREADS
+cleanup_condition:
+       (void)isc_condition_destroy(&manager->shutdown_ok);
+#endif /* ISC_PLATFORM_USETHREADS */
+
+       /* fdlock is NULL here if its allocation or initialization failed. */
+cleanup_lock:
+       if (manager->fdlock != NULL) {
+               for (i = 0; i < FDLOCK_COUNT; i++)
+                       DESTROYLOCK(&manager->fdlock[i]);
+       }
+       DESTROYLOCK(&manager->lock);
+
+free_manager:
+       if (manager->fdlock != NULL) {
+               isc_mem_put(mctx, manager->fdlock,
+                           FDLOCK_COUNT * sizeof(isc_mutex_t));
+       }
+       if (manager->fdstate != NULL) {
+               isc_mem_put(mctx, manager->fdstate,
+                           manager->maxsocks * sizeof(int));
+       }
+       if (manager->fds != NULL) {
+               isc_mem_put(mctx, manager->fds,
+                           manager->maxsocks * sizeof(isc_socket_t *));
+       }
+       isc_mem_put(mctx, manager, sizeof(*manager));
+
+       return (result);
 }
 
 void
@@ -2779,16 +3557,29 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
        /*
         * Clean up.
         */
+       cleanup_watcher(manager->mctx, manager);
+       /* The pipe and shutdown condition exist only in threaded builds. */
 #ifdef ISC_PLATFORM_USETHREADS
        (void)close(manager->pipe_fds[0]);
        (void)close(manager->pipe_fds[1]);
        (void)isc_condition_destroy(&manager->shutdown_ok);
 #endif /* ISC_PLATFORM_USETHREADS */
 
-       for (i = 0; i < (int)FD_SETSIZE; i++)
-               if (manager->fdstate[i] == CLOSE_PENDING)
+       for (i = 0; i < (int)manager->maxsocks; i++)
+               if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
                        (void)close(i);
 
+       isc_mem_put(manager->mctx, manager->fds,
+                   manager->maxsocks * sizeof(isc_socket_t *));
+       isc_mem_put(manager->mctx, manager->fdstate,
+                   manager->maxsocks * sizeof(int));
+
+       if (manager->fdlock != NULL) {
+               for (i = 0; i < FDLOCK_COUNT; i++)
+                       DESTROYLOCK(&manager->fdlock[i]);
+               isc_mem_put(manager->mctx, manager->fdlock,
+                           FDLOCK_COUNT * sizeof(isc_mutex_t));
+       }
        DESTROYLOCK(&manager->lock);
        manager->magic = 0;
        mctx= manager->mctx;
@@ -4005,26 +4796,84 @@ isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
 }
 
 #ifndef ISC_PLATFORM_USETHREADS
-void
-isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
+/* In our assumed scenario, we can simply use a single static object. */
+static isc_socketwait_t swait_private;
+/* Wait for socket events; tvp bounds the wait (NULL means no timeout). */
+int
+isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
+       int n;
+#ifdef USE_KQUEUE
+       struct timespec ts, *tsp;
+#endif
+#ifdef USE_EPOLL
+       int timeout;
+#endif
+#ifdef USE_DEVPOLL
+       struct dvpoll dvp;
+#endif
+       /* Returns the wait call's result; *swaitp is left NULL if socketmgr is. */
+       REQUIRE(swaitp != NULL && *swaitp == NULL);
+
        if (socketmgr == NULL)
-               *maxfd = 0;
-       else {
-               *readset = socketmgr->read_fds;
-               *writeset = socketmgr->write_fds;
-               *maxfd = socketmgr->maxfd + 1;
-       }
+               return (0);
+       /* Convert tvp to the backend's timeout form and block in its wait call. */
+#ifdef USE_KQUEUE
+       if (tvp != NULL) {
+               ts.tv_sec = tvp->tv_sec;
+               ts.tv_nsec = tvp->tv_usec * 1000;
+               tsp = &ts;
+       } else
+               tsp = NULL;
+       swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
+                                      socketmgr->events, socketmgr->nevents,
+                                      tsp);
+       n = swait_private.nevents;
+#elif defined(USE_EPOLL)
+       if (tvp != NULL)
+               timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
+       else
+               timeout = -1;
+       swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
+                                          socketmgr->events,
+                                          socketmgr->nevents, timeout);
+       n = swait_private.nevents;
+#elif defined(USE_DEVPOLL)
+       dvp.dp_fds = socketmgr->events;
+       dvp.dp_nfds = socketmgr->nevents;
+       if (tvp != NULL) {
+               dvp.dp_timeout = tvp->tv_sec * 1000 +
+                       (tvp->tv_usec + 999) / 1000;
+       } else
+               dvp.dp_timeout = -1;
+       swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
+       n = swait_private.nevents;
+#elif defined(USE_SELECT)
+       swait_private.readset = socketmgr->read_fds;
+       swait_private.writeset = socketmgr->write_fds;
+       swait_private.maxfd = socketmgr->maxfd + 1;
+
+       n = select(swait_private.maxfd, &swait_private.readset,
+                  &swait_private.writeset, NULL, tvp);
+#endif
+       /* Callers pass this same object on to isc__socketmgr_dispatch(). */
+       *swaitp = &swait_private;
+       return (n);
 }
 
 isc_result_t
-isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
-       isc_socketmgr_t *manager = socketmgr;
+isc__socketmgr_dispatch(isc_socketwait_t *swait) {
+       REQUIRE(swait == &swait_private); /* object set by waitevents */
 
-       if (manager == NULL)
+       if (socketmgr == NULL)
                return (ISC_R_NOTFOUND);
 
-       process_fds(manager, maxfd, readset, writeset);
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+       (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
        return (ISC_R_SUCCESS);
+#elif defined(USE_SELECT)
+       process_fds(socketmgr, swait->maxfd, &swait->readset, &swait->writeset);
+       return (ISC_R_SUCCESS);
+#endif
 }
 #endif /* ISC_PLATFORM_USETHREADS */
 
index 76df84e87ca5227bd8e918cc96bc6413a0b567b5..c7753493e159923d64c0eb631db69a08c1a363e9 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: socket_p.h,v 1.11 2007/06/19 23:47:18 tbox Exp $ */
+/* $Id: socket_p.h,v 1.11.128.1 2008/06/24 00:09:12 jinmei Exp $ */
 
 #ifndef ISC_SOCKET_P_H
 #define ISC_SOCKET_P_H
 #include <sys/select.h>
 #endif
 
-void
-isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd);
-
-isc_result_t
-isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd);
-
+typedef struct isc_socketwait isc_socketwait_t;
+int isc__socketmgr_waitevents(struct timeval *, isc_socketwait_t **);
+isc_result_t isc__socketmgr_dispatch(isc_socketwait_t *);
 #endif /* ISC_SOCKET_P_H */
index 21ab77ad7c913f4b8c111365bfa9f7543ba83a85..c679caf258bfd3b95b5157200ea83d7765c5898a 100644 (file)
@@ -15,7 +15,7 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
-/* $Id: namedconf.c,v 1.78.46.7 2008/05/27 22:36:11 each Exp $ */
+/* $Id: namedconf.c,v 1.78.46.8 2008/06/24 00:09:12 jinmei Exp $ */
 
 /*! \file */
 
@@ -546,30 +546,80 @@ static cfg_type_t cfg_type_serverid = {
 /*%
  * Port list.
  */
+static cfg_tuplefielddef_t porttuple_fields[] = {
+       { "loport", &cfg_type_uint32, 0 }, /* low end of a port range */
+       { "hiport", &cfg_type_uint32, 0 }, /* high end of a port range */
+       { NULL, NULL, 0 }
+};
+static cfg_type_t cfg_type_porttuple = {
+       "porttuple", cfg_parse_tuple, cfg_print_tuple, cfg_doc_tuple,
+       &cfg_rep_tuple, porttuple_fields
+};
+/* Parse one port number; values above 0xffff fail with ISC_R_RANGE. */
 static isc_result_t
-parse_port(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) {
+parse_port(cfg_parser_t *pctx, cfg_obj_t **ret) {
        isc_result_t result;
 
-       UNUSED(type);
-
        CHECK(cfg_parse_uint32(pctx, NULL, ret));
        if ((*ret)->value.uint32 > 0xffff) {
                cfg_parser_error(pctx, CFG_LOG_NEAR, "invalid port");
                cfg_obj_destroy(pctx, ret);
                result = ISC_R_RANGE;
        }
+       /* NOTE(review): CHECK presumably jumps to cleanup on error - confirm. */
+ cleanup:
+       return (result);
+}
+/* Parse a single port, or "range <low> <high>" into a porttuple. */
+static isc_result_t
+parse_portrange(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) {
+       isc_result_t result;
+       cfg_obj_t *obj = NULL;
+
+       UNUSED(type);
+       /* A number is one port; anything else must be "range <low> <high>". */
+       CHECK(cfg_peektoken(pctx, ISC_LEXOPT_NUMBER | ISC_LEXOPT_CNUMBER));
+       if (pctx->token.type == isc_tokentype_number)
+               CHECK(parse_port(pctx, ret));
+       else {
+               CHECK(cfg_gettoken(pctx, 0));
+               if (pctx->token.type != isc_tokentype_string ||
+                   strcasecmp(TOKEN_STRING(pctx), "range") != 0) {
+                       cfg_parser_error(pctx, CFG_LOG_NEAR,
+                                        "expected integer or 'range'");
+                       return (ISC_R_UNEXPECTEDTOKEN);
+               }
+               CHECK(cfg_create_tuple(pctx, &cfg_type_porttuple, &obj));
+               CHECK(parse_port(pctx, &obj->value.tuple[0]));
+               CHECK(parse_port(pctx, &obj->value.tuple[1]));
+               if (obj->value.tuple[0]->value.uint32 >
+                   obj->value.tuple[1]->value.uint32) {
+                       cfg_parser_error(pctx, CFG_LOG_NOPREP,
+                                        "low port '%u' must not be larger "
+                                        "than high port",
+                                        obj->value.tuple[0]->value.uint32);
+                       result = ISC_R_RANGE;
+                       goto cleanup;
+               }
+               *ret = obj;
+               obj = NULL;
+       }
+
  cleanup:
+       if (obj != NULL)
+               cfg_obj_destroy(pctx, &obj);
        return (result);
 }
 
-static cfg_type_t cfg_type_port = {
-       "port", parse_port, NULL, cfg_doc_terminal,
+static cfg_type_t cfg_type_portrange = {
+       "portrange", parse_portrange, NULL, cfg_doc_terminal,
        NULL, NULL
 };
 
 static cfg_type_t cfg_type_bracketed_portlist = {
-       "bracketed_sockaddrlist", cfg_parse_bracketed_list, cfg_print_bracketed_list, cfg_doc_bracketed_list,
-       &cfg_rep_list, &cfg_type_port
+       "bracketed_sockaddrlist", cfg_parse_bracketed_list,
+       cfg_print_bracketed_list, cfg_doc_bracketed_list,
+       &cfg_rep_list, &cfg_type_portrange
 };
 
 /*%
@@ -610,6 +660,8 @@ namedconf_or_view_clauses[] = {
  */
 static cfg_clausedef_t
 options_clauses[] = {
+       { "use-v4-udp-ports", &cfg_type_bracketed_portlist, 0 },
+       { "use-v6-udp-ports", &cfg_type_bracketed_portlist, 0 },
        { "avoid-v4-udp-ports", &cfg_type_bracketed_portlist, 0 },
        { "avoid-v6-udp-ports", &cfg_type_bracketed_portlist, 0 },
        { "blackhole", &cfg_type_bracketed_aml, 0 },