+2384. [security] Additional support for query port randomization (change
+ #2375) including performance improvement and port range
+ specification. [RT #17949, #18098]
+
2383. [bug] named could double queries when they resulted in
SERVFAIL due to overkilling EDNS0 failure detection.
[RT #18182]
- PERFORMANCE OF THIS SOFTWARE.
-->
-<!-- $Id: bind9.xsl,v 1.13.130.4 2008/04/09 22:49:37 jinmei Exp $ -->
+<!-- $Id: bind9.xsl,v 1.13.130.5 2008/06/24 00:09:10 jinmei Exp $ -->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
</head>
<body>
<div class="header">Bind 9 Configuration and Statistics</div>
-
<br/>
<table>
" </head>\n"
" <body>\n"
" <div class=\"header\">Bind 9 Configuration and Statistics</div>\n"
- "\n"
" <br/>\n"
"\n"
" <table>\n"
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: server.c,v 1.495.10.14 2008/06/23 23:30:59 jinmei Exp $ */
+/* $Id: server.c,v 1.495.10.15 2008/06/24 00:09:10 jinmei Exp $ */
/*! \file */
#include <isc/httpd.h>
#include <isc/lex.h>
#include <isc/parseint.h>
+#include <isc/portset.h>
#include <isc/print.h>
#include <isc/resource.h>
#include <isc/stdio.h>
*/
static isc_result_t
get_view_querysource_dispatch(const cfg_obj_t **maps,
- int af, dns_dispatch_t **dispatchp)
+ int af, dns_dispatch_t **dispatchp,
+ isc_boolean_t is_firstview)
{
isc_result_t result;
dns_dispatch_t *disp;
isc_sockaddr_t sa;
unsigned int attrs, attrmask;
const cfg_obj_t *obj = NULL;
+ unsigned int maxdispatchbuffers;
/*
* Make compiler happy.
attrs |= DNS_DISPATCHATTR_IPV6;
break;
}
-
- if (isc_sockaddr_getport(&sa) != 0) {
+ if (isc_sockaddr_getport(&sa) == 0) {
+ attrs |= DNS_DISPATCHATTR_EXCLUSIVE;
+ maxdispatchbuffers = 4096;
+ } else {
INSIST(obj != NULL);
- cfg_obj_log(obj, ns_g_lctx, ISC_LOG_INFO,
- "using specific query-source port suppresses port "
- "randomization and can be insecure.");
+ if (is_firstview) {
+ cfg_obj_log(obj, ns_g_lctx, ISC_LOG_INFO,
+ "using specific query-source port "
+ "suppresses port randomization and can be "
+ "insecure.");
+ }
+ maxdispatchbuffers = 1000;
}
attrmask = 0;
disp = NULL;
result = dns_dispatch_getudp(ns_g_dispatchmgr, ns_g_socketmgr,
ns_g_taskmgr, &sa, 4096,
- 1024, 32768, 16411, 16433,
+ maxdispatchbuffers, 32768, 16411, 16433,
attrs, attrmask, &disp);
if (result != ISC_R_SUCCESS) {
isc_sockaddr_t any;
*
* XXXRTH Hardwired number of tasks.
*/
- CHECK(get_view_querysource_dispatch(maps, AF_INET, &dispatch4));
- CHECK(get_view_querysource_dispatch(maps, AF_INET6, &dispatch6));
+ CHECK(get_view_querysource_dispatch(maps, AF_INET, &dispatch4,
+ ISC_TF(ISC_LIST_PREV(view, link)
+ == NULL)));
+ CHECK(get_view_querysource_dispatch(maps, AF_INET6, &dispatch6,
+ ISC_TF(ISC_LIST_PREV(view, link)
+ == NULL)));
if (dispatch4 == NULL && dispatch6 == NULL) {
UNEXPECTED_ERROR(__FILE__, __LINE__,
"unable to obtain neither an IPv4 nor"
result = ISC_R_UNEXPECTED;
goto cleanup;
}
-
- obj = NULL;
- (void)ns_config_get(maps, "use-queryport-pool", &obj);
- if (obj == NULL || cfg_obj_asboolean(obj)) {
- isc_sockaddr_t sa;
- isc_boolean_t logit4 = ISC_FALSE, logit6 = ISC_FALSE;
-
- resopts |= (DNS_RESOLVER_USEDISPATCHPOOL4 |
- DNS_RESOLVER_USEDISPATCHPOOL6);
-
- /* Check consistency with query-source(-v6) */
- if (dispatch4 == NULL)
- resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL4;
- else {
- result = dns_dispatch_getlocaladdress(dispatch4, &sa);
- INSIST(result == ISC_R_SUCCESS);
- if (isc_sockaddr_getport(&sa) != 0) {
- logit4 = ISC_TRUE;
- resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL4;
- }
- }
-
- if (dispatch6 == NULL)
- resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL6;
- else {
- result = dns_dispatch_getlocaladdress(dispatch6, &sa);
- INSIST(result == ISC_R_SUCCESS);
- if (isc_sockaddr_getport(&sa) != 0) {
- logit6 = ISC_TRUE;
- resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL6;
- }
- }
- if (logit4 && obj != NULL)
- cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR,
- "specific query-source port "
- "cannot coexist with queryport-pool. "
- "(Pool disabled)");
- if (logit6 && obj != NULL)
- cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR,
- "specific query-source-v6 port "
- "cannot coexist with queryport-pool. "
- "(Pool disabled)");
- }
-
CHECK(dns_view_createresolver(view, ns_g_taskmgr, 31,
ns_g_socketmgr, ns_g_timermgr,
resopts, ns_g_dispatchmgr,
SETLIMIT("files", openfiles, "open files");
}
-static isc_result_t
-portlist_fromconf(dns_portlist_t *portlist, unsigned int family,
- const cfg_obj_t *ports)
+static void
+portset_fromconf(isc_portset_t *portset, const cfg_obj_t *ports,
+ isc_boolean_t positive)
{
const cfg_listelt_t *element;
- isc_result_t result = ISC_R_SUCCESS;
for (element = cfg_list_first(ports);
element != NULL;
element = cfg_list_next(element)) {
const cfg_obj_t *obj = cfg_listelt_value(element);
- in_port_t port = (in_port_t)cfg_obj_asuint32(obj);
- result = dns_portlist_add(portlist, family, port);
- if (result != ISC_R_SUCCESS)
- break;
+ if (cfg_obj_isuint32(obj)) {
+ in_port_t port = (in_port_t)cfg_obj_asuint32(obj);
+
+ if (positive)
+ isc_portset_add(portset, port);
+ else
+ isc_portset_remove(portset, port);
+ } else {
+ const cfg_obj_t *obj_loport, *obj_hiport;
+ in_port_t loport, hiport;
+
+ obj_loport = cfg_tuple_get(obj, "loport");
+ loport = (in_port_t)cfg_obj_asuint32(obj_loport);
+ obj_hiport = cfg_tuple_get(obj, "hiport");
+ hiport = (in_port_t)cfg_obj_asuint32(obj_hiport);
+
+ if (positive)
+ isc_portset_addrange(portset, loport, hiport);
+ else {
+ isc_portset_removerange(portset, loport,
+ hiport);
+ }
+ }
}
- return (result);
}
static isc_result_t
const cfg_obj_t *options;
const cfg_obj_t *views;
const cfg_obj_t *obj;
- const cfg_obj_t *v4ports, *v6ports;
+ const cfg_obj_t *usev4ports, *avoidv4ports, *usev6ports, *avoidv6ports;
const cfg_obj_t *maps[3];
const cfg_obj_t *builtin_views;
const cfg_listelt_t *element;
isc_uint32_t interface_interval;
isc_uint32_t heartbeat_interval;
isc_uint32_t udpsize;
- in_port_t listen_port;
+ in_port_t listen_port, udpport_low, udpport_high;
+ isc_portset_t *v4portset = NULL;
+ isc_portset_t *v6portset = NULL;
int i;
cfg_aclconfctx_init(&aclconfctx);
CHECKM(ns_statschannels_configure(ns_g_server, config, &aclconfctx),
"configuring statistics server(s)");
- v4ports = NULL;
- v6ports = NULL;
- (void)ns_config_get(maps, "avoid-v4-udp-ports", &v4ports);
- (void)ns_config_get(maps, "avoid-v6-udp-ports", &v6ports);
- if (v4ports != NULL || v6ports != NULL) {
- dns_portlist_t *portlist = NULL;
- result = dns_portlist_create(ns_g_mctx, &portlist);
- if (result == ISC_R_SUCCESS && v4ports != NULL)
- result = portlist_fromconf(portlist, AF_INET, v4ports);
- if (result == ISC_R_SUCCESS && v6ports != NULL)
- portlist_fromconf(portlist, AF_INET6, v6ports);
- if (result == ISC_R_SUCCESS)
- dns_dispatchmgr_setblackportlist(ns_g_dispatchmgr, portlist);
- if (portlist != NULL)
- dns_portlist_detach(&portlist);
- CHECK(result);
- } else
- dns_dispatchmgr_setblackportlist(ns_g_dispatchmgr, NULL);
+ /*
+ * Configure sets of UDP query source ports.
+ */
+ CHECKM(isc_portset_create(ns_g_mctx, &v4portset),
+ "creating UDP port set");
+ CHECKM(isc_portset_create(ns_g_mctx, &v6portset),
+ "creating UDP port set");
+
+ usev4ports = NULL;
+ usev6ports = NULL;
+ avoidv4ports = NULL;
+ avoidv6ports = NULL;
+
+ (void)ns_config_get(maps, "use-v4-udp-ports", &usev4ports);
+ if (usev4ports != NULL)
+ portset_fromconf(v4portset, usev4ports, ISC_TRUE);
+ else {
+ CHECKM(isc_net_getudpportrange(AF_INET, &udpport_low,
+ &udpport_high),
+ "get the default UDP/IPv4 port range");
+ if (udpport_low == udpport_high)
+ isc_portset_add(v4portset, udpport_low);
+ else {
+ isc_portset_addrange(v4portset, udpport_low,
+ udpport_high);
+ }
+ isc_log_write(ns_g_lctx, NS_LOGCATEGORY_GENERAL,
+ NS_LOGMODULE_SERVER, ISC_LOG_INFO,
+ "using default UDP/IPv4 port range: [%d, %d]",
+ udpport_low, udpport_high);
+ }
+ (void)ns_config_get(maps, "avoid-v4-udp-ports", &avoidv4ports);
+ if (avoidv4ports != NULL)
+ portset_fromconf(v4portset, avoidv4ports, ISC_FALSE);
+
+ (void)ns_config_get(maps, "use-v6-udp-ports", &usev6ports);
+ if (usev6ports != NULL)
+ portset_fromconf(v6portset, usev6ports, ISC_TRUE);
+ else {
+ CHECKM(isc_net_getudpportrange(AF_INET6, &udpport_low,
+ &udpport_high),
+ "get the default UDP/IPv6 port range");
+ if (udpport_low == udpport_high)
+ isc_portset_add(v6portset, udpport_low);
+ else {
+ isc_portset_addrange(v6portset, udpport_low,
+ udpport_high);
+ }
+ isc_log_write(ns_g_lctx, NS_LOGCATEGORY_GENERAL,
+ NS_LOGMODULE_SERVER, ISC_LOG_INFO,
+ "using default UDP/IPv6 port range: [%d, %d]",
+ udpport_low, udpport_high);
+ }
+ (void)ns_config_get(maps, "avoid-v6-udp-ports", &avoidv6ports);
+ if (avoidv6ports != NULL)
+ portset_fromconf(v6portset, avoidv6ports, ISC_FALSE);
+
+ dns_dispatchmgr_setavailports(ns_g_dispatchmgr, v4portset, v6portset);
/*
* Set the EDNS UDP size when we don't match a view.
result = ISC_R_SUCCESS;
cleanup:
+ if (v4portset != NULL)
+ isc_portset_destroy(ns_g_mctx, &v4portset);
+
+ if (v6portset != NULL)
+ isc_portset_destroy(ns_g_mctx, &v6portset);
+
cfg_aclconfctx_destroy(&aclconfctx);
if (parser != NULL) {
esyscmd([sed "s/^/# /" COPYRIGHT])dnl
AC_DIVERT_POP()dnl
-AC_REVISION($Revision: 1.432.60.9 $)
+AC_REVISION($Revision: 1.432.60.10 $)
AC_INIT(lib/dns/name.c)
AC_PREREQ(2.59)
ISC_PLATFORM_HAVELIFCONF="#undef ISC_PLATFORM_HAVELIFCONF"])
AC_SUBST(ISC_PLATFORM_HAVELIFCONF)
+#
+# check if we have kqueue
+#
+AC_CHECK_FUNC(kqueue, ac_cv_have_kqueue=yes, ac_cv_have_kqueue=no)
+case $ac_cv_have_kqueue in
+yes)
+ ISC_PLATFORM_HAVEKQUEUE="#define ISC_PLATFORM_HAVEKQUEUE 1"
+ ;;
+*)
+ ISC_PLATFORM_HAVEKQUEUE="#undef ISC_PLATFORM_HAVEKQUEUE"
+ ;;
+esac
+AC_SUBST(ISC_PLATFORM_HAVEKQUEUE)
+
+#
+# check if we have epoll
+#
+AC_CHECK_FUNC(epoll_create, ac_cv_have_epoll=yes, ac_cv_have_epoll=no)
+case $ac_cv_have_epoll in
+yes)
+ ISC_PLATFORM_HAVEEPOLL="#define ISC_PLATFORM_HAVEEPOLL 1"
+ ;;
+*)
+ ISC_PLATFORM_HAVEEPOLL="#undef ISC_PLATFORM_HAVEEPOLL"
+ ;;
+esac
+AC_SUBST(ISC_PLATFORM_HAVEEPOLL)
+
+#
+# check if we support /dev/poll
+#
+AC_CHECK_HEADERS(sys/devpoll.h,
+ISC_PLATFORM_HAVEDEVPOLL="#define ISC_PLATFORM_HAVEDEVPOLL 1"
+,
+ISC_PLATFORM_HAVEDEVPOLL="#undef ISC_PLATFORM_HAVEDEVPOLL"
+)
+AC_SUBST(ISC_PLATFORM_HAVEDEVPOLL)
#
# check if we need to #include sys/select.h explicitly
- PERFORMANCE OF THIS SOFTWARE.
-->
-<!-- File: $Id: Bv9ARM-book.xml,v 1.340.24.12 2008/06/17 06:44:09 marka Exp $ -->
+<!-- File: $Id: Bv9ARM-book.xml,v 1.340.24.13 2008/06/24 00:09:11 jinmei Exp $ -->
<book xmlns:xi="http://www.w3.org/2001/XInclude">
<title>BIND 9 Administrator Reference Manual</title>
</para>
</entry>
</row>
+ <row rowsep="0">
+ <entry colname="1">
+ <para>
+ <varname>port_list</varname>
+ </para>
+ </entry>
+ <entry colname="2">
+ <para>
+ A list of <varname>ip_port</varname>s and/or port
+ ranges.
+ A port range is specified in the form of
+ <userinput>range</userinput> followed by
+ two <varname>ip_port</varname>s,
+ <varname>port_low</varname> and
+ <varname>port_high</varname>, which represents
+ port numbers from <varname>port_low</varname> through
+ <varname>port_high</varname>, inclusive.
+ <varname>port_low</varname> must not be larger than
+ <varname>port_high</varname>.
+ For example,
+ <userinput>range 1024 65535</userinput> represents
+ ports from 1024 through 65535.
+ In either case an asterisk (`*') character is not
+ allowed as a valid <varname>ip_port</varname>.
+ </para>
+ </entry>
+ </row>
<row rowsep="0">
<entry colname="1">
<para>
<optional> try-tcp-refresh <replaceable>yes_or_no</replaceable>; </optional>
<optional> allow-v6-synthesis { <replaceable>address_match_list</replaceable> }; </optional>
<optional> blackhole { <replaceable>address_match_list</replaceable> }; </optional>
+ <optional> use-v4-udp-ports { <replaceable>port_list</replaceable> }; </optional>
<optional> avoid-v4-udp-ports { <replaceable>port_list</replaceable> }; </optional>
+ <optional> use-v6-udp-ports { <replaceable>port_list</replaceable> }; </optional>
<optional> avoid-v6-udp-ports { <replaceable>port_list</replaceable> }; </optional>
<optional> listen-on <optional> port <replaceable>ip_port</replaceable> </optional> { <replaceable>address_match_list</replaceable> }; </optional>
<optional> listen-on-v6 <optional> port <replaceable>ip_port</replaceable> </optional> { <replaceable>address_match_list</replaceable> }; </optional>
If <command>address</command> is <command>*</command> (asterisk) or is omitted,
a wildcard IP address (<command>INADDR_ANY</command>)
will be used.
+ </para>
+
+ <para>
If <command>port</command> is <command>*</command> or is omitted,
- a random unprivileged port number is picked up and will be
- used for each query.
- Previously, the <command>use-queryport-pool</command> was provided
- to support a pool of such random ports, but this option is now
- obsolete because reusing the same ports in the pool is not
- sufficiently secure.
+ a random port number from a pre-configured
+ range is picked up and will be used for each query.
+ The port range(s) are those specified in
+ the <command>use-v4-udp-ports</command> (for IPv4)
+ and <command>use-v6-udp-ports</command> (for IPv6)
+ options, excluding the ranges specified in
+ the <command>avoid-v4-udp-ports</command>
+ and <command>avoid-v6-udp-ports</command> options, respectively.
+ </para>
+
+ <para>
+ The defaults of the <command>query-source</command> and
+ <command>query-source-v6</command> options
+ are:
+ </para>
+
+<programlisting>query-source address * port *;
+query-source-v6 address * port *;
+</programlisting>
+
+ <para>
+ If <command>use-v4-udp-ports</command> or
+ <command>use-v6-udp-ports</command> is unspecified,
+ <command>named</command> will check if the operating
+ system provides a programming interface to retrieve the
+ system's default range for ephemeral ports.
+ If such an interface is available,
+ <command>named</command> will use the corresponding system
+ default range; otherwise, it will use its own defaults:
+ </para>
+
+<programlisting>use-v4-udp-ports { range 1024 65535; };
+use-v6-udp-ports { range 1024 65535; };
+</programlisting>
+
+ <para>
+ Note: make sure the ranges are sufficiently large for
+ security. A desirable size depends on various parameters,
+ but we generally recommend it contain at least 16384 ports
+ (14 bits of entropy).
+ Note also that the system's default range when used may be
+ too small for this purpose, and that the range may even be
+ changed while <command>named</command> is running; the new
+ range will automatically be applied when <command>named</command>
+ is reloaded.
+ It is encouraged to
+ configure <command>use-v4-udp-ports</command> and
+ <command>use-v6-udp-ports</command> explicitly so that the
+ ranges are sufficiently large and are reasonably
+ independent from the ranges used by other applications.
+ </para>
+
+ <para>
+ Note: the operational configuration
+ where <command>named</command> runs may prohibit the use
+ of some ports. For example, UNIX systems will not allow
+ <command>named</command> running without root privilege
+ to use ports less than 1024.
+ If such ports are included in the specified (or detected)
+ set of query ports, the corresponding query attempts will
+ fail, resulting in resolution failures or delay.
+ It is therefore important to configure the set of ports
+ that can be safely used in the expected operational environment.
+ </para>
+
+ <para>
+ The defaults of the <command>avoid-v4-udp-ports</command> and
+ <command>avoid-v6-udp-ports</command> options
+ are:
+ </para>
+
+<programlisting>avoid-v4-udp-ports {};
+avoid-v6-udp-ports {};
+</programlisting>
+
+ <para>
+ Note: BIND 9.5.0 introduced
+ the <command>use-queryport-pool</command>
+ option to support a pool of such random ports, but this
+ option is now obsolete because reusing the same ports in
+ the pool may not be sufficiently secure.
For the same reason, it is generally strongly discouraged to
specify a particular port for the
<command>query-source</command> or
<command>query-source-v6</command> options;
it implicitly disables the use of randomized port numbers.
- The <command>avoid-v4-udp-ports</command>
- and <command>avoid-v6-udp-ports</command> options can be used
- to prevent named
- from selecting certain ports.
- The defaults are:
</para>
-<programlisting>query-source address * port *;
-query-source-v6 address * port *;
-</programlisting>
-
<variablelist>
<varlistentry>
<term><command>use-queryport-pool</command></term>
</sect3>
<sect3>
- <title>Bad UDP Port Lists</title>
- <para><command>avoid-v4-udp-ports</command>
- and <command>avoid-v6-udp-ports</command> specify a list
- of IPv4 and IPv6 UDP ports that will not be used as system
- assigned source ports for UDP sockets. These lists
- prevent named from choosing as its random source port a
- port that is blocked by your firewall. If a query went
- out with such a source port, the answer would not get by
- the firewall and the name server would have to query
- again.
+ <title>UDP Port Lists</title>
+ <para>
+ <command>use-v4-udp-ports</command>,
+ <command>avoid-v4-udp-ports</command>,
+ <command>use-v6-udp-ports</command>, and
+ <command>avoid-v6-udp-ports</command>
+ specify a list of IPv4 and IPv6 UDP ports that will be
+ used or not used as source ports for UDP messages.
+ See <xref linkend="query_address"/> about how the
+ available ports are determined.
+ For example, with the following configuration
</para>
+
+<programlisting>
+use-v6-udp-ports { range 32768 65535; };
+avoid-v6-udp-ports { 40000; range 50000 60000; };
+</programlisting>
+
+ <para>
+ UDP ports of IPv6 messages sent
+ from <command>named</command> will be in one
+ of the following ranges: 32768 to 39999, 40001 to 49999,
+ and 60001 to 65535.
+ </para>
+
+ <para>
+ <command>avoid-v4-udp-ports</command> and
+ <command>avoid-v6-udp-ports</command> can be used
+ to prevent <command>named</command> from choosing as its random source port a
+ port that is blocked by your firewall or a port that is
+ used by other applications;
+ if a query went out with a source port blocked by a
+ firewall, the
+ answer would not get by the firewall and the name server would
+ have to query again.
+ Note: the desired range can also be represented only with
+ <command>use-v4-udp-ports</command> and
+ <command>use-v6-udp-ports</command>, and the
+ <command>avoid-</command> options are redundant in that
+ sense; they are provided for backward compatibility and
+ to possibly simplify the port specification.
+ </para>
</sect3>
<sect3>
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: dispatch.c,v 1.137.128.4 2008/05/27 22:36:11 each Exp $ */
+/* $Id: dispatch.c,v 1.137.128.5 2008/06/24 00:09:11 jinmei Exp $ */
/*! \file */
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
+#include <stdlib.h>
#include <isc/entropy.h>
#include <isc/mem.h>
#include <isc/mutex.h>
+#include <isc/portset.h>
#include <isc/print.h>
#include <isc/random.h>
#include <isc/string.h>
typedef ISC_LIST(dns_dispentry_t) dns_displist_t;
-/* transaction ID */
-typedef struct dns_qid {
- unsigned int magic;
- unsigned int qid_nbuckets; /*%< hash table size */
- unsigned int qid_increment; /*%< id increment on collision */
- isc_mutex_t lock;
- dns_displist_t *qid_table; /*%< the table itself */
-} dns_qid_t;
-
/* ARC4 Random generator state */
typedef struct arc4ctx {
isc_uint8_t i;
isc_uint8_t j;
isc_uint8_t s[256];
int count;
+ isc_entropy_t *entropy; /*%< entropy source for ARC4 */
+ isc_mutex_t *lock;
} arc4ctx_t;
+typedef struct dns_qid {
+ unsigned int magic;
+ unsigned int qid_nbuckets; /*%< hash table size */
+ unsigned int qid_increment; /*%< id increment on collision */
+ isc_mutex_t lock;
+ dns_displist_t *qid_table; /*%< the table itself */
+ dns_displist_t *addr_table; /*%< address/port table */
+} dns_qid_t;
+
struct dns_dispatchmgr {
/* Unlocked. */
unsigned int magic;
dns_acl_t *blackhole;
dns_portlist_t *portlist;
dns_stats_t *stats;
+ isc_entropy_t *entropy; /*%< entropy source */
/* Locked by "lock". */
isc_mutex_t lock;
isc_mempool_t *rpool; /*%< memory pool for replies */
isc_mempool_t *dpool; /*%< dispatch allocations */
isc_mempool_t *bpool; /*%< memory pool for buffers */
-
- isc_entropy_t *entropy; /*%< entropy source */
+ isc_mempool_t *spool; /*%< memory pool for dispsocs */
+
+ /*%
+ * Locked by qid->lock if qid exists; otherwise, can be used without
+ * being locked.
+ * Memory footprint considerations: this is a simple implementation of
+ * available ports, i.e., an ordered array of the actual port numbers.
+ * This will require about 256KB of memory in the worst case (128KB for
+ * each of IPv4 and IPv6). We could reduce it by representing it in a
+ * more sophisticated way such as a list (or array) of ranges that are
+ * searched to identify a specific port. Our decision here is the saved
+ * memory isn't worth the implementation complexity, considering the
+ * fact that the whole BIND9 process (which is mainly named) already
+ * requires a pretty large memory footprint. We may, however, have to
+ * revisit the decision when we want to use it as a separate module for
+ * an environment where memory requirements are more severe.
+ */
+ in_port_t *v4ports; /*%< available ports for IPv4 */
+ unsigned int nv4ports; /*%< # of available ports for IPv4 */
+ in_port_t *v6ports; /*%< available ports for IPv6 */
+ unsigned int nv6ports; /*%< # of available ports for IPv6 */
};
#define MGR_SHUTTINGDOWN 0x00000001U
#define IS_PRIVATE(d) (((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
+typedef struct dispsocket dispsocket_t;
+
struct dns_dispentry {
unsigned int magic;
dns_dispatch_t *disp;
dns_messageid_t id;
in_port_t port;
unsigned int bucket;
+ unsigned int abucket;
isc_sockaddr_t host;
isc_task_t *task;
isc_taskaction_t action;
void *arg;
isc_boolean_t item_out;
+ dispsocket_t *dispsocket;
ISC_LIST(dns_dispatchevent_t) items;
ISC_LINK(dns_dispentry_t) link;
+ ISC_LINK(dns_dispentry_t) alink;
+};
+
+/*%
+ * Maximum number of dispatch sockets that can be pooled for reuse. The
+ * appropriate value may vary, but experiments have shown a busy caching server
+ * may need more than 1000 sockets concurrently opened. The maximum allowable
+ * number of dispatch sockets (per manager) will be set to twice this
+ * value.
+ */
+#ifndef DNS_DISPATCH_POOLSOCKS
+#define DNS_DISPATCH_POOLSOCKS 2048
+#endif
+
+/*%
+ * Quota to control the number of dispatch sockets. If a dispatch has more
+ * than the quota of sockets, new queries will purge oldest ones, so that
+ * a massive number of outstanding queries won't prevent subsequent queries
+ * (especially if the older ones take longer time and result in timeout).
+ */
+#ifndef DNS_DISPATCH_SOCKSQUOTA
+#define DNS_DISPATCH_SOCKSQUOTA 3072
+#endif
+
+struct dispsocket {
+ unsigned int magic;
+ isc_socket_t *socket;
+ dns_dispatch_t *disp;
+ dns_dispentry_t *resp;
+ isc_task_t *task;
+ ISC_LINK(dispsocket_t) link;
};
#define INVALID_BUCKET (0xffffdead)
+/*%
+ * Number of tasks for each dispatch that use separate sockets for different
+ * transactions. This must be a power of 2 as it will divide 32 bit numbers
+ * to get a uniformly random task selection. See get_dispsocket().
+ */
+#define MAX_INTERNAL_TASKS 64
+
struct dns_dispatch {
/* Unlocked. */
unsigned int magic; /*%< magic */
dns_dispatchmgr_t *mgr; /*%< dispatch manager */
- isc_task_t *task; /*%< internal task */
+ int ntasks;
+ /*%
+ * internal task buckets. We use multiple tasks to distribute various
+ * socket events well when using separate dispatch sockets. We use the
+ * 1st task (task[0]) for internal control events.
+ */
+ isc_task_t *task[MAX_INTERNAL_TASKS];
isc_socket_t *socket; /*%< isc socket attached to */
isc_sockaddr_t local; /*%< local address */
in_port_t localport; /*%< local UDP port */
tcpmsg_valid : 1,
recv_pending : 1; /*%< is a recv() pending? */
isc_result_t shutdown_why;
+ ISC_LIST(dispsocket_t) activesockets;
+ ISC_LIST(dispsocket_t) inactivesockets;
+ unsigned int nsockets;
unsigned int requests; /*%< how many requests we have */
unsigned int tcpbuffers; /*%< allocated buffers */
dns_tcpmsg_t tcpmsg; /*%< for tcp streams */
dns_qid_t *qid;
+ arc4ctx_t arc4ctx; /*%< for QID/UDP port num */
};
#define QID_MAGIC ISC_MAGIC('Q', 'i', 'd', ' ')
#define RESPONSE_MAGIC ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e) ISC_MAGIC_VALID((e), RESPONSE_MAGIC)
+#define DISPSOCK_MAGIC ISC_MAGIC('D', 's', 'o', 'c')
+#define VALID_DISPSOCK(e) ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)
+
#define DISPATCH_MAGIC ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e) ISC_MAGIC_VALID((e), DISPATCH_MAGIC)
#define DNS_QID(disp) ((disp)->socktype == isc_sockettype_tcp) ? \
(disp)->qid : (disp)->mgr->qid
+#define DISP_ARC4CTX(disp) ((disp)->socktype == isc_sockettype_udp) ? \
+ (&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx)
+
+/*%
+ * Locking a query port buffer is a bit tricky. We access the buffer without
+ * locking until qid is created. Technically, there is a possibility of race
+ * between the creation of qid and access to the port buffer; in practice,
+ * however, this should be safe because qid isn't created until the first
+ * dispatch is created and there should be no contending situation until then.
+ */
+#define PORTBUFLOCK(mgr) if ((mgr)->qid != NULL) LOCK(&((mgr)->qid->lock))
+#define PORTBUFUNLOCK(mgr) if ((mgr)->qid != NULL) UNLOCK((&(mgr)->qid->lock))
+
/*
* Statics.
*/
-static dns_dispentry_t *bucket_search(dns_qid_t *, isc_sockaddr_t *,
- dns_messageid_t, in_port_t, unsigned int);
+static dns_dispentry_t *bucket_search(dns_qid_t *, dns_displist_t *,
+ isc_sockaddr_t *, dns_messageid_t,
+ in_port_t, unsigned int, isc_boolean_t);
static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
static void destroy_disp(isc_task_t *task, isc_event_t *event);
-static void udp_recv(isc_task_t *, isc_event_t *);
+static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
+static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
+static void udp_exrecv(isc_task_t *, isc_event_t *);
+static void udp_shrecv(isc_task_t *, isc_event_t *);
+static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
static void tcp_recv(isc_task_t *, isc_event_t *);
-static void startrecv(dns_dispatch_t *);
+static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
in_port_t);
static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
static dns_dispentry_t *linear_next(dns_qid_t *disp,
dns_dispentry_t *resp);
static void dispatch_free(dns_dispatch_t **dispp);
+static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
+ dns_dispatch_t *disp,
+ isc_socketmgr_t *sockmgr,
+ isc_sockaddr_t *localaddr,
+ isc_socket_t **sockp,
+ unsigned int maxtry);
static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
isc_socketmgr_t *sockmgr,
isc_taskmgr_t *taskmgr,
static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
static void destroy_mgr(dns_dispatchmgr_t **mgrp);
static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
- unsigned int increment, dns_qid_t **qidp);
+ unsigned int increment, dns_qid_t **qidp,
+ isc_boolean_t needaddrtable);
static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
+static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
+ isc_socket_t **sockp);
+static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
+ isc_sockaddr_t *sockaddrp);
#define LVL(x) ISC_LOG_DEBUG(x)
}
}
-/*
- * ARC4 random number generator obtained from OpenBSD
+/*%
+ * ARC4 random number generator derived from OpenBSD.
+ * Only dispatch_arc4random() and dispatch_arc4uniformrandom() are expected
+ * to be called from general dispatch routines; the rest of them are subroutines
+ * for these two.
+ *
+ * The original copyright follows:
+ * Copyright (c) 1996, David Mazieres <dm@uun.org>
+ * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
static void
-dispatch_arc4init(arc4ctx_t *actx) {
+dispatch_arc4init(arc4ctx_t *actx, isc_entropy_t *entropy, isc_mutex_t *lock) {
int n;
for (n = 0; n < 256; n++)
actx->s[n] = n;
actx->i = 0;
actx->j = 0;
actx->count = 0;
+ actx->entropy = entropy; /* don't have to attach */
+ actx->lock = lock;
}
static void
}
static void
-dispatch_arc4stir(dns_dispatchmgr_t *mgr) {
+dispatch_arc4stir(arc4ctx_t *actx) {
int i;
union {
unsigned char rnd[128];
} rnd;
isc_result_t result;
- if (mgr->entropy != NULL) {
+ if (actx->entropy != NULL) {
/*
* We accept any quality of random data to avoid blocking.
*/
- result = isc_entropy_getdata(mgr->entropy, rnd.rnd,
+ result = isc_entropy_getdata(actx->entropy, rnd.rnd,
sizeof(rnd), NULL, 0);
RUNTIME_CHECK(result == ISC_R_SUCCESS);
} else {
for (i = 0; i < 32; i++)
isc_random_get(&rnd.rnd32[i]);
}
- dispatch_arc4addrandom(&mgr->arc4ctx, rnd.rnd, sizeof(rnd.rnd));
+ dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));
/*
* Discard early keystream, as per recommendations in:
* http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
*/
for (i = 0; i < 256; i++)
- (void)dispatch_arc4get8(&mgr->arc4ctx);
+ (void)dispatch_arc4get8(actx);
/*
* Derived from OpenBSD's implementation. The rationale is not clear,
* but should be conservative enough in safety, and reasonably large
* for efficiency.
*/
- mgr->arc4ctx.count = 1600000;
+ actx->count = 1600000;
}
static isc_uint16_t
-dispatch_arc4random(dns_dispatchmgr_t *mgr) {
+dispatch_arc4random(arc4ctx_t *actx) {
isc_uint16_t result;
- LOCK(&mgr->arc4_lock);
- mgr->arc4ctx.count -= sizeof(isc_uint16_t);
- if (mgr->arc4ctx.count <= 0)
- dispatch_arc4stir(mgr);
- result = dispatch_arc4get16(&mgr->arc4ctx);
- UNLOCK(&mgr->arc4_lock);
+ if (actx->lock != NULL)
+ LOCK(actx->lock);
+
+ actx->count -= sizeof(isc_uint16_t);
+ if (actx->count <= 0)
+ dispatch_arc4stir(actx);
+ result = dispatch_arc4get16(actx);
+
+ if (actx->lock != NULL)
+ UNLOCK(actx->lock);
+
return (result);
}
static isc_uint16_t
-dispatch_arc4uniformrandom(dns_dispatchmgr_t *mgr, isc_uint16_t upper_bound) {
+dispatch_arc4uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
isc_uint16_t min, r;
- /* The caller must hold the manager lock. */
if (upper_bound < 2)
return (0);
* to re-roll.
*/
for (;;) {
- r = dispatch_arc4random(mgr);
+ r = dispatch_arc4random(actx);
if (r >= min)
break;
}
if (disp->recv_pending != 0)
return (ISC_FALSE);
+ if (!ISC_LIST_EMPTY(disp->activesockets))
+ return (ISC_FALSE);
+
if (disp->shutting_down == 0)
return (ISC_FALSE);
return (ISC_TRUE);
}
-
/*
* Called when refcount reaches 0 (and safe to destroy).
*
dns_dispatch_t *disp;
dns_dispatchmgr_t *mgr;
isc_boolean_t killmgr;
+ dispsocket_t *dispsocket;
+ int i;
INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);
dispatch_log(disp, LVL(90),
"shutting down; detaching from sock %p, task %p",
- disp->socket, disp->task);
+ disp->socket, disp->task[0]); /* XXXX */
- isc_socket_detach(&disp->socket);
- isc_task_detach(&disp->task);
+ if (disp->socket != NULL)
+ isc_socket_detach(&disp->socket);
+ while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
+ ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
+ destroy_dispsocket(disp, &dispsocket);
+ }
+ for (i = 0; i < disp->ntasks; i++)
+ isc_task_detach(&disp->task[i]);
isc_event_free(&event);
dispatch_free(&disp);
destroy_mgr(&mgr);
}
+/*%
+ * Obtain a dedicated socket entry for a single query on 'disp' and bind its
+ * socket to a randomly chosen port taken from the manager's available-port
+ * array for the address family of disp->local.
+ *
+ * An entry from disp->inactivesockets is reused when there is one (its
+ * socket is handed to open_socket() to be reopened); otherwise a fresh entry
+ * is taken from mgr->spool and attached to one of the dispatch's tasks,
+ * picked at random.
+ *
+ * Returns ISC_R_ADDRNOTAVAIL when the port array is empty and
+ * ISC_R_NOMEMORY when no entry can be allocated.  On success the entry, the
+ * address-table bucket and the selected port are stored through 'dispsockp',
+ * 'abucketp' and 'portp'.  On any later failure the entry is destroyed and
+ * the output arguments are not written.
+ *
+ * The caller must hold the disp->lock and qid->lock.
+ */
+static isc_result_t
+get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+	       isc_socketmgr_t *sockmgr, dns_qid_t *qid,
+	       dispsocket_t **dispsockp, unsigned int *abucketp,
+	       in_port_t *portp)
+{
+	dns_dispatchmgr_t *mgr = disp->mgr;
+	int isv4 = (isc_sockaddr_pf(&disp->local) == AF_INET);
+	unsigned int nports = isv4 ? mgr->nv4ports : mgr->nv6ports;
+	in_port_t *ports = isv4 ? mgr->v4ports : mgr->v6ports;
+	isc_result_t result = ISC_R_FAILURE;
+	isc_socket_t *sock = NULL;
+	dispsocket_t *entry;
+	isc_sockaddr_t bindaddr;
+	unsigned int abucket = 0;
+	in_port_t port;
+	isc_uint32_t rnd;
+	int attempt;
+
+	if (nports == 0)
+		return (ISC_R_ADDRNOTAVAIL);
+
+	entry = ISC_LIST_HEAD(disp->inactivesockets);
+	if (entry == NULL) {
+		/* Nothing to recycle: build a new entry from the pool. */
+		entry = isc_mempool_get(mgr->spool);
+		if (entry == NULL)
+			return (ISC_R_NOMEMORY);
+
+		disp->nsockets++;
+		entry->socket = NULL;
+		entry->disp = disp;
+		entry->resp = NULL;
+		entry->task = NULL;
+		isc_random_get(&rnd);
+		isc_task_attach(disp->task[rnd % disp->ntasks], &entry->task);
+		ISC_LINK_INIT(entry, link);
+		entry->magic = DISPSOCK_MAGIC;
+	} else {
+		/*
+		 * Recycle an inactive entry.  Its socket is carried in 'sock'
+		 * until binding succeeds.
+		 */
+		ISC_LIST_UNLINK(disp->inactivesockets, entry, link);
+		sock = entry->socket;
+		entry->socket = NULL;
+	}
+
+	/*
+	 * Try up to 64 random ports.  A port already recorded in the address
+	 * table for the same destination is skipped without touching the
+	 * socket, because bind(2) or connect(2) would very likely fail for it.
+	 * The search stops at the first open_socket() result other than
+	 * ISC_R_ADDRINUSE, whether that is success or a hard error.
+	 */
+	bindaddr = disp->local;
+	attempt = 0;
+	while (attempt++ < 64) {
+		port = ports[dispatch_arc4uniformrandom(DISP_ARC4CTX(disp),
+							nports)];
+		isc_sockaddr_setport(&bindaddr, port);
+
+		abucket = dns_hash(qid, dest, 0, port);
+		if (bucket_search(qid, qid->addr_table, dest, 0, port, abucket,
+				  ISC_TRUE) == NULL) {
+			result = open_socket(sockmgr, &bindaddr, &sock);
+			if (result != ISC_R_ADDRINUSE)
+				break;
+		}
+	}
+
+	if (result != ISC_R_SUCCESS) {
+		/*
+		 * The entry could be kept on the inactive list, but this is
+		 * expected to be exceptional and may indicate resource
+		 * shortage, so it is destroyed instead.
+		 */
+		if (sock != NULL)
+			isc_socket_detach(&sock);
+		destroy_dispsocket(disp, &entry);
+		return (result);
+	}
+
+	entry->socket = sock;
+	*dispsockp = entry;
+	*abucketp = abucket;
+	*portp = port;
+
+	return (ISC_R_SUCCESS);
+}
+
+/*%
+ * Release a dedicated dispatch socket entry: detach its socket and its task
+ * when they are set, return the entry to the manager's socket pool, and
+ * clear the caller's pointer.  The entry must not be linked on a list.
+ *
+ * The dispatch must be locked.
+ */
+static void
+destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
+	dispsocket_t *dispsock;
+
+	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
+	dispsock = *dispsockp;
+	REQUIRE(!ISC_LINK_LINKED(dispsock, link));
+
+	/* The caller's handle is invalid from here on. */
+	*dispsockp = NULL;
+
+	dispsock->magic = 0;
+	disp->nsockets--;
+
+	if (dispsock->socket != NULL)
+		isc_socket_detach(&dispsock->socket);
+	if (dispsock->task != NULL)
+		isc_task_detach(&dispsock->task);
+
+	isc_mempool_put(disp->mgr->spool, dispsock);
+}
+
+/*%
+ * Take a dedicated dispatch socket off the active list and break the link
+ * from its response entry, if it has one.  The entry is then either
+ * destroyed, when the dispatch currently owns more than
+ * DNS_DISPATCH_POOLSOCKS sockets, or its socket is closed and the entry is
+ * appended to the inactive list for later reuse.
+ *
+ * The dispatch must be locked.
+ */
+static void
+deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
+	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
+
+	if (dispsock->resp != NULL) {
+		INSIST(dispsock->resp->dispsocket == dispsock);
+		dispsock->resp->dispsocket = NULL;
+	}
+
+	/* Over the pooling limit: do not keep this one around. */
+	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS) {
+		destroy_dispsocket(disp, &dispsock);
+		return;
+	}
+
+	isc_socket_close(dispsock->socket);
+	ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
+}
/*
- * Find an entry for query ID 'id' and socket address 'dest' in 'qid'.
+ * Find an entry for query ID 'id', socket address 'dest', and port number
+ * 'port' in 'table'.
* Return NULL if no such entry exists.
*/
static dns_dispentry_t *
-bucket_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
- in_port_t port, unsigned int bucket)
+bucket_search(dns_qid_t *qid, dns_displist_t *table, isc_sockaddr_t *dest,
+ dns_messageid_t id, in_port_t port, unsigned int bucket,
+ isc_boolean_t ignoreid)
{
dns_dispentry_t *res;
REQUIRE(bucket < qid->qid_nbuckets);
- res = ISC_LIST_HEAD(qid->qid_table[bucket]);
+ res = ISC_LIST_HEAD(table[bucket]);
while (res != NULL) {
- if ((res->id == id) && isc_sockaddr_equal(dest, &res->host) &&
+ if ((ignoreid || res->id == id) &&
+ isc_sockaddr_equal(dest, &res->host) &&
res->port == port) {
return (res);
}
return (ev);
}
+/*
+ * Receive-event action for a dedicated dispatch socket.  The event argument
+ * is the dispsocket_t that startrecv() registered; the event is passed on to
+ * udp_recv() together with the owning dispatch.
+ */
+static void
+udp_exrecv(isc_task_t *task, isc_event_t *ev) {
+	dispsocket_t *dispsock;
+
+	UNUSED(task);
+
+	dispsock = ev->ev_arg;
+	REQUIRE(VALID_DISPSOCK(dispsock));
+
+	udp_recv(ev, dispsock->disp, dispsock);
+}
+
+/*
+ * Receive-event action for the dispatch's own (shared) socket.  The event
+ * argument is the dispatch itself; udp_recv() is called with no dedicated
+ * socket entry.
+ */
+static void
+udp_shrecv(isc_task_t *task, isc_event_t *ev) {
+	dns_dispatch_t *disp;
+
+	UNUSED(task);
+
+	disp = ev->ev_arg;
+	REQUIRE(VALID_DISPATCH(disp));
+
+	udp_recv(ev, disp, NULL);
+}
+
/*
* General flow:
*
* restart.
*/
static void
-udp_recv(isc_task_t *task, isc_event_t *ev_in) {
+udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) {
isc_socketevent_t *ev = (isc_socketevent_t *)ev_in;
- dns_dispatch_t *disp = ev_in->ev_arg;
dns_messageid_t id;
isc_result_t dres;
isc_buffer_t source;
unsigned int flags;
- dns_dispentry_t *resp;
+ dns_dispentry_t *resp = NULL;
dns_dispatchevent_t *rev;
unsigned int bucket;
isc_boolean_t killit;
dns_qid_t *qid;
isc_netaddr_t netaddr;
int match;
-
- UNUSED(task);
+ int result;
+ isc_boolean_t qidlocked = ISC_FALSE;
LOCK(&disp->lock);
"got packet: requests %d, buffers %d, recvs %d",
disp->requests, disp->mgr->buffers, disp->recv_pending);
- if (ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
+ if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) {
/*
* Unless the receive event was imported from a listening
* interface, in which case the event type is
disp->recv_pending = 0;
}
+ if (dispsock != NULL &&
+ (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) {
+ /*
+ * dispsock->resp can be NULL if this transaction was canceled
+ * just after receiving a response. Since this socket is
+ * exclusively used and there should be at most one receive
+ * event, the canceled event should have had no effect. So
+ * we can (and should) deactivate the socket right now.
+ */
+ deactivate_dispsocket(disp, dispsock);
+ dispsock = NULL;
+ }
+
if (disp->shutting_down) {
/*
* This dispatcher is shutting down.
killit = destroy_disp_ok(disp);
UNLOCK(&disp->lock);
if (killit)
- isc_task_send(disp->task, &disp->ctlevent);
+ isc_task_send(disp->task[0], &disp->ctlevent);
return;
}
- if (ev->result != ISC_R_SUCCESS) {
+ if (dispsock != NULL &&
+ (disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+ resp = dispsock->resp;
+ id = resp->id;
+ if (ev->result != ISC_R_SUCCESS) {
+ /*
+ * This is most likely a network error on a connected
+ * socket. It makes no sense to check the address or
+ * parse the packet, but it will help to return the
+ * error to the caller.
+ */
+ goto sendresponse;
+ }
+ } else if (ev->result != ISC_R_SUCCESS) {
free_buffer(disp, ev->region.base, ev->region.length);
if (ev->result != ISC_R_CANCELED)
goto restart;
}
- /* response */
- bucket = dns_hash(qid, &ev->address, id, disp->localport);
- LOCK(&qid->lock);
- resp = bucket_search(qid, &ev->address, id, disp->localport, bucket);
- dispatch_log(disp, LVL(90),
- "search for response in bucket %d: %s",
- bucket, (resp == NULL ? "not found" : "found"));
-
+ /*
+ * Search for the corresponding response. If we are using an exclusive
+ * socket, we've already identified it and we can skip the search; but
+ * the ID and the address must match the expected ones.
+ */
if (resp == NULL) {
+ bucket = dns_hash(qid, &ev->address, id, disp->localport);
+ LOCK(&qid->lock);
+ qidlocked = ISC_TRUE;
+ resp = bucket_search(qid, qid->qid_table, &ev->address, id,
+ disp->localport, bucket, ISC_FALSE);
+ dispatch_log(disp, LVL(90),
+ "search for response in bucket %d: %s",
+ bucket, (resp == NULL ? "not found" : "found"));
+
+ if (resp == NULL) {
+ dns_generalstats_increment(mgr->stats,
+ dns_resstatscounter_mismatch);
+ free_buffer(disp, ev->region.base, ev->region.length);
+ goto unlock;
+ }
+ } else if (resp->id != id || !isc_sockaddr_equal(&ev->address,
+ &resp->host)) {
+ dispatch_log(disp, LVL(90),
+ "response to an exclusive socket doesn't match");
dns_generalstats_increment(mgr->stats,
dns_resstatscounter_mismatch);
free_buffer(disp, ev->region.base, ev->region.length);
}
}
+ sendresponse:
queue_response = resp->item_out;
rev = allocate_event(resp->disp);
if (rev == NULL) {
*/
isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length);
isc_buffer_add(&rev->buffer, ev->n);
- rev->result = ISC_R_SUCCESS;
+ rev->result = ev->result;
rev->id = id;
rev->addr = ev->address;
rev->pktinfo = ev->pktinfo;
isc_task_send(resp->task, ISC_EVENT_PTR(&rev));
}
unlock:
- UNLOCK(&qid->lock);
+ if (qidlocked)
+ UNLOCK(&qid->lock);
/*
* Restart recv() to get the next packet.
*/
restart:
- startrecv(disp);
-
+ result = startrecv(disp, dispsock);
+ if (result != ISC_R_SUCCESS && dispsock != NULL) {
+ /*
+ * XXX: weird. There seems to be no recovery process other than
+ * deactivating this socket anyway (since we cannot start
+ * receiving, we won't be able to receive a cancel event
+ * from the user).
+ */
+ deactivate_dispsocket(disp, dispsock);
+ }
UNLOCK(&disp->lock);
isc_event_free(&ev_in);
killit = destroy_disp_ok(disp);
UNLOCK(&disp->lock);
if (killit)
- isc_task_send(disp->task, &disp->ctlevent);
+ isc_task_send(disp->task[0], &disp->ctlevent);
return;
}
*/
bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport);
LOCK(&qid->lock);
- resp = bucket_search(qid, &tcpmsg->address, id, disp->localport,
- bucket);
+ resp = bucket_search(qid, qid->qid_table, &tcpmsg->address, id,
+ disp->localport, bucket, ISC_FALSE);
dispatch_log(disp, LVL(90),
"search for response in bucket %d: %s",
bucket, (resp == NULL ? "not found" : "found"));
* Restart recv() to get the next packet.
*/
restart:
- startrecv(disp);
+ (void)startrecv(disp, NULL);
UNLOCK(&disp->lock);
/*
* disp must be locked.
*/
-static void
-startrecv(dns_dispatch_t *disp) {
+static isc_result_t
+startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) {
isc_result_t res;
isc_region_t region;
+ isc_socket_t *socket;
if (disp->shutting_down == 1)
- return;
+ return (ISC_R_SUCCESS);
if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0)
- return;
+ return (ISC_R_SUCCESS);
- if (disp->recv_pending != 0)
- return;
+ if (disp->recv_pending != 0 && dispsock == NULL)
+ return (ISC_R_SUCCESS);
if (disp->mgr->buffers >= disp->mgr->maxbuffers)
- return;
+ return (ISC_R_NOMEMORY);
+
+ if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
+ dispsock == NULL)
+ return (ISC_R_SUCCESS);
+
+ if (dispsock != NULL)
+ socket = dispsock->socket;
+ else
+ socket = disp->socket;
+ INSIST(socket != NULL);
switch (disp->socktype) {
/*
region.length = disp->mgr->buffersize;
region.base = allocate_udp_buffer(disp);
if (region.base == NULL)
- return;
- res = isc_socket_recv(disp->socket, ®ion, 1,
- disp->task, udp_recv, disp);
- if (res != ISC_R_SUCCESS) {
- free_buffer(disp, region.base, region.length);
- disp->shutdown_why = res;
- disp->shutting_down = 1;
- do_cancel(disp);
- return;
+ return (ISC_R_NOMEMORY);
+ if (dispsock != NULL) {
+ res = isc_socket_recv(socket, ®ion, 1,
+ dispsock->task, udp_exrecv,
+ dispsock);
+ if (res != ISC_R_SUCCESS) {
+ free_buffer(disp, region.base, region.length);
+ return (res);
+ }
+ } else {
+ res = isc_socket_recv(socket, ®ion, 1,
+ disp->task[0], udp_shrecv, disp);
+ if (res != ISC_R_SUCCESS) {
+ free_buffer(disp, region.base, region.length);
+ disp->shutdown_why = res;
+ disp->shutting_down = 1;
+ do_cancel(disp);
+ return (ISC_R_SUCCESS); /* recover by cancel */
+ }
+ INSIST(disp->recv_pending == 0);
+ disp->recv_pending = 1;
}
- INSIST(disp->recv_pending == 0);
- disp->recv_pending = 1;
break;
case isc_sockettype_tcp:
- res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task,
+ res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0],
tcp_recv, disp);
if (res != ISC_R_SUCCESS) {
disp->shutdown_why = res;
disp->shutting_down = 1;
do_cancel(disp);
- return;
+ return (ISC_R_SUCCESS); /* recover by cancel */
}
INSIST(disp->recv_pending == 0);
disp->recv_pending = 1;
INSIST(0);
break;
}
+
+ return (ISC_R_SUCCESS);
}
/*
isc_mempool_destroy(&mgr->rpool);
isc_mempool_destroy(&mgr->dpool);
isc_mempool_destroy(&mgr->bpool);
+ isc_mempool_destroy(&mgr->spool);
DESTROYLOCK(&mgr->pool_lock);
if (mgr->blackhole != NULL)
dns_acl_detach(&mgr->blackhole);
- if (mgr->portlist != NULL)
- dns_portlist_detach(&mgr->portlist);
-
if (mgr->stats != NULL)
dns_stats_detach(&mgr->stats);
+ if (mgr->v4ports != NULL) {
+ isc_mem_put(mctx, mgr->v4ports,
+ mgr->nv4ports * sizeof(in_port_t));
+ }
+ if (mgr->v6ports != NULL) {
+ isc_mem_put(mctx, mgr->v6ports,
+ mgr->nv6ports * sizeof(in_port_t));
+ }
isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t));
isc_mem_detach(&mctx);
}
static isc_result_t
-create_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
- isc_socket_t **sockp)
+open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
+ isc_socket_t **sockp)
{
isc_socket_t *sock;
isc_result_t result;
- sock = NULL;
- result = isc_socket_create(mgr, isc_sockaddr_pf(local),
- isc_sockettype_udp, &sock);
- if (result != ISC_R_SUCCESS)
- return (result);
- isc_socket_setname(sock, "dispatcher", NULL);
+ sock = *sockp;
+ if (sock == NULL) {
+ result = isc_socket_create(mgr, isc_sockaddr_pf(local),
+ isc_sockettype_udp, &sock);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ isc_socket_setname(sock, "dispatcher", NULL);
+ } else {
+ result = isc_socket_open(sock);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ }
#ifndef ISC_ALLOW_MAPPED
isc_socket_ipv6only(sock, ISC_TRUE);
#endif
result = isc_socket_bind(sock, local);
if (result != ISC_R_SUCCESS) {
- isc_socket_detach(&sock);
+ if (*sockp == NULL)
+ isc_socket_detach(&sock);
+ else
+ isc_socket_close(sock);
return (result);
}
return (ISC_R_SUCCESS);
}
+/*%
+ * Build the initial default set of dispatch ports, [1024, 65535], in a newly
+ * created port set returned through 'portsetp'.  Applications normally set
+ * the ports explicitly, so this mostly serves minor corner cases.
+ *
+ * Returns the result of isc_portset_create(); the range is added only when
+ * creation succeeded.
+ */
+static isc_result_t
+create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) {
+	isc_result_t result;
+
+	result = isc_portset_create(mctx, portsetp);
+	if (result == ISC_R_SUCCESS)
+		isc_portset_addrange(*portsetp, 1024, 65535);
+
+	return (result);
+}
+
/*
* Publics.
*/
{
dns_dispatchmgr_t *mgr;
isc_result_t result;
+ isc_portset_t *v4portset = NULL;
+ isc_portset_t *v6portset = NULL;
REQUIRE(mctx != NULL);
REQUIRE(mgrp != NULL && *mgrp == NULL);
isc_mem_attach(mctx, &mgr->mctx);
mgr->blackhole = NULL;
- mgr->portlist = NULL;
mgr->stats = NULL;
result = isc_mutex_init(&mgr->lock);
mgr->buffersize = 0;
mgr->maxbuffers = 0;
mgr->bpool = NULL;
+ mgr->spool = NULL;
mgr->entropy = NULL;
mgr->qid = NULL;
mgr->state = 0;
ISC_LIST_INIT(mgr->list);
+ mgr->v4ports = NULL;
+ mgr->v6ports = NULL;
+ mgr->nv4ports = 0;
+ mgr->nv6ports = 0;
mgr->magic = DNS_DISPATCHMGR_MAGIC;
+ result = create_default_portset(mctx, &v4portset);
+ if (result == ISC_R_SUCCESS) {
+ result = create_default_portset(mctx, &v6portset);
+ if (result == ISC_R_SUCCESS) {
+ result = dns_dispatchmgr_setavailports(mgr,
+ v4portset,
+ v6portset);
+ }
+ }
+ if (v4portset != NULL)
+ isc_portset_destroy(mctx, &v4portset);
+ if (v6portset != NULL)
+ isc_portset_destroy(mctx, &v6portset);
+ if (result != ISC_R_SUCCESS)
+ goto kill_dpool;
+
if (entropy != NULL)
isc_entropy_attach(entropy, &mgr->entropy);
- dispatch_arc4init(&mgr->arc4ctx);
+ dispatch_arc4init(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock);
*mgrp = mgr;
return (ISC_R_SUCCESS);
+ kill_dpool:
+ isc_mempool_destroy(&mgr->dpool);
kill_rpool:
isc_mempool_destroy(&mgr->rpool);
kill_epool:
dns_portlist_t *portlist)
{
REQUIRE(VALID_DISPATCHMGR(mgr));
- if (mgr->portlist != NULL)
- dns_portlist_detach(&mgr->portlist);
- if (portlist != NULL)
- dns_portlist_attach(portlist, &mgr->portlist);
+ UNUSED(portlist);
+
+ /* This function is deprecated: use dns_dispatchmgr_setavailports(). */
+ return;
}
dns_portlist_t *
dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) {
REQUIRE(VALID_DISPATCHMGR(mgr));
- return (mgr->portlist);
+ return (NULL); /* this function is deprecated */
+}
+
+/*
+ * Replace the manager's arrays of available UDP ports with the members of
+ * 'v4portset' and 'v6portset'.
+ *
+ * Each array is filled in ascending port order, which is the order the
+ * bsearch() in portavailable() depends on.  Both new arrays are fully built
+ * before the old ones are released, and the swap itself is done between
+ * PORTBUFLOCK() and PORTBUFUNLOCK().  An empty set leaves the corresponding
+ * array pointer NULL with a count of zero.
+ *
+ * Returns ISC_R_NOMEMORY if either array cannot be allocated, in which case
+ * the manager is left unchanged; ISC_R_SUCCESS otherwise.
+ */
+isc_result_t
+dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
+			      isc_portset_t *v6portset)
+{
+	in_port_t *v4ports = NULL, *v6ports = NULL, p;
+	unsigned int nv4ports, nv6ports, i4 = 0, i6 = 0;
+	unsigned int n;
+
+	REQUIRE(VALID_DISPATCHMGR(mgr));
+
+	nv4ports = isc_portset_nports(v4portset);
+	nv6ports = isc_portset_nports(v6portset);
+
+	if (nv4ports != 0) {
+		v4ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4ports);
+		if (v4ports == NULL)
+			return (ISC_R_NOMEMORY);
+	}
+	if (nv6ports != 0) {
+		v6ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6ports);
+		if (v6ports == NULL) {
+			/* Undo the IPv4 allocation made just above. */
+			if (v4ports != NULL) {
+				isc_mem_put(mgr->mctx, v4ports,
+					    sizeof(in_port_t) * nv4ports);
+			}
+			return (ISC_R_NOMEMORY);
+		}
+	}
+
+	/*
+	 * Walk every possible port number once.  The counter is wider than
+	 * in_port_t so that the loop can terminate after port 65535.
+	 */
+	for (n = 0; n <= 65535; n++) {
+		p = (in_port_t)n;
+		if (isc_portset_isset(v4portset, p)) {
+			INSIST(i4 < nv4ports);
+			v4ports[i4++] = p;
+		}
+		if (isc_portset_isset(v6portset, p)) {
+			INSIST(i6 < nv6ports);
+			v6ports[i6++] = p;
+		}
+	}
+	INSIST(i4 == nv4ports && i6 == nv6ports);
+
+	PORTBUFLOCK(mgr);
+
+	if (mgr->v4ports != NULL) {
+		isc_mem_put(mgr->mctx, mgr->v4ports,
+			    mgr->nv4ports * sizeof(in_port_t));
+	}
+	if (mgr->v6ports != NULL) {
+		isc_mem_put(mgr->mctx, mgr->v6ports,
+			    mgr->nv6ports * sizeof(in_port_t));
+	}
+
+	mgr->v4ports = v4ports;
+	mgr->nv4ports = nv4ports;
+	mgr->v6ports = v6ports;
+	mgr->nv6ports = nv6ports;
+
+	PORTBUFUNLOCK(mgr);
+
+	return (ISC_R_SUCCESS);
}
static isc_result_t
dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr,
- unsigned int buffersize, unsigned int maxbuffers,
- unsigned int buckets, unsigned int increment)
+ unsigned int buffersize, unsigned int maxbuffers,
+ unsigned int maxrequests, unsigned int buckets,
+ unsigned int increment)
{
isc_result_t result;
maxbuffers = 8;
LOCK(&mgr->buffer_lock);
+
+ /* Create or adjust buffer pool */
if (mgr->bpool != NULL) {
isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
mgr->maxbuffers = maxbuffers;
+ } else {
+ result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool);
+ if (result != ISC_R_SUCCESS) {
+ UNLOCK(&mgr->buffer_lock);
+ return (result);
+ }
+ isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
+ isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
+ isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
+ }
+
+ /* Create or adjust socket pool */
+ if (mgr->spool != NULL) {
+ isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2);
UNLOCK(&mgr->buffer_lock);
return (ISC_R_SUCCESS);
}
-
- if (isc_mempool_create(mgr->mctx, buffersize,
- &mgr->bpool) != ISC_R_SUCCESS) {
+ result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t),
+ &mgr->spool);
+ if (result != ISC_R_SUCCESS) {
UNLOCK(&mgr->buffer_lock);
- return (ISC_R_NOMEMORY);
+ goto cleanup;
}
+ isc_mempool_setname(mgr->spool, "dispmgr_spool");
+ isc_mempool_setmaxalloc(mgr->spool, maxrequests);
+ isc_mempool_associatelock(mgr->spool, &mgr->pool_lock);
- isc_mempool_setname(mgr->bpool, "dispmgr_bpool");
- isc_mempool_setmaxalloc(mgr->bpool, maxbuffers);
- isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock);
-
- result = qid_allocate(mgr, buckets, increment, &mgr->qid);
+ result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE);
if (result != ISC_R_SUCCESS)
goto cleanup;
cleanup:
isc_mempool_destroy(&mgr->bpool);
+ if (mgr->spool != NULL)
+ isc_mempool_destroy(&mgr->spool);
UNLOCK(&mgr->buffer_lock);
- return (ISC_R_NOMEMORY);
+ return (result);
}
void
dns_stats_attach(stats, &mgr->stats);
}
+/*
+ * Three-way comparison of two in_port_t values, in the form expected by
+ * bsearch(): negative, zero or positive when the key is respectively less
+ * than, equal to or greater than the array entry.
+ */
+static int
+port_cmp(const void *key, const void *ent) {
+	const in_port_t *wanted = key;
+	const in_port_t *member = ent;
+
+	return ((*wanted > *member) - (*wanted < *member));
+}
+
static isc_boolean_t
-blacklisted(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
- isc_sockaddr_t *sockaddrp)
+portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
+ isc_sockaddr_t *sockaddrp)
{
isc_sockaddr_t sockaddr;
isc_result_t result;
+ in_port_t *ports, port;
+ unsigned int nports;
+ isc_boolean_t available = ISC_FALSE;
REQUIRE(sock != NULL || sockaddrp != NULL);
- if (mgr->portlist == NULL)
- return (ISC_FALSE);
-
+ PORTBUFLOCK(mgr);
if (sock != NULL) {
sockaddrp = &sockaddr;
result = isc_socket_getsockname(sock, sockaddrp);
if (result != ISC_R_SUCCESS)
- return (ISC_FALSE);
+ goto unlock;
}
- if (dns_portlist_match(mgr->portlist, isc_sockaddr_pf(sockaddrp),
- isc_sockaddr_getport(sockaddrp)))
- return (ISC_TRUE);
- return (ISC_FALSE);
+ if (isc_sockaddr_pf(sockaddrp) == AF_INET) {
+ ports = mgr->v4ports;
+ nports = mgr->nv4ports;
+ } else {
+ ports = mgr->v6ports;
+ nports = mgr->nv6ports;
+ }
+ if (ports == NULL)
+ goto unlock;
+
+ port = isc_sockaddr_getport(sockaddrp);
+ if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL)
+ available = ISC_TRUE;
+
+unlock:
+ PORTBUFUNLOCK(mgr);
+ return (available);
}
#define ATTRMATCH(_a1, _a2, _mask) (((_a1) & (_mask)) == ((_a2) & (_mask)))
return (ISC_TRUE);
/*
- * Don't match wildcard ports against newly blacklisted ports.
+ * Don't match wildcard ports unless the port is available in the
+ * current configuration. We can skip this check when disp->socket is
+ * NULL because such a dispatcher will choose ports on-demand from
+ * the available set.
*/
- if (disp->mgr->portlist != NULL &&
- isc_sockaddr_getport(addr) == 0 &&
+ if (isc_sockaddr_getport(addr) == 0 &&
isc_sockaddr_getport(&disp->local) == 0 &&
- blacklisted(disp->mgr, disp->socket, NULL))
+ disp->socket != NULL &&
+ !portavailable(disp->mgr, disp->socket, NULL)) {
return (ISC_FALSE);
+ }
/*
* Check if we match the binding <address,port>.
static isc_result_t
qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
- unsigned int increment, dns_qid_t **qidp)
+ unsigned int increment, dns_qid_t **qidp,
+ isc_boolean_t needaddrtable)
{
dns_qid_t *qid;
unsigned int i;
return (ISC_R_NOMEMORY);
}
+	qid->addr_table = NULL;
+	if (needaddrtable) {
+		qid->addr_table = isc_mem_get(mgr->mctx,
+					      buckets * sizeof(dns_displist_t));
+		if (qid->addr_table == NULL) {
+			/*
+			 * Release the bucket table before the qid structure
+			 * that points to it: qid->qid_table must not be read
+			 * once qid has been handed back to the memory
+			 * context.
+			 */
+			isc_mem_put(mgr->mctx, qid->qid_table,
+				    buckets * sizeof(dns_displist_t));
+			isc_mem_put(mgr->mctx, qid, sizeof(*qid));
+			return (ISC_R_NOMEMORY);
+		}
+	}
+
result = isc_mutex_init(&qid->lock);
if (result != ISC_R_SUCCESS) {
+ if (qid->addr_table != NULL) {
+ isc_mem_put(mgr->mctx, qid->addr_table,
+ buckets * sizeof(dns_displist_t));
+ }
isc_mem_put(mgr->mctx, qid->qid_table,
buckets * sizeof(dns_displist_t));
isc_mem_put(mgr->mctx, qid, sizeof(*qid));
return (result);
}
- for (i = 0; i < buckets; i++)
+ for (i = 0; i < buckets; i++) {
ISC_LIST_INIT(qid->qid_table[i]);
+ if (qid->addr_table != NULL)
+ ISC_LIST_INIT(qid->addr_table[i]);
+ }
qid->qid_nbuckets = buckets;
qid->qid_increment = increment;
qid->magic = 0;
isc_mem_put(mctx, qid->qid_table,
qid->qid_nbuckets * sizeof(dns_displist_t));
+ if (qid->addr_table != NULL) {
+ isc_mem_put(mctx, qid->addr_table,
+ qid->qid_nbuckets * sizeof(dns_displist_t));
+ }
DESTROYLOCK(&qid->lock);
isc_mem_put(mctx, qid, sizeof(*qid));
}
disp->requests = 0;
disp->tcpbuffers = 0;
disp->qid = NULL;
+ ISC_LIST_INIT(disp->activesockets);
+ ISC_LIST_INIT(disp->inactivesockets);
+ disp->nsockets = 0;
+ dispatch_arc4init(&disp->arc4ctx, mgr->entropy, NULL);
result = isc_mutex_init(&disp->lock);
if (result != ISC_R_SUCCESS)
INSIST(disp->tcpbuffers == 0);
INSIST(disp->requests == 0);
INSIST(disp->recv_pending == 0);
+ INSIST(ISC_LIST_EMPTY(disp->activesockets));
+ INSIST(ISC_LIST_EMPTY(disp->inactivesockets));
isc_mempool_put(mgr->epool, disp->failsafe_ev);
disp->failsafe_ev = NULL;
return (result);
}
- result = qid_allocate(mgr, buckets, increment, &disp->qid);
+ result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE);
if (result != ISC_R_SUCCESS)
goto deallocate_dispatch;
disp->socket = NULL;
isc_socket_attach(sock, &disp->socket);
- disp->task = NULL;
- result = isc_task_create(taskmgr, 0, &disp->task);
+ disp->ntasks = 1;
+ disp->task[0] = NULL;
+ result = isc_task_create(taskmgr, 0, &disp->task[0]);
if (result != ISC_R_SUCCESS)
goto kill_socket;
goto kill_task;
}
- isc_task_setname(disp->task, "tcpdispatch", disp);
+ isc_task_setname(disp->task[0], "tcpdispatch", disp);
dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg);
disp->tcpmsg_valid = 1;
UNLOCK(&mgr->lock);
mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp);
- dispatch_log(disp, LVL(90), "created task %p", disp->task);
+ dispatch_log(disp, LVL(90), "created task %p", disp->task[0]);
*dispp = disp;
* Error returns.
*/
kill_task:
- isc_task_detach(&disp->task);
+ isc_task_detach(&disp->task[0]);
kill_socket:
isc_socket_detach(&disp->socket);
deallocate_dispatch:
REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0);
result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers,
- buckets, increment);
+ maxrequests, buckets, increment);
if (result != ISC_R_SUCCESS)
return (result);
LOCK(&mgr->lock);
- if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) != 0) {
+ if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
REQUIRE(isc_sockaddr_getport(localaddr) == 0);
goto createudp;
}
{
disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
if (disp->recv_pending != 0)
- isc_socket_cancel(disp->socket, disp->task,
+ isc_socket_cancel(disp->socket, disp->task[0],
ISC_SOCKCANCEL_RECV);
}
#define DNS_DISPATCH_HELD 20U
#endif
+/*%
+ * Create and bind the single UDP socket of a dispatch; dispatch_createudp()
+ * calls this only when DNS_DISPATCHATTR_EXCLUSIVE is not set.
+ *
+ * If 'localaddr' has a non-zero port the socket is bound to exactly that
+ * address.  If the port is 0, up to 1024 ports are drawn at random from the
+ * manager's available-port array for the address family of 'localaddr'.  The
+ * first attempt whose result is not ISC_R_ADDRINUSE ends the search: that
+ * result (success or error) is returned and disp->localport is set to the
+ * port that was tried.  If every attempt hits ISC_R_ADDRINUSE, the kernel is
+ * asked to choose a port, up to 'maxtry' times, until portavailable()
+ * accepts the one it picked.  Sockets with rejected ports are parked in a
+ * ring of DNS_DISPATCH_HELD entries so that the OS does not hand the same
+ * port straight back, and are released before returning.
+ *
+ * The address family is taken from 'localaddr', not from disp->local,
+ * because dispatch_createudp() assigns disp->local only after this function
+ * has returned.
+ *
+ * Returns ISC_R_ADDRNOTAVAIL if a port must be chosen but the port array is
+ * empty, ISC_R_FAILURE if 'maxtry' attempts produced no acceptable port, or
+ * the error from open_socket().  On success '*sockp' holds the bound socket.
+ */
+static isc_result_t
+get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp,
+	      isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr,
+	      isc_socket_t **sockp, unsigned int maxtry)
+{
+	unsigned int i, j;
+	isc_socket_t *held[DNS_DISPATCH_HELD];
+	isc_sockaddr_t localaddr_bound;
+	isc_socket_t *sock = NULL;
+	isc_result_t result = ISC_R_SUCCESS;
+	isc_boolean_t anyport;
+
+	INSIST(sockp != NULL && *sockp == NULL);
+
+	localaddr_bound = *localaddr;
+	anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0);
+
+	if (anyport) {
+		unsigned int nports;
+		in_port_t *ports;
+
+		/*
+		 * If no port is specified, we first try to pick up a random
+		 * port by ourselves.  Select the port array by the family of
+		 * the address we were asked to bind.
+		 */
+		if (isc_sockaddr_pf(localaddr) == AF_INET) {
+			nports = disp->mgr->nv4ports;
+			ports = disp->mgr->v4ports;
+		} else {
+			nports = disp->mgr->nv6ports;
+			ports = disp->mgr->v6ports;
+		}
+		if (nports == 0)
+			return (ISC_R_ADDRNOTAVAIL);
+
+		for (i = 0; i < 1024; i++) {
+			in_port_t prt;
+
+			prt = ports[dispatch_arc4uniformrandom(
+					DISP_ARC4CTX(disp),
+					nports)];
+			isc_sockaddr_setport(&localaddr_bound, prt);
+			result = open_socket(sockmgr, &localaddr_bound, &sock);
+			/* Anything but "address in use" settles the matter. */
+			if (result != ISC_R_ADDRINUSE) {
+				disp->localport = prt;
+				*sockp = sock;
+				return (result);
+			}
+		}
+
+		/*
+		 * If this fails 1024 times, we then ask the kernel for
+		 * choosing one.
+		 */
+	}
+
+	memset(held, 0, sizeof(held));
+	i = 0;
+
+	for (j = 0; j < maxtry; j++) {
+		result = open_socket(sockmgr, localaddr, &sock);
+		if (result != ISC_R_SUCCESS)
+			goto end;
+		else if (!anyport)
+			break;
+		else if (portavailable(mgr, sock, NULL))
+			break;
+		/* Park the rejected socket, evicting the oldest if needed. */
+		if (held[i] != NULL)
+			isc_socket_detach(&held[i]);
+		held[i++] = sock;
+		sock = NULL;
+		if (i == DNS_DISPATCH_HELD)
+			i = 0;
+	}
+	if (j == maxtry) {
+		mgr_log(mgr, ISC_LOG_ERROR,
+			"avoid-v%s-udp-ports: unable to allocate "
+			"an available port",
+			isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6");
+		result = ISC_R_FAILURE;
+		goto end;
+	}
+	*sockp = sock;
+
+end:
+	for (i = 0; i < DNS_DISPATCH_HELD; i++) {
+		if (held[i] != NULL)
+			isc_socket_detach(&held[i]);
+	}
+
+	return (result);
+}
+
static isc_result_t
dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr,
isc_taskmgr_t *taskmgr,
isc_result_t result;
dns_dispatch_t *disp;
isc_socket_t *sock = NULL;
- isc_socket_t *held[DNS_DISPATCH_HELD];
- unsigned int i = 0, j = 0, k = 0;
- isc_sockaddr_t localaddr_bound;
- in_port_t localport = 0;
+ int i = 0;
/*
* dispatch_allocate() checks mgr for us.
if (result != ISC_R_SUCCESS)
return (result);
- /*
- * Try to allocate a socket that is not on the blacklist.
- * Hold up to DNS_DISPATCH_HELD sockets to prevent the OS
- * from returning the same port to us too quickly.
- */
- memset(held, 0, sizeof(held));
- localaddr_bound = *localaddr;
- getsocket:
- if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) != 0) {
- in_port_t prt;
-
- /* XXX: should the range be configurable? */
- prt = 1024 + dispatch_arc4uniformrandom(mgr, 65535 - 1023);
- isc_sockaddr_setport(&localaddr_bound, prt);
- if (blacklisted(mgr, NULL, &localaddr_bound)) {
- if (++k == 1024)
- attributes &= ~DNS_DISPATCHATTR_RANDOMPORT;
- goto getsocket;
- }
- result = create_socket(sockmgr, &localaddr_bound, &sock);
- if (result == ISC_R_ADDRINUSE) {
- if (++k == 1024)
- attributes &= ~DNS_DISPATCHATTR_RANDOMPORT;
- goto getsocket;
- }
- localport = prt;
- } else
- result = create_socket(sockmgr, localaddr, &sock);
- if (result != ISC_R_SUCCESS)
- goto deallocate_dispatch;
- if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) == 0 &&
- isc_sockaddr_getport(localaddr) == 0 &&
- blacklisted(mgr, sock, NULL))
- {
- if (held[i] != NULL)
- isc_socket_detach(&held[i]);
- held[i++] = sock;
- sock = NULL;
- if (i == DNS_DISPATCH_HELD)
- i = 0;
- if (j++ == 0xffffU) {
- mgr_log(mgr, ISC_LOG_ERROR, "avoid-v%s-udp-ports: "
- "unable to allocate a non-blacklisted port",
- isc_sockaddr_pf(localaddr) == AF_INET ?
- "4" : "6");
- result = ISC_R_FAILURE;
+ if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) {
+ result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock,
+ 0xffffU);
+ if (result != ISC_R_SUCCESS)
goto deallocate_dispatch;
- }
- goto getsocket;
}
-
disp->socktype = isc_sockettype_udp;
disp->socket = sock;
disp->local = *localaddr;
- disp->localport = localport;
- disp->task = NULL;
- result = isc_task_create(taskmgr, 0, &disp->task);
- if (result != ISC_R_SUCCESS)
- goto kill_socket;
+ if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
+ disp->ntasks = MAX_INTERNAL_TASKS;
+ else
+ disp->ntasks = 1;
+ for (i = 0; i < disp->ntasks; i++) {
+ disp->task[i] = NULL;
+ result = isc_task_create(taskmgr, 0, &disp->task[i]);
+ if (result != ISC_R_SUCCESS) {
+ while (--i >= 0)
+ isc_task_destroy(&disp->task[i]);
+ goto kill_socket;
+ }
+ isc_task_setname(disp->task[i], "udpdispatch", disp);
+ }
disp->ctlevent = isc_event_allocate(mgr->mctx, disp,
DNS_EVENT_DISPATCHCONTROL,
goto kill_task;
}
- isc_task_setname(disp->task, "udpdispatch", disp);
-
attributes &= ~DNS_DISPATCHATTR_TCP;
attributes |= DNS_DISPATCHATTR_UDP;
disp->attributes = attributes;
ISC_LIST_APPEND(mgr->list, disp, link);
mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp);
- dispatch_log(disp, LVL(90), "created task %p", disp->task);
- dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
+ dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */
+ if (disp->socket != NULL)
+ dispatch_log(disp, LVL(90), "created socket %p", disp->socket);
*dispp = disp;
-
- goto cleanheld;
+ return (result);
/*
* Error returns.
*/
kill_task:
- isc_task_detach(&disp->task);
+ for (i = 0; i < disp->ntasks; i++)
+ isc_task_detach(&disp->task[i]);
kill_socket:
- isc_socket_detach(&disp->socket);
+ if (disp->socket != NULL)
+ isc_socket_detach(&disp->socket);
deallocate_dispatch:
dispatch_free(&disp);
- cleanheld:
- for (i = 0; i < DNS_DISPATCH_HELD; i++)
- if (held[i] != NULL)
- isc_socket_detach(&held[i]);
+
return (result);
}
void
dns_dispatch_detach(dns_dispatch_t **dispp) {
dns_dispatch_t *disp;
+ dispsocket_t *dispsock;
isc_boolean_t killit;
REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp));
killit = ISC_FALSE;
if (disp->refcount == 0) {
if (disp->recv_pending > 0)
- isc_socket_cancel(disp->socket, disp->task,
+ isc_socket_cancel(disp->socket, disp->task[0],
+ ISC_SOCKCANCEL_RECV);
+ for (dispsock = ISC_LIST_HEAD(disp->activesockets);
+ dispsock != NULL;
+ dispsock = ISC_LIST_NEXT(dispsock, link)) {
+ isc_socket_cancel(dispsock->socket, dispsock->task,
ISC_SOCKCANCEL_RECV);
+ }
disp->shutting_down = 1;
}
killit = destroy_disp_ok(disp);
UNLOCK(&disp->lock);
if (killit)
- isc_task_send(disp->task, &disp->ctlevent);
+ isc_task_send(disp->task[0], &disp->ctlevent);
}
isc_result_t
-dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
- isc_task_t *task, isc_taskaction_t action, void *arg,
- dns_messageid_t *idp, dns_dispentry_t **resp)
+dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+ isc_task_t *task, isc_taskaction_t action, void *arg,
+ dns_messageid_t *idp, dns_dispentry_t **resp,
+ isc_socketmgr_t *sockmgr)
{
dns_dispentry_t *res;
unsigned int bucket;
+ unsigned int abucket;
+ in_port_t localport = 0;
dns_messageid_t id;
int i;
isc_boolean_t ok;
dns_qid_t *qid;
+ dispsocket_t *dispsocket = NULL;
+ isc_result_t result;
REQUIRE(VALID_DISPATCH(disp));
REQUIRE(task != NULL);
REQUIRE(dest != NULL);
REQUIRE(resp != NULL && *resp == NULL);
REQUIRE(idp != NULL);
+ if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
+ REQUIRE(sockmgr != NULL);
LOCK(&disp->lock);
return (ISC_R_QUOTA);
}
+ if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 &&
+ disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) {
+ dispsocket_t *oldestsocket;
+ dns_dispentry_t *oldestresp;
+ dns_dispatchevent_t *rev;
+
+ /*
+ * Kill the oldest outstanding query if the number of sockets
+ * exceeds the quota, to keep room for new queries.
+ */
+ oldestsocket = ISC_LIST_HEAD(disp->activesockets);
+ oldestresp = oldestsocket->resp;
+ if (oldestresp != NULL && !oldestresp->item_out) {
+ rev = allocate_event(oldestresp->disp);
+ if (rev != NULL) {
+ rev->buffer.base = NULL;
+ rev->result = ISC_R_CANCELED;
+ rev->id = oldestresp->id;
+ ISC_EVENT_INIT(rev, sizeof(*rev), 0,
+ NULL, DNS_EVENT_DISPATCH,
+ oldestresp->action,
+ oldestresp->arg, oldestresp,
+ NULL, NULL);
+ oldestresp->item_out = ISC_TRUE;
+ isc_task_send(oldestresp->task,
+ ISC_EVENT_PTR(&rev));
+ }
+ }
+
+ /*
+ * Move this entry to the tail so that it won't (easily) be
+ * examined before actually being canceled.
+ */
+ ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link);
+ ISC_LIST_APPEND(disp->activesockets, oldestsocket, link);
+ }
+
+ qid = DNS_QID(disp);
+ LOCK(&qid->lock);
+
+ if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+ /*
+ * Get a separate UDP socket with a random port number.
+ */
+ result = get_dispsocket(disp, dest, sockmgr, qid, &dispsocket,
+ &abucket, &localport);
+ if (result != ISC_R_SUCCESS) {
+ UNLOCK(&qid->lock);
+ UNLOCK(&disp->lock);
+ return (result);
+ }
+ } else {
+ abucket = 0; /* meaningless, but set explicitly */
+ localport = disp->localport;
+ }
+
/*
* Try somewhat hard to find an unique ID.
*/
- id = (dns_messageid_t)dispatch_arc4random(disp->mgr);
- qid = DNS_QID(disp);
- LOCK(&qid->lock);
- bucket = dns_hash(qid, dest, id, disp->localport);
+ id = (dns_messageid_t)dispatch_arc4random(DISP_ARC4CTX(disp));
+ bucket = dns_hash(qid, dest, id, localport);
ok = ISC_FALSE;
for (i = 0; i < 64; i++) {
- if (bucket_search(qid, dest, id, disp->localport, bucket) ==
- NULL) {
+ if (bucket_search(qid, qid->qid_table, dest, id, localport,
+ bucket, ISC_FALSE) == NULL) {
ok = ISC_TRUE;
break;
}
id += qid->qid_increment;
id &= 0x0000ffff;
- bucket = dns_hash(qid, dest, id, disp->localport);
+ bucket = dns_hash(qid, dest, id, localport);
}
if (!ok) {
if (res == NULL) {
UNLOCK(&qid->lock);
UNLOCK(&disp->lock);
+ if (dispsocket != NULL)
+ destroy_dispsocket(disp, &dispsocket);
return (ISC_R_NOMEMORY);
}
isc_task_attach(task, &res->task);
res->disp = disp;
res->id = id;
- res->port = disp->localport;
+ res->port = localport;
res->bucket = bucket;
+ res->abucket = abucket;
res->host = *dest;
res->action = action;
res->arg = arg;
+ res->dispsocket = dispsocket;
+ if (dispsocket != NULL)
+ dispsocket->resp = res;
res->item_out = ISC_FALSE;
ISC_LIST_INIT(res->items);
ISC_LINK_INIT(res, link);
+ ISC_LINK_INIT(res, alink);
res->magic = RESPONSE_MAGIC;
ISC_LIST_APPEND(qid->qid_table[bucket], res, link);
+ if (dispsocket != NULL)
+ ISC_LIST_APPEND(qid->addr_table[abucket], res, alink);
UNLOCK(&qid->lock);
request_log(disp, res, LVL(90),
"attached to task %p", res->task);
if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) ||
- ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0))
- startrecv(disp);
+ ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) {
+ result = startrecv(disp, dispsocket);
+ if (result != ISC_R_SUCCESS) {
+ LOCK(&qid->lock);
+ ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
+ if (ISC_LINK_LINKED(res, alink)) {
+ ISC_LIST_UNLINK(qid->addr_table[abucket], res,
+ alink);
+ }
+ UNLOCK(&qid->lock);
+
+ if (dispsocket != NULL)
+ destroy_dispsocket(disp, &dispsocket);
+
+ disp->refcount--;
+ disp->requests--;
+
+ UNLOCK(&disp->lock);
+ isc_task_detach(&res->task);
+ isc_mempool_put(disp->mgr->rpool, res);
+ return (result);
+ }
+ }
+
+ if (dispsocket != NULL)
+ ISC_LIST_APPEND(disp->activesockets, dispsocket, link);
UNLOCK(&disp->lock);
*idp = id;
*resp = res;
+ if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0)
+ INSIST(res->dispsocket != NULL);
+
return (ISC_R_SUCCESS);
}
+/*
+ * Compatibility wrapper around dns_dispatch_addresponse2() for callers
+ * that have no socket manager to pass.  It may only be used on a
+ * dispatch that does not carry DNS_DISPATCHATTR_EXCLUSIVE.
+ */
+isc_result_t
+dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+			 isc_task_t *task, isc_taskaction_t action, void *arg,
+			 dns_messageid_t *idp, dns_dispentry_t **resp)
+{
+	isc_result_t result;
+
+	REQUIRE(VALID_DISPATCH(disp));
+	REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
+
+	/* A NULL socket manager is acceptable for a non-exclusive dispatch. */
+	result = dns_dispatch_addresponse2(disp, dest, task, action, arg,
+					   idp, resp, NULL);
+	return (result);
+}
+
void
dns_dispatch_starttcp(dns_dispatch_t *disp) {
REQUIRE(VALID_DISPATCH(disp));
- dispatch_log(disp, LVL(90), "starttcp %p", disp->task);
+ dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]);
LOCK(&disp->lock);
disp->attributes |= DNS_DISPATCHATTR_CONNECTED;
- startrecv(disp);
+ (void)startrecv(disp, NULL);
UNLOCK(&disp->lock);
}
dns_dispatchmgr_t *mgr;
dns_dispatch_t *disp;
dns_dispentry_t *res;
+ dispsocket_t *dispsock;
dns_dispatchevent_t *ev;
unsigned int bucket;
isc_boolean_t killit;
killit = ISC_FALSE;
if (disp->refcount == 0) {
if (disp->recv_pending > 0)
- isc_socket_cancel(disp->socket, disp->task,
+ isc_socket_cancel(disp->socket, disp->task[0],
ISC_SOCKCANCEL_RECV);
+ for (dispsock = ISC_LIST_HEAD(disp->activesockets);
+ dispsock != NULL;
+ dispsock = ISC_LIST_NEXT(dispsock, link)) {
+ isc_socket_cancel(dispsock->socket, dispsock->task,
+ ISC_SOCKCANCEL_RECV);
+ }
disp->shutting_down = 1;
}
LOCK(&qid->lock);
ISC_LIST_UNLINK(qid->qid_table[bucket], res, link);
+ if (ISC_LINK_LINKED(res, alink))
+ ISC_LIST_UNLINK(qid->addr_table[res->abucket], res, alink);
UNLOCK(&qid->lock);
if (ev == NULL && res->item_out) {
request_log(disp, res, LVL(90), "detaching from task %p", res->task);
isc_task_detach(&res->task);
+ if (res->dispsocket != NULL) {
+ isc_socket_cancel(res->dispsocket->socket,
+ res->dispsocket->task, ISC_SOCKCANCEL_RECV);
+ res->dispsocket->resp = NULL;
+ }
+
/*
* Free any buffered requests as well
*/
if (disp->shutting_down == 1)
do_cancel(disp);
else
- startrecv(disp);
+ (void)startrecv(disp, NULL);
killit = destroy_disp_ok(disp);
UNLOCK(&disp->lock);
if (killit)
- isc_task_send(disp->task, &disp->ctlevent);
+ isc_task_send(disp->task[0], &disp->ctlevent);
}
static void
qid = DNS_QID(disp);
/*
- * Search for the first response handler without packets outstanding.
+ * Search for the first response handler without packets outstanding
+ * unless a specific handler is given.  Entries with item_out set are
+ * treated as having packets outstanding, so the loop steps past them.
*/
LOCK(&qid->lock);
for (resp = linear_first(qid);
- resp != NULL && resp->item_out != ISC_FALSE;
+ resp != NULL && resp->item_out;
/* Empty. */)
resp = linear_next(qid, resp);
+
/*
* No one to send the cancel event to, so nothing to do.
*/
return (disp->socket);
}
+/*
+ * Return the socket attached to a single response entry, or NULL when
+ * the entry has no dedicated socket.
+ */
+isc_socket_t *
+dns_dispatch_getentrysocket(dns_dispentry_t *resp) {
+	REQUIRE(VALID_RESPONSE(resp));
+
+	/* No per-entry socket was attached to this response. */
+	if (resp->dispsocket == NULL)
+		return (NULL);
+
+	return (resp->dispsocket->socket);
+}
+
isc_result_t
dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) {
return;
}
+/*
+ * Return the DNS_DISPATCHATTR_xxx bit mask currently stored on 'disp'.
+ */
+unsigned int
+dns_dispatch_getattributes(dns_dispatch_t *disp) {
+	unsigned int attrs;
+
+	REQUIRE(VALID_DISPATCH(disp));
+
+	/*
+	 * The dispatch lock is deliberately not taken: callers are
+	 * expected to look only at flags that do not change.
+	 */
+	attrs = disp->attributes;
+	return (attrs);
+}
+
void
dns_dispatch_changeattributes(dns_dispatch_t *disp,
unsigned int attributes, unsigned int mask)
{
REQUIRE(VALID_DISPATCH(disp));
+ /* Exclusive attribute can only be set on creation */
+ REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0);
+ /* Also, a dispatch with randomport specified cannot start listening */
+ REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 ||
+ (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0);
/* XXXMLG
* Should check for valid attributes here!
if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 &&
(attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) {
disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN;
- startrecv(disp);
+ (void)startrecv(disp, NULL);
} else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN)
== 0 &&
(attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) {
disp->attributes |= DNS_DISPATCHATTR_NOLISTEN;
if (disp->recv_pending != 0)
- isc_socket_cancel(disp->socket, disp->task,
+ isc_socket_cancel(disp->socket, disp->task[0],
ISC_SOCKCANCEL_RECV);
}
}
INSIST(sevent->n <= disp->mgr->buffersize);
newsevent = (isc_socketevent_t *)
isc_event_allocate(disp->mgr->mctx, NULL,
- DNS_EVENT_IMPORTRECVDONE, udp_recv,
+ DNS_EVENT_IMPORTRECVDONE, udp_shrecv,
disp, sizeof(isc_socketevent_t));
if (newsevent == NULL)
return;
newsevent->pktinfo = sevent->pktinfo;
newsevent->attributes = sevent->attributes;
- isc_task_send(disp->task, ISC_EVENT_PTR(&newsevent));
+ isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent));
}
#if 0
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: dispatch.h,v 1.56.128.3 2008/05/27 22:36:11 each Exp $ */
+/* $Id: dispatch.h,v 1.56.128.4 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef DNS_DISPATCH_H
#define DNS_DISPATCH_H 1
* The dispatcher is a TCP or UDP socket.
*
* _IPV4, _IPV6
- * The dispatcher uses an ipv4 or ipv6 socket.
+ * The dispatcher uses an IPv4 or IPv6 socket.
*
* _NOLISTEN
* The dispatcher should not listen on the socket.
* accept replies from them.
*
* _RANDOMPORT
- * Allocate UDP port randomly.
+ * Previously used to indicate that the port of a UDP dispatch must be
+ * chosen randomly. This behavior now always applies and the attribute
+ * is obsolete.
+ *
+ * _EXCLUSIVE
+ * A separate socket will be used on-demand for each transaction.
*/
#define DNS_DISPATCHATTR_PRIVATE 0x00000001U
#define DNS_DISPATCHATTR_TCP 0x00000002U
#define DNS_DISPATCHATTR_NOLISTEN 0x00000020U
#define DNS_DISPATCHATTR_MAKEQUERY 0x00000040U
#define DNS_DISPATCHATTR_CONNECTED 0x00000080U
-#define DNS_DISPATCHATTR_RANDOMPORT 0x00000100U
+/*#define DNS_DISPATCHATTR_RANDOMPORT 0x00000100U*/
+#define DNS_DISPATCHATTR_EXCLUSIVE 0x00000200U
/*@}*/
isc_result_t
dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr,
dns_portlist_t *portlist);
/*%<
- * Sets a list of UDP ports that won't be used when creating a udp
- * dispatch with a wildcard port.
+ * This function is deprecated. Use dns_dispatchmgr_setavailports() instead.
*
* Requires:
*\li mgr is a valid dispatchmgr
- *\li portlist to be NULL or a valid port list.
*/
dns_portlist_t *
dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr);
/*%<
- * Return the current port list.
+ * This function is deprecated and always returns NULL.
*
* Requires:
*\li mgr is a valid dispatchmgr
*/
+isc_result_t
+dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset,
+ isc_portset_t *v6portset);
+/*%<
+ * Sets a list of UDP ports that can be used for outgoing UDP messages.
+ *
+ * Requires:
+ *\li mgr is a valid dispatchmgr
+ *\li v4portset is NULL or a valid port set
+ *\li v6portset is NULL or a valid port set
+ */
+
void
dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, dns_stats_t *stats);
/*%<
*\li 'disp' is valid.
*/
+isc_result_t
+dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest,
+ isc_task_t *task, isc_taskaction_t action, void *arg,
+ isc_uint16_t *idp, dns_dispentry_t **resp,
+ isc_socketmgr_t *sockmgr);
+
isc_result_t
dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest,
isc_task_t *task, isc_taskaction_t action, void *arg,
*
*\li "resp" be non-NULL and *resp be NULL
*
+ *\li "sockmgr" be NULL or a valid socket manager. If 'disp' has
+ * the DNS_DISPATCHATTR_EXCLUSIVE attribute, this must not be NULL,
+ * which also means dns_dispatch_addresponse() cannot be used.
+ *
* Ensures:
*
*\li <id, dest> is a unique tuple. That means incoming messages
* argument to dns_dispatch_addresponse() when allocating '*resp'.
*/
+isc_socket_t *
+dns_dispatch_getentrysocket(dns_dispentry_t *resp);
isc_socket_t *
dns_dispatch_getsocket(dns_dispatch_t *disp);
*\li disp is valid.
*/
+unsigned int
+dns_dispatch_getattributes(dns_dispatch_t *disp);
+/*%<
+ * Return the attributes (DNS_DISPATCHATTR_xxx) of this dispatch. Only the
+ * non-changeable attributes are expected to be referenced by the caller.
+ *
+ * Requires:
+ *\li disp is valid.
+ */
+
void
dns_dispatch_changeattributes(dns_dispatch_t *disp,
unsigned int attributes, unsigned int mask);
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: resolver.h,v 1.56.128.2 2008/04/03 06:08:27 tbox Exp $ */
+/* $Id: resolver.h,v 1.56.128.3 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef DNS_RESOLVER_H
#define DNS_RESOLVER_H 1
#define DNS_RESOLVER_CHECKNAMES 0x01
#define DNS_RESOLVER_CHECKNAMESFAIL 0x02
-#define DNS_RESOLVER_USEDISPATCHPOOL4 0x04
-#define DNS_RESOLVER_USEDISPATCHPOOL6 0x08
isc_result_t
dns_resolver_create(dns_view_t *view,
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: request.c,v 1.79 2007/06/19 23:47:16 tbox Exp $ */
+/* $Id: request.c,v 1.79.128.1 2008/06/24 00:09:11 jinmei Exp $ */
/*! \file */
static void req_senddone(isc_task_t *task, isc_event_t *event);
static void req_response(isc_task_t *task, isc_event_t *event);
static void req_timeout(isc_task_t *task, isc_event_t *event);
+static isc_socket_t * req_getsocket(dns_request_t *request);
static void req_connected(isc_task_t *task, isc_event_t *event);
static void req_sendevent(dns_request_t *request, isc_result_t result);
static void req_cancel(dns_request_t *request);
isc_socket_t *socket;
isc_result_t result;
int i;
+ unsigned int dispattr;
req_log(ISC_LOG_DEBUG(3), "dns_requestmgr_create");
REQUIRE(socketmgr != NULL);
REQUIRE(taskmgr != NULL);
REQUIRE(dispatchmgr != NULL);
+ UNUSED(socket);
if (dispatchv4 != NULL) {
- socket = dns_dispatch_getsocket(dispatchv4);
- REQUIRE(isc_socket_gettype(socket) == isc_sockettype_udp);
+ dispattr = dns_dispatch_getattributes(dispatchv4);
+ REQUIRE((dispattr & DNS_DISPATCHATTR_UDP) != 0);
}
if (dispatchv6 != NULL) {
- socket = dns_dispatch_getsocket(dispatchv6);
- REQUIRE(isc_socket_gettype(socket) == isc_sockettype_udp);
+ dispattr = dns_dispatch_getattributes(dispatchv6);
+ REQUIRE((dispattr & DNS_DISPATCHATTR_UDP) != 0);
}
requestmgr = isc_mem_get(mctx, sizeof(*requestmgr));
isc_region_t r;
isc_socket_t *socket;
isc_result_t result;
+ unsigned int dispattr;
req_log(ISC_LOG_DEBUG(3), "req_send: request %p", request);
REQUIRE(VALID_REQUEST(request));
- socket = dns_dispatch_getsocket(request->dispatch);
+ dispattr = dns_dispatch_getattributes(request->dispatch);
+ socket = req_getsocket(request);
isc_buffer_usedregion(request->query, &r);
+ /*
+ * We could connect the socket when we are using an exclusive dispatch
+ * as we do in resolver.c, but we prefer implementation simplicity
+ * at this moment.
+ */
result = isc_socket_sendto(socket, &r, task, req_senddone,
request, address, NULL);
if (result == ISC_R_SUCCESS)
if (result != ISC_R_SUCCESS)
goto cleanup;
- socket = dns_dispatch_getsocket(request->dispatch);
- INSIST(socket != NULL);
- result = dns_dispatch_addresponse(request->dispatch, destaddr, task,
- req_response, request, &id,
- &request->dispentry);
+ result = dns_dispatch_addresponse2(request->dispatch, destaddr, task,
+ req_response, request, &id,
+ &request->dispentry,
+ requestmgr->socketmgr);
if (result != ISC_R_SUCCESS)
goto cleanup;
+ socket = req_getsocket(request);
+ INSIST(socket != NULL);
+
result = isc_buffer_allocate(mctx, &request->query,
r.length + (tcp ? 2 : 0));
if (result != ISC_R_SUCCESS)
if (result != ISC_R_SUCCESS)
goto cleanup;
- socket = dns_dispatch_getsocket(request->dispatch);
- INSIST(socket != NULL);
- result = dns_dispatch_addresponse(request->dispatch, destaddr, task,
- req_response, request, &id,
- &request->dispentry);
+ result = dns_dispatch_addresponse2(request->dispatch, destaddr, task,
+ req_response, request, &id,
+ &request->dispentry,
+ requestmgr->socketmgr);
if (result != ISC_R_SUCCESS)
goto cleanup;
+ socket = req_getsocket(request);
+ INSIST(socket != NULL);
message->id = id;
if (setkey) {
*** Private: request.
***/
+/*
+ * Pick the socket a request should use: the per-entry socket when the
+ * request's dispatch is exclusive, otherwise the dispatch's own socket.
+ */
+static isc_socket_t *
+req_getsocket(dns_request_t *request) {
+	unsigned int attrs = dns_dispatch_getattributes(request->dispatch);
+
+	if ((attrs & DNS_DISPATCHATTR_EXCLUSIVE) == 0)
+		return (dns_dispatch_getsocket(request->dispatch));
+
+	/* An exclusive dispatch needs a response entry to find its socket. */
+	INSIST(request->dispentry != NULL);
+	return (dns_dispatch_getentrysocket(request->dispentry));
+}
+
static void
req_connected(isc_task_t *task, isc_event_t *event) {
isc_socketevent_t *sevent = (isc_socketevent_t *)event;
static void
req_cancel(dns_request_t *request) {
isc_socket_t *socket;
+ unsigned int dispattr;
REQUIRE(VALID_REQUEST(request));
if (request->timer != NULL)
isc_timer_detach(&request->timer);
+ dispattr = dns_dispatch_getattributes(request->dispatch);
+ socket = NULL;
+ if (DNS_REQUEST_CONNECTING(request) || DNS_REQUEST_SENDING(request)) {
+ if ((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0) {
+ if (request->dispentry != NULL) {
+ socket = dns_dispatch_getentrysocket(
+ request->dispentry);
+ }
+ } else
+ socket = dns_dispatch_getsocket(request->dispatch);
+ if (DNS_REQUEST_CONNECTING(request) && socket != NULL)
+ isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_CONNECT);
+ if (DNS_REQUEST_SENDING(request) && socket != NULL)
+ isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND);
+ }
if (request->dispentry != NULL)
dns_dispatch_removeresponse(&request->dispentry, NULL);
- if (DNS_REQUEST_CONNECTING(request)) {
- socket = dns_dispatch_getsocket(request->dispatch);
- isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_CONNECT);
- }
- if (DNS_REQUEST_SENDING(request)) {
- socket = dns_dispatch_getsocket(request->dispatch);
- isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND);
- }
dns_dispatch_detach(&request->dispatch);
}
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: resolver.c,v 1.355.12.18 2008/06/17 22:36:03 jinmei Exp $ */
+/* $Id: resolver.c,v 1.355.12.19 2008/06/24 00:09:11 jinmei Exp $ */
/*! \file */
isc_mem_t * mctx;
dns_dispatchmgr_t * dispatchmgr;
dns_dispatch_t * dispatch;
+ isc_boolean_t exclusivesocket;
dns_adbaddrinfo_t * addrinfo;
isc_socket_t * tcpsocket;
isc_time_t start;
ISC_LINK(struct alternate) link;
} alternate_t;
-#ifdef ISC_RWLOCK_USEATOMIC
-#define DNS_RESOLVER_USERWLOCK 1
-#else
-#define DNS_RESOLVER_USERWLOCK 0
-#endif
-
-#if DNS_RESOLVER_USERWLOCK
-#define RES_INITLOCK(l) isc_rwlock_init((l), 0, 0)
-#define RES_DESTROYLOCK(l) isc_rwlock_destroy(l)
-#define RES_LOCK(l, t) RWLOCK((l), (t))
-#define RES_UNLOCK(l, t) RWUNLOCK((l), (t))
-#else
-#define RES_INITLOCK(l) isc_mutex_init(l)
-#define RES_DESTROYLOCK(l) DESTROYLOCK(l)
-#define RES_LOCK(l, t) LOCK(l)
-#define RES_UNLOCK(l, t) UNLOCK(l)
-#endif
-
struct dns_resolver {
/* Unlocked. */
unsigned int magic;
isc_mutex_t lock;
isc_mutex_t nlock;
isc_mutex_t primelock;
-#if DNS_RESOLVER_USERWLOCK
- isc_rwlock_t poollock;
-#else
- isc_mutex_t poollock;
-#endif
dns_rdataclass_t rdclass;
isc_socketmgr_t * socketmgr;
isc_timermgr_t * timermgr;
unsigned int options;
dns_dispatchmgr_t * dispatchmgr;
dns_dispatch_t * dispatchv4;
+ isc_boolean_t exclusivev4;
dns_dispatch_t * dispatchv6;
+ isc_boolean_t exclusivev6;
unsigned int ndisps;
unsigned int nbuckets;
fctxbucket_t * buckets;
unsigned int spillatmin;
isc_timer_t * spillattimer;
isc_boolean_t zero_no_soa_ttl;
- isc_timer_t * disppooltimer;
/* Locked by lock. */
unsigned int references;
dns_fetch_t * primefetch;
/* Locked by nlock. */
unsigned int nfctx;
- /* Locked by poollock. */
- dns_dispatch_t ** dispatchv4pool;
- dns_dispatch_t ** dispatchv6pool;
};
#define RES_MAGIC ISC_MAGIC('R', 'e', 's', '!')
unsigned int factor;
dns_adbfind_t *find;
dns_adbaddrinfo_t *addrinfo;
+ isc_socket_t *socket;
query = *queryp;
fctx = query->fctx;
0, factor);
}
- if (query->dispentry != NULL)
- dns_dispatch_removeresponse(&query->dispentry, deventp);
-
- ISC_LIST_UNLINK(fctx->queries, query, link);
-
- if (query->tsig != NULL)
- isc_buffer_free(&query->tsig);
-
- if (query->tsigkey != NULL)
- dns_tsigkey_detach(&query->tsigkey);
-
/*
* Check for any outstanding socket events. If they exist, cancel
* them and let the event handlers finish the cleanup. The resolver
* only needs to worry about managing the connect and send events;
* the dispatcher manages the recv events.
*/
- if (RESQUERY_CONNECTING(query))
+ if (RESQUERY_CONNECTING(query)) {
/*
* Cancel the connect.
*/
- isc_socket_cancel(query->tcpsocket, NULL,
- ISC_SOCKCANCEL_CONNECT);
- else if (RESQUERY_SENDING(query))
+ if (query->tcpsocket != NULL) {
+ isc_socket_cancel(query->tcpsocket, NULL,
+ ISC_SOCKCANCEL_CONNECT);
+ } else if (query->dispentry != NULL) {
+ INSIST(query->exclusivesocket);
+ socket = dns_dispatch_getentrysocket(query->dispentry);
+ if (socket != NULL)
+ isc_socket_cancel(socket, NULL,
+ ISC_SOCKCANCEL_CONNECT);
+ }
+ } else if (RESQUERY_SENDING(query)) {
/*
* Cancel the pending send.
*/
- isc_socket_cancel(dns_dispatch_getsocket(query->dispatch),
- NULL, ISC_SOCKCANCEL_SEND);
+ if (query->exclusivesocket && query->dispentry != NULL)
+ socket = dns_dispatch_getentrysocket(query->dispentry);
+ else
+ socket = dns_dispatch_getsocket(query->dispatch);
+ if (socket != NULL)
+ isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND);
+ }
+
+ if (query->dispentry != NULL)
+ dns_dispatch_removeresponse(&query->dispentry, deventp);
+
+ ISC_LIST_UNLINK(fctx->queries, query, link);
+
+ if (query->tsig != NULL)
+ isc_buffer_free(&query->tsig);
+
+ if (query->tsigkey != NULL)
+ dns_tsigkey_detach(&query->tsigkey);
if (query->dispatch != NULL)
dns_dispatch_detach(&query->dispatch);
}
static void
-resquery_senddone(isc_task_t *task, isc_event_t *event) {
+process_sendevent(resquery_t *query, isc_event_t *event) {
isc_socketevent_t *sevent = (isc_socketevent_t *)event;
- resquery_t *query = event->ev_arg;
isc_boolean_t retry = ISC_FALSE;
isc_result_t result;
fetchctx_t *fctx;
- REQUIRE(event->ev_type == ISC_SOCKEVENT_SENDDONE);
-
- QTRACE("senddone");
-
- /*
- * XXXRTH
- *
- * Currently we don't wait for the senddone event before retrying
- * a query. This means that if we get really behind, we may end
- * up doing extra work!
- */
-
- UNUSED(task);
-
- INSIST(RESQUERY_SENDING(query));
-
- query->sends--;
fctx = query->fctx;
if (RESQUERY_CANCELED(query)) {
- if (query->sends == 0) {
+ if (query->sends == 0 && query->connects == 0) {
/*
* This query was canceled while the
- * isc_socket_sendto() was in progress.
+ * isc_socket_sendto/connect() was in progress.
*/
if (query->tcpsocket != NULL)
isc_socket_detach(&query->tcpsocket);
resquery_destroy(&query);
}
- } else
+ } else {
switch (sevent->result) {
case ISC_R_SUCCESS:
break;
fctx_cancelquery(&query, NULL, NULL, ISC_FALSE);
break;
}
+ }
isc_event_free(&event);
}
}
+/*
+ * Socket event handler for ISC_SOCKEVENT_CONNECT on a query.  Drops one
+ * pending connect from the query's count and hands the event to
+ * process_sendevent() for the common result handling.
+ */
+static void
+resquery_udpconnected(isc_task_t *task, isc_event_t *event) {
+	resquery_t *query;
+
+	UNUSED(task);
+
+	REQUIRE(event->ev_type == ISC_SOCKEVENT_CONNECT);
+	query = event->ev_arg;
+
+	QTRACE("udpconnected");
+
+	INSIST(RESQUERY_CONNECTING(query));
+	query->connects--;
+
+	process_sendevent(query, event);
+}
+
+/*
+ * Socket event handler for ISC_SOCKEVENT_SENDDONE on a query.  Drops one
+ * pending send from the query's count and hands the event to
+ * process_sendevent() for the common result handling.
+ */
+static void
+resquery_senddone(isc_task_t *task, isc_event_t *event) {
+	resquery_t *query;
+
+	UNUSED(task);
+
+	REQUIRE(event->ev_type == ISC_SOCKEVENT_SENDDONE);
+	query = event->ev_arg;
+
+	QTRACE("senddone");
+
+	/*
+	 * XXXRTH
+	 *
+	 * A query can be retried without waiting for its senddone event,
+	 * so if we get really behind we may end up doing extra work.
+	 */
+
+	INSIST(RESQUERY_SENDING(query));
+	query->sends--;
+
+	process_sendevent(query, event);
+}
+
static inline isc_result_t
fctx_addopt(dns_message_t *message, unsigned int version,
isc_uint16_t udpsize, isc_boolean_t request_nsid)
*/
query->dispatchmgr = res->dispatchmgr;
query->dispatch = NULL;
+ query->exclusivesocket = ISC_FALSE;
query->tcpsocket = NULL;
if (res->view->peers != NULL) {
dns_peer_t *peer = NULL;
if (result != ISC_R_SUCCESS)
goto cleanup_query;
} else {
- isc_sockaddr_t localaddr;
- unsigned int attrs, attrmask;
- dns_dispatch_t *disp_base;
-
- attrs = 0;
- attrs |= DNS_DISPATCHATTR_UDP;
- attrs |= DNS_DISPATCHATTR_RANDOMPORT;
-
- attrmask = 0;
- attrmask |= DNS_DISPATCHATTR_UDP;
- attrmask |= DNS_DISPATCHATTR_TCP;
- attrmask |= DNS_DISPATCHATTR_IPV4;
- attrmask |= DNS_DISPATCHATTR_IPV6;
-
switch (isc_sockaddr_pf(&addrinfo->sockaddr)) {
- case AF_INET:
- disp_base = res->dispatchv4;
- attrs |= DNS_DISPATCHATTR_IPV4;
+ case PF_INET:
+ dns_dispatch_attach(res->dispatchv4,
+ &query->dispatch);
+ query->exclusivesocket = res->exclusivev4;
break;
- case AF_INET6:
- disp_base = res->dispatchv6;
- attrs |= DNS_DISPATCHATTR_IPV6;
+ case PF_INET6:
+ dns_dispatch_attach(res->dispatchv6,
+ &query->dispatch);
+ query->exclusivesocket = res->exclusivev6;
break;
default:
result = ISC_R_NOTIMPLEMENTED;
goto cleanup_query;
}
-
- result = dns_dispatch_getlocaladdress(disp_base,
- &localaddr);
- if (result != ISC_R_SUCCESS)
- goto cleanup_query;
- if (isc_sockaddr_getport(&localaddr) == 0) {
- result = dns_dispatch_getudp(res->dispatchmgr,
- res->socketmgr,
- res->taskmgr,
- &localaddr,
- 4096, 1000, 32768,
- 16411, 16433,
- attrs, attrmask,
- &query->dispatch);
- if (result != ISC_R_SUCCESS)
- goto cleanup_query;
- } else {
- dns_dispatch_attach(disp_base,
- &query->dispatch);
- }
}
/*
* We should always have a valid dispatcher here. If we
/*
* Get a query id from the dispatch.
*/
- result = dns_dispatch_addresponse(query->dispatch,
- &query->addrinfo->sockaddr,
- task,
- resquery_response,
- query,
- &query->id,
- &query->dispentry);
+ result = dns_dispatch_addresponse2(query->dispatch,
+ &query->addrinfo->sockaddr,
+ task,
+ resquery_response,
+ query,
+ &query->id,
+ &query->dispentry,
+ res->socketmgr);
if (result != ISC_R_SUCCESS)
goto cleanup_temps;
*/
dns_message_reset(fctx->qmessage, DNS_MESSAGE_INTENTRENDER);
- socket = dns_dispatch_getsocket(query->dispatch);
+ if (query->exclusivesocket)
+ socket = dns_dispatch_getentrysocket(query->dispentry);
+ else
+ socket = dns_dispatch_getsocket(query->dispatch);
/*
* Send the query!
*/
- if ((query->options & DNS_FETCHOPT_TCP) == 0)
+ if ((query->options & DNS_FETCHOPT_TCP) == 0) {
address = &query->addrinfo->sockaddr;
+ if (query->exclusivesocket) {
+ result = isc_socket_connect(socket, address, task,
+ resquery_udpconnected,
+ query);
+ if (result != ISC_R_SUCCESS)
+ goto cleanup_message;
+ query->connects++;
+ }
+ }
isc_buffer_usedregion(buffer, &r);
/*
static void
fctx_timeout(isc_task_t *task, isc_event_t *event) {
fetchctx_t *fctx = event->ev_arg;
+ isc_timerevent_t *tevent = (isc_timerevent_t *)event;
+ resquery_t *query;
REQUIRE(VALID_FCTX(fctx));
fctx->timeouts++;
/*
* We could cancel the running queries here, or we could let
- * them keep going. Right now we choose the latter...
+ * them keep going. Since we normally use separate sockets for
+ * different queries, we adopt the former approach to reduce
+ * the number of open sockets: cancel the oldest query unless
+ * this timer came due before that query had started (the
+ * timeout usually belongs to the oldest query, but not always,
+ * depending on the task schedule timing).
*/
+ query = ISC_LIST_HEAD(fctx->queries);
+ if (query != NULL &&
+ isc_time_compare(&tevent->due, &query->start) >= 0) {
+ fctx_cancelquery(&query, NULL, NULL, ISC_TRUE);
+ }
fctx->attributes &= ~FCTX_ATTR_ADDRWAIT;
/*
* Our timer has triggered. Reestablish the fctx lifetime
* There's no hope for this query.
*/
keep_trying = ISC_TRUE;
+
+ /*
+ * If this is a network error on an exclusive query
+ * socket, mark the server as bad so that we won't try
+ * it for this fetch again.
+ */
+ if (query->exclusivesocket &&
+ (devent->result == ISC_R_HOSTUNREACH ||
+ devent->result == ISC_R_NETUNREACH ||
+ devent->result == ISC_R_CONNREFUSED ||
+ devent->result == ISC_R_CANCELED)) {
+ broken_server = devent->result;
+ }
}
goto done;
}
INSIST(res->nfctx == 0);
- RES_DESTROYLOCK(&res->poollock);
DESTROYLOCK(&res->primelock);
DESTROYLOCK(&res->nlock);
DESTROYLOCK(&res->lock);
dns_dispatch_detach(&res->dispatchv4);
if (res->dispatchv6 != NULL)
dns_dispatch_detach(&res->dispatchv6);
- if (res->dispatchv4pool != NULL) {
- for (i = 0; i < res->ndisps; i++)
- dns_dispatch_detach(&res->dispatchv4pool[i]);
- isc_mem_put(res->mctx, res->dispatchv4pool,
- res->ndisps * sizeof(dns_dispatch_t *));
- }
- if (res->dispatchv6pool != NULL) {
- for (i = 0; i < res->ndisps; i++)
- dns_dispatch_detach(&res->dispatchv6pool[i]);
- isc_mem_put(res->mctx, res->dispatchv6pool,
- res->ndisps * sizeof(dns_dispatch_t *));
- }
while ((a = ISC_LIST_HEAD(res->alternates)) != NULL) {
ISC_LIST_UNLINK(res->alternates, a, link);
if (!a->isaddress)
dns_name_free(&a->_u._n.name, res->mctx);
isc_mem_put(res->mctx, a, sizeof(*a));
}
- if (res->disppooltimer != NULL)
- isc_timer_detach(&res->disppooltimer);
dns_resolver_reset_algorithms(res);
dns_resolver_resetmustbesecure(res);
#if USE_ALGLOCK
unsigned int i, buckets_created = 0;
isc_task_t *task = NULL;
char name[16];
+ unsigned dispattr;
/*
* Create a resolver.
res->zero_no_soa_ttl = ISC_FALSE;
res->ndisps = 0;
res->nextdisp = 0; /* meaningless at this point, but init it */
- res->dispatchv4pool = NULL;
- res->dispatchv6pool = NULL;
- res->disppooltimer = NULL;
res->nbuckets = ntasks;
res->activebuckets = ntasks;
res->buckets = isc_mem_get(view->mctx,
}
res->dispatchv4 = NULL;
- if (dispatchv4 != NULL)
- dns_dispatch_attach(dispatchv4, &res->dispatchv4);
+ if (dispatchv4 != NULL) {
+ dns_dispatch_attach(dispatchv4, &res->dispatchv4);
+ dispattr = dns_dispatch_getattributes(dispatchv4);
+ res->exclusivev4 =
+ ISC_TF((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0);
+ }
res->dispatchv6 = NULL;
- if (dispatchv6 != NULL)
+ if (dispatchv6 != NULL) {
dns_dispatch_attach(dispatchv6, &res->dispatchv6);
+ dispattr = dns_dispatch_getattributes(dispatchv6);
+ res->exclusivev6 =
+ ISC_TF((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0);
+ }
res->references = 1;
res->exiting = ISC_FALSE;
if (result != ISC_R_SUCCESS)
goto cleanup_nlock;
- result = RES_INITLOCK(&res->poollock);
- if (result != ISC_R_SUCCESS)
- goto cleanup_primelock;
-
task = NULL;
result = isc_task_create(taskmgr, 0, &task);
if (result != ISC_R_SUCCESS)
- goto cleanup_poollock;
+ goto cleanup_primelock;
result = isc_timer_create(timermgr, isc_timertype_inactive, NULL, NULL,
task, spillattimer_countdown, res,
&res->spillattimer);
isc_task_detach(&task);
if (result != ISC_R_SUCCESS)
- goto cleanup_poollock;
+ goto cleanup_primelock;
#if USE_ALGLOCK
result = isc_rwlock_init(&res->alglock, 0, 0);
isc_timer_detach(&res->spillattimer);
#endif
- cleanup_poollock:
- RES_DESTROYLOCK(&res->poollock);
-
cleanup_primelock:
DESTROYLOCK(&res->primelock);
fctx != NULL;
fctx = ISC_LIST_NEXT(fctx, link))
fctx_shutdown(fctx);
- if (res->dispatchv4 != NULL) {
+ if (res->dispatchv4 != NULL && !res->exclusivev4) {
sock = dns_dispatch_getsocket(res->dispatchv4);
isc_socket_cancel(sock, res->buckets[i].task,
ISC_SOCKCANCEL_ALL);
}
- if (res->dispatchv6 != NULL) {
+ if (res->dispatchv6 != NULL && !res->exclusivev6) {
sock = dns_dispatch_getsocket(res->dispatchv6);
isc_socket_cancel(sock, res->buckets[i].task,
ISC_SOCKCANCEL_ALL);
return (resolver->options);
}
-
-static void
-disppooltimer_update(isc_task_t *task, isc_event_t *event) {
- dns_resolver_t *res = event->ev_arg;
- isc_sockaddr_t addr4, addr6;
- dns_dispatch_t *disp4 = NULL, *disp6 = NULL;
- isc_result_t result;
- unsigned int nxt;
- unsigned int attrs_base, attrs, attrmask;
-
- REQUIRE(VALID_RESOLVER(res));
- REQUIRE((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0 ||
- (res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0);
-
- UNUSED(task);
- isc_event_free(&event);
-
- LOCK(&res->lock);
- nxt = res->nextdisp++;
- if (res->nextdisp == res->ndisps)
- res->nextdisp = 0;
- UNLOCK(&res->lock);
-
- attrs_base = 0;
- attrs_base |= DNS_DISPATCHATTR_UDP;
- attrs_base |= DNS_DISPATCHATTR_RANDOMPORT;
-
- attrmask = 0;
- attrmask |= DNS_DISPATCHATTR_UDP;
- attrmask |= DNS_DISPATCHATTR_TCP;
- attrmask |= DNS_DISPATCHATTR_IPV4;
- attrmask |= DNS_DISPATCHATTR_IPV6;
-
- RES_LOCK(&res->poollock, isc_rwlocktype_read);
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
- result = dns_dispatch_getlocaladdress(res->dispatchv4pool[nxt],
- &addr4);
- INSIST(result == ISC_R_SUCCESS);
- }
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
- result = dns_dispatch_getlocaladdress(res->dispatchv6pool[nxt],
- &addr6);
- INSIST(result == ISC_R_SUCCESS);
- }
- RES_UNLOCK(&res->poollock, isc_rwlocktype_read);
-
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
- attrs = attrs_base;
- attrs |= DNS_DISPATCHATTR_IPV4;
-
- result = dns_dispatch_getudp(res->dispatchmgr,
- res->socketmgr,
- res->taskmgr, &addr4,
- 4096, 1000, 32768, 16411,
- 16433, attrs, attrmask,
- &disp4);
- if (result != ISC_R_SUCCESS) {
- isc_log_write(dns_lctx, DNS_LOGCATEGORY_RESOLVER,
- DNS_LOGMODULE_RESOLVER, ISC_LOG_ERROR,
- "could not update an IPv4 random query "
- "port: %s",
- isc_result_totext(result));
- /* keep the old one */
- }
-
- /*
- * We don't try to ensure the new dispatch is unique (see the
- * comments in dns_resolver_createdispatchpool()).
- */
- }
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
- attrs = attrs_base;
- attrs |= DNS_DISPATCHATTR_IPV6;
-
- result = dns_dispatch_getudp(res->dispatchmgr,
- res->socketmgr,
- res->taskmgr, &addr6,
- 4096, 1000, 32768, 16411,
- 16433, attrs, attrmask,
- &disp6);
- if (result != ISC_R_SUCCESS) {
- isc_log_write(dns_lctx, DNS_LOGCATEGORY_RESOLVER,
- DNS_LOGMODULE_RESOLVER, ISC_LOG_ERROR,
- "could not update an IPv6 random query "
- "port: %s",
- isc_result_totext(result));
- }
- }
-
- RES_LOCK(&res->poollock, isc_rwlocktype_write);
- if (disp4 != NULL) {
- dns_dispatch_detach(&res->dispatchv4pool[nxt]);
- res->dispatchv4pool[nxt] = disp4;
- }
- if (disp6 != NULL) {
- dns_dispatch_detach(&res->dispatchv6pool[nxt]);
- res->dispatchv6pool[nxt] = disp6;
- }
- RES_UNLOCK(&res->poollock, isc_rwlocktype_write);
-
- return;
-}
-
-isc_result_t
-dns_resolver_createdispatchpool(dns_resolver_t *res, unsigned int ndisps,
- unsigned int tick)
-{
- unsigned int i;
- isc_result_t result = ISC_R_SUCCESS;
- unsigned int attrs_base, attrs, attrmask;
- isc_sockaddr_t addr4, addr6;
- dns_dispatch_t *disp;
- isc_task_t *task;
- isc_interval_t interval;
-
- REQUIRE(VALID_RESOLVER(res));
- REQUIRE(!res->frozen); /* meaning we don't have to lock res */
- REQUIRE(ndisps > 0);
- REQUIRE((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0 ||
- (res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0);
-
- attrs_base = 0;
- attrs_base |= DNS_DISPATCHATTR_UDP;
- attrs_base |= DNS_DISPATCHATTR_RANDOMPORT;
-
- attrmask = 0;
- attrmask |= DNS_DISPATCHATTR_UDP;
- attrmask |= DNS_DISPATCHATTR_TCP;
- attrmask |= DNS_DISPATCHATTR_IPV4;
- attrmask |= DNS_DISPATCHATTR_IPV6;
-
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
- INSIST(res->dispatchv4 != NULL);
- result = dns_dispatch_getlocaladdress(res->dispatchv4, &addr4);
- INSIST(result == ISC_R_SUCCESS &&
- isc_sockaddr_getport(&addr4) == 0);
- res->dispatchv4pool = isc_mem_get(res->mctx,
- sizeof(dns_dispatch_t *) *
- ndisps);
- if (res->dispatchv4pool == NULL)
- return (ISC_R_NOMEMORY);
- for (i = 0; i < ndisps; i++)
- res->dispatchv4pool[i] = NULL;
- }
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
- INSIST(res->dispatchv6 != NULL);
- result = dns_dispatch_getlocaladdress(res->dispatchv6, &addr6);
- INSIST(result == ISC_R_SUCCESS &&
- isc_sockaddr_getport(&addr6) == 0);
- res->dispatchv6pool = isc_mem_get(res->mctx,
- sizeof(dns_dispatch_t *) *
- ndisps);
- if (res->dispatchv6pool == NULL) {
- isc_mem_put(res->mctx, res->dispatchv4pool,
- sizeof(dns_dispatch_t *) * ndisps);
- res->dispatchv4pool = NULL;
- return (ISC_R_NOMEMORY);
- }
- for (i = 0; i < ndisps; i++)
- res->dispatchv6pool[i] = NULL;
- }
-
- for (i = 0; i < ndisps; i++) {
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) {
- attrs = attrs_base;
- attrs |= DNS_DISPATCHATTR_IPV4;
-
- disp = NULL;
- result = dns_dispatch_getudp(res->dispatchmgr,
- res->socketmgr,
- res->taskmgr, &addr4,
- 4096, 1000, 32768, 16411,
- 16433, attrs, attrmask,
- &disp);
- if (result != ISC_R_SUCCESS)
- goto cleanup;
- res->dispatchv4pool[i] = disp;
-
- /*
- * It might be better to ensure all ports are
- * different, but in practice it's probably okay to
- * assume dns_dispatch_getudp() made reasonable
- * choices.
- */
- }
- if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) {
- attrs = attrs_base;
- attrs |= DNS_DISPATCHATTR_IPV6;
-
- disp = NULL;
- result = dns_dispatch_getudp(res->dispatchmgr,
- res->socketmgr,
- res->taskmgr, &addr6,
- 4096, 1000, 32768, 16411,
- 16433, attrs, attrmask,
- &disp);
- if (result != ISC_R_SUCCESS)
- goto cleanup;
-
- res->dispatchv6pool[i] = disp;
- }
- }
-
- /* start update timer */
- if (tick != 0) {
- task = NULL;
- result = isc_task_create(res->taskmgr, 0, &task);
- if (result != ISC_R_SUCCESS)
- goto cleanup;
- isc_interval_set(&interval, tick, 0);
- result = isc_timer_create(res->timermgr, isc_timertype_ticker,
- NULL, &interval, task,
- disppooltimer_update,
- res, &res->disppooltimer);
- isc_task_detach(&task);
- if (result != ISC_R_SUCCESS)
- goto cleanup;
- }
-
- res->ndisps = ndisps;
- res->nextdisp = 0;
-
- return (result);
-
- cleanup:
- if (res->dispatchv4pool != NULL) {
- for (i = 0; i < ndisps; i++)
- if (res->dispatchv4pool[i] != NULL)
- dns_dispatch_detach(&res->dispatchv4pool[i]);
- isc_mem_put(res->mctx, res->dispatchv4pool,
- sizeof(dns_dispatch_t *) * ndisps);
- }
- if (res->dispatchv6pool != NULL) {
- for (i = 0; i < ndisps; i++)
- if (res->dispatchv6pool[i] != NULL)
- dns_dispatch_detach(&res->dispatchv6pool[i]);
- isc_mem_put(res->mctx, res->dispatchv6pool,
- sizeof(dns_dispatch_t *) * ndisps);
- }
-
- return (result);
-}
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
-# $Id: Makefile.in,v 1.93 2007/09/14 03:39:29 marka Exp $
+# $Id: Makefile.in,v 1.93.58.1 2008/06/24 00:09:12 jinmei Exp $
srcdir = @srcdir@
VPATH = @srcdir@
lex.@O@ lfsr.@O@ lib.@O@ log.@O@ \
md5.@O@ mem.@O@ mutexblock.@O@ \
netaddr.@O@ netscope.@O@ ondestroy.@O@ \
- parseint.@O@ quota.@O@ radix.@O@ random.@O@ \
+ parseint.@O@ portset.@O@ quota.@O@ radix.@O@ random.@O@ \
ratelimiter.@O@ refcount.@O@ region.@O@ result.@O@ rwlock.@O@ \
serial.@O@ sha1.@O@ sha2.@O@ sockaddr.@O@ \
string.@O@ strtoul.@O@ symtab.@O@ task.@O@ taskpool.@O@ \
lex.c lfsr.c lib.c log.c \
md5.c mem.c mutexblock.c \
netaddr.c netscope.c ondestroy.c \
- parseint.c quota.c radix.c random.c \
+ parseint.c portset.c quota.c radix.c random.c \
ratelimiter.c refcount.c region.c result.c rwlock.c \
serial.c sha1.c sha2.c sockaddr.c string.c strtoul.c \
symtab.c task.c taskpool.c timer.c version.c
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: platform.h.in,v 1.45.60.2 2008/01/24 23:46:26 tbox Exp $ */
+/* $Id: platform.h.in,v 1.45.60.3 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef ISC_PLATFORM_H
#define ISC_PLATFORM_H 1
*/
@ISC_PLATFORM_FIXIN6ISADDR@
+/*! \brief
+ * Define if the system supports kqueue multiplexing
+ */
+@ISC_PLATFORM_HAVEKQUEUE@
+
+/*! \brief
+ * Define if the system supports epoll multiplexing
+ */
+@ISC_PLATFORM_HAVEEPOLL@
+
+/*! \brief
+ * Define if the system supports /dev/poll multiplexing
+ */
+@ISC_PLATFORM_HAVEDEVPOLL@
+
/*
*** Printing.
***/
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: portset.h,v 1.3 2008/06/23 23:47:11 tbox Exp $ */
+/* $Id: portset.h,v 1.3.2.1 2008/06/24 00:09:12 jinmei Exp $ */
/*! \file isc/portset.h
* \brief Transport Protocol Port Manipuration Module
isc_portset_create(isc_mem_t *mctx, isc_portset_t **portsetp);
/*%<
* Create a port set and initialize it as an empty set.
- *
+ *
* Requires:
*\li 'mctx' to be valid.
*\li 'portsetp' to be non NULL and '*portsetp' to be NULL;
isc_portset_destroy(isc_mem_t *mctx, isc_portset_t **portsetp);
/*%<
* Destroy a port set.
- *
+ *
* Requires:
*\li 'mctx' to be valid and must be the same context given when the port set
* was created.
isc_portset_isset(isc_portset_t *portset, in_port_t port);
/*%<
* Test whether the given port is stored in the portset.
- *
+ *
* Requires:
*\li 'portset' to be a valid set.
*
isc_portset_nports(isc_portset_t *portset);
/*%<
* Provides the number of ports stored in the given portset.
- *
+ *
* Requires:
*\li 'portset' to be a valid set.
*
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: socket.h,v 1.72.128.2 2008/06/04 23:46:32 tbox Exp $ */
+/* $Id: socket.h,v 1.72.128.3 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef ISC_SOCKET_H
#define ISC_SOCKET_H 1
* All resources used by the socket have been freed
*/
+isc_result_t
+isc_socket_open(isc_socket_t *sock);
+/*%<
+ * Open a new socket file descriptor of the given socket structure. It simply
+ * opens a new descriptor; all of the other parameters including the socket
+ * type are inherited from the existing socket. This function is provided to
+ * avoid overhead of destroying and creating sockets when many short-lived
+ * sockets are frequently opened and closed. When the efficiency is not an
+ * issue, it should be safer to detach the unused socket and re-create a new
+ * one.
+ *
+ * Requires:
+ *
+ * \li there must be no other reference to this socket.
+ *
+ * \li 'sock' is a valid socket that was previously closed by isc_socket_close().
+ *
+ * Returns:
+ * Same as isc_socket_create().
+ */
+
+void
+isc_socket_close(isc_socket_t *sock);
+/*%<
+ * Close a socket file descriptor of the given socket structure. This function
+ * is provided as an alternative to destroying an unused socket when the
+ * overhead of destroying/re-creating sockets can be significant, and is
+ * expected to be used with isc_socket_open().
+ *
+ * Requires:
+ *
+ * \li The socket must have a valid descriptor.
+ *
+ * \li There must be no other reference to this socket.
+ *
+ * \li There must be no pending I/O requests.
+ *
+ */
+
isc_result_t
isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *addressp);
/*%<
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: timer.h,v 1.38 2007/06/19 23:47:18 tbox Exp $ */
+/* $Id: timer.h,v 1.38.128.1 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef ISC_TIMER_H
#define ISC_TIMER_H 1
#include <isc/event.h>
#include <isc/eventclass.h>
#include <isc/lang.h>
+#include <isc/time.h>
ISC_LANG_BEGINDECLS
typedef struct isc_timerevent {
struct isc_event common;
+ isc_time_t due;
} isc_timerevent_t;
#define ISC_TIMEREVENT_FIRSTEVENT (ISC_EVENTCLASS_TIMER + 0)
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: types.h,v 1.43.128.2 2008/01/17 23:46:37 tbox Exp $ */
+/* $Id: types.h,v 1.43.128.3 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef ISC_TYPES_H
#define ISC_TYPES_H 1
typedef struct isc_msgcat isc_msgcat_t; /*%< Message Catalog */
typedef struct isc_ondestroy isc_ondestroy_t; /*%< On Destroy */
typedef struct isc_netaddr isc_netaddr_t; /*%< Net Address */
+typedef struct isc_portset isc_portset_t; /*%< Port Set */
typedef struct isc_quota isc_quota_t; /*%< Quota */
typedef struct isc_random isc_random_t; /*%< Random */
typedef struct isc_ratelimiter isc_ratelimiter_t; /*%< Rate Limiter */
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: portset.c,v 1.2 2008/06/23 19:41:19 jinmei Exp $ */
+/* $Id: portset.c,v 1.2.2.1 2008/06/24 00:09:12 jinmei Exp $ */
/*! \file */
#include <isc/mem.h>
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: timer.c,v 1.81 2007/10/24 00:57:23 marka Exp $ */
+/* $Id: timer.c,v 1.81.32.1 2008/06/24 00:09:12 jinmei Exp $ */
/*! \file */
static void
dispatch(isc_timermgr_t *manager, isc_time_t *now) {
isc_boolean_t done = ISC_FALSE, post_event, need_schedule;
- isc_event_t *event;
+ isc_timerevent_t *event;
isc_eventtype_t type = 0;
isc_timer_t *timer;
isc_result_t result;
/*
* XXX We could preallocate this event.
*/
- event = isc_event_allocate(manager->mctx,
+ event = (isc_timerevent_t *)isc_event_allocate(manager->mctx,
timer,
type,
timer->action,
timer->arg,
sizeof(*event));
- if (event != NULL)
- isc_task_send(timer->task, &event);
- else
+ if (event != NULL) {
+ event->due = timer->due;
+ isc_task_send(timer->task,
+ (isc_event_t **)&event);
+ } else
UNEXPECTED_ERROR(__FILE__, __LINE__,
isc_msgcat_get(isc_msgcat,
ISC_MSGSET_TIMER,
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: app.c,v 1.54.128.3 2008/01/17 23:46:37 tbox Exp $ */
+/* $Id: app.c,v 1.54.128.4 2008/06/24 00:09:12 jinmei Exp $ */
/*! \file */
#include <unistd.h>
#include <signal.h>
#include <sys/time.h>
+#ifdef HAVE_EPOLL
+#include <sys/epoll.h>
+#endif
#include <isc/app.h>
#include <isc/boolean.h>
int n;
isc_time_t when, now;
struct timeval tv, *tvp;
- fd_set readfds, writefds;
- int maxfd;
+ isc_socketwait_t *swait;
isc_boolean_t readytasks;
isc_boolean_t call_timer_dispatch = ISC_FALSE;
}
}
- isc__socketmgr_getfdsets(&readfds, &writefds, &maxfd);
- n = select(maxfd, &readfds, &writefds, NULL, tvp);
+ swait = NULL;
+ n = isc__socketmgr_waitevents(tvp, &swait);
if (n == 0 || call_timer_dispatch) {
/*
isc__timermgr_dispatch();
}
if (n > 0)
- (void)isc__socketmgr_dispatch(&readfds, &writefds,
- maxfd);
+ (void)isc__socketmgr_dispatch(swait);
(void)isc__taskmgr_dispatch();
if (want_reload) {
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: net.h,v 1.46 2007/06/19 23:47:19 tbox Exp $ */
+/* $Id: net.h,v 1.46.128.1 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef ISC_NET_H
#define ISC_NET_H 1
* Returns whether UNIX domain sockets are supported.
*/
+isc_result_t
+isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
+/*%<
+ * Returns system's default range of ephemeral UDP ports, if defined.
+ * If the range is not available or unknown, ISC_NET_PORTRANGELOW and
+ * ISC_NET_PORTRANGEHIGH will be returned.
+ *
+ * Requires:
+ *
+ *\li 'low' and 'high' must be non NULL.
+ *
+ * Returns:
+ *
+ *\li *low and *high will be the ports specifying the low and high ends of
+ * the range.
+ */
+
#ifdef ISC_PLATFORM_NEEDNTOP
const char *
isc_net_ntop(int af, const void *src, char *dst, size_t size);
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: net.c,v 1.36 2007/09/13 04:45:18 each Exp $ */
+/* $Id: net.c,v 1.36.60.1 2008/06/24 00:09:12 jinmei Exp $ */
#include <config.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
#include <errno.h>
#include <unistd.h>
#include <isc/string.h>
#include <isc/util.h>
+/*%
+ * Definitions about UDP port range specification. This is a total mess of
+ * portability variants: some use sysctl (but the sysctl names vary), some use
+ * system-specific interfaces, some have the same interface for IPv4 and IPv6,
+ * some separate them, etc...
+ */
+
+/*%
+ * The last resort defaults: use all non well known port space
+ */
+#ifndef ISC_NET_PORTRANGELOW
+#define ISC_NET_PORTRANGELOW 1024
+#endif /* ISC_NET_PORTRANGELOW */
+#ifndef ISC_NET_PORTRANGEHIGH
+#define ISC_NET_PORTRANGEHIGH 65535
+#endif /* ISC_NET_PORTRANGEHIGH */
+
+/*%
+ * sysctl variants
+ */
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portrange.first"
+#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portrange.last"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet.ip.portrange.first"
+#define SYSCTL_V6PORTRANGE_HIGH "net.inet.ip.portrange.last"
+#endif
+
+#ifdef __NetBSD__
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.anonportmin"
+#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.anonportmax"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.portrange.first"
+#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.portrange.last"
+#endif
+
+#ifdef __OpenBSD__
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portfirst"
+#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portlast"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.portrange.first"
+#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.portrange.last"
+#endif
+
#if defined(ISC_PLATFORM_HAVEIPV6)
# if defined(ISC_PLATFORM_NEEDIN6ADDRANY)
const struct in6_addr isc_net_in6addrany = IN6ADDR_ANY_INIT;
return (ipv6pktinfo_result);
}
+#ifdef USE_SYSCTL_PORTRANGE
+/*%
+ * Read the low and high ends of the UDP port range for address family 'af'
+ * from the kernel with sysctlbyname().  AF_INET selects the
+ * SYSCTL_V4PORTRANGE_* names; any other family selects the
+ * SYSCTL_V6PORTRANGE_* names.
+ *
+ * Returns:
+ *\li	#ISC_R_SUCCESS	'*low' and '*high' have been set.
+ *\li	#ISC_R_FAILURE	one of the sysctlbyname() calls failed.
+ *\li	#ISC_R_RANGE	a value read from the kernel does not fit in 16 bits.
+ *
+ * On any non-success return '*low' and '*high' are left untouched.
+ */
+static isc_result_t
+getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
+	int port_low, port_high;
+	size_t portlen;
+	const char *sysctlname_lowport, *sysctlname_hiport;
+
+	if (af == AF_INET) {
+		sysctlname_lowport = SYSCTL_V4PORTRANGE_LOW;
+		sysctlname_hiport = SYSCTL_V4PORTRANGE_HIGH;
+	} else {
+		sysctlname_lowport = SYSCTL_V6PORTRANGE_LOW;
+		sysctlname_hiport = SYSCTL_V6PORTRANGE_HIGH;
+	}
+
+	/*
+	 * The length argument tells the kernel how large the output buffer
+	 * is, so it must be the size of the int being filled, not the size
+	 * of the length variable itself.
+	 */
+	portlen = sizeof(port_low);
+	if (sysctlbyname(sysctlname_lowport, &port_low, &portlen,
+			 NULL, 0) < 0) {
+		return (ISC_R_FAILURE);
+	}
+	portlen = sizeof(port_high);
+	if (sysctlbyname(sysctlname_hiport, &port_high, &portlen,
+			 NULL, 0) < 0) {
+		return (ISC_R_FAILURE);
+	}
+
+	/* Reject anything that cannot be represented as a 16-bit port. */
+	if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0)
+		return (ISC_R_RANGE);
+
+	*low = (in_port_t)port_low;
+	*high = (in_port_t)port_high;
+
+	return (ISC_R_SUCCESS);
+}
+#endif
+
+/*%
+ * Store in '*low' and '*high' the range of UDP ports for address family
+ * 'af'.  Where USE_SYSCTL_PORTRANGE is defined the range is read from the
+ * kernel; if that is not available, or the lookup fails, the compile-time
+ * defaults ISC_NET_PORTRANGELOW and ISC_NET_PORTRANGEHIGH are used instead.
+ *
+ * Requires:
+ *\li	'low' and 'high' are non NULL.
+ *
+ * Returns:
+ *\li	#ISC_R_SUCCESS, always; a failed kernel lookup is absorbed by the
+ *	fallback to the defaults.
+ */
+isc_result_t
+isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
+	isc_result_t result = ISC_R_FAILURE;
+
+	REQUIRE(low != NULL && high != NULL);
+
+#ifdef USE_SYSCTL_PORTRANGE
+	result = getudpportrange_sysctl(af, low, high);
+#else
+	UNUSED(af);
+#endif
+
+	if (result != ISC_R_SUCCESS) {
+		*low = ISC_NET_PORTRANGELOW;
+		*high = ISC_NET_PORTRANGEHIGH;
+	}
+
+	return (ISC_R_SUCCESS);	/* we currently never fail in this function */
+}
+
void
isc_net_disableipv4(void) {
initialize();
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: socket.c,v 1.275.10.4 2008/03/27 21:10:24 jinmei Exp $ */
+/* $Id: socket.c,v 1.275.10.5 2008/06/24 00:09:12 jinmei Exp $ */
/*! \file */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
-#ifdef ISC_PLATFORM_HAVESYSUNH
-#include <sys/un.h>
-#endif
#include <sys/time.h>
#include <sys/uio.h>
#include <isc/util.h>
#include <isc/xml.h>
+#ifdef ISC_PLATFORM_HAVESYSUNH
+#include <sys/un.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEKQUEUE
+#include <sys/event.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEEPOLL
+#include <sys/epoll.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEDEVPOLL
+#include <sys/devpoll.h>
+#endif
+
#include "errno2result.h"
#ifndef ISC_PLATFORM_USETHREADS
#include <sys/utsname.h>
#endif
+/*%
+ * Choose the most preferable multiplex method.
+ */
+#ifdef ISC_PLATFORM_HAVEKQUEUE
+#define USE_KQUEUE
+#elif defined (ISC_PLATFORM_HAVEEPOLL)
+#define USE_EPOLL
+#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
+#define USE_DEVPOLL
+typedef struct {
+ unsigned int want_read : 1,
+ want_write : 1;
+} pollinfo_t;
+#else
+#define USE_SELECT
+#endif /* ISC_PLATFORM_HAVEKQUEUE */
+
+#ifndef ISC_PLATFORM_USETHREADS /* defined only for builds without threads */
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+struct isc_socketwait { /* variant for the kernel event interfaces */
+ int nevents; /* presumably the count of ready events -- TODO confirm */
+};
+#elif defined (USE_SELECT)
+struct isc_socketwait { /* variant for select(2) */
+ fd_set readset; /* presumably the read set handed to select() -- TODO confirm */
+ fd_set writeset; /* presumably the write set handed to select() -- TODO confirm */
+ int nfds; /* NOTE(review): looks like select()'s return value -- confirm */
+ int maxfd; /* NOTE(review): looks like the select() descriptor bound -- confirm */
+};
+#endif /* USE_KQUEUE */
+#endif /* !ISC_PLATFORM_USETHREADS */
+
+/*%
+ * Maximum number of allowable open sockets. This is also the maximum
+ * allowable socket file descriptor. This definition is meaningless with
+ * USE_SELECT due to the API limitation of select(2).
+ */
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+#ifndef ISC_SOCKET_MAXSOCKETS
+#define ISC_SOCKET_MAXSOCKETS 4096
+#endif
+#endif /* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */
+
+/*%
+ * Size of per-FD lock buckets.
+ */
+#ifdef ISC_PLATFORM_USETHREADS
+#define FDLOCK_COUNT 1024
+#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT)
+#else
+#define FDLOCK_COUNT 1
+#define FDLOCK_ID(fd) 0
+#endif /* ISC_PLATFORM_USETHREADS */
+
+/*%
+ * Maximum number of events communicated with the kernel. There should normally
+ * be no need for having a large number.
+ */
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+#ifndef ISC_SOCKET_MAXEVENTS
+#define ISC_SOCKET_MAXEVENTS 64
+#endif
+#endif
+
/*%
* Some systems define the socket length argument as an int, some as size_t,
* some as socklen_t. This is here so it can be easily changed if needed.
unsigned int magic;
isc_mem_t *mctx;
isc_mutex_t lock;
+ isc_mutex_t *fdlock;
+#ifdef USE_KQUEUE
+ int kqueue_fd;
+ int nevents;
+ struct kevent *events;
+#endif /* USE_KQUEUE */
+#ifdef USE_EPOLL
+ int epoll_fd;
+ int nevents;
+ struct epoll_event *events;
+#endif /* USE_EPOLL */
+#ifdef USE_DEVPOLL
+ int devpoll_fd;
+ int nevents;
+ struct pollfd *events;
+#endif /* USE_DEVPOLL */
+ unsigned int maxsocks;
+#ifdef ISC_PLATFORM_USETHREADS
+ int pipe_fds[2];
+#endif
+
+ /* Locked by fdlock. */
+ isc_socket_t **fds;
+ int *fdstate;
+#ifdef USE_DEVPOLL
+ pollinfo_t *fdpollinfo;
+#endif
+
/* Locked by manager lock. */
ISC_LIST(isc_socket_t) socklist;
+#ifdef USE_SELECT
fd_set read_fds;
fd_set write_fds;
- isc_socket_t *fds[FD_SETSIZE];
- int fdstate[FD_SETSIZE];
int maxfd;
+#endif /* USE_SELECT */
#ifdef ISC_PLATFORM_USETHREADS
isc_thread_t watcher;
isc_condition_t shutdown_ok;
- int pipe_fds[2];
#else /* ISC_PLATFORM_USETHREADS */
unsigned int refs;
#endif /* ISC_PLATFORM_USETHREADS */
struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
struct msghdr *, struct iovec *, size_t *);
+#ifdef ISC_PLATFORM_USETHREADS
+static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
+#endif
#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING (-2)
}
}
+/*%
+ * Ask the multiplexing backend chosen at compile time (kqueue, epoll,
+ * /dev/poll or select) to start reporting events on 'fd'.
+ *
+ * 'msg' is SELECT_POKE_READ to watch for readability.  In the kqueue, epoll
+ * and /dev/poll variants any other value requests writability; the select
+ * variant acts only on SELECT_POKE_READ and SELECT_POKE_WRITE.
+ *
+ * Returns:
+ *\li	#ISC_R_SUCCESS on success (with epoll, an EEXIST failure from
+ *	epoll_ctl() is also reported as success).
+ *\li	Otherwise the isc_result_t translation of errno from the failing
+ *	backend call.
+ */
+static inline isc_result_t
+watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
+	isc_result_t result = ISC_R_SUCCESS;
+
+#ifdef USE_KQUEUE
+	struct kevent evchange;
+
+	memset(&evchange, 0, sizeof(evchange));
+	if (msg == SELECT_POKE_READ)
+		evchange.filter = EVFILT_READ;
+	else
+		evchange.filter = EVFILT_WRITE;
+	evchange.flags = EV_ADD;
+	evchange.ident = fd;
+	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
+		result = isc__errno2result(errno);
+
+	return (result);
+#elif defined(USE_EPOLL)
+	struct epoll_event event;
+
+	/*
+	 * event.data is a union that is wider than its 'fd' member; clear
+	 * the whole structure so that no uninitialized stack bytes are
+	 * handed to the kernel.
+	 */
+	memset(&event, 0, sizeof(event));
+	if (msg == SELECT_POKE_READ)
+		event.events = EPOLLIN;
+	else
+		event.events = EPOLLOUT;
+	event.data.fd = fd;
+	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
+	    errno != EEXIST) {
+		result = isc__errno2result(errno);
+	}
+
+	return (result);
+#elif defined(USE_DEVPOLL)
+	struct pollfd pfd;
+	int lockid = FDLOCK_ID(fd);
+
+	memset(&pfd, 0, sizeof(pfd));
+	if (msg == SELECT_POKE_READ)
+		pfd.events = POLLIN;
+	else
+		pfd.events = POLLOUT;
+	pfd.fd = fd;
+	pfd.revents = 0;
+	LOCK(&manager->fdlock[lockid]);
+	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
+		result = isc__errno2result(errno);
+	else {
+		/*
+		 * Remember which direction is being polled; unwatch_fd()
+		 * consults this when it has to re-register the other one.
+		 */
+		if (msg == SELECT_POKE_READ)
+			manager->fdpollinfo[fd].want_read = 1;
+		else
+			manager->fdpollinfo[fd].want_write = 1;
+	}
+	UNLOCK(&manager->fdlock[lockid]);
+
+	return (result);
+#elif defined(USE_SELECT)
+	LOCK(&manager->lock);
+	if (msg == SELECT_POKE_READ)
+		FD_SET(fd, &manager->read_fds);
+	if (msg == SELECT_POKE_WRITE)
+		FD_SET(fd, &manager->write_fds);
+	UNLOCK(&manager->lock);
+
+	return (result);
+#endif
+}
+
+/*%
+ * Ask the multiplexing backend chosen at compile time (kqueue, epoll,
+ * /dev/poll or select) to stop reporting events on 'fd'.
+ *
+ * 'msg' is SELECT_POKE_READ to stop watching for readability.  In the
+ * kqueue and epoll variants any other value is treated as a request to stop
+ * watching for writability; the select variant acts only on
+ * SELECT_POKE_READ and SELECT_POKE_WRITE.
+ *
+ * Returns:
+ *\li	#ISC_R_SUCCESS on success (with epoll, an ENOENT failure from
+ *	epoll_ctl() is also reported as success).
+ *\li	#ISC_R_UNEXPECTED for any other epoll_ctl() failure, which is also
+ *	logged.
+ *\li	The isc_result_t translation of errno when the kqueue or /dev/poll
+ *	call fails.
+ */
+static inline isc_result_t
+unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
+	isc_result_t result = ISC_R_SUCCESS;
+
+#ifdef USE_KQUEUE
+	struct kevent evchange;
+
+	memset(&evchange, 0, sizeof(evchange));
+	if (msg == SELECT_POKE_READ)
+		evchange.filter = EVFILT_READ;
+	else
+		evchange.filter = EVFILT_WRITE;
+	evchange.flags = EV_DELETE;
+	evchange.ident = fd;
+	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
+		result = isc__errno2result(errno);
+
+	return (result);
+#elif defined(USE_EPOLL)
+	struct epoll_event event;
+
+	/*
+	 * event.data is a union that is wider than its 'fd' member; clear
+	 * the whole structure so that no uninitialized stack bytes are
+	 * handed to the kernel.
+	 */
+	memset(&event, 0, sizeof(event));
+	if (msg == SELECT_POKE_READ)
+		event.events = EPOLLIN;
+	else
+		event.events = EPOLLOUT;
+	event.data.fd = fd;
+	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
+	    errno != ENOENT) {
+		char strbuf[ISC_STRERRORSIZE];
+		isc__strerror(errno, strbuf, sizeof(strbuf));
+		UNEXPECTED_ERROR(__FILE__, __LINE__,
+				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
+		result = ISC_R_UNEXPECTED;
+	}
+	return (result);
+#elif defined(USE_DEVPOLL)
+	struct pollfd pfds[2];
+	size_t writelen = sizeof(pfds[0]);
+	int lockid = FDLOCK_ID(fd);
+
+	memset(pfds, 0, sizeof(pfds));
+	pfds[0].events = POLLREMOVE;
+	pfds[0].fd = fd;
+
+	/*
+	 * Canceling read or write polling via /dev/poll is tricky.  Since it
+	 * only provides a way of canceling per FD, we may need to re-poll the
+	 * socket for the other operation.
+	 */
+	LOCK(&manager->fdlock[lockid]);
+	if (msg == SELECT_POKE_READ &&
+	    manager->fdpollinfo[fd].want_write == 1) {
+		pfds[1].events = POLLOUT;
+		pfds[1].fd = fd;
+		writelen += sizeof(pfds[1]);
+	}
+	if (msg == SELECT_POKE_WRITE &&
+	    manager->fdpollinfo[fd].want_read == 1) {
+		pfds[1].events = POLLIN;
+		pfds[1].fd = fd;
+		writelen += sizeof(pfds[1]);
+	}
+
+	if (write(manager->devpoll_fd, pfds, writelen) == -1)
+		result = isc__errno2result(errno);
+	else {
+		/* Record that this direction is no longer being polled. */
+		if (msg == SELECT_POKE_READ)
+			manager->fdpollinfo[fd].want_read = 0;
+		else
+			manager->fdpollinfo[fd].want_write = 0;
+	}
+	UNLOCK(&manager->fdlock[lockid]);
+
+	return (result);
+#elif defined(USE_SELECT)
+	LOCK(&manager->lock);
+	if (msg == SELECT_POKE_READ)
+		FD_CLR(fd, &manager->read_fds);
+	else if (msg == SELECT_POKE_WRITE)
+		FD_CLR(fd, &manager->write_fds);
+	UNLOCK(&manager->lock);
+
+	return (result);
+#endif
+}
+
static void
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
- isc_socket_t *sock;
+ isc_result_t result;
+ isc_boolean_t needclose;
+ int lockid = FDLOCK_ID(fd);
/*
* This is a wakeup on a socket. If the socket is not in the
* or writes.
*/
- INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
+ INSIST(fd >= 0 && fd < (int)manager->maxsocks);
+ LOCK(&manager->fdlock[lockid]);
if (manager->fdstate[fd] == CLOSE_PENDING
|| manager->fdstate[fd] == MANAGER_CLOSE_PENDING) {
- FD_CLR(fd, &manager->read_fds);
- FD_CLR(fd, &manager->write_fds);
- if (manager->fdstate[fd] == CLOSE_PENDING)
- (void)close(fd);
+ needclose = ISC_TF(manager->fdstate[fd] == CLOSE_PENDING);
manager->fdstate[fd] = CLOSED;
+ UNLOCK(&manager->fdlock[lockid]);
+
+ /*
+ * We accept (and ignore) any error from unwatch_fd() as we are
+ * closing the socket, hoping it doesn't leave dangling state in
+ * the kernel.
+ * Note that unwatch_fd() must be called after releasing the
+ * fdlock; otherwise it could cause deadlock due to a lock order
+ * reversal.
+ */
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ if (needclose)
+ (void)close(fd);
return;
}
- if (manager->fdstate[fd] != MANAGED)
+ if (manager->fdstate[fd] != MANAGED) {
+ UNLOCK(&manager->fdlock[lockid]);
return;
-
- sock = manager->fds[fd];
+ }
+ UNLOCK(&manager->fdlock[lockid]);
/*
* Set requested bit.
*/
- if (msg == SELECT_POKE_READ)
- FD_SET(sock->fd, &manager->read_fds);
- if (msg == SELECT_POKE_WRITE)
- FD_SET(sock->fd, &manager->write_fds);
+ result = watch_fd(manager, fd, msg);
+ if (result != ISC_R_SUCCESS) {
+ /*
+ * XXXJT: what should we do? Ignoring the failure of watching
+ * a socket will make the application dysfunctional, but there
+ * seems to be no reasonable recovery process.
+ */
+ isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
+ "failed to start watching FD (%d): %s",
+ fd, isc_result_totext(result));
+ }
}
#ifdef ISC_PLATFORM_USETHREADS
memset(msg, 0, sizeof(*msg));
- if (sock->type == isc_sockettype_udp) {
+ if (!sock->connected) {
msg->msg_name = (void *)&dev->address.type.sa;
msg->msg_namelen = dev->address.length;
} else {
* Caller must ensure that the socket is not locked and no external
* references exist.
*/
+/*%
+ * Detach descriptor 'fd' from 'manager': clear its slot in the fds table,
+ * mark it MANAGER_CLOSE_PENDING (fdwatch sockets) or CLOSE_PENDING (all
+ * other types), and send SELECT_POKE_CLOSE for it.  With USE_SELECT,
+ * manager->maxfd is recomputed when 'fd' was the current maximum.
+ */
+static void
+closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) {
+	int bucket = FDLOCK_ID(fd);
+
+	/*
+	 * Only the per-descriptor lock is taken here; the socket itself is
+	 * not locked.
+	 */
+	LOCK(&manager->fdlock[bucket]);
+	manager->fds[fd] = NULL;
+	manager->fdstate[fd] = (type == isc_sockettype_fdwatch) ?
+		MANAGER_CLOSE_PENDING : CLOSE_PENDING;
+	UNLOCK(&manager->fdlock[bucket]);
+	select_poke(manager, fd, SELECT_POKE_CLOSE);
+
+#ifdef USE_SELECT
+	/*
+	 * If the descriptor being closed was the largest one known to the
+	 * manager, scan downwards for the next MANAGED descriptor.
+	 * (XXX: this should be implemented more efficiently)
+	 */
+	LOCK(&manager->lock);
+	if (manager->maxfd == fd) {
+		int probe;
+		isc_boolean_t found = ISC_FALSE;
+
+		manager->maxfd = 0;
+		for (probe = fd - 1; probe >= 0 && !found; probe--) {
+			bucket = FDLOCK_ID(probe);
+
+			LOCK(&manager->fdlock[bucket]);
+			if (manager->fdstate[probe] == MANAGED) {
+				manager->maxfd = probe;
+				found = ISC_TRUE;
+			}
+			UNLOCK(&manager->fdlock[bucket]);
+		}
+#ifdef ISC_PLATFORM_USETHREADS
+		/* Never let maxfd drop below the read end of the pipe. */
+		if (manager->maxfd < manager->pipe_fds[0])
+			manager->maxfd = manager->pipe_fds[0];
+#endif
+	}
+	UNLOCK(&manager->lock);
+#endif /* USE_SELECT */
+}
+
static void
destroy(isc_socket_t **sockp) {
+ int fd;
isc_socket_t *sock = *sockp;
isc_socketmgr_t *manager = sock->manager;
INSIST(ISC_LIST_EMPTY(sock->recv_list));
INSIST(ISC_LIST_EMPTY(sock->send_list));
INSIST(sock->connect_ev == NULL);
- REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
+ REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
+
+ if (sock->fd >= 0) {
+ fd = sock->fd;
+ sock->fd = -1;
+ closesocket(manager, sock->type, fd);
+ }
LOCK(&manager->lock);
- /*
- * No one has this socket open, so the watcher doesn't have to be
- * poked, and the socket doesn't have to be locked.
- */
- manager->fds[sock->fd] = NULL;
- if (sock->type == isc_sockettype_fdwatch)
- manager->fdstate[sock->fd] = MANAGER_CLOSE_PENDING;
- else
- manager->fdstate[sock->fd] = CLOSE_PENDING;
- select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
ISC_LIST_UNLINK(manager->socklist, sock, link);
#ifdef ISC_PLATFORM_USETHREADS
SIGNAL(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */
- /*
- * XXX should reset manager->maxfd here
- */
-
UNLOCK(&manager->lock);
free_socket(sockp);
}
#endif
-/*%
- * Create a new 'type' socket managed by 'manager'. Events
- * will be posted to 'task' and when dispatched 'action' will be
- * called with 'arg' as the arg value. The new socket is returned
- * in 'socketp'.
- */
-isc_result_t
-isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
- isc_socket_t **socketp)
-{
- isc_socket_t *sock = NULL;
- isc_result_t result;
+static isc_result_t
+opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
+ char strbuf[ISC_STRERRORSIZE];
+ const char *err = "socket";
+ int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
int on = 1;
#endif
ISC_SOCKADDR_LEN_T optlen;
int size;
#endif
- char strbuf[ISC_STRERRORSIZE];
- const char *err = "socket";
- int tries = 0;
- REQUIRE(VALID_MANAGER(manager));
- REQUIRE(socketp != NULL && *socketp == NULL);
-
- result = allocate_socket(manager, type, &sock);
- if (result != ISC_R_SUCCESS)
- return (result);
-
- sock->pf = pf;
again:
- switch (type) {
+ switch (sock->type) {
case isc_sockettype_udp:
- sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
+ sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
break;
case isc_sockettype_tcp:
- sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
+ sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
break;
case isc_sockettype_unix:
- sock->fd = socket(pf, SOCK_STREAM, 0);
+ sock->fd = socket(sock->pf, SOCK_STREAM, 0);
break;
case isc_sockettype_fdwatch:
- INSIST(type != isc_sockettype_fdwatch);
+ INSIST(sock->type != isc_sockettype_fdwatch);
break;
}
if (sock->fd == -1 && errno == EINTR && tries++ < 42)
}
#endif
- if (sock->fd >= (int)FD_SETSIZE) {
+ if (sock->fd >= (int)manager->maxsocks) {
(void)close(sock->fd);
isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
isc_msgcat, ISC_MSGSET_SOCKET,
ISC_MSG_TOOMANYFDS,
"%s: too many open file descriptors", "socket");
- free_socket(&sock);
return (ISC_R_NORESOURCES);
}
if (sock->fd < 0) {
- free_socket(&sock);
-
switch (errno) {
case EMFILE:
case ENFILE:
if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
(void)close(sock->fd);
- free_socket(&sock);
return (ISC_R_UNEXPECTED);
}
#ifdef SO_BSDCOMPAT
RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
clear_bsdcompat) == ISC_R_SUCCESS);
- if (type != isc_sockettype_unix && bsdcompat &&
+ if (sock->type != isc_sockettype_unix && bsdcompat &&
setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
(void *)&on, sizeof(on)) < 0) {
isc__strerror(errno, strbuf, sizeof(strbuf));
#endif
#if defined(USE_CMSG) || defined(SO_RCVBUF)
- if (type == isc_sockettype_udp) {
+ if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
#endif /* SO_TIMESTAMP */
#if defined(ISC_PLATFORM_HAVEIPV6)
- if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
+ if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
/*
* Warn explicitly because this anomaly can be hidden
* in usual operation (and unexpectedly appear later).
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
/* RFC 3542 */
- if ((pf == AF_INET6)
+ if ((sock->pf == AF_INET6)
&& (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
(void *)&on, sizeof(on)) < 0)) {
isc__strerror(errno, strbuf, sizeof(strbuf));
}
#else
/* RFC 2292 */
- if ((pf == AF_INET6)
+ if ((sock->pf == AF_INET6)
&& (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
(void *)&on, sizeof(on)) < 0)) {
isc__strerror(errno, strbuf, sizeof(strbuf));
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#ifdef IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/
/* use minimum MTU */
- if (pf == AF_INET6) {
+ if (sock->pf == AF_INET6) {
(void)setsockopt(sock->fd, IPPROTO_IPV6,
IPV6_USE_MIN_MTU,
(void *)&on, sizeof(on));
}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
+ return (ISC_R_SUCCESS);
+}
+
+/*%
+ * Create a new 'type' socket managed by 'manager'. Events
+ * will be posted to 'task' and when dispatched 'action' will be
+ * called with 'arg' as the arg value. The new socket is returned
+ * in 'socketp'.
+ */
+isc_result_t
+isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
+ isc_socket_t **socketp)
+{
+ isc_socket_t *sock = NULL;
+ isc_result_t result;
+ int lockid;
+
+ REQUIRE(VALID_MANAGER(manager));
+ REQUIRE(socketp != NULL && *socketp == NULL);
+
+ result = allocate_socket(manager, type, &sock);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ sock->pf = pf;
+ result = opensocket(manager, sock);
+ if (result != ISC_R_SUCCESS) {
+ free_socket(&sock);
+ return (result);
+ }
+
memset(sock->name, 0, sizeof(sock->name));
sock->tag = NULL;
sock->references = 1;
*socketp = sock;
- LOCK(&manager->lock);
-
/*
* Note we don't have to lock the socket like we normally would because
* there are no external references to it yet.
*/
+ lockid = FDLOCK_ID(sock->fd);
+ LOCK(&manager->fdlock[lockid]);
manager->fds[sock->fd] = sock;
manager->fdstate[sock->fd] = MANAGED;
+#ifdef USE_DEVPOLL
+ INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
+ sock->manager->fdpollinfo[sock->fd].want_write == 0);
+#endif
+ UNLOCK(&manager->fdlock[lockid]);
+
+ LOCK(&manager->lock);
ISC_LIST_APPEND(manager->socklist, sock, link);
+#ifdef USE_SELECT
if (manager->maxfd < sock->fd)
manager->maxfd = sock->fd;
-
+#endif
UNLOCK(&manager->lock);
socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
return (ISC_R_SUCCESS);
}
+/*
+ * Open a fresh descriptor for a socket object whose descriptor is
+ * currently closed (sock->fd == -1).
+ *
+ * The caller must hold the only reference to 'sock'.  On failure the
+ * descriptor field is reset to -1 and the opensocket() result is returned.
+ * On success the socket is recorded in the manager's fds/fdstate tables as
+ * MANAGED under the per-descriptor lock, and on select() builds
+ * manager->maxfd is raised if the new descriptor exceeds it.
+ */
+isc_result_t
+isc_socket_open(isc_socket_t *sock) {
+	isc_socketmgr_t *manager;
+	isc_result_t result;
+	int fd, lockid;
+
+	REQUIRE(VALID_SOCKET(sock));
+
+	LOCK(&sock->lock);
+	REQUIRE(sock->references == 1);
+	UNLOCK(&sock->lock);
+	/*
+	 * With a single reference nobody else can touch this socket, so the
+	 * socket lock is not held for the rest of the function.
+	 */
+	REQUIRE(sock->fd == -1);
+
+	manager = sock->manager;
+	result = opensocket(manager, sock);
+	if (result != ISC_R_SUCCESS) {
+		/* Leave the socket in the "closed" state on any failure. */
+		sock->fd = -1;
+		return (result);
+	}
+
+	fd = sock->fd;
+	lockid = FDLOCK_ID(fd);
+
+	LOCK(&manager->fdlock[lockid]);
+	manager->fds[fd] = sock;
+	manager->fdstate[fd] = MANAGED;
+#ifdef USE_DEVPOLL
+	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
+	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
+#endif
+	UNLOCK(&manager->fdlock[lockid]);
+
+#ifdef USE_SELECT
+	LOCK(&manager->lock);
+	if (manager->maxfd < fd)
+		manager->maxfd = fd;
+	UNLOCK(&manager->lock);
+#endif
+
+	return (result);
+}
+
/*
* Create a new 'type' socket managed by 'manager'. Events
* will be posted to 'task' and when dispatched 'action' will be
{
isc_socket_t *sock = NULL;
isc_result_t result;
+ int lockid;
REQUIRE(VALID_MANAGER(manager));
REQUIRE(socketp != NULL && *socketp == NULL);
sock->references = 1;
*socketp = sock;
- LOCK(&manager->lock);
-
/*
* Note we don't have to lock the socket like we normally would because
* there are no external references to it yet.
*/
+ lockid = FDLOCK_ID(sock->fd);
+ LOCK(&manager->fdlock[lockid]);
manager->fds[sock->fd] = sock;
manager->fdstate[sock->fd] = MANAGED;
+ UNLOCK(&manager->fdlock[lockid]);
+
+ LOCK(&manager->lock);
ISC_LIST_APPEND(manager->socklist, sock, link);
+#ifdef USE_SELECT
if (manager->maxfd < sock->fd)
manager->maxfd = sock->fd;
-
+#endif
UNLOCK(&manager->lock);
if (flags & ISC_SOCKFDWATCH_READ)
*socketp = NULL;
}
+/*
+ * Close the descriptor of a socket object while keeping the object itself
+ * alive.
+ *
+ * The caller must hold the only reference, the descriptor must be open and
+ * below the manager's maxsocks limit, and no I/O, accept or connect may be
+ * pending or queued.  The listener/connected/connecting/bound flags and the
+ * peer address are reset, sock->fd becomes -1, and the old descriptor is
+ * handed to closesocket().
+ */
+void
+isc_socket_close(isc_socket_t *sock) {
+	isc_socketmgr_t *manager;
+	int oldfd;
+
+	REQUIRE(VALID_SOCKET(sock));
+
+	LOCK(&sock->lock);
+	REQUIRE(sock->references == 1);
+	UNLOCK(&sock->lock);
+	/*
+	 * With a single reference nobody else can touch this socket, so the
+	 * socket lock is not held for the rest of the function.
+	 */
+
+	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
+
+	INSIST(!sock->connecting);
+	INSIST(!sock->pending_recv);
+	INSIST(!sock->pending_send);
+	INSIST(!sock->pending_accept);
+	INSIST(ISC_LIST_EMPTY(sock->recv_list));
+	INSIST(ISC_LIST_EMPTY(sock->send_list));
+	INSIST(ISC_LIST_EMPTY(sock->accept_list));
+	INSIST(sock->connect_ev == NULL);
+
+	manager = sock->manager;
+
+	/* Detach the descriptor from the object before closing it. */
+	oldfd = sock->fd;
+	sock->fd = -1;
+
+	/* Return the per-connection state to its initial values. */
+	sock->listener = 0;
+	sock->connected = 0;
+	sock->connecting = 0;
+	sock->bound = 0;
+	isc_sockaddr_any(&sock->peer_address);
+
+	closesocket(manager, sock->type, oldfd);
+}
+
/*
* I/O is possible on a given socket. Schedule an event to this task that
* will call an internal function to do the I/O. This will charge the
isc_socketevent_t *ev;
isc_task_t *sender;
+#if 0
+ /*
+ * XXXJT: this assertion seems too strong, but leave it here for
+ * reference.
+ */
INSIST(!sock->pending_recv);
+#endif
+ if (sock->pending_recv != 0)
+ return;
if (sock->type != isc_sockettype_fdwatch) {
ev = ISC_LIST_HEAD(sock->recv_list);
sock->pf);
(void)close(fd);
goto soft_error;
- } else if (fd >= (int)FD_SETSIZE) {
+ } else if (fd >= (int)manager->maxsocks) {
isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
isc_msgcat, ISC_MSGSET_SOCKET,
* -1 means the new socket didn't happen.
*/
if (fd != -1) {
+ int lockid = FDLOCK_ID(fd);
+
+ LOCK(&manager->fdlock[lockid]);
+ manager->fds[fd] = dev->newsocket;
+ manager->fdstate[fd] = MANAGED;
+ UNLOCK(&manager->fdlock[lockid]);
+
LOCK(&manager->lock);
ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
*/
dev->address = dev->newsocket->peer_address;
- manager->fds[fd] = dev->newsocket;
- manager->fdstate[fd] = MANAGED;
+#ifdef USE_SELECT
if (manager->maxfd < fd)
manager->maxfd = fd;
+#endif
socket_log(sock, &dev->newsocket->peer_address, CREATION,
isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
UNLOCK(&sock->lock);
}
+/*
+ * Process read/writes on each fd here. Avoid locking
+ * and unlocking twice if both reads and writes are possible.
+ */
static void
-process_fds(isc_socketmgr_t *manager, int maxfd,
- fd_set *readfds, fd_set *writefds)
+process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
+ isc_boolean_t writeable)
{
- int i;
isc_socket_t *sock;
isc_boolean_t unlock_sock;
-
- REQUIRE(maxfd <= (int)FD_SETSIZE);
+ isc_boolean_t needclose;
+ int lockid = FDLOCK_ID(fd);
/*
- * Process read/writes on other fds here. Avoid locking
- * and unlocking twice if both reads and writes are possible.
+ * If we need to close the socket, do it now.
*/
- for (i = 0; i < maxfd; i++) {
-#ifdef ISC_PLATFORM_USETHREADS
- if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
- continue;
-#endif /* ISC_PLATFORM_USETHREADS */
+ LOCK(&manager->fdlock[lockid]);
+ if (manager->fdstate[fd] == CLOSE_PENDING
+ || manager->fdstate[fd] == MANAGER_CLOSE_PENDING) {
+ needclose = ISC_TF(manager->fdstate[fd] == CLOSE_PENDING);
+ manager->fdstate[fd] = CLOSED;
+ UNLOCK(&manager->fdlock[lockid]);
+
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ if (needclose)
+ (void)close(fd);
+ return;
+ }
+
+ sock = manager->fds[fd];
+ UNLOCK(&manager->fdlock[lockid]);
+ unlock_sock = ISC_FALSE;
+ if (readable) {
+ if (sock == NULL) {
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ goto check_write;
+ }
+ unlock_sock = ISC_TRUE;
+ LOCK(&sock->lock);
+ if (!SOCK_DEAD(sock)) {
+ if (sock->listener)
+ dispatch_accept(sock);
+ else
+ dispatch_recv(sock);
+ }
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ }
+check_write:
+ if (writeable) {
+ if (sock == NULL) {
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ return;
+ }
+ if (!unlock_sock) {
+ unlock_sock = ISC_TRUE;
+ LOCK(&sock->lock);
+ }
+ if (!SOCK_DEAD(sock)) {
+ if (sock->connecting)
+ dispatch_connect(sock);
+ else
+ dispatch_send(sock);
+ }
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ }
+ if (unlock_sock)
+ UNLOCK(&sock->lock);
+}
+
+#ifdef USE_KQUEUE
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
+ int i;
+ isc_boolean_t readable, writable;
+ isc_boolean_t done = ISC_FALSE;
+ if (nevents == manager->nevents) {
/*
- * If we need to close the socket, do it now.
+ * This is not an error, but something unexpected. If this
+ * happens, it may indicate the need for increasing
+ * ISC_SOCKET_MAXEVENTS.
*/
- if (manager->fdstate[i] == CLOSE_PENDING
- || manager->fdstate[i] == MANAGER_CLOSE_PENDING) {
- FD_CLR(i, &manager->read_fds);
- FD_CLR(i, &manager->write_fds);
- if (manager->fdstate[i] == CLOSE_PENDING)
- (void)close(i);
- manager->fdstate[i] = CLOSED;
+ manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+ "maximum number of FD events (%d) received",
+ nevents);
+ }
+
+ for (i = 0; i < nevents; i++) {
+ REQUIRE(events[i].ident < manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+ if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
+ done = process_ctlfd(manager);
continue;
}
+#endif
+ readable = ISC_TF(events[i].filter == EVFILT_READ);
+ writable = ISC_TF(events[i].filter == EVFILT_WRITE);
+ process_fd(manager, events[i].ident, readable, writable);
+ }
- sock = manager->fds[i];
- unlock_sock = ISC_FALSE;
- if (FD_ISSET(i, readfds)) {
- if (sock == NULL) {
- FD_CLR(i, &manager->read_fds);
- goto check_write;
- }
- unlock_sock = ISC_TRUE;
- LOCK(&sock->lock);
- if (!SOCK_DEAD(sock)) {
- if (sock->listener)
- dispatch_accept(sock);
- else
- dispatch_recv(sock);
- }
- FD_CLR(i, &manager->read_fds);
+ return (done);
+}
+#elif defined(USE_EPOLL)
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
+ int i;
+ isc_boolean_t done = ISC_FALSE;
+
+ if (nevents == manager->nevents) {
+ manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+ "maximum number of FD events (%d) received",
+ nevents);
+ }
+
+ for (i = 0; i < nevents; i++) {
+ REQUIRE(events[i].data.fd < (int)manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+ if (events[i].data.fd == manager->pipe_fds[0]) {
+ done = process_ctlfd(manager);
+ continue;
}
- check_write:
- if (FD_ISSET(i, writefds)) {
- if (sock == NULL) {
- FD_CLR(i, &manager->write_fds);
- continue;
- }
- if (!unlock_sock) {
- unlock_sock = ISC_TRUE;
- LOCK(&sock->lock);
- }
- if (!SOCK_DEAD(sock)) {
- if (sock->connecting)
- dispatch_connect(sock);
- else
- dispatch_send(sock);
- }
- FD_CLR(i, &manager->write_fds);
+#endif
+ if ((events[i].events & EPOLLERR) != 0 ||
+ (events[i].events & EPOLLHUP) != 0) {
+ /*
+ * epoll does not set IN/OUT bits on an erroneous
+ * condition, so we need to try both anyway. This is a
+ * bit inefficient, but should be okay for such rare
+ * events. Note also that the read or write attempt
+ * won't block because we use non-blocking sockets.
+ */
+ events[i].events |= (EPOLLIN | EPOLLOUT);
}
- if (unlock_sock)
- UNLOCK(&sock->lock);
+ process_fd(manager, events[i].data.fd,
+ (events[i].events & EPOLLIN) != 0,
+ (events[i].events & EPOLLOUT) != 0);
+ }
+
+ return (done);
+}
+#elif defined(USE_DEVPOLL)
+/*
+ * /dev/poll variant: walk the 'nevents' entries returned in 'events'.
+ * An event on the read end of the control pipe is handled by
+ * process_ctlfd(), whose result becomes the return value; every other
+ * entry is passed to process_fd() with its POLLIN/POLLOUT status.
+ */
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
+	isc_boolean_t done = ISC_FALSE;
+	isc_boolean_t readable, writable;
+	int i;
+
+	/* A completely full event array is logged, not treated as an error. */
+	if (nevents == manager->nevents) {
+		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+			    "maximum number of FD events (%d) received",
+			    nevents);
+	}
+
+	i = 0;
+	while (i < nevents) {
+		REQUIRE(events[i].fd < (int)manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+		if (events[i].fd == manager->pipe_fds[0]) {
+			done = process_ctlfd(manager);
+			i++;
+			continue;
+		}
+#endif
+		readable = (events[i].events & POLLIN) != 0;
+		writable = (events[i].events & POLLOUT) != 0;
+		process_fd(manager, events[i].fd, readable, writable);
+		i++;
+	}
+
+	return (done);
+}
+#elif defined(USE_SELECT)
+static void
+process_fds(isc_socketmgr_t *manager, int maxfd,
+ fd_set *readfds, fd_set *writefds)
+{
+ int i;
+
+ REQUIRE(maxfd <= (int)manager->maxsocks);
+
+ for (i = 0; i < maxfd; i++) {
+#ifdef ISC_PLATFORM_USETHREADS
+ if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
+ continue;
+#endif /* ISC_PLATFORM_USETHREADS */
+ process_fd(manager, i, FD_ISSET(i, readfds),
+ FD_ISSET(i, writefds));
}
}
+#endif
#ifdef ISC_PLATFORM_USETHREADS
+/*
+ * Consume messages read via select_readmsg() until none are left.
+ *
+ * Returns ISC_TRUE as soon as a SELECT_POKE_SHUTDOWN message is seen and
+ * ISC_FALSE once SELECT_POKE_NOTHING is returned.  Any other message is
+ * forwarded, together with its descriptor, to wakeup_socket().
+ */
+static isc_boolean_t
+process_ctlfd(isc_socketmgr_t *manager) {
+	isc_boolean_t stop = ISC_FALSE;
+	isc_boolean_t more = ISC_TRUE;
+	int poke_msg, poke_fd;
+
+	while (more) {
+		select_readmsg(manager, &poke_fd, &poke_msg);
+
+		manager_log(manager, IOEVENT,
+			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
+					   ISC_MSG_WATCHERMSG,
+					   "watcher got message %d "
+					   "for socket %d"), poke_msg, poke_fd);
+
+		if (poke_msg == SELECT_POKE_NOTHING) {
+			/* Nothing more to read. */
+			more = ISC_FALSE;
+		} else if (poke_msg == SELECT_POKE_SHUTDOWN) {
+			/* Shutdown request: report it to the caller. */
+			stop = ISC_TRUE;
+			more = ISC_FALSE;
+		} else {
+			/*
+			 * A wakeup for one socket: let wakeup_socket()
+			 * decide whether the descriptor must be watched
+			 * for reading and/or writing.
+			 */
+			wakeup_socket(manager, poke_fd, poke_msg);
+		}
+	}
+
+	return (stop);
+}
+
/*
* This is the thread that will loop forever, always in a select or poll
* call.
isc_boolean_t done;
int ctlfd;
int cc;
+#ifdef USE_KQUEUE
+ const char *fnname = "kevent()";
+#elif defined (USE_EPOLL)
+ const char *fnname = "epoll_wait()";
+#elif defined(USE_DEVPOLL)
+ const char *fnname = "ioctl(DP_POLL)";
+ struct dvpoll dvp;
+#elif defined (USE_SELECT)
+ const char *fnname = "select()";
fd_set readfds;
fd_set writefds;
- int msg, fd;
int maxfd;
+#endif
char strbuf[ISC_STRERRORSIZE];
/*
* Get the control fd here. This will never change.
*/
- LOCK(&manager->lock);
ctlfd = manager->pipe_fds[0];
-
done = ISC_FALSE;
while (!done) {
do {
+#ifdef USE_KQUEUE
+ cc = kevent(manager->kqueue_fd, NULL, 0,
+ manager->events, manager->nevents, NULL);
+#elif defined(USE_EPOLL)
+ cc = epoll_wait(manager->epoll_fd, manager->events,
+ manager->nevents, -1);
+#elif defined(USE_DEVPOLL)
+ dvp.dp_fds = manager->events;
+ dvp.dp_nfds = manager->nevents;
+ dvp.dp_timeout = -1;
+ cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
+#elif defined(USE_SELECT)
+ LOCK(&manager->lock);
readfds = manager->read_fds;
writefds = manager->write_fds;
maxfd = manager->maxfd + 1;
-
UNLOCK(&manager->lock);
cc = select(maxfd, &readfds, &writefds, NULL, NULL);
- if (cc < 0) {
- if (!SOFT_ERROR(errno)) {
- isc__strerror(errno, strbuf,
- sizeof(strbuf));
- FATAL_ERROR(__FILE__, __LINE__,
- "select() %s: %s",
- isc_msgcat_get(isc_msgcat,
- ISC_MSGSET_GENERAL,
- ISC_MSG_FAILED,
- "failed"),
- strbuf);
- }
- }
+#endif /* USE_KQUEUE */
- LOCK(&manager->lock);
+ if (cc < 0 && !SOFT_ERROR(errno)) {
+ isc__strerror(errno, strbuf, sizeof(strbuf));
+ FATAL_ERROR(__FILE__, __LINE__,
+ "%s %s: %s", fnname,
+ isc_msgcat_get(isc_msgcat,
+ ISC_MSGSET_GENERAL,
+ ISC_MSG_FAILED,
+ "failed"), strbuf);
+ }
} while (cc < 0);
-
+#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
+ done = process_fds(manager, manager->events, cc);
+#elif defined(USE_SELECT)
/*
* Process reads on internal, control fd.
*/
- if (FD_ISSET(ctlfd, &readfds)) {
- for (;;) {
- select_readmsg(manager, &fd, &msg);
-
- manager_log(manager, IOEVENT,
- isc_msgcat_get(isc_msgcat,
- ISC_MSGSET_SOCKET,
- ISC_MSG_WATCHERMSG,
- "watcher got message %d"
- " for socket %d"),
- msg, fd);
-
- /*
- * Nothing to read?
- */
- if (msg == SELECT_POKE_NOTHING)
- break;
-
- /*
- * Handle shutdown message. We really should
- * jump out of this loop right away, but
- * it doesn't matter if we have to do a little
- * more work first.
- */
- if (msg == SELECT_POKE_SHUTDOWN) {
- done = ISC_TRUE;
-
- break;
- }
-
- /*
- * This is a wakeup on a socket. Look
- * at the event queue for both read and write,
- * and decide if we need to watch on it now
- * or not.
- */
- wakeup_socket(manager, fd, msg);
- }
- }
+ if (FD_ISSET(ctlfd, &readfds))
+ done = process_ctlfd(manager);
process_fds(manager, maxfd, &readfds, &writefds);
+#endif
}
manager_log(manager, TRACE,
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_EXITING, "watcher exiting"));
- UNLOCK(&manager->lock);
return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
/*
* Create a new socket manager.
*/
+
+/*
+ * Initialize the event-notification backend chosen at compile time
+ * (kqueue, epoll, /dev/poll or select) for 'manager'.
+ *
+ * For kqueue, epoll and /dev/poll this allocates manager->events with
+ * ISC_SOCKET_MAXEVENTS entries from 'mctx' and opens the backend
+ * descriptor; /dev/poll additionally allocates and zeroes
+ * manager->fdpollinfo with maxsocks entries.  In threaded builds the read
+ * end of the control pipe is registered with watch_fd().  For select the
+ * read/write fd sets are cleared and manager->maxfd is initialized.
+ *
+ * Returns ISC_R_SUCCESS, ISC_R_NOMEMORY, the translated errno of the
+ * failed open call, or the watch_fd() error.  On failure everything
+ * acquired here has been released again.
+ */
+static isc_result_t
+setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
+	isc_result_t result;
+
+#ifdef USE_KQUEUE
+	manager->nevents = ISC_SOCKET_MAXEVENTS;
+	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
+				      manager->nevents);
+	if (manager->events == NULL)
+		return (ISC_R_NOMEMORY);
+	manager->kqueue_fd = kqueue();
+	if (manager->kqueue_fd == -1) {
+		result = isc__errno2result(errno);
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct kevent) * manager->nevents);
+		return (result);
+	}
+
+#ifdef ISC_PLATFORM_USETHREADS
+	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+	if (result != ISC_R_SUCCESS) {
+		close(manager->kqueue_fd);
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct kevent) * manager->nevents);
+		return (result);
+	}
+#endif	/* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_EPOLL)
+	manager->nevents = ISC_SOCKET_MAXEVENTS;
+	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
+				      manager->nevents);
+	if (manager->events == NULL)
+		return (ISC_R_NOMEMORY);
+	manager->epoll_fd = epoll_create(manager->nevents);
+	if (manager->epoll_fd == -1) {
+		result = isc__errno2result(errno);
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct epoll_event) * manager->nevents);
+		return (result);
+	}
+#ifdef ISC_PLATFORM_USETHREADS
+	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+	if (result != ISC_R_SUCCESS) {
+		close(manager->epoll_fd);
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct epoll_event) * manager->nevents);
+		return (result);
+	}
+#endif	/* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_DEVPOLL)
+	/*
+	 * XXXJT: /dev/poll seems to reject large numbers of events,
+	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
+	 */
+	manager->nevents = ISC_SOCKET_MAXEVENTS;
+	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
+				      manager->nevents);
+	if (manager->events == NULL)
+		return (ISC_R_NOMEMORY);
+	/*
+	 * Note: fdpollinfo should be able to support all possible FDs, so
+	 * it must have maxsocks entries (not nevents).
+	 */
+	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
+					  manager->maxsocks);
+	if (manager->fdpollinfo == NULL) {
+		/*
+		 * 'events' was allocated as nevents pollfd entries, so it
+		 * must be released with exactly that size (not with the
+		 * size of the fdpollinfo array that failed to allocate).
+		 */
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct pollfd) * manager->nevents);
+		return (ISC_R_NOMEMORY);
+	}
+	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
+	manager->devpoll_fd = open("/dev/poll", O_RDWR);
+	if (manager->devpoll_fd == -1) {
+		result = isc__errno2result(errno);
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct pollfd) * manager->nevents);
+		isc_mem_put(mctx, manager->fdpollinfo,
+			    sizeof(pollinfo_t) * manager->maxsocks);
+		return (result);
+	}
+#ifdef ISC_PLATFORM_USETHREADS
+	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+	if (result != ISC_R_SUCCESS) {
+		close(manager->devpoll_fd);
+		isc_mem_put(mctx, manager->events,
+			    sizeof(struct pollfd) * manager->nevents);
+		isc_mem_put(mctx, manager->fdpollinfo,
+			    sizeof(pollinfo_t) * manager->maxsocks);
+		return (result);
+	}
+#endif	/* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_SELECT)
+	UNUSED(mctx);
+	UNUSED(result);
+
+	FD_ZERO(&manager->read_fds);
+	FD_ZERO(&manager->write_fds);
+#ifdef ISC_PLATFORM_USETHREADS
+	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+	manager->maxfd = manager->pipe_fds[0];
+#else /* ISC_PLATFORM_USETHREADS */
+	manager->maxfd = 0;
+#endif /* ISC_PLATFORM_USETHREADS */
+#endif	/* USE_KQUEUE */
+
+	return (ISC_R_SUCCESS);
+}
+
+/*
+ * Release what setup_watcher() acquired for the compiled-in backend.
+ *
+ * In threaded builds the read end of the control pipe is first removed
+ * with unwatch_fd(); a failure there is only reported.  Then the backend
+ * descriptor is closed and manager->events (plus manager->fdpollinfo for
+ * /dev/poll) is returned to 'mctx'.  The select backend has nothing to
+ * free.
+ */
+static void
+cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
+#ifdef ISC_PLATFORM_USETHREADS
+	isc_result_t result;
+
+	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+	if (result != ISC_R_SUCCESS) {
+		/*
+		 * This code is compiled for every backend, so name the
+		 * call that actually failed rather than an epoll-specific
+		 * system call.
+		 */
+		UNEXPECTED_ERROR(__FILE__, __LINE__,
+				 "unwatch_fd() %s",
+				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
+						ISC_MSG_FAILED, "failed"));
+	}
+#endif /* ISC_PLATFORM_USETHREADS */
+
+#ifdef USE_KQUEUE
+	close(manager->kqueue_fd);
+	isc_mem_put(mctx, manager->events,
+		    sizeof(struct kevent) * manager->nevents);
+#elif defined(USE_EPOLL)
+	close(manager->epoll_fd);
+	isc_mem_put(mctx, manager->events,
+		    sizeof(struct epoll_event) * manager->nevents);
+#elif defined(USE_DEVPOLL)
+	close(manager->devpoll_fd);
+	isc_mem_put(mctx, manager->events,
+		    sizeof(struct pollfd) * manager->nevents);
+	isc_mem_put(mctx, manager->fdpollinfo,
+		    sizeof(pollinfo_t) * manager->maxsocks);
+#elif defined(USE_SELECT)
+	UNUSED(mctx);
+	UNUSED(manager);
+#endif	/* USE_KQUEUE */
+}
+
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
+ int i;
isc_socketmgr_t *manager;
#ifdef ISC_PLATFORM_USETHREADS
char strbuf[ISC_STRERRORSIZE];
#endif
isc_result_t result;
-
REQUIRE(managerp != NULL && *managerp == NULL);
#ifndef ISC_PLATFORM_USETHREADS
if (manager == NULL)
return (ISC_R_NOMEMORY);
+ /* zero-clear so that necessary cleanup on failure will be easy */
+ memset(manager, 0, sizeof(*manager));
+
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+ manager->maxsocks = ISC_SOCKET_MAXSOCKETS;
+#elif defined (USE_SELECT)
+ manager->maxsocks = FD_SETSIZE;
+#endif
+
+ manager->fds = isc_mem_get(mctx,
+ manager->maxsocks * sizeof(isc_socket_t *));
+ if (manager->fds == NULL) {
+ result = ISC_R_NOMEMORY;
+ goto free_manager;
+ }
+	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
+	/*
+	 * Check the array that was just allocated (fdstate), not fds again;
+	 * otherwise a failed allocation would go unnoticed here.
+	 */
+	if (manager->fdstate == NULL) {
+		result = ISC_R_NOMEMORY;
+		goto free_manager;
+	}
+
manager->magic = SOCKET_MANAGER_MAGIC;
manager->mctx = NULL;
- memset(manager->fds, 0, sizeof(manager->fds));
+ memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
ISC_LIST_INIT(manager->socklist);
result = isc_mutex_init(&manager->lock);
- if (result != ISC_R_SUCCESS) {
- isc_mem_put(mctx, manager, sizeof(*manager));
- return (result);
+ if (result != ISC_R_SUCCESS)
+ goto free_manager;
+ manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
+ if (manager->fdlock == NULL) {
+ result = ISC_R_NOMEMORY;
+ goto cleanup_lock;
}
+ for (i = 0; i < FDLOCK_COUNT; i++) {
+ result = isc_mutex_init(&manager->fdlock[i]);
+ if (result != ISC_R_SUCCESS) {
+ while (--i >= 0)
+ DESTROYLOCK(&manager->fdlock[i]);
+ isc_mem_put(mctx, manager->fdlock,
+ FDLOCK_COUNT * sizeof(isc_mutex_t));
+ manager->fdlock = NULL;
+ goto cleanup_lock;
+ }
+ }
+
#ifdef ISC_PLATFORM_USETHREADS
if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
- DESTROYLOCK(&manager->lock);
- isc_mem_put(mctx, manager, sizeof(*manager));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"isc_condition_init() %s",
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_FAILED, "failed"));
- return (ISC_R_UNEXPECTED);
+ result = ISC_R_UNEXPECTED;
+ goto cleanup_lock;
}
/*
* select/poll loop when something internal needs to be done.
*/
if (pipe(manager->pipe_fds) != 0) {
- DESTROYLOCK(&manager->lock);
- isc_mem_put(mctx, manager, sizeof(*manager));
isc__strerror(errno, strbuf, sizeof(strbuf));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"pipe() %s: %s",
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_FAILED, "failed"),
strbuf);
-
- return (ISC_R_UNEXPECTED);
+ result = ISC_R_UNEXPECTED;
+ goto cleanup_condition;
}
RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
/*
* Set up initial state for the select loop
*/
- FD_ZERO(&manager->read_fds);
- FD_ZERO(&manager->write_fds);
-#ifdef ISC_PLATFORM_USETHREADS
- FD_SET(manager->pipe_fds[0], &manager->read_fds);
- manager->maxfd = manager->pipe_fds[0];
-#else /* ISC_PLATFORM_USETHREADS */
- manager->maxfd = 0;
-#endif /* ISC_PLATFORM_USETHREADS */
- memset(manager->fdstate, 0, sizeof(manager->fdstate));
+ result = setup_watcher(mctx, manager);
+ if (result != ISC_R_SUCCESS)
+ goto cleanup;
+ memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
#ifdef ISC_PLATFORM_USETHREADS
/*
*/
if (isc_thread_create(watcher, manager, &manager->watcher) !=
ISC_R_SUCCESS) {
- (void)close(manager->pipe_fds[0]);
- (void)close(manager->pipe_fds[1]);
- DESTROYLOCK(&manager->lock);
- isc_mem_put(mctx, manager, sizeof(*manager));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"isc_thread_create() %s",
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_FAILED, "failed"));
- return (ISC_R_UNEXPECTED);
+ cleanup_watcher(mctx, manager);
+ result = ISC_R_UNEXPECTED;
+ goto cleanup;
}
#endif /* ISC_PLATFORM_USETHREADS */
isc_mem_attach(mctx, &manager->mctx);
*managerp = manager;
return (ISC_R_SUCCESS);
+
+cleanup:
+#ifdef ISC_PLATFORM_USETHREADS
+ (void)close(manager->pipe_fds[0]);
+ (void)close(manager->pipe_fds[1]);
+#endif /* ISC_PLATFORM_USETHREADS */
+
+#ifdef ISC_PLATFORM_USETHREADS
+cleanup_condition:
+ (void)isc_condition_destroy(&manager->shutdown_ok);
+#endif /* ISC_PLATFORM_USETHREADS */
+
+
+cleanup_lock:
+ if (manager->fdlock != NULL) {
+ for (i = 0; i < FDLOCK_COUNT; i++)
+ DESTROYLOCK(&manager->fdlock[i]);
+ }
+ DESTROYLOCK(&manager->lock);
+
+free_manager:
+ if (manager->fdlock != NULL) {
+ isc_mem_put(mctx, manager->fdlock,
+ FDLOCK_COUNT * sizeof(isc_mutex_t));
+ }
+ if (manager->fdstate != NULL) {
+ isc_mem_put(mctx, manager->fdstate,
+ manager->maxsocks * sizeof(int));
+ }
+ if (manager->fds != NULL) {
+ isc_mem_put(mctx, manager->fds,
+ manager->maxsocks * sizeof(isc_socket_t *));
+ }
+ isc_mem_put(mctx, manager, sizeof(*manager));
+
+ return (result);
}
void
/*
* Clean up.
*/
+ cleanup_watcher(manager->mctx, manager);
+
#ifdef ISC_PLATFORM_USETHREADS
(void)close(manager->pipe_fds[0]);
(void)close(manager->pipe_fds[1]);
(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */
- for (i = 0; i < (int)FD_SETSIZE; i++)
- if (manager->fdstate[i] == CLOSE_PENDING)
+ for (i = 0; i < (int)manager->maxsocks; i++)
+ if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
(void)close(i);
+ isc_mem_put(manager->mctx, manager->fds,
+ manager->maxsocks * sizeof(isc_socket_t *));
+ isc_mem_put(manager->mctx, manager->fdstate,
+ manager->maxsocks * sizeof(int));
+
+ if (manager->fdlock != NULL) {
+ for (i = 0; i < FDLOCK_COUNT; i++)
+ DESTROYLOCK(&manager->fdlock[i]);
+ isc_mem_put(manager->mctx, manager->fdlock,
+ FDLOCK_COUNT * sizeof(isc_mutex_t));
+ }
DESTROYLOCK(&manager->lock);
manager->magic = 0;
mctx= manager->mctx;
}
#ifndef ISC_PLATFORM_USETHREADS
-void
-isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
+/* In our assumed scenario, we can simply use a single static object. */
+static isc_socketwait_t swait_private;
+
+int
+isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
+ int n;
+#ifdef USE_KQUEUE
+ struct timespec ts, *tsp;
+#endif
+#ifdef USE_EPOLL
+ int timeout;
+#endif
+#ifdef USE_DEVPOLL
+ struct dvpoll dvp;
+#endif
+
+ REQUIRE(swaitp != NULL && *swaitp == NULL);
+
if (socketmgr == NULL)
- *maxfd = 0;
- else {
- *readset = socketmgr->read_fds;
- *writeset = socketmgr->write_fds;
- *maxfd = socketmgr->maxfd + 1;
- }
+ return (0);
+
+#ifdef USE_KQUEUE
+ if (tvp != NULL) {
+ ts.tv_sec = tvp->tv_sec;
+ ts.tv_nsec = tvp->tv_usec * 1000;
+ tsp = &ts;
+ } else
+ tsp = NULL;
+ swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
+ socketmgr->events, socketmgr->nevents,
+ tsp);
+ n = swait_private.nevents;
+#elif defined(USE_EPOLL)
+ if (tvp != NULL)
+ timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
+ else
+ timeout = -1;
+ swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
+ socketmgr->events,
+ socketmgr->nevents, timeout);
+ n = swait_private.nevents;
+#elif defined(USE_DEVPOLL)
+ dvp.dp_fds = socketmgr->events;
+ dvp.dp_nfds = socketmgr->nevents;
+ if (tvp != NULL) {
+ dvp.dp_timeout = tvp->tv_sec * 1000 +
+ (tvp->tv_usec + 999) / 1000;
+ } else
+ dvp.dp_timeout = -1;
+ swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
+ n = swait_private.nevents;
+#elif defined(USE_SELECT)
+ swait_private.readset = socketmgr->read_fds;
+ swait_private.writeset = socketmgr->write_fds;
+ swait_private.maxfd = socketmgr->maxfd + 1;
+
+ n = select(swait_private.maxfd, &swait_private.readset,
+ &swait_private.writeset, NULL, tvp);
+#endif
+
+ *swaitp = &swait_private;
+ return (n);
}
isc_result_t
-isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
- isc_socketmgr_t *manager = socketmgr;
+isc__socketmgr_dispatch(isc_socketwait_t *swait) {
+ REQUIRE(swait == &swait_private);
- if (manager == NULL)
+ if (socketmgr == NULL)
return (ISC_R_NOTFOUND);
- process_fds(manager, maxfd, readset, writeset);
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+ (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
return (ISC_R_SUCCESS);
+#elif defined(USE_SELECT)
+ process_fds(socketmgr, swait->maxfd, &swait->readset, &swait->writeset);
+ return (ISC_R_SUCCESS);
+#endif
}
#endif /* ISC_PLATFORM_USETHREADS */
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: socket_p.h,v 1.11 2007/06/19 23:47:18 tbox Exp $ */
+/* $Id: socket_p.h,v 1.11.128.1 2008/06/24 00:09:12 jinmei Exp $ */
#ifndef ISC_SOCKET_P_H
#define ISC_SOCKET_P_H
#include <sys/select.h>
#endif
-void
-isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd);
-
-isc_result_t
-isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd);
-
+typedef struct isc_socketwait isc_socketwait_t; /* forward declaration only: this line does not define the struct's members */
+int isc__socketmgr_waitevents(struct timeval *, isc_socketwait_t **); /* presumably waits for socket events with an optional timeout and yields a wait object through the second argument -- confirm against socket.c */
+isc_result_t isc__socketmgr_dispatch(isc_socketwait_t *); /* takes a wait object of the type declared above */
#endif /* ISC_SOCKET_P_H */
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: namedconf.c,v 1.78.46.7 2008/05/27 22:36:11 each Exp $ */
+/* $Id: namedconf.c,v 1.78.46.8 2008/06/24 00:09:12 jinmei Exp $ */
/*! \file */
/*%
* Port list.
*/
+static cfg_tuplefielddef_t porttuple_fields[] = { /* fields of a port range: parse_portrange() fills index 0 as the low port and index 1 as the high port */
+ { "loport", &cfg_type_uint32, 0 },
+ { "hiport", &cfg_type_uint32, 0 },
+ { NULL, NULL, 0 } /* all-NULL entry; presumably the end-of-list marker -- confirm against cfg_tuplefielddef_t users */
+};
+static cfg_type_t cfg_type_porttuple = { /* tuple type passed to cfg_create_tuple() by parse_portrange() */
+ "porttuple", cfg_parse_tuple, cfg_print_tuple, cfg_doc_tuple,
+ &cfg_rep_tuple, porttuple_fields
+};
+/*%
+ * Parse a single port number into *ret.
+ *
+ * The value is read with cfg_parse_uint32().  A value above 0xffff is
+ * reported as an "invalid port" parser error; in that case the object in
+ * *ret is destroyed and ISC_R_RANGE is returned.  Otherwise 'result' is
+ * whatever the CHECK() call left in it (CHECK is defined outside this
+ * excerpt; presumably it jumps to the cleanup label on failure).
+ */
static isc_result_t
-parse_port(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) {
+parse_port(cfg_parser_t *pctx, cfg_obj_t **ret) {
isc_result_t result;
- UNUSED(type);
-
CHECK(cfg_parse_uint32(pctx, NULL, ret));
if ((*ret)->value.uint32 > 0xffff) {
cfg_parser_error(pctx, CFG_LOG_NEAR, "invalid port");
cfg_obj_destroy(pctx, ret);
result = ISC_R_RANGE;
}
+
+ cleanup:
+ return (result);
+}
+/*%
+ * Parse one element of a port list: either a bare port number or the
+ * word "range" (compared case-insensitively) followed by two ports.
+ *
+ * A bare number is parsed by parse_port() straight into *ret.  For a
+ * range, a cfg_type_porttuple object is created, its two fields are filled
+ * by parse_port(), and the tuple is stored in *ret.
+ *
+ * Returns ISC_R_UNEXPECTEDTOKEN when the token is neither a number nor
+ * "range", and ISC_R_RANGE when the first port of a range is larger than
+ * the second.  The 'type' argument is not used.
+ */
+static isc_result_t
+parse_portrange(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) {
+ isc_result_t result;
+ cfg_obj_t *obj = NULL; /* range tuple under construction; stays NULL on the bare-number path */
+
+ UNUSED(type);
+ /*
+  * Inspect the next token via cfg_peektoken() to choose the form: the
+  * number branch lets parse_port() read it, the other branch fetches it
+  * with cfg_gettoken().
+  */
+ CHECK(cfg_peektoken(pctx, ISC_LEXOPT_NUMBER | ISC_LEXOPT_CNUMBER));
+ if (pctx->token.type == isc_tokentype_number)
+ CHECK(parse_port(pctx, ret));
+ else {
+ CHECK(cfg_gettoken(pctx, 0));
+ if (pctx->token.type != isc_tokentype_string ||
+ strcasecmp(TOKEN_STRING(pctx), "range") != 0) {
+ cfg_parser_error(pctx, CFG_LOG_NEAR,
+ "expected integer or 'range'");
+ return (ISC_R_UNEXPECTEDTOKEN); /* obj is still NULL here, so skipping cleanup releases nothing */
+ }
+ CHECK(cfg_create_tuple(pctx, &cfg_type_porttuple, &obj));
+ CHECK(parse_port(pctx, &obj->value.tuple[0])); /* low port */
+ CHECK(parse_port(pctx, &obj->value.tuple[1])); /* high port */
+ if (obj->value.tuple[0]->value.uint32 >
+ obj->value.tuple[1]->value.uint32) {
+ cfg_parser_error(pctx, CFG_LOG_NOPREP,
+ "low port '%u' must not be larger "
+ "than high port",
+ obj->value.tuple[0]->value.uint32);
+ result = ISC_R_RANGE;
+ goto cleanup;
+ }
+ *ret = obj; /* hand the finished tuple to the caller */
+ obj = NULL; /* cleared so the cleanup code below does not destroy it */
+ }
+
cleanup:
+ if (obj != NULL) /* a tuple was created but never handed to the caller */
+ cfg_obj_destroy(pctx, &obj);
return (result);
}
-static cfg_type_t cfg_type_port = {
- "port", parse_port, NULL, cfg_doc_terminal,
+static cfg_type_t cfg_type_portrange = { /* element type whose parser is parse_portrange(): a single port or a range */
+ "portrange", parse_portrange, NULL, cfg_doc_terminal,
NULL, NULL
};
static cfg_type_t cfg_type_bracketed_portlist = {
- "bracketed_sockaddrlist", cfg_parse_bracketed_list, cfg_print_bracketed_list, cfg_doc_bracketed_list,
- &cfg_rep_list, &cfg_type_port
+ "bracketed_sockaddrlist", cfg_parse_bracketed_list, /* NOTE(review): name string says sockaddrlist although the elements are port ranges -- confirm this is intended */
+ cfg_print_bracketed_list, cfg_doc_bracketed_list,
+ &cfg_rep_list, &cfg_type_portrange /* final member names cfg_type_portrange, the type parsed by parse_portrange() */
};
/*%
*/
static cfg_clausedef_t
options_clauses[] = {
+ { "use-v4-udp-ports", &cfg_type_bracketed_portlist, 0 },
+ { "use-v6-udp-ports", &cfg_type_bracketed_portlist, 0 },
{ "avoid-v4-udp-ports", &cfg_type_bracketed_portlist, 0 },
{ "avoid-v6-udp-ports", &cfg_type_bracketed_portlist, 0 },
{ "blackhole", &cfg_type_bracketed_aml, 0 },