- Change of timeout code. No more lost and backoff in blockage.

author Wouter Wijngaards <wouter@nlnetlabs.nl>

Tue, 26 Oct 2010 15:02:08 +0000 (15:02 +0000)

committer Wouter Wijngaards <wouter@nlnetlabs.nl>

Tue, 26 Oct 2010 15:02:08 +0000 (15:02 +0000)
author Wouter Wijngaards <wouter@nlnetlabs.nl>
Tue, 26 Oct 2010 15:02:08 +0000 (15:02 +0000)
committer Wouter Wijngaards <wouter@nlnetlabs.nl>
Tue, 26 Oct 2010 15:02:08 +0000 (15:02 +0000)
diff --git a/daemon/cachedump.c b/daemon/cachedump.c

index 43c1a9a23305e9ce3aa6a4fa15229cce73a4ebaa..85fe9f8396d2ba0ed46b70d1c7c9e5e1380f554e 100644 (file)
--- a/daemon/cachedump.c
+++ b/daemon/cachedump.c
@@ -802,8 +802,7 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp)
  {
         char buf[257];
         struct delegpt_addr* a;
-       int lame, dlame, rlame, rto, edns_vs, to;
-       int entry_ttl;
+       int lame, dlame, rlame, rto, edns_vs, to, delay, entry_ttl;
         struct rtt_info ri;
         uint8_t edns_lame_known;
         for(a = dp->target_list; a; a = a->next_target) {
@@ -816,7 +815,7 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp)
                 }
                 /* lookup in infra cache */
                 entry_ttl = infra_get_host_rto(worker->env.infra_cache,
-                       &a->addr, a->addrlen, &ri, *worker->env.now);
+                       &a->addr, a->addrlen, &ri, &delay, *worker->env.now);
                 if(entry_ttl == -1) {
                         if(!ssl_printf(ssl, "not in infra cache.\n"))
                                 return;
@@ -840,6 +839,9 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp)
                         rlame?"NoAuthButRecursive ":"", rto, entry_ttl,
                         ri.srtt, ri.rttvar, rtt_notimeout(&ri)))
                         return;
+               if(delay)
+                       if(!ssl_printf(ssl, ", probedelay %d", delay))
+                               return;
                 if(infra_host(worker->env.infra_cache, &a->addr, a->addrlen,
                         *worker->env.now, &edns_vs, &edns_lame_known, &to)) {
                         if(edns_vs == -1) {
diff --git a/daemon/remote.c b/daemon/remote.c

index 66e89650a404ce8525d61ed4c08f84f0710ae9ab..f78e1d63272ddddde365debb6b9646432110501e 100644 (file)
--- a/daemon/remote.c
+++ b/daemon/remote.c
@@ -1572,10 +1572,11 @@ dump_infra_host(struct lruhash_entry* e, void* arg)
                 return;
         }
         if(!ssl_printf(a->ssl, "%s ttl %d ping %d var %d rtt %d rto %d "
-               "ednsknown %d edns %d\n",
+               "ednsknown %d edns %d delay %d\n",
                 ip_str, (int)(d->ttl - a->now),
                 d->rtt.srtt, d->rtt.rttvar, rtt_notimeout(&d->rtt), d->rtt.rto,
-               (int)d->edns_lame_known, (int)d->edns_version))
+               (int)d->edns_lame_known, (int)d->edns_version,
+               (int)(a->now<d->probedelay?d->probedelay-a->now:0)))
                 return;
         if(d->lameness)
                 lruhash_traverse(d->lameness, 0, &dump_infra_lame, arg);
diff --git a/doc/Changelog b/doc/Changelog

index 2e4e6d0021fed82a0358b50db7b2fcfa2346e0a8..a94467cc174863d6271048ea4f5b97c282c79370 100644 (file)
--- a/doc/Changelog
+++ b/doc/Changelog
@@ -1,6 +1,10 @@
  26 October 2010: Wouter
         - dump_infra and flush_infra commands for unbound-control.
         - no timeout backoff if meanwhile a query succeeded.
+       - Change of timeout code.  No more lost and backoff in blockage.
+         At 12sec timeout (and at least 2x lost before) one probe per IP
+         is allowed only.  At 120sec, the IP is blocked.  After 15min, a
+         120sec entry has a single retry packet.
  
  25 October 2010: Wouter
         - Configure errors if ldns is not found.
diff --git a/services/cache/infra.c b/services/cache/infra.c

index 4ac51f69b3818bc8c4920a86e0248d58b7c2085a..9e1e3a81c519e7bb598c46a38e2017ec5c6fc5e7 100644 (file)
--- a/services/cache/infra.c
+++ b/services/cache/infra.c
@@ -49,6 +49,9 @@
  #include "util/config_file.h"
  #include "iterator/iterator.h"
  
+/** Timeout when only a single probe query per IP is allowed. */
+#define PROBE_MAXRTO 12000 /* in msec */
+
  size_t 
  infra_host_sizefunc(void* k, void* ATTR_UNUSED(d))
  {
@@ -213,6 +216,7 @@ host_entry_init(struct infra_cache* infra, struct lruhash_entry* e,
         rtt_init(&data->rtt);
         data->edns_version = 0;
         data->edns_lame_known = 0;
+       data->probedelay = 0;
  }
  
  /** 
@@ -257,6 +261,7 @@ infra_host(struct infra_cache* infra, struct sockaddr_storage* addr,
         struct lruhash_entry* e = infra_lookup_host_nottl(infra, addr, 
                 addrlen, 0);
         struct infra_host_data* data;
+       int wr = 0;
         if(e && ((struct infra_host_data*)e->data)->ttl < timenow) {
                 /* it expired, try to reuse existing entry */
                 lock_rw_unlock(&e->lock);
@@ -266,6 +271,7 @@ infra_host(struct infra_cache* infra, struct sockaddr_storage* addr,
                         /* re-initialise */
                         /* do not touch lameness, it may be valid still */
                         host_entry_init(infra, e, timenow);
+                       wr = 1;
                 }
         }
         if(!e) {
@@ -284,6 +290,22 @@ infra_host(struct infra_cache* infra, struct sockaddr_storage* addr,
         *to = rtt_timeout(&data->rtt);
         *edns_vs = data->edns_version;
         *edns_lame_known = data->edns_lame_known;
+       if(*to >= PROBE_MAXRTO && rtt_notimeout(&data->rtt)*4 <= *to) {
+               /* delay other queries, this is the probe query */
+               if(!wr) {
+                       lock_rw_unlock(&e->lock);
+                       e = infra_lookup_host_nottl(infra, addr, addrlen, 1);
+                       if(!e) { /* flushed from cache real fast, no use to
+                               allocate just for the probedelay */
+                               return 1;
+                       }
+                       data = (struct infra_host_data*)e->data;
+               }
+               /* add 999 to round up the timeout value from msec to sec,
+                * then add a whole second so it is certain that this probe
+                * has timed out before the next is allowed */
+               data->probedelay = timenow + ((*to)+1999)/1000;
+       }
         lock_rw_unlock(&e->lock);
         return 1;
  }
@@ -498,6 +520,7 @@ infra_rtt_update(struct infra_cache* infra,
                 rtt_lost(&data->rtt, orig_rtt);
         } else {
                 rtt_update(&data->rtt, roundtrip);
+               data->probedelay = 0;
         }
         if(data->rtt.rto > 0)
                 rto = data->rtt.rto;
@@ -510,7 +533,7 @@ infra_rtt_update(struct infra_cache* infra,
  
  int infra_get_host_rto(struct infra_cache* infra,
          struct sockaddr_storage* addr, socklen_t addrlen,
-       struct rtt_info* rtt, uint32_t timenow)
+       struct rtt_info* rtt, int* delay, uint32_t timenow)
  {
         struct lruhash_entry* e = infra_lookup_host_nottl(infra, addr, 
                 addrlen, 0);
@@ -521,6 +544,9 @@ int infra_get_host_rto(struct infra_cache* infra,
         if(data->ttl >= timenow) {
                 ttl = (int)(data->ttl - timenow);
                 memmove(rtt, &data->rtt, sizeof(*rtt));
+               if(timenow < data->probedelay)
+                       *delay = (int)(data->probedelay - timenow);
+               else    *delay = 0;
         }
         lock_rw_unlock(&e->lock);
         return ttl;
@@ -570,6 +596,10 @@ infra_get_lame_rtt(struct infra_cache* infra,
                 return 0;
         host = (struct infra_host_data*)e->data;
         *rtt = rtt_unclamped(&host->rtt);
+       if(host->rtt.rto >= PROBE_MAXRTO && timenow < host->probedelay
+               && rtt_notimeout(&host->rtt)*4 <= host->rtt.rto)
+               /* single probe for this domain, and we are not probing */
+               *rtt = USEFUL_SERVER_TOP_TIMEOUT;
         /* check lameness first, if so, ttl on host does not matter anymore */
         if(infra_lookup_lame(host, name, namelen, timenow, 
                 &dlm, &rlm, &alm, &olm)) {
@@ -604,6 +634,13 @@ infra_get_lame_rtt(struct infra_cache* infra,
         *dnsseclame = 0;
         *reclame = 0;
         if(timenow > host->ttl) {
+               /* expired entry */
+               /* see if this can be a re-probe of an unresponsive server */
+               if(host->rtt.rto >= USEFUL_SERVER_TOP_TIMEOUT) {
+                       *rtt = USEFUL_SERVER_TOP_TIMEOUT-1;
+                       lock_rw_unlock(&e->lock);
+                       return 1;
+               }
                 lock_rw_unlock(&e->lock);
                 return 0;
         }
diff --git a/services/cache/infra.h b/services/cache/infra.h

index 9c203ee4d4471e15d29b6fb919eadf41357a8586..376e1ae5012d3cb72c05e04662d8acd2252cd5be 100644 (file)
--- a/services/cache/infra.h
+++ b/services/cache/infra.h
@@ -64,6 +64,8 @@ struct infra_host_key {
  struct infra_host_data {
         /** TTL value for this entry. absolute time. */
         uint32_t ttl;
+       /** time in seconds (absolute) when probing re-commences, 0 disabled */
+       uint32_t probedelay;
         /** round trip times for timeout calculation */
         struct rtt_info rtt;
         /** Names of the zones that are lame. NULL=no lame zones. */
@@ -173,6 +175,8 @@ struct infra_host_data* infra_lookup_host(struct infra_cache* infra,
   * Find host information to send a packet. Creates new entry if not found.
   * Lameness is empty. EDNS is 0 (try with first), and rtt is returned for 
   * the first message to it.
+ * Use this to send a packet only, because it also locks out others when
+ * probing is restricted.
   * @param infra: infrastructure cache.
   * @param addr: host address.
   * @param addrlen: length of addr.
@@ -265,6 +269,7 @@ int infra_edns_update(struct infra_cache* infra,
  
  /**
   * Get Lameness information and average RTT if host is in the cache.
+ * This information is to be used for server selection.
   * @param infra: infrastructure cache.
   * @param addr: host address.
   * @param addrlen: length of addr.
@@ -291,12 +296,13 @@ int infra_get_lame_rtt(struct infra_cache* infra,
   * @param addr: host address.
   * @param addrlen: length of addr.
   * @param rtt: the rtt_info is copied into here (caller alloced return struct).
+ * @param delay: probe delay (if any).
   * @param timenow: what time it is now.
   * @return TTL the infra host element is valid for. If -1: not found in cache.
   */
  int infra_get_host_rto(struct infra_cache* infra,
          struct sockaddr_storage* addr, socklen_t addrlen, 
-       struct rtt_info* rtt, uint32_t timenow);
+       struct rtt_info* rtt, int* delay, uint32_t timenow);
  
  /**
   * Get memory used by the infra cache.
author	Wouter Wijngaards <wouter@nlnetlabs.nl>
	Tue, 26 Oct 2010 15:02:08 +0000 (15:02 +0000)
committer	Wouter Wijngaards <wouter@nlnetlabs.nl>
	Tue, 26 Oct 2010 15:02:08 +0000 (15:02 +0000)
daemon/cachedump.c		patch | blob | blame | history
daemon/remote.c		patch | blob | blame | history
doc/Changelog		patch | blob | blame | history
services/cache/infra.c		patch | blob | blame | history
services/cache/infra.h		patch | blob | blame | history