From c42056b9aa1a70cdabd0721c42136e6d43591d18 Mon Sep 17 00:00:00 2001 From: Wouter Wijngaards Date: Mon, 17 Aug 2009 12:43:23 +0000 Subject: [PATCH] Blacklist when 16 queries fail in a row. git-svn-id: file:///svn/unbound/trunk@1764 be551aaa-1e26-0410-a405-d3ace91eadb9 --- daemon/cachedump.c | 8 ++++---- doc/Changelog | 4 ++++ iterator/iter_utils.c | 11 ++++++++--- iterator/iterator.h | 4 ++++ services/cache/infra.c | 14 +++++++++++--- services/cache/infra.h | 7 ++++++- 6 files changed, 37 insertions(+), 11 deletions(-) diff --git a/daemon/cachedump.c b/daemon/cachedump.c index 01db2f933..a4a2794b8 100644 --- a/daemon/cachedump.c +++ b/daemon/cachedump.c @@ -786,7 +786,7 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp) { char buf[257]; struct delegpt_addr* a; - int lame, dlame, rlame, rtt, edns_vs, to; + int lame, dlame, rlame, rtt, edns_vs, to, lost; uint8_t edns_lame_known; for(a = dp->target_list; a; a = a->next_target) { addr_to_str(&a->addr, a->addrlen, buf, sizeof(buf)); @@ -801,15 +801,15 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp) * lameness won't be reported then */ if(!infra_get_lame_rtt(worker->env.infra_cache, &a->addr, a->addrlen, dp->name, dp->namelen, - LDNS_RR_TYPE_A, &lame, &dlame, &rlame, &rtt, + LDNS_RR_TYPE_A, &lame, &dlame, &rlame, &rtt, &lost, *worker->env.now)) { if(!ssl_printf(ssl, "not in infra cache.\n")) return; continue; /* skip stuff not in infra cache */ } - if(!ssl_printf(ssl, "%s%s%srtt %d msec. ", + if(!ssl_printf(ssl, "%s%s%srtt %d msec, %d lost. 
", lame?"LAME ":"", dlame?"NoDNSSEC ":"", - rlame?"NoAuthButRecursive ":"", rtt)) + rlame?"NoAuthButRecursive ":"", rtt, lost)) return; if(infra_host(worker->env.infra_cache, &a->addr, a->addrlen, *worker->env.now, &edns_vs, &edns_lame_known, &to)) { diff --git a/doc/Changelog b/doc/Changelog index 6890c69c7..73eb08638 100644 --- a/doc/Changelog +++ b/doc/Changelog @@ -1,3 +1,7 @@ +17 August 2009: Wouter + - Fix so that servers are only blacklisted if they fail to reply + to 16 queries in a row and the timeout gets above 2 minutes. + 14 August 2009: Wouter - unbound-control lookup prints out infra cache information, like RTT. - Fix bug in DLV lookup reported by Amanda from Secure64. diff --git a/iterator/iter_utils.c b/iterator/iter_utils.c index bb529759c..765bc6135 100644 --- a/iterator/iter_utils.c +++ b/iterator/iter_utils.c @@ -151,6 +151,8 @@ iter_apply_cfg(struct iter_env* iter_env, struct config_file* cfg) * values 0 .. 49 are not used, unless that is changed. * USEFUL_SERVER_TOP_TIMEOUT * This value exactly is given for unresponsive blacklisted. + * USEFUL_SERVER_TOP_TIMEOUT+1 + * For non-blacklisted servers: huge timeout, but has traffic. * USEFUL_SERVER_TOP_TIMEOUT .. * dnsseclame servers get penalty * USEFUL_SERVER_TOP_TIMEOUT*2 .. 
@@ -170,7 +172,7 @@ iter_filter_unsuitable(struct iter_env* iter_env, struct module_env* env, uint8_t* name, size_t namelen, uint16_t qtype, uint32_t now, struct delegpt_addr* a) { - int rtt, lame, reclame, dnsseclame; + int rtt, lame, reclame, dnsseclame, lost; if(a->bogus) return -1; /* address of server is bogus */ if(donotq_lookup(iter_env->donotq, &a->addr, a->addrlen)) { @@ -182,7 +184,7 @@ iter_filter_unsuitable(struct iter_env* iter_env, struct module_env* env, /* check lameness - need zone , class info */ if(infra_get_lame_rtt(env->infra_cache, &a->addr, a->addrlen, name, namelen, qtype, &lame, &dnsseclame, &reclame, - &rtt, now)) { + &rtt, &lost, now)) { log_addr(VERB_ALGO, "servselect", &a->addr, a->addrlen); verbose(VERB_ALGO, " rtt=%d%s%s%s", rtt, lame?" LAME":"", @@ -190,9 +192,12 @@ iter_filter_unsuitable(struct iter_env* iter_env, struct module_env* env, reclame?" REC_LAME":""); if(lame) return -1; /* server is lame */ - else if(rtt >= USEFUL_SERVER_TOP_TIMEOUT) + else if(rtt >= USEFUL_SERVER_TOP_TIMEOUT && + lost >= USEFUL_SERVER_MAX_LOST) /* server is unresponsive */ return USEFUL_SERVER_TOP_TIMEOUT; + else if(rtt >= USEFUL_SERVER_TOP_TIMEOUT) /* not blacklisted*/ + return USEFUL_SERVER_TOP_TIMEOUT+1; else if(reclame) return rtt+USEFUL_SERVER_TOP_TIMEOUT*2; /* nonpref */ else if(dnsseclame ) diff --git a/iterator/iterator.h b/iterator/iterator.h index c70dadd9c..b985860e2 100644 --- a/iterator/iterator.h +++ b/iterator/iterator.h @@ -65,6 +65,10 @@ struct iter_priv; * Equals RTT_MAX_TIMEOUT */ #define USEFUL_SERVER_TOP_TIMEOUT 120000 +/** Number of lost messages in a row that get a host blacklisted. + * With 16, a couple different queries have to time out and no working + * queries are happening */ +#define USEFUL_SERVER_MAX_LOST 16 /** number of retries on outgoing queries */ #define OUTBOUND_MSG_RETRY 5 /** RTT band, within this amount from the best, servers are chosen randomly. 
diff --git a/services/cache/infra.c b/services/cache/infra.c index f6ecb40a4..fbc52549a 100644 --- a/services/cache/infra.c +++ b/services/cache/infra.c @@ -219,6 +219,7 @@ new_host_entry(struct infra_cache* infra, struct sockaddr_storage* addr, data->lameness = NULL; data->edns_version = 0; data->edns_lame_known = 0; + data->num_timeouts = 0; rtt_init(&data->rtt); return &key->entry; } @@ -471,9 +472,14 @@ infra_rtt_update(struct infra_cache* infra, /* have an entry, update the rtt, and the ttl */ data = (struct infra_host_data*)e->data; data->ttl = timenow + infra->host_ttl; - if(roundtrip == -1) + if(roundtrip == -1) { rtt_lost(&data->rtt, orig_rtt); - else rtt_update(&data->rtt, roundtrip); + if(data->num_timeouts<255) + data->num_timeouts++; + } else { + rtt_update(&data->rtt, roundtrip); + data->num_timeouts = 0; + } if(data->rtt.rto > 0) rto = data->rtt.rto; @@ -513,7 +519,8 @@ int infra_get_lame_rtt(struct infra_cache* infra, struct sockaddr_storage* addr, socklen_t addrlen, uint8_t* name, size_t namelen, uint16_t qtype, - int* lame, int* dnsseclame, int* reclame, int* rtt, uint32_t timenow) + int* lame, int* dnsseclame, int* reclame, int* rtt, int* lost, + uint32_t timenow) { struct infra_host_data* host; struct lruhash_entry* e = infra_lookup_host_nottl(infra, addr, @@ -523,6 +530,7 @@ infra_get_lame_rtt(struct infra_cache* infra, return 0; host = (struct infra_host_data*)e->data; *rtt = rtt_unclamped(&host->rtt); + *lost = (int)host->num_timeouts; /* check lameness first, if so, ttl on host does not matter anymore */ if(infra_lookup_lame(host, name, namelen, timenow, &dlm, &rlm, &alm, &olm)) { diff --git a/services/cache/infra.h b/services/cache/infra.h index fa693cf4e..e6fc3cda2 100644 --- a/services/cache/infra.h +++ b/services/cache/infra.h @@ -74,6 +74,8 @@ struct infra_host_data { * EDNS lame is when EDNS queries or replies are dropped, * and cause a timeout */ uint8_t edns_lame_known; + /** Number of consecutive timeouts; reset when reply arrives 
OK. */ + uint8_t num_timeouts; }; /** @@ -270,13 +272,16 @@ int infra_edns_update(struct infra_cache* infra, * @param reclame: if function returns true, this is if it is recursion lame. * @param rtt: if function returns true, this returns avg rtt of the server. * The rtt value is unclamped and reflects recent timeouts. + * @param lost: number of queries lost in a row. Reset to 0 when an answer + * gets back. Gives a connectivity number. * @param timenow: what time it is now. * @return if found in cache, or false if not (or TTL bad). */ int infra_get_lame_rtt(struct infra_cache* infra, struct sockaddr_storage* addr, socklen_t addrlen, uint8_t* name, size_t namelen, uint16_t qtype, - int* lame, int* dnsseclame, int* reclame, int* rtt, uint32_t timenow); + int* lame, int* dnsseclame, int* reclame, int* rtt, int* lost, + uint32_t timenow); /** * Get memory used by the infra cache. -- 2.47.3