{
char buf[257];
struct delegpt_addr* a;
- int lame, dlame, rlame, rtt, edns_vs, to;
+ int lame, dlame, rlame, rtt, edns_vs, to, lost;
uint8_t edns_lame_known;
for(a = dp->target_list; a; a = a->next_target) {
addr_to_str(&a->addr, a->addrlen, buf, sizeof(buf));
* lameness won't be reported then */
if(!infra_get_lame_rtt(worker->env.infra_cache,
&a->addr, a->addrlen, dp->name, dp->namelen,
- LDNS_RR_TYPE_A, &lame, &dlame, &rlame, &rtt,
+ LDNS_RR_TYPE_A, &lame, &dlame, &rlame, &rtt, &lost,
*worker->env.now)) {
if(!ssl_printf(ssl, "not in infra cache.\n"))
return;
continue; /* skip stuff not in infra cache */
}
- if(!ssl_printf(ssl, "%s%s%srtt %d msec. ",
+ if(!ssl_printf(ssl, "%s%s%srtt %d msec, %d lost. ",
lame?"LAME ":"", dlame?"NoDNSSEC ":"",
- rlame?"NoAuthButRecursive ":"", rtt))
+ rlame?"NoAuthButRecursive ":"", rtt, lost))
return;
if(infra_host(worker->env.infra_cache, &a->addr, a->addrlen,
*worker->env.now, &edns_vs, &edns_lame_known, &to)) {
+17 August 2009: Wouter
+ - Fix so that servers are only blacklisted if they fail to reply
+ to 16 queries in a row and the timeout gets above 2 minutes.
+
14 August 2009: Wouter
- unbound-control lookup prints out infra cache information, like RTT.
- Fix bug in DLV lookup reported by Amanda from Secure64.
* values 0 .. 49 are not used, unless that is changed.
* USEFUL_SERVER_TOP_TIMEOUT
* Exactly this value is given to unresponsive, blacklisted servers.
+ * USEFUL_SERVER_TOP_TIMEOUT+1
+ * For non-blacklisted servers: huge timeout, but has traffic.
* USEFUL_SERVER_TOP_TIMEOUT ..
* dnsseclame servers get penalty
* USEFUL_SERVER_TOP_TIMEOUT*2 ..
uint8_t* name, size_t namelen, uint16_t qtype, uint32_t now,
struct delegpt_addr* a)
{
- int rtt, lame, reclame, dnsseclame;
+ int rtt, lame, reclame, dnsseclame, lost;
if(a->bogus)
return -1; /* address of server is bogus */
if(donotq_lookup(iter_env->donotq, &a->addr, a->addrlen)) {
/* check lameness - need zone , class info */
if(infra_get_lame_rtt(env->infra_cache, &a->addr, a->addrlen,
name, namelen, qtype, &lame, &dnsseclame, &reclame,
- &rtt, now)) {
+ &rtt, &lost, now)) {
log_addr(VERB_ALGO, "servselect", &a->addr, a->addrlen);
verbose(VERB_ALGO, " rtt=%d%s%s%s", rtt,
lame?" LAME":"",
reclame?" REC_LAME":"");
if(lame)
return -1; /* server is lame */
- else if(rtt >= USEFUL_SERVER_TOP_TIMEOUT)
+ else if(rtt >= USEFUL_SERVER_TOP_TIMEOUT &&
+ lost >= USEFUL_SERVER_MAX_LOST)
/* server is unresponsive */
return USEFUL_SERVER_TOP_TIMEOUT;
+ else if(rtt >= USEFUL_SERVER_TOP_TIMEOUT) /* not blacklisted*/
+ return USEFUL_SERVER_TOP_TIMEOUT+1;
else if(reclame)
return rtt+USEFUL_SERVER_TOP_TIMEOUT*2; /* nonpref */
else if(dnsseclame )
* Equals RTT_MAX_TIMEOUT
*/
#define USEFUL_SERVER_TOP_TIMEOUT 120000
+/** Number of messages lost in a row that gets a host blacklisted.
+ * With 16, several different queries have to time out while no
+ * queries to that host are being answered */
+#define USEFUL_SERVER_MAX_LOST 16
/** number of retries on outgoing queries */
#define OUTBOUND_MSG_RETRY 5
/** RTT band, within this amount from the best, servers are chosen randomly.
data->lameness = NULL;
data->edns_version = 0;
data->edns_lame_known = 0;
+ data->num_timeouts = 0;
rtt_init(&data->rtt);
return &key->entry;
}
/* have an entry, update the rtt, and the ttl */
data = (struct infra_host_data*)e->data;
data->ttl = timenow + infra->host_ttl;
- if(roundtrip == -1)
+ if(roundtrip == -1) {
rtt_lost(&data->rtt, orig_rtt);
- else rtt_update(&data->rtt, roundtrip);
+ if(data->num_timeouts<255)
+ data->num_timeouts++;
+ } else {
+ rtt_update(&data->rtt, roundtrip);
+ data->num_timeouts = 0;
+ }
if(data->rtt.rto > 0)
rto = data->rtt.rto;
infra_get_lame_rtt(struct infra_cache* infra,
struct sockaddr_storage* addr, socklen_t addrlen,
uint8_t* name, size_t namelen, uint16_t qtype,
- int* lame, int* dnsseclame, int* reclame, int* rtt, uint32_t timenow)
+ int* lame, int* dnsseclame, int* reclame, int* rtt, int* lost,
+ uint32_t timenow)
{
struct infra_host_data* host;
struct lruhash_entry* e = infra_lookup_host_nottl(infra, addr,
return 0;
host = (struct infra_host_data*)e->data;
*rtt = rtt_unclamped(&host->rtt);
+ *lost = (int)host->num_timeouts;
/* check lameness first, if so, ttl on host does not matter anymore */
if(infra_lookup_lame(host, name, namelen, timenow,
&dlm, &rlm, &alm, &olm)) {
* EDNS lame is when EDNS queries or replies are dropped,
* and cause a timeout */
uint8_t edns_lame_known;
+ /** Number of consecutive timeouts; reset when a reply arrives OK. */
+ uint8_t num_timeouts;
};
/**
* @param reclame: if function returns true, this is if it is recursion lame.
* @param rtt: if function returns true, this returns avg rtt of the server.
* The rtt value is unclamped and reflects recent timeouts.
+ * @param lost: number of queries lost (timed out) in a row. Reset to 0 when
+ * an answer arrives. Gives an indication of connectivity to the server.
* @param timenow: what time it is now.
* @return if found in cache, or false if not (or TTL bad).
*/
int infra_get_lame_rtt(struct infra_cache* infra,
struct sockaddr_storage* addr, socklen_t addrlen,
uint8_t* name, size_t namelen, uint16_t qtype,
- int* lame, int* dnsseclame, int* reclame, int* rtt, uint32_t timenow);
+ int* lame, int* dnsseclame, int* reclame, int* rtt, int* lost,
+ uint32_t timenow);
/**
* Get memory used by the infra cache.