By default, the availability of a downstream server is checked by regularly
sending an A query for "a.root-servers.net.". A different query type and target
-can be specified by passing, respectively, the 'checkType' and 'checkName'
+can be specified by passing, respectively, the `checkType` and `checkName`
parameters to `newServer`. The default behavior is to consider any valid response
-with a RCODE different from ServFail as valid. If the 'mustResolve' parameter
+with a RCODE different from ServFail as valid. If the `mustResolve` parameter
of `newServer` is set to true, a response will only be considered valid if
its RCODE differs from NXDomain, ServFail and Refused.
+The number of health check failures before a server is considered down is
+configurable via the `maxCheckFailures` parameter, defaulting to 1.
```
newServer({address="192.0.2.1", checkType="AAAA", checkName="a.root-servers.net.", mustResolve=true})
or at least the one talking to `dnsdist`, the 'useClientSubnet' parameter can be used
when declaring a new server. This parameter indicates whether an EDNS Client Subnet option
should be added to the request. If the incoming request already contains an EDNS Client Subnet value,
-it will not be overriden unless setECSOverride is set to true. The source prefix-length may be
+it will not be overridden unless `setECSOverride()` is set to true. The source prefix-length may be
configured with:
```
> setECSSourcePrefixV4(24)
The same kind of timeouts is enforced on the TCP connections to the downstream servers.
The default value of 30s can be modified by passing the `tcpRecvTimeout` and `tcpSendTimeout`
-parameters to `newServer`:
+parameters to `newServer`. If the TCP connection to a downstream server fails, `dnsdist`
+will try to establish a new one up to `retries` times before giving up.
```
-newServer({address="192.0.2.1", tcpRecvTimeout=10, tcpSendTimeout=10})
+newServer({address="192.0.2.1", tcpRecvTimeout=10, tcpSendTimeout=10, retries=5})
```
Source address
* `infolog(string)`: log at level info
* `warnlog(string)`: log at level warning
* `errlog(string)`: log at level error
+ * `setVerboseHealthChecks(bool)`: whether health check errors will be logged. Note that even if set to true, health check errors will be logged at verbose level only.
* Server related:
* `newServer("ip:port")`: instantiate a new downstream server with default settings
- * `newServer({address="ip:port", qps=1000, order=1, weight=10, pool="abuse", retries=5, tcpSendTimeout=30, tcpRecvTimeout=30, checkName="a.root-servers.net.", checkType="A", mustResolve=false, useClientSubnet=true, source="address|interface name|address@interface"})`:
+ * `newServer({address="ip:port", qps=1000, order=1, weight=10, pool="abuse", retries=5, tcpSendTimeout=30, tcpRecvTimeout=30, checkName="a.root-servers.net.", checkType="A", maxCheckFailures=1, mustResolve=false, useClientSubnet=true, source="address|interface name|address@interface"})`:
instantiate a server with additional parameters
* `showServers()`: output all servers
* `getServer(n)`: returns server with index n
"setACL(", "setDNSSECPool(", "setECSOverride(",
"setECSSourcePrefixV4(", "setECSSourcePrefixV6(", "setKey(", "setLocal(",
"setMaxTCPClientThreads(", "setMaxUDPOutstanding(", "setServerPolicy(", "setServerPolicyLua(",
- "setTCPRecvTimeout(", "setTCPSendTimeout(", "show(", "showACL()",
+ "setTCPRecvTimeout(", "setTCPSendTimeout(", "setVerboseHealthChecks(", "show(", "showACL()",
"showDNSCryptBinds()", "showDynBlocks()", "showResponseLatency()", "showRules()",
"showServerPolicy()", "showServers()", "shutdown()", "SpoofAction(",
"TCAction(", "testCrypto()", "topBandwidth(", "topClients(",
ret->useECS=boost::get<bool>(vars["useClientSubnet"]);
}
+ if(vars.count("maxCheckFailures")) {
+ ret->maxCheckFailures=std::stoi(boost::get<string>(vars["maxCheckFailures"]));
+ }
+
if(g_launchWork) {
g_launchWork->push_back([ret]() {
ret->tid = move(thread(responderThread, ret));
g_pools.setState(localPools);
return pool;
});
+
+ g_lua.writeFunction("setVerboseHealthChecks", [](bool verbose) { g_verboseHealthChecks=verbose; });
}
struct DNSDistStats g_stats;
uint16_t g_maxOutstanding;
bool g_console;
+bool g_verboseHealthChecks{false};
GlobalStateHolder<NetmaskGroup> g_ACL;
string g_outputBuffer;
}
sock.connect(ds.remote);
ssize_t sent = udpClientSendRequestToBackend(&ds, sock.getHandle(), (char*)&packet[0], packet.size());
- if (sent < 0)
+ if (sent < 0) {
+ int ret = errno;
+ if (g_verboseHealthChecks)
+ vinfolog("Error while sending a health check query to backend %s: %d", ds.getNameWithAddr(), ret);
return false;
+ }
int ret=waitForRWData(sock.getHandle(), true, 1, 0);
- if(ret < 0 || !ret) // error, timeout, both are down!
+ if(ret < 0 || !ret) { // error, timeout, both are down!
+ if (ret < 0) {
+ ret = errno;
+ if (g_verboseHealthChecks)
+ vinfolog("Error while waiting for the health check response from backend %s: %d", ds.getNameWithAddr(), ret);
+ }
+ else {
+ if (g_verboseHealthChecks)
+ vinfolog("Timeout while waiting for the health check response from backend %s", ds.getNameWithAddr());
+ }
return false;
+ }
+
string reply;
sock.recvFrom(reply, ds.remote);
const dnsheader * responseHeader = (const dnsheader *) reply.c_str();
- if (reply.size() < sizeof(*responseHeader))
+ if (reply.size() < sizeof(*responseHeader)) {
+ if (g_verboseHealthChecks)
+ vinfolog("Invalid health check response of size %d from backend %s, expecting at least %d", reply.size(), ds.getNameWithAddr(), sizeof(*responseHeader));
return false;
+ }
- if (responseHeader->id != requestHeader->id)
+ if (responseHeader->id != requestHeader->id) {
+ if (g_verboseHealthChecks)
+ vinfolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds.getNameWithAddr(), requestHeader->id);
return false;
- if (!responseHeader->qr)
+ }
+
+ if (!responseHeader->qr) {
+ if (g_verboseHealthChecks)
+ vinfolog("Invalid health check response from backend %s, expecting QR to be set", ds.getNameWithAddr());
return false;
- if (responseHeader->rcode == RCode::ServFail)
+ }
+
+ if (responseHeader->rcode == RCode::ServFail) {
+ if (g_verboseHealthChecks)
+ vinfolog("Backend %s responded to health check with ServFail", ds.getNameWithAddr());
return false;
- if (ds.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused))
+ }
+
+ if (ds.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) {
+ if (g_verboseHealthChecks)
+ vinfolog("Backend %s responded to health check with %s while mustResolve is set", ds.getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? "NXDomain" : "Refused");
return false;
+ }
// XXX fixme do bunch of checking here etc
return true;
}
+catch(const std::exception& e)
+{
+ if (g_verboseHealthChecks)
+ vinfolog("Error checking the health of backend %s: %s", ds.getNameWithAddr(), e.what());
+ return false;
+}
catch(...)
{
+ if (g_verboseHealthChecks)
+ vinfolog("Unknown exception while checking the health of backend %s", ds.getNameWithAddr());
return false;
}
for(auto& dss : g_dstates.getCopy()) { // this points to the actual shared_ptrs!
if(dss->availability==DownstreamState::Availability::Auto) {
bool newState=upCheck(*dss);
+ if (!newState && dss->upStatus) {
+ dss->currentCheckFailures++;
+ if (dss->currentCheckFailures < dss->maxCheckFailures) {
+ newState = true;
+ }
+ }
+
if(newState != dss->upStatus) {
warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down");
+ dss->upStatus = newState;
+ dss->currentCheckFailures = 0;
}
- dss->upStatus = newState;
}
auto delta = dss->sw.udiffAndSet()/1000000.0;
int tcpSendTimeout{30};
unsigned int sourceItf{0};
uint16_t retries{5};
+ uint8_t currentCheckFailures{0};
+ uint8_t maxCheckFailures{1};
StopWatch sw;
set<string> pools;
enum class Availability { Up, Down, Auto} availability{Availability::Auto};
extern uint16_t g_ECSSourcePrefixV4;
extern uint16_t g_ECSSourcePrefixV6;
extern bool g_ECSOverride;
+extern bool g_verboseHealthChecks;
struct dnsheader;