str << base << "tcpavgqueriesperconnection" << ' ' << state->tcpAvgQueriesPerConnection.load() << " " << now << "\r\n";
str << base << "tcpavgconnectionduration" << ' ' << state->tcpAvgConnectionDuration.load() << " " << now << "\r\n";
str << base << "tcptoomanyconcurrentconnections" << ' ' << state->tcpTooManyConcurrentConnections.load() << " " << now << "\r\n";
+ str << base << "healthcheckfailures" << ' ' << state->d_healthCheckMetrics.d_failures << " " << now << "\r\n";
+ str << base << "healthcheckfailuresparsing" << ' ' << state->d_healthCheckMetrics.d_parseErrors << " " << now << "\r\n";
+ str << base << "healthcheckfailurestimeout" << ' ' << state->d_healthCheckMetrics.d_timeOuts << " " << now << "\r\n";
+ str << base << "healthcheckfailuresnetwork" << ' ' << state->d_healthCheckMetrics.d_networkErrors << " " << now << "\r\n";
+ str << base << "healthcheckfailuresmismatch" << ' ' << state->d_healthCheckMetrics.d_mismatchErrors << " " << now << "\r\n";
+ str << base << "healthcheckfailuresinvalid" << ' ' << state->d_healthCheckMetrics.d_invalidResponseErrors << " " << now << "\r\n";
}
std::map<std::string, uint64_t> frontendDuplicates;
output << "# TYPE " << statesbase << "tlsresumptions " << "counter" << "\n";
output << "# HELP " << statesbase << "tcplatency " << "Server's latency when answering TCP questions in milliseconds" << "\n";
output << "# TYPE " << statesbase << "tcplatency " << "gauge" << "\n";
+ output << "# HELP " << statesbase << "healthcheckfailures " << "Number of health check attempts that failed (total)" << "\n";
+ output << "# TYPE " << statesbase << "healthcheckfailures " << "counter" << "\n";
+ output << "# HELP " << statesbase << "healthcheckfailuresparsing " << "Number of health check attempts where the response could not be parsed" << "\n";
+ output << "# TYPE " << statesbase << "healthcheckfailuresparsing " << "counter" << "\n";
+ output << "# HELP " << statesbase << "healthcheckfailurestimeout " << "Number of health check attempts where the response did not arrive in time" << "\n";
+ output << "# TYPE " << statesbase << "healthcheckfailurestimeout " << "counter" << "\n";
+ output << "# HELP " << statesbase << "healthcheckfailuresnetwork " << "Number of health check attempts that experienced a network issue" << "\n";
+ output << "# TYPE " << statesbase << "healthcheckfailuresnetwork " << "counter" << "\n";
+ output << "# HELP " << statesbase << "healthcheckfailuresmismatch " << "Number of health check attempts where the response did not match the query" << "\n";
+ output << "# TYPE " << statesbase << "healthcheckfailuresmismatch " << "counter" << "\n";
+ output << "# HELP " << statesbase << "healthcheckfailuresinvalid " << "Number of health check attempts where the DNS response was invalid" << "\n";
+ output << "# TYPE " << statesbase << "healthcheckfailuresinvalid " << "counter" << "\n";
for (const auto& state : *states) {
string serverName;
output << statesbase << "tcpavgqueriesperconn" << label << " " << state->tcpAvgQueriesPerConnection << "\n";
output << statesbase << "tcpavgconnduration" << label << " " << state->tcpAvgConnectionDuration << "\n";
output << statesbase << "tlsresumptions" << label << " " << state->tlsResumptions << "\n";
+ output << statesbase << "healthcheckfailures" << label << " " << state->d_healthCheckMetrics.d_failures << "\n";
+ output << statesbase << "healthcheckfailuresparsing" << label << " " << state->d_healthCheckMetrics.d_parseErrors << "\n";
+ output << statesbase << "healthcheckfailurestimeout" << label << " " << state->d_healthCheckMetrics.d_timeOuts << "\n";
+ output << statesbase << "healthcheckfailuresnetwork" << label << " " << state->d_healthCheckMetrics.d_networkErrors << "\n";
+ output << statesbase << "healthcheckfailuresmismatch" << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
+ output << statesbase << "healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
}
const string frontsbase = "dnsdist_frontend_";
output << "# TYPE " << frontsbase << "tlsunknownticketkeys " << "counter" << "\n";
output << "# HELP " << frontsbase << "tlsinactiveticketkeys " << "Amount of TLS sessions resumed from an inactive key" << "\n";
output << "# TYPE " << frontsbase << "tlsinactiveticketkeys " << "counter" << "\n";
-
output << "# HELP " << frontsbase << "tlshandshakefailures " << "Amount of TLS handshake failures" << "\n";
output << "# TYPE " << frontsbase << "tlshandshakefailures " << "counter" << "\n";
{"tcpAvgConnectionDuration", (double)a->tcpAvgConnectionDuration},
{"tlsResumptions", (double)a->tlsResumptions},
{"tcpLatency", (double)(a->latencyUsecTCP/1000.0)},
+ {"healthCheckFailures", (double)(a->d_healthCheckMetrics.d_failures)},
+ {"healthCheckFailuresParsing", (double)(a->d_healthCheckMetrics.d_parseErrors)},
+ {"healthCheckFailuresTimeout", (double)(a->d_healthCheckMetrics.d_timeOuts)},
+ {"healthCheckFailuresNetwork", (double)(a->d_healthCheckMetrics.d_networkErrors)},
+ {"healthCheckFailuresMismatch", (double)(a->d_healthCheckMetrics.d_mismatchErrors)},
+ {"healthCheckFailuresInvalid", (double)(a->d_healthCheckMetrics.d_invalidResponseErrors)},
{"dropRate", (double)a->dropRate}
};
bool d_upgradeToLazyHealthChecks{false};
};
+ struct HealthCheckMetrics // per-backend health-check failure counters, exported as the healthcheckfailures* statistics
+ {
+ stat_t d_failures{0}; // total: bumped for every failed result passed to submitHealthCheckResult()
+ stat_t d_timeOuts{0}; // no response arrived before the health check timed out
+ stat_t d_parseErrors{0}; // response shorter than a DNS header, or an exception was thrown while validating it
+ stat_t d_networkErrors{0}; // I/O or receive error, or a response coming from an unexpected address
+ stat_t d_mismatchErrors{0}; // response ID, qname, qtype or qclass differs from what was sent
+ stat_t d_invalidResponseErrors{0}; // QR bit not set, ServFail, or NXDomain/Refused while mustResolve is set
+ };
+
DownstreamState(DownstreamState::Config&& config, std::shared_ptr<TLSCtx> tlsCtx, bool connect);
DownstreamState(const ComboAddress& remote): DownstreamState(DownstreamState::Config(remote), nullptr, false)
{
~DownstreamState();
Config d_config;
+ HealthCheckMetrics d_healthCheckMetrics;
stat_t sendErrors{0};
stat_t outstanding{0};
stat_t reuseds{0};
double latencyUsecTCP{0.0};
unsigned int d_nextCheck{0};
uint16_t currentCheckFailures{0};
- uint8_t consecutiveSuccessfulChecks{0};
std::atomic<bool> hashesComputed{false};
std::atomic<bool> connected{false};
bool upStatus{false};
private:
+ void handleUDPTimeout(IDState& ids);
+ void updateNextLazyHealthCheck(LazyHealthCheckStats& stats, bool checkScheduled, std::optional<time_t> currentTime = std::nullopt);
void connectUDPSockets();
std::thread tid;
std::mutex connectLock;
std::condition_variable d_connectedWait;
std::atomic_flag threadStarted;
+ uint8_t consecutiveSuccessfulChecks{0};
bool d_stopped{false};
public:
static int s_udpTimeout;
static bool s_randomizeSockets;
static bool s_randomizeIDs;
-private:
- void handleUDPTimeout(IDState& ids);
- void updateNextLazyHealthCheck(LazyHealthCheckStats& stats, bool checkScheduled, std::optional<time_t> currentTime = std::nullopt);
};
using servers_t = vector<std::shared_ptr<DownstreamState>>;
void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
{
+ if (!newResult) {
+ ++d_healthCheckMetrics.d_failures;
+ }
+
if (initial) {
/* if this is the initial health-check, at startup, we do not care
about the minimum number of failed/successful health-checks */
auto& ds = data->d_ds;
try {
if (data->d_buffer.size() < sizeof(dnsheader)) {
+ ++data->d_ds->d_healthCheckMetrics.d_parseErrors;
if (g_verboseHealthChecks) {
infolog("Invalid health check response of size %d from backend %s, expecting at least %d", data->d_buffer.size(), ds->getNameWithAddr(), sizeof(dnsheader));
}
const dnsheader * responseHeader = reinterpret_cast<const dnsheader*>(data->d_buffer.data());
if (responseHeader->id != data->d_queryID) {
+ ++data->d_ds->d_healthCheckMetrics.d_mismatchErrors;
if (g_verboseHealthChecks) {
infolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds->getNameWithAddr(), data->d_queryID);
}
}
if (!responseHeader->qr) {
+ ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
if (g_verboseHealthChecks) {
infolog("Invalid health check response from backend %s, expecting QR to be set", ds->getNameWithAddr());
}
}
if (responseHeader->rcode == RCode::ServFail) {
+ ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
if (g_verboseHealthChecks) {
infolog("Backend %s responded to health check with ServFail", ds->getNameWithAddr());
}
}
if (ds->d_config.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) {
+ ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
if (g_verboseHealthChecks) {
infolog("Backend %s responded to health check with %s while mustResolve is set", ds->getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? "NXDomain" : "Refused");
}
DNSName receivedName(reinterpret_cast<const char*>(data->d_buffer.data()), data->d_buffer.size(), sizeof(dnsheader), false, &receivedType, &receivedClass);
if (receivedName != data->d_checkName || receivedType != data->d_checkType || receivedClass != data->d_checkClass) {
+ ++data->d_ds->d_healthCheckMetrics.d_mismatchErrors;
if (g_verboseHealthChecks) {
infolog("Backend %s responded to health check with an invalid qname (%s vs %s), qtype (%s vs %s) or qclass (%d vs %d)", ds->getNameWithAddr(), receivedName.toLogString(), data->d_checkName.toLogString(), QType(receivedType).toString(), QType(data->d_checkType).toString(), receivedClass, data->d_checkClass);
}
return false;
}
}
- catch(const std::exception& e) {
+ catch (const std::exception& e) {
+ ++data->d_ds->d_healthCheckMetrics.d_parseErrors;
if (g_verboseHealthChecks) {
infolog("Error checking the health of backend %s: %s", ds->getNameWithAddr(), e.what());
}
// I/O error hook for a health-check query: records the failure as a network
// error and reports a failed check for the backend. Neither `query` nor `now`
// is used by this override.
void notifyIOError(InternalQueryState&& query, const struct timeval& now) override
{
+ ++d_data->d_ds->d_healthCheckMetrics.d_networkErrors; // per-cause counter; the total is handled by the call below
d_data->d_ds->submitHealthCheckResult(d_data->d_initial, false); // false = failed check, which also increments d_failures
}
if (g_verboseHealthChecks) {
infolog("Error receiving health check response from %s: %s", data->d_ds->d_config.remote.toStringWithPort(), stringerror(savederrno));
}
+ ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
data->d_ds->submitHealthCheckResult(data->d_initial, false);
return;
}
if (g_verboseHealthChecks) {
infolog("Invalid health check response received from %s, expecting one from %s", from.toStringWithPort(), data->d_ds->d_config.remote.toStringWithPort());
}
+ ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
data->d_ds->submitHealthCheckResult(data->d_initial, false);
return;
}
ioGuard.release();
}
catch (const std::exception& e) {
+ ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
data->d_ds->submitHealthCheckResult(data->d_initial, false);
if (g_verboseHealthChecks) {
infolog("Error checking the health of backend %s: %s", data->d_ds->getNameWithAddr(), e.what());
infolog("Timeout while waiting for the health check response (ID %d) from backend %s", data->d_queryID, data->d_ds->getNameWithAddr());
}
+ ++data->d_ds->d_healthCheckMetrics.d_timeOuts;
data->d_ds->submitHealthCheckResult(initial, false);
}
catch (const std::exception& e) {
infolog("Timeout while waiting for the health check response (ID %d) from backend %s", data->d_queryID, data->d_ds->getNameWithAddr());
}
+ ++data->d_ds->d_healthCheckMetrics.d_timeOuts;
data->d_ds->submitHealthCheckResult(initial, false);
}
catch (const std::exception& e) {
:property integer tlsResumptions: The number of times a TLS session has been resumed
:property integer weight: The weight assigned to this server
:property float dropRate: The amount of packets dropped (timing out) per second by this server
+ :property integer healthCheckFailures: Number of health check attempts that failed (total)
+ :property integer healthCheckFailuresParsing: Number of health check attempts that failed because the payload could not be parsed
+ :property integer healthCheckFailuresTimeout: Number of health check attempts that failed because the response was not received in time
+ :property integer healthCheckFailuresNetwork: Number of health check attempts that failed because of a network error
+ :property integer healthCheckFailuresMismatch: Number of health check attempts that failed because the ID, qname, qtype or qclass did not match
+ :property integer healthCheckFailuresInvalid: Number of health check attempts that failed because the DNS response was not valid
.. json:object:: StatisticItem
'dropRate', 'responses', 'nonCompliantResponses', 'tcpDiedSendingQuery', 'tcpDiedReadingResponse',
'tcpGaveUp', 'tcpReadTimeouts', 'tcpWriteTimeouts', 'tcpCurrentConnections',
'tcpNewConnections', 'tcpReusedConnections', 'tlsResumptions', 'tcpAvgQueriesPerConnection',
- 'tcpAvgConnectionDuration', 'tcpLatency', 'protocol']:
+ 'tcpAvgConnectionDuration', 'tcpLatency', 'protocol', 'healthCheckFailures', 'healthCheckFailuresParsing', 'healthCheckFailuresTimeout', 'healthCheckFailuresNetwork', 'healthCheckFailuresMismatch', 'healthCheckFailuresInvalid']:
self.assertIn(key, server)
for key in ['id', 'latency', 'weight', 'outstanding', 'qpsLimit', 'reuseds',
#!/usr/bin/env python
import base64
+import requests
+import ssl
import threading
import time
-import ssl
import dns
from dnsdisttests import DNSDistTest
class HealthCheckTest(DNSDistTest):
_consoleKey = DNSDistTest.generateConsoleKey()
_consoleKeyB64 = base64.b64encode(_consoleKey).decode('ascii')
- _config_params = ['_consoleKeyB64', '_consolePort', '_testServerPort']
+ _webTimeout = 2.0
+ _webServerPort = 8083
+ _webServerAPIKey = 'apisecret'
+ _webServerAPIKeyHashed = '$scrypt$ln=10,p=1,r=8$9v8JxDfzQVyTpBkTbkUqYg==$bDQzAOHeK1G9UvTPypNhrX48w974ZXbFPtRKS34+aso='
+ _config_params = ['_consoleKeyB64', '_consolePort', '_webServerPort', '_webServerAPIKeyHashed', '_testServerPort']
_config_template = """
setKey("%s")
controlSocket("127.0.0.1:%d")
+ webserver("127.0.0.1:%s")
+ setWebserverConfig({apiKey="%s"})
newServer{address="127.0.0.1:%d"}
"""
# Query the console for the state of server 0 and return the reply with
# newlines stripped; the Lua snippet itself yields either 'up' or 'down'.
def getBackendStatus(self):
return self.sendConsoleCommand("if getServer(0):isUp() then return 'up' else return 'down' end").strip("\n")
+ def getBackendMetric(self, backendID, metricName):  # fetch one per-backend metric from the web API and return it as an int
+ headers = {'x-api-key': self._webServerAPIKey}  # plaintext key; the config template is given the hashed form instead
+ url = 'http://127.0.0.1:' + str(self._webServerPort) + '/api/v1/servers/localhost'
+ r = requests.get(url, headers=headers, timeout=self._webTimeout)
+ self.assertTrue(r)  # a requests.Response is truthy only when its status code is below 400
+ self.assertEqual(r.status_code, 200)
+ self.assertTrue(r.json())
+ content = r.json()
+ self.assertIn('servers', content)
+ servers = content['servers']
+ server = servers[backendID]  # backendID selects the entry inside content['servers']
+ return int(server[metricName])  # the C++ side builds these values as doubles, so normalize to int
+
class TestDefaultHealthCheck(HealthCheckTest):
# this test suite uses a different responder port
# because we need fresh counters
before = TestDefaultHealthCheck._healthCheckCounter
time.sleep(1.5)
self.assertGreater(TestDefaultHealthCheck._healthCheckCounter, before)
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
self.assertEqual(self.getBackendStatus(), 'up')
self.sendConsoleCommand("getServer(0):setUp()")
time.sleep(1.5)
self.assertGreater(TestDefaultHealthCheck._healthCheckCounter, before)
self.assertEqual(self.getBackendStatus(), 'up')
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
class TestHealthCheckForcedUP(HealthCheckTest):
# this test suite uses a different responder port
_config_template = """
setKey("%s")
controlSocket("127.0.0.1:%d")
+ webserver("127.0.0.1:%s")
+ setWebserverConfig({apiKey="%s"})
srv = newServer{address="127.0.0.1:%d"}
srv:setUp()
"""
time.sleep(1.5)
self.assertEqual(TestHealthCheckForcedUP._healthCheckCounter, before)
self.assertEqual(self.getBackendStatus(), 'up')
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
class TestHealthCheckForcedDown(HealthCheckTest):
# this test suite uses a different responder port
_config_template = """
setKey("%s")
controlSocket("127.0.0.1:%d")
+ webserver("127.0.0.1:%s")
+ setWebserverConfig({apiKey="%s"})
srv = newServer{address="127.0.0.1:%d"}
srv:setDown()
"""
before = TestHealthCheckForcedDown._healthCheckCounter
time.sleep(1.5)
self.assertEqual(TestHealthCheckForcedDown._healthCheckCounter, before)
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
class TestHealthCheckCustomName(HealthCheckTest):
# this test suite uses a different responder port
_testServerPort = 5383
_healthCheckName = 'powerdns.com.'
- _config_params = ['_consoleKeyB64', '_consolePort', '_testServerPort', '_healthCheckName']
+ _config_params = ['_consoleKeyB64', '_consolePort', '_webServerPort', '_webServerAPIKeyHashed', '_testServerPort', '_healthCheckName']
_config_template = """
setKey("%s")
controlSocket("127.0.0.1:%d")
+ webserver("127.0.0.1:%s")
+ setWebserverConfig({apiKey="%s"})
srv = newServer{address="127.0.0.1:%d", checkName='%s'}
"""
time.sleep(1.5)
self.assertGreater(TestHealthCheckCustomName._healthCheckCounter, before)
self.assertEqual(self.getBackendStatus(), 'up')
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
# this test suite uses a different responder port
_config_template = """
setKey("%s")
controlSocket("127.0.0.1:%d")
+ webserver("127.0.0.1:%s")
+ setWebserverConfig({apiKey="%s"})
srv = newServer{address="127.0.0.1:%d", checkName='powerdns.com.'}
"""
time.sleep(1.5)
self.assertEqual(TestHealthCheckCustomNameNoAnswer._healthCheckCounter, before)
self.assertEqual(self.getBackendStatus(), 'down')
+ self.assertGreater(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+ self.assertGreater(self.getBackendMetric(0, 'healthCheckFailuresTimeout'), 0)
class TestHealthCheckCustomFunction(HealthCheckTest):
# this test suite uses a different responder port
_config_template = """
setKey("%s")
controlSocket("127.0.0.1:%d")
+ webserver("127.0.0.1:%s")
+ setWebserverConfig({apiKey="%s"})
function myHealthCheckFunction(qname, qtype, qclass, dh)
dh:setCD(true)