str << base << "healthcheckfailuresnetwork" << ' ' << state->d_healthCheckMetrics.d_networkErrors << " " << now << "\r\n";
str << base << "healthcheckfailuresmismatch" << ' ' << state->d_healthCheckMetrics.d_mismatchErrors << " " << now << "\r\n";
str << base << "healthcheckfailuresinvalid" << ' ' << state->d_healthCheckMetrics.d_invalidResponseErrors << " " << now << "\r\n";
+ str << base << "healthchecklatency" << ' ' << (state->d_healthCheckLatency / 1000.0) << " " << now << "\r\n";
}
std::map<std::string, uint64_t> frontendDuplicates;
PacketBuffer d_buffer;
Socket d_udpSocket;
DNSName d_checkName;
- struct timeval d_ttd{
- 0, 0};
+ StopWatch d_elapsed{false};
+ timeval d_ttd{0, 0};
size_t d_bufferPos{0};
uint16_t d_checkType;
uint16_t d_checkClass;
return false;
}
+ data->d_ds->d_healthCheckLatency = data->d_elapsed.udiff();
return true;
}
data->d_ttd.tv_sec += static_cast<decltype(data->d_ttd.tv_sec)>(downstream->d_config.checkTimeout / 1000); /* ms to seconds */
data->d_ttd.tv_usec += static_cast<decltype(data->d_ttd.tv_usec)>((downstream->d_config.checkTimeout % 1000) * 1000); /* remaining ms to us */
normalizeTV(data->d_ttd);
+ data->d_elapsed.start();
if (!downstream->doHealthcheckOverTCP()) {
sock.connect(downstream->d_config.remote);
output << "# TYPE " << statesbase << "healthcheckfailuresmismatch " << "counter" << "\n";
output << "# HELP " << statesbase << "healthcheckfailuresinvalid " << "Number of health check attempts where the DNS response was invalid" << "\n";
output << "# TYPE " << statesbase << "healthcheckfailuresinvalid " << "counter" << "\n";
+ /* gauge, not counter: the value is overwritten by each successful health check */
+ output << "# HELP " << statesbase << "healthchecklatency " << "Latency of the last successful health check attempt, in milliseconds" << "\n";
+ output << "# TYPE " << statesbase << "healthchecklatency " << "gauge" << "\n";
for (const auto& state : dnsdist::configuration::getCurrentRuntimeConfiguration().d_backends) {
string serverName;
output << statesbase << "healthcheckfailurestimeout" << label << " " << state->d_healthCheckMetrics.d_timeOuts << "\n";
output << statesbase << "healthcheckfailuresnetwork" << label << " " << state->d_healthCheckMetrics.d_networkErrors << "\n";
output << statesbase << "healthcheckfailuresmismatch" << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
- output << statesbase << "healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
+ output << statesbase << "healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
+ output << statesbase << "healthchecklatency" << label << " " << state->d_healthCheckLatency / 1000.0 << "\n";
}
const string frontsbase = "dnsdist_frontend_";
{"healthCheckFailuresNetwork", (double)(backend->d_healthCheckMetrics.d_networkErrors)},
{"healthCheckFailuresMismatch", (double)(backend->d_healthCheckMetrics.d_mismatchErrors)},
{"healthCheckFailuresInvalid", (double)(backend->d_healthCheckMetrics.d_invalidResponseErrors)},
+ {"healthCheckLatency", (double)(backend->d_healthCheckLatency / 1000.0)},
{"dropRate", (double)backend->dropRate}};
/* sending a latency for a DOWN server doesn't make sense */
size_t socketsOffset{0};
double latencyUsec{0.0};
double latencyUsecTCP{0.0};
+ double d_healthCheckLatency{0.0}; /* duration of the last successful health check, in microseconds: exporters divide it by 1000.0 to report milliseconds */
unsigned int d_nextCheck{0};
uint16_t currentCheckFailures{0};
std::atomic<bool> hashesComputed{false};
_healthQueue = queue.Queue()
_dropHealthCheck = False
+ _delayResponse = None
@classmethod
def startResponders(cls):
cls._healthQueue.put(False)
return ResponderDropAction()
response = dns.message.make_response(request)
+ if cls._delayResponse is not None:
+ time.sleep(cls._delayResponse)
cls._healthQueue.put(True)
return response.to_wire()
def setDrop(cls, flag=True):
cls._dropHealthCheck = flag
+ @classmethod
+ def setDelay(cls, delay):
+ """
+ Set how long, in seconds, the health check responder sleeps before
+ signalling the health queue and returning its response.
+ Passing None disables the delay.
+ """
+ cls._delayResponse = delay
+
class TestUpdateHCParamsCombo1(HealthCheckUpdateParams):
# this test suite uses a different responder port
# now should timeout and failure increased
self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), beforeFailure+1)
+class TestHealthCheckLatency(HealthCheckUpdateParams):
+
+ # this test suite uses a different responder port
+ _testServerPort = pickAvailablePort()
+
+ def testLatency(self):
+ """
+ HealthChecks: Check latency
+ """
+ # consume health checks upon sys init
+ try:
+ while self.wait1(False): pass
+ except queue.Empty: pass
+
+ self.assertEqual(self.wait1(), True)
+ time.sleep(0.1)
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+ self.assertEqual(self.getBackendStatus(), 'up')
+ latency = self.getBackendMetric(0, 'healthCheckLatency')
+ # less than 500 ms
+ self.assertLess(latency, 500)
+
+ # introduce 500 ms of latency
+ self.setDelay(0.5)
+
+ self.wait1(True)
+
+ # should have no failures, still up
+ self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+ self.assertEqual(self.getBackendStatus(), 'up')
+ latency = self.getBackendMetric(0, 'healthCheckLatency')
+ # should be at least 500 ms
+ self.assertGreaterEqual(latency, 500)
+
+ self.setDelay(None)
+
class TestServerStateChange(HealthCheckTest):
_healthQueue = queue.Queue()