From: Remi Gacogne Date: Mon, 29 Sep 2025 10:28:16 +0000 (+0200) Subject: dnsdist: Add a metric for the latency of the latest health-check X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8b3b381462ec270fc81de7b68f5441a2f4ba8656;p=thirdparty%2Fpdns.git dnsdist: Add a metric for the latency of the latest health-check This is useful for monitoring purposes, since this latency is usually not impacted by the content of the backend cache. Signed-off-by: Remi Gacogne --- diff --git a/pdns/dnsdistdist/dnsdist-carbon.cc b/pdns/dnsdistdist/dnsdist-carbon.cc index d005f9552..98f254f14 100644 --- a/pdns/dnsdistdist/dnsdist-carbon.cc +++ b/pdns/dnsdistdist/dnsdist-carbon.cc @@ -114,6 +114,7 @@ static bool doOneCarbonExport(const Carbon::Endpoint& endpoint) str << base << "healthcheckfailuresnetwork" << ' ' << state->d_healthCheckMetrics.d_networkErrors << " " << now << "\r\n"; str << base << "healthcheckfailuresmismatch" << ' ' << state->d_healthCheckMetrics.d_mismatchErrors << " " << now << "\r\n"; str << base << "healthcheckfailuresinvalid" << ' ' << state->d_healthCheckMetrics.d_invalidResponseErrors << " " << now << "\r\n"; + str << base << "healthchecklatency" << ' ' << (state->d_healthCheckLatency / 1000.0) << " " << now << "\r\n"; } std::map frontendDuplicates; diff --git a/pdns/dnsdistdist/dnsdist-healthchecks.cc b/pdns/dnsdistdist/dnsdist-healthchecks.cc index be46106b0..e0d82f8bf 100644 --- a/pdns/dnsdistdist/dnsdist-healthchecks.cc +++ b/pdns/dnsdistdist/dnsdist-healthchecks.cc @@ -51,8 +51,8 @@ struct HealthCheckData PacketBuffer d_buffer; Socket d_udpSocket; DNSName d_checkName; - struct timeval d_ttd{ - 0, 0}; + StopWatch d_elapsed{false}; + timeval d_ttd{0, 0}; size_t d_bufferPos{0}; uint16_t d_checkType; uint16_t d_checkClass; @@ -134,6 +134,7 @@ static bool handleResponse(std::shared_ptr& data) return false; } + data->d_ds->d_healthCheckLatency = data->d_elapsed.udiff(); return true; } @@ -383,6 +384,7 @@ bool queueHealthCheck(std::unique_ptr& mplexer, 
const std::shared data->d_ttd.tv_sec += static_cast<decltype(data->d_ttd.tv_sec)>(downstream->d_config.checkTimeout / 1000); /* ms to seconds */ data->d_ttd.tv_usec += static_cast<decltype(data->d_ttd.tv_usec)>((downstream->d_config.checkTimeout % 1000) * 1000); /* remaining ms to us */ normalizeTV(data->d_ttd); + data->d_elapsed.start(); if (!downstream->doHealthcheckOverTCP()) { sock.connect(downstream->d_config.remote); diff --git a/pdns/dnsdistdist/dnsdist-web.cc b/pdns/dnsdistdist/dnsdist-web.cc index dfb617153..c04cf8cb8 100644 --- a/pdns/dnsdistdist/dnsdist-web.cc +++ b/pdns/dnsdistdist/dnsdist-web.cc @@ -612,6 +612,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) output << "# TYPE " << statesbase << "healthcheckfailuresmismatch " << "counter" << "\n"; output << "# HELP " << statesbase << "healthcheckfailuresinvalid " << "Number of health check attempts where the DNS response was invalid" << "\n"; output << "# TYPE " << statesbase << "healthcheckfailuresinvalid " << "counter" << "\n"; + output << "# HELP " << statesbase << "healthchecklatency " << "Latency of the last successful health check attempt, in milliseconds" << "\n"; + output << "# TYPE " << statesbase << "healthchecklatency " << "gauge" << "\n"; for (const auto& state : dnsdist::configuration::getCurrentRuntimeConfiguration().d_backends) { string serverName; @@ -660,7 +662,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) output << statesbase << "healthcheckfailurestimeout" << label << " " << state->d_healthCheckMetrics.d_timeOuts << "\n"; output << statesbase << "healthcheckfailuresnetwork" << label << " " << state->d_healthCheckMetrics.d_networkErrors << "\n"; output << statesbase << "healthcheckfailuresmismatch" << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n"; - output << statesbase << "healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n"; + output << statesbase << 
"healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n"; + output << statesbase << "healthchecklatency" << label << " " << state->d_healthCheckLatency / 1000.0 << "\n"; } const string frontsbase = "dnsdist_frontend_"; @@ -1124,6 +1127,7 @@ static void addServerToJSON(Json::array& servers, int identifier, const std::sha {"healthCheckFailuresNetwork", (double)(backend->d_healthCheckMetrics.d_networkErrors)}, {"healthCheckFailuresMismatch", (double)(backend->d_healthCheckMetrics.d_mismatchErrors)}, {"healthCheckFailuresInvalid", (double)(backend->d_healthCheckMetrics.d_invalidResponseErrors)}, + {"healthCheckLatency", (double)(backend->d_healthCheckLatency / 1000.0)}, {"dropRate", (double)backend->dropRate}}; /* sending a latency for a DOWN server doesn't make sense */ diff --git a/pdns/dnsdistdist/dnsdist.hh b/pdns/dnsdistdist/dnsdist.hh index d3cefa56c..ecc1caca6 100644 --- a/pdns/dnsdistdist/dnsdist.hh +++ b/pdns/dnsdistdist/dnsdist.hh @@ -722,6 +722,7 @@ public: size_t socketsOffset{0}; double latencyUsec{0.0}; double latencyUsecTCP{0.0}; + double d_healthCheckLatency{0.0}; unsigned int d_nextCheck{0}; uint16_t currentCheckFailures{0}; std::atomic hashesComputed{false}; diff --git a/regression-tests.dnsdist/test_HealthChecks.py b/regression-tests.dnsdist/test_HealthChecks.py index 8e5b9bce7..4c434ce03 100644 --- a/regression-tests.dnsdist/test_HealthChecks.py +++ b/regression-tests.dnsdist/test_HealthChecks.py @@ -416,6 +416,7 @@ class HealthCheckUpdateParams(HealthCheckTest): _healthQueue = queue.Queue() _dropHealthCheck = False + _delayResponse = None @classmethod def startResponders(cls): @@ -430,6 +431,8 @@ class HealthCheckUpdateParams(HealthCheckTest): cls._healthQueue.put(False) return ResponderDropAction() response = dns.message.make_response(request) + if cls._delayResponse is not None: + time.sleep(cls._delayResponse) cls._healthQueue.put(True) return response.to_wire() @@ -441,6 +444,10 @@ class 
HealthCheckUpdateParams(HealthCheckTest): def setDrop(cls, flag=True): cls._dropHealthCheck = flag + @classmethod + def setDelay(cls, delay): + cls._delayResponse = delay + class TestUpdateHCParamsCombo1(HealthCheckUpdateParams): # this test suite uses a different responder port @@ -548,6 +555,42 @@ class TestUpdateHCParamsCombo2(HealthCheckUpdateParams): # now should timeout and failure increased self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), beforeFailure+1) +class TestHealthCheckLatency(HealthCheckUpdateParams): + + # this test suite uses a different responder port + _testServerPort = pickAvailablePort() + + def testLatency(self): + """ + HealthChecks: Check latency + """ + # consume health checks upon sys init + try: + while self.wait1(False): pass + except queue.Empty: pass + + self.assertEqual(self.wait1(), True) + time.sleep(0.1) + self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0) + self.assertEqual(self.getBackendStatus(), 'up') + latency = self.getBackendMetric(0, 'healthCheckLatency') + # less than 500 ms + self.assertLess(latency, 500) + + # introduce 500 ms of latency + self.setDelay(0.5) + + self.wait1(True) + + # should have no failures, still up + self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0) + self.assertEqual(self.getBackendStatus(), 'up') + latency = self.getBackendMetric(0, 'healthCheckLatency') + # should be at least 500 ms + self.assertGreaterEqual(latency, 500) + + self.setDelay(None) + class TestServerStateChange(HealthCheckTest): _healthQueue = queue.Queue()