git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add a metric for the latency of the latest health-check
author Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 29 Sep 2025 10:28:16 +0000 (12:28 +0200)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 29 Sep 2025 10:28:16 +0000 (12:28 +0200)
This is useful for monitoring purposes, since this latency is usually
not impacted by the content of the backend cache.

Signed-off-by: Remi Gacogne <remi.gacogne@powerdns.com>
pdns/dnsdistdist/dnsdist-carbon.cc
pdns/dnsdistdist/dnsdist-healthchecks.cc
pdns/dnsdistdist/dnsdist-web.cc
pdns/dnsdistdist/dnsdist.hh
regression-tests.dnsdist/test_HealthChecks.py

index d005f9552acb46340a3fea1d949f0eab8c0cf469..98f254f1435bd3d7830aa07420c988b488d1b91b 100644 (file)
@@ -114,6 +114,7 @@ static bool doOneCarbonExport(const Carbon::Endpoint& endpoint)
       str << base << "healthcheckfailuresnetwork" << ' ' << state->d_healthCheckMetrics.d_networkErrors << " " << now << "\r\n";
       str << base << "healthcheckfailuresmismatch" << ' ' << state->d_healthCheckMetrics.d_mismatchErrors << " " << now << "\r\n";
       str << base << "healthcheckfailuresinvalid" << ' ' << state->d_healthCheckMetrics.d_invalidResponseErrors << " " << now << "\r\n";
+      str << base << "healthchecklatency" << ' ' << (state->d_healthCheckLatency / 1000.0) << " " << now << "\r\n";
     }
 
     std::map<std::string, uint64_t> frontendDuplicates;
index be46106b01c57a05044312afc7bb369abba275e2..e0d82f8bf6c99848f0f06ce51f6de542d1d3e437 100644 (file)
@@ -51,8 +51,8 @@ struct HealthCheckData
   PacketBuffer d_buffer;
   Socket d_udpSocket;
   DNSName d_checkName;
-  struct timeval d_ttd{
-    0, 0};
+  StopWatch d_elapsed{false};
+  timeval d_ttd{0, 0};
   size_t d_bufferPos{0};
   uint16_t d_checkType;
   uint16_t d_checkClass;
@@ -134,6 +134,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     return false;
   }
 
+  data->d_ds->d_healthCheckLatency = data->d_elapsed.udiff();
   return true;
 }
 
@@ -383,6 +384,7 @@ bool queueHealthCheck(std::unique_ptr<FDMultiplexer>& mplexer, const std::shared
     data->d_ttd.tv_sec += static_cast<decltype(data->d_ttd.tv_sec)>(downstream->d_config.checkTimeout / 1000); /* ms to seconds */
     data->d_ttd.tv_usec += static_cast<decltype(data->d_ttd.tv_usec)>((downstream->d_config.checkTimeout % 1000) * 1000); /* remaining ms to us */
     normalizeTV(data->d_ttd);
+    data->d_elapsed.start();
 
     if (!downstream->doHealthcheckOverTCP()) {
       sock.connect(downstream->d_config.remote);
index dfb617153cb6fb8172dc56117c88397245d5388f..c04cf8cb8c51458e90356d77077de0b5d373ef7c 100644 (file)
@@ -612,6 +612,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << statesbase << "healthcheckfailuresmismatch "     << "counter"                                                                              << "\n";
   output << "# HELP " << statesbase << "healthcheckfailuresinvalid "      << "Number of health check attempts where the DNS response was invalid"                   << "\n";
   output << "# TYPE " << statesbase << "healthcheckfailuresinvalid "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthchecklatency "              << "Latency of the last successfull health check attempt, in milliseconds"                << "\n";
+  output << "# TYPE " << statesbase << "healthchecklatency "              << "gauge"                                                                                << "\n";
 
   for (const auto& state : dnsdist::configuration::getCurrentRuntimeConfiguration().d_backends) {
     string serverName;
@@ -660,7 +662,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     output << statesbase << "healthcheckfailurestimeout"       << label << " " << state->d_healthCheckMetrics.d_timeOuts << "\n";
     output << statesbase << "healthcheckfailuresnetwork"       << label << " " << state->d_healthCheckMetrics.d_networkErrors << "\n";
     output << statesbase << "healthcheckfailuresmismatch"      << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
-    output << statesbase << "healthcheckfailuresinvalid"        << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
+    output << statesbase << "healthcheckfailuresinvalid"       << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
+    output << statesbase << "healthchecklatency"               << label << " " << state->d_healthCheckLatency / 1000.0   << "\n";
   }
 
   const string frontsbase = "dnsdist_frontend_";
@@ -1124,6 +1127,7 @@ static void addServerToJSON(Json::array& servers, int identifier, const std::sha
     {"healthCheckFailuresNetwork", (double)(backend->d_healthCheckMetrics.d_networkErrors)},
     {"healthCheckFailuresMismatch", (double)(backend->d_healthCheckMetrics.d_mismatchErrors)},
     {"healthCheckFailuresInvalid", (double)(backend->d_healthCheckMetrics.d_invalidResponseErrors)},
+    {"healthCheckLatency", (double)(backend->d_healthCheckLatency / 1000.0)},
     {"dropRate", (double)backend->dropRate}};
 
   /* sending a latency for a DOWN server doesn't make sense */
index d3cefa56c3f207d56e49d73791c0bdd853a2c8f8..ecc1caca665c2b26d3063e5a2d33b94189c7dfaf 100644 (file)
@@ -722,6 +722,7 @@ public:
   size_t socketsOffset{0};
   double latencyUsec{0.0};
   double latencyUsecTCP{0.0};
+  double d_healthCheckLatency{0.0};
   unsigned int d_nextCheck{0};
   uint16_t currentCheckFailures{0};
   std::atomic<bool> hashesComputed{false};
index 8e5b9bce7e331d43c3c947e9a1ac34c805c9d793..4c434ce03c2ec898f295041cdb76f8b42c5b5f0e 100644 (file)
@@ -416,6 +416,7 @@ class HealthCheckUpdateParams(HealthCheckTest):
 
     _healthQueue = queue.Queue()
     _dropHealthCheck = False
+    _delayResponse = None
 
     @classmethod
     def startResponders(cls):
@@ -430,6 +431,8 @@ class HealthCheckUpdateParams(HealthCheckTest):
           cls._healthQueue.put(False)
           return ResponderDropAction()
         response = dns.message.make_response(request)
+        if cls._delayResponse is not None:
+            time.sleep(cls._delayResponse)
         cls._healthQueue.put(True)
         return response.to_wire()
 
@@ -441,6 +444,10 @@ class HealthCheckUpdateParams(HealthCheckTest):
     def setDrop(cls, flag=True):
         cls._dropHealthCheck = flag
 
+    @classmethod
+    def setDelay(cls, delay):
+        cls._delayResponse = delay
+
 class TestUpdateHCParamsCombo1(HealthCheckUpdateParams):
 
     # this test suite uses a different responder port
@@ -548,6 +555,42 @@ class TestUpdateHCParamsCombo2(HealthCheckUpdateParams):
         # now should timeout and failure increased
         self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), beforeFailure+1)
 
+class TestHealthCheckLatency(HealthCheckUpdateParams):
+
+    # this test suite uses a different responder port
+    _testServerPort = pickAvailablePort()
+
+    def testLatency(self):
+        """
+        HealthChecks: Check latency
+        """
+        # consume health checks upon sys init
+        try:
+          while self.wait1(False): pass
+        except queue.Empty: pass
+
+        self.assertEqual(self.wait1(), True)
+        time.sleep(0.1)
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+        self.assertEqual(self.getBackendStatus(), 'up')
+        latency = self.getBackendMetric(0, 'healthCheckLatency')
+        # less than 500 ms
+        self.assertLess(latency, 500)
+
+        # introduce 500 ms of latency
+        self.setDelay(0.5)
+
+        self.wait1(True)
+
+        # should have no failures, still up
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+        self.assertEqual(self.getBackendStatus(), 'up')
+        latency = self.getBackendMetric(0, 'healthCheckLatency')
+        # should be at least 500 ms
+        self.assertGreaterEqual(latency, 500)
+
+        self.setDelay(None)
+
 class TestServerStateChange(HealthCheckTest):
 
     _healthQueue = queue.Queue()