]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add a histogram of health-check latencies for backends
author Remi Gacogne <remi.gacogne@powerdns.com>
Fri, 19 Dec 2025 10:14:38 +0000 (11:14 +0100)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Fri, 19 Dec 2025 10:14:38 +0000 (11:14 +0100)
Signed-off-by: Remi Gacogne <remi.gacogne@powerdns.com>
pdns/dnsdistdist/dnsdist-healthchecks.cc
pdns/dnsdistdist/dnsdist-web.cc
pdns/dnsdistdist/dnsdist.hh

index aed2420cd4f1037f856dfb62f58f1be1edebf6c2..52b028ee5b989bd7c6a5f9f524e0e88eb866f257 100644 (file)
@@ -61,6 +61,34 @@ struct HealthCheckData
   bool d_initial{false};
 };
 
+static void updateLatencyMetrics(DownstreamState& downstream, int elapsed /* microseconds */)
+{ // Records one health-check latency sample for this backend: last-value gauge, one histogram bucket, sum and count.
+  auto& histo = downstream.d_healthCheckLatencyHisto;
+  downstream.d_healthCheckLatency.store(elapsed); // latest sample, stored as received (microseconds)
+
+  if (elapsed < 1000) { // under 1 ms
+    ++histo.latency0_1;
+  }
+  else if (elapsed < 10000) { // 1 ms up to (not including) 10 ms
+    ++histo.latency1_10;
+  }
+  else if (elapsed < 50000) { // 10 ms up to (not including) 50 ms
+    ++histo.latency10_50;
+  }
+  else if (elapsed < 100000) { // 50 ms up to (not including) 100 ms
+    ++histo.latency50_100;
+  }
+  else if (elapsed < 1000000) { // 100 ms up to (not including) 1000 ms
+    ++histo.latency100_1000;
+  }
+  else { // 1000 ms and above
+    ++histo.latencySlow;
+  }
+
+  histo.latencySum += static_cast<unsigned long>(elapsed) / 1000; // integer division: only whole milliseconds are added, a sub-millisecond sample adds 0
+  ++histo.latencyCount; // exactly one bucket and the count are bumped per call
+}
+
 static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
 {
   const auto verboseHealthChecks = dnsdist::configuration::getCurrentRuntimeConfiguration().d_verboseHealthChecks;
@@ -134,7 +162,9 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     return false;
   }
 
-  data->d_ds->d_healthCheckLatency.store(data->d_elapsed.udiff());
+  const auto elapsed = data->d_elapsed.udiff();
+  updateLatencyMetrics(*data->d_ds, elapsed);
+
   return true;
 }
 
index 3103742a3903e1c9f22c88f38ac542fe31e7ecc9..92a8a79abc003630c3cb59ee53460491a60d1428 100644 (file)
@@ -612,8 +612,11 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << statesbase << "healthcheckfailuresmismatch "     << "counter"                                                                              << "\n";
   output << "# HELP " << statesbase << "healthcheckfailuresinvalid "      << "Number of health check attempts where the DNS response was invalid"                   << "\n";
   output << "# TYPE " << statesbase << "healthcheckfailuresinvalid "      << "counter"                                                                              << "\n";
-  output << "# HELP " << statesbase << "healthchecklatency "              << "Latency of the last successful health check attempt, in milliseconds"                << "\n";
+  output << "# HELP " << statesbase << "healthchecklatency "              << "Latency of the last successful health check attempt, in milliseconds"                 << "\n";
   output << "# TYPE " << statesbase << "healthchecklatency "              << "gauge"                                                                                << "\n";
+  // Backend latency histogram buckets
+  output << "# HELP " << statesbase << "healthchecklatency_histo "        << "Histogram of the latency of the successful health check attempts, in milliseconds"    << "\n";
+  output << "# TYPE " << statesbase << "healthchecklatency_histo "        << "histogram"                                                                            << "\n";
 
   for (const auto& state : dnsdist::configuration::getCurrentRuntimeConfiguration().d_backends) {
     string serverName;
@@ -664,6 +667,28 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     output << statesbase << "healthcheckfailuresmismatch"      << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
     output << statesbase << "healthcheckfailuresinvalid"       << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
     output << statesbase << "healthchecklatency"               << label << " " << state->d_healthCheckLatency / 1000.0   << "\n";
+
+    // Health-check latency histogram
+    const std::string latency_label_prefix = boost::str(boost::format(R"({server="%1%",address="%2%")")
+                                                        % serverName % state->d_config.remote.toStringWithPort());
+    uint64_t backend_latency_amount = 0;
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency0_1;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"1\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency1_10;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"10\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency10_50;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"50\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency10_50;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"50\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency50_100;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"100\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency100_1000;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"1000\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latencySlow;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"+Inf\"} " << backend_latency_amount << "\n";
+
+    output << statesbase << "healthchecklatency_histo_sum" << label << " " << state->d_healthCheckLatencyHisto.latencySum << "\n";
+    output << statesbase << "healthchecklatency_histo_count" << label << " " << state->d_healthCheckLatencyHisto.latencyCount << "\n";
   }
 
   const string frontsbase = "dnsdist_frontend_";
index 355d67f4ed3b4bd94307501ff03322551ceb4633..b71d867665df738e30615e97ee54eb4101674356 100644 (file)
@@ -641,6 +641,18 @@ struct DownstreamState : public std::enable_shared_from_this<DownstreamState>
     stat_t d_invalidResponseErrors{0};
   };
 
+  struct HealthCheckLatencyHisto
+  { // Per-backend health-check latency histogram; each range counter is non-cumulative (a sample lands in exactly one)
+    stat_t latency0_1{0}; // samples under 1 ms
+    stat_t latency1_10{0}; // samples from 1 ms up to (not including) 10 ms
+    stat_t latency10_50{0}; // samples from 10 ms up to (not including) 50 ms
+    stat_t latency50_100{0}; // samples from 50 ms up to (not including) 100 ms
+    stat_t latency100_1000{0}; // samples from 100 ms up to (not including) 1000 ms
+    stat_t latencySlow{0}; // samples of 1000 ms and above
+    stat_t latencySum{0}; // running total of sample latencies, in whole milliseconds
+    stat_t latencyCount{0}; // number of samples recorded
+  };
+
   DownstreamState(DownstreamState::Config&& config, std::shared_ptr<TLSCtx> tlsCtx, bool connect);
   DownstreamState(const ComboAddress& remote) :
     DownstreamState(DownstreamState::Config(remote), nullptr, false)
@@ -714,6 +726,7 @@ public:
   std::vector<std::shared_ptr<XskWorker>> d_xskInfos;
   std::vector<std::shared_ptr<XskSocket>> d_xskSockets;
 #endif
+  HealthCheckLatencyHisto d_healthCheckLatencyHisto{};
   std::atomic<uint64_t> idOffset{0};
   std::atomic<double> d_healthCheckLatency{0.0};
   size_t socketsOffset{0};