From: Remi Gacogne
Date: Fri, 19 Dec 2025 10:14:38 +0000 (+0100)
Subject: dnsdist: Add a histogram of health-check latencies for backends
X-Git-Tag: rec-5.4.0-beta1~47^2~4
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8f607f9688fbfd83f38c52a680f85f4f6a8fcb9c;p=thirdparty%2Fpdns.git

dnsdist: Add a histogram of health-check latencies for backends

Signed-off-by: Remi Gacogne
---

diff --git a/pdns/dnsdistdist/dnsdist-healthchecks.cc b/pdns/dnsdistdist/dnsdist-healthchecks.cc
index aed2420cd4..52b028ee5b 100644
--- a/pdns/dnsdistdist/dnsdist-healthchecks.cc
+++ b/pdns/dnsdistdist/dnsdist-healthchecks.cc
@@ -61,6 +61,38 @@ struct HealthCheckData
   bool d_initial{false};
 };
 
+/* Record the latency of a successful health check for this backend:
+   store it as the last observed value, then count it in exactly one
+   histogram bucket and add it to the running sum and sample count.
+   Bucket bounds are compared in microseconds (1000 microseconds = 1 ms). */
+static void updateLatencyMetrics(DownstreamState& downstream, int elapsed /* microseconds */)
+{
+  auto& histo = downstream.d_healthCheckLatencyHisto;
+  downstream.d_healthCheckLatency.store(elapsed);
+
+  if (elapsed < 1000) {
+    ++histo.latency0_1;
+  }
+  else if (elapsed < 10000) {
+    ++histo.latency1_10;
+  }
+  else if (elapsed < 50000) {
+    ++histo.latency10_50;
+  }
+  else if (elapsed < 100000) {
+    ++histo.latency50_100;
+  }
+  else if (elapsed < 1000000) {
+    ++histo.latency100_1000;
+  }
+  else {
+    ++histo.latencySlow;
+  }
+
+  histo.latencySum += static_cast<uint64_t>(elapsed) / 1000;
+  ++histo.latencyCount;
+}
+
 static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
 {
   const auto verboseHealthChecks = dnsdist::configuration::getCurrentRuntimeConfiguration().d_verboseHealthChecks;
@@ -134,7 +166,9 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     return false;
   }
 
-  data->d_ds->d_healthCheckLatency.store(data->d_elapsed.udiff());
+  const auto elapsed = data->d_elapsed.udiff();
+  updateLatencyMetrics(*data->d_ds, elapsed);
+
   return true;
 }
 
diff --git a/pdns/dnsdistdist/dnsdist-web.cc b/pdns/dnsdistdist/dnsdist-web.cc
index 3103742a39..92a8a79abc 100644
--- a/pdns/dnsdistdist/dnsdist-web.cc
+++ b/pdns/dnsdistdist/dnsdist-web.cc
@@ -612,8 +612,11 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << statesbase << "healthcheckfailuresmismatch " << "counter" << "\n";
   output << "# HELP " << statesbase << "healthcheckfailuresinvalid " << "Number of health check attempts where the DNS response was invalid" << "\n";
   output << "# TYPE " << statesbase << "healthcheckfailuresinvalid " << "counter" << "\n";
-  output << "# HELP " << statesbase << "healthchecklatency " << "Latency of the last successful health check attempt, in milliseconds" << "\n";
+  output << "# HELP " << statesbase << "healthchecklatency " << "Latency of the last successful health check attempt, in milliseconds" << "\n";
   output << "# TYPE " << statesbase << "healthchecklatency " << "gauge" << "\n";
+  // Backend latency histogram buckets
+  output << "# HELP " << statesbase << "healthchecklatency_histo " << "Histogram of the latency of the successful health check attempts, in milliseconds" << "\n";
+  output << "# TYPE " << statesbase << "healthchecklatency_histo " << "histogram" << "\n";
 
   for (const auto& state : dnsdist::configuration::getCurrentRuntimeConfiguration().d_backends) {
     string serverName;
@@ -664,6 +667,27 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     output << statesbase << "healthcheckfailuresmismatch" << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
     output << statesbase << "healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
     output << statesbase << "healthchecklatency" << label << " " << state->d_healthCheckLatency / 1000.0 << "\n";
+
+    // Health-check latency histogram
+    // Buckets are cumulative: each counter is added to the running total exactly once, so that "+Inf" covers every bucket
+    const std::string latency_label_prefix = boost::str(boost::format(R"({server="%1%",address="%2%")")
+                                                        % serverName % state->d_config.remote.toStringWithPort());
+    uint64_t backend_latency_amount = 0;
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency0_1;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"1\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency1_10;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"10\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency10_50;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"50\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency50_100;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"100\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latency100_1000;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"1000\"} " << backend_latency_amount << "\n";
+    backend_latency_amount += state->d_healthCheckLatencyHisto.latencySlow;
+    output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"+Inf\"} " << backend_latency_amount << "\n";
+
+    output << statesbase << "healthchecklatency_histo_sum" << label << " " << state->d_healthCheckLatencyHisto.latencySum << "\n";
+    output << statesbase << "healthchecklatency_histo_count" << label << " " << state->d_healthCheckLatencyHisto.latencyCount << "\n";
   }
 
   const string frontsbase = "dnsdist_frontend_";
diff --git a/pdns/dnsdistdist/dnsdist.hh b/pdns/dnsdistdist/dnsdist.hh
index 355d67f4ed..b71d867665 100644
--- a/pdns/dnsdistdist/dnsdist.hh
+++ b/pdns/dnsdistdist/dnsdist.hh
@@ -641,6 +641,21 @@ struct DownstreamState : public std::enable_shared_from_this<DownstreamState>
     stat_t d_invalidResponseErrors{0};
   };
 
+  /* Health-check latency histogram: one counter per bucket (the member
+     names give the bucket bounds in milliseconds), plus the sum of the
+     recorded latencies in milliseconds and the number of samples. */
+  struct HealthCheckLatencyHisto
+  {
+    stat_t latency0_1{0};
+    stat_t latency1_10{0};
+    stat_t latency10_50{0};
+    stat_t latency50_100{0};
+    stat_t latency100_1000{0};
+    stat_t latencySlow{0};
+    stat_t latencySum{0};
+    stat_t latencyCount{0};
+  };
+
   DownstreamState(DownstreamState::Config&& config, std::shared_ptr<TLSCtx> tlsCtx, bool connect);
   DownstreamState(const ComboAddress& remote) :
     DownstreamState(DownstreamState::Config(remote), nullptr, false)
@@ -714,6 +729,7 @@ public:
   std::vector<std::shared_ptr<XskWorker>> d_xskInfos;
   std::vector<std::shared_ptr<XskSocket>> d_xskSockets;
 #endif
+  HealthCheckLatencyHisto d_healthCheckLatencyHisto{};
   std::atomic<uint64_t> idOffset{0};
   std::atomic<double> d_healthCheckLatency{0.0};
   size_t socketsOffset{0};