bool d_initial{false};
};
+// Record the latency of one health check for the given backend.
+//
+// `elapsed` is expressed in microseconds. The value is stored as the backend's
+// latest health-check latency, counted in exactly one histogram bucket, and
+// added (as whole milliseconds) to the running sum, after which the sample
+// count is incremented.
+static void updateLatencyMetrics(DownstreamState& downstream, int elapsed /* microseconds */)
+{
+  downstream.d_healthCheckLatency.store(elapsed);
+
+  auto& buckets = downstream.d_healthCheckLatencyHisto;
+
+  // Pick the single counter whose range contains this sample. The bounds are
+  // in microseconds, i.e. 1 ms, 10 ms, 50 ms, 100 ms and 1 s.
+  auto& bucket = [&buckets, elapsed]() -> decltype(buckets.latency0_1)& {
+    if (elapsed < 1000) {
+      return buckets.latency0_1;
+    }
+    if (elapsed < 10000) {
+      return buckets.latency1_10;
+    }
+    if (elapsed < 50000) {
+      return buckets.latency10_50;
+    }
+    if (elapsed < 100000) {
+      return buckets.latency50_100;
+    }
+    if (elapsed < 1000000) {
+      return buckets.latency100_1000;
+    }
+    return buckets.latencySlow;
+  }();
+  ++bucket;
+
+  // The sum is kept in milliseconds; the integer division drops the
+  // sub-millisecond part of each sample.
+  const auto elapsedMilliseconds = static_cast<unsigned long>(elapsed) / 1000;
+  buckets.latencySum += elapsedMilliseconds;
+  ++buckets.latencyCount;
+}
+
static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
{
const auto verboseHealthChecks = dnsdist::configuration::getCurrentRuntimeConfiguration().d_verboseHealthChecks;
return false;
}
- data->d_ds->d_healthCheckLatency.store(data->d_elapsed.udiff());
+ const auto elapsed = data->d_elapsed.udiff(); // elapsed time for this check; NOTE(review): passed below as the `int` microseconds argument, confirm the implicit conversion from udiff()'s return type is intended
+ updateLatencyMetrics(*data->d_ds, elapsed); // stores the latest latency on the backend and updates its latency histogram, sum and count
+
return true;
}
output << "# TYPE " << statesbase << "healthcheckfailuresmismatch " << "counter" << "\n";
output << "# HELP " << statesbase << "healthcheckfailuresinvalid " << "Number of health check attempts where the DNS response was invalid" << "\n";
output << "# TYPE " << statesbase << "healthcheckfailuresinvalid " << "counter" << "\n";
- output << "# HELP " << statesbase << "healthchecklatency " << "Latency of the last successful health check attempt, in milliseconds" << "\n";
+ output << "# HELP " << statesbase << "healthchecklatency " << "Latency of the last successful health check attempt, in milliseconds" << "\n";
output << "# TYPE " << statesbase << "healthchecklatency " << "gauge" << "\n";
+ // Backend latency histogram buckets
+ output << "# HELP " << statesbase << "healthchecklatency_histo " << "Histogram of the latency of the successful health check attempts, in milliseconds" << "\n";
+ output << "# TYPE " << statesbase << "healthchecklatency_histo " << "histogram" << "\n";
for (const auto& state : dnsdist::configuration::getCurrentRuntimeConfiguration().d_backends) {
string serverName;
output << statesbase << "healthcheckfailuresmismatch" << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
output << statesbase << "healthcheckfailuresinvalid" << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
output << statesbase << "healthchecklatency" << label << " " << state->d_healthCheckLatency / 1000.0 << "\n";
+
+ // Health-check latency histogram.
+ // Prometheus histogram buckets are cumulative: each "le" series reports the
+ // number of samples at or below its bound. The running total is therefore
+ // carried from one bucket to the next, and every per-range counter must be
+ // added exactly once, with each "le" value emitted only once per backend.
+ const std::string latency_label_prefix = boost::str(boost::format(R"({server="%1%",address="%2%")")
+ % serverName % state->d_config.remote.toStringWithPort());
+ uint64_t backend_latency_amount = 0;
+ backend_latency_amount += state->d_healthCheckLatencyHisto.latency0_1;
+ output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"1\"} " << backend_latency_amount << "\n";
+ backend_latency_amount += state->d_healthCheckLatencyHisto.latency1_10;
+ output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"10\"} " << backend_latency_amount << "\n";
+ backend_latency_amount += state->d_healthCheckLatencyHisto.latency10_50;
+ output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"50\"} " << backend_latency_amount << "\n";
+ backend_latency_amount += state->d_healthCheckLatencyHisto.latency50_100;
+ output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"100\"} " << backend_latency_amount << "\n";
+ backend_latency_amount += state->d_healthCheckLatencyHisto.latency100_1000;
+ output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"1000\"} " << backend_latency_amount << "\n";
+ // Everything slower than the last finite bound ends up in the +Inf bucket.
+ backend_latency_amount += state->d_healthCheckLatencyHisto.latencySlow;
+ output << statesbase << "healthchecklatency_histo_bucket" << latency_label_prefix << ",le=\"+Inf\"} " << backend_latency_amount << "\n";
+
+ // Sum (in milliseconds) and number of recorded samples for this backend.
+ output << statesbase << "healthchecklatency_histo_sum" << label << " " << state->d_healthCheckLatencyHisto.latencySum << "\n";
+ output << statesbase << "healthchecklatency_histo_count" << label << " " << state->d_healthCheckLatencyHisto.latencyCount << "\n";
}
const string frontsbase = "dnsdist_frontend_";
stat_t d_invalidResponseErrors{0};
};
+ struct HealthCheckLatencyHisto // per-backend health-check latency counters; each sample is counted in exactly one range below (not cumulative)
+ {
+ stat_t latency0_1{0}; // samples below 1 ms (elapsed < 1000 microseconds)
+ stat_t latency1_10{0}; // samples from 1 ms up to, but excluding, 10 ms
+ stat_t latency10_50{0}; // samples from 10 ms up to, but excluding, 50 ms
+ stat_t latency50_100{0}; // samples from 50 ms up to, but excluding, 100 ms
+ stat_t latency100_1000{0}; // samples from 100 ms up to, but excluding, 1 s
+ stat_t latencySlow{0}; // samples of 1 s or more
+ stat_t latencySum{0}; // running sum of the samples, each truncated to whole milliseconds before being added
+ stat_t latencyCount{0}; // number of samples recorded
+ };
+
DownstreamState(DownstreamState::Config&& config, std::shared_ptr<TLSCtx> tlsCtx, bool connect);
DownstreamState(const ComboAddress& remote) :
DownstreamState(DownstreamState::Config(remote), nullptr, false)
std::vector<std::shared_ptr<XskWorker>> d_xskInfos;
std::vector<std::shared_ptr<XskSocket>> d_xskSockets;
#endif
+ HealthCheckLatencyHisto d_healthCheckLatencyHisto{};
std::atomic<uint64_t> idOffset{0};
std::atomic<double> d_healthCheckLatency{0.0};
size_t socketsOffset{0};