From: Remi Gacogne Date: Tue, 26 Mar 2019 09:24:35 +0000 (+0100) Subject: dnsdist: Add more TCP metrics X-Git-Tag: dnsdist-1.4.0-alpha1~25^2~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a6e9e107f9d9be07d2cc4c1c7666cee171c80311;p=thirdparty%2Fpdns.git dnsdist: Add more TCP metrics --- diff --git a/pdns/dnsdist-carbon.cc b/pdns/dnsdist-carbon.cc index f8f4d943d8..b9b43a7140 100644 --- a/pdns/dnsdist-carbon.cc +++ b/pdns/dnsdist-carbon.cc @@ -95,6 +95,11 @@ try str<availability != DownstreamState::Availability::Down ? state->latencyUsec/1000.0 : 0) << " " << now << "\r\n"; str<sendErrors.load() << " " << now << "\r\n"; str<outstanding.load() << " " << now << "\r\n"; + str<tcpDiedSendingQuery.load() << " " << now << "\r\n"; + str<tcpDiedReadingResponse.load() << " " << now << "\r\n"; + str<tcpGaveUp.load() << " " << now << "\r\n"; + str<tcpReadTimeouts.load() << " " << now << "\r\n"; + str<tcpWriteTimeouts.load() << " " << now << "\r\n"; } for(const auto& front : g_frontends) { if (front->udpFD == -1 && front->tcpFD == -1) @@ -104,6 +109,11 @@ try boost::replace_all(frontName, ".", "_"); const string base = namespace_name + "." + hostname + "." + instance_name + ".frontends." + frontName + "."; str<queries.load() << " " << now << "\r\n"; + str<tcpDiedReadingQuery.load() << " " << now << "\r\n"; + str<tcpDiedSendingResponse.load() << " " << now << "\r\n"; + str<tcpGaveUp.load() << " " << now << "\r\n"; + str<tcpClientTimeouts.load() << " " << now << "\r\n"; + str<tcpDownstreamTimeouts.load() << " " << now << "\r\n"; } auto localPools = g_pools.getLocal(); for (const auto& entry : *localPools) { diff --git a/pdns/dnsdist-tcp.cc b/pdns/dnsdist-tcp.cc index f6f59b5abb..ebb12c8c31 100644 --- a/pdns/dnsdist-tcp.cc +++ b/pdns/dnsdist-tcp.cc @@ -125,6 +125,10 @@ static std::unique_ptr getConnectionToDownstream(std::shared_ptr& ds, std::unique_ptr&& socket) { + if (socket == nullptr) { + return; + } + const auto& it = t_downstreamSockets.find(ds->remote); if (it != t_downstreamSockets.end()) { auto& list = it->second; @@ -582,6 +586,7 @@ static void sendResponse(std::shared_ptr& state) } catch (const std::exception& e) { vinfolog("Got an exception while writing TCP response to %s: %s", state->d_ci.remote.toStringWithPort(), e.what()); + ++state->d_ci.cs->tcpDiedSendingResponse; handleNewIOState(state, IOState::Done, state->d_ci.fd, handleIOCallback); } } @@ -663,6 +668,8 @@ static void sendQueryToBackend(std::shared_ptr& stat state->d_downstreamSocket = getConnectionToDownstream(ds, state->d_downstreamFailures, state->d_freshDownstreamConnection); if (!state->d_downstreamSocket) { + ++ds->tcpGaveUp; + ++state->d_ci.cs->tcpGaveUp; vinfolog("Downstream connection to %s failed %d times in a row, giving up.", ds->getName(), state->d_downstreamFailures); return; } @@ -672,6 +679,8 @@ static void sendQueryToBackend(std::shared_ptr& stat return; } + ++ds->tcpGaveUp; + ++state->d_ci.cs->tcpGaveUp; vinfolog("Downstream connection to %s failed %u times in a row, giving up.", ds->getName(), state->d_downstreamFailures); } @@ -880,6 +889,13 @@ static void handleDownstreamIOCallback(int fd, FDMultiplexer::funcparam_t& param Let's just drop the connection */ vinfolog("Got an exception while handling (%s backend) TCP query from %s: %s", (state->d_lastIOState == IOState::NeedRead ? "reading from" : "writing to"), state->d_ci.remote.toStringWithPort(), e.what()); + if (state->d_state == IncomingTCPConnectionState::State::sendingQueryToBackend) { + ++state->d_ds->tcpDiedSendingQuery; + } + else { + ++state->d_ds->tcpDiedReadingResponse; + } + /* don't increase this counter when reusing connections */ if (state->d_freshDownstreamConnection) { ++state->d_downstreamFailures; @@ -977,6 +993,15 @@ static void handleIOCallback(int fd, FDMultiplexer::funcparam_t& param) but it might also be a real IO error or something else. Let's just drop the connection */ + if (state->d_state == IncomingTCPConnectionState::State::doingHandshake || + state->d_state == IncomingTCPConnectionState::State::readingQuerySize || + state->d_state == IncomingTCPConnectionState::State::readingQuery) { + ++state->d_ci.cs->tcpDiedReadingQuery; + } + else if (state->d_state == IncomingTCPConnectionState::State::sendingResponse) { + ++state->d_ci.cs->tcpDiedSendingResponse; + } + if (state->d_lastIOState == IOState::NeedWrite || state->d_readingFirstQuery) { vinfolog("Got an exception while handling (%s) TCP query from %s: %s", (state->d_lastIOState == IOState::NeedRead ? "reading" : "writing"), state->d_ci.remote.toStringWithPort(), e.what()); } @@ -1061,9 +1086,12 @@ void tcpClientThread(int pipefd) auto state = boost::any_cast>(conn.second); if (conn.first == state->d_ci.fd) { vinfolog("Timeout (read) from remote TCP client %s", state->d_ci.remote.toStringWithPort()); + ++state->d_ci.cs->tcpClientTimeouts; } else if (state->d_ds) { vinfolog("Timeout (read) from remote backend %s", state->d_ds->getName()); + ++state->d_ci.cs->tcpDownstreamTimeouts; + ++state->d_ds->tcpReadTimeouts; } data.mplexer->removeReadFD(conn.first); state->d_lastIOState = IOState::Done; @@ -1074,9 +1102,12 @@ void tcpClientThread(int pipefd) auto state = boost::any_cast>(conn.second); if (conn.first == state->d_ci.fd) { vinfolog("Timeout (write) from remote TCP client %s", state->d_ci.remote.toStringWithPort()); + ++state->d_ci.cs->tcpClientTimeouts; } else if (state->d_ds) { vinfolog("Timeout (write) from remote backend %s", state->d_ds->getName()); + ++state->d_ci.cs->tcpDownstreamTimeouts; + ++state->d_ds->tcpWriteTimeouts; } data.mplexer->removeWriteFD(conn.first); state->d_lastIOState = IOState::Done; diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index 9cb51a622b..1ee60dccd5 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -446,20 +446,30 @@ static void connectionThread(int sock, ComboAddress remote) auto states = g_dstates.getLocal(); const string statesbase = "dnsdist_server_"; - output << "# HELP " << statesbase << "queries " << "Amount of queries relayed to server" << "\n"; - output << "# TYPE " << statesbase << "queries " << "counter" << "\n"; - output << "# HELP " << statesbase << "drops " << "Amount of queries not answered by server" << "\n"; - output << "# TYPE " << statesbase << "drops " << "counter" << "\n"; - output << "# HELP " << statesbase << "latency " << "Server's latency when answering questions in miliseconds" << "\n"; - output << "# TYPE " << statesbase << "latency " << "gauge" << "\n"; - output << "# HELP " << statesbase << "senderrors " << "Total number of OS snd errors while relaying queries" << "\n"; - output << "# TYPE " << statesbase << "senderrors " << "counter" << "\n"; - output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n"; - output << "# TYPE " << statesbase << "outstanding " << "gauge" << "\n"; - output << "# HELP " << statesbase << "order " << "The order in which this server is picked" << "\n"; - output << "# TYPE " << statesbase << "order " << "gauge" << "\n"; - output << "# HELP " << statesbase << "weight " << "The weight within the order in which this server is picked" << "\n"; - output << "# TYPE " << statesbase << "weight " << "gauge" << "\n"; + output << "# HELP " << statesbase << "queries " << "Amount of queries relayed to server" << "\n"; + output << "# TYPE " << statesbase << "queries " << "counter" << "\n"; + output << "# HELP " << statesbase << "drops " << "Amount of queries not answered by server" << "\n"; + output << "# TYPE " << statesbase << "drops " << "counter" << "\n"; + output << "# HELP " << statesbase << "latency " << "Server's latency when answering questions in miliseconds" << "\n"; + output << "# TYPE " << statesbase << "latency " << "gauge" << "\n"; + output << "# HELP " << statesbase << "senderrors " << "Total number of OS snd errors while relaying queries" << "\n"; + output << "# TYPE " << statesbase << "senderrors " << "counter" << "\n"; + output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n"; + output << "# TYPE " << statesbase << "outstanding " << "gauge" << "\n"; + output << "# HELP " << statesbase << "order " << "The order in which this server is picked" << "\n"; + output << "# TYPE " << statesbase << "order " << "gauge" << "\n"; + output << "# HELP " << statesbase << "weight " << "The weight within the order in which this server is picked" << "\n"; + output << "# TYPE " << statesbase << "weight " << "gauge" << "\n"; + output << "# HELP " << statesbase << "tcpdiedsendingquery " << "The number of TCP I/O errors while sending the query" << "\n"; + output << "# TYPE " << statesbase << "tcpdiedsendingquery " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpdiedreadingresponse " << "The number of TCP I/O errors while reading the response" << "\n"; + output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpgaveup " << "The number of TCP connections failing after too many attempts" << "\n"; + output << "# TYPE " << statesbase << "tcpgaveup " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpreadtimeouts " << "The number of TCP read timeouts" << "\n"; + output << "# TYPE " << statesbase << "tcpreadtimeouts " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpwritetimeouts " << "The number of TCP write timeouts" << "\n"; + output << "# TYPE " << statesbase << "tcpwritetimeouts " << "counter" << "\n"; for (const auto& state : *states) { string serverName; @@ -474,13 +484,18 @@ static void connectionThread(int sock, ComboAddress remote) const std::string label = boost::str(boost::format("{server=\"%1%\",address=\"%2%\"}") % serverName % state->remote.toStringWithPort()); - output << statesbase << "queries" << label << " " << state->queries.load() << "\n"; - output << statesbase << "drops" << label << " " << state->reuseds.load() << "\n"; - output << statesbase << "latency" << label << " " << state->latencyUsec/1000.0 << "\n"; - output << statesbase << "senderrors" << label << " " << state->sendErrors.load() << "\n"; - output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n"; - output << statesbase << "order" << label << " " << state->order << "\n"; - output << statesbase << "weight" << label << " " << state->weight << "\n"; + output << statesbase << "queries" << label << " " << state->queries.load() << "\n"; + output << statesbase << "drops" << label << " " << state->reuseds.load() << "\n"; + output << statesbase << "latency" << label << " " << state->latencyUsec/1000.0 << "\n"; + output << statesbase << "senderrors" << label << " " << state->sendErrors.load() << "\n"; + output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n"; + output << statesbase << "order" << label << " " << state->order << "\n"; + output << statesbase << "weight" << label << " " << state->weight << "\n"; + output << statesbase << "tcpdiedsendingquery" << label << " " << state->tcpDiedSendingQuery << "\n"; + output << statesbase << "tcpdiedreadingresponse" << label << " " << state->tcpDiedReadingResponse << "\n"; + output << statesbase << "tcpgaveup" << label << " " << state->tcpGaveUp << "\n"; + output << statesbase << "tcpreadtimeouts" << label << " " << state->tcpReadTimeouts << "\n"; + output << statesbase << "tcpwritetimeouts" << label << " " << state->tcpWriteTimeouts << "\n"; } for (const auto& front : g_frontends) { @@ -562,6 +577,11 @@ static void connectionThread(int sock, ComboAddress remote) {"latency", (double)(a->latencyUsec/1000.0)}, {"queries", (double)a->queries}, {"sendErrors", (double)a->sendErrors}, + {"tcpDiedSendingQuery", (double)a->tcpDiedSendingQuery}, + {"tcpDiedReadingResponse", (double)a->tcpDiedReadingResponse}, + {"tcpGaveUp", (double)a->tcpGaveUp}, + {"tcpReadTimeouts", (double)a->tcpReadTimeouts}, + {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts}, {"dropRate", (double)a->dropRate} }; @@ -583,7 +603,12 @@ static void connectionThread(int sock, ComboAddress remote) { "address", front->local.toStringWithPort() }, { "udp", front->udpFD >= 0 }, { "tcp", front->tcpFD >= 0 }, - { "queries", (double) front->queries.load() } + { "queries", (double) front->queries.load() }, + { "tcpDiedReadingQuery", (double) front->tcpDiedReadingQuery.load() }, + { "tcpDiedSendingResponse", (double) front->tcpDiedSendingResponse.load() }, + { "tcpGaveUp", (double) front->tcpGaveUp.load() }, + { "tcpClientTimeouts", (double) front->tcpClientTimeouts }, + { "tcpDownstreamTimeouts", (double) front->tcpDownstreamTimeouts }, }; frontends.push_back(frontend); } diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index f76888c09c..9b35daaa62 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -586,6 +586,11 @@ struct ClientState std::shared_ptr dnscryptCtx{nullptr}; shared_ptr tlsFrontend; std::atomic queries{0}; + std::atomic tcpDiedReadingQuery{0}; + std::atomic tcpDiedSendingResponse{0}; + std::atomic tcpGaveUp{0}; + std::atomic tcpClientTimeouts{0}; + std::atomic tcpDownstreamTimeouts{0}; int udpFD{-1}; int tcpFD{-1}; bool muted{false}; @@ -720,6 +725,11 @@ struct DownstreamState std::atomic reuseds{0}; std::atomic queries{0}; } prev; + std::atomic tcpDiedSendingQuery{0}; + std::atomic tcpDiedReadingResponse{0}; + std::atomic tcpGaveUp{0}; + std::atomic tcpReadTimeouts{0}; + std::atomic tcpWriteTimeouts{0}; string name; size_t socketsOffset{0}; double queryLoad{0.0};