From: Remi Gacogne Date: Tue, 21 Jun 2022 13:45:38 +0000 (+0200) Subject: dnsdist: Add a metric to track when the TCP conns limit is reached X-Git-Tag: auth-4.8.0-alpha0~39^2~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=02328e4ac65926d0273b92c14ea8189f3d9dddab;p=thirdparty%2Fpdns.git dnsdist: Add a metric to track when the TCP conns limit is reached --- diff --git a/pdns/dnsdist-carbon.cc b/pdns/dnsdist-carbon.cc index cae92a059e..1776f60241 100644 --- a/pdns/dnsdist-carbon.cc +++ b/pdns/dnsdist-carbon.cc @@ -107,6 +107,7 @@ void carbonDumpThread() str<tlsResumptions.load() << " " << now << "\r\n"; str<tcpAvgQueriesPerConnection.load() << " " << now << "\r\n"; str<tcpAvgConnectionDuration.load() << " " << now << "\r\n"; + str<tcpTooManyConcurrentConnections.load() << " " << now << "\r\n"; } std::map frontendDuplicates; diff --git a/pdns/dnsdist-lua-inspection.cc b/pdns/dnsdist-lua-inspection.cc index c95f23ca10..c631480982 100644 --- a/pdns/dnsdist-lua-inspection.cc +++ b/pdns/dnsdist-lua-inspection.cc @@ -629,13 +629,13 @@ void setupLuaInspection(LuaContext& luaCtx) ret << endl; ret << "Backends:" << endl; - fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-25d %-20d %-20d %-20d %-20d %-20d %-20d %-20d %-20f %-20f"); - ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Max concurrent conn" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Connect timeouts" % "Total connections" % "Reused connections" % "TLS resumptions" % "Avg queries/conn" % "Avg duration") << endl; + fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-25d %-20d %-20d %-20d %-20d %-20d %-20d %-20d %-20d %-20f %-20f"); + ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Max concurrent conn" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Connect timeouts" % "Too many conn" % "Total connections" % "Reused connections" % "TLS 
resumptions" % "Avg queries/conn" % "Avg duration") << endl; auto states = g_dstates.getLocal(); counter = 0; for(const auto& s : *states) { - ret << (fmt % counter % s->getName() % s->d_config.remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpMaxConcurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpConnectTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tlsResumptions % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl; + ret << (fmt % counter % s->getName() % s->d_config.remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpMaxConcurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpConnectTimeouts % s->tcpTooManyConcurrentConnections % s->tcpNewConnections % s->tcpReusedConnections % s->tlsResumptions % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl; ++counter; } diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index f0b7631f97..887d81bb19 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -516,54 +516,56 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) auto states = g_dstates.getLocal(); const string statesbase = "dnsdist_server_"; - output << "# HELP " << statesbase << "status " << "Whether this backend is up (1) or down (0)" << "\n"; - output << "# TYPE " << statesbase << "status " << "gauge" << "\n"; - output << "# HELP " << statesbase << "queries " << "Amount of queries relayed to server" << "\n"; - output << "# TYPE " << statesbase << "queries " << "counter" << "\n"; - output << "# HELP " << statesbase << "responses " << "Amount of responses received from this server" << "\n"; - output << "# TYPE " << statesbase << "responses " << "counter" << "\n"; - output << "# HELP " << statesbase << "noncompliantresponses " << "Amount of non-compliant responses received from 
this server" << "\n"; - output << "# TYPE " << statesbase << "noncompliantresponses " << "counter" << "\n"; - output << "# HELP " << statesbase << "drops " << "Amount of queries not answered by server" << "\n"; - output << "# TYPE " << statesbase << "drops " << "counter" << "\n"; - output << "# HELP " << statesbase << "latency " << "Server's latency when answering questions in milliseconds" << "\n"; - output << "# TYPE " << statesbase << "latency " << "gauge" << "\n"; - output << "# HELP " << statesbase << "senderrors " << "Total number of OS send errors while relaying queries" << "\n"; - output << "# TYPE " << statesbase << "senderrors " << "counter" << "\n"; - output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n"; - output << "# TYPE " << statesbase << "outstanding " << "gauge" << "\n"; - output << "# HELP " << statesbase << "order " << "The order in which this server is picked" << "\n"; - output << "# TYPE " << statesbase << "order " << "gauge" << "\n"; - output << "# HELP " << statesbase << "weight " << "The weight within the order in which this server is picked" << "\n"; - output << "# TYPE " << statesbase << "weight " << "gauge" << "\n"; - output << "# HELP " << statesbase << "tcpdiedsendingquery " << "The number of TCP I/O errors while sending the query" << "\n"; - output << "# TYPE " << statesbase << "tcpdiedsendingquery " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpdiedreadingresponse " << "The number of TCP I/O errors while reading the response" << "\n"; - output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpgaveup " << "The number of TCP connections failing after too many attempts" << "\n"; - output << "# TYPE " << statesbase << "tcpgaveup " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpconnecttimeouts " << "The number of TCP connect timeouts" << "\n"; - output << 
"# TYPE " << statesbase << "tcpconnecttimeouts " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpreadtimeouts " << "The number of TCP read timeouts" << "\n"; - output << "# TYPE " << statesbase << "tcpreadtimeouts " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpwritetimeouts " << "The number of TCP write timeouts" << "\n"; - output << "# TYPE " << statesbase << "tcpwritetimeouts " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpcurrentconnections " << "The number of current TCP connections" << "\n"; - output << "# TYPE " << statesbase << "tcpcurrentconnections " << "gauge" << "\n"; - output << "# HELP " << statesbase << "tcpmaxconcurrentconnections " << "The maximum number of concurrent TCP connections" << "\n"; - output << "# TYPE " << statesbase << "tcpmaxconcurrentconnections " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpnewconnections " << "The number of established TCP connections in total" << "\n"; - output << "# TYPE " << statesbase << "tcpnewconnections " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpreusedconnections " << "The number of times a TCP connection has been reused" << "\n"; - output << "# TYPE " << statesbase << "tcpreusedconnections " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcpavgqueriesperconn " << "The average number of queries per TCP connection" << "\n"; - output << "# TYPE " << statesbase << "tcpavgqueriesperconn " << "gauge" << "\n"; - output << "# HELP " << statesbase << "tcpavgconnduration " << "The average duration of a TCP connection (ms)" << "\n"; - output << "# TYPE " << statesbase << "tcpavgconnduration " << "gauge" << "\n"; - output << "# HELP " << statesbase << "tlsresumptions " << "The number of times a TLS session has been resumed" << "\n"; - output << "# TYPE " << statesbase << "tlsersumptions " << "counter" << "\n"; - output << "# HELP " << statesbase << "tcplatency " << "Server's latency when answering TCP 
questions in milliseconds" << "\n"; - output << "# TYPE " << statesbase << "tcplatency " << "gauge" << "\n"; + output << "# HELP " << statesbase << "status " << "Whether this backend is up (1) or down (0)" << "\n"; + output << "# TYPE " << statesbase << "status " << "gauge" << "\n"; + output << "# HELP " << statesbase << "queries " << "Amount of queries relayed to server" << "\n"; + output << "# TYPE " << statesbase << "queries " << "counter" << "\n"; + output << "# HELP " << statesbase << "responses " << "Amount of responses received from this server" << "\n"; + output << "# TYPE " << statesbase << "responses " << "counter" << "\n"; + output << "# HELP " << statesbase << "noncompliantresponses " << "Amount of non-compliant responses received from this server" << "\n"; + output << "# TYPE " << statesbase << "noncompliantresponses " << "counter" << "\n"; + output << "# HELP " << statesbase << "drops " << "Amount of queries not answered by server" << "\n"; + output << "# TYPE " << statesbase << "drops " << "counter" << "\n"; + output << "# HELP " << statesbase << "latency " << "Server's latency when answering questions in milliseconds" << "\n"; + output << "# TYPE " << statesbase << "latency " << "gauge" << "\n"; + output << "# HELP " << statesbase << "senderrors " << "Total number of OS send errors while relaying queries" << "\n"; + output << "# TYPE " << statesbase << "senderrors " << "counter" << "\n"; + output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n"; + output << "# TYPE " << statesbase << "outstanding " << "gauge" << "\n"; + output << "# HELP " << statesbase << "order " << "The order in which this server is picked" << "\n"; + output << "# TYPE " << statesbase << "order " << "gauge" << "\n"; + output << "# HELP " << statesbase << "weight " << "The weight within the order in which this server is picked" << "\n"; + output << "# TYPE " << statesbase << "weight " << "gauge" << "\n"; + 
output << "# HELP " << statesbase << "tcpdiedsendingquery " << "The number of TCP I/O errors while sending the query" << "\n"; + output << "# TYPE " << statesbase << "tcpdiedsendingquery " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpdiedreadingresponse " << "The number of TCP I/O errors while reading the response" << "\n"; + output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpgaveup " << "The number of TCP connections failing after too many attempts" << "\n"; + output << "# TYPE " << statesbase << "tcpgaveup " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpconnecttimeouts " << "The number of TCP connect timeouts" << "\n"; + output << "# TYPE " << statesbase << "tcpconnecttimeouts " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpreadtimeouts " << "The number of TCP read timeouts" << "\n"; + output << "# TYPE " << statesbase << "tcpreadtimeouts " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpwritetimeouts " << "The number of TCP write timeouts" << "\n"; + output << "# TYPE " << statesbase << "tcpwritetimeouts " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpcurrentconnections " << "The number of current TCP connections" << "\n"; + output << "# TYPE " << statesbase << "tcpcurrentconnections " << "gauge" << "\n"; + output << "# HELP " << statesbase << "tcpmaxconcurrentconnections " << "The maximum number of concurrent TCP connections" << "\n"; + output << "# TYPE " << statesbase << "tcpmaxconcurrentconnections " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcptoomanyconcurrentconnections " << "Number of times we had to enforce the maximum number of concurrent TCP connections" << "\n"; + output << "# TYPE " << statesbase << "tcptoomanyconcurrentconnections " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpnewconnections " << "The number of established TCP connections in 
total" << "\n"; + output << "# TYPE " << statesbase << "tcpnewconnections " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpreusedconnections " << "The number of times a TCP connection has been reused" << "\n"; + output << "# TYPE " << statesbase << "tcpreusedconnections " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpavgqueriesperconn " << "The average number of queries per TCP connection" << "\n"; + output << "# TYPE " << statesbase << "tcpavgqueriesperconn " << "gauge" << "\n"; + output << "# HELP " << statesbase << "tcpavgconnduration " << "The average duration of a TCP connection (ms)" << "\n"; + output << "# TYPE " << statesbase << "tcpavgconnduration " << "gauge" << "\n"; + output << "# HELP " << statesbase << "tlsresumptions " << "The number of times a TLS session has been resumed" << "\n"; + output << "# TYPE " << statesbase << "tlsresumptions " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcplatency " << "Server's latency when answering TCP questions in milliseconds" << "\n"; + output << "# TYPE " << statesbase << "tcplatency " << "gauge" << "\n"; for (const auto& state : *states) { string serverName; @@ -580,32 +582,33 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) const std::string label = boost::str(boost::format("{server=\"%1%\",address=\"%2%\"}") % serverName % state->d_config.remote.toStringWithPort()); - output << statesbase << "status" << label << " " << (state->isUp() ? "1" : "0") << "\n"; - output << statesbase << "queries" << label << " " << state->queries.load() << "\n"; - output << statesbase << "responses" << label << " " << state->responses.load() << "\n"; - output << statesbase << "noncompliantresponses" << label << " " << state->nonCompliantResponses.load()<< "\n"; - output << statesbase << "drops" << label << " " << state->reuseds.load() << "\n"; + output << statesbase << "status" << label << " " << (state->isUp() ? 
"1" : "0") << "\n"; + output << statesbase << "queries" << label << " " << state->queries.load() << "\n"; + output << statesbase << "responses" << label << " " << state->responses.load() << "\n"; + output << statesbase << "noncompliantresponses" << label << " " << state->nonCompliantResponses.load() << "\n"; + output << statesbase << "drops" << label << " " << state->reuseds.load() << "\n"; if (state->isUp()) { - output << statesbase << "latency" << label << " " << state->latencyUsec/1000.0 << "\n"; - output << statesbase << "tcplatency" << label << " " << state->latencyUsecTCP/1000.0 << "\n"; + output << statesbase << "latency" << label << " " << state->latencyUsec/1000.0 << "\n"; + output << statesbase << "tcplatency" << label << " " << state->latencyUsecTCP/1000.0 << "\n"; } - output << statesbase << "senderrors" << label << " " << state->sendErrors.load() << "\n"; - output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n"; - output << statesbase << "order" << label << " " << state->d_config.order << "\n"; - output << statesbase << "weight" << label << " " << state->d_config.d_weight << "\n"; - output << statesbase << "tcpdiedsendingquery" << label << " " << state->tcpDiedSendingQuery << "\n"; - output << statesbase << "tcpdiedreadingresponse" << label << " " << state->tcpDiedReadingResponse << "\n"; - output << statesbase << "tcpgaveup" << label << " " << state->tcpGaveUp << "\n"; - output << statesbase << "tcpreadtimeouts" << label << " " << state->tcpReadTimeouts << "\n"; - output << statesbase << "tcpwritetimeouts" << label << " " << state->tcpWriteTimeouts << "\n"; - output << statesbase << "tcpconnecttimeouts" << label << " " << state->tcpConnectTimeouts << "\n"; - output << statesbase << "tcpcurrentconnections" << label << " " << state->tcpCurrentConnections << "\n"; - output << statesbase << "tcpmaxconcurrentconnections" << label << " " << state->tcpMaxConcurrentConnections << "\n"; - output << statesbase << 
"tcpnewconnections" << label << " " << state->tcpNewConnections << "\n"; - output << statesbase << "tcpreusedconnections" << label << " " << state->tcpReusedConnections << "\n"; - output << statesbase << "tcpavgqueriesperconn" << label << " " << state->tcpAvgQueriesPerConnection << "\n"; - output << statesbase << "tcpavgconnduration" << label << " " << state->tcpAvgConnectionDuration << "\n"; - output << statesbase << "tlsresumptions" << label << " " << state->tlsResumptions << "\n"; + output << statesbase << "senderrors" << label << " " << state->sendErrors.load() << "\n"; + output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n"; + output << statesbase << "order" << label << " " << state->d_config.order << "\n"; + output << statesbase << "weight" << label << " " << state->d_config.d_weight << "\n"; + output << statesbase << "tcpdiedsendingquery" << label << " " << state->tcpDiedSendingQuery << "\n"; + output << statesbase << "tcpdiedreadingresponse" << label << " " << state->tcpDiedReadingResponse << "\n"; + output << statesbase << "tcpgaveup" << label << " " << state->tcpGaveUp << "\n"; + output << statesbase << "tcpreadtimeouts" << label << " " << state->tcpReadTimeouts << "\n"; + output << statesbase << "tcpwritetimeouts" << label << " " << state->tcpWriteTimeouts << "\n"; + output << statesbase << "tcpconnecttimeouts" << label << " " << state->tcpConnectTimeouts << "\n"; + output << statesbase << "tcpcurrentconnections" << label << " " << state->tcpCurrentConnections << "\n"; + output << statesbase << "tcpmaxconcurrentconnections" << label << " " << state->tcpMaxConcurrentConnections << "\n"; + output << statesbase << "tcptoomanyconcurrentconnections" << label << " " << state->tcpTooManyConcurrentConnections << "\n"; + output << statesbase << "tcpnewconnections" << label << " " << state->tcpNewConnections << "\n"; + output << statesbase << "tcpreusedconnections" << label << " " << state->tcpReusedConnections << "\n"; + 
output << statesbase << "tcpavgqueriesperconn" << label << " " << state->tcpAvgQueriesPerConnection << "\n"; + output << statesbase << "tcpavgconnduration" << label << " " << state->tcpAvgConnectionDuration << "\n"; + output << statesbase << "tlsresumptions" << label << " " << state->tlsResumptions << "\n"; } const string frontsbase = "dnsdist_frontend_"; @@ -1014,6 +1017,7 @@ static void addServerToJSON(Json::array& servers, int id, const std::shared_ptr< {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts}, {"tcpCurrentConnections", (double)a->tcpCurrentConnections}, {"tcpMaxConcurrentConnections", (double)a->tcpMaxConcurrentConnections}, + {"tcpTooManyConcurrentConnections", (double)a->tcpTooManyConcurrentConnections}, {"tcpNewConnections", (double)a->tcpNewConnections}, {"tcpReusedConnections", (double)a->tcpReusedConnections}, {"tcpAvgQueriesPerConnection", (double)a->tcpAvgQueriesPerConnection}, diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index 9b3397f9d6..0cf4f9b663 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -821,6 +821,8 @@ struct DownstreamState: public std::enable_shared_from_this stat_t tcpCurrentConnections{0}; /* maximum number of concurrent connections to this backend reached */ stat_t tcpMaxConcurrentConnections{0}; + /* number of times we had to enforce the maximum concurrent connections limit */ + stat_t tcpTooManyConcurrentConnections{0}; stat_t tcpReusedConnections{0}; stat_t tcpNewConnections{0}; stat_t tlsResumptions{0}; diff --git a/pdns/dnsdistdist/dnsdist-downstream-connection.hh b/pdns/dnsdistdist/dnsdist-downstream-connection.hh index b5c6a361a4..f13cbcff22 100644 --- a/pdns/dnsdistdist/dnsdist-downstream-connection.hh +++ b/pdns/dnsdistdist/dnsdist-downstream-connection.hh @@ -79,6 +79,7 @@ public: } if (ds->d_config.d_tcpConcurrentConnectionsLimit > 0 && ds->tcpCurrentConnections.load() >= ds->d_config.d_tcpConcurrentConnectionsLimit) { + ++ds->tcpTooManyConcurrentConnections; throw std::runtime_error("Maximum number of TCP 
connections to " + ds->getNameWithAddr() + " reached, not creating a new one"); } @@ -240,7 +241,9 @@ protected: continue; } - (*connIt)->release(); + if (entry->isIdle()) { + (*connIt)->release(); + } connIt = sidx.erase(connIt); } }