From: Remi Gacogne <remi.gacogne@powerdns.com>
Date: Tue, 21 Jun 2022 13:45:38 +0000 (+0200)
Subject: dnsdist: Add a metric to track when the TCP conns limit is reached
X-Git-Tag: auth-4.8.0-alpha0~39^2~2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=02328e4ac65926d0273b92c14ea8189f3d9dddab;p=thirdparty%2Fpdns.git

dnsdist: Add a metric to track when the TCP conns limit is reached
---

diff --git a/pdns/dnsdist-carbon.cc b/pdns/dnsdist-carbon.cc
index cae92a059e..1776f60241 100644
--- a/pdns/dnsdist-carbon.cc
+++ b/pdns/dnsdist-carbon.cc
@@ -107,6 +107,7 @@ void carbonDumpThread()
             str<<base<<"tlsresumptions" << ' '<< state->tlsResumptions.load() << " " << now << "\r\n";
             str<<base<<"tcpavgqueriesperconnection" << ' '<< state->tcpAvgQueriesPerConnection.load() << " " << now << "\r\n";
             str<<base<<"tcpavgconnectionduration" << ' '<< state->tcpAvgConnectionDuration.load() << " " << now << "\r\n";
+            str<<base<<"tcptoomanyconcurrentconnections" << ' '<< state->tcpTooManyConcurrentConnections.load() << " " << now << "\r\n";
           }
 
           std::map<std::string,uint64_t> frontendDuplicates;
diff --git a/pdns/dnsdist-lua-inspection.cc b/pdns/dnsdist-lua-inspection.cc
index c95f23ca10..c631480982 100644
--- a/pdns/dnsdist-lua-inspection.cc
+++ b/pdns/dnsdist-lua-inspection.cc
@@ -629,13 +629,13 @@ void setupLuaInspection(LuaContext& luaCtx)
       ret << endl;
 
       ret << "Backends:" << endl;
-      fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-25d %-20d %-20d %-20d %-20d %-20d %-20d %-20d %-20f %-20f");
-      ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Max concurrent conn" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Connect timeouts" % "Total connections" % "Reused connections" % "TLS resumptions" % "Avg queries/conn" % "Avg duration") << endl;
+      fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-25d %-20d %-20d %-20d %-20d %-20d %-20d %-20d %-20d %-20f %-20f");
+      ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Max concurrent conn" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Connect timeouts" % "Too many conn" % "Total connections" % "Reused connections" % "TLS resumptions" % "Avg queries/conn" % "Avg duration") << endl;
 
       auto states = g_dstates.getLocal();
       counter = 0;
       for(const auto& s : *states) {
-        ret << (fmt % counter % s->getName() % s->d_config.remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpMaxConcurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpConnectTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tlsResumptions % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl;
+        ret << (fmt % counter % s->getName() % s->d_config.remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpMaxConcurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpConnectTimeouts % s->tcpTooManyConcurrentConnections % s->tcpNewConnections % s->tcpReusedConnections % s->tlsResumptions % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl;
         ++counter;
       }
 
diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc
index f0b7631f97..887d81bb19 100644
--- a/pdns/dnsdist-web.cc
+++ b/pdns/dnsdist-web.cc
@@ -516,54 +516,56 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   auto states = g_dstates.getLocal();
   const string statesbase = "dnsdist_server_";
 
-  output << "# HELP " << statesbase << "status "                      << "Whether this backend is up (1) or down (0)"                        << "\n";
-  output << "# TYPE " << statesbase << "status "                      << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "queries "                     << "Amount of queries relayed to server"                               << "\n";
-  output << "# TYPE " << statesbase << "queries "                     << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "responses "                   << "Amount of responses received from this server"                     << "\n";
-  output << "# TYPE " << statesbase << "responses "                   << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "noncompliantresponses "       << "Amount of non-compliant responses received from this server"       << "\n";
-  output << "# TYPE " << statesbase << "noncompliantresponses "       << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "drops "                       << "Amount of queries not answered by server"                          << "\n";
-  output << "# TYPE " << statesbase << "drops "                       << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "latency "                     << "Server's latency when answering questions in milliseconds"         << "\n";
-  output << "# TYPE " << statesbase << "latency "                     << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "senderrors "                  << "Total number of OS send errors while relaying queries"             << "\n";
-  output << "# TYPE " << statesbase << "senderrors "                  << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "outstanding "                 << "Current number of queries that are waiting for a backend response" << "\n";
-  output << "# TYPE " << statesbase << "outstanding "                 << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "order "                       << "The order in which this server is picked"                          << "\n";
-  output << "# TYPE " << statesbase << "order "                       << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "weight "                      << "The weight within the order in which this server is picked"        << "\n";
-  output << "# TYPE " << statesbase << "weight "                      << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "tcpdiedsendingquery "         << "The number of TCP I/O errors while sending the query"              << "\n";
-  output << "# TYPE " << statesbase << "tcpdiedsendingquery "         << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpdiedreadingresponse "      << "The number of TCP I/O errors while reading the response"           << "\n";
-  output << "# TYPE " << statesbase << "tcpdiedreadingresponse "      << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpgaveup "                   << "The number of TCP connections failing after too many attempts"     << "\n";
-  output << "# TYPE " << statesbase << "tcpgaveup "                   << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpconnecttimeouts "          << "The number of TCP connect timeouts"                                << "\n";
-  output << "# TYPE " << statesbase << "tcpconnecttimeouts "          << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpreadtimeouts "             << "The number of TCP read timeouts"                                   << "\n";
-  output << "# TYPE " << statesbase << "tcpreadtimeouts "             << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpwritetimeouts "            << "The number of TCP write timeouts"                                  << "\n";
-  output << "# TYPE " << statesbase << "tcpwritetimeouts "            << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpcurrentconnections "       << "The number of current TCP connections"                             << "\n";
-  output << "# TYPE " << statesbase << "tcpcurrentconnections "       << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "tcpmaxconcurrentconnections " << "The maximum number of concurrent TCP connections"                  << "\n";
-  output << "# TYPE " << statesbase << "tcpmaxconcurrentconnections " << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpnewconnections "           << "The number of established TCP connections in total"                << "\n";
-  output << "# TYPE " << statesbase << "tcpnewconnections "           << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpreusedconnections "        << "The number of times a TCP connection has been reused"              << "\n";
-  output << "# TYPE " << statesbase << "tcpreusedconnections "        << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcpavgqueriesperconn "        << "The average number of queries per TCP connection"                  << "\n";
-  output << "# TYPE " << statesbase << "tcpavgqueriesperconn "        << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "tcpavgconnduration "          << "The average duration of a TCP connection (ms)"                     << "\n";
-  output << "# TYPE " << statesbase << "tcpavgconnduration "          << "gauge"                                                             << "\n";
-  output << "# HELP " << statesbase << "tlsresumptions "              << "The number of times a TLS session has been resumed"                << "\n";
-  output << "# TYPE " << statesbase << "tlsersumptions "              << "counter"                                                           << "\n";
-  output << "# HELP " << statesbase << "tcplatency "                  << "Server's latency when answering TCP questions in milliseconds"     << "\n";
-  output << "# TYPE " << statesbase << "tcplatency "                  << "gauge"                                                             << "\n";
+  output << "# HELP " << statesbase << "status "                          << "Whether this backend is up (1) or down (0)"                                           << "\n";
+  output << "# TYPE " << statesbase << "status "                          << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "queries "                         << "Amount of queries relayed to server"                                                  << "\n";
+  output << "# TYPE " << statesbase << "queries "                         << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "responses "                       << "Amount of responses received from this server"                                        << "\n";
+  output << "# TYPE " << statesbase << "responses "                       << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "noncompliantresponses "           << "Amount of non-compliant responses received from this server"                          << "\n";
+  output << "# TYPE " << statesbase << "noncompliantresponses "           << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "drops "                           << "Amount of queries not answered by server"                                             << "\n";
+  output << "# TYPE " << statesbase << "drops "                           << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "latency "                         << "Server's latency when answering questions in milliseconds"                            << "\n";
+  output << "# TYPE " << statesbase << "latency "                         << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "senderrors "                      << "Total number of OS send errors while relaying queries"                                << "\n";
+  output << "# TYPE " << statesbase << "senderrors "                      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "outstanding "                     << "Current number of queries that are waiting for a backend response"                    << "\n";
+  output << "# TYPE " << statesbase << "outstanding "                     << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "order "                           << "The order in which this server is picked"                                             << "\n";
+  output << "# TYPE " << statesbase << "order "                           << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "weight "                          << "The weight within the order in which this server is picked"                           << "\n";
+  output << "# TYPE " << statesbase << "weight "                          << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "tcpdiedsendingquery "             << "The number of TCP I/O errors while sending the query"                                 << "\n";
+  output << "# TYPE " << statesbase << "tcpdiedsendingquery "             << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpdiedreadingresponse "          << "The number of TCP I/O errors while reading the response"                              << "\n";
+  output << "# TYPE " << statesbase << "tcpdiedreadingresponse "          << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpgaveup "                       << "The number of TCP connections failing after too many attempts"                        << "\n";
+  output << "# TYPE " << statesbase << "tcpgaveup "                       << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpconnecttimeouts "              << "The number of TCP connect timeouts"                                                   << "\n";
+  output << "# TYPE " << statesbase << "tcpconnecttimeouts "              << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpreadtimeouts "                 << "The number of TCP read timeouts"                                                      << "\n";
+  output << "# TYPE " << statesbase << "tcpreadtimeouts "                 << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpwritetimeouts "                << "The number of TCP write timeouts"                                                     << "\n";
+  output << "# TYPE " << statesbase << "tcpwritetimeouts "                << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpcurrentconnections "           << "The number of current TCP connections"                                                << "\n";
+  output << "# TYPE " << statesbase << "tcpcurrentconnections "           << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "tcpmaxconcurrentconnections "     << "The maximum number of concurrent TCP connections"                                     << "\n";
+  output << "# TYPE " << statesbase << "tcpmaxconcurrentconnections "     << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcptoomanyconcurrentconnections " << "Number of times we had to enforce the maximum number of concurrent TCP connections"   << "\n";
+  output << "# TYPE " << statesbase << "tcptoomanyconcurrentconnections " << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpnewconnections "               << "The number of established TCP connections in total"                                   << "\n";
+  output << "# TYPE " << statesbase << "tcpnewconnections "               << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpreusedconnections "            << "The number of times a TCP connection has been reused"                                 << "\n";
+  output << "# TYPE " << statesbase << "tcpreusedconnections "            << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcpavgqueriesperconn "            << "The average number of queries per TCP connection"                                     << "\n";
+  output << "# TYPE " << statesbase << "tcpavgqueriesperconn "            << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "tcpavgconnduration "              << "The average duration of a TCP connection (ms)"                                        << "\n";
+  output << "# TYPE " << statesbase << "tcpavgconnduration "              << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "tlsresumptions "                  << "The number of times a TLS session has been resumed"                                   << "\n";
+  output << "# TYPE " << statesbase << "tlsersumptions "                  << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "tcplatency "                      << "Server's latency when answering TCP questions in milliseconds"                        << "\n";
+  output << "# TYPE " << statesbase << "tcplatency "                      << "gauge"                                                                                << "\n";
 
   for (const auto& state : *states) {
     string serverName;
@@ -580,32 +582,33 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     const std::string label = boost::str(boost::format("{server=\"%1%\",address=\"%2%\"}")
                                          % serverName % state->d_config.remote.toStringWithPort());
 
-    output << statesbase << "status"                       << label << " " << (state->isUp() ? "1" : "0")        << "\n";
-    output << statesbase << "queries"                      << label << " " << state->queries.load()              << "\n";
-    output << statesbase << "responses"                    << label << " " << state->responses.load()            << "\n";
-    output << statesbase << "noncompliantresponses"        << label << " " << state->nonCompliantResponses.load()<< "\n";
-    output << statesbase << "drops"                        << label << " " << state->reuseds.load()              << "\n";
+    output << statesbase << "status"                           << label << " " << (state->isUp() ? "1" : "0")            << "\n";
+    output << statesbase << "queries"                          << label << " " << state->queries.load()                  << "\n";
+    output << statesbase << "responses"                        << label << " " << state->responses.load()                << "\n";
+    output << statesbase << "noncompliantresponses"            << label << " " << state->nonCompliantResponses.load()    << "\n";
+    output << statesbase << "drops"                            << label << " " << state->reuseds.load()                  << "\n";
     if (state->isUp()) {
-      output << statesbase << "latency"                    << label << " " << state->latencyUsec/1000.0          << "\n";
-      output << statesbase << "tcplatency"                 << label << " " << state->latencyUsecTCP/1000.0       << "\n";
+      output << statesbase << "latency"                        << label << " " << state->latencyUsec/1000.0              << "\n";
+      output << statesbase << "tcplatency"                     << label << " " << state->latencyUsecTCP/1000.0           << "\n";
     }
-    output << statesbase << "senderrors"                   << label << " " << state->sendErrors.load()           << "\n";
-    output << statesbase << "outstanding"                  << label << " " << state->outstanding.load()          << "\n";
-    output << statesbase << "order"                        << label << " " << state->d_config.order              << "\n";
-    output << statesbase << "weight"                       << label << " " << state->d_config.d_weight           << "\n";
-    output << statesbase << "tcpdiedsendingquery"          << label << " " << state->tcpDiedSendingQuery         << "\n";
-    output << statesbase << "tcpdiedreadingresponse"       << label << " " << state->tcpDiedReadingResponse      << "\n";
-    output << statesbase << "tcpgaveup"                    << label << " " << state->tcpGaveUp                   << "\n";
-    output << statesbase << "tcpreadtimeouts"              << label << " " << state->tcpReadTimeouts             << "\n";
-    output << statesbase << "tcpwritetimeouts"             << label << " " << state->tcpWriteTimeouts            << "\n";
-    output << statesbase << "tcpconnecttimeouts"           << label << " " << state->tcpConnectTimeouts          << "\n";
-    output << statesbase << "tcpcurrentconnections"        << label << " " << state->tcpCurrentConnections       << "\n";
-    output << statesbase << "tcpmaxconcurrentconnections"  << label << " " << state->tcpMaxConcurrentConnections << "\n";
-    output << statesbase << "tcpnewconnections"            << label << " " << state->tcpNewConnections           << "\n";
-    output << statesbase << "tcpreusedconnections"         << label << " " << state->tcpReusedConnections        << "\n";
-    output << statesbase << "tcpavgqueriesperconn"         << label << " " << state->tcpAvgQueriesPerConnection  << "\n";
-    output << statesbase << "tcpavgconnduration"           << label << " " << state->tcpAvgConnectionDuration    << "\n";
-    output << statesbase << "tlsresumptions"               << label << " " << state->tlsResumptions              << "\n";
+    output << statesbase << "senderrors"                       << label << " " << state->sendErrors.load()               << "\n";
+    output << statesbase << "outstanding"                      << label << " " << state->outstanding.load()              << "\n";
+    output << statesbase << "order"                            << label << " " << state->d_config.order                  << "\n";
+    output << statesbase << "weight"                           << label << " " << state->d_config.d_weight               << "\n";
+    output << statesbase << "tcpdiedsendingquery"              << label << " " << state->tcpDiedSendingQuery             << "\n";
+    output << statesbase << "tcpdiedreadingresponse"           << label << " " << state->tcpDiedReadingResponse          << "\n";
+    output << statesbase << "tcpgaveup"                        << label << " " << state->tcpGaveUp                       << "\n";
+    output << statesbase << "tcpreadtimeouts"                  << label << " " << state->tcpReadTimeouts                 << "\n";
+    output << statesbase << "tcpwritetimeouts"                 << label << " " << state->tcpWriteTimeouts                << "\n";
+    output << statesbase << "tcpconnecttimeouts"               << label << " " << state->tcpConnectTimeouts              << "\n";
+    output << statesbase << "tcpcurrentconnections"            << label << " " << state->tcpCurrentConnections           << "\n";
+    output << statesbase << "tcpmaxconcurrentconnections"      << label << " " << state->tcpMaxConcurrentConnections     << "\n";
+    output << statesbase << "tcptoomanyconcurrentconnections"  << label << " " << state->tcpTooManyConcurrentConnections << "\n";
+    output << statesbase << "tcpnewconnections"                << label << " " << state->tcpNewConnections               << "\n";
+    output << statesbase << "tcpreusedconnections"             << label << " " << state->tcpReusedConnections            << "\n";
+    output << statesbase << "tcpavgqueriesperconn"             << label << " " << state->tcpAvgQueriesPerConnection      << "\n";
+    output << statesbase << "tcpavgconnduration"               << label << " " << state->tcpAvgConnectionDuration        << "\n";
+    output << statesbase << "tlsresumptions"                   << label << " " << state->tlsResumptions                  << "\n";
   }
 
   const string frontsbase = "dnsdist_frontend_";
@@ -1014,6 +1017,7 @@ static void addServerToJSON(Json::array& servers, int id, const std::shared_ptr<
     {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts},
     {"tcpCurrentConnections", (double)a->tcpCurrentConnections},
     {"tcpMaxConcurrentConnections", (double)a->tcpMaxConcurrentConnections},
+    {"tcpTooManyConcurrentConnections", (double)a->tcpTooManyConcurrentConnections},
     {"tcpNewConnections", (double)a->tcpNewConnections},
     {"tcpReusedConnections", (double)a->tcpReusedConnections},
     {"tcpAvgQueriesPerConnection", (double)a->tcpAvgQueriesPerConnection},
diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh
index 9b3397f9d6..0cf4f9b663 100644
--- a/pdns/dnsdist.hh
+++ b/pdns/dnsdist.hh
@@ -821,6 +821,8 @@ struct DownstreamState: public std::enable_shared_from_this<DownstreamState>
   stat_t tcpCurrentConnections{0};
   /* maximum number of concurrent connections to this backend reached */
   stat_t tcpMaxConcurrentConnections{0};
+  /* number of times we had to enforce the maximum concurrent connections limit */
+  stat_t tcpTooManyConcurrentConnections{0};
   stat_t tcpReusedConnections{0};
   stat_t tcpNewConnections{0};
   stat_t tlsResumptions{0};
diff --git a/pdns/dnsdistdist/dnsdist-downstream-connection.hh b/pdns/dnsdistdist/dnsdist-downstream-connection.hh
index b5c6a361a4..f13cbcff22 100644
--- a/pdns/dnsdistdist/dnsdist-downstream-connection.hh
+++ b/pdns/dnsdistdist/dnsdist-downstream-connection.hh
@@ -79,6 +79,7 @@ public:
     }
 
     if (ds->d_config.d_tcpConcurrentConnectionsLimit > 0 && ds->tcpCurrentConnections.load() >= ds->d_config.d_tcpConcurrentConnectionsLimit) {
+      ++ds->tcpTooManyConcurrentConnections;
       throw std::runtime_error("Maximum number of TCP connections to " + ds->getNameWithAddr() + " reached, not creating a new one");
     }
 
@@ -240,7 +241,9 @@ protected:
         continue;
       }
 
-      (*connIt)->release();
+      if (entry->isIdle()) {
+        (*connIt)->release();
+      }
       connIt = sidx.erase(connIt);
     }
   }