]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add metrics about outgoing TCP connections reuse
authorRemi Gacogne <remi.gacogne@powerdns.com>
Tue, 9 Mar 2021 15:42:11 +0000 (16:42 +0100)
committerRemi Gacogne <remi.gacogne@powerdns.com>
Tue, 9 Mar 2021 15:42:11 +0000 (16:42 +0100)
As well as a new command to get a dump of the current TCP states:
`requestTCPStatesDump()`.

pdns/dnsdist-carbon.cc
pdns/dnsdist-console.cc
pdns/dnsdist-lua-inspection.cc
pdns/dnsdist-tcp.cc
pdns/dnsdist-web.cc
pdns/dnsdist.hh
pdns/dnsdistdist/dnsdist-tcp-downstream.cc

index 05ed9f62a5455a10fbd90607528e6c2d4d6af013..4938acd9f74b040a93bd308d8c0c5723f28a0dae 100644 (file)
@@ -103,6 +103,8 @@ void carbonDumpThread()
             str<<base<<"tcpreadimeouts" << ' '<< state->tcpReadTimeouts.load() << " " << now << "\r\n";
             str<<base<<"tcpwritetimeouts" << ' '<< state->tcpWriteTimeouts.load() << " " << now << "\r\n";
             str<<base<<"tcpcurrentconnections" << ' '<< state->tcpCurrentConnections.load() << " " << now << "\r\n";
+            str<<base<<"tcpnewconnections" << ' '<< state->tcpNewConnections.load() << " " << now << "\r\n";
+            str<<base<<"tcpreusedconnections" << ' '<< state->tcpReusedConnections.load() << " " << now << "\r\n";
             str<<base<<"tcpavgqueriesperconnection" << ' '<< state->tcpAvgQueriesPerConnection.load() << " " << now << "\r\n";
             str<<base<<"tcpavgconnectionduration" << ' '<< state->tcpAvgConnectionDuration.load() << " " << now << "\r\n";
           }
index 55fdc7a6ca5700a9f5b4504b241b36f02d287368..7a22d19bcb15bb4c1f9b22935f1632d92e52a4b2 100644 (file)
@@ -556,6 +556,7 @@ const std::vector<ConsoleKeyword> g_consoleKeywords{
   { "reloadAllCertificates", true, "", "reload all DNSCrypt and TLS certificates, along with their associated keys" },
   { "RemoteLogAction", true, "RemoteLogger [, alterFunction [, serverID]]", "send the content of this query to a remote logger via Protocol Buffer. `alterFunction` is a callback, receiving a DNSQuestion and a DNSDistProtoBufMessage, that can be used to modify the Protocol Buffer content, for example for anonymization purposes. `serverID` is the server identifier." },
   { "RemoteLogResponseAction", true, "RemoteLogger [,alterFunction [,includeCNAME [, serverID]]]", "send the content of this response to a remote logger via Protocol Buffer. `alterFunction` is the same callback than the one in `RemoteLogAction` and `includeCNAME` indicates whether CNAME records inside the response should be parsed and exported. The default is to only exports A and AAAA records. `serverID` is the server identifier." },
+  { "requestTCPStatesDump", true, "", "Request a dump of the TCP states (incoming connections, outgoing connections) during the next scan. Useful for debugging purposes only" },
   { "rmACL", true, "netmask", "remove netmask from ACL" },
   { "rmCacheHitResponseRule", true, "id", "remove cache hit response rule in position 'id', or whose uuid matches if 'id' is an UUID string, or finally whose name matches if 'id' is a string but not a valid UUID" },
   { "rmResponseRule", true, "id", "remove response rule in position 'id', or whose uuid matches if 'id' is an UUID string, or finally whose name matches if 'id' is a string but not a valid UUID" },
index 70e92f06f3c5582b00f9cef21425f894b8a9a554..c4f758fda66034a04d8c7ac0942ce9cbbcc24f83 100644 (file)
@@ -612,13 +612,13 @@ void setupLuaInspection(LuaContext& luaCtx)
       ret << endl;
 
       ret << "Backends:" << endl;
-      fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20f %-20f");
-      ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Avg queries/conn" % "Avg duration") << endl;
+      fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20d %-20d %-20f %-20f");
+      ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Total connections" % "Reused connections" % "Avg queries/conn" % "Avg duration") << endl;
 
       auto states = g_dstates.getLocal();
       counter = 0;
       for(const auto& s : *states) {
-        ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl;
+        ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl;
         ++counter;
       }
 
@@ -656,6 +656,12 @@ void setupLuaInspection(LuaContext& luaCtx)
       g_outputBuffer=ret.str();
     });
 
+  luaCtx.writeFunction("requestTCPStatesDump", [] {
+    setLuaNoSideEffect();
+    extern std::atomic<uint64_t> g_tcpStatesDumpRequested;
+    g_tcpStatesDumpRequested += g_tcpclientthreads->getThreadsCount();
+  });
+
   luaCtx.writeFunction("dumpStats", [] {
       setLuaNoSideEffect();
       vector<string> leftcolumn, rightcolumn;
index 1588ea560344581f966c940d308b9df89139ec2f..ae31db31c0b4c15811d18c7372a08044d1c32e93 100644 (file)
@@ -67,6 +67,7 @@ uint16_t g_downstreamTCPCleanupInterval{60};
 int g_tcpRecvTimeout{2};
 int g_tcpSendTimeout{2};
 bool g_useTCPSinglePipe{false};
+std::atomic<uint64_t> g_tcpStatesDumpRequested{0};
 
 class DownstreamConnectionsManager
 {
@@ -83,6 +84,7 @@ public:
         result = std::move(list.front());
         list.pop_front();
         result->setReused();
+        ++ds->tcpReusedConnections;
         return result;
       }
     }
@@ -1100,25 +1102,39 @@ static void tcpClientThread(int pipefd)
       DownstreamConnectionsManager::cleanupClosedTCPConnections();
       lastTCPCleanup = now.tv_sec;
 
-#if 0
-      /* just to keep things clean in the output, debug only */
-      static std::mutex s_lock;
-      std::lock_guard<decltype(s_lock)> lck(s_lock);
-      data.mplexer->runForAllWatchedFDs([](bool isRead, int fd, const FDMultiplexer::funcparam_t& param, struct timeval ttd)
-      {
-        struct timeval lnow;
-        gettimeofday(&lnow, nullptr);
-        cerr<<"- "<<isRead<<" "<<fd<<": "<<" "<<(ttd.tv_sec-lnow.tv_sec)<<endl;
-        if (param.type() == typeid(std::shared_ptr<IncomingTCPConnectionState>)) {
-          auto state = boost::any_cast<std::shared_ptr<IncomingTCPConnectionState>>(param);
-          cerr<<" - "<<state->toString()<<endl;
-        }
-        else if (param.type() == typeid(std::shared_ptr<TCPConnectionToBackend>)) {
-          auto conn = boost::any_cast<std::shared_ptr<TCPConnectionToBackend>>(param);
-          cerr<<" - "<<conn->toString()<<endl;
+      if (g_tcpStatesDumpRequested > 0) {
+        /* just to keep things clean in the output, debug only */
+        static std::mutex s_lock;
+        std::lock_guard<decltype(s_lock)> lck(s_lock);
+        if (g_tcpStatesDumpRequested > 0) {
+          /* no race here, we took the lock so it can only be increased in the meantime */
+          --g_tcpStatesDumpRequested;
+          errlog("Dumping the TCP states, as requested:");
+          data.mplexer->runForAllWatchedFDs([](bool isRead, int fd, const FDMultiplexer::funcparam_t& param, struct timeval ttd)
+          {
+            struct timeval lnow;
+            gettimeofday(&lnow, nullptr);
+            if (ttd.tv_sec > 0) {
+            errlog("- Descriptor %d is in %s state, TTD in %d", fd, (isRead ? "read" : "write"), (ttd.tv_sec-lnow.tv_sec));
+            }
+            else {
+              errlog("- Descriptor %d is in %s state, no TTD set", fd, (isRead ? "read" : "write"));
+            }
+
+            if (param.type() == typeid(std::shared_ptr<IncomingTCPConnectionState>)) {
+              auto state = boost::any_cast<std::shared_ptr<IncomingTCPConnectionState>>(param);
+              errlog(" - %s", state->toString());
+            }
+            else if (param.type() == typeid(std::shared_ptr<TCPConnectionToBackend>)) {
+              auto conn = boost::any_cast<std::shared_ptr<TCPConnectionToBackend>>(param);
+              errlog(" - %s", conn->toString());
+            }
+            else if (param.type() == typeid(TCPClientThreadData*)) {
+              errlog(" - Worker thread pipe");
+            }
+          });
         }
-      });
-#endif
+      }
     }
 
     if (now.tv_sec > lastTimeoutScan) {
index eec18b202273f2381ca3f3226c70395116a3330b..311e4d98e22c7d84dbb17ef6023d5a0d89e84588 100644 (file)
@@ -476,6 +476,10 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << statesbase << "tcpwritetimeouts "       << "counter"                                                           << "\n";
   output << "# HELP " << statesbase << "tcpcurrentconnections "  << "The number of current TCP connections"                             << "\n";
   output << "# TYPE " << statesbase << "tcpcurrentconnections "  << "gauge"                                                             << "\n";
+  output << "# HELP " << statesbase << "tcpnewconnections "      << "The number of established TCP connections in total"                << "\n";
+  output << "# TYPE " << statesbase << "tcpnewconnections "      << "counter"                                                           << "\n";
+  output << "# HELP " << statesbase << "tcpreusedsconnections "  << "The number of times a TCP connection has been reused"              << "\n";
+  output << "# TYPE " << statesbase << "tcpreusedsconnections "  << "counter"                                                           << "\n";
   output << "# HELP " << statesbase << "tcpavgqueriesperconn "   << "The average number of queries per TCP connection"                  << "\n";
   output << "# TYPE " << statesbase << "tcpavgqueriesperconn "   << "gauge"                                                             << "\n";
   output << "# HELP " << statesbase << "tcpavgconnduration "     << "The average duration of a TCP connection (ms)"                     << "\n";
@@ -509,6 +513,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     output << statesbase << "tcpreadtimeouts"        << label << " " << state->tcpReadTimeouts            << "\n";
     output << statesbase << "tcpwritetimeouts"       << label << " " << state->tcpWriteTimeouts           << "\n";
     output << statesbase << "tcpcurrentconnections"  << label << " " << state->tcpCurrentConnections      << "\n";
+    output << statesbase << "tcpnewconnections"      << label << " " << state->tcpNewConnections          << "\n";
+    output << statesbase << "tcpreusedconnections"   << label << " " << state->tcpReusedConnections       << "\n";
     output << statesbase << "tcpavgqueriesperconn"   << label << " " << state->tcpAvgQueriesPerConnection << "\n";
     output << statesbase << "tcpavgconnduration"     << label << " " << state->tcpAvgConnectionDuration   << "\n";
   }
@@ -898,6 +904,8 @@ static void handleStats(const YaHTTP::Request& req, YaHTTP::Response& resp)
       {"tcpReadTimeouts", (double)a->tcpReadTimeouts},
       {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts},
       {"tcpCurrentConnections", (double)a->tcpCurrentConnections},
+      {"tcpNewConnections", (double)a->tcpNewConnections},
+      {"tcpReusedConnections", (double)a->tcpReusedConnections},
       {"tcpAvgQueriesPerConnection", (double)a->tcpAvgQueriesPerConnection},
       {"tcpAvgConnectionDuration", (double)a->tcpAvgConnectionDuration},
       {"dropRate", (double)a->dropRate}
index 0c00d2c831813a83b4ff949159dc686faba00765..675cd3c583676b7d1298862e3f5ecedc7a03d416 100644 (file)
@@ -906,6 +906,8 @@ struct DownstreamState
   stat_t tcpReadTimeouts{0};
   stat_t tcpWriteTimeouts{0};
   stat_t tcpCurrentConnections{0};
+  stat_t tcpReusedConnections{0};
+  stat_t tcpNewConnections{0};
   pdns::stat_t_trait<double> tcpAvgQueriesPerConnection{0.0};
   /* in ms */
   pdns::stat_t_trait<double> tcpAvgConnectionDuration{0.0};
index 3d2a2935e2752dff83d5c5308cd9dc04fe31ad22..3318a1e23e2dd716f4aa2aadfe01da81416f385d 100644 (file)
@@ -307,6 +307,7 @@ bool TCPConnectionToBackend::reconnect()
   do {
     vinfolog("TCP connecting to downstream %s (%d)", d_ds->getNameWithAddr(), d_downstreamFailures);
     DEBUGLOG("Opening TCP connection to backend "<<d_ds->getNameWithAddr());
+    ++d_ds->tcpNewConnections;
     try {
       auto socket = std::make_unique<Socket>(d_ds->remote.sin4.sin_family, SOCK_STREAM, 0);
       DEBUGLOG("result of socket() is "<<socket->getHandle());