From: Remi Gacogne Date: Tue, 9 Mar 2021 15:42:11 +0000 (+0100) Subject: dnsdist: Add metrics about outgoing TCP connections reuse X-Git-Tag: rec-4.5.0-beta1~14^2~4 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=da63cb08b3e92f8788eb0cd9d1f3b70243f9c131;p=thirdparty%2Fpdns.git dnsdist: Add metrics about outgoing TCP connections reuse As well as a new command to get a dump of the current TCP states: `requestTCPStatesDump()`. --- diff --git a/pdns/dnsdist-carbon.cc b/pdns/dnsdist-carbon.cc index 05ed9f62a5..4938acd9f7 100644 --- a/pdns/dnsdist-carbon.cc +++ b/pdns/dnsdist-carbon.cc @@ -103,6 +103,8 @@ void carbonDumpThread() str<tcpReadTimeouts.load() << " " << now << "\r\n"; str<tcpWriteTimeouts.load() << " " << now << "\r\n"; str<tcpCurrentConnections.load() << " " << now << "\r\n"; + str<tcpNewConnections.load() << " " << now << "\r\n"; + str<tcpReusedConnections.load() << " " << now << "\r\n"; str<tcpAvgQueriesPerConnection.load() << " " << now << "\r\n"; str<tcpAvgConnectionDuration.load() << " " << now << "\r\n"; } diff --git a/pdns/dnsdist-console.cc b/pdns/dnsdist-console.cc index 55fdc7a6ca..7a22d19bcb 100644 --- a/pdns/dnsdist-console.cc +++ b/pdns/dnsdist-console.cc @@ -556,6 +556,7 @@ const std::vector g_consoleKeywords{ { "reloadAllCertificates", true, "", "reload all DNSCrypt and TLS certificates, along with their associated keys" }, { "RemoteLogAction", true, "RemoteLogger [, alterFunction [, serverID]]", "send the content of this query to a remote logger via Protocol Buffer. `alterFunction` is a callback, receiving a DNSQuestion and a DNSDistProtoBufMessage, that can be used to modify the Protocol Buffer content, for example for anonymization purposes. `serverID` is the server identifier." }, { "RemoteLogResponseAction", true, "RemoteLogger [,alterFunction [,includeCNAME [, serverID]]]", "send the content of this response to a remote logger via Protocol Buffer. `alterFunction` is the same callback than the one in `RemoteLogAction` and `includeCNAME` indicates whether CNAME records inside the response should be parsed and exported. The default is to only exports A and AAAA records. `serverID` is the server identifier." }, + { "requestTCPStatesDump", true, "", "Request a dump of the TCP states (incoming connections, outgoing connections) during the next scan. Useful for debugging purposes only" }, { "rmACL", true, "netmask", "remove netmask from ACL" }, { "rmCacheHitResponseRule", true, "id", "remove cache hit response rule in position 'id', or whose uuid matches if 'id' is an UUID string, or finally whose name matches if 'id' is a string but not a valid UUID" }, { "rmResponseRule", true, "id", "remove response rule in position 'id', or whose uuid matches if 'id' is an UUID string, or finally whose name matches if 'id' is a string but not a valid UUID" }, diff --git a/pdns/dnsdist-lua-inspection.cc b/pdns/dnsdist-lua-inspection.cc index 70e92f06f3..c4f758fda6 100644 --- a/pdns/dnsdist-lua-inspection.cc +++ b/pdns/dnsdist-lua-inspection.cc @@ -612,13 +612,13 @@ void setupLuaInspection(LuaContext& luaCtx) ret << endl; ret << "Backends:" << endl; - fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20f %-20f"); - ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Avg queries/conn" % "Avg duration") << endl; + fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20d %-20d %-20f %-20f"); + ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Total connections" % "Reused connections" % "Avg queries/conn" % "Avg duration") << endl; auto states = g_dstates.getLocal(); counter = 0; for(const auto& s : *states) { - ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl; + ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl; ++counter; } @@ -656,6 +656,12 @@ void setupLuaInspection(LuaContext& luaCtx) g_outputBuffer=ret.str(); }); + luaCtx.writeFunction("requestTCPStatesDump", [] { + setLuaNoSideEffect(); + extern std::atomic g_tcpStatesDumpRequested; + g_tcpStatesDumpRequested += g_tcpclientthreads->getThreadsCount(); + }); + luaCtx.writeFunction("dumpStats", [] { setLuaNoSideEffect(); vector leftcolumn, rightcolumn; diff --git a/pdns/dnsdist-tcp.cc b/pdns/dnsdist-tcp.cc index 1588ea5603..ae31db31c0 100644 --- a/pdns/dnsdist-tcp.cc +++ b/pdns/dnsdist-tcp.cc @@ -67,6 +67,7 @@ uint16_t g_downstreamTCPCleanupInterval{60}; int g_tcpRecvTimeout{2}; int g_tcpSendTimeout{2}; bool g_useTCPSinglePipe{false}; +std::atomic g_tcpStatesDumpRequested{0}; class DownstreamConnectionsManager { @@ -83,6 +84,7 @@ public: result = std::move(list.front()); list.pop_front(); result->setReused(); + ++ds->tcpReusedConnections; return result; } } @@ -1100,25 +1102,39 @@ static void tcpClientThread(int pipefd) DownstreamConnectionsManager::cleanupClosedTCPConnections(); lastTCPCleanup = now.tv_sec; -#if 0 - /* just to keep things clean in the output, debug only */ - static std::mutex s_lock; - std::lock_guard lck(s_lock); - data.mplexer->runForAllWatchedFDs([](bool isRead, int fd, const FDMultiplexer::funcparam_t& param, struct timeval ttd) - { - struct timeval lnow; - gettimeofday(&lnow, nullptr); - cerr<<"- "<)) { - auto state = boost::any_cast>(param); - cerr<<" - "<toString()<)) { - auto conn = boost::any_cast>(param); - cerr<<" - "<toString()< 0) { + /* just to keep things clean in the output, debug only */ + static std::mutex s_lock; + std::lock_guard lck(s_lock); + if (g_tcpStatesDumpRequested > 0) { + /* no race here, we took the lock so it can only be increased in the meantime */ + --g_tcpStatesDumpRequested; + errlog("Dumping the TCP states, as requested:"); + data.mplexer->runForAllWatchedFDs([](bool isRead, int fd, const FDMultiplexer::funcparam_t& param, struct timeval ttd) + { + struct timeval lnow; + gettimeofday(&lnow, nullptr); + if (ttd.tv_sec > 0) { + errlog("- Descriptor %d is in %s state, TTD in %d", fd, (isRead ? "read" : "write"), (ttd.tv_sec-lnow.tv_sec)); + } + else { + errlog("- Descriptor %d is in %s state, no TTD set", fd, (isRead ? "read" : "write")); + } + + if (param.type() == typeid(std::shared_ptr)) { + auto state = boost::any_cast>(param); + errlog(" - %s", state->toString()); + } + else if (param.type() == typeid(std::shared_ptr)) { + auto conn = boost::any_cast>(param); + errlog(" - %s", conn->toString()); + } + else if (param.type() == typeid(TCPClientThreadData*)) { + errlog(" - Worker thread pipe"); + } + }); } - }); -#endif + } } if (now.tv_sec > lastTimeoutScan) { diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index eec18b2022..311e4d98e2 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -476,6 +476,10 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) output << "# TYPE " << statesbase << "tcpwritetimeouts " << "counter" << "\n"; output << "# HELP " << statesbase << "tcpcurrentconnections " << "The number of current TCP connections" << "\n"; output << "# TYPE " << statesbase << "tcpcurrentconnections " << "gauge" << "\n"; + output << "# HELP " << statesbase << "tcpnewconnections " << "The number of established TCP connections in total" << "\n"; + output << "# TYPE " << statesbase << "tcpnewconnections " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpreusedsconnections " << "The number of times a TCP connection has been reused" << "\n"; + output << "# TYPE " << statesbase << "tcpreusedsconnections " << "counter" << "\n"; output << "# HELP " << statesbase << "tcpavgqueriesperconn " << "The average number of queries per TCP connection" << "\n"; output << "# TYPE " << statesbase << "tcpavgqueriesperconn " << "gauge" << "\n"; output << "# HELP " << statesbase << "tcpavgconnduration " << "The average duration of a TCP connection (ms)" << "\n"; @@ -509,6 +513,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) output << statesbase << "tcpreadtimeouts" << label << " " << state->tcpReadTimeouts << "\n"; output << statesbase << "tcpwritetimeouts" << label << " " << state->tcpWriteTimeouts << "\n"; output << statesbase << "tcpcurrentconnections" << label << " " << state->tcpCurrentConnections << "\n"; + output << statesbase << "tcpnewconnections" << label << " " << state->tcpNewConnections << "\n"; + output << statesbase << "tcpreusedconnections" << label << " " << state->tcpReusedConnections << "\n"; output << statesbase << "tcpavgqueriesperconn" << label << " " << state->tcpAvgQueriesPerConnection << "\n"; output << statesbase << "tcpavgconnduration" << label << " " << state->tcpAvgConnectionDuration << "\n"; } @@ -898,6 +904,8 @@ static void handleStats(const YaHTTP::Request& req, YaHTTP::Response& resp) {"tcpReadTimeouts", (double)a->tcpReadTimeouts}, {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts}, {"tcpCurrentConnections", (double)a->tcpCurrentConnections}, + {"tcpNewConnections", (double)a->tcpNewConnections}, + {"tcpReusedConnections", (double)a->tcpReusedConnections}, {"tcpAvgQueriesPerConnection", (double)a->tcpAvgQueriesPerConnection}, {"tcpAvgConnectionDuration", (double)a->tcpAvgConnectionDuration}, {"dropRate", (double)a->dropRate} diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index 0c00d2c831..675cd3c583 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -906,6 +906,8 @@ struct DownstreamState stat_t tcpReadTimeouts{0}; stat_t tcpWriteTimeouts{0}; stat_t tcpCurrentConnections{0}; + stat_t tcpReusedConnections{0}; + stat_t tcpNewConnections{0}; pdns::stat_t_trait tcpAvgQueriesPerConnection{0.0}; /* in ms */ pdns::stat_t_trait tcpAvgConnectionDuration{0.0}; diff --git a/pdns/dnsdistdist/dnsdist-tcp-downstream.cc b/pdns/dnsdistdist/dnsdist-tcp-downstream.cc index 3d2a2935e2..3318a1e23e 100644 --- a/pdns/dnsdistdist/dnsdist-tcp-downstream.cc +++ b/pdns/dnsdistdist/dnsdist-tcp-downstream.cc @@ -307,6 +307,7 @@ bool TCPConnectionToBackend::reconnect() do { vinfolog("TCP connecting to downstream %s (%d)", d_ds->getNameWithAddr(), d_downstreamFailures); DEBUGLOG("Opening TCP connection to backend "<getNameWithAddr()); + ++d_ds->tcpNewConnections; try { auto socket = std::make_unique(d_ds->remote.sin4.sin_family, SOCK_STREAM, 0); DEBUGLOG("result of socket() is "<getHandle());