From: Remi Gacogne Date: Fri, 19 Mar 2021 15:14:56 +0000 (+0100) Subject: dnsdist: Fix the TCP connect timeout, add metrics X-Git-Tag: rec-4.6.0-alpha0~6^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5da53ae3ebbbe6a9c9d42e95ab4bc3b4896d8fa4;p=thirdparty%2Fpdns.git dnsdist: Fix the TCP connect timeout, add metrics --- diff --git a/pdns/dnsdist-carbon.cc b/pdns/dnsdist-carbon.cc index 4938acd9f7..db0dd411df 100644 --- a/pdns/dnsdist-carbon.cc +++ b/pdns/dnsdist-carbon.cc @@ -102,6 +102,7 @@ void carbonDumpThread() str<tcpGaveUp.load() << " " << now << "\r\n"; str<tcpReadTimeouts.load() << " " << now << "\r\n"; str<tcpWriteTimeouts.load() << " " << now << "\r\n"; + str<tcpConnectTimeouts.load() << " " << now << "\r\n"; str<tcpCurrentConnections.load() << " " << now << "\r\n"; str<tcpNewConnections.load() << " " << now << "\r\n"; str<tcpReusedConnections.load() << " " << now << "\r\n"; diff --git a/pdns/dnsdist-lua-inspection.cc b/pdns/dnsdist-lua-inspection.cc index c4f758fda6..a21d38a47e 100644 --- a/pdns/dnsdist-lua-inspection.cc +++ b/pdns/dnsdist-lua-inspection.cc @@ -612,13 +612,13 @@ void setupLuaInspection(LuaContext& luaCtx) ret << endl; ret << "Backends:" << endl; - fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20d %-20d %-20f %-20f"); - ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Total connections" % "Reused connections" % "Avg queries/conn" % "Avg duration") << endl; + fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20d %-20d %20d %-20f %-20f"); + ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Connect timeouts" % "Total connections" % "Reused connections" % "Avg queries/conn" % "Avg duration") << endl; auto states = g_dstates.getLocal(); counter = 0; for(const auto& s : *states) { - ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl; + ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpConnectTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl; ++counter; } diff --git a/pdns/dnsdist-tcp.cc b/pdns/dnsdist-tcp.cc index b4d36152e0..e9cc0857c7 100644 --- a/pdns/dnsdist-tcp.cc +++ b/pdns/dnsdist-tcp.cc @@ -469,8 +469,8 @@ IOState IncomingTCPConnectionState::sendResponse(std::shared_ptrd_lastIOBlocked = true; - return IOState::NeedWrite; DEBUGLOG("partial write"); + return iostate; } } catch (const std::exception& e) { @@ -598,7 +598,6 @@ static void handleQuery(std::shared_ptr& state, cons return; } - state->d_readingFirstQuery = false; ++state->d_queriesCount; ++state->d_ci.cs->queries; ++g_stats.queries; @@ -981,7 +980,7 @@ void IncomingTCPConnectionState::handleIO(std::shared_ptrd_ci.cs->tcpDiedSendingResponse; } - if (state->d_ioState->getState() == IOState::NeedWrite || state->d_readingFirstQuery) { + if (state->d_ioState->getState() == IOState::NeedWrite || state->d_queriesCount == 0) { DEBUGLOG("Got an exception while handling TCP query: "<d_ioState->getState() == IOState::NeedRead ? "reading" : "writing"), state->d_ci.remote.toStringWithPort(), e.what()); } @@ -1097,9 +1096,6 @@ static void handleIncomingTCPQuery(int pipefd, FDMultiplexer::funcparam_t& param delete citmp; citmp = nullptr; - /* let's update the remaining time */ - state->d_remainingTime = g_maxTCPConnectionDuration; - IncomingTCPConnectionState::handleIO(state, now); } catch(...) { diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index c3f528f46f..05f280dfec 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -471,6 +471,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter" << "\n"; output << "# HELP " << statesbase << "tcpgaveup " << "The number of TCP connections failing after too many attempts" << "\n"; output << "# TYPE " << statesbase << "tcpgaveup " << "counter" << "\n"; + output << "# HELP " << statesbase << "tcpconnecttimeouts " << "The number of TCP connect timeouts" << "\n"; + output << "# TYPE " << statesbase << "tcpconnecttimeouts " << "counter" << "\n"; output << "# HELP " << statesbase << "tcpreadtimeouts " << "The number of TCP read timeouts" << "\n"; output << "# TYPE " << statesbase << "tcpreadtimeouts " << "counter" << "\n"; output << "# HELP " << statesbase << "tcpwritetimeouts " << "The number of TCP write timeouts" << "\n"; @@ -513,6 +515,7 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) output << statesbase << "tcpgaveup" << label << " " << state->tcpGaveUp << "\n"; output << statesbase << "tcpreadtimeouts" << label << " " << state->tcpReadTimeouts << "\n"; output << statesbase << "tcpwritetimeouts" << label << " " << state->tcpWriteTimeouts << "\n"; + output << statesbase << "tcpconnecttimeouts" << label << " " << state->tcpConnectTimeouts << "\n"; output << statesbase << "tcpcurrentconnections" << label << " " << state->tcpCurrentConnections << "\n"; output << statesbase << "tcpnewconnections" << label << " " << state->tcpNewConnections << "\n"; output << statesbase << "tcpreusedconnections" << label << " " << state->tcpReusedConnections << "\n"; @@ -902,6 +905,7 @@ static void handleStats(const YaHTTP::Request& req, YaHTTP::Response& resp) {"tcpDiedSendingQuery", (double)a->tcpDiedSendingQuery}, {"tcpDiedReadingResponse", (double)a->tcpDiedReadingResponse}, {"tcpGaveUp", (double)a->tcpGaveUp}, + {"tcpConnectTimeouts", (double)a->tcpConnectTimeouts}, {"tcpReadTimeouts", (double)a->tcpReadTimeouts}, {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts}, {"tcpCurrentConnections", (double)a->tcpCurrentConnections}, diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index e139154566..9c8405cf63 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -906,6 +906,7 @@ struct DownstreamState stat_t tcpGaveUp{0}; stat_t tcpReadTimeouts{0}; stat_t tcpWriteTimeouts{0}; + stat_t tcpConnectTimeouts{0}; stat_t tcpCurrentConnections{0}; stat_t tcpReusedConnections{0}; stat_t tcpNewConnections{0}; @@ -920,8 +921,8 @@ struct DownstreamState int order{1}; int weight{1}; int tcpConnectTimeout{5}; - int tcpRecvTimeout{30}; - int tcpSendTimeout{30}; + int tcpRecvTimeout{5}; + int tcpSendTimeout{5}; unsigned int checkInterval{1}; unsigned int lastCheck{0}; const unsigned int sourceItf{0}; diff --git a/pdns/dnsdistdist/dnsdist-tcp-downstream.cc b/pdns/dnsdistdist/dnsdist-tcp-downstream.cc index 29288ed146..fd9cd78535 100644 --- a/pdns/dnsdistdist/dnsdist-tcp-downstream.cc +++ b/pdns/dnsdistdist/dnsdist-tcp-downstream.cc @@ -243,7 +243,19 @@ void TCPConnectionToBackend::handleIO(std::shared_ptr& c conn->d_ioState->update(iostate, handleIOCallback, conn); } else { - conn->d_ioState->update(iostate, handleIOCallback, conn, iostate == IOState::NeedRead ? conn->getBackendReadTTD(now) : conn->getBackendWriteTTD(now)); + boost::optional ttd{boost::none}; + if (iostate == IOState::NeedRead) { + ttd = conn->getBackendReadTTD(now); + } + else if (conn->isFresh() && conn->d_queries == 0) { + /* first write just after the non-blocking connect */ + ttd = conn->getBackendConnectTTD(now); + } + else { + ttd = conn->getBackendWriteTTD(now); + } + + conn->d_ioState->update(iostate, handleIOCallback, conn, ttd); } } } @@ -333,14 +345,16 @@ bool TCPConnectionToBackend::reconnect() } socket->setNonBlocking(); + gettimeofday(&d_connectionStartTime, nullptr); auto handler = std::make_unique("", socket->releaseHandle(), 0, d_ds->d_tlsCtx, time(nullptr)); handler->tryConnect(d_ds->tcpFastOpen && isFastOpenEnabled(), d_ds->remote); + d_queries = 0; d_handler = std::move(handler); ++d_ds->tcpCurrentConnections; return true; } - catch(const std::runtime_error& e) { + catch (const std::runtime_error& e) { vinfolog("Connection to downstream server %s failed: %s", d_ds->getName(), e.what()); d_downstreamFailures++; if (d_downstreamFailures >= d_ds->retries) { @@ -356,12 +370,19 @@ bool TCPConnectionToBackend::reconnect() void TCPConnectionToBackend::handleTimeout(const struct timeval& now, bool write) { /* in some cases we could retry, here, reconnecting and sending our pending responses again */ - vinfolog("Timeout while %s TCP backend %s", (write ? "writing to" : "reading from"), d_ds->getName()); if (write) { - ++d_ds->tcpWriteTimeouts; + if (isFresh() && d_queries == 0) { + ++d_ds->tcpConnectTimeouts; + vinfolog("Timeout while connecting to TCP backend %s", d_ds->getName()); + } + else { + ++d_ds->tcpWriteTimeouts; + vinfolog("Timeout while writing to TCP backend %s", d_ds->getName()); + } } else { ++d_ds->tcpReadTimeouts; + vinfolog("Timeout while reading from TCP backend %s", d_ds->getName()); } try { diff --git a/pdns/dnsdistdist/dnsdist-tcp-downstream.hh b/pdns/dnsdistdist/dnsdist-tcp-downstream.hh index 4c5e7da44c..ee3c11fb0c 100644 --- a/pdns/dnsdistdist/dnsdist-tcp-downstream.hh +++ b/pdns/dnsdistdist/dnsdist-tcp-downstream.hh @@ -219,7 +219,7 @@ private: boost::optional getBackendWriteTTD(const struct timeval& now) const { if (d_ds == nullptr) { - throw std::runtime_error("getBackendReadTTD() called without any backend selected"); + throw std::runtime_error("getBackendWriteTTD() called without any backend selected"); } if (d_ds->tcpSendTimeout == 0) { return boost::none; @@ -231,6 +231,21 @@ private: return res; } + boost::optional getBackendConnectTTD(const struct timeval& now) const + { + if (d_ds == nullptr) { + throw std::runtime_error("getBackendConnectTTD() called without any backend selected"); + } + if (d_ds->tcpConnectTimeout == 0) { + return boost::none; + } + + struct timeval res = now; + res.tv_sec += d_ds->tcpConnectTimeout; + + return res; + } + static const uint16_t s_xfrID; PacketBuffer d_responseBuffer; diff --git a/pdns/dnsdistdist/dnsdist-tcp-upstream.hh b/pdns/dnsdistdist/dnsdist-tcp-upstream.hh index 3cbf575576..ae971b070e 100644 --- a/pdns/dnsdistdist/dnsdist-tcp-upstream.hh +++ b/pdns/dnsdistdist/dnsdist-tcp-upstream.hh @@ -136,7 +136,6 @@ public: if (elapsed >= maxConnectionDuration) { return true; } - d_remainingTime = maxConnectionDuration - elapsed; } return false; @@ -204,10 +203,8 @@ public: size_t d_proxyProtocolNeed{0}; size_t d_queriesCount{0}; size_t d_currentQueriesCount{0}; - unsigned int d_remainingTime{0}; uint16_t d_querySize{0}; State d_state{State::doingHandshake}; - bool d_readingFirstQuery{true}; bool d_isXFR{false}; bool d_proxyProtocolPayloadHasTLV{false}; bool d_lastIOBlocked{false}; diff --git a/pdns/dnsdistdist/test-dnsdisttcp_cc.cc b/pdns/dnsdistdist/test-dnsdisttcp_cc.cc index c5259f0869..78737a7ca2 100644 --- a/pdns/dnsdistdist/test-dnsdisttcp_cc.cc +++ b/pdns/dnsdistdist/test-dnsdisttcp_cc.cc @@ -2766,7 +2766,7 @@ BOOST_AUTO_TEST_CASE(test_IncomingConnectionOOOR_BackendOOOR) s_readBuffer.insert(s_readBuffer.end(), queries.at(2).begin(), queries.at(2).end()); /* make sure that the backend's timeout is shorter than the client's */ - backend->tcpSendTimeout = 1; + backend->tcpConnectTimeout = 1; g_tcpRecvTimeout = 5; bool timeout = false; @@ -2810,7 +2810,7 @@ BOOST_AUTO_TEST_CASE(test_IncomingConnectionOOOR_BackendOOOR) } struct timeval later = now; - later.tv_sec += backend->tcpSendTimeout + 1; + later.tv_sec += backend->tcpConnectTimeout + 1; auto expiredConns = threadData.mplexer->getTimeouts(later, true); BOOST_CHECK_EQUAL(expiredConns.size(), 1U); for (const auto& cbData : expiredConns) {