]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Fix the TCP connect timeout, add metrics
authorRemi Gacogne <remi.gacogne@powerdns.com>
Fri, 19 Mar 2021 15:14:56 +0000 (16:14 +0100)
committerRemi Gacogne <remi.gacogne@powerdns.com>
Fri, 19 Mar 2021 17:29:26 +0000 (18:29 +0100)
pdns/dnsdist-carbon.cc
pdns/dnsdist-lua-inspection.cc
pdns/dnsdist-tcp.cc
pdns/dnsdist-web.cc
pdns/dnsdist.hh
pdns/dnsdistdist/dnsdist-tcp-downstream.cc
pdns/dnsdistdist/dnsdist-tcp-downstream.hh
pdns/dnsdistdist/dnsdist-tcp-upstream.hh
pdns/dnsdistdist/test-dnsdisttcp_cc.cc

index 4938acd9f74b040a93bd308d8c0c5723f28a0dae..db0dd411df69583718390531c5e11264eccf9822 100644 (file)
@@ -102,6 +102,7 @@ void carbonDumpThread()
             str<<base<<"tcpgaveup" << ' '<< state->tcpGaveUp.load() << " " << now << "\r\n";
             str<<base<<"tcpreadimeouts" << ' '<< state->tcpReadTimeouts.load() << " " << now << "\r\n";
             str<<base<<"tcpwritetimeouts" << ' '<< state->tcpWriteTimeouts.load() << " " << now << "\r\n";
+            str<<base<<"tcpconnecttimeouts" << ' '<< state->tcpConnectTimeouts.load() << " " << now << "\r\n";
             str<<base<<"tcpcurrentconnections" << ' '<< state->tcpCurrentConnections.load() << " " << now << "\r\n";
             str<<base<<"tcpnewconnections" << ' '<< state->tcpNewConnections.load() << " " << now << "\r\n";
             str<<base<<"tcpreusedconnections" << ' '<< state->tcpReusedConnections.load() << " " << now << "\r\n";
index c4f758fda66034a04d8c7ac0942ce9cbbcc24f83..a21d38a47e095f29b2a06f3febf98b9244e47f30 100644 (file)
@@ -612,13 +612,13 @@ void setupLuaInspection(LuaContext& luaCtx)
       ret << endl;
 
       ret << "Backends:" << endl;
-      fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20d %-20d %-20f %-20f");
-      ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Total connections" % "Reused connections" % "Avg queries/conn" % "Avg duration") << endl;
+      fmt = boost::format("%-3d %-20.20s %-20.20s %-20d %-20d %-25d %-20d %-20d %-20d %-20d %-20d %20d %-20f %-20f");
+      ret << (fmt % "#" % "Name" % "Address" % "Connections" % "Died sending query" % "Died reading response" % "Gave up" % "Read timeouts" % "Write timeouts" % "Connect timeouts" % "Total connections" % "Reused connections" % "Avg queries/conn" % "Avg duration") << endl;
 
       auto states = g_dstates.getLocal();
       counter = 0;
       for(const auto& s : *states) {
-        ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl;
+        ret << (fmt % counter % s->getName() % s->remote.toStringWithPort() % s->tcpCurrentConnections % s->tcpDiedSendingQuery % s->tcpDiedReadingResponse % s->tcpGaveUp % s->tcpReadTimeouts % s->tcpWriteTimeouts % s->tcpConnectTimeouts % s->tcpNewConnections % s->tcpReusedConnections % s->tcpAvgQueriesPerConnection % s->tcpAvgConnectionDuration) << endl;
         ++counter;
       }
 
index b4d36152e0cd7b3e28837362689d3d0f022bd464..e9cc0857c7a1685900ca6f86a2773ad31fd3a203 100644 (file)
@@ -469,8 +469,8 @@ IOState IncomingTCPConnectionState::sendResponse(std::shared_ptr<IncomingTCPConn
       return iostate;
     } else {
       state->d_lastIOBlocked = true;
-     return IOState::NeedWrite;
       DEBUGLOG("partial write");
+      return iostate;
     }
   }
   catch (const std::exception& e) {
@@ -598,7 +598,6 @@ static void handleQuery(std::shared_ptr<IncomingTCPConnectionState>& state, cons
     return;
   }
 
-  state->d_readingFirstQuery = false;
   ++state->d_queriesCount;
   ++state->d_ci.cs->queries;
   ++g_stats.queries;
@@ -981,7 +980,7 @@ void IncomingTCPConnectionState::handleIO(std::shared_ptr<IncomingTCPConnectionS
         ++state->d_ci.cs->tcpDiedSendingResponse;
       }
 
-      if (state->d_ioState->getState() == IOState::NeedWrite || state->d_readingFirstQuery) {
+      if (state->d_ioState->getState() == IOState::NeedWrite || state->d_queriesCount == 0) {
         DEBUGLOG("Got an exception while handling TCP query: "<<e.what());
         vinfolog("Got an exception while handling (%s) TCP query from %s: %s", (state->d_ioState->getState() == IOState::NeedRead ? "reading" : "writing"), state->d_ci.remote.toStringWithPort(), e.what());
       }
@@ -1097,9 +1096,6 @@ static void handleIncomingTCPQuery(int pipefd, FDMultiplexer::funcparam_t& param
     delete citmp;
     citmp = nullptr;
 
-    /* let's update the remaining time */
-    state->d_remainingTime = g_maxTCPConnectionDuration;
-
     IncomingTCPConnectionState::handleIO(state, now);
   }
   catch(...) {
index c3f528f46f12c838219ae27a542d91f735d2aa83..05f280dfecc540758814dadb56e3a942e8cc38b6 100644 (file)
@@ -471,6 +471,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter"                                                           << "\n";
   output << "# HELP " << statesbase << "tcpgaveup "              << "The number of TCP connections failing after too many attempts"     << "\n";
   output << "# TYPE " << statesbase << "tcpgaveup "              << "counter"                                                           << "\n";
+  output << "# HELP " << statesbase << "tcpconnecttimeouts "     << "The number of TCP connect timeouts"                                << "\n";
+  output << "# TYPE " << statesbase << "tcpconnecttimeouts "     << "counter"                                                           << "\n";
   output << "# HELP " << statesbase << "tcpreadtimeouts "        << "The number of TCP read timeouts"                                   << "\n";
   output << "# TYPE " << statesbase << "tcpreadtimeouts "        << "counter"                                                           << "\n";
   output << "# HELP " << statesbase << "tcpwritetimeouts "       << "The number of TCP write timeouts"                                  << "\n";
@@ -513,6 +515,7 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     output << statesbase << "tcpgaveup"              << label << " " << state->tcpGaveUp                  << "\n";
     output << statesbase << "tcpreadtimeouts"        << label << " " << state->tcpReadTimeouts            << "\n";
     output << statesbase << "tcpwritetimeouts"       << label << " " << state->tcpWriteTimeouts           << "\n";
+    output << statesbase << "tcpconnecttimeouts"     << label << " " << state->tcpConnectTimeouts         << "\n";
     output << statesbase << "tcpcurrentconnections"  << label << " " << state->tcpCurrentConnections      << "\n";
     output << statesbase << "tcpnewconnections"      << label << " " << state->tcpNewConnections          << "\n";
     output << statesbase << "tcpreusedconnections"   << label << " " << state->tcpReusedConnections       << "\n";
@@ -902,6 +905,7 @@ static void handleStats(const YaHTTP::Request& req, YaHTTP::Response& resp)
       {"tcpDiedSendingQuery", (double)a->tcpDiedSendingQuery},
       {"tcpDiedReadingResponse", (double)a->tcpDiedReadingResponse},
       {"tcpGaveUp", (double)a->tcpGaveUp},
+      {"tcpConnectTimeouts", (double)a->tcpConnectTimeouts},
       {"tcpReadTimeouts", (double)a->tcpReadTimeouts},
       {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts},
       {"tcpCurrentConnections", (double)a->tcpCurrentConnections},
index e139154566c9a606bce7a25b84f21c7f84482350..9c8405cf631e64b35bfcd78048b98ac715f6b7a7 100644 (file)
@@ -906,6 +906,7 @@ struct DownstreamState
   stat_t tcpGaveUp{0};
   stat_t tcpReadTimeouts{0};
   stat_t tcpWriteTimeouts{0};
+  stat_t tcpConnectTimeouts{0};
   stat_t tcpCurrentConnections{0};
   stat_t tcpReusedConnections{0};
   stat_t tcpNewConnections{0};
@@ -920,8 +921,8 @@ struct DownstreamState
   int order{1};
   int weight{1};
   int tcpConnectTimeout{5};
-  int tcpRecvTimeout{30};
-  int tcpSendTimeout{30};
+  int tcpRecvTimeout{5};
+  int tcpSendTimeout{5};
   unsigned int checkInterval{1};
   unsigned int lastCheck{0};
   const unsigned int sourceItf{0};
index 29288ed146ab3ff35139e42e2f13600b101f5a5b..fd9cd78535b94da4864f893650131748b2cb2bff 100644 (file)
@@ -243,7 +243,19 @@ void TCPConnectionToBackend::handleIO(std::shared_ptr<TCPConnectionToBackend>& c
         conn->d_ioState->update(iostate, handleIOCallback, conn);
       }
       else {
-        conn->d_ioState->update(iostate, handleIOCallback, conn, iostate == IOState::NeedRead ? conn->getBackendReadTTD(now) : conn->getBackendWriteTTD(now));
+        boost::optional<struct timeval> ttd{boost::none};
+        if (iostate == IOState::NeedRead) {
+          ttd = conn->getBackendReadTTD(now);
+        }
+        else if (conn->isFresh() && conn->d_queries == 0) {
+          /* first write just after the non-blocking connect */
+          ttd = conn->getBackendConnectTTD(now);
+        }
+        else {
+          ttd = conn->getBackendWriteTTD(now);
+        }
+
+        conn->d_ioState->update(iostate, handleIOCallback, conn, ttd);
       }
     }
   }
@@ -333,14 +345,16 @@ bool TCPConnectionToBackend::reconnect()
       }
       socket->setNonBlocking();
 
+      gettimeofday(&d_connectionStartTime, nullptr);
       auto handler = std::make_unique<TCPIOHandler>("", socket->releaseHandle(), 0, d_ds->d_tlsCtx, time(nullptr));
       handler->tryConnect(d_ds->tcpFastOpen && isFastOpenEnabled(), d_ds->remote);
+      d_queries = 0;
 
       d_handler = std::move(handler);
       ++d_ds->tcpCurrentConnections;
       return true;
     }
-    catch(const std::runtime_error& e) {
+    catch (const std::runtime_error& e) {
       vinfolog("Connection to downstream server %s failed: %s", d_ds->getName(), e.what());
       d_downstreamFailures++;
       if (d_downstreamFailures >= d_ds->retries) {
@@ -356,12 +370,19 @@ bool TCPConnectionToBackend::reconnect()
 void TCPConnectionToBackend::handleTimeout(const struct timeval& now, bool write)
 {
   /* in some cases we could retry, here, reconnecting and sending our pending responses again */
-  vinfolog("Timeout while %s TCP backend %s", (write ? "writing to" : "reading from"), d_ds->getName());
   if (write) {
-    ++d_ds->tcpWriteTimeouts;
+    if (isFresh() && d_queries == 0) {
+      ++d_ds->tcpConnectTimeouts;
+      vinfolog("Timeout while connecting to TCP backend %s", d_ds->getName());
+    }
+    else {
+      ++d_ds->tcpWriteTimeouts;
+      vinfolog("Timeout while writing to TCP backend %s", d_ds->getName());
+    }
   }
   else {
     ++d_ds->tcpReadTimeouts;
+    vinfolog("Timeout while reading from TCP backend %s", d_ds->getName());
   }
 
   try {
index 4c5e7da44ca5fe691ca335ba1464be407e65b146..ee3c11fb0c848a392bcd6cbc2758cc82733b8d2e 100644 (file)
@@ -219,7 +219,7 @@ private:
   boost::optional<struct timeval> getBackendWriteTTD(const struct timeval& now) const
   {
     if (d_ds == nullptr) {
-      throw std::runtime_error("getBackendReadTTD() called without any backend selected");
+      throw std::runtime_error("getBackendWriteTTD() called without any backend selected");
     }
     if (d_ds->tcpSendTimeout == 0) {
       return boost::none;
@@ -231,6 +231,21 @@ private:
     return res;
   }
 
+  boost::optional<struct timeval> getBackendConnectTTD(const struct timeval& now) const
+  {
+    if (d_ds == nullptr) {
+      throw std::runtime_error("getBackendConnectTTD() called without any backend selected");
+    }
+    if (d_ds->tcpConnectTimeout == 0) {
+      return boost::none;
+    }
+
+    struct timeval res = now;
+    res.tv_sec += d_ds->tcpConnectTimeout;
+
+    return res;
+  }
+
   static const uint16_t s_xfrID;
 
   PacketBuffer d_responseBuffer;
index 3cbf5755767b65063ef571f0aa33cbd7526ae212..ae971b070eab796b13c5cfd0d558bef65bcaaafd 100644 (file)
@@ -136,7 +136,6 @@ public:
       if (elapsed >= maxConnectionDuration) {
         return true;
       }
-      d_remainingTime = maxConnectionDuration - elapsed;
     }
 
     return false;
@@ -204,10 +203,8 @@ public:
   size_t d_proxyProtocolNeed{0};
   size_t d_queriesCount{0};
   size_t d_currentQueriesCount{0};
-  unsigned int d_remainingTime{0};
   uint16_t d_querySize{0};
   State d_state{State::doingHandshake};
-  bool d_readingFirstQuery{true};
   bool d_isXFR{false};
   bool d_proxyProtocolPayloadHasTLV{false};
   bool d_lastIOBlocked{false};
index c5259f08697915d0339ed656f503288f6c4e54c3..78737a7ca21f586fa92198dc020fbbe2e4aa68f6 100644 (file)
@@ -2766,7 +2766,7 @@ BOOST_AUTO_TEST_CASE(test_IncomingConnectionOOOR_BackendOOOR)
     s_readBuffer.insert(s_readBuffer.end(), queries.at(2).begin(), queries.at(2).end());
 
     /* make sure that the backend's timeout is shorter than the client's */
-    backend->tcpSendTimeout = 1;
+    backend->tcpConnectTimeout = 1;
     g_tcpRecvTimeout = 5;
 
     bool timeout = false;
@@ -2810,7 +2810,7 @@ BOOST_AUTO_TEST_CASE(test_IncomingConnectionOOOR_BackendOOOR)
     }
 
     struct timeval later = now;
-    later.tv_sec += backend->tcpSendTimeout + 1;
+    later.tv_sec += backend->tcpConnectTimeout + 1;
     auto expiredConns = threadData.mplexer->getTimeouts(later, true);
     BOOST_CHECK_EQUAL(expiredConns.size(), 1U);
     for (const auto& cbData : expiredConns) {