]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add more TCP metrics
author Remi Gacogne <remi.gacogne@powerdns.com>
Tue, 26 Mar 2019 09:24:35 +0000 (10:24 +0100)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Thu, 4 Apr 2019 09:54:05 +0000 (11:54 +0200)
pdns/dnsdist-carbon.cc
pdns/dnsdist-tcp.cc
pdns/dnsdist-web.cc
pdns/dnsdist.hh

index f8f4d943d881175c4a20446891cad90d21c46000..b9b43a7140f65a5d1d1b80272cd5d33674206ff3 100644 (file)
@@ -95,6 +95,11 @@ try
           str<<base<<"latency" << ' ' << (state->availability != DownstreamState::Availability::Down ? state->latencyUsec/1000.0 : 0) << " " << now << "\r\n";
           str<<base<<"senderrors" << ' ' << state->sendErrors.load() << " " << now << "\r\n";
           str<<base<<"outstanding" << ' ' << state->outstanding.load() << " " << now << "\r\n";
+          str<<base<<"tcpdiedsendingquery" << "' '"<< state->tcpDiedSendingQuery.load() << " " << now << "\r\n";
+          str<<base<<"tcpdiedreaddingresponse" << "' '"<< state->tcpDiedReadingResponse.load() << " " << now << "\r\n";
+          str<<base<<"tcpgaveup" << "' '"<< state->tcpGaveUp.load() << " " << now << "\r\n";
+          str<<base<<"tcpreadimeouts" << "' '"<< state->tcpReadTimeouts.load() << " " << now << "\r\n";
+          str<<base<<"tcpwritetimeouts" << "' '"<< state->tcpWriteTimeouts.load() << " " << now << "\r\n";
         }
         for(const auto& front : g_frontends) {
           if (front->udpFD == -1 && front->tcpFD == -1)
@@ -104,6 +109,11 @@ try
           boost::replace_all(frontName, ".", "_");
           const string base = namespace_name + "." + hostname + "." + instance_name + ".frontends." + frontName + ".";
           str<<base<<"queries" << ' ' << front->queries.load() << " " << now << "\r\n";
+          str<<base<<"tcpdiedreadingquery" << "' '"<< front->tcpDiedReadingQuery.load() << " " << now << "\r\n";
+          str<<base<<"tcpdiedsendingresponse" << "' '"<< front->tcpDiedSendingResponse.load() << " " << now << "\r\n";
+          str<<base<<"tcpgaveup" << "' '"<< front->tcpGaveUp.load() << " " << now << "\r\n";
+          str<<base<<"tcpclientimeouts" << "' '"<< front->tcpClientTimeouts.load() << " " << now << "\r\n";
+          str<<base<<"tcpdownstreamtimeouts" << "' '"<< front->tcpDownstreamTimeouts.load() << " " << now << "\r\n";
         }
         auto localPools = g_pools.getLocal();
         for (const auto& entry : *localPools) {
index f6f59b5abb55585d9768273c7eb6cf66958a1a2a..ebb12c8c31850dc6deb9ce36a1139abfbeaf4de5 100644 (file)
@@ -125,6 +125,10 @@ static std::unique_ptr<Socket> getConnectionToDownstream(std::shared_ptr<Downstr
 
 static void releaseDownstreamConnection(std::shared_ptr<DownstreamState>& ds, std::unique_ptr<Socket>&& socket)
 {
+  if (socket == nullptr) { // no socket was handed in, so there is nothing to release
+    return;
+  }
+
   const auto& it = t_downstreamSockets.find(ds->remote);
   if (it != t_downstreamSockets.end()) {
     auto& list = it->second;
@@ -582,6 +586,7 @@ static void sendResponse(std::shared_ptr<IncomingTCPConnectionState>& state)
   }
   catch (const std::exception& e) {
     vinfolog("Got an exception while writing TCP response to %s: %s", state->d_ci.remote.toStringWithPort(), e.what());
+    ++state->d_ci.cs->tcpDiedSendingResponse; // record that writing the response to the client failed
     handleNewIOState(state, IOState::Done, state->d_ci.fd, handleIOCallback);
   }
 }
@@ -663,6 +668,8 @@ static void sendQueryToBackend(std::shared_ptr<IncomingTCPConnectionState>& stat
     state->d_downstreamSocket = getConnectionToDownstream(ds, state->d_downstreamFailures, state->d_freshDownstreamConnection);
 
     if (!state->d_downstreamSocket) {
+      ++ds->tcpGaveUp; // accounted on the backend...
+      ++state->d_ci.cs->tcpGaveUp; // ...and on the `cs` of the incoming connection
       vinfolog("Downstream connection to %s failed %d times in a row, giving up.", ds->getName(), state->d_downstreamFailures);
       return;
     }
@@ -672,6 +679,8 @@ static void sendQueryToBackend(std::shared_ptr<IncomingTCPConnectionState>& stat
     return;
   }
 
+  ++ds->tcpGaveUp; // giving up on this backend: accounted on the backend...
+  ++state->d_ci.cs->tcpGaveUp; // ...and on the `cs` of the incoming connection
   vinfolog("Downstream connection to %s failed %u times in a row, giving up.", ds->getName(), state->d_downstreamFailures);
 }
 
@@ -880,6 +889,13 @@ static void handleDownstreamIOCallback(int fd, FDMultiplexer::funcparam_t& param
        Let's just drop the connection
     */
     vinfolog("Got an exception while handling (%s backend) TCP query from %s: %s", (state->d_lastIOState == IOState::NeedRead ? "reading from" : "writing to"), state->d_ci.remote.toStringWithPort(), e.what());
+    if (state->d_state == IncomingTCPConnectionState::State::sendingQueryToBackend) { // the failure happened in the sendingQueryToBackend state
+      ++state->d_ds->tcpDiedSendingQuery;
+    }
+    else { // every other state is accounted as a failure while reading the response
+      ++state->d_ds->tcpDiedReadingResponse;
+    }
+
     /* don't increase this counter when reusing connections */
     if (state->d_freshDownstreamConnection) {
       ++state->d_downstreamFailures;
@@ -977,6 +993,15 @@ static void handleIOCallback(int fd, FDMultiplexer::funcparam_t& param)
        but it might also be a real IO error or something else.
        Let's just drop the connection
     */
+    if (state->d_state == IncomingTCPConnectionState::State::doingHandshake || // handshake and both query-reading states all count as tcpDiedReadingQuery
+        state->d_state == IncomingTCPConnectionState::State::readingQuerySize ||
+        state->d_state == IncomingTCPConnectionState::State::readingQuery) {
+      ++state->d_ci.cs->tcpDiedReadingQuery;
+    }
+    else if (state->d_state == IncomingTCPConnectionState::State::sendingResponse) {
+      ++state->d_ci.cs->tcpDiedSendingResponse;
+    } // any state other than the four tested above leaves both counters untouched
+
     if (state->d_lastIOState == IOState::NeedWrite || state->d_readingFirstQuery) {
       vinfolog("Got an exception while handling (%s) TCP query from %s: %s", (state->d_lastIOState == IOState::NeedRead ? "reading" : "writing"), state->d_ci.remote.toStringWithPort(), e.what());
     }
@@ -1061,9 +1086,12 @@ void tcpClientThread(int pipefd)
         auto state = boost::any_cast<std::shared_ptr<IncomingTCPConnectionState>>(conn.second);
         if (conn.first == state->d_ci.fd) {
           vinfolog("Timeout (read) from remote TCP client %s", state->d_ci.remote.toStringWithPort());
+          ++state->d_ci.cs->tcpClientTimeouts; // the timed-out descriptor is the client's own fd
         }
         else if (state->d_ds) {
           vinfolog("Timeout (read) from remote backend %s", state->d_ds->getName());
+          ++state->d_ci.cs->tcpDownstreamTimeouts; // backend timeout, accounted on the `cs` of the incoming connection...
+          ++state->d_ds->tcpReadTimeouts; // ...and as a read timeout on the backend itself
         }
         data.mplexer->removeReadFD(conn.first);
         state->d_lastIOState = IOState::Done;
@@ -1074,9 +1102,12 @@ void tcpClientThread(int pipefd)
         auto state = boost::any_cast<std::shared_ptr<IncomingTCPConnectionState>>(conn.second);
         if (conn.first == state->d_ci.fd) {
           vinfolog("Timeout (write) from remote TCP client %s", state->d_ci.remote.toStringWithPort());
+          ++state->d_ci.cs->tcpClientTimeouts; // same counter as for client read timeouts
         }
         else if (state->d_ds) {
           vinfolog("Timeout (write) from remote backend %s", state->d_ds->getName());
+          ++state->d_ci.cs->tcpDownstreamTimeouts; // backend timeout, accounted on the `cs` of the incoming connection...
+          ++state->d_ds->tcpWriteTimeouts; // ...and as a write timeout on the backend itself
         }
         data.mplexer->removeWriteFD(conn.first);
         state->d_lastIOState = IOState::Done;
index 9cb51a622b17d5f1b37fb251402a9737284746f7..1ee60dccd5d6b8da0b0a96fca9fbef05ba3ed919 100644 (file)
@@ -446,20 +446,30 @@ static void connectionThread(int sock, ComboAddress remote)
         auto states = g_dstates.getLocal();
         const string statesbase = "dnsdist_server_";
 
-        output << "# HELP " << statesbase << "queries "     << "Amount of queries relayed to server"                               << "\n";
-        output << "# TYPE " << statesbase << "queries "     << "counter"                                                           << "\n";
-        output << "# HELP " << statesbase << "drops "       << "Amount of queries not answered by server"                          << "\n";
-        output << "# TYPE " << statesbase << "drops "       << "counter"                                                           << "\n";
-        output << "# HELP " << statesbase << "latency "     << "Server's latency when answering questions in miliseconds"          << "\n";
-        output << "# TYPE " << statesbase << "latency "     << "gauge"                                                             << "\n";
-        output << "# HELP " << statesbase << "senderrors "  << "Total number of OS snd errors while relaying queries"              << "\n";
-        output << "# TYPE " << statesbase << "senderrors "  << "counter"                                                           << "\n";
-        output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n";
-        output << "# TYPE " << statesbase << "outstanding " << "gauge"                                                             << "\n";
-        output << "# HELP " << statesbase << "order "       << "The order in which this server is picked"                          << "\n";
-        output << "# TYPE " << statesbase << "order "       << "gauge"                                                             << "\n";
-        output << "# HELP " << statesbase << "weight "      << "The weight within the order in which this server is picked"        << "\n";
-        output << "# TYPE " << statesbase << "weight "      << "gauge"                                                             << "\n";
+        output << "# HELP " << statesbase << "queries "                << "Amount of queries relayed to server"                               << "\n";
+        output << "# TYPE " << statesbase << "queries "                << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "drops "                  << "Amount of queries not answered by server"                          << "\n";
+        output << "# TYPE " << statesbase << "drops "                  << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "latency "                << "Server's latency when answering questions in miliseconds"          << "\n";
+        output << "# TYPE " << statesbase << "latency "                << "gauge"                                                             << "\n";
+        output << "# HELP " << statesbase << "senderrors "             << "Total number of OS snd errors while relaying queries"              << "\n";
+        output << "# TYPE " << statesbase << "senderrors "             << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "outstanding "            << "Current number of queries that are waiting for a backend response" << "\n";
+        output << "# TYPE " << statesbase << "outstanding "            << "gauge"                                                             << "\n";
+        output << "# HELP " << statesbase << "order "                  << "The order in which this server is picked"                          << "\n";
+        output << "# TYPE " << statesbase << "order "                  << "gauge"                                                             << "\n";
+        output << "# HELP " << statesbase << "weight "                 << "The weight within the order in which this server is picked"        << "\n";
+        output << "# TYPE " << statesbase << "weight "                 << "gauge"                                                             << "\n";
+        output << "# HELP " << statesbase << "tcpdiedsendingquery "    << "The number of TCP I/O errors while sending the query"              << "\n";
+        output << "# TYPE " << statesbase << "tcpdiedsendingquery "    << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "tcpdiedreadingresponse " << "The number of TCP I/O errors while reading the response"           << "\n";
+        output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "tcpgaveup "              << "The number of TCP connections failing after too many attempts"     << "\n";
+        output << "# TYPE " << statesbase << "tcpgaveup "              << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "tcpreadtimeouts "        << "The number of TCP read timeouts"                                   << "\n";
+        output << "# TYPE " << statesbase << "tcpreadtimeouts "        << "counter"                                                           << "\n";
+        output << "# HELP " << statesbase << "tcpwritetimeouts "       << "The number of TCP write timeouts"                                  << "\n";
+        output << "# TYPE " << statesbase << "tcpwritetimeouts "       << "counter"                                                           << "\n";
 
         for (const auto& state : *states) {
           string serverName;
@@ -474,13 +484,18 @@ static void connectionThread(int sock, ComboAddress remote)
           const std::string label = boost::str(boost::format("{server=\"%1%\",address=\"%2%\"}")
             % serverName % state->remote.toStringWithPort());
 
-          output << statesbase << "queries"     << label << " " << state->queries.load()     << "\n";
-          output << statesbase << "drops"       << label << " " << state->reuseds.load()     << "\n";
-          output << statesbase << "latency"     << label << " " << state->latencyUsec/1000.0 << "\n";
-          output << statesbase << "senderrors"  << label << " " << state->sendErrors.load()  << "\n";
-          output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n";
-          output << statesbase << "order"       << label << " " << state->order              << "\n";
-          output << statesbase << "weight"      << label << " " << state->weight             << "\n";
+          output << statesbase << "queries"                << label << " " << state->queries.load()         << "\n";
+          output << statesbase << "drops"                  << label << " " << state->reuseds.load()         << "\n";
+          output << statesbase << "latency"                << label << " " << state->latencyUsec/1000.0     << "\n";
+          output << statesbase << "senderrors"             << label << " " << state->sendErrors.load()      << "\n";
+          output << statesbase << "outstanding"            << label << " " << state->outstanding.load()     << "\n";
+          output << statesbase << "order"                  << label << " " << state->order                  << "\n";
+          output << statesbase << "weight"                 << label << " " << state->weight                 << "\n";
+          output << statesbase << "tcpdiedsendingquery"    << label << " " << state->tcpDiedSendingQuery    << "\n";
+          output << statesbase << "tcpdiedreadingresponse" << label << " " << state->tcpDiedReadingResponse << "\n";
+          output << statesbase << "tcpgaveup"              << label << " " << state->tcpGaveUp              << "\n";
+          output << statesbase << "tcpreadtimeouts"        << label << " " << state->tcpReadTimeouts        << "\n";
+          output << statesbase << "tcpwritetimeouts"       << label << " " << state->tcpWriteTimeouts       << "\n";
         }
 
         for (const auto& front : g_frontends) {
@@ -562,6 +577,11 @@ static void connectionThread(int sock, ComboAddress remote)
           {"latency", (double)(a->latencyUsec/1000.0)},
           {"queries", (double)a->queries},
           {"sendErrors", (double)a->sendErrors},
+          {"tcpDiedSendingQuery", (double)a->tcpDiedSendingQuery},
+          {"tcpDiedReadingResponse", (double)a->tcpDiedReadingResponse},
+          {"tcpGaveUp", (double)a->tcpGaveUp},
+          {"tcpReadTimeouts", (double)a->tcpReadTimeouts},
+          {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts},
           {"dropRate", (double)a->dropRate}
         };
 
@@ -583,7 +603,12 @@ static void connectionThread(int sock, ComboAddress remote)
           { "address", front->local.toStringWithPort() },
           { "udp", front->udpFD >= 0 },
           { "tcp", front->tcpFD >= 0 },
-          { "queries", (double) front->queries.load() }
+          { "queries", (double) front->queries.load() },
+          { "tcpDiedReadingQuery", (double) front->tcpDiedReadingQuery.load() },
+          { "tcpDiedSendingResponse", (double) front->tcpDiedSendingResponse.load() },
+          { "tcpGaveUp", (double) front->tcpGaveUp.load() },
+          { "tcpClientTimeouts", (double) front->tcpClientTimeouts },
+          { "tcpDownstreamTimeouts", (double) front->tcpDownstreamTimeouts },
         };
         frontends.push_back(frontend);
       }
index f76888c09c39e985d0d5220010895c5cf7db4b38..9b35daaa62aadf9b1919403422c703813fdccba1 100644 (file)
@@ -586,6 +586,11 @@ struct ClientState
   std::shared_ptr<DNSCryptContext> dnscryptCtx{nullptr};
   shared_ptr<TLSFrontend> tlsFrontend;
   std::atomic<uint64_t> queries{0};
+  std::atomic<uint64_t> tcpDiedReadingQuery{0}; // client-side failures in the doingHandshake, readingQuerySize or readingQuery states (dnsdist-tcp.cc)
+  std::atomic<uint64_t> tcpDiedSendingResponse{0}; // client-side failures while the response was being sent
+  std::atomic<uint64_t> tcpGaveUp{0}; // downstream connection failed too many times in a row and was given up
+  std::atomic<uint64_t> tcpClientTimeouts{0}; // read or write timeouts on the client descriptor
+  std::atomic<uint64_t> tcpDownstreamTimeouts{0}; // read or write timeouts on the backend descriptor
   int udpFD{-1};
   int tcpFD{-1};
   bool muted{false};
@@ -720,6 +725,11 @@ struct DownstreamState
     std::atomic<uint64_t> reuseds{0};
     std::atomic<uint64_t> queries{0};
   } prev;
+  std::atomic<uint64_t> tcpDiedSendingQuery{0}; // TCP I/O errors while sending the query to this backend
+  std::atomic<uint64_t> tcpDiedReadingResponse{0}; // TCP I/O errors while reading the response from this backend
+  std::atomic<uint64_t> tcpGaveUp{0}; // TCP connections that failed after too many attempts
+  std::atomic<uint64_t> tcpReadTimeouts{0}; // TCP read timeouts from this backend
+  std::atomic<uint64_t> tcpWriteTimeouts{0}; // TCP write timeouts to this backend
   string name;
   size_t socketsOffset{0};
   double queryLoad{0.0};