git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Properly handle reconnection failure for backend UDP sockets
author Remi Gacogne <remi.gacogne@powerdns.com>
Wed, 5 Apr 2023 20:17:23 +0000 (22:17 +0200)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Fri, 12 May 2023 14:36:10 +0000 (16:36 +0200)
We try to reconnect our UDP sockets toward backends on certain kinds of
network errors that indicate a topology change, but we need to be
careful to handle the case where we actually fail to reconnect, since
we then end up with no remaining sockets to use.
This commit properly deals with this case by pausing the thread handling
UDP responses from the backend, instead of having it enter a busy loop,
and by attempting to reconnect if we get a `bad file number` error when
trying to send a UDP datagram to the backend.

pdns/dnsdist.cc
pdns/dnsdist.hh
pdns/dnsdistdist/dnsdist-backend.cc

index 06aa910ab68f91e832c0853007591d388951f93b..0c85289e2ab4793afd48080fc9bcbb11dc1ac97d 100644 (file)
@@ -722,9 +722,25 @@ void responderThread(std::shared_ptr<DownstreamState> dss)
   std::vector<int> sockets;
   sockets.reserve(dss->sockets.size());
 
-  for(;;) {
+  for (;;) {
     try {
+      if (dss->isStopped()) {
+        break;
+      }
+
+      if (!dss->connected) {
+        /* the sockets are not connected yet, likely because we detected a problem,
+           tried to reconnect and it failed. We will try to reconnect after the next
+           successful health-check (unless reconnectOnUp is false), or when trying
+           to send in the UDP listener thread, but until then we simply need to wait. */
+        dss->waitUntilConnected();
+        continue;
+      }
+
       dss->pickSocketsReadyForReceiving(sockets);
+
+      /* check a second time here because we might have waited quite a bit
+         since the first check */
       if (dss->isStopped()) {
         break;
       }
@@ -1117,7 +1133,7 @@ ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss
        We don't want to reconnect the real socket if the healthcheck failed,
        because it's not using the same socket.
     */
-    if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH)) {
+    if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH || savederrno == EBADF)) {
       ss->reconnect();
     }
   }
index 472a729ba17088ba3ea5002d59dda03eba82f98d..a26adeba9e753897f04f228f43f71016de8a2c1b 100644 (file)
@@ -23,6 +23,7 @@
 #include "config.h"
 #include "ext/luawrapper/include/LuaContext.hpp"
 
+#include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -903,6 +904,7 @@ private:
 
   std::thread tid;
   std::mutex connectLock;
+  std::condition_variable d_connectedWait;
   std::atomic_flag threadStarted;
   bool d_stopped{false};
 public:
@@ -975,6 +977,7 @@ public:
   }
 
   bool reconnect();
+  void waitUntilConnected();
   void hash();
   void setId(const boost::uuids::uuid& newId);
   void setWeight(int newWeight);
index 9113183c83f31d9ea28984f77622cb7eaa152e4c..8ffa1fba61c2aaa6b72808f31be2b7a6a6c82f5a 100644 (file)
@@ -116,9 +116,30 @@ bool DownstreamState::reconnect()
     }
   }
 
+  if (connected) {
+    tl.unlock();
+    d_connectedWait.notify_all();
+  }
+
   return connected;
 }
 
+void DownstreamState::waitUntilConnected() // Blocks the caller until `connected` becomes true; returns at once if the backend is stopped or already connected.
+{
+  if (d_stopped) { // a stopped backend is never waited on
+    return;
+  }
+  if (connected) { // fast path: already connected, so connectLock is not taken
+    return;
+  }
+  { // slow path: sleep on the condition variable that reconnect() notifies after a successful reconnection
+    std::unique_lock<std::mutex> lock(connectLock);
+    d_connectedWait.wait(lock, [this]{ // predicate overload: the predicate is checked before blocking and again after every wake-up
+      return connected.load(); // NOTE(review): d_stopped is not part of the predicate; whether stop() wakes this waiter is not visible here, confirm
+    });
+  }
+}
+
 void DownstreamState::stop()
 {
   if (d_stopped) {