]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Properly handle reconnection failure for backend UDP sockets
authorRemi Gacogne <remi.gacogne@powerdns.com>
Wed, 5 Apr 2023 20:17:23 +0000 (22:17 +0200)
committerRemi Gacogne <remi.gacogne@powerdns.com>
Mon, 14 Aug 2023 14:35:43 +0000 (16:35 +0200)
We try to reconnect our UDP sockets toward backends on certain kinds of
network errors that indicate a topology change, but we need to be
careful to handle the case where we actually fail to reconnect, as
we then end up with no remaining sockets to use.
This commit properly deals with this case by pausing the thread handling
UDP responses from the backend, instead of having it enter a busy loop,
and by attempting to reconnect if we get a `bad file number` error when
trying to send a UDP datagram to the backend.

(cherry picked from commit 541b8df1fc0773549a76c8de13fb1123baba8bda)

pdns/dnsdist.cc
pdns/dnsdist.hh
pdns/dnsdistdist/dnsdist-backend.cc

index 7642b3a2d139d530a2cc64c34f28e87961e18b4c..3bcdcae4386120a0732340053cb101501bb382f3 100644 (file)
@@ -722,9 +722,25 @@ void responderThread(std::shared_ptr<DownstreamState> dss)
   std::vector<int> sockets;
   sockets.reserve(dss->sockets.size());
 
-  for(;;) {
+  for (;;) {
     try {
+      if (dss->isStopped()) {
+        break;
+      }
+
+      if (!dss->connected) {
+        /* the sockets are not connected yet, likely because we detected a problem,
+           tried to reconnect and it failed. We will try to reconnect after the next
+           successful health-check (unless reconnectOnUp is false), or when trying
+           to send in the UDP listener thread, but until then we simply need to wait. */
+        dss->waitUntilConnected();
+        continue;
+      }
+
       dss->pickSocketsReadyForReceiving(sockets);
+
+      /* check a second time here because we might have waited quite a bit
+         since the first check */
       if (dss->isStopped()) {
         break;
       }
@@ -1117,7 +1133,7 @@ ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss
        We don't want to reconnect the real socket if the healthcheck failed,
        because it's not using the same socket.
     */
-    if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH)) {
+    if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH || savederrno == EBADF)) {
       ss->reconnect();
     }
   }
index 472a729ba17088ba3ea5002d59dda03eba82f98d..a26adeba9e753897f04f228f43f71016de8a2c1b 100644 (file)
@@ -23,6 +23,7 @@
 #include "config.h"
 #include "ext/luawrapper/include/LuaContext.hpp"
 
+#include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -903,6 +904,7 @@ private:
 
   std::thread tid;
   std::mutex connectLock;
+  std::condition_variable d_connectedWait;
   std::atomic_flag threadStarted;
   bool d_stopped{false};
 public:
@@ -975,6 +977,7 @@ public:
   }
 
   bool reconnect();
+  void waitUntilConnected();
   void hash();
   void setId(const boost::uuids::uuid& newId);
   void setWeight(int newWeight);
index cfa6e5c7b653e5177b76b603ef147f63dc47d187..11f2e11f2ceb42139768ee6bca36a62c3e4d0f73 100644 (file)
@@ -115,9 +115,30 @@ bool DownstreamState::reconnect()
     }
   }
 
+  if (connected) {
+    tl.unlock();
+    d_connectedWait.notify_all();
+  }
+
   return connected;
 }
 
+void DownstreamState::waitUntilConnected() // Blocks the calling thread until 'connected' is true; returns at once if d_stopped or connected is already set.
+{
+  if (d_stopped) { // nothing to wait for once d_stopped is set
+    return;
+  }
+  if (connected) { // fast path: already connected, so connectLock is never taken
+    return;
+  }
+  { // inner scope bounds the lifetime of the lock
+    std::unique_lock<std::mutex> lock(connectLock); // condition_variable::wait() needs a unique_lock, which it releases while blocked
+    d_connectedWait.wait(lock, [this]{ // predicate form: re-evaluated on every wake-up; reconnect() calls notify_all() after a successful reconnection
+      return connected.load(); // NOTE(review): d_stopped is not part of this predicate, so a stop request alone would not end the wait; confirm stop() accounts for that
+    });
+  }
+}
+
 void DownstreamState::stop()
 {
   if (d_stopped) {