From: Remi Gacogne Date: Wed, 5 Apr 2023 20:17:23 +0000 (+0200) Subject: dnsdist: Properly handle reconnection failure for backend UDP sockets X-Git-Tag: dnsdist-1.8.1~10^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c9b9647d874a8b63e5936a94c404d7d875760aa7;p=thirdparty%2Fpdns.git dnsdist: Properly handle reconnection failure for backend UDP sockets We try to reconnect our UDP sockets toward backends on some kind of network errors that indicate a topology change, but we need to be careful to handle the case where we actually fail to reconnect, as we end up with no remaining sockets to use. This commit properly deals with this case by pausing the thread handling UDP responses from the backend, instead of having it enter a busy loop, and by attempting to reconnect if we get a `bad file number` error when trying to send a UDP datagram to the backend. (cherry picked from commit 541b8df1fc0773549a76c8de13fb1123baba8bda) --- diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index 7642b3a2d1..3bcdcae438 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -722,9 +722,25 @@ void responderThread(std::shared_ptr dss) std::vector sockets; sockets.reserve(dss->sockets.size()); - for(;;) { + for (;;) { try { + if (dss->isStopped()) { + break; + } + + if (!dss->connected) { + /* the sockets are not connected yet, likely because we detected a problem, + tried to reconnect and it failed. We will try to reconnect after the next + successful health-check (unless reconnectOnUp is false), or when trying + to send in the UDP listener thread, but until then we simply need to wait. */ + dss->waitUntilConnected(); + continue; + } + dss->pickSocketsReadyForReceiving(sockets); + + /* check a second time here because we might have waited quite a bit + since the first check */ if (dss->isStopped()) { break; } @@ -1117,7 +1133,7 @@ ssize_t udpClientSendRequestToBackend(const std::shared_ptr& ss We don't want to reconnect the real socket if the healthcheck failed, because it's not using the same socket. */ - if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH)) { + if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH || savederrno == EBADF)) { ss->reconnect(); } } diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index 472a729ba1..a26adeba9e 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -23,6 +23,7 @@ #include "config.h" #include "ext/luawrapper/include/LuaContext.hpp" +#include #include #include #include @@ -903,6 +904,7 @@ private: std::thread tid; std::mutex connectLock; + std::condition_variable d_connectedWait; std::atomic_flag threadStarted; bool d_stopped{false}; public: @@ -975,6 +977,7 @@ public: } bool reconnect(); + void waitUntilConnected(); void hash(); void setId(const boost::uuids::uuid& newId); void setWeight(int newWeight); diff --git a/pdns/dnsdistdist/dnsdist-backend.cc b/pdns/dnsdistdist/dnsdist-backend.cc index cfa6e5c7b6..11f2e11f2c 100644 --- a/pdns/dnsdistdist/dnsdist-backend.cc +++ b/pdns/dnsdistdist/dnsdist-backend.cc @@ -115,9 +115,30 @@ bool DownstreamState::reconnect() } } + if (connected) { + tl.unlock(); + d_connectedWait.notify_all(); + } + return connected; } +void DownstreamState::waitUntilConnected() +{ + if (d_stopped) { + return; + } + if (connected) { + return; + } + { + std::unique_lock lock(connectLock); + d_connectedWait.wait(lock, [this]{ + return connected.load(); + }); + } +} + void DownstreamState::stop() { if (d_stopped) {