From: Remi Gacogne Date: Wed, 5 Apr 2023 20:17:23 +0000 (+0200) Subject: dnsdist: Properly handle reconnection failure for backend UDP sockets X-Git-Tag: rec-4.9.0-beta1~7^2~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=541b8df1fc0773549a76c8de13fb1123baba8bda;p=thirdparty%2Fpdns.git dnsdist: Properly handle reconnection failure for backend UDP sockets We try to reconnect our UDP sockets toward backends on some kind of network errors that indicate a topology change, but we need to be careful to handle the case where we actually fail to reconnect, as we end up with no remaining sockets to use. This commit properly deals with this case by pausing the thread handling UDP responses from the backend, instead of having it enter a busy loop, and by attempting to reconnect if we get a `bad file number` error when trying to send a UDP datagram to the backend. --- diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index 06aa910ab6..0c85289e2a 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -722,9 +722,25 @@ void responderThread(std::shared_ptr dss) std::vector sockets; sockets.reserve(dss->sockets.size()); - for(;;) { + for (;;) { try { + if (dss->isStopped()) { + break; + } + + if (!dss->connected) { + /* the sockets are not connected yet, likely because we detected a problem, + tried to reconnect and it failed. We will try to reconnect after the next + successful health-check (unless reconnectOnUp is false), or when trying + to send in the UDP listener thread, but until then we simply need to wait. */ + dss->waitUntilConnected(); + continue; + } + dss->pickSocketsReadyForReceiving(sockets); + + /* check a second time here because we might have waited quite a bit + since the first check */ if (dss->isStopped()) { break; } @@ -1117,7 +1133,7 @@ ssize_t udpClientSendRequestToBackend(const std::shared_ptr& ss We don't want to reconnect the real socket if the healthcheck failed, because it's not using the same socket. */ - if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH)) { + if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV || savederrno == ENETUNREACH || savederrno == EBADF)) { ss->reconnect(); } } diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index 472a729ba1..a26adeba9e 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -23,6 +23,7 @@ #include "config.h" #include "ext/luawrapper/include/LuaContext.hpp" +#include #include #include #include @@ -903,6 +904,7 @@ private: std::thread tid; std::mutex connectLock; + std::condition_variable d_connectedWait; std::atomic_flag threadStarted; bool d_stopped{false}; public: @@ -975,6 +977,7 @@ public: } bool reconnect(); + void waitUntilConnected(); void hash(); void setId(const boost::uuids::uuid& newId); void setWeight(int newWeight); diff --git a/pdns/dnsdistdist/dnsdist-backend.cc b/pdns/dnsdistdist/dnsdist-backend.cc index 9113183c83..8ffa1fba61 100644 --- a/pdns/dnsdistdist/dnsdist-backend.cc +++ b/pdns/dnsdistdist/dnsdist-backend.cc @@ -116,9 +116,30 @@ bool DownstreamState::reconnect() } } + if (connected) { + tl.unlock(); + d_connectedWait.notify_all(); + } + return connected; } +void DownstreamState::waitUntilConnected() +{ + if (d_stopped) { + return; + } + if (connected) { + return; + } + { + std::unique_lock lock(connectLock); + d_connectedWait.wait(lock, [this]{ + return connected.load(); + }); + } +} + void DownstreamState::stop() { if (d_stopped) {