From: Remi Gacogne Date: Tue, 4 Oct 2022 17:13:32 +0000 (+0200) Subject: dnsdist: Implement a 'lazy' health-checking mode X-Git-Tag: dnsdist-1.8.0-rc1~271^2~11 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e3ab12d9ffad04ce80207bc6e8c656b5e667aa7c;p=thirdparty%2Fpdns.git dnsdist: Implement a 'lazy' health-checking mode The general idea is that, on some low-end devices, we want to avoid sending health-check queries to the backend at a regular interval, as this might represent an order of magnitude more queries than regular traffic. To do that, we instead look at the number of timeouts and 'Server Failure' errors occurring for regular queries, and only start doing active health-checking once these errors reach a certain threshold, temporarily placing the backend in a 'potential failure' state. If the health-check queries succeed, we quickly go back to the previous 'Healthy' state and stop sending health-check queries. Otherwise we place the backend in the 'Failed' state and keep sending health-check queries, at a reduced rate, for as long as the backend is failing. --- diff --git a/pdns/dnsdist-lua-bindings.cc b/pdns/dnsdist-lua-bindings.cc index f9c321c7e7..8f0894d925 100644 --- a/pdns/dnsdist-lua-bindings.cc +++ b/pdns/dnsdist-lua-bindings.cc @@ -129,6 +129,12 @@ void setupLuaBindings(LuaContext& luaCtx, bool client) } s.setAuto(); }); + luaCtx.registerFunction<void (DownstreamState::*)(boost::optional<bool> newStatus)>("setLazyAuto", [](DownstreamState& s, boost::optional<bool> newStatus) { + if (newStatus) { + s.setUpStatus(*newStatus); + } + s.setLazyAuto(); + }); luaCtx.registerFunction("getName", [](const DownstreamState& s) { return s.getName(); }); luaCtx.registerFunction("getNameWithAddr", [](const DownstreamState& s) { return s.getNameWithAddr(); }); luaCtx.registerMember("upStatus", &DownstreamState::upStatus); diff --git a/pdns/dnsdist-lua.cc b/pdns/dnsdist-lua.cc index 96c46ceb45..aa5cf53ff8 100644 --- a/pdns/dnsdist-lua.cc +++ b/pdns/dnsdist-lua.cc @@ -475,6 +475,25 @@ static void setupLuaConfig(LuaContext& luaCtx, bool client, bool configCheck) config.id = boost::uuids::string_generator()(boost::get<std::string>(vars["id"])); } + if (vars.count("healthCheckMode")) { + auto mode = boost::get<std::string>(vars.at("healthCheckMode")); + if (pdns_iequals(mode, "auto")) { + config.availability = DownstreamState::Availability::Auto; + } + else if (pdns_iequals(mode, "lazy")) { + config.availability = DownstreamState::Availability::Lazy; + } + else if (pdns_iequals(mode, "up")) { + config.availability = DownstreamState::Availability::Up; + } + else if (pdns_iequals(mode, "down")) { + config.availability = DownstreamState::Availability::Down; + } + else { + warnlog("Ignoring unknown value '%s' for 'healthCheckMode' on 'newServer'", mode); + } + } + if (vars.count("checkName")) { config.checkName = DNSName(boost::get<std::string>(vars["checkName"])); } @@ -507,6 +526,35 @@ static void setupLuaConfig(LuaContext& luaCtx, bool client, bool configCheck) config.mustResolve = boost::get<bool>(vars["mustResolve"]); } + if (vars.count("lazyHealthCheckSampleSize")) { + config.d_lazyHealthChecksSampleSize = std::stoi(boost::get<std::string>(vars.at("lazyHealthCheckSampleSize"))); + } + + if (vars.count("lazyHealthCheckMinSampleCount")) { + config.d_lazyHealthChecksMinSampleCount = std::stoi(boost::get<std::string>(vars.at("lazyHealthCheckMinSampleCount"))); + } + + if (vars.count("lazyHealthCheckThreshold")) { + config.d_lazyHealthChecksThreshold = std::stoi(boost::get<std::string>(vars.at("lazyHealthCheckThreshold"))); + } + + if (vars.count("lazyHealthCheckFailedInterval")) { + config.d_lazyHealthChecksFailedInterval =
std::stoi(boost::get<std::string>(vars.at("lazyHealthCheckFailedInterval"))); + } + + if (vars.count("lazyHealthCheckMode")) { + auto mode = boost::get<std::string>(vars.at("lazyHealthCheckMode")); + if (pdns_iequals(mode, "TimeoutOnly")) { + config.d_lazyHealthChecksMode = DownstreamState::LazyHealthCheckMode::TimeoutOnly; + } + else if (pdns_iequals(mode, "TimeoutOrServFail")) { + config.d_lazyHealthChecksMode = DownstreamState::LazyHealthCheckMode::TimeoutOrServFail; + } + else { + warnlog("Ignoring unknown value '%s' for 'lazyHealthCheckMode' on 'newServer'", mode); + } + } + if (vars.count("useClientSubnet")) { config.useECS = boost::get<bool>(vars["useClientSubnet"]); } diff --git a/pdns/dnsdist-snmp.cc b/pdns/dnsdist-snmp.cc index 1f102e2408..43f425f7ff 100644 --- a/pdns/dnsdist-snmp.cc +++ b/pdns/dnsdist-snmp.cc @@ -375,11 +375,11 @@ static int backendStatTable_handler(netsnmp_mib_handler* handler, } #endif /* HAVE_NET_SNMP */ -bool DNSDistSNMPAgent::sendBackendStatusChangeTrap(const std::shared_ptr<DownstreamState>& dss) +bool DNSDistSNMPAgent::sendBackendStatusChangeTrap(const DownstreamState& dss) { #ifdef HAVE_NET_SNMP - const string backendAddress = dss->d_config.remote.toStringWithPort(); - const string backendStatus = dss->getStatus(); + const string backendAddress = dss.d_config.remote.toStringWithPort(); + const string backendStatus = dss.getStatus(); netsnmp_variable_list* varList = nullptr; snmp_varlist_add_variable(&varList, @@ -394,8 +394,8 @@ bool DNSDistSNMPAgent::sendBackendStatusChangeTrap(const std::shared_ptr<DownstreamState>& dss) - dss->getName().c_str(), - dss->getName().size()); + dss.getName().c_str(), + dss.getName().size()); snmp_varlist_add_variable(&varList, backendAddressOID, diff --git a/pdns/dnsdist-snmp.hh b/pdns/dnsdist-snmp.hh index 3ccde7802b..283d43c6a5 100644 --- a/pdns/dnsdist-snmp.hh +++ b/pdns/dnsdist-snmp.hh @@ -31,7 +31,7 @@ class DNSDistSNMPAgent: public SNMPAgent { public: DNSDistSNMPAgent(const std::string& name, const std::string& daemonSocket); - bool sendBackendStatusChangeTrap(const std::shared_ptr<DownstreamState>&); + bool sendBackendStatusChangeTrap(const DownstreamState&); bool sendCustomTrap(const std::string& reason); bool sendDNSTrap(const DNSQuestion&, const std::string& reason=""); }; diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index 1c29f54f94..f9d990e7a6 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -726,6 +726,7 @@ void responderThread(std::shared_ptr<DownstreamState> dss) double udiff = ids->sentTime.udiff(); // do that _before_ the processing, otherwise it's not fair to the backend dss->latencyUsec = (127.0 * dss->latencyUsec / 128.0) + udiff / 128.0; + dss->reportResponse(dh->rcode); /* don't call processResponse for DOH */ if (du) { @@ -1940,9 +1941,9 @@ static void healthChecksThread() { setThreadName("dnsdist/healthC"); - static const int interval = 1; + constexpr int interval = 1; - for(;;) { + for (;;) { sleep(interval); std::unique_ptr<FDMultiplexer> mplexer{nullptr}; @@ -1955,23 +1956,18 @@ static void healthChecksThread() dss->prev.queries.store(dss->queries.load()); dss->prev.reuseds.store(dss->reuseds.load()); - dss->handleTimeouts(); + dss->handleUDPTimeouts(); - if (dss->d_nextCheck > 1) { - --dss->d_nextCheck; + if (!dss->healthCheckRequired()) { continue; } - dss->d_nextCheck = dss->d_config.checkInterval; - - if (dss->d_config.availability == DownstreamState::Availability::Auto) { - if (!mplexer) { - mplexer = std::unique_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent()); - } + if (!mplexer) { + mplexer = std::unique_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent()); + } - if (!queueHealthCheck(mplexer, dss)) { -
updateHealthCheckResult(dss, false, false); - } + if (!queueHealthCheck(mplexer, dss)) { + dss->submitHealthCheckResult(false, false); } } diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index 7b8a0d2212..c9df37bc57 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -33,6 +33,7 @@ #include +#include "circular_buffer.hh" #include "dnscrypt.hh" #include "dnsdist-cache.hh" #include "dnsdist-dynbpf.hh" @@ -765,7 +766,8 @@ struct DownstreamState: public std::enable_shared_from_this DownstreamState& operator=(DownstreamState&&) = delete; typedef std::function(const DNSName&, uint16_t, uint16_t, dnsheader*)> checkfunc_t; - enum class Availability : uint8_t { Up, Down, Auto}; + enum class Availability : uint8_t { Up, Down, Auto, Lazy }; + enum class LazyHealthCheckMode : uint8_t { TimeoutOnly, TimeoutOrServFail }; struct Config { @@ -805,6 +807,11 @@ struct DownstreamState: public std::enable_shared_from_this uint16_t d_retries{5}; uint16_t xpfRRCode{0}; uint16_t checkTimeout{1000}; /* in milliseconds */ + uint16_t d_lazyHealthChecksSampleSize{100}; + uint16_t d_lazyHealthChecksMinSampleCount{1}; + uint16_t d_lazyHealthChecksFailedInterval{30}; + uint8_t d_lazyHealthChecksThreshold{20}; + LazyHealthCheckMode d_lazyHealthChecksMode{LazyHealthCheckMode::TimeoutOrServFail}; uint8_t maxCheckFailures{1}; uint8_t minRiseSuccesses{1}; Availability availability{Availability::Auto}; @@ -867,6 +874,16 @@ struct DownstreamState: public std::enable_shared_from_this private: LockGuarded> d_idStatesMap; vector idStates; + + struct LazyHealthCheckStats + { + boost::circular_buffer d_lastResults; + time_t d_nextCheck{0}; + enum class LazyStatus: uint8_t { Healthy = 0, PotentialFailure, Failed }; + LazyStatus d_status{LazyStatus::Healthy}; + }; + LockGuarded d_lazyHealthCheckStats; + public: std::shared_ptr d_tlsCtx{nullptr}; std::vector sockets; @@ -926,6 +943,12 @@ public: void setAuto() { d_config.availability = Availability::Auto; } + void setLazyAuto() { + d_config.availability = Availability::Lazy; + d_lazyHealthCheckStats.lock()->d_lastResults.set_capacity(d_config.d_lazyHealthChecksSampleSize); + } + bool healthCheckRequired(); + const string& getName() const { return d_config.name; } @@ -1004,10 +1027,13 @@ public: bool passCrossProtocolQuery(std::unique_ptr&& cpq); int pickSocketForSending(); void pickSocketsReadyForReceiving(std::vector& ready); - void handleTimeouts(); + void handleUDPTimeouts(); IDState* getIDState(unsigned int& id, int64_t& generation); IDState* getExistingState(unsigned int id); void releaseState(unsigned int id); + void reportTimeoutOrError(); + void reportResponse(uint8_t rcode); + void submitHealthCheckResult(bool initial, bool newState); dnsdist::Protocol getProtocol() const { @@ -1027,7 +1053,7 @@ public: static bool s_randomizeSockets; static bool s_randomizeIDs; private: - void handleTimeout(IDState& ids); + void handleUDPTimeout(IDState& ids); }; using servers_t = vector>; diff --git a/pdns/dnsdistdist/dnsdist-backend.cc b/pdns/dnsdistdist/dnsdist-backend.cc index dfebb8efca..8ba61774fc 100644 --- a/pdns/dnsdistdist/dnsdist-backend.cc +++ b/pdns/dnsdistdist/dnsdist-backend.cc @@ -197,6 +197,11 @@ DownstreamState::DownstreamState(DownstreamState::Config&& config, std::shared_p setWeight(d_config.d_weight); } + if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthChecksSampleSize > 0) { + d_lazyHealthCheckStats.lock()->d_lastResults.set_capacity(d_config.d_lazyHealthChecksSampleSize); + setUpStatus(true); + } + setName(d_config.name); if (d_tlsCtx) { @@ 
-320,7 +325,7 @@ static bool isIDSExpired(IDState& ids) return age > DownstreamState::s_udpTimeout; } -void DownstreamState::handleTimeout(IDState& ids) +void DownstreamState::handleUDPTimeout(IDState& ids) { /* We mark the state as unused as soon as possible to limit the risk of racing with the @@ -349,16 +354,33 @@ void DownstreamState::handleTimeout(IDState& ids) g_rings.insertResponse(ts, ids.origRemote, ids.qname, ids.qtype, std::numeric_limits<unsigned int>::max(), 0, fake, d_config.remote, getProtocol()); } + + reportTimeoutOrError(); +} + +void DownstreamState::reportResponse(uint8_t rcode) +{ + if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthChecksSampleSize > 0) { + bool failure = d_config.d_lazyHealthChecksMode == LazyHealthCheckMode::TimeoutOrServFail ? rcode == RCode::ServFail : false; + d_lazyHealthCheckStats.lock()->d_lastResults.push_back(failure); + } } -void DownstreamState::handleTimeouts() +void DownstreamState::reportTimeoutOrError() +{ + if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthChecksSampleSize > 0) { + d_lazyHealthCheckStats.lock()->d_lastResults.push_back(true); + } +} + +void DownstreamState::handleUDPTimeouts() { if (s_randomizeIDs) { auto map = d_idStatesMap.lock(); for (auto it = map->begin(); it != map->end(); ) { auto& ids = it->second; if (isIDSExpired(ids)) { - handleTimeout(ids); + handleUDPTimeout(ids); it = map->erase(it); continue; } @@ -376,7 +398,7 @@ void DownstreamState::handleTimeouts() continue; } - handleTimeout(ids); + handleUDPTimeout(ids); } } } @@ -475,6 +497,128 @@ IDState* DownstreamState::getIDState(unsigned int& selectedID, int64_t& generation) return ids; } +bool DownstreamState::healthCheckRequired() +{ + if (d_config.availability == DownstreamState::Availability::Lazy) { + auto stats = d_lazyHealthCheckStats.lock(); + if (stats->d_status == LazyHealthCheckStats::LazyStatus::PotentialFailure) { + return true; + } + if (stats->d_status == LazyHealthCheckStats::LazyStatus::Failed) { + auto now = time(nullptr); + if (stats->d_nextCheck <= now) { + stats->d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval; + return true; + } + return false; + } + if (stats->d_status == LazyHealthCheckStats::LazyStatus::Healthy) { + auto& lastResults = stats->d_lastResults; + size_t totalCount = lastResults.size(); + if (totalCount < d_config.d_lazyHealthChecksMinSampleCount) { + return false; + } + + size_t failures = 0; + for (const auto& result : lastResults) { + if (result) { + ++failures; + } + } + + const auto maxFailureRate = static_cast<double>(d_config.d_lazyHealthChecksThreshold); + if (((100.0 * failures) / totalCount) >= maxFailureRate) { + lastResults.clear(); + stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure; + auto now = time(nullptr); + stats->d_nextCheck = now; + return true; + } + } + + return false; + } + else if (d_config.availability == DownstreamState::Availability::Auto) { + if (d_nextCheck > 1) { + --d_nextCheck; + return false; + } + + d_nextCheck = d_config.checkInterval; + return true; + } + + return false; +} + +void DownstreamState::submitHealthCheckResult(bool initial, bool newState) +{ + if (initial) { + warnlog("Marking downstream %s as '%s'", getNameWithAddr(), newState ?
"up" : "down"); + setUpStatus(newState); + return; + } + + if (newState) { + /* check succeeded */ + currentCheckFailures = 0; + + if (!upStatus) { + /* we were marked as down */ + consecutiveSuccessfulChecks++; + if (consecutiveSuccessfulChecks < d_config.minRiseSuccesses) { + /* if we need more than one successful check to rise + and we didn't reach the threshold yet, + let's stay down */ + newState = false; + } + } + if (newState) { + if (d_config.availability == DownstreamState::Availability::Lazy) { + auto stats = d_lazyHealthCheckStats.lock(); + stats->d_status = LazyHealthCheckStats::LazyStatus::Healthy; + } + } + } + else { + /* check failed */ + consecutiveSuccessfulChecks = 0; + + if (upStatus) { + /* we are currently up */ + currentCheckFailures++; + if (currentCheckFailures < d_config.maxCheckFailures) { + /* we need more than one failure to be marked as down, + and we did not reach the threshold yet, let's stay up */ + newState = true; + } + else if (d_config.availability == DownstreamState::Availability::Lazy) { + auto stats = d_lazyHealthCheckStats.lock(); + stats->d_status = LazyHealthCheckStats::LazyStatus::Failed; + auto now = time(nullptr); + stats->d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval; + } + } + } + + if (newState != upStatus) { + warnlog("Marking downstream %s as '%s'", getNameWithAddr(), newState ? "up" : "down"); + + if (newState && !isTCPOnly() && (!connected || d_config.reconnectOnUp)) { + newState = reconnect(); + start(); + } + + setUpStatus(newState); + currentCheckFailures = 0; + consecutiveSuccessfulChecks = 0; + if (g_snmpAgent && g_snmpTrapsEnabled) { + g_snmpAgent->sendBackendStatusChangeTrap(*this); + } + } +} + size_t ServerPool::countServers(bool upOnly) { std::shared_ptr servers = nullptr; diff --git a/pdns/dnsdistdist/dnsdist-healthchecks.cc b/pdns/dnsdistdist/dnsdist-healthchecks.cc index cceaa16bc5..1815d8bf1b 100644 --- a/pdns/dnsdistdist/dnsdist-healthchecks.cc +++ b/pdns/dnsdistdist/dnsdist-healthchecks.cc @@ -55,61 +55,6 @@ struct HealthCheckData bool d_initial{false}; }; -void updateHealthCheckResult(const std::shared_ptr& dss, bool initial, bool newState) -{ - if (initial) { - warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down"); - dss->setUpStatus(newState); - return; - } - - if (newState) { - /* check succeeded */ - dss->currentCheckFailures = 0; - - if (!dss->upStatus) { - /* we were marked as down */ - dss->consecutiveSuccessfulChecks++; - if (dss->consecutiveSuccessfulChecks < dss->d_config.minRiseSuccesses) { - /* if we need more than one successful check to rise - and we didn't reach the threshold yet, - let's stay down */ - newState = false; - } - } - } - else { - /* check failed */ - dss->consecutiveSuccessfulChecks = 0; - - if (dss->upStatus) { - /* we are currently up */ - dss->currentCheckFailures++; - if (dss->currentCheckFailures < dss->d_config.maxCheckFailures) { - /* we need more than one failure to be marked as down, - and we did not reach the threshold yet, let's stay up */ - newState = true; - } - } - } - - if (newState != dss->upStatus) { - warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? 
"up" : "down"); - - if (newState && !dss->isTCPOnly() && (!dss->connected || dss->d_config.reconnectOnUp)) { - newState = dss->reconnect(); - dss->start(); - } - - dss->setUpStatus(newState); - dss->currentCheckFailures = 0; - dss->consecutiveSuccessfulChecks = 0; - if (g_snmpAgent && g_snmpTrapsEnabled) { - g_snmpAgent->sendBackendStatusChangeTrap(dss); - } - } -} - static bool handleResponse(std::shared_ptr& data) { auto& ds = data->d_ds; @@ -203,7 +148,7 @@ public: void handleResponse(const struct timeval& now, TCPResponse&& response) override { d_data->d_buffer = std::move(response.d_buffer); - updateHealthCheckResult(d_data->d_ds, d_data->d_initial, ::handleResponse(d_data)); + d_data->d_ds->submitHealthCheckResult(d_data->d_initial, ::handleResponse(d_data)); } void handleXFRResponse(const struct timeval& now, TCPResponse&& response) override @@ -213,7 +158,7 @@ public: void notifyIOError(IDState&& query, const struct timeval& now) override { - updateHealthCheckResult(d_data->d_ds, d_data->d_initial, false); + d_data->d_ds->submitHealthCheckResult(d_data->d_initial, false); } private: @@ -234,7 +179,7 @@ static void healthCheckUDPCallback(int fd, FDMultiplexer::funcparam_t& param) if (g_verboseHealthChecks) { infolog("Error receiving health check response from %s: %s", data->d_ds->d_config.remote.toStringWithPort(), stringerror()); } - updateHealthCheckResult(data->d_ds, data->d_initial, false); + data->d_ds->submitHealthCheckResult(data->d_initial, false); return; } @@ -243,11 +188,11 @@ static void healthCheckUDPCallback(int fd, FDMultiplexer::funcparam_t& param) if (g_verboseHealthChecks) { infolog("Invalid health check response received from %s, expecting one from %s", from.toStringWithPort(), data->d_ds->d_config.remote.toStringWithPort()); } - updateHealthCheckResult(data->d_ds, data->d_initial, false); + data->d_ds->submitHealthCheckResult(data->d_initial, false); return; } - updateHealthCheckResult(data->d_ds, data->d_initial, handleResponse(data)); + data->d_ds->submitHealthCheckResult(data->d_initial, handleResponse(data)); } static void healthCheckTCPCallback(int fd, FDMultiplexer::funcparam_t& param) @@ -281,7 +226,7 @@ static void healthCheckTCPCallback(int fd, FDMultiplexer::funcparam_t& param) if (data->d_tcpState == HealthCheckData::TCPState::ReadingResponse) { ioState = data->d_tcpHandler->tryRead(data->d_buffer, data->d_bufferPos, data->d_buffer.size()); if (ioState == IOState::Done) { - updateHealthCheckResult(data->d_ds, data->d_initial, handleResponse(data)); + data->d_ds->submitHealthCheckResult(data->d_initial, handleResponse(data)); } } @@ -308,13 +253,13 @@ static void healthCheckTCPCallback(int fd, FDMultiplexer::funcparam_t& param) ioGuard.release(); } catch (const std::exception& e) { - updateHealthCheckResult(data->d_ds, data->d_initial, false); + data->d_ds->submitHealthCheckResult(data->d_initial, false); if (g_verboseHealthChecks) { infolog("Error checking the health of backend %s: %s", data->d_ds->getNameWithAddr(), e.what()); } } catch (...) 
{ - updateHealthCheckResult(data->d_ds, data->d_initial, false); + data->d_ds->submitHealthCheckResult(data->d_initial, false); if (g_verboseHealthChecks) { infolog("Unknown exception while checking the health of backend %s", data->d_ds->getNameWithAddr()); } @@ -418,7 +363,7 @@ bool queueHealthCheck(std::unique_ptr& mplexer, const std::shared query.d_proxyProtocolPayload = std::move(proxyProtocolPayload); auto sender = std::shared_ptr(new HealthCheckQuerySender(data)); if (!sendH2Query(ds, mplexer, sender, std::move(query), true)) { - updateHealthCheckResult(data->d_ds, data->d_initial, false); + data->d_ds->submitHealthCheckResult(data->d_initial, false); } } else { @@ -503,7 +448,7 @@ void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial) infolog("Timeout while waiting for the health check response from backend %s", data->d_ds->getNameWithAddr()); } - updateHealthCheckResult(data->d_ds, initial, false); + data->d_ds->submitHealthCheckResult(initial, false); } catch (const std::exception& e) { if (g_verboseHealthChecks) { @@ -529,7 +474,7 @@ void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial) infolog("Timeout while waiting for the health check response from backend %s", data->d_ds->getNameWithAddr()); } - updateHealthCheckResult(data->d_ds, initial, false); + data->d_ds->submitHealthCheckResult(initial, false); } catch (const std::exception& e) { if (g_verboseHealthChecks) { diff --git a/pdns/dnsdistdist/dnsdist-healthchecks.hh b/pdns/dnsdistdist/dnsdist-healthchecks.hh index 04af75b0db..825961e130 100644 --- a/pdns/dnsdistdist/dnsdist-healthchecks.hh +++ b/pdns/dnsdistdist/dnsdist-healthchecks.hh @@ -27,7 +27,6 @@ extern bool g_verboseHealthChecks; -void updateHealthCheckResult(const std::shared_ptr& dss, bool initial, bool newState); bool queueHealthCheck(std::unique_ptr& mplexer, const std::shared_ptr& ds, bool initial=false); void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial=false); diff --git a/pdns/dnsdistdist/dnsdist-nghttp2.cc b/pdns/dnsdistdist/dnsdist-nghttp2.cc index f9311b8da2..87065b9a77 100644 --- a/pdns/dnsdistdist/dnsdist-nghttp2.cc +++ b/pdns/dnsdistdist/dnsdist-nghttp2.cc @@ -138,6 +138,14 @@ void DoHConnectionToBackend::handleResponse(PendingRequest&& request) if (!d_healthCheckQuery) { const double udiff = request.d_query.d_idstate.sentTime.udiff(); d_ds->updateTCPLatency(udiff); + if (request.d_buffer.size() >= sizeof(dnsheader)) { + dnsheader dh; + memcpy(&dh, request.d_buffer.data(), sizeof(dh)); + d_ds->reportResponse(dh.rcode); + } + else { + d_ds->reportTimeoutOrError(); + } } request.d_sender->handleResponse(now, TCPResponse(std::move(request.d_buffer), std::move(request.d_query.d_idstate), shared_from_this())); @@ -150,6 +158,8 @@ void DoHConnectionToBackend::handleResponse(PendingRequest&& request) void DoHConnectionToBackend::handleResponseError(PendingRequest&& request, const struct timeval& now) { try { + d_ds->reportTimeoutOrError(); + request.d_sender->notifyIOError(std::move(request.d_query.d_idstate), now); } catch (const std::exception& e) { diff --git a/pdns/dnsdistdist/dnsdist-tcp-downstream.cc b/pdns/dnsdistdist/dnsdist-tcp-downstream.cc index eb8d8f45e6..a6ba8f44d7 100644 --- a/pdns/dnsdistdist/dnsdist-tcp-downstream.cc +++ b/pdns/dnsdistdist/dnsdist-tcp-downstream.cc @@ -522,6 +522,7 @@ void TCPConnectionToBackend::handleTimeout(const struct timeval& now, bool write void TCPConnectionToBackend::notifyAllQueriesFailed(const struct timeval& now, FailureReason reason) { d_connectionDied = true; + 
d_ds->reportTimeoutOrError(); /* we might be terminated while notifying a query sender */ d_ds->outstanding -= d_pendingResponses.size(); @@ -669,6 +670,14 @@ IOState TCPConnectionToBackend::handleResponse(std::shared_ptrsecond.d_query.d_idstate); const double udiff = ids.sentTime.udiff(); conn->d_ds->updateTCPLatency(udiff); + if (d_responseBuffer.size() >= sizeof(dnsheader)) { + dnsheader dh; + memcpy(&dh, d_responseBuffer.data(), sizeof(dh)); + conn->d_ds->reportResponse(dh.rcode); + } + else { + conn->d_ds->reportTimeoutOrError(); + } d_pendingResponses.erase(it); /* marking as idle for now, so we can accept new queries if our queues are empty */ diff --git a/pdns/dnsdistdist/dnsdist-tsan.supp b/pdns/dnsdistdist/dnsdist-tsan.supp index f76eb6390e..f9c1d984d6 100644 --- a/pdns/dnsdistdist/dnsdist-tsan.supp +++ b/pdns/dnsdistdist/dnsdist-tsan.supp @@ -14,5 +14,5 @@ race:DownstreamState::setAuto # Same thing for whether a backend has been stopped, # eventual consistency is fine race:DownstreamState::stop -race:updateHealthCheckResult +race:DownstreamState::submitHealthCheckResult race:carbonDumpThread diff --git a/pdns/dnsdistdist/docs/guides/downstreams.rst b/pdns/dnsdistdist/docs/guides/downstreams.rst index 4f4e3e207b..a1550b7a05 100644 --- a/pdns/dnsdistdist/docs/guides/downstreams.rst +++ b/pdns/dnsdistdist/docs/guides/downstreams.rst @@ -25,21 +25,10 @@ It works as well for authoritative as for recursive servers. Healthcheck ----------- -dnsdist uses a health check, sent once every second, to determine the availability of a backend server. -By default, an A query for "a.root-servers.net." is sent. -A different query type, class and target can be specified by passing, respectively, the ``checkType``, ``checkClass`` and ``checkName`` parameters to :func:`newServer`. +dnsdist uses health-check queries, sent once every second, to determine the availability of a backend server. Since 1.8.0, it also supports a ``lazy`` health-checking mode which only sends active health-check queries after a configurable threshold of regular queries have failed, see below. -The default behavior is to consider any valid response with an RCODE different from ServFail as valid. -If the ``mustResolve`` parameter of :func:`newServer` is set to ``true``, a response will only be considered valid if its RCODE differs from NXDomain, ServFail and Refused. - -The number of health check failures before a server is considered down is configurable via the ``maxCheckFailures`` parameter, defaulting to 1. -The CD flag can be set on the query by setting ``setCD`` to true. -e.g.:: - - newServer({address="192.0.2.1", checkType="AAAA", checkType=DNSClass.CHAOS, checkName="a.root-servers.net.", mustResolve=true}) - -You can turn on logging of health check errors using the :func:`setVerboseHealthChecks` function. +By default, an ``A`` query for the "a.root-servers.net." name is sent. A different query type, class and target can be specified by passing, respectively, the ``checkType``, ``checkClass`` and ``checkName`` parameters to :func:`newServer`. The interval between two health-check queries can be set via the ``checkInterval`` interval parameter, and the amount of time for a response to be received via the ``checkTimeout`` one. Since the 1.3.0 release, the ``checkFunction`` option is also supported, taking a ``Lua`` function as parameter. 
This function receives a DNSName, two integers and a ``DNSHeader`` object (:ref:`DNSHeader`) representing the QName, QType and QClass of the health check query as well as the DNS header, as they are defined before the function was called. The function must return a DNSName and two integers @@ -57,6 +46,41 @@ The following example sets the CD flag to true and change the QName to "powerdns newServer({address="2620:0:0ccd::2", checkFunction=myHealthCheck}) +The default behavior is to consider any valid response with an ``RCODE`` different from ``ServFail`` as valid. +If the ``mustResolve`` parameter of :func:`newServer` is set to ``true``, a response will only be considered valid if its ``RCODE`` differs from ``NXDomain``, ``ServFail`` and ``Refused``. + +The number of health check failures before a server is considered down is configurable via the ``maxCheckFailures`` parameter, defaulting to 1. In the same way, the number of consecutive successful health checks needed for a server to be considered available can be set via the ``rise`` parameter, defaulting to 1. + +The ``CD`` flag can be set on the query by setting ``setCD`` to true. +e.g.:: + + newServer({address="192.0.2.1", checkType="AAAA", checkClass=DNSClass.CHAOS, checkName="a.root-servers.net.", mustResolve=true}) + +You can turn on logging of health check errors using the :func:`setVerboseHealthChecks` function. + +Lazy health-checking +~~~~~~~~~~~~~~~~~~~~ + +In some setups, especially on low-end devices, it might not make sense to actively send queries to the backend at a regular interval. The feedback from regular queries can instead be used to infer whether a backend might not be working properly. + +Since 1.8.0, dnsdist implements a ``lazy`` mode that can be set via the ``healthCheckMode`` option on :func:`newServer`. In that mode dnsdist will only send active health-check queries after seeing a configurable proportion of regular queries fail. It will then place the backend in a ``PotentialFailure`` state, from the initial ``Healthy`` one, and send health-check queries every ``checkInterval`` seconds. If ``maxCheckFailures`` of these fail, the backend is then moved to a ``Failed`` state and marked as ``down``, and active health-check queries are sent every ``lazyHealthCheckFailedInterval`` seconds. After ``rise`` consecutive successful health-check queries, the backend will be moved back to the ``Healthy`` state and marked as ``up`` again, and health-check queries will stop. + +.. figure:: ../imgs/DNSDistLazyHealthChecks.png + :align: center + :alt: DNSDist Lazy health checks + +The threshold of failed regular queries is configured via ``lazyHealthCheckThreshold``, indicating the percentage of recent regular queries that should have resulted in a failure. Only the results of the last ``lazyHealthCheckSampleSize`` queries will be considered, as the results are kept in an in-memory circular buffer. The results of at least ``lazyHealthCheckMinSampleCount`` queries should be present for the threshold to be considered meaningful, to avoid acting on a sample that is too small. + +By default both queries that resulted in a timeout and those that received a ``ServFail`` answer are considered failures, but it is possible to set ``lazyHealthCheckMode`` to ``TimeoutOnly`` so that only timeouts are considered failures.
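+The scenario walked through below corresponds to a configuration along these lines (a sketch: the address is a placeholder, and every option shown is one of the parameters documented on this page):: + + newServer({address="192.0.2.1:53", healthCheckMode='lazy', maxCheckFailures=2, rise=2, lazyHealthCheckSampleSize=100, lazyHealthCheckMinSampleCount=10, lazyHealthCheckThreshold=30, lazyHealthCheckFailedInterval=30})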
+ +So for example, if we set ``healthCheckMode`` to ``lazy``, ``lazyHealthCheckSampleSize`` to 100, ``lazyHealthCheckMinSampleCount`` to 10, ``lazyHealthCheckThreshold`` to 30, ``maxCheckFailures`` to 2 and ``rise`` to 2: + +- nothing will happen until at least 10 queries have been received +- only the results of the last 100 queries will be considered +- if at least 30 of these last 100 have failed, the threshold will be reached and active health-check queries will be sent every ``checkInterval`` seconds +- if the health-check query is successful, the backend will stay ``up`` and no more query will be sent +- but if instead two consecutive queries fail, the backend will be marked as ``down`` and health-check queries will be sent every ``lazyHealthCheckFailedInterval`` seconds + Source address selection ------------------------ diff --git a/pdns/dnsdistdist/docs/imgs/DNSDistLazyHealthChecks.png b/pdns/dnsdistdist/docs/imgs/DNSDistLazyHealthChecks.png new file mode 100644 index 0000000000..d05157dee7 Binary files /dev/null and b/pdns/dnsdistdist/docs/imgs/DNSDistLazyHealthChecks.png differ diff --git a/pdns/dnsdistdist/docs/reference/config.rst b/pdns/dnsdistdist/docs/reference/config.rst index 04974086e3..e0441e690d 100644 --- a/pdns/dnsdistdist/docs/reference/config.rst +++ b/pdns/dnsdistdist/docs/reference/config.rst @@ -561,7 +561,7 @@ Servers Added ``addXForwardedHeaders``, ``caStore``, ``checkTCP``, ``ciphers``, ``ciphers13``, ``dohPath``, ``enableRenegotiation``, ``releaseBuffers``, ``subjectName``, ``tcpOnly``, ``tls`` and ``validateCertificates`` to server_table. .. versionchanged:: 1.8.0 - Added ``autoUpgrade``, ``autoUpgradeDoHKey``, ``autoUpgradeInterval``, ``autoUpgradeKeep``, ``autoUpgradePool``, ``maxConcurrentTCPConnections`` and ``subjectAddr`` to server_table. + Added ``autoUpgrade``, ``autoUpgradeDoHKey``, ``autoUpgradeInterval``, ``autoUpgradeKeep``, ``autoUpgradePool``, ``maxConcurrentTCPConnections``, ``subjectAddr``, ``lazyHealthCheckSampleSize``, ``lazyHealthCheckMinSampleCount``, ``lazyHealthCheckThreshold``, ``lazyHealthCheckFailedInterval``, ``lazyHealthCheckMode`` and ``healthCheckMode`` to server_table. Add a new backend server. Call this function with either a string:: @@ -572,62 +572,68 @@ Servers or a table:: newServer({ - address="IP:PORT", -- IP and PORT of the backend server (mandatory) - id=STRING, -- Use a pre-defined UUID instead of a random one - qps=NUM, -- Limit the number of queries per second to NUM, when using the `firstAvailable` policy - order=NUM, -- The order of this server, used by the `leastOutstanding` and `firstAvailable` policies - weight=NUM, -- The weight of this server, used by the `wrandom`, `whashed` and `chashed` policies, default: 1 - -- Supported values are a minimum of 1, and a maximum of 2147483647. 
- pool=STRING|{STRING}, -- The pools this server belongs to (unset or empty string means default pool) as a string or table of strings - retries=NUM, -- The number of TCP connection attempts to the backend, for a given query - tcpConnectTimeout=NUM, -- The timeout (in seconds) of a TCP connection attempt - tcpSendTimeout=NUM, -- The timeout (in seconds) of a TCP write attempt - tcpRecvTimeout=NUM, -- The timeout (in seconds) of a TCP read attempt - tcpFastOpen=BOOL, -- Whether to enable TCP Fast Open - ipBindAddrNoPort=BOOL, -- Whether to enable IP_BIND_ADDRESS_NO_PORT if available, default: true - name=STRING, -- The name associated to this backend, for display purpose - checkClass=NUM, -- Use NUM as QCLASS in the health-check query, default: DNSClass.IN - checkName=STRING, -- Use STRING as QNAME in the health-check query, default: "a.root-servers.net." - checkType=STRING, -- Use STRING as QTYPE in the health-check query, default: "A" - checkFunction=FUNCTION, -- Use this function to dynamically set the QNAME, QTYPE and QCLASS to use in the health-check query (see :ref:`Healthcheck`) - checkTimeout=NUM, -- The timeout (in milliseconds) of a health-check query, default: 1000 (1s) - setCD=BOOL, -- Set the CD (Checking Disabled) flag in the health-check query, default: false - maxCheckFailures=NUM, -- Allow NUM check failures before declaring the backend down, default: 1 - checkInterval=NUM -- The time in seconds between health checks - mustResolve=BOOL, -- Set to true when the health check MUST return a RCODE different from NXDomain, ServFail and Refused. Default is false, meaning that every RCODE except ServFail is considered valid - useClientSubnet=BOOL, -- Add the client's IP address in the EDNS Client Subnet option when forwarding the query to this backend - source=STRING, -- The source address or interface to use for queries to this backend, by default this is left to the kernel's address selection - -- The following formats are supported: - -- "address", e.g. "192.0.2.2" - -- "interface name", e.g. "eth0" - -- "address@interface", e.g. "192.0.2.2@eth0" - addXPF=NUM, -- Add the client's IP address and port to the query, along with the original destination address and port, - -- using the experimental XPF record from `draft-bellis-dnsop-xpf `_ and the specified option code. Default is disabled (0). This is a deprecated feature that will be removed in the near future. - sockets=NUM, -- Number of UDP sockets (and thus source ports) used toward the backend server, defaults to a single one. Note that for backends which are multithreaded, this setting will have an effect on the number of cores that will be used to process traffic from dnsdist. For example you may want to set 'sockets' to a number somewhat higher than the number of worker threads configured in the backend, particularly if the Linux kernel is being used to distribute traffic to multiple threads listening on the same socket (via `reuseport`). - disableZeroScope=BOOL, -- Disable the EDNS Client Subnet 'zero scope' feature, which does a cache lookup for an answer valid for all subnets (ECS scope of 0) before adding ECS information to the query and doing the regular lookup. This requires the ``parseECS`` option of the corresponding cache to be set to true - rise=NUM, -- Require NUM consecutive successful checks before declaring the backend up, default: 1 - useProxyProtocol=BOOL, -- Add a proxy protocol header to the query, passing along the client's IP address and port along with the original destination address and port. 
Default is disabled. - reconnectOnUp=BOOL, -- Close and reopen the sockets when a server transits from Down to Up. This helps when an interface is missing when dnsdist is started. Default is disabled. - maxInFlight=NUM, -- Maximum number of in-flight queries. The default is 0, which disables out-of-order processing. It should only be enabled if the backend does support out-of-order processing. As of 1.6.0, out-of-order processing needs to be enabled on the frontend as well, via :func:`addLocal` and/or :func:`addTLSLocal`. Note that out-of-order is always enabled on DoH frontends. - tcpOnly=BOOL, -- Always forward queries to that backend over TCP, never over UDP. Always enabled for TLS backends. Default is false. - checkTCP=BOOL, -- Whether to do healthcheck queries over TCP, instead of UDP. Always enabled for DNS over TLS backend. Default is false. - tls=STRING, -- Enable DNS over TLS communications for this backend, or DNS over HTTPS if ``dohPath`` is set, using the TLS provider ("openssl" or "gnutls") passed in parameter. Default is an empty string, which means this backend is used for plain UDP and TCP. - caStore=STRING, -- Specifies the path to the CA certificate file, in PEM format, to use to check the certificate presented by the backend. Default is an empty string, which means to use the system CA store. Note that this directive is only used if ``validateCertificates`` is set. - ciphers=STRING, -- The TLS ciphers to use. The exact format depends on the provider used. When the OpenSSL provider is used, ciphers for TLS 1.3 must be specified via ``ciphersTLS13``. - ciphersTLS13=STRING, -- The ciphers to use for TLS 1.3, when the OpenSSL provider is used. When the GnuTLS provider is used, ``ciphers`` applies regardless of the TLS protocol and this setting is not used. - subjectName=STRING, -- The subject name passed in the SNI value of the TLS handshake, and against which to validate the certificate presented by the backend. Default is empty. If set this value supersedes any ``subjectAddr`` one. - subjectAddr=STRING, -- The subject IP address passed in the SNI value of the TLS handshake, and against which to validate the certificate presented by the backend. Default is empty. - validateCertificates=BOOL, -- Whether the certificate presented by the backend should be validated against the CA store (see ``caStore``). Default is true. - dohPath=STRING, -- Enable DNS over HTTPS communication for this backend, using POST queries to the HTTP host supplied as ``subjectName`` and the HTTP path supplied in this parameter. - addXForwardedHeaders=BOOL, -- Whether to add X-Forwarded-For, X-Forwarded-Port and X-Forwarded-Proto headers to a DNS over HTTPS backend. - releaseBuffers=BOOL, -- Whether OpenSSL should release its I/O buffers when a connection goes idle, saving roughly 35 kB of memory per connection. Default to true. - enableRenegotiation=BOOL, -- Whether secure TLS renegotiation should be enabled. Disabled by default since it increases the attack surface and is seldom used for DNS. - autoUpgrade=BOOL, -- Whether to use the 'Discovery of Designated Resolvers' mechanism to automatically upgrade a Do53 backend to DoT or DoH, depending on the priorities present in the SVCB record returned by the backend. Default to false. - autoUpgradeInterval=NUM, -- If ``autoUpgrade`` is set, how often to check if an upgrade is available, in seconds. Default is 3600 seconds. - autoUpgradeKeep=BOOL, -- If ``autoUpgrade`` is set, whether to keep the existing Do53 backend around after an upgrade. 
Default is false which means the Do53 backend will be replaced by the upgraded one. - autoUpgradePool=STRING, -- If ``autoUpgrade`` is set, in which pool to place the newly upgraded backend. Default is empty which means the backend is placed in the default pool. - autoUpgradeDoHKey=NUM, -- If ``autoUpgrade`` is set, the value to use for the SVC key corresponding to the DoH path. Default is 7. - maxConcurrentTCPConnections=NUM -- Maximum number of TCP connections to that backend. When that limit is reached, queries routed to that backend that cannot be forwarded over an existing connection will be dropped. Default is 0 which means no limit. + address="IP:PORT", -- IP and PORT of the backend server (mandatory) + id=STRING, -- Use a pre-defined UUID instead of a random one + qps=NUM, -- Limit the number of queries per second to NUM, when using the `firstAvailable` policy + order=NUM, -- The order of this server, used by the `leastOutstanding` and `firstAvailable` policies + weight=NUM, -- The weight of this server, used by the `wrandom`, `whashed` and `chashed` policies, default: 1 + -- Supported values are a minimum of 1, and a maximum of 2147483647. + pool=STRING|{STRING}, -- The pools this server belongs to (unset or empty string means default pool) as a string or table of strings + retries=NUM, -- The number of TCP connection attempts to the backend, for a given query + tcpConnectTimeout=NUM, -- The timeout (in seconds) of a TCP connection attempt + tcpSendTimeout=NUM, -- The timeout (in seconds) of a TCP write attempt + tcpRecvTimeout=NUM, -- The timeout (in seconds) of a TCP read attempt + tcpFastOpen=BOOL, -- Whether to enable TCP Fast Open + ipBindAddrNoPort=BOOL, -- Whether to enable IP_BIND_ADDRESS_NO_PORT if available, default: true + name=STRING, -- The name associated to this backend, for display purpose + checkClass=NUM, -- Use NUM as QCLASS in the health-check query, default: DNSClass.IN + checkName=STRING, -- Use STRING as QNAME in the health-check query, default: "a.root-servers.net." + checkType=STRING, -- Use STRING as QTYPE in the health-check query, default: "A" + checkFunction=FUNCTION, -- Use this function to dynamically set the QNAME, QTYPE and QCLASS to use in the health-check query (see :ref:`Healthcheck`) + checkTimeout=NUM, -- The timeout (in milliseconds) of a health-check query, default: 1000 (1s) + setCD=BOOL, -- Set the CD (Checking Disabled) flag in the health-check query, default: false + maxCheckFailures=NUM, -- Allow NUM check failures before declaring the backend down, default: 1 + checkInterval=NUM -- The time in seconds between health checks + mustResolve=BOOL, -- Set to true when the health check MUST return a RCODE different from NXDomain, ServFail and Refused. Default is false, meaning that every RCODE except ServFail is considered valid + useClientSubnet=BOOL, -- Add the client's IP address in the EDNS Client Subnet option when forwarding the query to this backend + source=STRING, -- The source address or interface to use for queries to this backend, by default this is left to the kernel's address selection + -- The following formats are supported: + -- "address", e.g. "192.0.2.2" + -- "interface name", e.g. "eth0" + -- "address@interface", e.g. "192.0.2.2@eth0" + addXPF=NUM, -- Add the client's IP address and port to the query, along with the original destination address and port, + -- using the experimental XPF record from `draft-bellis-dnsop-xpf `_ and the specified option code. Default is disabled (0). 
This is a deprecated feature that will be removed in the near future. + sockets=NUM, -- Number of UDP sockets (and thus source ports) used toward the backend server, defaults to a single one. Note that for backends which are multithreaded, this setting will have an effect on the number of cores that will be used to process traffic from dnsdist. For example you may want to set 'sockets' to a number somewhat higher than the number of worker threads configured in the backend, particularly if the Linux kernel is being used to distribute traffic to multiple threads listening on the same socket (via `reuseport`). + disableZeroScope=BOOL, -- Disable the EDNS Client Subnet 'zero scope' feature, which does a cache lookup for an answer valid for all subnets (ECS scope of 0) before adding ECS information to the query and doing the regular lookup. This requires the ``parseECS`` option of the corresponding cache to be set to true + rise=NUM, -- Require NUM consecutive successful checks before declaring the backend up, default: 1 + useProxyProtocol=BOOL, -- Add a proxy protocol header to the query, passing along the client's IP address and port along with the original destination address and port. Default is disabled. + reconnectOnUp=BOOL, -- Close and reopen the sockets when a server transits from Down to Up. This helps when an interface is missing when dnsdist is started. Default is disabled. + maxInFlight=NUM, -- Maximum number of in-flight queries. The default is 0, which disables out-of-order processing. It should only be enabled if the backend does support out-of-order processing. As of 1.6.0, out-of-order processing needs to be enabled on the frontend as well, via :func:`addLocal` and/or :func:`addTLSLocal`. Note that out-of-order is always enabled on DoH frontends. + tcpOnly=BOOL, -- Always forward queries to that backend over TCP, never over UDP. Always enabled for TLS backends. Default is false. + checkTCP=BOOL, -- Whether to do healthcheck queries over TCP, instead of UDP. Always enabled for DNS over TLS backend. Default is false. + tls=STRING, -- Enable DNS over TLS communications for this backend, or DNS over HTTPS if ``dohPath`` is set, using the TLS provider ("openssl" or "gnutls") passed in parameter. Default is an empty string, which means this backend is used for plain UDP and TCP. + caStore=STRING, -- Specifies the path to the CA certificate file, in PEM format, to use to check the certificate presented by the backend. Default is an empty string, which means to use the system CA store. Note that this directive is only used if ``validateCertificates`` is set. + ciphers=STRING, -- The TLS ciphers to use. The exact format depends on the provider used. When the OpenSSL provider is used, ciphers for TLS 1.3 must be specified via ``ciphersTLS13``. + ciphersTLS13=STRING, -- The ciphers to use for TLS 1.3, when the OpenSSL provider is used. When the GnuTLS provider is used, ``ciphers`` applies regardless of the TLS protocol and this setting is not used. + subjectName=STRING, -- The subject name passed in the SNI value of the TLS handshake, and against which to validate the certificate presented by the backend. Default is empty. If set this value supersedes any ``subjectAddr`` one. + subjectAddr=STRING, -- The subject IP address passed in the SNI value of the TLS handshake, and against which to validate the certificate presented by the backend. Default is empty. 
+ validateCertificates=BOOL, -- Whether the certificate presented by the backend should be validated against the CA store (see ``caStore``). Default is true. + dohPath=STRING, -- Enable DNS over HTTPS communication for this backend, using POST queries to the HTTP host supplied as ``subjectName`` and the HTTP path supplied in this parameter. + addXForwardedHeaders=BOOL, -- Whether to add X-Forwarded-For, X-Forwarded-Port and X-Forwarded-Proto headers to a DNS over HTTPS backend. + releaseBuffers=BOOL, -- Whether OpenSSL should release its I/O buffers when a connection goes idle, saving roughly 35 kB of memory per connection. Defaults to true. + enableRenegotiation=BOOL, -- Whether secure TLS renegotiation should be enabled. Disabled by default since it increases the attack surface and is seldom used for DNS. + autoUpgrade=BOOL, -- Whether to use the 'Discovery of Designated Resolvers' mechanism to automatically upgrade a Do53 backend to DoT or DoH, depending on the priorities present in the SVCB record returned by the backend. Defaults to false. + autoUpgradeInterval=NUM, -- If ``autoUpgrade`` is set, how often to check if an upgrade is available, in seconds. Default is 3600 seconds. + autoUpgradeKeep=BOOL, -- If ``autoUpgrade`` is set, whether to keep the existing Do53 backend around after an upgrade. Default is false which means the Do53 backend will be replaced by the upgraded one. + autoUpgradePool=STRING, -- If ``autoUpgrade`` is set, in which pool to place the newly upgraded backend. Default is empty which means the backend is placed in the default pool. + autoUpgradeDoHKey=NUM, -- If ``autoUpgrade`` is set, the value to use for the SVC key corresponding to the DoH path. Default is 7. + maxConcurrentTCPConnections=NUM, -- Maximum number of TCP connections to that backend. When that limit is reached, queries routed to that backend that cannot be forwarded over an existing connection will be dropped. Default is 0 which means no limit. + healthCheckMode=STRING, -- The health-check mode to use: 'auto' which sends health-check queries every ``checkInterval`` seconds, 'up' which considers the backend always available, 'down' which considers it always unavailable, and 'lazy' which only sends health-check queries after a configurable percentage of regular queries has failed (see ``lazyHealthCheckSampleSize``, ``lazyHealthCheckMinSampleCount``, ``lazyHealthCheckThreshold``, ``lazyHealthCheckFailedInterval`` and ``lazyHealthCheckMode`` for more information). Default is 'auto'. See :ref:`Healthcheck` for a more detailed explanation. + lazyHealthCheckFailedInterval=NUM, -- The interval, in seconds, between health-check queries in 'lazy' mode. These queries are only sent when a threshold of failing regular queries has been reached, and only ``maxCheckFailures`` of them are sent. Default is 30 seconds. + lazyHealthCheckMinSampleCount=NUM, -- The minimum number of regular queries that should have been recorded before the ``lazyHealthCheckThreshold`` threshold can be applied. Default is 1 which means only one query is needed. + lazyHealthCheckMode=STRING, -- The 'lazy' health-check mode: 'TimeoutOnly' means that only timeouts and I/O errors of regular queries will be considered for the ``lazyHealthCheckThreshold``, while 'TimeoutOrServFail' will also consider 'Server Failure' answers. Default is 'TimeoutOrServFail'. + lazyHealthCheckSampleSize=NUM, -- The maximum size of the sample of queries to record and consider for the ``lazyHealthCheckThreshold``.
Default is 100, which means the result (failure or success) of the last 100 queries will be considered. + lazyHealthCheckThreshold=NUM -- The threshold, as a percentage, of queries that should fail for the 'lazy' health-check to be triggered when ``healthCheckMode`` is set to ``lazy``. The default is 20, which means 20% of the last ``lazyHealthCheckSampleSize`` queries should fail for a health-check to be triggered. }) :param str server_string: A simple IP:PORT string. @@ -726,17 +732,27 @@ A server object returned by :func:`getServer` can be manipulated with these func :param bool status: Set the initial status of the server to ``up`` (true) or ``down`` (false) instead of using the last known status + .. method:: Server:setDown() + + Set the server in a ``DOWN`` state. + The server will not receive queries and the health checks are disabled + + .. method:: Server:setLazyAuto([status]) + + .. versionadded:: 1.8.0 + + Set the server in the 'lazy' health-check mode. + This will enable health-check queries, but only after a configurable threshold of failing regular queries has been reached, and + only until the backend is deemed healthy again. See :ref:`Healthcheck` for a more detailed explanation. + + :param bool status: Set the initial status of the server to ``up`` (true) or ``down`` (false) instead of using the last known status + .. method:: Server:setQPS(limit) Limit the queries per second for this server. :param int limit: The maximum number of queries per second - .. method:: Server:setDown() - - Set the server in an ``DOWN`` state. - The server will not receive queries and the health checks are disabled - .. method:: Server:setUp() Set the server in an ``UP`` state. diff --git a/pdns/test-dnsdist_cc.cc b/pdns/test-dnsdist_cc.cc index af5a0a1bb1..8fe793060d 100644 --- a/pdns/test-dnsdist_cc.cc +++ b/pdns/test-dnsdist_cc.cc @@ -37,6 +37,11 @@ #include "ednscookies.hh" #include "ednssubnet.hh" +bool DNSDistSNMPAgent::sendBackendStatusChangeTrap(DownstreamState const&) +{ + return false; +} + BOOST_AUTO_TEST_SUITE(test_dnsdist_cc) static const uint16_t ECSSourcePrefixV4 = 24;
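As a usage sketch: with this change applied, lazy health-checking can also be enabled at runtime from the dnsdist console, using the ``setLazyAuto`` binding registered in dnsdist-lua-bindings.cc above. ``getServer`` and ``showServers`` are pre-existing console functions; the backend index below is just an example::

    -- switch the first configured backend to lazy health-checking at runtime
    local srv = getServer(0)  -- assumes at least one backend has been declared
    srv:setLazyAuto()         -- enable lazy mode, keeping the last known status
    -- srv:setLazyAuto(true)  -- alternatively, force the initial status to 'up'
    showServers()             -- inspect the resulting backend states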