git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Fix exponential backoff computation in edge cases
author Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 26 Feb 2024 16:07:22 +0000 (17:07 +0100)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Tue, 19 Mar 2024 09:41:12 +0000 (10:41 +0100)
(cherry picked from commit a8856ce717f439832515440af5550ca9d51f4b7d)

pdns/dnsdistdist/dnsdist-backend.cc
pdns/dnsdistdist/test-dnsdistbackend_cc.cc

index 9c5da43e563554c4c8a950d9f4ba789fc2d8234a..030fbff11fa1e3ede50823393692523393d7f1b5 100644 (file)
@@ -692,6 +692,7 @@ bool DownstreamState::healthCheckRequired(std::optional<time_t> currentTime)
         lastResults.clear();
         vinfolog("Backend %s reached the lazy health-check threshold (%f%% out of %f%%, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures);
         stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure;
+        consecutiveSuccessfulChecks = 0;
         /* we update the next check time here because the check might time out,
            and we do not want to send a second check during that time unless
            the timer is actually very short */
@@ -751,7 +752,7 @@ void DownstreamState::updateNextLazyHealthCheck(LazyHealthCheckStats& stats, boo
 
       time_t backOff = d_config.d_lazyHealthCheckMaxBackOff;
       const ExponentialBackOffTimer backOffTimer(d_config.d_lazyHealthCheckMaxBackOff);
-      auto backOffCoeffTmp = backOffTimer.get(failedTests);
+      auto backOffCoeffTmp = backOffTimer.get(failedTests - 1);
       /* backOffCoeffTmp cannot be higher than d_config.d_lazyHealthCheckMaxBackOff */
       const auto backOffCoeff = static_cast<time_t>(backOffCoeffTmp);
       if ((std::numeric_limits<time_t>::max() / d_config.d_lazyHealthCheckFailedInterval) >= backOffCoeff) {
@@ -800,12 +801,12 @@ void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
   if (newResult) {
     /* check succeeded */
     currentCheckFailures = 0;
+    consecutiveSuccessfulChecks++;
 
     if (!upStatus) {
       /* we were previously marked as "down" and had a successful health-check,
          let's see if this is enough to move to the "up" state or if we need
          more successful health-checks for that */
-      consecutiveSuccessfulChecks++;
       if (consecutiveSuccessfulChecks < d_config.minRiseSuccesses) {
         /* we need more than one successful check to rise
            and we didn't reach the threshold yet, let's stay down */
@@ -846,7 +847,7 @@ void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
         auto stats = d_lazyHealthCheckStats.lock();
         vinfolog("Backend %s failed its health-check, moving from Potential failure to Failed", getNameWithAddr());
         stats->d_status = LazyHealthCheckStats::LazyStatus::Failed;
-        currentCheckFailures = 0;
+        currentCheckFailures = 1;
         updateNextLazyHealthCheck(*stats, false);
       }
     }
index 34745f4228be0cede626dff8fb128377663a6e5c..28bf5109a79b65ec494e337a967db00d86a1b46a 100644 (file)
@@ -264,8 +264,8 @@ BOOST_AUTO_TEST_CASE(test_LazyExponentialBackOff)
   BOOST_CHECK_EQUAL(ds.getStatus(), "down");
   BOOST_CHECK_EQUAL(ds.healthCheckRequired(currentTime), false);
   /* and the wait time between two checks will double every time a failure occurs */
-  BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures))));
-  BOOST_CHECK_EQUAL(ds.currentCheckFailures, 0U);
+  BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures - 1))));
+  BOOST_CHECK_EQUAL(ds.currentCheckFailures, 1U);
 
   /* so after 5 failures */
   const size_t nbFailures = 5;
@@ -274,8 +274,8 @@ BOOST_AUTO_TEST_CASE(test_LazyExponentialBackOff)
     BOOST_CHECK(ds.healthCheckRequired(currentTime));
     ds.submitHealthCheckResult(false, false);
   }
-  BOOST_CHECK_EQUAL(ds.currentCheckFailures, nbFailures);
-  BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures))));
+  BOOST_CHECK_EQUAL(ds.currentCheckFailures, nbFailures + 1);
+  BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures - 1))));
 
   /* we need minRiseSuccesses successful health-checks to go up */
   BOOST_REQUIRE(config.minRiseSuccesses >= 1);