From: Remi Gacogne Date: Mon, 26 Feb 2024 16:07:22 +0000 (+0100) Subject: dnsdist: Fix exponential backoff computation in edge cases X-Git-Tag: rec-5.1.0-alpha1~114^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a8856ce717f439832515440af5550ca9d51f4b7d;p=thirdparty%2Fpdns.git dnsdist: Fix exponential backoff computation in edge cases --- diff --git a/pdns/dnsdistdist/dnsdist-backend.cc b/pdns/dnsdistdist/dnsdist-backend.cc index 9c5da43e56..030fbff11f 100644 --- a/pdns/dnsdistdist/dnsdist-backend.cc +++ b/pdns/dnsdistdist/dnsdist-backend.cc @@ -692,6 +692,7 @@ bool DownstreamState::healthCheckRequired(std::optional<time_t> currentTime) lastResults.clear(); vinfolog("Backend %s reached the lazy health-check threshold (%f%% out of %f%%, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures); stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure; + consecutiveSuccessfulChecks = 0; /* we update the next check time here because the check might time out, and we do not want to send a second check during that time unless the timer is actually very short */ @@ -751,7 +752,7 @@ void DownstreamState::updateNextLazyHealthCheck(LazyHealthCheckStats& stats, boo time_t backOff = d_config.d_lazyHealthCheckMaxBackOff; const ExponentialBackOffTimer backOffTimer(d_config.d_lazyHealthCheckMaxBackOff); - auto backOffCoeffTmp = backOffTimer.get(failedTests); + auto backOffCoeffTmp = backOffTimer.get(failedTests - 1); /* backOffCoeffTmp cannot be higher than d_config.d_lazyHealthCheckMaxBackOff */ const auto backOffCoeff = static_cast<time_t>(backOffCoeffTmp); if ((std::numeric_limits<time_t>::max() / d_config.d_lazyHealthCheckFailedInterval) >= backOffCoeff) { @@ -800,12 +801,12 @@ void DownstreamState::submitHealthCheckResult(bool initial, bool newResult) if (newResult) { /* check succeeded */ currentCheckFailures = 0; + consecutiveSuccessfulChecks++; if (!upStatus) { /* 
we were previously marked as "down" and had a successful health-check, let's see if this is enough to move to the "up" state or if we need more successful health-checks for that */ - consecutiveSuccessfulChecks++; if (consecutiveSuccessfulChecks < d_config.minRiseSuccesses) { /* we need more than one successful check to rise and we didn't reach the threshold yet, let's stay down */ @@ -846,7 +847,7 @@ void DownstreamState::submitHealthCheckResult(bool initial, bool newResult) auto stats = d_lazyHealthCheckStats.lock(); vinfolog("Backend %s failed its health-check, moving from Potential failure to Failed", getNameWithAddr()); stats->d_status = LazyHealthCheckStats::LazyStatus::Failed; - currentCheckFailures = 0; + currentCheckFailures = 1; updateNextLazyHealthCheck(*stats, false); } } diff --git a/pdns/dnsdistdist/test-dnsdistbackend_cc.cc b/pdns/dnsdistdist/test-dnsdistbackend_cc.cc index 34745f4228..28bf5109a7 100644 --- a/pdns/dnsdistdist/test-dnsdistbackend_cc.cc +++ b/pdns/dnsdistdist/test-dnsdistbackend_cc.cc @@ -264,8 +264,8 @@ BOOST_AUTO_TEST_CASE(test_LazyExponentialBackOff) BOOST_CHECK_EQUAL(ds.getStatus(), "down"); BOOST_CHECK_EQUAL(ds.healthCheckRequired(currentTime), false); /* and the wait time between two checks will double every time a failure occurs */ - BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures)))); - BOOST_CHECK_EQUAL(ds.currentCheckFailures, 0U); + BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures - 1)))); + BOOST_CHECK_EQUAL(ds.currentCheckFailures, 1U); /* so after 5 failures */ const size_t nbFailures = 5; @@ -274,8 +274,8 @@ BOOST_AUTO_TEST_CASE(test_LazyExponentialBackOff) BOOST_CHECK(ds.healthCheckRequired(currentTime)); ds.submitHealthCheckResult(false, false); } - BOOST_CHECK_EQUAL(ds.currentCheckFailures, nbFailures); - 
BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures)))); + BOOST_CHECK_EQUAL(ds.currentCheckFailures, nbFailures + 1); + BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures - 1)))); /* we need minRiseSuccesses successful health-checks to go up */ BOOST_REQUIRE(config.minRiseSuccesses >= 1);