// Reads the optional lazy health-check settings from `vars` into `config`.
// Each numeric option is held as a string: it is converted with std::stoi,
// handed to checkParameterBound() (defined elsewhere) and then assigned to the
// matching d_lazyHealthCheck* member. std::stoi throws on non-numeric or
// out-of-range text and nothing in these lines catches that.
// The '-'/'+' pairs only rename the members from d_lazyHealthChecks* to
// d_lazyHealthCheck*; the option names looked up in `vars` are unchanged.
if (vars.count("lazyHealthCheckSampleSize")) {
auto value = std::stoi(boost::get<string>(vars.at("lazyHealthCheckSampleSize")));
checkParameterBound("lazyHealthCheckSampleSize", value);
- config.d_lazyHealthChecksSampleSize = value;
+ config.d_lazyHealthCheckSampleSize = value;
}
if (vars.count("lazyHealthCheckMinSampleCount")) {
auto value = std::stoi(boost::get<string>(vars.at("lazyHealthCheckMinSampleCount")));
checkParameterBound("lazyHealthCheckMinSampleCount", value);
- config.d_lazyHealthChecksMinSampleCount = value;
+ config.d_lazyHealthCheckMinSampleCount = value;
}
// The threshold member is a uint8_t, so the uint8_t maximum is passed
// explicitly (presumably as the upper bound -- confirm against
// checkParameterBound's signature).
if (vars.count("lazyHealthCheckThreshold")) {
auto value = std::stoi(boost::get<string>(vars.at("lazyHealthCheckThreshold")));
checkParameterBound("lazyHealthCheckThreshold", value, std::numeric_limits<uint8_t>::max());
- config.d_lazyHealthChecksThreshold = value;
+ config.d_lazyHealthCheckThreshold = value;
}
if (vars.count("lazyHealthCheckFailedInterval")) {
auto value = std::stoi(boost::get<string>(vars.at("lazyHealthCheckFailedInterval")));
checkParameterBound("lazyHealthCheckFailedInterval", value);
- config.d_lazyHealthChecksFailedInterval = value;
+ config.d_lazyHealthCheckFailedInterval = value;
}
// Boolean option: read directly as a bool, no conversion or bound check.
if (vars.count("lazyHealthCheckUseExponentialBackOff")) {
- config.d_lazyHealthChecksUseExponentialBackOff = boost::get<bool>(vars.at("lazyHealthCheckUseExponentialBackOff"));
+ config.d_lazyHealthCheckUseExponentialBackOff = boost::get<bool>(vars.at("lazyHealthCheckUseExponentialBackOff"));
}
if (vars.count("lazyHealthCheckMaxBackOff")) {
auto value = std::stoi(boost::get<string>(vars.at("lazyHealthCheckMaxBackOff")));
checkParameterBound("lazyHealthCheckMaxBackOff", value);
- config.d_lazyHealthChecksMaxBackOff = value;
+ config.d_lazyHealthCheckMaxBackOff = value;
}
// Maps the "lazyHealthCheckMode" string onto DownstreamState::LazyHealthCheckMode.
// Only "TimeoutOnly" and "TimeoutOrServFail" are recognised (compared with
// pdns_iequals, presumably case-insensitively -- confirm); any other value is
// reported through warnlog and, per that message, ignored.
// The excerpt ends inside the final else branch; its closing braces are not shown.
if (vars.count("lazyHealthCheckMode")) {
auto mode = boost::get<string>(vars.at("lazyHealthCheckMode"));
if (pdns_iequals(mode, "TimeoutOnly")) {
- config.d_lazyHealthChecksMode = DownstreamState::LazyHealthCheckMode::TimeoutOnly;
+ config.d_lazyHealthCheckMode = DownstreamState::LazyHealthCheckMode::TimeoutOnly;
}
else if (pdns_iequals(mode, "TimeoutOrServFail")) {
- config.d_lazyHealthChecksMode = DownstreamState::LazyHealthCheckMode::TimeoutOrServFail;
+ config.d_lazyHealthCheckMode = DownstreamState::LazyHealthCheckMode::TimeoutOrServFail;
}
else {
warnlog("Ignoring unknown value '%s' for 'lazyHealthCheckMode' on 'newServer'", mode);
// Tail of the backend configuration struct (its header is outside this excerpt).
// All members carry in-class default initialisers.
uint16_t d_retries{5};
uint16_t xpfRRCode{0};
uint16_t checkTimeout{1000}; /* in milliseconds */
// Lazy health-check settings, renamed from d_lazyHealthChecks* to d_lazyHealthCheck*:
//  - SampleSize: capacity given to the d_lastResults buffer of recent results.
//  - MinSampleCount: number of recorded results required before the failure
//    rate is compared with the threshold.
//  - FailedInterval: added to a time(nullptr) timestamp to schedule the next check.
//  - MaxBackOff: upper limit applied to the exponential back-off delay.
//  - Threshold: failure rate, as a percentage, compared with the observed rate.
//  - Mode: which outcomes count as failures; defaults to TimeoutOrServFail.
- uint16_t d_lazyHealthChecksSampleSize{100};
- uint16_t d_lazyHealthChecksMinSampleCount{1};
- uint16_t d_lazyHealthChecksFailedInterval{30};
- uint16_t d_lazyHealthChecksMaxBackOff{3600};
- uint8_t d_lazyHealthChecksThreshold{20};
- LazyHealthCheckMode d_lazyHealthChecksMode{LazyHealthCheckMode::TimeoutOrServFail};
+ uint16_t d_lazyHealthCheckSampleSize{100};
+ uint16_t d_lazyHealthCheckMinSampleCount{1};
+ uint16_t d_lazyHealthCheckFailedInterval{30};
+ uint16_t d_lazyHealthCheckMaxBackOff{3600};
+ uint8_t d_lazyHealthCheckThreshold{20};
+ LazyHealthCheckMode d_lazyHealthCheckMode{LazyHealthCheckMode::TimeoutOrServFail};
uint8_t maxCheckFailures{1};
uint8_t minRiseSuccesses{1};
Availability availability{Availability::Auto};
bool d_tcpCheck{false};
bool d_tcpOnly{false};
bool d_addXForwardedHeaders{false}; // for DoH backends
// Selects between the exponential back-off schedule and a fixed interval
// when the next lazy health check is computed.
- bool d_lazyHealthChecksUseExponentialBackOff{false};
+ bool d_lazyHealthCheckUseExponentialBackOff{false};
bool d_upgradeToLazyHealthChecks{false};
};
}
// Switches this backend to the Lazy availability mode and sets the capacity of
// the d_lastResults buffer (accessed through the d_lazyHealthCheckStats lock)
// to the configured sample size.
// NOTE(review): the capacity is set without checking that the sample size is
// non-zero, whereas other users of d_lastResults in this file guard on
// d_lazyHealthCheckSampleSize > 0 -- confirm a zero capacity is acceptable here.
void setLazyAuto() {
d_config.availability = Availability::Lazy;
- d_lazyHealthCheckStats.lock()->d_lastResults.set_capacity(d_config.d_lazyHealthChecksSampleSize);
+ d_lazyHealthCheckStats.lock()->d_lastResults.set_capacity(d_config.d_lazyHealthCheckSampleSize);
}
// Presumably reports whether a health check should be sent to this backend now
// (inferred from the name and bool return type -- see the definition for the rules).
bool healthCheckRequired();
// Initialisation fragment (the enclosing function is outside this excerpt):
// applies the configured weight, then, for a Lazy backend with a non-zero
// sample size, sizes the d_lastResults buffer and calls setUpStatus(true).
setWeight(d_config.d_weight);
}
- if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthChecksSampleSize > 0) {
- d_lazyHealthCheckStats.lock()->d_lastResults.set_capacity(d_config.d_lazyHealthChecksSampleSize);
+ if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthCheckSampleSize > 0) {
+ d_lazyHealthCheckStats.lock()->d_lastResults.set_capacity(d_config.d_lazyHealthCheckSampleSize);
setUpStatus(true);
}
// Records the outcome of a response received from this backend.
// Does nothing unless availability is Lazy and the sample size is non-zero.
// rcode: response code of the answer. It is recorded as a failure only when
// the mode is TimeoutOrServFail and rcode equals RCode::ServFail; in
// TimeoutOnly mode every response is recorded as a success.
void DownstreamState::reportResponse(uint8_t rcode)
{
- if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthChecksSampleSize > 0) {
- bool failure = d_config.d_lazyHealthChecksMode == LazyHealthCheckMode::TimeoutOrServFail ? rcode == RCode::ServFail : false;
+ if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthCheckSampleSize > 0) {
+ bool failure = d_config.d_lazyHealthCheckMode == LazyHealthCheckMode::TimeoutOrServFail ? rcode == RCode::ServFail : false;
d_lazyHealthCheckStats.lock()->d_lastResults.push_back(failure);
}
}
// Records a failure (true) in d_lastResults, regardless of the configured mode.
// Does nothing unless availability is Lazy and the sample size is non-zero.
void DownstreamState::reportTimeoutOrError()
{
- if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthChecksSampleSize > 0) {
+ if (d_config.availability == Availability::Lazy && d_config.d_lazyHealthCheckSampleSize > 0) {
d_lazyHealthCheckStats.lock()->d_lastResults.push_back(true);
}
}
// Fragment of the threshold evaluation for a backend in the Healthy lazy state
// (the enclosing function and the computation of `failures` are not shown).
// With fewer recorded results than the configured minimum it returns false.
// Otherwise the failure rate is computed as a percentage; once it reaches the
// configured threshold the recorded results are cleared and the state moves to
// PotentialFailure.
// The '+' vinfolog line adds "%%" after each "%f", presumably so the logged
// rates are followed by a literal percent sign -- confirm with the logger's
// format rules.
if (stats->d_status == LazyHealthCheckStats::LazyStatus::Healthy) {
auto& lastResults = stats->d_lastResults;
size_t totalCount = lastResults.size();
- if (totalCount < d_config.d_lazyHealthChecksMinSampleCount) {
+ if (totalCount < d_config.d_lazyHealthCheckMinSampleCount) {
return false;
}
}
}
- const auto maxFailureRate = static_cast<float>(d_config.d_lazyHealthChecksThreshold);
+ const auto maxFailureRate = static_cast<float>(d_config.d_lazyHealthCheckThreshold);
auto current = (100.0 * failures) / totalCount;
if (current >= maxFailureRate) {
lastResults.clear();
- vinfolog("Backend %s reached the lazy health-check threshold (%f out of %f, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures);
+ vinfolog("Backend %s reached the lazy health-check threshold (%f%% out of %f%%, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures);
stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure;
/* we update the next check time here because the check might time out,
and we do not want to send a second check during that time unless
// Computes when the next lazy health check is due and stores it in stats.d_nextCheck.
// stats: lazy health-check state of this backend, updated in place.
// Without exponential back-off the next check is always now + FailedInterval.
// With it, the fixed interval is still used in the PotentialFailure state and
// after at least one successful check; otherwise a delay of
// FailedInterval * 2^currentCheckFailures is computed and capped at MaxBackOff.
void DownstreamState::updateNextLazyHealthCheck(LazyHealthCheckStats& stats)
{
auto now = time(nullptr);
- if (d_config.d_lazyHealthChecksUseExponentialBackOff) {
+ if (d_config.d_lazyHealthCheckUseExponentialBackOff) {
if (stats.d_status == DownstreamState::LazyHealthCheckStats::LazyStatus::PotentialFailure) {
/* we are still in the "up" state, we need to send the next query quickly to
determine if the backend is really down */
- stats.d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval;
+ stats.d_nextCheck = now + d_config.d_lazyHealthCheckFailedInterval;
}
else if (consecutiveSuccessfulChecks > 0) {
/* we are in 'Failed' state, but just had one (or more) successful check,
so we want the next one to happen quite quickly as the backend might
be available again. */
- stats.d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval;
+ stats.d_nextCheck = now + d_config.d_lazyHealthCheckFailedInterval;
}
else {
const uint16_t failedTests = currentCheckFailures;
// NOTE(review): std::pow returns a floating-point value that is converted to
// size_t here; for a large failedTests the result may not be representable,
// making the conversion undefined -- confirm currentCheckFailures is bounded.
size_t backOffCoeff = std::pow(2U, failedTests);
// Start from the cap and only use the computed delay when the multiplication
// cannot overflow time_t and the result stays below the cap.
// NOTE(review): the division below has FailedInterval as divisor; a configured
// value of 0 would divide by zero -- confirm configuration rejects 0.
- time_t backOff = d_config.d_lazyHealthChecksMaxBackOff;
- if ((std::numeric_limits<time_t>::max() / d_config.d_lazyHealthChecksFailedInterval) >= backOffCoeff) {
- backOff = d_config.d_lazyHealthChecksFailedInterval * backOffCoeff;
- if (backOff > d_config.d_lazyHealthChecksMaxBackOff || (std::numeric_limits<time_t>::max() - now) <= backOff) {
- backOff = d_config.d_lazyHealthChecksMaxBackOff;
+ time_t backOff = d_config.d_lazyHealthCheckMaxBackOff;
+ if ((std::numeric_limits<time_t>::max() / d_config.d_lazyHealthCheckFailedInterval) >= backOffCoeff) {
+ backOff = d_config.d_lazyHealthCheckFailedInterval * backOffCoeff;
+ if (backOff > d_config.d_lazyHealthCheckMaxBackOff || (std::numeric_limits<time_t>::max() - now) <= backOff) {
+ backOff = d_config.d_lazyHealthCheckMaxBackOff;
}
}
// NOTE(review): in the lines visible here backOff is never stored into
// stats.d_nextCheck; presumably that assignment sits in lines elided from
// this excerpt -- confirm against the full function.
}
}
else {
- stats.d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval;
+ stats.d_nextCheck = now + d_config.d_lazyHealthCheckFailedInterval;
}
}
// Test setup fragment: a Lazy backend without exponential back-off that needs
// 11 recorded results before the 20% threshold is evaluated, 3 failed checks
// to go down and 5 successful checks to come back up.
DownstreamState::Config config;
config.minRiseSuccesses = 5;
config.maxCheckFailures = 3;
- config.d_lazyHealthChecksMinSampleCount = 11;
- config.d_lazyHealthChecksThreshold = 20;
- config.d_lazyHealthChecksUseExponentialBackOff = false;
+ config.d_lazyHealthCheckMinSampleCount = 11;
+ config.d_lazyHealthCheckThreshold = 20;
+ config.d_lazyHealthCheckUseExponentialBackOff = false;
config.availability = DownstreamState::Availability::Lazy;
/* prevents a re-connection */
config.remote = ComboAddress("0.0.0.0");
}
// Assertion fragments of the fixed-interval lazy health-check test; several
// steps between the checks below are elided from this excerpt.
/* the threshold should be reached (50% > 20%) but we do not have enough sample yet
- (10 < config.d_lazyHealthChecksMinSampleCount) */
+ (10 < config.d_lazyHealthCheckMinSampleCount) */
BOOST_CHECK_EQUAL(ds.isUp(), true);
BOOST_CHECK_EQUAL(ds.getStatus(), "up");
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), false);
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), true);
/* even if we fill the whole circular buffer with valid answers */
- for (size_t idx = 0; idx < config.d_lazyHealthChecksSampleSize; idx++) {
+ for (size_t idx = 0; idx < config.d_lazyHealthCheckSampleSize; idx++) {
ds.reportResponse(RCode::NoError);
}
BOOST_CHECK_EQUAL(ds.isUp(), true);
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), false);
/* now let's reach the threshold again, this time just barely */
- for (size_t idx = 0; idx < config.d_lazyHealthChecksThreshold; idx++) {
+ for (size_t idx = 0; idx < config.d_lazyHealthCheckThreshold; idx++) {
ds.reportTimeoutOrError();
}
BOOST_CHECK_EQUAL(ds.isUp(), true);
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), true);
/* we need maxCheckFailures failed health-checks to go down */
// Added guard: the loop bound subtracts 1 before the cast to size_t, so a
// value of 0 would turn into a huge iteration count; abort the test instead.
+ BOOST_REQUIRE(config.maxCheckFailures >= 1);
for (size_t idx = 0; idx < static_cast<size_t>(config.maxCheckFailures - 1); idx++) {
ds.submitHealthCheckResult(false, false);
}
/* now we are in Failed state */
BOOST_CHECK_EQUAL(ds.isUp(), false);
BOOST_CHECK_EQUAL(ds.getStatus(), "down");
- BOOST_CHECK(ds.getNextLazyHealthCheck() == (failedCheckTime + config.d_lazyHealthChecksFailedInterval));
+ BOOST_CHECK(ds.getNextLazyHealthCheck() == (failedCheckTime + config.d_lazyHealthCheckFailedInterval));
/* let fill the buffer with successes, it does not matter */
- for (size_t idx = 0; idx < config.d_lazyHealthChecksSampleSize; idx++) {
+ for (size_t idx = 0; idx < config.d_lazyHealthCheckSampleSize; idx++) {
ds.reportResponse(RCode::NoError);
}
/* we need minRiseSuccesses successful health-checks to go up */
// Same guard as above, for the minRiseSuccesses - 1 loop bound.
+ BOOST_REQUIRE(config.minRiseSuccesses >= 1);
for (size_t idx = 0; idx < static_cast<size_t>(config.minRiseSuccesses - 1); idx++) {
ds.submitHealthCheckResult(false, true);
}
// Test setup fragment for the exponential back-off variant: same thresholds
// as the fixed-interval test, with a failed interval of 30 and the back-off
// capped at 60.
DownstreamState::Config config;
config.minRiseSuccesses = 5;
config.maxCheckFailures = 3;
- config.d_lazyHealthChecksMinSampleCount = 11;
- config.d_lazyHealthChecksThreshold = 20;
- config.d_lazyHealthChecksUseExponentialBackOff = true;
- config.d_lazyHealthChecksMaxBackOff = 60;
- config.d_lazyHealthChecksFailedInterval = 30;
+ config.d_lazyHealthCheckMinSampleCount = 11;
+ config.d_lazyHealthCheckThreshold = 20;
+ config.d_lazyHealthCheckUseExponentialBackOff = true;
+ config.d_lazyHealthCheckMaxBackOff = 60;
+ config.d_lazyHealthCheckFailedInterval = 30;
config.availability = DownstreamState::Availability::Lazy;
/* prevents a re-connection */
config.remote = ComboAddress("0.0.0.0");
// Assertion fragments of the exponential back-off test; several steps between
// the checks below are elided from this excerpt.
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), false);
/* submit a few failed results */
- for (size_t idx = 0; idx < config.d_lazyHealthChecksMinSampleCount; idx++) {
+ for (size_t idx = 0; idx < config.d_lazyHealthCheckMinSampleCount; idx++) {
ds.reportTimeoutOrError();
}
BOOST_CHECK_EQUAL(ds.isUp(), true);
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), true);
/* we need maxCheckFailures failed health-checks to go down */
// Added guard: the loop bound subtracts 1 before the cast to size_t, so a
// value of 0 would turn into a huge iteration count; abort the test instead.
+ BOOST_REQUIRE(config.maxCheckFailures >= 1);
for (size_t idx = 0; idx < static_cast<size_t>(config.maxCheckFailures - 1); idx++) {
ds.submitHealthCheckResult(false, false);
}
BOOST_CHECK_EQUAL(ds.getStatus(), "down");
BOOST_CHECK_EQUAL(ds.healthCheckRequired(), false);
/* and the wait time between two checks will double every time a failure occurs */
- BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (failedCheckTime + (config.d_lazyHealthChecksFailedInterval * std::pow(2U, ds.currentCheckFailures))));
+ BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (failedCheckTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures))));
BOOST_CHECK_EQUAL(ds.currentCheckFailures, 0U);
/* we need minRiseSuccesses successful health-checks to go up */
// Same guard as above, for the minRiseSuccesses - 1 loop bound.
+ BOOST_REQUIRE(config.minRiseSuccesses >= 1);
for (size_t idx = 0; idx < static_cast<size_t>(config.minRiseSuccesses - 1); idx++) {
ds.submitHealthCheckResult(false, true);
}