From: Remi Gacogne Date: Thu, 6 Oct 2022 16:01:28 +0000 (+0200) Subject: dnsdist: Add regression tests for the 'lazy' health-check feature X-Git-Tag: dnsdist-1.8.0-rc1~271^2~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=412643ce71be15684c0caa1b3336e1c661051372;p=thirdparty%2Fpdns.git dnsdist: Add regression tests for the 'lazy' health-check feature --- diff --git a/pdns/dnsdistdist/dnsdist-backend.cc b/pdns/dnsdistdist/dnsdist-backend.cc index 175db7f401..17933827ba 100644 --- a/pdns/dnsdistdist/dnsdist-backend.cc +++ b/pdns/dnsdistdist/dnsdist-backend.cc @@ -504,12 +504,14 @@ bool DownstreamState::healthCheckRequired() if (d_config.availability == DownstreamState::Availability::Lazy) { auto stats = d_lazyHealthCheckStats.lock(); if (stats->d_status == LazyHealthCheckStats::LazyStatus::PotentialFailure) { + vinfolog("Sending health-check query for %s which is still in the Potential Failure state", getNameWithAddr()); return true; } if (stats->d_status == LazyHealthCheckStats::LazyStatus::Failed) { auto now = time(nullptr); if (stats->d_nextCheck <= now) { stats->d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval; + vinfolog("Sending health-check query for %s which is still in the Failed state", getNameWithAddr()); return true; } return false; @@ -529,8 +531,10 @@ bool DownstreamState::healthCheckRequired() } const auto maxFailureRate = static_cast<float>(d_config.d_lazyHealthChecksThreshold); - if (((100.0 * failures) / totalCount) >= maxFailureRate) { + auto current = (100.0 * failures) / totalCount; + if (current >= maxFailureRate) { lastResults.clear(); + vinfolog("Backend %s reached the lazy health-check threshold (%f out of %f, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures); stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure; auto now = time(nullptr); stats->d_nextCheck = now; diff --git 
a/pdns/dnsdistdist/docs/guides/downstreams.rst b/pdns/dnsdistdist/docs/guides/downstreams.rst index a1550b7a05..8f1323402c 100644 --- a/pdns/dnsdistdist/docs/guides/downstreams.rst +++ b/pdns/dnsdistdist/docs/guides/downstreams.rst @@ -80,6 +80,11 @@ So for example, if we set ``healthCheckMode`` to ``lazy``, ``lazyHealthCheckSamp - if at least 30 of these last 100 have failed, the threshold will be reached and active health-check queries will be sent every ``checkInterval`` seconds - if the health-check query is successful, the backend will stay ``up`` and no more query will be sent - but if instead two consecutive queries fail, the backend will be marked as ``down`` and health-check queries will be sent every ``lazyHealthCheckFailedInterval`` seconds +- it will take two consecutive, successful health-checks for the backend to go back to ``Healthy`` and be marked ``up`` again + +.. code-block:: lua + + newServer({address="192.0.2.1", healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=30, rise=2, maxCheckFailures=3, lazyHealthCheckThreshold=30, lazyHealthCheckSampleSize=100, lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOnly'}) Source address selection ------------------------ diff --git a/pdns/dnsdistdist/test-dnsdistbackend_cc.cc b/pdns/dnsdistdist/test-dnsdistbackend_cc.cc index 878ffacf6c..e063ec2b6d 100644 --- a/pdns/dnsdistdist/test-dnsdistbackend_cc.cc +++ b/pdns/dnsdistdist/test-dnsdistbackend_cc.cc @@ -179,7 +179,7 @@ BOOST_AUTO_TEST_CASE(test_Lazy) BOOST_CHECK_EQUAL(ds.healthCheckRequired(), true); /* we need maxCheckFailures failed health-checks to go down */ - for (size_t idx = 0; idx < config.maxCheckFailures - 1; idx++) { + for (size_t idx = 0; idx < static_cast<size_t>(config.maxCheckFailures - 1); idx++) { ds.submitHealthCheckResult(false, false); } BOOST_CHECK_EQUAL(ds.isUp(), true); @@ -199,7 +199,7 @@ BOOST_AUTO_TEST_CASE(test_Lazy) } /* we need minRiseSuccesses successful health-checks to go down */ - for (size_t idx = 
0; idx < config.minRiseSuccesses - 1; idx++) { + for (size_t idx = 0; idx < static_cast<size_t>(config.minRiseSuccesses - 1); idx++) { ds.submitHealthCheckResult(false, true); } BOOST_CHECK_EQUAL(ds.isUp(), false); diff --git a/regression-tests.dnsdist/test_HealthChecks.py b/regression-tests.dnsdist/test_HealthChecks.py index 0c7a5faf4f..32749890bc 100644 --- a/regression-tests.dnsdist/test_HealthChecks.py +++ b/regression-tests.dnsdist/test_HealthChecks.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import base64 +import threading import time +import ssl import dns from dnsdisttests import DNSDistTest @@ -175,3 +177,183 @@ class TestHealthCheckCustomFunction(HealthCheckTest): time.sleep(1.5) self.assertGreater(TestHealthCheckCustomFunction._healthCheckCounter, before) self.assertEqual(self.getBackendStatus(), 'up') + +_do53HealthCheckQueries = 0 +_dotHealthCheckQueries = 0 +_dohHealthCheckQueries = 0 + +class TestLazyHealthChecks(HealthCheckTest): + _do53Port = 10700 + _dotPort = 10701 + _dohPort = 10702 + + _consoleKey = DNSDistTest.generateConsoleKey() + _consoleKeyB64 = base64.b64encode(_consoleKey).decode('ascii') + _config_params = ['_consoleKeyB64', '_consolePort', '_do53Port', '_dotPort', '_dohPort'] + _config_template = """ + setKey("%s") + controlSocket("127.0.0.1:%d") + + newServer{address="127.0.0.1:%s", healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=1, lazyHealthCheckThreshold=10, lazyHealthCheckSampleSize=100, lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOrServFail', pool=''} + + newServer{address="127.0.0.1:%s", tls='openssl', caStore='ca.pem', healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=1, lazyHealthCheckThreshold=10, lazyHealthCheckSampleSize=100, lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOrServFail', pool='dot'} + addAction('dot.lazy.test.powerdns.com.', PoolAction('dot')) + + newServer{address="127.0.0.1:%s", tls='openssl', dohPath='/dns-query', caStore='ca.pem', 
healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=1, lazyHealthCheckThreshold=10, lazyHealthCheckSampleSize=100, lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOrServFail', pool='doh'} + addAction('doh.lazy.test.powerdns.com.', PoolAction('doh')) + """ + _verboseMode = True + + @staticmethod + def HandleDNSQuery(request): + response = dns.message.make_response(request) + if str(request.question[0].name).startswith('server-failure'): + response.set_rcode(dns.rcode.SERVFAIL) + return response.to_wire() + + @classmethod + def Do53Callback(cls, request): + global _do53HealthCheckQueries + if str(request.question[0].name).startswith('a.root-servers.net'): + _do53HealthCheckQueries = _do53HealthCheckQueries + 1 + response = dns.message.make_response(request) + return response.to_wire() + return cls.HandleDNSQuery(request) + + @classmethod + def DoTCallback(cls, request): + global _dotHealthCheckQueries + if str(request.question[0].name).startswith('a.root-servers.net'): + _dotHealthCheckQueries = _dotHealthCheckQueries + 1 + response = dns.message.make_response(request) + return response.to_wire() + return cls.HandleDNSQuery(request) + + @classmethod + def DoHCallback(cls, request, requestHeaders, fromQueue, toQueue): + global _dohHealthCheckQueries + if str(request.question[0].name).startswith('a.root-servers.net'): + _dohHealthCheckQueries = _dohHealthCheckQueries + 1 + response = dns.message.make_response(request) + return 200, response.to_wire() + return 200, cls.HandleDNSQuery(request) + + @classmethod + def startResponders(cls): + tlsContext = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + tlsContext.load_cert_chain('server.chain', 'server.key') + + Do53Responder = threading.Thread(name='Do53 Lazy Responder', target=cls.UDPResponder, args=[cls._do53Port, cls._toResponderQueue, cls._fromResponderQueue, False, cls.Do53Callback]) + Do53Responder.setDaemon(True) + Do53Responder.start() + + Do53TCPResponder = threading.Thread(name='Do53 
TCP Lazy Responder', target=cls.TCPResponder, args=[cls._do53Port, cls._toResponderQueue, cls._fromResponderQueue, False, False, cls.Do53Callback]) + Do53TCPResponder.setDaemon(True) + Do53TCPResponder.start() + + DoTResponder = threading.Thread(name='DoT Lazy Responder', target=cls.TCPResponder, args=[cls._dotPort, cls._toResponderQueue, cls._fromResponderQueue, False, False, cls.DoTCallback, tlsContext]) + DoTResponder.setDaemon(True) + DoTResponder.start() + + DoHResponder = threading.Thread(name='DoH Lazy Responder', target=cls.DOHResponder, args=[cls._dohPort, cls._toResponderQueue, cls._fromResponderQueue, False, False, cls.DoHCallback, tlsContext]) + DoHResponder.setDaemon(True) + DoHResponder.start() + + def testDo53Lazy(self): + """ + Lazy Healthchecks: Do53 + """ + self.assertEqual(_do53HealthCheckQueries, 0) + time.sleep(1) + self.assertEqual(_do53HealthCheckQueries, 0) + + name = 'do53.lazy.test.powerdns.com.' + query = dns.message.make_query(name, 'A', 'IN') + response = dns.message.make_response(query) + failedQuery = dns.message.make_query('server-failure.do53.lazy.test.powerdns.com.', 'A', 'IN') + failedResponse = dns.message.make_response(failedQuery) + failedResponse.set_rcode(dns.rcode.SERVFAIL) + + # send a few valid queries + for _ in range(5): + for method in ("sendUDPQuery", "sendTCPQuery"): + sender = getattr(self, method) + (_, receivedResponse) = sender(query, response=None, useQueue=False) + self.assertEqual(receivedResponse, response) + + self.assertEqual(_do53HealthCheckQueries, 0) + + # we need at least 10 samples, and 10 percent of them failing, so two failing queries should be enough + for _ in range(2): + (_, receivedResponse) = self.sendUDPQuery(failedQuery, response=None, useQueue=False) + self.assertEqual(receivedResponse, failedResponse) + + time.sleep(1.5) + self.assertEqual(_do53HealthCheckQueries, 1) + self.assertEqual(self.getBackendStatus(), 'up') + + def testDoTLazy(self): + """ + Lazy Healthchecks: DoT + """ + 
self.assertEqual(_dotHealthCheckQueries, 0) + time.sleep(1) + self.assertEqual(_dotHealthCheckQueries, 0) + + name = 'dot.lazy.test.powerdns.com.' + query = dns.message.make_query(name, 'A', 'IN') + response = dns.message.make_response(query) + failedQuery = dns.message.make_query('server-failure.dot.lazy.test.powerdns.com.', 'A', 'IN') + failedResponse = dns.message.make_response(failedQuery) + failedResponse.set_rcode(dns.rcode.SERVFAIL) + + # send a few valid queries + for _ in range(5): + for method in ("sendUDPQuery", "sendTCPQuery"): + sender = getattr(self, method) + (_, receivedResponse) = sender(query, response=None, useQueue=False) + self.assertEqual(receivedResponse, response) + + self.assertEqual(_dotHealthCheckQueries, 0) + + # we need at least 10 samples, and 10 percent of them failing, so two failing queries should be enough + for _ in range(2): + (_, receivedResponse) = self.sendUDPQuery(failedQuery, response=None, useQueue=False) + self.assertEqual(receivedResponse, failedResponse) + + time.sleep(1.5) + self.assertEqual(_dotHealthCheckQueries, 1) + self.assertEqual(self.getBackendStatus(), 'up') + + def testDoHLazy(self): + """ + Lazy Healthchecks: DoH + """ + self.assertEqual(_dohHealthCheckQueries, 0) + time.sleep(1) + self.assertEqual(_dohHealthCheckQueries, 0) + + name = 'doh.lazy.test.powerdns.com.' 
+ query = dns.message.make_query(name, 'A', 'IN') + response = dns.message.make_response(query) + failedQuery = dns.message.make_query('server-failure.doh.lazy.test.powerdns.com.', 'A', 'IN') + failedResponse = dns.message.make_response(failedQuery) + failedResponse.set_rcode(dns.rcode.SERVFAIL) + + # send a few valid queries + for _ in range(5): + for method in ("sendUDPQuery", "sendTCPQuery"): + sender = getattr(self, method) + (_, receivedResponse) = sender(query, response=None, useQueue=False) + self.assertEqual(receivedResponse, response) + + self.assertEqual(_dohHealthCheckQueries, 0) + + # we need at least 10 samples, and 10 percent of them failing, so two failing queries should be enough + for _ in range(2): + (_, receivedResponse) = self.sendUDPQuery(failedQuery, response=None, useQueue=False) + self.assertEqual(receivedResponse, failedResponse) + + time.sleep(1.5) + self.assertEqual(_dohHealthCheckQueries, 1) + self.assertEqual(self.getBackendStatus(), 'up')