]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add regression tests for the 'lazy' health-check feature
author Remi Gacogne <remi.gacogne@powerdns.com>
Thu, 6 Oct 2022 16:01:28 +0000 (18:01 +0200)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Thu, 20 Oct 2022 12:30:59 +0000 (14:30 +0200)
pdns/dnsdistdist/dnsdist-backend.cc
pdns/dnsdistdist/docs/guides/downstreams.rst
pdns/dnsdistdist/test-dnsdistbackend_cc.cc
regression-tests.dnsdist/test_HealthChecks.py

index 175db7f4011f57e6c9bd92e1b1875c5ddb9f603a..17933827ba2c5d6a4ddb5309a1751bdb9df15cd3 100644 (file)
@@ -504,12 +504,14 @@ bool DownstreamState::healthCheckRequired()
   if (d_config.availability == DownstreamState::Availability::Lazy) {
     auto stats = d_lazyHealthCheckStats.lock();
     if (stats->d_status == LazyHealthCheckStats::LazyStatus::PotentialFailure) {
+      vinfolog("Sending health-check query for %s which is still in the Potential Failure state", getNameWithAddr());
       return true;
     }
     if (stats->d_status == LazyHealthCheckStats::LazyStatus::Failed) {
       auto now = time(nullptr);
       if (stats->d_nextCheck <= now) {
         stats->d_nextCheck = now + d_config.d_lazyHealthChecksFailedInterval;
+        vinfolog("Sending health-check query for %s which is still in the Failed state", getNameWithAddr());
         return true;
       }
       return false;
@@ -529,8 +531,10 @@ bool DownstreamState::healthCheckRequired()
       }
 
       const auto maxFailureRate = static_cast<float>(d_config.d_lazyHealthChecksThreshold);
-      if (((100.0 * failures) / totalCount) >= maxFailureRate) {
+      auto current = (100.0 * failures) / totalCount;
+      if (current >= maxFailureRate) {
         lastResults.clear();
+        vinfolog("Backend %s reached the lazy health-check threshold (%f out of %f, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures);
         stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure;
         auto now = time(nullptr);
         stats->d_nextCheck = now;
index a1550b7a05060b217e44c5764a40acefc45ee9b6..8f1323402ce44750dc188bcbe4b2a79f9df94b65 100644 (file)
@@ -80,6 +80,11 @@ So for example, if we set ``healthCheckMode`` to ``lazy``, ``lazyHealthCheckSamp
 - if at least 30 of these last 100 have failed, the threshold will be reached and active health-check queries will be sent every ``checkInterval`` seconds
 - if the health-check query is successful, the backend will stay ``up`` and no more queries will be sent
 - but if instead two consecutive queries fail, the backend will be marked as ``down`` and health-check queries will be sent every ``lazyHealthCheckFailedInterval`` seconds
+- it will take two consecutive, successful health-checks for the backend to go back to ``Healthy`` and be marked ``up`` again
+
+.. code-block:: lua
+
+    newServer({address="192.0.2.1", healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=30, rise=2, maxCheckFailures=3, lazyHealthCheckThreshold=30, lazyHealthCheckSampleSize=100,  lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOnly'})
 
 Source address selection
 ------------------------
index 878ffacf6c61bc84aa2ab7b34feb918d9e0333ce..e063ec2b6d2d145006776bb8f09438272ce66908 100644 (file)
@@ -179,7 +179,7 @@ BOOST_AUTO_TEST_CASE(test_Lazy)
   BOOST_CHECK_EQUAL(ds.healthCheckRequired(), true);
 
   /* we need maxCheckFailures failed health-checks to go down */
-  for (size_t idx = 0; idx < config.maxCheckFailures - 1; idx++) {
+  for (size_t idx = 0; idx < static_cast<size_t>(config.maxCheckFailures - 1); idx++) {
     ds.submitHealthCheckResult(false, false);
   }
   BOOST_CHECK_EQUAL(ds.isUp(), true);
@@ -199,7 +199,7 @@ BOOST_AUTO_TEST_CASE(test_Lazy)
   }
 
   /* we need minRiseSuccesses successful health-checks to go up */
-  for (size_t idx = 0; idx < config.minRiseSuccesses - 1; idx++) {
+  for (size_t idx = 0; idx < static_cast<size_t>(config.minRiseSuccesses - 1); idx++) {
     ds.submitHealthCheckResult(false, true);
   }
   BOOST_CHECK_EQUAL(ds.isUp(), false);
index 0c7a5faf4f8b127d28128f387e84a051d6f54126..32749890bc83d59e1acd9cacdc9aedee3a1c09bb 100644 (file)
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 import base64
+import threading
 import time
+import ssl
 import dns
 from dnsdisttests import DNSDistTest
 
@@ -175,3 +177,183 @@ class TestHealthCheckCustomFunction(HealthCheckTest):
         time.sleep(1.5)
         self.assertGreater(TestHealthCheckCustomFunction._healthCheckCounter, before)
         self.assertEqual(self.getBackendStatus(), 'up')
+
+_do53HealthCheckQueries = 0  # health-check queries seen by the Do53 (UDP and TCP) responders
+_dotHealthCheckQueries = 0  # health-check queries seen by the DoT responder
+_dohHealthCheckQueries = 0  # health-check queries seen by the DoH responder
+
+class TestLazyHealthChecks(HealthCheckTest):
+    _do53Port = 10700
+    _dotPort = 10701
+    _dohPort = 10702
+
+    _consoleKey = DNSDistTest.generateConsoleKey()
+    _consoleKeyB64 = base64.b64encode(_consoleKey).decode('ascii')
+    _config_params = ['_consoleKeyB64', '_consolePort', '_do53Port', '_dotPort', '_dohPort']
+    _config_template = """
+    setKey("%s")
+    controlSocket("127.0.0.1:%d")
+
+    newServer{address="127.0.0.1:%s", healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=1, lazyHealthCheckThreshold=10, lazyHealthCheckSampleSize=100,  lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOrServFail', pool=''}
+
+    newServer{address="127.0.0.1:%s", tls='openssl', caStore='ca.pem', healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=1, lazyHealthCheckThreshold=10, lazyHealthCheckSampleSize=100,  lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOrServFail', pool='dot'}
+    addAction('dot.lazy.test.powerdns.com.', PoolAction('dot'))
+
+    newServer{address="127.0.0.1:%s", tls='openssl', dohPath='/dns-query', caStore='ca.pem', healthCheckMode='lazy', checkInterval=1, lazyHealthCheckFailedInterval=1, lazyHealthCheckThreshold=10, lazyHealthCheckSampleSize=100,  lazyHealthCheckMinSampleCount=10, lazyHealthCheckMode='TimeoutOrServFail', pool='doh'}
+    addAction('doh.lazy.test.powerdns.com.', PoolAction('doh'))
+    """
+    _verboseMode = True
+
+    @staticmethod
+    def HandleDNSQuery(request):
+        response = dns.message.make_response(request)
+        if str(request.question[0].name).startswith('server-failure'):
+            response.set_rcode(dns.rcode.SERVFAIL)
+        return response.to_wire()
+
+    @classmethod
+    def Do53Callback(cls, request):
+        global _do53HealthCheckQueries
+        if str(request.question[0].name).startswith('a.root-servers.net'):
+            _do53HealthCheckQueries = _do53HealthCheckQueries + 1
+            response = dns.message.make_response(request)
+            return response.to_wire()
+        return cls.HandleDNSQuery(request)
+
+    @classmethod
+    def DoTCallback(cls, request):
+        global _dotHealthCheckQueries
+        if str(request.question[0].name).startswith('a.root-servers.net'):
+            _dotHealthCheckQueries = _dotHealthCheckQueries + 1
+            response = dns.message.make_response(request)
+            return response.to_wire()
+        return cls.HandleDNSQuery(request)
+
+    @classmethod
+    def DoHCallback(cls, request, requestHeaders, fromQueue, toQueue):
+        global _dohHealthCheckQueries
+        if str(request.question[0].name).startswith('a.root-servers.net'):
+            _dohHealthCheckQueries = _dohHealthCheckQueries + 1
+            response = dns.message.make_response(request)
+            return 200, response.to_wire()
+        return 200, cls.HandleDNSQuery(request)
+
+    @classmethod
+    def startResponders(cls):
+        tlsContext = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+        tlsContext.load_cert_chain('server.chain', 'server.key')
+
+        Do53Responder = threading.Thread(name='Do53 Lazy Responder', target=cls.UDPResponder, args=[cls._do53Port, cls._toResponderQueue, cls._fromResponderQueue, False, cls.Do53Callback])
+        Do53Responder.setDaemon(True)
+        Do53Responder.start()
+
+        Do53TCPResponder = threading.Thread(name='Do53 TCP Lazy Responder', target=cls.TCPResponder, args=[cls._do53Port, cls._toResponderQueue, cls._fromResponderQueue, False, False, cls.Do53Callback])
+        Do53TCPResponder.setDaemon(True)
+        Do53TCPResponder.start()
+
+        DoTResponder = threading.Thread(name='DoT Lazy Responder', target=cls.TCPResponder, args=[cls._dotPort, cls._toResponderQueue, cls._fromResponderQueue, False, False, cls.DoTCallback, tlsContext])
+        DoTResponder.setDaemon(True)
+        DoTResponder.start()
+
+        DoHResponder = threading.Thread(name='DoH Lazy Responder', target=cls.DOHResponder, args=[cls._dohPort, cls._toResponderQueue, cls._fromResponderQueue, False, False, cls.DoHCallback, tlsContext])
+        DoHResponder.setDaemon(True)
+        DoHResponder.start()
+
+    def testDo53Lazy(self):
+        """
+        Lazy Healthchecks: Do53
+        """
+        self.assertEqual(_do53HealthCheckQueries, 0)
+        time.sleep(1)
+        self.assertEqual(_do53HealthCheckQueries, 0)
+
+        name = 'do53.lazy.test.powerdns.com.'
+        query = dns.message.make_query(name, 'A', 'IN')
+        response = dns.message.make_response(query)
+        failedQuery = dns.message.make_query('server-failure.do53.lazy.test.powerdns.com.', 'A', 'IN')
+        failedResponse = dns.message.make_response(failedQuery)
+        failedResponse.set_rcode(dns.rcode.SERVFAIL)
+
+        # send a few valid queries
+        for _ in range(5):
+            for method in ("sendUDPQuery", "sendTCPQuery"):
+                sender = getattr(self, method)
+                (_, receivedResponse) = sender(query, response=None, useQueue=False)
+                self.assertEqual(receivedResponse, response)
+
+        self.assertEqual(_do53HealthCheckQueries, 0)
+
+        # we need at least 10 samples, and 10 percent of them failing, so two failing queries should be enough
+        for _ in range(2):
+            (_, receivedResponse) = self.sendUDPQuery(failedQuery, response=None, useQueue=False)
+            self.assertEqual(receivedResponse, failedResponse)
+
+        time.sleep(1.5)
+        self.assertEqual(_do53HealthCheckQueries, 1)
+        self.assertEqual(self.getBackendStatus(), 'up')
+
+    def testDoTLazy(self):
+        """
+        Lazy Healthchecks: DoT
+        """
+        self.assertEqual(_dotHealthCheckQueries, 0)
+        time.sleep(1)
+        self.assertEqual(_dotHealthCheckQueries, 0)
+
+        name = 'dot.lazy.test.powerdns.com.'
+        query = dns.message.make_query(name, 'A', 'IN')
+        response = dns.message.make_response(query)
+        failedQuery = dns.message.make_query('server-failure.dot.lazy.test.powerdns.com.', 'A', 'IN')
+        failedResponse = dns.message.make_response(failedQuery)
+        failedResponse.set_rcode(dns.rcode.SERVFAIL)
+
+        # send a few valid queries
+        for _ in range(5):
+            for method in ("sendUDPQuery", "sendTCPQuery"):
+                sender = getattr(self, method)
+                (_, receivedResponse) = sender(query, response=None, useQueue=False)
+                self.assertEqual(receivedResponse, response)
+
+        self.assertEqual(_dotHealthCheckQueries, 0)
+
+        # we need at least 10 samples, and 10 percent of them failing, so two failing queries should be enough
+        for _ in range(2):
+            (_, receivedResponse) = self.sendUDPQuery(failedQuery, response=None, useQueue=False)
+            self.assertEqual(receivedResponse, failedResponse)
+
+        time.sleep(1.5)
+        self.assertEqual(_dotHealthCheckQueries, 1)
+        self.assertEqual(self.getBackendStatus(), 'up')
+
+    def testDoHLazy(self):
+        """
+        Lazy Healthchecks: DoH
+        """
+        self.assertEqual(_dohHealthCheckQueries, 0)
+        time.sleep(1)
+        self.assertEqual(_dohHealthCheckQueries, 0)
+
+        name = 'doh.lazy.test.powerdns.com.'
+        query = dns.message.make_query(name, 'A', 'IN')
+        response = dns.message.make_response(query)
+        failedQuery = dns.message.make_query('server-failure.doh.lazy.test.powerdns.com.', 'A', 'IN')
+        failedResponse = dns.message.make_response(failedQuery)
+        failedResponse.set_rcode(dns.rcode.SERVFAIL)
+
+        # send a few valid queries
+        for _ in range(5):
+            for method in ("sendUDPQuery", "sendTCPQuery"):
+                sender = getattr(self, method)
+                (_, receivedResponse) = sender(query, response=None, useQueue=False)
+                self.assertEqual(receivedResponse, response)
+
+        self.assertEqual(_dohHealthCheckQueries, 0)
+
+        # we need at least 10 samples, and 10 percent of them failing, so two failing queries should be enough
+        for _ in range(2):
+            (_, receivedResponse) = self.sendUDPQuery(failedQuery, response=None, useQueue=False)
+            self.assertEqual(receivedResponse, failedResponse)
+
+        time.sleep(1.5)
+        self.assertEqual(_dohHealthCheckQueries, 1)
+        self.assertEqual(self.getBackendStatus(), 'up')