dnsdist: Add metrics for health-check failures

author Remi Gacogne <remi.gacogne@powerdns.com>

Mon, 10 Jul 2023 11:50:57 +0000 (13:50 +0200)

committer Remi Gacogne <remi.gacogne@powerdns.com>

Mon, 10 Jul 2023 14:35:41 +0000 (16:35 +0200)
author Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 10 Jul 2023 11:50:57 +0000 (13:50 +0200)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 10 Jul 2023 14:35:41 +0000 (16:35 +0200)
diff --git a/pdns/dnsdist-carbon.cc b/pdns/dnsdist-carbon.cc

index 65f739721e87216451b310f6e4884b29e4558c66..693f498c435f8564126f9a53c0934d09c273a5ef 100644 (file)
--- a/pdns/dnsdist-carbon.cc
+++ b/pdns/dnsdist-carbon.cc
@@ -98,6 +98,12 @@ static bool doOneCarbonExport(const Carbon::Endpoint& endpoint)
        str << base << "tcpavgqueriesperconnection" << ' ' << state->tcpAvgQueriesPerConnection.load() << " " << now << "\r\n";
        str << base << "tcpavgconnectionduration" << ' ' << state->tcpAvgConnectionDuration.load() << " " << now << "\r\n";
        str << base << "tcptoomanyconcurrentconnections" << ' ' << state->tcpTooManyConcurrentConnections.load() << " " << now << "\r\n";
+      str << base << "healthcheckfailures" << ' ' << state->d_healthCheckMetrics.d_failures << " " << now << "\r\n";
+      str << base << "healthcheckfailuresparsing" << ' ' << state->d_healthCheckMetrics.d_parseErrors << " " << now << "\r\n";
+      str << base << "healthcheckfailurestimeout" << ' ' << state->d_healthCheckMetrics.d_timeOuts << " " << now << "\r\n";
+      str << base << "healthcheckfailuresnetwork" << ' ' << state->d_healthCheckMetrics.d_networkErrors << " " << now << "\r\n";
+      str << base << "healthcheckfailuresmismatch" << ' ' << state->d_healthCheckMetrics.d_mismatchErrors << " " << now << "\r\n";
+      str << base << "healthcheckfailuresinvalid" << ' ' << state->d_healthCheckMetrics.d_invalidResponseErrors << " " << now << "\r\n";
      }
  
      std::map<std::string, uint64_t> frontendDuplicates;
diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc

index d47f33cd63318759d7460850093b0a2c361d22a0..d1132d37839fc9609bee776ac1cab5907444949c 100644 (file)
--- a/pdns/dnsdist-web.cc
+++ b/pdns/dnsdist-web.cc
@@ -593,6 +593,18 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
    output << "# TYPE " << statesbase << "tlsresumptions "                  << "counter"                                                                              << "\n";
    output << "# HELP " << statesbase << "tcplatency "                      << "Server's latency when answering TCP questions in milliseconds"                        << "\n";
    output << "# TYPE " << statesbase << "tcplatency "                      << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailures "             << "Number of health check attempts that failed (total)"                                  << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailures "             << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresparsing "      << "Number of health check attempts where the response could not be parsed"               << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresparsing "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailurestimeout "      << "Number of health check attempts where the response did not arrive in time"            << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailurestimeout "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresnetwork "      << "Number of health check attempts that experienced a network issue"                     << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresnetwork "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresmismatch "     << "Number of health check attempts where the response did not match the query"           << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresmismatch "     << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresinvalid "      << "Number of health check attempts where the DNS response was invalid"                   << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresinvalid "      << "counter"                                                                              << "\n";
  
    for (const auto& state : *states) {
      string serverName;
@@ -636,6 +648,12 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
      output << statesbase << "tcpavgqueriesperconn"             << label << " " << state->tcpAvgQueriesPerConnection      << "\n";
      output << statesbase << "tcpavgconnduration"               << label << " " << state->tcpAvgConnectionDuration        << "\n";
      output << statesbase << "tlsresumptions"                   << label << " " << state->tlsResumptions                  << "\n";
+    output << statesbase << "healthcheckfailures"              << label << " " << state->d_healthCheckMetrics.d_failures << "\n";
+    output << statesbase << "healthcheckfailuresparsing"       << label << " " << state->d_healthCheckMetrics.d_parseErrors << "\n";
+    output << statesbase << "healthcheckfailurestimeout"       << label << " " << state->d_healthCheckMetrics.d_timeOuts << "\n";
+    output << statesbase << "healthcheckfailuresnetwork"       << label << " " << state->d_healthCheckMetrics.d_networkErrors << "\n";
+    output << statesbase << "healthcheckfailuresmismatch"      << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
+    output << statesbase << "healthcheckfailuresinvalid"        << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
    }
  
    const string frontsbase = "dnsdist_frontend_";
@@ -673,7 +691,6 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
    output << "# TYPE " << frontsbase << "tlsunknownticketkeys " << "counter" << "\n";
    output << "# HELP " << frontsbase << "tlsinactiveticketkeys " << "Amount of TLS sessions resumed from an inactive key" << "\n";
    output << "# TYPE " << frontsbase << "tlsinactiveticketkeys " << "counter" << "\n";
-
    output << "# HELP " << frontsbase << "tlshandshakefailures " << "Amount of TLS handshake failures" << "\n";
    output << "# TYPE " << frontsbase << "tlshandshakefailures " << "counter" << "\n";
  
@@ -1059,6 +1076,12 @@ static void addServerToJSON(Json::array& servers, int id, const std::shared_ptr<
      {"tcpAvgConnectionDuration", (double)a->tcpAvgConnectionDuration},
      {"tlsResumptions", (double)a->tlsResumptions},
      {"tcpLatency", (double)(a->latencyUsecTCP/1000.0)},
+    {"healthCheckFailures", (double)(a->d_healthCheckMetrics.d_failures)},
+    {"healthCheckFailuresParsing", (double)(a->d_healthCheckMetrics.d_parseErrors)},
+    {"healthCheckFailuresTimeout", (double)(a->d_healthCheckMetrics.d_timeOuts)},
+    {"healthCheckFailuresNetwork", (double)(a->d_healthCheckMetrics.d_networkErrors)},
+    {"healthCheckFailuresMismatch", (double)(a->d_healthCheckMetrics.d_mismatchErrors)},
+    {"healthCheckFailuresInvalid", (double)(a->d_healthCheckMetrics.d_invalidResponseErrors)},
      {"dropRate", (double)a->dropRate}
    };
  
diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh

index fca2e3c4dbdd2dd258b800b0cc4354bd0d7aa55f..be2e6ebb7c5edbb84cb56c2cf0c96b845f34cfee 100644 (file)
--- a/pdns/dnsdist.hh
+++ b/pdns/dnsdist.hh
@@ -701,6 +701,16 @@ struct DownstreamState: public std::enable_shared_from_this<DownstreamState>
      bool d_upgradeToLazyHealthChecks{false};
    };
  
+  struct HealthCheckMetrics // per-backend health-check failure counters, exported via carbon, Prometheus and the JSON API
+  {
+    stat_t d_failures{0}; // total failed checks: incremented in submitHealthCheckResult() whenever newResult is false
+    stat_t d_timeOuts{0}; // timed out while waiting for the health-check response
+    stat_t d_parseErrors{0}; // response shorter than a DNS header, or an exception thrown while inspecting the response
+    stat_t d_networkErrors{0}; // I/O error, receive error, response from an unexpected address, or exception in the TCP callback
+    stat_t d_mismatchErrors{0}; // response ID, or qname/qtype/qclass, does not match the query that was sent
+    stat_t d_invalidResponseErrors{0}; // QR bit not set, ServFail, or NXDomain/Refused while mustResolve is set
+  };
+
    DownstreamState(DownstreamState::Config&& config, std::shared_ptr<TLSCtx> tlsCtx, bool connect);
    DownstreamState(const ComboAddress& remote): DownstreamState(DownstreamState::Config(remote), nullptr, false)
    {
@@ -709,6 +719,7 @@ struct DownstreamState: public std::enable_shared_from_this<DownstreamState>
    ~DownstreamState();
  
    Config d_config;
+  HealthCheckMetrics d_healthCheckMetrics;
    stat_t sendErrors{0};
    stat_t outstanding{0};
    stat_t reuseds{0};
@@ -767,18 +778,20 @@ public:
    double latencyUsecTCP{0.0};
    unsigned int d_nextCheck{0};
    uint16_t currentCheckFailures{0};
-  uint8_t consecutiveSuccessfulChecks{0};
    std::atomic<bool> hashesComputed{false};
    std::atomic<bool> connected{false};
    bool upStatus{false};
  
  private:
+  void handleUDPTimeout(IDState& ids);
+  void updateNextLazyHealthCheck(LazyHealthCheckStats& stats, bool checkScheduled, std::optional<time_t> currentTime = std::nullopt);
    void connectUDPSockets();
  
    std::thread tid;
    std::mutex connectLock;
    std::condition_variable d_connectedWait;
    std::atomic_flag threadStarted;
+  uint8_t consecutiveSuccessfulChecks{0};
    bool d_stopped{false};
  public:
  
@@ -935,9 +948,6 @@ public:
    static int s_udpTimeout;
    static bool s_randomizeSockets;
    static bool s_randomizeIDs;
-private:
-  void handleUDPTimeout(IDState& ids);
-  void updateNextLazyHealthCheck(LazyHealthCheckStats& stats, bool checkScheduled, std::optional<time_t> currentTime = std::nullopt);
  };
  using servers_t = vector<std::shared_ptr<DownstreamState>>;
  
diff --git a/pdns/dnsdistdist/dnsdist-backend.cc b/pdns/dnsdistdist/dnsdist-backend.cc

index 87326c93a59fd425a3f571d181d57536ebc7712c..45a50446da71105f47323cc1808f0f1d59c6809b 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-backend.cc
+++ b/pdns/dnsdistdist/dnsdist-backend.cc
@@ -700,6 +700,10 @@ void DownstreamState::updateNextLazyHealthCheck(LazyHealthCheckStats& stats, boo
  
  void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
  {
+  if (!newResult) {
+    ++d_healthCheckMetrics.d_failures;
+  }
+
    if (initial) {
      /* if this is the initial health-check, at startup, we do not care
         about the minimum number of failed/successful health-checks */
diff --git a/pdns/dnsdistdist/dnsdist-healthchecks.cc b/pdns/dnsdistdist/dnsdist-healthchecks.cc

index dc77022fa39b3ffab3376cadf0b558430961a8ed..cc73a9d7547872423be0b811ce527be642e547a2 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-healthchecks.cc
+++ b/pdns/dnsdistdist/dnsdist-healthchecks.cc
@@ -60,6 +60,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
    auto& ds = data->d_ds;
    try {
      if (data->d_buffer.size() < sizeof(dnsheader)) {
+      ++data->d_ds->d_healthCheckMetrics.d_parseErrors;
        if (g_verboseHealthChecks) {
          infolog("Invalid health check response of size %d from backend %s, expecting at least %d", data->d_buffer.size(), ds->getNameWithAddr(), sizeof(dnsheader));
        }
@@ -68,6 +69,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
  
      const dnsheader * responseHeader = reinterpret_cast<const dnsheader*>(data->d_buffer.data());
      if (responseHeader->id != data->d_queryID) {
+      ++data->d_ds->d_healthCheckMetrics.d_mismatchErrors;
        if (g_verboseHealthChecks) {
          infolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds->getNameWithAddr(), data->d_queryID);
        }
@@ -75,6 +77,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
      }
  
      if (!responseHeader->qr) {
+      ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
        if (g_verboseHealthChecks) {
          infolog("Invalid health check response from backend %s, expecting QR to be set", ds->getNameWithAddr());
        }
@@ -82,6 +85,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
      }
  
      if (responseHeader->rcode == RCode::ServFail) {
+      ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
        if (g_verboseHealthChecks) {
          infolog("Backend %s responded to health check with ServFail", ds->getNameWithAddr());
        }
@@ -89,6 +93,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
      }
  
      if (ds->d_config.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) {
+      ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
        if (g_verboseHealthChecks) {
          infolog("Backend %s responded to health check with %s while mustResolve is set", ds->getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? "NXDomain" : "Refused");
        }
@@ -100,13 +105,15 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
      DNSName receivedName(reinterpret_cast<const char*>(data->d_buffer.data()), data->d_buffer.size(), sizeof(dnsheader), false, &receivedType, &receivedClass);
  
      if (receivedName != data->d_checkName || receivedType != data->d_checkType || receivedClass != data->d_checkClass) {
+      ++data->d_ds->d_healthCheckMetrics.d_mismatchErrors;
        if (g_verboseHealthChecks) {
          infolog("Backend %s responded to health check with an invalid qname (%s vs %s), qtype (%s vs %s) or qclass (%d vs %d)", ds->getNameWithAddr(), receivedName.toLogString(), data->d_checkName.toLogString(), QType(receivedType).toString(), QType(data->d_checkType).toString(), receivedClass, data->d_checkClass);
        }
        return false;
      }
    }
-  catch(const std::exception& e) {
+  catch (const std::exception& e) {
+    ++data->d_ds->d_healthCheckMetrics.d_parseErrors;
      if (g_verboseHealthChecks) {
        infolog("Error checking the health of backend %s: %s", ds->getNameWithAddr(), e.what());
      }
@@ -151,6 +158,7 @@ public:
  
    void notifyIOError(InternalQueryState&& query, const struct timeval& now) override
    {
+    ++d_data->d_ds->d_healthCheckMetrics.d_networkErrors;
      d_data->d_ds->submitHealthCheckResult(d_data->d_initial, false);
    }
  
@@ -173,6 +181,7 @@ static void healthCheckUDPCallback(int fd, FDMultiplexer::funcparam_t& param)
      if (g_verboseHealthChecks) {
        infolog("Error receiving health check response from %s: %s", data->d_ds->d_config.remote.toStringWithPort(), stringerror(savederrno));
      }
+    ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
      data->d_ds->submitHealthCheckResult(data->d_initial, false);
      return;
    }
@@ -183,6 +192,7 @@ static void healthCheckUDPCallback(int fd, FDMultiplexer::funcparam_t& param)
      if (g_verboseHealthChecks) {
        infolog("Invalid health check response received from %s, expecting one from %s", from.toStringWithPort(), data->d_ds->d_config.remote.toStringWithPort());
      }
+    ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
      data->d_ds->submitHealthCheckResult(data->d_initial, false);
      return;
    }
@@ -248,6 +258,7 @@ static void healthCheckTCPCallback(int fd, FDMultiplexer::funcparam_t& param)
      ioGuard.release();
    }
    catch (const std::exception& e) {
+    ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
      data->d_ds->submitHealthCheckResult(data->d_initial, false);
      if (g_verboseHealthChecks) {
        infolog("Error checking the health of backend %s: %s", data->d_ds->getNameWithAddr(), e.what());
@@ -444,6 +455,7 @@ void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial)
            infolog("Timeout while waiting for the health check response (ID %d) from backend %s", data->d_queryID, data->d_ds->getNameWithAddr());
          }
  
+        ++data->d_ds->d_healthCheckMetrics.d_timeOuts;
          data->d_ds->submitHealthCheckResult(initial, false);
        }
        catch (const std::exception& e) {
@@ -471,6 +483,7 @@ void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial)
            infolog("Timeout while waiting for the health check response (ID %d) from backend %s", data->d_queryID, data->d_ds->getNameWithAddr());
          }
  
+        ++data->d_ds->d_healthCheckMetrics.d_timeOuts;
          data->d_ds->submitHealthCheckResult(initial, false);
        }
        catch (const std::exception& e) {
diff --git a/pdns/dnsdistdist/docs/guides/webserver.rst b/pdns/dnsdistdist/docs/guides/webserver.rst

index 540b2274720d3052661000e968c2deb669361e4f..453f6d1bcab8425bbc1a8370dfb652e727e68089 100755 (executable)
--- a/pdns/dnsdistdist/docs/guides/webserver.rst
+++ b/pdns/dnsdistdist/docs/guides/webserver.rst
@@ -955,6 +955,12 @@ JSON Objects
    :property integer tlsResumptions: The number of times a TLS session has been resumed
    :property integer weight: The weight assigned to this server
    :property float dropRate: The amount of packets dropped (timing out) per second by this server
+  :property integer healthCheckFailures: Number of health check attempts that failed (total)
+  :property integer healthCheckFailuresParsing: Number of health check attempts that failed because the payload could not be parsed
+  :property integer healthCheckFailuresTimeout: Number of health check attempts that failed because the response was not received in time
+  :property integer healthCheckFailuresNetwork: Number of health check attempts that failed because of a network error
+  :property integer healthCheckFailuresMismatch: Number of health check attempts that failed because the ID, qname, qtype or qclass did not match
+  :property integer healthCheckFailuresInvalid: Number of health check attempts that failed because the DNS response was not valid
  
  .. json:object:: StatisticItem
  
diff --git a/regression-tests.dnsdist/test_API.py b/regression-tests.dnsdist/test_API.py

index 2a9f9bf3a2483efa72e05d17f9b1dad3e3ab0329..54e0f94177f36f2f60012e36d479c1b1a331ed77 100644 (file)
--- a/regression-tests.dnsdist/test_API.py
+++ b/regression-tests.dnsdist/test_API.py
@@ -144,7 +144,7 @@ class TestAPIBasics(APITestsBase):
                          'dropRate', 'responses', 'nonCompliantResponses', 'tcpDiedSendingQuery', 'tcpDiedReadingResponse',
                          'tcpGaveUp', 'tcpReadTimeouts', 'tcpWriteTimeouts', 'tcpCurrentConnections',
                          'tcpNewConnections', 'tcpReusedConnections', 'tlsResumptions', 'tcpAvgQueriesPerConnection',
-                        'tcpAvgConnectionDuration', 'tcpLatency', 'protocol']:
+                        'tcpAvgConnectionDuration', 'tcpLatency', 'protocol', 'healthCheckFailures', 'healthCheckFailuresParsing', 'healthCheckFailuresTimeout', 'healthCheckFailuresNetwork', 'healthCheckFailuresMismatch', 'healthCheckFailuresInvalid']:
                  self.assertIn(key, server)
  
              for key in ['id', 'latency', 'weight', 'outstanding', 'qpsLimit', 'reuseds',
diff --git a/regression-tests.dnsdist/test_HealthChecks.py b/regression-tests.dnsdist/test_HealthChecks.py

index f5554abfdd4ba4620f736c4f2e3ca6fe834b6ff0..c1ec51b85d6403a0af37e759778c3e603f105fe7 100644 (file)
--- a/regression-tests.dnsdist/test_HealthChecks.py
+++ b/regression-tests.dnsdist/test_HealthChecks.py
@@ -1,24 +1,44 @@
  #!/usr/bin/env python
  import base64
+import requests
+import ssl
  import threading
  import time
-import ssl
  import dns
  from dnsdisttests import DNSDistTest
  
  class HealthCheckTest(DNSDistTest):
      _consoleKey = DNSDistTest.generateConsoleKey()
      _consoleKeyB64 = base64.b64encode(_consoleKey).decode('ascii')
-    _config_params = ['_consoleKeyB64', '_consolePort', '_testServerPort']
+    _webTimeout = 2.0
+    _webServerPort = 8083
+    _webServerAPIKey = 'apisecret'
+    _webServerAPIKeyHashed = '$scrypt$ln=10,p=1,r=8$9v8JxDfzQVyTpBkTbkUqYg==$bDQzAOHeK1G9UvTPypNhrX48w974ZXbFPtRKS34+aso='
+    _config_params = ['_consoleKeyB64', '_consolePort', '_webServerPort', '_webServerAPIKeyHashed', '_testServerPort']
      _config_template = """
      setKey("%s")
      controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
      newServer{address="127.0.0.1:%d"}
      """
  
      def getBackendStatus(self):
          return self.sendConsoleCommand("if getServer(0):isUp() then return 'up' else return 'down' end").strip("\n")
  
+    def getBackendMetric(self, backendID, metricName):  # returns metricName of servers[backendID] from the web API, as an int
+        headers = {'x-api-key': self._webServerAPIKey}  # plain-text key; the config template is given the hashed form
+        url = 'http://127.0.0.1:' + str(self._webServerPort) + '/api/v1/servers/localhost'
+        r = requests.get(url, headers=headers, timeout=self._webTimeout)
+        self.assertTrue(r)
+        self.assertEqual(r.status_code, 200)
+        self.assertTrue(r.json())
+        content = r.json()
+        self.assertIn('servers', content)
+        servers = content['servers']
+        server = servers[backendID]  # backendID is used as an index into the 'servers' list
+        return int(server[metricName])  # the value is converted with int() before being returned
+
  class TestDefaultHealthCheck(HealthCheckTest):
      # this test suite uses a different responder port
      # because we need fresh counters
@@ -31,6 +51,7 @@ class TestDefaultHealthCheck(HealthCheckTest):
          before = TestDefaultHealthCheck._healthCheckCounter
          time.sleep(1.5)
          self.assertGreater(TestDefaultHealthCheck._healthCheckCounter, before)
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
          self.assertEqual(self.getBackendStatus(), 'up')
  
          self.sendConsoleCommand("getServer(0):setUp()")
@@ -64,6 +85,7 @@ class TestDefaultHealthCheck(HealthCheckTest):
          time.sleep(1.5)
          self.assertGreater(TestDefaultHealthCheck._healthCheckCounter, before)
          self.assertEqual(self.getBackendStatus(), 'up')
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
  
  class TestHealthCheckForcedUP(HealthCheckTest):
      # this test suite uses a different responder port
@@ -73,6 +95,8 @@ class TestHealthCheckForcedUP(HealthCheckTest):
      _config_template = """
      setKey("%s")
      controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
      srv = newServer{address="127.0.0.1:%d"}
      srv:setUp()
      """
@@ -85,6 +109,7 @@ class TestHealthCheckForcedUP(HealthCheckTest):
          time.sleep(1.5)
          self.assertEqual(TestHealthCheckForcedUP._healthCheckCounter, before)
          self.assertEqual(self.getBackendStatus(), 'up')
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
  
  class TestHealthCheckForcedDown(HealthCheckTest):
      # this test suite uses a different responder port
@@ -94,6 +119,8 @@ class TestHealthCheckForcedDown(HealthCheckTest):
      _config_template = """
      setKey("%s")
      controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
      srv = newServer{address="127.0.0.1:%d"}
      srv:setDown()
      """
@@ -105,6 +132,7 @@ class TestHealthCheckForcedDown(HealthCheckTest):
          before = TestHealthCheckForcedDown._healthCheckCounter
          time.sleep(1.5)
          self.assertEqual(TestHealthCheckForcedDown._healthCheckCounter, before)
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
  
  class TestHealthCheckCustomName(HealthCheckTest):
      # this test suite uses a different responder port
@@ -112,10 +140,12 @@ class TestHealthCheckCustomName(HealthCheckTest):
      _testServerPort = 5383
  
      _healthCheckName = 'powerdns.com.'
-    _config_params = ['_consoleKeyB64', '_consolePort', '_testServerPort', '_healthCheckName']
+    _config_params = ['_consoleKeyB64', '_consolePort', '_webServerPort', '_webServerAPIKeyHashed', '_testServerPort', '_healthCheckName']
      _config_template = """
      setKey("%s")
      controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
      srv = newServer{address="127.0.0.1:%d", checkName='%s'}
      """
  
@@ -127,6 +157,7 @@ class TestHealthCheckCustomName(HealthCheckTest):
          time.sleep(1.5)
          self.assertGreater(TestHealthCheckCustomName._healthCheckCounter, before)
          self.assertEqual(self.getBackendStatus(), 'up')
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
  
  class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
      # this test suite uses a different responder port
@@ -137,6 +168,8 @@ class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
      _config_template = """
      setKey("%s")
      controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
      srv = newServer{address="127.0.0.1:%d", checkName='powerdns.com.'}
      """
  
@@ -148,6 +181,8 @@ class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
          time.sleep(1.5)
          self.assertEqual(TestHealthCheckCustomNameNoAnswer._healthCheckCounter, before)
          self.assertEqual(self.getBackendStatus(), 'down')
+        self.assertGreater(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+        self.assertGreater(self.getBackendMetric(0, 'healthCheckFailuresTimeout'), 0)
  
  class TestHealthCheckCustomFunction(HealthCheckTest):
      # this test suite uses a different responder port
@@ -159,6 +194,8 @@ class TestHealthCheckCustomFunction(HealthCheckTest):
      _config_template = """
      setKey("%s")
      controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
  
      function myHealthCheckFunction(qname, qtype, qclass, dh)
        dh:setCD(true)
author	Remi Gacogne <remi.gacogne@powerdns.com>
	Mon, 10 Jul 2023 11:50:57 +0000 (13:50 +0200)
committer	Remi Gacogne <remi.gacogne@powerdns.com>
	Mon, 10 Jul 2023 14:35:41 +0000 (16:35 +0200)
pdns/dnsdist-carbon.cc		patch \| blob \| blame \| history
pdns/dnsdist-web.cc		patch \| blob \| blame \| history
pdns/dnsdist.hh		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-backend.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-healthchecks.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/docs/guides/webserver.rst		patch \| blob \| blame \| history
regression-tests.dnsdist/test_API.py		patch \| blob \| blame \| history
regression-tests.dnsdist/test_HealthChecks.py		patch \| blob \| blame \| history