git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add metrics for health-check failures
author Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 10 Jul 2023 11:50:57 +0000 (13:50 +0200)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 10 Jul 2023 14:35:41 +0000 (16:35 +0200)
pdns/dnsdist-carbon.cc
pdns/dnsdist-web.cc
pdns/dnsdist.hh
pdns/dnsdistdist/dnsdist-backend.cc
pdns/dnsdistdist/dnsdist-healthchecks.cc
pdns/dnsdistdist/docs/guides/webserver.rst
regression-tests.dnsdist/test_API.py
regression-tests.dnsdist/test_HealthChecks.py

index 65f739721e87216451b310f6e4884b29e4558c66..693f498c435f8564126f9a53c0934d09c273a5ef 100644 (file)
@@ -98,6 +98,12 @@ static bool doOneCarbonExport(const Carbon::Endpoint& endpoint)
       str << base << "tcpavgqueriesperconnection" << ' ' << state->tcpAvgQueriesPerConnection.load() << " " << now << "\r\n";
       str << base << "tcpavgconnectionduration" << ' ' << state->tcpAvgConnectionDuration.load() << " " << now << "\r\n";
       str << base << "tcptoomanyconcurrentconnections" << ' ' << state->tcpTooManyConcurrentConnections.load() << " " << now << "\r\n";
+      str << base << "healthcheckfailures" << ' ' << state->d_healthCheckMetrics.d_failures << " " << now << "\r\n";
+      str << base << "healthcheckfailuresparsing" << ' ' << state->d_healthCheckMetrics.d_parseErrors << " " << now << "\r\n";
+      str << base << "healthcheckfailurestimeout" << ' ' << state->d_healthCheckMetrics.d_timeOuts << " " << now << "\r\n";
+      str << base << "healthcheckfailuresnetwork" << ' ' << state->d_healthCheckMetrics.d_networkErrors << " " << now << "\r\n";
+      str << base << "healthcheckfailuresmismatch" << ' ' << state->d_healthCheckMetrics.d_mismatchErrors << " " << now << "\r\n";
+      str << base << "healthcheckfailuresinvalid" << ' ' << state->d_healthCheckMetrics.d_invalidResponseErrors << " " << now << "\r\n";
     }
 
     std::map<std::string, uint64_t> frontendDuplicates;
index d47f33cd63318759d7460850093b0a2c361d22a0..d1132d37839fc9609bee776ac1cab5907444949c 100644 (file)
@@ -593,6 +593,18 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << statesbase << "tlsresumptions "                  << "counter"                                                                              << "\n";
   output << "# HELP " << statesbase << "tcplatency "                      << "Server's latency when answering TCP questions in milliseconds"                        << "\n";
   output << "# TYPE " << statesbase << "tcplatency "                      << "gauge"                                                                                << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailures "             << "Number of health check attempts that failed (total)"                                  << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailures "             << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresparsing "      << "Number of health check attempts where the response could not be parsed"               << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresparsing "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailurestimeout "      << "Number of health check attempts where the response did not arrive in time"            << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailurestimeout "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresnetwork "      << "Number of health check attempts that experienced a network issue"                     << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresnetwork "      << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresmismatch "     << "Number of health check attempts where the response did not match the query"           << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresmismatch "     << "counter"                                                                              << "\n";
+  output << "# HELP " << statesbase << "healthcheckfailuresinvalid "      << "Number of health check attempts where the DNS response was invalid"                   << "\n";
+  output << "# TYPE " << statesbase << "healthcheckfailuresinvalid "      << "counter"                                                                              << "\n";
 
   for (const auto& state : *states) {
     string serverName;
@@ -636,6 +648,12 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
     output << statesbase << "tcpavgqueriesperconn"             << label << " " << state->tcpAvgQueriesPerConnection      << "\n";
     output << statesbase << "tcpavgconnduration"               << label << " " << state->tcpAvgConnectionDuration        << "\n";
     output << statesbase << "tlsresumptions"                   << label << " " << state->tlsResumptions                  << "\n";
+    output << statesbase << "healthcheckfailures"              << label << " " << state->d_healthCheckMetrics.d_failures << "\n";
+    output << statesbase << "healthcheckfailuresparsing"       << label << " " << state->d_healthCheckMetrics.d_parseErrors << "\n";
+    output << statesbase << "healthcheckfailurestimeout"       << label << " " << state->d_healthCheckMetrics.d_timeOuts << "\n";
+    output << statesbase << "healthcheckfailuresnetwork"       << label << " " << state->d_healthCheckMetrics.d_networkErrors << "\n";
+    output << statesbase << "healthcheckfailuresmismatch"      << label << " " << state->d_healthCheckMetrics.d_mismatchErrors << "\n";
+    output << statesbase << "healthcheckfailuresinvalid"        << label << " " << state->d_healthCheckMetrics.d_invalidResponseErrors << "\n";
   }
 
   const string frontsbase = "dnsdist_frontend_";
@@ -673,7 +691,6 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
   output << "# TYPE " << frontsbase << "tlsunknownticketkeys " << "counter" << "\n";
   output << "# HELP " << frontsbase << "tlsinactiveticketkeys " << "Amount of TLS sessions resumed from an inactive key" << "\n";
   output << "# TYPE " << frontsbase << "tlsinactiveticketkeys " << "counter" << "\n";
-
   output << "# HELP " << frontsbase << "tlshandshakefailures " << "Amount of TLS handshake failures" << "\n";
   output << "# TYPE " << frontsbase << "tlshandshakefailures " << "counter" << "\n";
 
@@ -1059,6 +1076,12 @@ static void addServerToJSON(Json::array& servers, int id, const std::shared_ptr<
     {"tcpAvgConnectionDuration", (double)a->tcpAvgConnectionDuration},
     {"tlsResumptions", (double)a->tlsResumptions},
     {"tcpLatency", (double)(a->latencyUsecTCP/1000.0)},
+    {"healthCheckFailures", (double)(a->d_healthCheckMetrics.d_failures)},
+    {"healthCheckFailuresParsing", (double)(a->d_healthCheckMetrics.d_parseErrors)},
+    {"healthCheckFailuresTimeout", (double)(a->d_healthCheckMetrics.d_timeOuts)},
+    {"healthCheckFailuresNetwork", (double)(a->d_healthCheckMetrics.d_networkErrors)},
+    {"healthCheckFailuresMismatch", (double)(a->d_healthCheckMetrics.d_mismatchErrors)},
+    {"healthCheckFailuresInvalid", (double)(a->d_healthCheckMetrics.d_invalidResponseErrors)},
     {"dropRate", (double)a->dropRate}
   };
 
index fca2e3c4dbdd2dd258b800b0cc4354bd0d7aa55f..be2e6ebb7c5edbb84cb56c2cf0c96b845f34cfee 100644 (file)
@@ -701,6 +701,16 @@ struct DownstreamState: public std::enable_shared_from_this<DownstreamState>
     bool d_upgradeToLazyHealthChecks{false};
   };
 
+  struct HealthCheckMetrics
+  {
+    stat_t d_failures{0};
+    stat_t d_timeOuts{0};
+    stat_t d_parseErrors{0};
+    stat_t d_networkErrors{0};
+    stat_t d_mismatchErrors{0};
+    stat_t d_invalidResponseErrors{0};
+  };
+
   DownstreamState(DownstreamState::Config&& config, std::shared_ptr<TLSCtx> tlsCtx, bool connect);
   DownstreamState(const ComboAddress& remote): DownstreamState(DownstreamState::Config(remote), nullptr, false)
   {
@@ -709,6 +719,7 @@ struct DownstreamState: public std::enable_shared_from_this<DownstreamState>
   ~DownstreamState();
 
   Config d_config;
+  HealthCheckMetrics d_healthCheckMetrics;
   stat_t sendErrors{0};
   stat_t outstanding{0};
   stat_t reuseds{0};
@@ -767,18 +778,20 @@ public:
   double latencyUsecTCP{0.0};
   unsigned int d_nextCheck{0};
   uint16_t currentCheckFailures{0};
-  uint8_t consecutiveSuccessfulChecks{0};
   std::atomic<bool> hashesComputed{false};
   std::atomic<bool> connected{false};
   bool upStatus{false};
 
 private:
+  void handleUDPTimeout(IDState& ids);
+  void updateNextLazyHealthCheck(LazyHealthCheckStats& stats, bool checkScheduled, std::optional<time_t> currentTime = std::nullopt);
   void connectUDPSockets();
 
   std::thread tid;
   std::mutex connectLock;
   std::condition_variable d_connectedWait;
   std::atomic_flag threadStarted;
+  uint8_t consecutiveSuccessfulChecks{0};
   bool d_stopped{false};
 public:
 
@@ -935,9 +948,6 @@ public:
   static int s_udpTimeout;
   static bool s_randomizeSockets;
   static bool s_randomizeIDs;
-private:
-  void handleUDPTimeout(IDState& ids);
-  void updateNextLazyHealthCheck(LazyHealthCheckStats& stats, bool checkScheduled, std::optional<time_t> currentTime = std::nullopt);
 };
 using servers_t = vector<std::shared_ptr<DownstreamState>>;
 
index 87326c93a59fd425a3f571d181d57536ebc7712c..45a50446da71105f47323cc1808f0f1d59c6809b 100644 (file)
@@ -700,6 +700,10 @@ void DownstreamState::updateNextLazyHealthCheck(LazyHealthCheckStats& stats, boo
 
 void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
 {
+  if (!newResult) {
+    ++d_healthCheckMetrics.d_failures;
+  }
+
   if (initial) {
     /* if this is the initial health-check, at startup, we do not care
        about the minimum number of failed/successful health-checks */
index dc77022fa39b3ffab3376cadf0b558430961a8ed..cc73a9d7547872423be0b811ce527be642e547a2 100644 (file)
@@ -60,6 +60,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
   auto& ds = data->d_ds;
   try {
     if (data->d_buffer.size() < sizeof(dnsheader)) {
+      ++data->d_ds->d_healthCheckMetrics.d_parseErrors;
       if (g_verboseHealthChecks) {
         infolog("Invalid health check response of size %d from backend %s, expecting at least %d", data->d_buffer.size(), ds->getNameWithAddr(), sizeof(dnsheader));
       }
@@ -68,6 +69,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
 
     const dnsheader * responseHeader = reinterpret_cast<const dnsheader*>(data->d_buffer.data());
     if (responseHeader->id != data->d_queryID) {
+      ++data->d_ds->d_healthCheckMetrics.d_mismatchErrors;
       if (g_verboseHealthChecks) {
         infolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds->getNameWithAddr(), data->d_queryID);
       }
@@ -75,6 +77,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     }
 
     if (!responseHeader->qr) {
+      ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
       if (g_verboseHealthChecks) {
         infolog("Invalid health check response from backend %s, expecting QR to be set", ds->getNameWithAddr());
       }
@@ -82,6 +85,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     }
 
     if (responseHeader->rcode == RCode::ServFail) {
+      ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
       if (g_verboseHealthChecks) {
         infolog("Backend %s responded to health check with ServFail", ds->getNameWithAddr());
       }
@@ -89,6 +93,7 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     }
 
     if (ds->d_config.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) {
+      ++data->d_ds->d_healthCheckMetrics.d_invalidResponseErrors;
       if (g_verboseHealthChecks) {
         infolog("Backend %s responded to health check with %s while mustResolve is set", ds->getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? "NXDomain" : "Refused");
       }
@@ -100,13 +105,15 @@ static bool handleResponse(std::shared_ptr<HealthCheckData>& data)
     DNSName receivedName(reinterpret_cast<const char*>(data->d_buffer.data()), data->d_buffer.size(), sizeof(dnsheader), false, &receivedType, &receivedClass);
 
     if (receivedName != data->d_checkName || receivedType != data->d_checkType || receivedClass != data->d_checkClass) {
+      ++data->d_ds->d_healthCheckMetrics.d_mismatchErrors;
       if (g_verboseHealthChecks) {
         infolog("Backend %s responded to health check with an invalid qname (%s vs %s), qtype (%s vs %s) or qclass (%d vs %d)", ds->getNameWithAddr(), receivedName.toLogString(), data->d_checkName.toLogString(), QType(receivedType).toString(), QType(data->d_checkType).toString(), receivedClass, data->d_checkClass);
       }
       return false;
     }
   }
-  catch(const std::exception& e) {
+  catch (const std::exception& e) {
+    ++data->d_ds->d_healthCheckMetrics.d_parseErrors;
     if (g_verboseHealthChecks) {
       infolog("Error checking the health of backend %s: %s", ds->getNameWithAddr(), e.what());
     }
@@ -151,6 +158,7 @@ public:
 
   void notifyIOError(InternalQueryState&& query, const struct timeval& now) override
   {
+    ++d_data->d_ds->d_healthCheckMetrics.d_networkErrors;
     d_data->d_ds->submitHealthCheckResult(d_data->d_initial, false);
   }
 
@@ -173,6 +181,7 @@ static void healthCheckUDPCallback(int fd, FDMultiplexer::funcparam_t& param)
     if (g_verboseHealthChecks) {
       infolog("Error receiving health check response from %s: %s", data->d_ds->d_config.remote.toStringWithPort(), stringerror(savederrno));
     }
+    ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
     data->d_ds->submitHealthCheckResult(data->d_initial, false);
     return;
   }
@@ -183,6 +192,7 @@ static void healthCheckUDPCallback(int fd, FDMultiplexer::funcparam_t& param)
     if (g_verboseHealthChecks) {
       infolog("Invalid health check response received from %s, expecting one from %s", from.toStringWithPort(), data->d_ds->d_config.remote.toStringWithPort());
     }
+    ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
     data->d_ds->submitHealthCheckResult(data->d_initial, false);
     return;
   }
@@ -248,6 +258,7 @@ static void healthCheckTCPCallback(int fd, FDMultiplexer::funcparam_t& param)
     ioGuard.release();
   }
   catch (const std::exception& e) {
+    ++data->d_ds->d_healthCheckMetrics.d_networkErrors;
     data->d_ds->submitHealthCheckResult(data->d_initial, false);
     if (g_verboseHealthChecks) {
       infolog("Error checking the health of backend %s: %s", data->d_ds->getNameWithAddr(), e.what());
@@ -444,6 +455,7 @@ void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial)
           infolog("Timeout while waiting for the health check response (ID %d) from backend %s", data->d_queryID, data->d_ds->getNameWithAddr());
         }
 
+        ++data->d_ds->d_healthCheckMetrics.d_timeOuts;
         data->d_ds->submitHealthCheckResult(initial, false);
       }
       catch (const std::exception& e) {
@@ -471,6 +483,7 @@ void handleQueuedHealthChecks(FDMultiplexer& mplexer, bool initial)
           infolog("Timeout while waiting for the health check response (ID %d) from backend %s", data->d_queryID, data->d_ds->getNameWithAddr());
         }
 
+        ++data->d_ds->d_healthCheckMetrics.d_timeOuts;
         data->d_ds->submitHealthCheckResult(initial, false);
       }
       catch (const std::exception& e) {
index 540b2274720d3052661000e968c2deb669361e4f..453f6d1bcab8425bbc1a8370dfb652e727e68089 100755 (executable)
@@ -955,6 +955,12 @@ JSON Objects
   :property integer tlsResumptions: The number of times a TLS session has been resumed
   :property integer weight: The weight assigned to this server
   :property float dropRate: The amount of packets dropped (timing out) per second by this server
+  :property integer healthCheckFailures: Number of health check attempts that failed (total)
+  :property integer healthCheckFailuresParsing: Number of health check attempts that failed because the payload could not be parsed
+  :property integer healthCheckFailuresTimeout: Number of health check attempts that failed because the response was not received in time
+  :property integer healthCheckFailuresNetwork: Number of health check attempts that failed because of a network error
+  :property integer healthCheckFailuresMismatch: Number of health check attempts that failed because the ID, qname, qtype or qclass did not match
+  :property integer healthCheckFailuresInvalid: Number of health check attempts that failed because the DNS response was not valid
 
 .. json:object:: StatisticItem
 
index 2a9f9bf3a2483efa72e05d17f9b1dad3e3ab0329..54e0f94177f36f2f60012e36d479c1b1a331ed77 100644 (file)
@@ -144,7 +144,7 @@ class TestAPIBasics(APITestsBase):
                         'dropRate', 'responses', 'nonCompliantResponses', 'tcpDiedSendingQuery', 'tcpDiedReadingResponse',
                         'tcpGaveUp', 'tcpReadTimeouts', 'tcpWriteTimeouts', 'tcpCurrentConnections',
                         'tcpNewConnections', 'tcpReusedConnections', 'tlsResumptions', 'tcpAvgQueriesPerConnection',
-                        'tcpAvgConnectionDuration', 'tcpLatency', 'protocol']:
+                        'tcpAvgConnectionDuration', 'tcpLatency', 'protocol', 'healthCheckFailures', 'healthCheckFailuresParsing', 'healthCheckFailuresTimeout', 'healthCheckFailuresNetwork', 'healthCheckFailuresMismatch', 'healthCheckFailuresInvalid']:
                 self.assertIn(key, server)
 
             for key in ['id', 'latency', 'weight', 'outstanding', 'qpsLimit', 'reuseds',
index f5554abfdd4ba4620f736c4f2e3ca6fe834b6ff0..c1ec51b85d6403a0af37e759778c3e603f105fe7 100644 (file)
@@ -1,24 +1,44 @@
 #!/usr/bin/env python
 import base64
+import requests
+import ssl
 import threading
 import time
-import ssl
 import dns
 from dnsdisttests import DNSDistTest
 
 class HealthCheckTest(DNSDistTest):
     _consoleKey = DNSDistTest.generateConsoleKey()
     _consoleKeyB64 = base64.b64encode(_consoleKey).decode('ascii')
-    _config_params = ['_consoleKeyB64', '_consolePort', '_testServerPort']
+    _webTimeout = 2.0
+    _webServerPort = 8083
+    _webServerAPIKey = 'apisecret'
+    _webServerAPIKeyHashed = '$scrypt$ln=10,p=1,r=8$9v8JxDfzQVyTpBkTbkUqYg==$bDQzAOHeK1G9UvTPypNhrX48w974ZXbFPtRKS34+aso='
+    _config_params = ['_consoleKeyB64', '_consolePort', '_webServerPort', '_webServerAPIKeyHashed', '_testServerPort']
     _config_template = """
     setKey("%s")
     controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
     newServer{address="127.0.0.1:%d"}
     """
 
     def getBackendStatus(self):
         return self.sendConsoleCommand("if getServer(0):isUp() then return 'up' else return 'down' end").strip("\n")
 
+    def getBackendMetric(self, backendID, metricName):
+        headers = {'x-api-key': self._webServerAPIKey}
+        url = 'http://127.0.0.1:' + str(self._webServerPort) + '/api/v1/servers/localhost'
+        r = requests.get(url, headers=headers, timeout=self._webTimeout)
+        self.assertTrue(r)
+        self.assertEqual(r.status_code, 200)
+        self.assertTrue(r.json())
+        content = r.json()
+        self.assertIn('servers', content)
+        servers = content['servers']
+        server = servers[backendID]
+        return int(server[metricName])
+
 class TestDefaultHealthCheck(HealthCheckTest):
     # this test suite uses a different responder port
     # because we need fresh counters
@@ -31,6 +51,7 @@ class TestDefaultHealthCheck(HealthCheckTest):
         before = TestDefaultHealthCheck._healthCheckCounter
         time.sleep(1.5)
         self.assertGreater(TestDefaultHealthCheck._healthCheckCounter, before)
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
         self.assertEqual(self.getBackendStatus(), 'up')
 
         self.sendConsoleCommand("getServer(0):setUp()")
@@ -64,6 +85,7 @@ class TestDefaultHealthCheck(HealthCheckTest):
         time.sleep(1.5)
         self.assertGreater(TestDefaultHealthCheck._healthCheckCounter, before)
         self.assertEqual(self.getBackendStatus(), 'up')
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
 
 class TestHealthCheckForcedUP(HealthCheckTest):
     # this test suite uses a different responder port
@@ -73,6 +95,8 @@ class TestHealthCheckForcedUP(HealthCheckTest):
     _config_template = """
     setKey("%s")
     controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
     srv = newServer{address="127.0.0.1:%d"}
     srv:setUp()
     """
@@ -85,6 +109,7 @@ class TestHealthCheckForcedUP(HealthCheckTest):
         time.sleep(1.5)
         self.assertEqual(TestHealthCheckForcedUP._healthCheckCounter, before)
         self.assertEqual(self.getBackendStatus(), 'up')
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
 
 class TestHealthCheckForcedDown(HealthCheckTest):
     # this test suite uses a different responder port
@@ -94,6 +119,8 @@ class TestHealthCheckForcedDown(HealthCheckTest):
     _config_template = """
     setKey("%s")
     controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
     srv = newServer{address="127.0.0.1:%d"}
     srv:setDown()
     """
@@ -105,6 +132,7 @@ class TestHealthCheckForcedDown(HealthCheckTest):
         before = TestHealthCheckForcedDown._healthCheckCounter
         time.sleep(1.5)
         self.assertEqual(TestHealthCheckForcedDown._healthCheckCounter, before)
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
 
 class TestHealthCheckCustomName(HealthCheckTest):
     # this test suite uses a different responder port
@@ -112,10 +140,12 @@ class TestHealthCheckCustomName(HealthCheckTest):
     _testServerPort = 5383
 
     _healthCheckName = 'powerdns.com.'
-    _config_params = ['_consoleKeyB64', '_consolePort', '_testServerPort', '_healthCheckName']
+    _config_params = ['_consoleKeyB64', '_consolePort', '_webServerPort', '_webServerAPIKeyHashed', '_testServerPort', '_healthCheckName']
     _config_template = """
     setKey("%s")
     controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
     srv = newServer{address="127.0.0.1:%d", checkName='%s'}
     """
 
@@ -127,6 +157,7 @@ class TestHealthCheckCustomName(HealthCheckTest):
         time.sleep(1.5)
         self.assertGreater(TestHealthCheckCustomName._healthCheckCounter, before)
         self.assertEqual(self.getBackendStatus(), 'up')
+        self.assertEqual(self.getBackendMetric(0, 'healthCheckFailures'), 0)
 
 class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
     # this test suite uses a different responder port
@@ -137,6 +168,8 @@ class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
     _config_template = """
     setKey("%s")
     controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
     srv = newServer{address="127.0.0.1:%d", checkName='powerdns.com.'}
     """
 
@@ -148,6 +181,8 @@ class TestHealthCheckCustomNameNoAnswer(HealthCheckTest):
         time.sleep(1.5)
         self.assertEqual(TestHealthCheckCustomNameNoAnswer._healthCheckCounter, before)
         self.assertEqual(self.getBackendStatus(), 'down')
+        self.assertGreater(self.getBackendMetric(0, 'healthCheckFailures'), 0)
+        self.assertGreater(self.getBackendMetric(0, 'healthCheckFailuresTimeout'), 0)
 
 class TestHealthCheckCustomFunction(HealthCheckTest):
     # this test suite uses a different responder port
@@ -159,6 +194,8 @@ class TestHealthCheckCustomFunction(HealthCheckTest):
     _config_template = """
     setKey("%s")
     controlSocket("127.0.0.1:%d")
+    webserver("127.0.0.1:%s")
+    setWebserverConfig({apiKey="%s"})
 
     function myHealthCheckFunction(qname, qtype, qclass, dh)
       dh:setCD(true)