]> git.ipfire.org Git - thirdparty/pdns.git/commitdiff
dnsdist: Add steal, iowait and UDP errors metrics
authorRemi Gacogne <remi.gacogne@powerdns.com>
Mon, 3 Feb 2020 10:50:38 +0000 (11:50 +0100)
committerRemi Gacogne <remi.gacogne@powerdns.com>
Wed, 5 Feb 2020 09:40:41 +0000 (10:40 +0100)
This commit also moves the prometheus metrics code to a separate
header.

pdns/dnsdist-web.cc
pdns/dnsdist.cc
pdns/dnsdist.hh
pdns/dnsdistdist/Makefile.am
pdns/dnsdistdist/dnsdist-prometheus.hh [new file with mode: 0644]
pdns/dnsdistdist/docs/statistics.rst
regression-tests.dnsdist/test_API.py

index 421ce833a3a008489902ca2347f97715b4b6523d..c246a40f4b63e1c2d17a6b5c923ddfe22e585719 100644 (file)
@@ -21,6 +21,7 @@
  */
 #include "dnsdist.hh"
 #include "dnsdist-healthchecks.hh"
+#include "dnsdist-prometheus.hh"
 
 #include "sstuff.hh"
 #include "ext/json11/json11.hpp"
 bool g_apiReadWrite{false};
 WebserverConfig g_webserverConfig;
 std::string g_apiConfigDirectory;
+static const MetricDefinitionStorage s_metricDefinitions;
+
+const std::map<std::string, MetricDefinition> MetricDefinitionStorage::metrics{
+  { "responses",              MetricDefinition(PrometheusMetricType::counter, "Number of responses received from backends") },
+  { "servfail-responses",     MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received from backends") },
+  { "queries",                MetricDefinition(PrometheusMetricType::counter, "Number of received queries")},
+  { "frontend-nxdomain",      MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers sent to clients")},
+  { "frontend-servfail",      MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers sent to clients")},
+  { "frontend-noerror",       MetricDefinition(PrometheusMetricType::counter, "Number of NoError answers sent to clients")},
+  { "acl-drops",              MetricDefinition(PrometheusMetricType::counter, "Number of packets dropped because of the ACL")},
+  { "rule-drop",              MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a rule")},
+  { "rule-nxdomain",          MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers returned because of a rule")},
+  { "rule-refused",           MetricDefinition(PrometheusMetricType::counter, "Number of Refused answers returned because of a rule")},
+  { "rule-servfail",          MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received because of a rule")},
+  { "self-answered",          MetricDefinition(PrometheusMetricType::counter, "Number of self-answered responses")},
+  { "downstream-timeouts",    MetricDefinition(PrometheusMetricType::counter, "Number of queries not answered in time by a backend")},
+  { "downstream-send-errors", MetricDefinition(PrometheusMetricType::counter, "Number of errors when sending a query to a backend")},
+  { "trunc-failures",         MetricDefinition(PrometheusMetricType::counter, "Number of errors encountered while truncating an answer")},
+  { "no-policy",              MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because no server was available")},
+  { "latency0-1",             MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in less than 1ms")},
+  { "latency1-10",            MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 1-10 ms")},
+  { "latency10-50",           MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 10-50 ms")},
+  { "latency50-100",          MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 50-100 ms")},
+  { "latency100-1000",        MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 100-1000 ms")},
+  { "latency-slow",           MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in more than 1 second")},
+  { "latency-avg100",         MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 100 packets")},
+  { "latency-avg1000",        MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 1000 packets")},
+  { "latency-avg10000",       MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 10000 packets")},
+  { "latency-avg1000000",     MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 1000000 packets")},
+  { "uptime",                 MetricDefinition(PrometheusMetricType::gauge,   "Uptime of the dnsdist process in seconds")},
+  { "real-memory-usage",      MetricDefinition(PrometheusMetricType::gauge,   "Current memory usage in bytes")},
+  { "noncompliant-queries",   MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped as non-compliant")},
+  { "noncompliant-responses", MetricDefinition(PrometheusMetricType::counter, "Number of answers from a backend dropped as non-compliant")},
+  { "rdqueries",              MetricDefinition(PrometheusMetricType::counter, "Number of received queries with the recursion desired bit set")},
+  { "empty-queries",          MetricDefinition(PrometheusMetricType::counter, "Number of empty queries received from clients")},
+  { "cache-hits",             MetricDefinition(PrometheusMetricType::counter, "Number of times an answer was retrieved from cache")},
+  { "cache-misses",           MetricDefinition(PrometheusMetricType::counter, "Number of times an answer not found in the cache")},
+  { "cpu-iowait",             MetricDefinition(PrometheusMetricType::counter, "Time waiting for I/O to complete by the whole system")},
+  { "cpu-user-msec",          MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the user state")},
+  { "cpu-steal",              MetricDefinition(PrometheusMetricType::counter, "Stolen time, which is the time spent by the whole system in other operating systems when running in a virtualized environment")},
+  { "cpu-sys-msec",           MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the system state")},
+  { "fd-usage",               MetricDefinition(PrometheusMetricType::gauge,   "Number of currently used file descriptors")},
+  { "dyn-blocked",            MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")},
+  { "dyn-block-nmg-size",     MetricDefinition(PrometheusMetricType::gauge,   "Number of dynamic blocks entries") },
+  { "security-status",        MetricDefinition(PrometheusMetricType::gauge,   "Security status of this software. 0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") },
+  { "udp-in-errors",          MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp InErrors") },
+  { "udp-noport-errors",      MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp NoPorts") },
+  { "udp-recvbuf-errors",     MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp RcvbufErrors") },
+  { "udp-sndbuf-errors",      MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp SndbufErrors") },
+};
 
 static bool apiWriteConfigFile(const string& filebasename, const string& content)
 {
@@ -421,12 +472,12 @@ static void connectionThread(int sock, ComboAddress remote)
           }
 
           MetricDefinition metricDetails;
-          if (!g_metricDefinitions.getMetricDetails(metricName, metricDetails)) {
+          if (!s_metricDefinitions.getMetricDetails(metricName, metricDetails)) {
               vinfolog("Do not have metric details for %s", metricName);
               continue;
           }
 
-          std::string prometheusTypeName = g_metricDefinitions.getPrometheusStringMetricType(metricDetails.prometheusType);
+          std::string prometheusTypeName = s_metricDefinitions.getPrometheusStringMetricType(metricDetails.prometheusType);
 
           if (prometheusTypeName == "") {
               vinfolog("Unknown Prometheus type for %s", metricName);
index 96af451cf7f52905ac9b89ef937312c923e6f37f..a8c74d90b45f23a9d9aea1e376f8e48ee035c943 100644 (file)
@@ -81,7 +81,6 @@ using std::thread;
 bool g_verbose;
 
 struct DNSDistStats g_stats;
-MetricDefinitionStorage g_metricDefinitions;
 
 uint16_t g_maxOutstanding{std::numeric_limits<uint16_t>::max()};
 uint32_t g_staleCacheEntriesTTL{0};
index 9c65edc8740b24c9abbb6118e483f9316d4df376..496e2dd6df8e9ba3077c8ed96a4640a09cee19cf 100644 (file)
@@ -290,14 +290,20 @@ struct DNSDistStats
     {"uptime", uptimeOfProcess},
     {"real-memory-usage", getRealMemoryUsage},
     {"special-memory-usage", getSpecialMemoryUsage},
+    {"udp-in-errors", boost::bind(udpErrorStats, "udp-in-errors")},
+    {"udp-noport-errors", boost::bind(udpErrorStats, "udp-noport-errors")},
+    {"udp-recvbuf-errors", boost::bind(udpErrorStats, "udp-recvbuf-errors")},
+    {"udp-sndbuf-errors", boost::bind(udpErrorStats, "udp-sndbuf-errors")},
     {"noncompliant-queries", &nonCompliantQueries},
     {"noncompliant-responses", &nonCompliantResponses},
     {"rdqueries", &rdQueries},
     {"empty-queries", &emptyQueries},
     {"cache-hits", &cacheHits},
     {"cache-misses", &cacheMisses},
-    {"cpu-user-msec", getCPUTimeUser},
+    {"cpu-iowait", getCPUIOWait},
+    {"cpu-steal", getCPUSteal},
     {"cpu-sys-msec", getCPUTimeSystem},
+    {"cpu-user-msec", getCPUTimeUser},
     {"fd-usage", getOpenFileDescriptors},
     {"dyn-blocked", &dynBlocked},
     {"dyn-block-nmg-size", [](const std::string&) { return g_dynblockNMG.getLocal()->size(); }},
@@ -308,98 +314,6 @@ struct DNSDistStats
   };
 };
 
-// Metric types for Prometheus
-enum class PrometheusMetricType: int {
-    counter = 1,
-    gauge = 2
-};
-
-// Keeps additional information about metrics
-struct MetricDefinition {
-  MetricDefinition(PrometheusMetricType _prometheusType, const std::string& _description): description(_description), prometheusType(_prometheusType) {
-  }
-  MetricDefinition() = default;
-
-  // Metric description
-  std::string description;
-  // Metric type for Prometheus
-  PrometheusMetricType prometheusType;
-};
-
-struct MetricDefinitionStorage {
-  // Return metric definition by name
-  bool getMetricDetails(std::string metricName, MetricDefinition& metric) {
-  auto metricDetailsIter = metrics.find(metricName);
-
-  if (metricDetailsIter == metrics.end()) {
-    return false;
-  }
-
-  metric = metricDetailsIter->second;
-    return true;
-  };
-
-  // Return string representation of Prometheus metric type
-  std::string getPrometheusStringMetricType(PrometheusMetricType metricType) {
-    switch (metricType) { 
-      case PrometheusMetricType::counter:
-        return "counter";
-        break;
-      case PrometheusMetricType::gauge:
-        return "gauge";
-        break;
-      default:
-        return "";
-        break;
-    }
-  };
-
-  std::map<std::string, MetricDefinition> metrics = {
-    { "responses",              MetricDefinition(PrometheusMetricType::counter, "Number of responses received from backends") },
-    { "servfail-responses",     MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received from backends") },
-    { "queries",                MetricDefinition(PrometheusMetricType::counter, "Number of received queries")},
-    { "frontend-nxdomain",      MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers sent to clients")},
-    { "frontend-servfail",      MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers sent to clients")},
-    { "frontend-noerror",       MetricDefinition(PrometheusMetricType::counter, "Number of NoError answers sent to clients")},
-    { "acl-drops",              MetricDefinition(PrometheusMetricType::counter, "Number of packets dropped because of the ACL")},
-    { "rule-drop",              MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a rule")},
-    { "rule-nxdomain",          MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers returned because of a rule")},
-    { "rule-refused",           MetricDefinition(PrometheusMetricType::counter, "Number of Refused answers returned because of a rule")},
-    { "rule-servfail",          MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received because of a rule")},
-    { "self-answered",          MetricDefinition(PrometheusMetricType::counter, "Number of self-answered responses")},
-    { "downstream-timeouts",    MetricDefinition(PrometheusMetricType::counter, "Number of queries not answered in time by a backend")},
-    { "downstream-send-errors", MetricDefinition(PrometheusMetricType::counter, "Number of errors when sending a query to a backend")},
-    { "trunc-failures",         MetricDefinition(PrometheusMetricType::counter, "Number of errors encountered while truncating an answer")},
-    { "no-policy",              MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because no server was available")},
-    { "latency0-1",             MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in less than 1ms")},
-    { "latency1-10",            MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 1-10 ms")},
-    { "latency10-50",           MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 10-50 ms")},
-    { "latency50-100",          MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 50-100 ms")},
-    { "latency100-1000",        MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 100-1000 ms")},
-    { "latency-slow",           MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in more than 1 second")},
-    { "latency-avg100",         MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 100 packets")},
-    { "latency-avg1000",        MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 1000 packets")},
-    { "latency-avg10000",       MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 10000 packets")},
-    { "latency-avg1000000",     MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 1000000 packets")},
-    { "uptime",                 MetricDefinition(PrometheusMetricType::gauge,   "Uptime of the dnsdist process in seconds")},
-    { "real-memory-usage",      MetricDefinition(PrometheusMetricType::gauge,   "Current memory usage in bytes")},
-    { "noncompliant-queries",   MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped as non-compliant")},
-    { "noncompliant-responses", MetricDefinition(PrometheusMetricType::counter, "Number of answers from a backend dropped as non-compliant")},
-    { "rdqueries",              MetricDefinition(PrometheusMetricType::counter, "Number of received queries with the recursion desired bit set")},
-    { "empty-queries",          MetricDefinition(PrometheusMetricType::counter, "Number of empty queries received from clients")},
-    { "cache-hits",             MetricDefinition(PrometheusMetricType::counter, "Number of times an answer was retrieved from cache")},
-    { "cache-misses",           MetricDefinition(PrometheusMetricType::counter, "Number of times an answer not found in the cache")},
-    { "cpu-user-msec",          MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the user state")},
-    { "cpu-sys-msec",           MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the system state")},
-    { "fd-usage",               MetricDefinition(PrometheusMetricType::gauge,   "Number of currently used file descriptors")},
-    { "dyn-blocked",            MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")},
-    { "dyn-block-nmg-size",     MetricDefinition(PrometheusMetricType::gauge,   "Number of dynamic blocks entries") },
-    { "security-status",        MetricDefinition(PrometheusMetricType::gauge,   "Security status of this software. 0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") },
-  };
-};
-
-extern MetricDefinitionStorage g_metricDefinitions;
 extern struct DNSDistStats g_stats;
 void doLatencyStats(double udiff);
 
index a4982f27bd79c7ed39974759074521c60cf0a7c1..95fdacd5d83ef604d77d010629049e6c8c41b8c1 100644 (file)
@@ -139,6 +139,7 @@ dnsdist_SOURCES = \
        dnsdist-lua-inspection-ffi.cc dnsdist-lua-inspection-ffi.hh \
        dnsdist-lua-rules.cc \
        dnsdist-lua-vars.cc \
+       dnsdist-prometheus.hh \
        dnsdist-protobuf.cc dnsdist-protobuf.hh \
        dnsdist-rings.cc dnsdist-rings.hh \
        dnsdist-rules.hh \
diff --git a/pdns/dnsdistdist/dnsdist-prometheus.hh b/pdns/dnsdistdist/dnsdist-prometheus.hh
new file mode 100644 (file)
index 0000000..a64b904
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * This file is part of PowerDNS or dnsdist.
+ * Copyright -- PowerDNS.COM B.V. and its contributors
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * In addition, for the avoidance of any doubt, permission is granted to
+ * link this program with OpenSSL and to (re)distribute the binaries
+ * produced as the result of such linking.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#pragma once
+
+// Metric types for Prometheus
+enum class PrometheusMetricType: int {
+    counter = 1,
+    gauge = 2
+};
+
+// Keeps additional information about metrics
+struct MetricDefinition {
+  MetricDefinition(PrometheusMetricType _prometheusType, const std::string& _description): description(_description), prometheusType(_prometheusType) {
+  }
+
+  MetricDefinition() = default;
+
+  // Metric description
+  std::string description;
+  // Metric type for Prometheus
+  PrometheusMetricType prometheusType;
+};
+
+struct MetricDefinitionStorage {
+  // Return metric definition by name
+  bool getMetricDetails(const std::string& metricName, MetricDefinition& metric) const {
+    const auto& metricDetailsIter = metrics.find(metricName);
+
+    if (metricDetailsIter == metrics.end()) {
+      return false;
+    }
+
+    metric = metricDetailsIter->second;
+    return true;
+  };
+
+  // Return string representation of Prometheus metric type
+  std::string getPrometheusStringMetricType(PrometheusMetricType metricType) const {
+    switch (metricType) {
+      case PrometheusMetricType::counter:
+        return "counter";
+        break;
+      case PrometheusMetricType::gauge:
+        return "gauge";
+        break;
+      default:
+        return "";
+        break;
+    }
+  };
+
+  static const std::map<std::string, MetricDefinition> metrics;
+};
index ee806bb425c2390d558452ecf04011d2dcfbe9a4..5f298826ff98744a5262736c04e804a18d23b0d1 100644 (file)
@@ -34,6 +34,18 @@ cache-misses
 ------------
 Number of times an answer was not found in the :doc:`packet cache <guides/cache>`. Only counted if a packet cache was setup for the selected pool.
 
+cpu-iowait
+----------
+.. versionadded:: 1.5.0
+
+Time spent waiting for I/O to complete by the whole system.
+
+cpu-steal
+---------
+.. versionadded:: 1.5.0
+
+Stolen time, which is the time spent by the whole system in other operating systems when running in a virtualized environment.
+
 cpu-sys-msec
 ------------
 Milliseconds spent by :program:`dnsdist` in the "system" state.
@@ -201,7 +213,30 @@ trunc-failures
 --------------
 Number of errors encountered while truncating an answer.
 
+udp-in-errors
+-------------
+.. versionadded:: 1.5.0
+
+From /proc/net/snmp InErrors.
+
+udp-noport-errors
+-----------------
+.. versionadded:: 1.5.0
+
+From /proc/net/snmp NoPorts.
+
+udp-recvbuf-errors
+------------------
+.. versionadded:: 1.5.0
+
+From /proc/net/snmp RcvbufErrors.
+
+udp-sndbuf-errors
+-----------------
+.. versionadded:: 1.5.0
+
+From /proc/net/snmp SndbufErrors.
+
 uptime
 ------
 Uptime of the dnsdist process, in seconds.
-
index 178e79083d8652b273520c509d1018122c0fa25d..0928c28c168a489262848d00376cf0283c571c3e 100644 (file)
@@ -234,8 +234,9 @@ class TestAPIBasics(DNSDistTest):
                     'latency-slow', 'latency-sum', 'latency-count', 'latency-avg100', 'latency-avg1000',
                     'latency-avg10000', 'latency-avg1000000', 'uptime', 'real-memory-usage', 'noncompliant-queries',
                     'noncompliant-responses', 'rdqueries', 'empty-queries', 'cache-hits',
-                    'cache-misses', 'cpu-user-msec', 'cpu-sys-msec', 'fd-usage', 'dyn-blocked',
-                    'dyn-block-nmg-size', 'rule-servfail', 'security-status']
+                    'cache-misses', 'cpu-iowait', 'cpu-steal', 'cpu-sys-msec', 'cpu-user-msec', 'fd-usage', 'dyn-blocked',
+                    'dyn-block-nmg-size', 'rule-servfail', 'security-status',
+                    'udp-in-errors', 'udp-noport-errors', 'udp-recvbuf-errors', 'udp-sndbuf-errors']
 
         for key in expected:
             self.assertIn(key, values)