From 0d394f3528e77e196daa07711655c5ff72f7e77c Mon Sep 17 00:00:00 2001 From: Remi Gacogne Date: Mon, 3 Feb 2020 11:50:38 +0100 Subject: [PATCH] dnsdist: Add steal, iowait and UDP errors metrics This commit also moves the prometheus metrics code to a separate header. --- pdns/dnsdist-web.cc | 55 +++++++++++++- pdns/dnsdist.cc | 1 - pdns/dnsdist.hh | 100 ++----------------------- pdns/dnsdistdist/Makefile.am | 1 + pdns/dnsdistdist/dnsdist-prometheus.hh | 72 ++++++++++++++++++ pdns/dnsdistdist/docs/statistics.rst | 37 ++++++++- regression-tests.dnsdist/test_API.py | 5 +- 7 files changed, 172 insertions(+), 99 deletions(-) create mode 100644 pdns/dnsdistdist/dnsdist-prometheus.hh diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index 421ce833a3..c246a40f4b 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -21,6 +21,7 @@ */ #include "dnsdist.hh" #include "dnsdist-healthchecks.hh" +#include "dnsdist-prometheus.hh" #include "sstuff.hh" #include "ext/json11/json11.hpp" @@ -42,6 +43,56 @@ bool g_apiReadWrite{false}; WebserverConfig g_webserverConfig; std::string g_apiConfigDirectory; +static const MetricDefinitionStorage s_metricDefinitions; + +const std::map<std::string, MetricDefinition> MetricDefinitionStorage::metrics{ + { "responses", MetricDefinition(PrometheusMetricType::counter, "Number of responses received from backends") }, + { "servfail-responses", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received from backends") }, + { "queries", MetricDefinition(PrometheusMetricType::counter, "Number of received queries")}, + { "frontend-nxdomain", MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers sent to clients")}, + { "frontend-servfail", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers sent to clients")}, + { "frontend-noerror", MetricDefinition(PrometheusMetricType::counter, "Number of NoError answers sent to clients")}, + { "acl-drops", MetricDefinition(PrometheusMetricType::counter, "Number of 
packets dropped because of the ACL")}, + { "rule-drop", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a rule")}, + { "rule-nxdomain", MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers returned because of a rule")}, + { "rule-refused", MetricDefinition(PrometheusMetricType::counter, "Number of Refused answers returned because of a rule")}, + { "rule-servfail", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received because of a rule")}, + { "self-answered", MetricDefinition(PrometheusMetricType::counter, "Number of self-answered responses")}, + { "downstream-timeouts", MetricDefinition(PrometheusMetricType::counter, "Number of queries not answered in time by a backend")}, + { "downstream-send-errors", MetricDefinition(PrometheusMetricType::counter, "Number of errors when sending a query to a backend")}, + { "trunc-failures", MetricDefinition(PrometheusMetricType::counter, "Number of errors encountered while truncating an answer")}, + { "no-policy", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because no server was available")}, + { "latency0-1", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in less than 1ms")}, + { "latency1-10", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 1-10 ms")}, + { "latency10-50", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 10-50 ms")}, + { "latency50-100", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 50-100 ms")}, + { "latency100-1000", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 100-1000 ms")}, + { "latency-slow", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in more than 1 second")}, + { "latency-avg100", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 100 packets")}, + 
{ "latency-avg1000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 1000 packets")}, + { "latency-avg10000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 10000 packets")}, + { "latency-avg1000000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 1000000 packets")}, + { "uptime", MetricDefinition(PrometheusMetricType::gauge, "Uptime of the dnsdist process in seconds")}, + { "real-memory-usage", MetricDefinition(PrometheusMetricType::gauge, "Current memory usage in bytes")}, + { "noncompliant-queries", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped as non-compliant")}, + { "noncompliant-responses", MetricDefinition(PrometheusMetricType::counter, "Number of answers from a backend dropped as non-compliant")}, + { "rdqueries", MetricDefinition(PrometheusMetricType::counter, "Number of received queries with the recursion desired bit set")}, + { "empty-queries", MetricDefinition(PrometheusMetricType::counter, "Number of empty queries received from clients")}, + { "cache-hits", MetricDefinition(PrometheusMetricType::counter, "Number of times an answer was retrieved from cache")}, + { "cache-misses", MetricDefinition(PrometheusMetricType::counter, "Number of times an answer not found in the cache")}, + { "cpu-iowait", MetricDefinition(PrometheusMetricType::counter, "Time waiting for I/O to complete by the whole system")}, + { "cpu-user-msec", MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the user state")}, + { "cpu-steal", MetricDefinition(PrometheusMetricType::counter, "Stolen time, which is the time spent by the whole system in other operating systems when running in a virtualized environment")}, + { "cpu-sys-msec", MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the system state")}, + { "fd-usage", 
MetricDefinition(PrometheusMetricType::gauge, "Number of currently used file descriptors")}, + { "dyn-blocked", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")}, + { "dyn-block-nmg-size", MetricDefinition(PrometheusMetricType::gauge, "Number of dynamic blocks entries") }, + { "security-status", MetricDefinition(PrometheusMetricType::gauge, "Security status of this software. 0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") }, + { "udp-in-errors", MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp InErrors") }, + { "udp-noport-errors", MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp NoPorts") }, + { "udp-recvbuf-errors", MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp RcvbufErrors") }, + { "udp-sndbuf-errors", MetricDefinition(PrometheusMetricType::counter, "From /proc/net/snmp SndbufErrors") }, +}; static bool apiWriteConfigFile(const string& filebasename, const string& content) { @@ -421,12 +472,12 @@ static void connectionThread(int sock, ComboAddress remote) } MetricDefinition metricDetails; - if (!g_metricDefinitions.getMetricDetails(metricName, metricDetails)) { + if (!s_metricDefinitions.getMetricDetails(metricName, metricDetails)) { vinfolog("Do not have metric details for %s", metricName); continue; } - std::string prometheusTypeName = g_metricDefinitions.getPrometheusStringMetricType(metricDetails.prometheusType); + std::string prometheusTypeName = s_metricDefinitions.getPrometheusStringMetricType(metricDetails.prometheusType); if (prometheusTypeName == "") { vinfolog("Unknown Prometheus type for %s", metricName); diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index 96af451cf7..a8c74d90b4 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -81,7 +81,6 @@ using std::thread; bool g_verbose; struct DNSDistStats g_stats; -MetricDefinitionStorage g_metricDefinitions; uint16_t g_maxOutstanding{std::numeric_limits<uint16_t>::max()}; uint32_t 
g_staleCacheEntriesTTL{0}; diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index 9c65edc874..496e2dd6df 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -290,14 +290,20 @@ struct DNSDistStats {"uptime", uptimeOfProcess}, {"real-memory-usage", getRealMemoryUsage}, {"special-memory-usage", getSpecialMemoryUsage}, + {"udp-in-errors", boost::bind(udpErrorStats, "udp-in-errors")}, + {"udp-noport-errors", boost::bind(udpErrorStats, "udp-noport-errors")}, + {"udp-recvbuf-errors", boost::bind(udpErrorStats, "udp-recvbuf-errors")}, + {"udp-sndbuf-errors", boost::bind(udpErrorStats, "udp-sndbuf-errors")}, {"noncompliant-queries", &nonCompliantQueries}, {"noncompliant-responses", &nonCompliantResponses}, {"rdqueries", &rdQueries}, {"empty-queries", &emptyQueries}, {"cache-hits", &cacheHits}, {"cache-misses", &cacheMisses}, - {"cpu-user-msec", getCPUTimeUser}, + {"cpu-iowait", getCPUIOWait}, + {"cpu-steal", getCPUSteal}, {"cpu-sys-msec", getCPUTimeSystem}, + {"cpu-user-msec", getCPUTimeUser}, {"fd-usage", getOpenFileDescriptors}, {"dyn-blocked", &dynBlocked}, {"dyn-block-nmg-size", [](const std::string&) { return g_dynblockNMG.getLocal()->size(); }}, @@ -308,98 +314,6 @@ struct DNSDistStats }; }; -// Metric types for Prometheus -enum class PrometheusMetricType: int { - counter = 1, - gauge = 2 -}; - -// Keeps additional information about metrics -struct MetricDefinition { - MetricDefinition(PrometheusMetricType _prometheusType, const std::string& _description): description(_description), prometheusType(_prometheusType) { - } - - MetricDefinition() = default; - - // Metric description - std::string description; - // Metric type for Prometheus - PrometheusMetricType prometheusType; -}; - -struct MetricDefinitionStorage { - // Return metric definition by name - bool getMetricDetails(std::string metricName, MetricDefinition& metric) { - auto metricDetailsIter = metrics.find(metricName); - - if (metricDetailsIter == metrics.end()) { - return false; - } - - metric = 
metricDetailsIter->second; - return true; - }; - - // Return string representation of Prometheus metric type - std::string getPrometheusStringMetricType(PrometheusMetricType metricType) { - switch (metricType) { - case PrometheusMetricType::counter: - return "counter"; - break; - case PrometheusMetricType::gauge: - return "gauge"; - break; - default: - return ""; - break; - } - }; - - std::map<std::string, MetricDefinition> metrics = { - { "responses", MetricDefinition(PrometheusMetricType::counter, "Number of responses received from backends") }, - { "servfail-responses", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received from backends") }, - { "queries", MetricDefinition(PrometheusMetricType::counter, "Number of received queries")}, - { "frontend-nxdomain", MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers sent to clients")}, - { "frontend-servfail", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers sent to clients")}, - { "frontend-noerror", MetricDefinition(PrometheusMetricType::counter, "Number of NoError answers sent to clients")}, - { "acl-drops", MetricDefinition(PrometheusMetricType::counter, "Number of packets dropped because of the ACL")}, - { "rule-drop", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a rule")}, - { "rule-nxdomain", MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers returned because of a rule")}, - { "rule-refused", MetricDefinition(PrometheusMetricType::counter, "Number of Refused answers returned because of a rule")}, - { "rule-servfail", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received because of a rule")}, - { "self-answered", MetricDefinition(PrometheusMetricType::counter, "Number of self-answered responses")}, - { "downstream-timeouts", MetricDefinition(PrometheusMetricType::counter, "Number of queries not answered in time by a backend")}, - { "downstream-send-errors", 
MetricDefinition(PrometheusMetricType::counter, "Number of errors when sending a query to a backend")}, - { "trunc-failures", MetricDefinition(PrometheusMetricType::counter, "Number of errors encountered while truncating an answer")}, - { "no-policy", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because no server was available")}, - { "latency0-1", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in less than 1ms")}, - { "latency1-10", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 1-10 ms")}, - { "latency10-50", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 10-50 ms")}, - { "latency50-100", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 50-100 ms")}, - { "latency100-1000", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 100-1000 ms")}, - { "latency-slow", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in more than 1 second")}, - { "latency-avg100", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 100 packets")}, - { "latency-avg1000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 1000 packets")}, - { "latency-avg10000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 10000 packets")}, - { "latency-avg1000000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 1000000 packets")}, - { "uptime", MetricDefinition(PrometheusMetricType::gauge, "Uptime of the dnsdist process in seconds")}, - { "real-memory-usage", MetricDefinition(PrometheusMetricType::gauge, "Current memory usage in bytes")}, - { "noncompliant-queries", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped as non-compliant")}, - { "noncompliant-responses", 
MetricDefinition(PrometheusMetricType::counter, "Number of answers from a backend dropped as non-compliant")}, - { "rdqueries", MetricDefinition(PrometheusMetricType::counter, "Number of received queries with the recursion desired bit set")}, - { "empty-queries", MetricDefinition(PrometheusMetricType::counter, "Number of empty queries received from clients")}, - { "cache-hits", MetricDefinition(PrometheusMetricType::counter, "Number of times an answer was retrieved from cache")}, - { "cache-misses", MetricDefinition(PrometheusMetricType::counter, "Number of times an answer not found in the cache")}, - { "cpu-user-msec", MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the user state")}, - { "cpu-sys-msec", MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the system state")}, - { "fd-usage", MetricDefinition(PrometheusMetricType::gauge, "Number of currently used file descriptors")}, - { "dyn-blocked", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")}, - { "dyn-block-nmg-size", MetricDefinition(PrometheusMetricType::gauge, "Number of dynamic blocks entries") }, - { "security-status", MetricDefinition(PrometheusMetricType::gauge, "Security status of this software. 
0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") }, - }; -}; - -extern MetricDefinitionStorage g_metricDefinitions; extern struct DNSDistStats g_stats; void doLatencyStats(double udiff); diff --git a/pdns/dnsdistdist/Makefile.am b/pdns/dnsdistdist/Makefile.am index a4982f27bd..95fdacd5d8 100644 --- a/pdns/dnsdistdist/Makefile.am +++ b/pdns/dnsdistdist/Makefile.am @@ -139,6 +139,7 @@ dnsdist_SOURCES = \ dnsdist-lua-inspection-ffi.cc dnsdist-lua-inspection-ffi.hh \ dnsdist-lua-rules.cc \ dnsdist-lua-vars.cc \ + dnsdist-prometheus.hh \ dnsdist-protobuf.cc dnsdist-protobuf.hh \ dnsdist-rings.cc dnsdist-rings.hh \ dnsdist-rules.hh \ diff --git a/pdns/dnsdistdist/dnsdist-prometheus.hh b/pdns/dnsdistdist/dnsdist-prometheus.hh new file mode 100644 index 0000000000..a64b90495e --- /dev/null +++ b/pdns/dnsdistdist/dnsdist-prometheus.hh @@ -0,0 +1,72 @@ +/* + * This file is part of PowerDNS or dnsdist. + * Copyright -- PowerDNS.COM B.V. and its contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In addition, for the avoidance of any doubt, permission is granted to + * link this program with OpenSSL and to (re)distribute the binaries + * produced as the result of such linking. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#pragma once + +// Metric types for Prometheus +enum class PrometheusMetricType: int { + counter = 1, + gauge = 2 +}; + +// Keeps additional information about metrics +struct MetricDefinition { + MetricDefinition(PrometheusMetricType _prometheusType, const std::string& _description): description(_description), prometheusType(_prometheusType) { + } + + MetricDefinition() = default; + + // Metric description + std::string description; + // Metric type for Prometheus + PrometheusMetricType prometheusType; +}; + +struct MetricDefinitionStorage { + // Return metric definition by name + bool getMetricDetails(const std::string& metricName, MetricDefinition& metric) const { + const auto& metricDetailsIter = metrics.find(metricName); + + if (metricDetailsIter == metrics.end()) { + return false; + } + + metric = metricDetailsIter->second; + return true; + }; + + // Return string representation of Prometheus metric type + std::string getPrometheusStringMetricType(PrometheusMetricType metricType) const { + switch (metricType) { + case PrometheusMetricType::counter: + return "counter"; + break; + case PrometheusMetricType::gauge: + return "gauge"; + break; + default: + return ""; + break; + } + }; + + static const std::map<std::string, MetricDefinition> metrics; +}; diff --git a/pdns/dnsdistdist/docs/statistics.rst b/pdns/dnsdistdist/docs/statistics.rst index ee806bb425..5f298826ff 100644 --- a/pdns/dnsdistdist/docs/statistics.rst +++ b/pdns/dnsdistdist/docs/statistics.rst @@ -34,6 +34,18 @@ cache-misses ------------ Number of times an answer was not found in the :doc:`packet cache <guides/cache>`. Only counted if a packet cache was setup for the selected pool. +cpu-iowait +---------- +.. versionadded:: 1.5.0 + +Time spent waiting for I/O to complete by the whole system. + +cpu-steal +--------- +.. versionadded:: 1.5.0 + +Stolen time, which is the time spent by the whole system in other operating systems when running in a virtualized environment. 
+ cpu-sys-msec ------------ Milliseconds spent by :program:`dnsdist` in the "system" state. @@ -201,7 +213,30 @@ trunc-failures -------------- Number of errors encountered while truncating an answer. +udp-in-errors +------------- +.. versionadded:: 1.5.0 + +From /proc/net/snmp InErrors. + +udp-noport-errors +----------------- +.. versionadded:: 1.5.0 + +From /proc/net/snmp NoPorts. + +udp-recvbuf-errors +------------------ +.. versionadded:: 1.5.0 + +From /proc/net/snmp RcvbufErrors. + +udp-sndbuf-errors +----------------- +.. versionadded:: 1.5.0 + +From /proc/net/snmp SndbufErrors. + uptime ------ Uptime of the dnsdist process, in seconds. - diff --git a/regression-tests.dnsdist/test_API.py b/regression-tests.dnsdist/test_API.py index 178e79083d..0928c28c16 100644 --- a/regression-tests.dnsdist/test_API.py +++ b/regression-tests.dnsdist/test_API.py @@ -234,8 +234,9 @@ class TestAPIBasics(DNSDistTest): 'latency-slow', 'latency-sum', 'latency-count', 'latency-avg100', 'latency-avg1000', 'latency-avg10000', 'latency-avg1000000', 'uptime', 'real-memory-usage', 'noncompliant-queries', 'noncompliant-responses', 'rdqueries', 'empty-queries', 'cache-hits', - 'cache-misses', 'cpu-user-msec', 'cpu-sys-msec', 'fd-usage', 'dyn-blocked', - 'dyn-block-nmg-size', 'rule-servfail', 'security-status'] + 'cache-misses', 'cpu-iowait', 'cpu-steal', 'cpu-sys-msec', 'cpu-user-msec', 'fd-usage', 'dyn-blocked', + 'dyn-block-nmg-size', 'rule-servfail', 'security-status', + 'udp-in-errors', 'udp-noport-errors', 'udp-recvbuf-errors', 'udp-sndbuf-errors'] for key in expected: self.assertIn(key, values) -- 2.39.2