From b245dad0964fb4b86794a5b60413820e646936a2 Mon Sep 17 00:00:00 2001 From: Vasek Sraier Date: Tue, 8 Feb 2022 11:58:14 +0100 Subject: [PATCH] manager: monitoring: remove usage of protected library methods, finished proper latency metric support --- manager/.gitignore | 2 + manager/knot_resolver_manager/statistics.py | 424 +++++++++++--------- manager/scripts/run | 12 + 3 files changed, 249 insertions(+), 189 deletions(-) diff --git a/manager/.gitignore b/manager/.gitignore index 189d951a8..f5883086e 100644 --- a/manager/.gitignore +++ b/manager/.gitignore @@ -13,3 +13,5 @@ dist/ .podman-cache/ docs/_build/* *junit.xml +.build_kresd/ +.install_kresd/ diff --git a/manager/knot_resolver_manager/statistics.py b/manager/knot_resolver_manager/statistics.py index 0675be25b..29368d823 100644 --- a/manager/knot_resolver_manager/statistics.py +++ b/manager/knot_resolver_manager/statistics.py @@ -1,9 +1,16 @@ import asyncio import json import logging -from typing import Any, Awaitable, Callable, Dict, List, Tuple, TypeVar, Union - -from prometheus_client import Counter, Gauge, Histogram, exposition # type: ignore +from typing import Any, Awaitable, Callable, Dict, Generator, List, Optional, Tuple, TypeVar + +from prometheus_client import Histogram, exposition # type: ignore +from prometheus_client.core import ( # type: ignore + REGISTRY, + CounterMetricFamily, + GaugeMetricFamily, + HistogramMetricFamily, + Metric, +) from knot_resolver_manager.datamodel.config_schema import KresConfig from knot_resolver_manager.kres_id import KresID @@ -11,122 +18,6 @@ from knot_resolver_manager.kresd_controller.interface import Subprocess logger = logging.getLogger(__name__) -RESOLVER_RESPONSE_LATENCY = Histogram( - "resolver_response_latency", - "Time it takes to respond to queries in seconds", - buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")], - labelnames=["instance_id"], -) -RESOLVER_REQUEST_TOTAL = Counter( - "resolver_request_total", - "total number of DNS requests (including internal client requests)", - labelnames=["instance_id"], -) -RESOLVER_REQUEST_INTERNAL = Counter( - "resolver_request_internal", - "number of internal requests generated by Knot Resolver (e.g. 
DNSSEC trust anchor updates)", - labelnames=["instance_id"], -) -RESOLVER_REQUEST_UDP = Counter( - "resolver_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"] -) -RESOLVER_REQUEST_TCP = Counter( - "resolver_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"] -) -RESOLVER_REQUEST_DOT = Counter( - "resolver_request_dot", - "number of external requests received over DNS-over-TLS (RFC 7858)", - labelnames=["instance_id"], -) -RESOLVER_REQUEST_DOH = Counter( - "resolver_request_doh", - "number of external requests received over DNS-over-HTTP (RFC 8484)", - labelnames=["instance_id"], -) -RESOLVER_REQUEST_XDP = Counter( - "resolver_request_xdp", - "number of external requests received over plain UDP via an AF_XDP socket", - labelnames=["instance_id"], -) -RESOLVER_ANSWER_TOTAL = Counter("resolver_answer_total", "total number of answered queries", labelnames=["instance_id"]) -RESOLVER_ANSWER_CACHED = Counter( - "resolver_answer_cached", "number of queries answered from cache", labelnames=["instance_id"] -) - -RESOLVER_ANSWER_RCODE_NOERROR = Counter( - "resolver_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"] -) -RESOLVER_ANSWER_RCODE_NODATA = Counter( - "resolver_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"] -) -RESOLVER_ANSWER_RCODE_NXDOMAIN = Counter( - "resolver_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"] -) -RESOLVER_ANSWER_RCODE_SERVFAIL = Counter( - "resolver_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"] -) - -RESOLVER_ANSWER_FLAG_AA = Counter( - "resolver_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"] -) -RESOLVER_ANSWER_FLAG_TC = Counter("resolver_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"]) -RESOLVER_ANSWER_FLAG_RA = Counter( - "resolver_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"] -) -RESOLVER_ANSWER_FLAG_RD = Counter( - "resolver_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"] -) -RESOLVER_ANSWER_FLAG_AD = Counter( - "resolver_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"] -) -RESOLVER_ANSWER_FLAG_CD = Counter( - "resolver_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"] -) -RESOLVER_ANSWER_FLAG_DO = Counter("resolver_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"]) -RESOLVER_ANSWER_FLAG_EDNS0 = Counter( - "resolver_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"] -) - -RESOLVER_QUERY_EDNS = Counter("resolver_query_edns", "number of queries with EDNS present", labelnames=["instance_id"]) -RESOLVER_QUERY_DNSSEC = Counter( - "resolver_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"] -) - -RESOLVER_METRICS_LOADED = Gauge( - "resolver_metrics_loaded", - "0 if metrics from resolver instance were not loaded, otherwise 1", - labelnames=["instance_id"], -) - - -_ALL_RESOLVER_METRICS: List[Union[Counter, Gauge, Histogram]] = [ - RESOLVER_RESPONSE_LATENCY, - RESOLVER_REQUEST_TOTAL, - RESOLVER_REQUEST_INTERNAL, - RESOLVER_REQUEST_UDP, - RESOLVER_REQUEST_TCP, - RESOLVER_REQUEST_DOT, - RESOLVER_REQUEST_DOH, - RESOLVER_REQUEST_XDP, - RESOLVER_ANSWER_TOTAL, - RESOLVER_ANSWER_CACHED, - 
RESOLVER_ANSWER_RCODE_NOERROR, - RESOLVER_ANSWER_RCODE_NODATA, - RESOLVER_ANSWER_RCODE_NXDOMAIN, - RESOLVER_ANSWER_RCODE_SERVFAIL, - RESOLVER_ANSWER_FLAG_AA, - RESOLVER_ANSWER_FLAG_TC, - RESOLVER_ANSWER_FLAG_RA, - RESOLVER_ANSWER_FLAG_RD, - RESOLVER_ANSWER_FLAG_AD, - RESOLVER_ANSWER_FLAG_CD, - RESOLVER_ANSWER_FLAG_DO, - RESOLVER_ANSWER_FLAG_EDNS0, - RESOLVER_QUERY_EDNS, - RESOLVER_QUERY_DNSSEC, - RESOLVER_METRICS_LOADED, -] - MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram( "manager_request_reconfigure_latency", "Time it takes to change configuration" ) @@ -161,75 +52,224 @@ async def _command_registered_resolvers(cmd: str) -> Dict[KresID, str]: return dict(pairs) -def _parse_resolver_metrics(instance_id: KresID, metrics: Any) -> None: - # Uses private fields in order to translate kresd statistics into prometheus's library internal objects. - # pylint: disable=protected-access - # pyright: reportUnknownMemberType=false - - sid = str(instance_id) - - # response latency histogram - for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")): - RESOLVER_RESPONSE_LATENCY.labels(sid)._buckets[i].set(metrics[f"answer.{duration}"]) - # TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721 - # RESOLVER_RESPONSE_LATENCY.labels(str(id))._sum.set(sum) - - RESOLVER_REQUEST_TOTAL.labels(sid)._value.set(metrics["request.total"]) - RESOLVER_REQUEST_INTERNAL.labels(sid)._value.set(metrics["request.internal"]) - RESOLVER_REQUEST_UDP.labels(sid)._value.set(metrics["request.udp"]) - RESOLVER_REQUEST_TCP.labels(sid)._value.set(metrics["request.tcp"]) - RESOLVER_REQUEST_DOT.labels(sid)._value.set(metrics["request.dot"]) - RESOLVER_REQUEST_DOH.labels(sid)._value.set(metrics["request.doh"]) - RESOLVER_REQUEST_XDP.labels(sid)._value.set(metrics["request.xdp"]) - - RESOLVER_ANSWER_TOTAL.labels(sid)._value.set(metrics["answer.total"]) - RESOLVER_ANSWER_CACHED.labels(sid)._value.set(metrics["answer.cached"]) - - RESOLVER_ANSWER_RCODE_NOERROR.labels(sid)._value.set(metrics["answer.noerror"]) - RESOLVER_ANSWER_RCODE_NODATA.labels(sid)._value.set(metrics["answer.nodata"]) - RESOLVER_ANSWER_RCODE_NXDOMAIN.labels(sid)._value.set(metrics["answer.nxdomain"]) - RESOLVER_ANSWER_RCODE_SERVFAIL.labels(sid)._value.set(metrics["answer.servfail"]) - - RESOLVER_ANSWER_FLAG_AA.labels(sid)._value.set(metrics["answer.aa"]) - RESOLVER_ANSWER_FLAG_TC.labels(sid)._value.set(metrics["answer.tc"]) - RESOLVER_ANSWER_FLAG_RA.labels(sid)._value.set(metrics["answer.ra"]) - RESOLVER_ANSWER_FLAG_RD.labels(sid)._value.set(metrics["answer.rd"]) - RESOLVER_ANSWER_FLAG_AD.labels(sid)._value.set(metrics["answer.ad"]) - RESOLVER_ANSWER_FLAG_CD.labels(sid)._value.set(metrics["answer.cd"]) - RESOLVER_ANSWER_FLAG_DO.labels(sid)._value.set(metrics["answer.do"]) - RESOLVER_ANSWER_FLAG_EDNS0.labels(sid)._value.set(metrics["answer.edns0"]) - - RESOLVER_QUERY_EDNS.labels(sid)._value.set(metrics["query.edns"]) - RESOLVER_QUERY_DNSSEC.labels(sid)._value.set(metrics["query.dnssec"]) - - -async def _collect_resolver_stats(lazy: bool) -> None: - ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()" - STATS_QUERY = "collect_statistics()" - - cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY - stats_raw = await _command_registered_resolvers(cmd) - - for kid, raw in stats_raw.items(): - RESOLVER_METRICS_LOADED.labels(str(id)).set(0) - try: - metrics = json.loads(raw[1:-1]) - _parse_resolver_metrics(kid, metrics) - - # mark that metrics have been loaded - 
RESOLVER_METRICS_LOADED.labels(str(id)).set(1) - except json.JSONDecodeError: - logger.warning("Failed to load metrics from resolver instance %d", id) +def _counter(name: str, description: str, label: Tuple[str, str], value: float) -> CounterMetricFamily: + c = CounterMetricFamily(name, description, labels=(label[0],)) + c.add_metric(label[1], value) # type: ignore + return c + + +def _gauge(name: str, description: str, label: Tuple[str, str], value: float) -> GaugeMetricFamily: + c = GaugeMetricFamily(name, description, labels=(label[0],)) + c.add_metric(label[1], value) # type: ignore + return c + + +def _histogram( + name: str, description: str, label: Tuple[str, str], buckets: List[Tuple[str, int]], sum_value: float +) -> HistogramMetricFamily: + c = HistogramMetricFamily(name, description, labels=(label[0],)) + c.add_metric(label[1], buckets, sum_value=sum_value) # type: ignore + return c + + +class ResolverCollector: + def __init__(self) -> None: + self._stats_raw: Optional[Dict[KresID, str]] = None + + def set_stats(self, stats_raw: Optional[Dict[KresID, str]]) -> None: + self._stats_raw = stats_raw + + def collect(self) -> Generator[Metric, None, None]: + if self._stats_raw is None: + return + + for kid, raw in self._stats_raw.items(): + success = False + try: + metrics: Dict[str, int] = json.loads(raw[1:-1]) + yield from self._parse_resolver_metrics(kid, metrics) + success = True + except json.JSONDecodeError: + logger.warning("Failed to load metrics from resolver instance %s: failed to parse statistics", str(kid)) + except KeyError as e: + logger.warning( + "Failed to load metrics from resolver instance %s: attempted to read missing statistic %s", + str(kid), + str(e), + ) + + yield _gauge( + "resolver_metrics_loaded", + "0 if metrics from resolver instance were not loaded, otherwise 1", + label=("instance_id", str(kid)), + value=int(success), + ) + + def _parse_resolver_metrics(self, instance_id: KresID, metrics: Any) -> Generator[Metric, None, None]: + sid = str(instance_id) + + # response latency histogram + BUCKET_NAMES_IN_RESOLVER = ("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow") + BUCKET_NAMES_PROMETHEUS = ("0.001", "0.01", "0.05", "0.1", "0.25", "0.5", "1.0", "1.5", "+Inf") + yield _histogram( + "resolver_response_latency", + "Time it takes to respond to queries in seconds", + label=("instance_id", sid), + buckets=[ + (bnp, metrics[f"answer.{duration}"]) + for bnp, duration in zip(BUCKET_NAMES_PROMETHEUS, BUCKET_NAMES_IN_RESOLVER) + ], + sum_value=metrics["answer.sum_ms"] / 1_000, + ) + + yield _counter( + "resolver_request_total", + "total number of DNS requests (including internal client requests)", + label=("instance_id", sid), + value=metrics["request.total"], + ) + yield _counter( + "resolver_request_internal", + "number of internal requests generated by Knot Resolver (e.g. 
DNSSEC trust anchor updates)", + label=("instance_id", sid), + value=metrics["request.internal"], + ) + yield _counter( + "resolver_request_udp", + "number of external requests received over plain UDP (RFC 1035)", + label=("instance_id", sid), + value=metrics["request.udp"], + ) + yield _counter( + "resolver_request_tcp", + "number of external requests received over plain TCP (RFC 1035)", + label=("instance_id", sid), + value=metrics["request.tcp"], + ) + yield _counter( + "resolver_request_dot", + "number of external requests received over DNS-over-TLS (RFC 7858)", + label=("instance_id", sid), + value=metrics["request.dot"], + ) + yield _counter( + "resolver_request_doh", + "number of external requests received over DNS-over-HTTP (RFC 8484)", + label=("instance_id", sid), + value=metrics["request.doh"], + ) + yield _counter( + "resolver_request_xdp", + "number of external requests received over plain UDP via an AF_XDP socket", + label=("instance_id", sid), + value=metrics["request.xdp"], + ) + yield _counter( + "resolver_answer_total", + "total number of answered queries", + label=("instance_id", sid), + value=metrics["answer.total"], + ) + yield _counter( + "resolver_answer_cached", + "number of queries answered from cache", + label=("instance_id", sid), + value=metrics["answer.cached"], + ) + yield _counter( + "resolver_answer_rcode_noerror", + "number of NOERROR answers", + label=("instance_id", sid), + value=metrics["answer.noerror"], + ) + yield _counter( + "resolver_answer_rcode_nodata", + "number of NOERROR answers without any data", + label=("instance_id", sid), + value=metrics["answer.nodata"], + ) + yield _counter( + "resolver_answer_rcode_nxdomain", + "number of NXDOMAIN answers", + label=("instance_id", sid), + value=metrics["answer.nxdomain"], + ) + yield _counter( + "resolver_answer_rcode_servfail", + "number of SERVFAIL answers", + label=("instance_id", sid), + value=metrics["answer.servfail"], + ) + yield _counter( + "resolver_answer_flag_aa", + "number of authoritative answers", + label=("instance_id", sid), + value=metrics["answer.aa"], + ) + yield _counter( + "resolver_answer_flag_tc", + "number of truncated answers", + label=("instance_id", sid), + value=metrics["answer.tc"], + ) + yield _counter( + "resolver_answer_flag_ra", + "number of answers with recursion available flag", + label=("instance_id", sid), + value=metrics["answer.ra"], + ) + yield _counter( + "resolver_answer_flags_rd", + "number of recursion desired (in answer!)", + label=("instance_id", sid), + value=metrics["answer.rd"], + ) + yield _counter( + "resolver_answer_flag_ad", + "number of authentic data (DNSSEC) answers", + label=("instance_id", sid), + value=metrics["answer.ad"], + ) + yield _counter( + "resolver_answer_flag_cd", + "number of checking disabled (DNSSEC) answers", + label=("instance_id", sid), + value=metrics["answer.cd"], + ) + yield _counter( + "resolver_answer_flag_do", + "number of DNSSEC answer OK", + label=("instance_id", sid), + value=metrics["answer.do"], + ) + yield _counter( + "resolver_answer_flag_edns0", + "number of answers with EDNS0 present", + label=("instance_id", sid), + value=metrics["answer.edns0"], + ) + yield _counter( + "resolver_query_edns", + "number of queries with EDNS present", + label=("instance_id", sid), + value=metrics["query.edns"], + ) + yield _counter( + "resolver_query_dnssec", + "number of queries with DNSSEC DO=1", + label=("instance_id", sid), + value=metrics["query.dnssec"], + ) + + +_RESOLVER_COLLECTOR = ResolverCollector() 
+REGISTRY.register(_RESOLVER_COLLECTOR) # type: ignore def unregister_resolver_metrics_for(subprocess: Subprocess) -> None: """ Cancel metric collection from resolver subprocess """ - sid = str(subprocess.id) - for metric in _ALL_RESOLVER_METRICS: - metric.remove(sid) - del _REGISTERED_RESOLVERS[subprocess.id] @@ -237,10 +277,6 @@ def register_resolver_metrics_for(subprocess: Subprocess) -> None: """ Register resolver subprocess for metric collection """ - sid = str(subprocess.id) - for metric in _ALL_RESOLVER_METRICS: - metric.labels(sid) - _REGISTERED_RESOLVERS[subprocess.id] = subprocess @@ -249,7 +285,17 @@ async def report_stats(config: KresConfig) -> bytes: Collects metrics from everything, returns data string in Prometheus format. """ - if config.monitoring.state != "manager-only": - await _collect_resolver_stats(config.monitoring.state == "lazy") + try: + if config.monitoring.state != "manager-only": + lazy = config.monitoring.state == "lazy" + ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()" + STATS_QUERY = "collect_statistics()" + + cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY + stats_raw = await _command_registered_resolvers(cmd) + _RESOLVER_COLLECTOR.set_stats(stats_raw) - return exposition.generate_latest() + return exposition.generate_latest() # type: ignore + finally: + # after the report has been generated, clean everything + _RESOLVER_COLLECTOR.set_stats(None) diff --git a/manager/scripts/run b/manager/scripts/run index f43912aef..ec340bdf0 100755 --- a/manager/scripts/run +++ b/manager/scripts/run @@ -4,6 +4,18 @@ src_dir="$(dirname "$(realpath "$0")")" source $src_dir/_env.sh +echo +echo Building Knot Resolver +echo ---------------------- +cd .. +mkdir -p manager/.build_kresd manager/.install_kresd +meson manager/.build_kresd --prefix=$(realpath manager/.install_kresd) --default-library=static --buildtype=debug +ninja -C manager/.build_kresd +ninja install -C manager/.build_kresd +export PATH="$(realpath manager/.install_kresd)/sbin:$PATH" +cd manager + +echo echo Knot Manager API is accessible on http://localhost:5000 echo ------------------------------------------------------- -- 2.47.3
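
Note (not part of the patch): the core technique above is the prometheus_client custom-collector pattern, where a plain object with a collect() method builds fresh metric families on every scrape instead of poking the library's protected per-metric state. The following is a minimal, standalone sketch of that pattern under the same assumptions the patch makes about prometheus_client; the names DemoCollector, demo_request_total, demo_metrics_loaded and the hard-coded sample values are illustrative only and do not appear in the patch.

# Minimal sketch of a prometheus_client custom collector.
# Illustrative names/values only; not part of the patch.
from typing import Generator

from prometheus_client import generate_latest
from prometheus_client.core import REGISTRY, CounterMetricFamily, GaugeMetricFamily, Metric


class DemoCollector:
    """Builds metric families on every scrape instead of mutating
    the library's internal counter/gauge state."""

    def __init__(self) -> None:
        # In the patch, the equivalent slot is filled from kresd's JSON
        # statistics right before generate_latest() is called.
        self._sample = {"request.total": 42, "loaded": 1}

    def collect(self) -> Generator[Metric, None, None]:
        c = CounterMetricFamily(
            "demo_request_total",
            "total number of requests (illustrative)",
            labels=["instance_id"],
        )
        c.add_metric(["kresd1"], self._sample["request.total"])
        yield c

        g = GaugeMetricFamily(
            "demo_metrics_loaded",
            "1 if statistics were loaded, 0 otherwise (illustrative)",
            labels=["instance_id"],
        )
        g.add_metric(["kresd1"], self._sample["loaded"])
        yield g


# Registration only requires an object exposing collect(); the registry then
# calls it on every scrape, which is why report_stats() above can refresh the
# raw statistics and simply invoke exposition.generate_latest().
REGISTRY.register(DemoCollector())
print(generate_latest(REGISTRY).decode())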