import asyncio
import json
import logging
-from typing import Any, Awaitable, Callable, Dict, List, Tuple, TypeVar, Union
-
-from prometheus_client import Counter, Gauge, Histogram, exposition # type: ignore
+from typing import Any, Awaitable, Callable, Dict, Generator, List, Optional, Tuple, TypeVar
+
+from prometheus_client import Histogram, exposition # type: ignore
+from prometheus_client.core import ( # type: ignore
+ REGISTRY,
+ CounterMetricFamily,
+ GaugeMetricFamily,
+ HistogramMetricFamily,
+ Metric,
+)
from knot_resolver_manager.datamodel.config_schema import KresConfig
from knot_resolver_manager.kres_id import KresID
logger = logging.getLogger(__name__)
-RESOLVER_RESPONSE_LATENCY = Histogram(
- "resolver_response_latency",
- "Time it takes to respond to queries in seconds",
- buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")],
- labelnames=["instance_id"],
-)
-RESOLVER_REQUEST_TOTAL = Counter(
- "resolver_request_total",
- "total number of DNS requests (including internal client requests)",
- labelnames=["instance_id"],
-)
-RESOLVER_REQUEST_INTERNAL = Counter(
- "resolver_request_internal",
- "number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)",
- labelnames=["instance_id"],
-)
-RESOLVER_REQUEST_UDP = Counter(
- "resolver_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"]
-)
-RESOLVER_REQUEST_TCP = Counter(
- "resolver_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"]
-)
-RESOLVER_REQUEST_DOT = Counter(
- "resolver_request_dot",
- "number of external requests received over DNS-over-TLS (RFC 7858)",
- labelnames=["instance_id"],
-)
-RESOLVER_REQUEST_DOH = Counter(
- "resolver_request_doh",
- "number of external requests received over DNS-over-HTTP (RFC 8484)",
- labelnames=["instance_id"],
-)
-RESOLVER_REQUEST_XDP = Counter(
- "resolver_request_xdp",
- "number of external requests received over plain UDP via an AF_XDP socket",
- labelnames=["instance_id"],
-)
-RESOLVER_ANSWER_TOTAL = Counter("resolver_answer_total", "total number of answered queries", labelnames=["instance_id"])
-RESOLVER_ANSWER_CACHED = Counter(
- "resolver_answer_cached", "number of queries answered from cache", labelnames=["instance_id"]
-)
-
-RESOLVER_ANSWER_RCODE_NOERROR = Counter(
- "resolver_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_RCODE_NODATA = Counter(
- "resolver_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_RCODE_NXDOMAIN = Counter(
- "resolver_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_RCODE_SERVFAIL = Counter(
- "resolver_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"]
-)
-
-RESOLVER_ANSWER_FLAG_AA = Counter(
- "resolver_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_FLAG_TC = Counter("resolver_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"])
-RESOLVER_ANSWER_FLAG_RA = Counter(
- "resolver_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_FLAG_RD = Counter(
- "resolver_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_FLAG_AD = Counter(
- "resolver_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_FLAG_CD = Counter(
- "resolver_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"]
-)
-RESOLVER_ANSWER_FLAG_DO = Counter("resolver_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"])
-RESOLVER_ANSWER_FLAG_EDNS0 = Counter(
- "resolver_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"]
-)
-
-RESOLVER_QUERY_EDNS = Counter("resolver_query_edns", "number of queries with EDNS present", labelnames=["instance_id"])
-RESOLVER_QUERY_DNSSEC = Counter(
- "resolver_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"]
-)
-
-RESOLVER_METRICS_LOADED = Gauge(
- "resolver_metrics_loaded",
- "0 if metrics from resolver instance were not loaded, otherwise 1",
- labelnames=["instance_id"],
-)
-
-
-_ALL_RESOLVER_METRICS: List[Union[Counter, Gauge, Histogram]] = [
- RESOLVER_RESPONSE_LATENCY,
- RESOLVER_REQUEST_TOTAL,
- RESOLVER_REQUEST_INTERNAL,
- RESOLVER_REQUEST_UDP,
- RESOLVER_REQUEST_TCP,
- RESOLVER_REQUEST_DOT,
- RESOLVER_REQUEST_DOH,
- RESOLVER_REQUEST_XDP,
- RESOLVER_ANSWER_TOTAL,
- RESOLVER_ANSWER_CACHED,
- RESOLVER_ANSWER_RCODE_NOERROR,
- RESOLVER_ANSWER_RCODE_NODATA,
- RESOLVER_ANSWER_RCODE_NXDOMAIN,
- RESOLVER_ANSWER_RCODE_SERVFAIL,
- RESOLVER_ANSWER_FLAG_AA,
- RESOLVER_ANSWER_FLAG_TC,
- RESOLVER_ANSWER_FLAG_RA,
- RESOLVER_ANSWER_FLAG_RD,
- RESOLVER_ANSWER_FLAG_AD,
- RESOLVER_ANSWER_FLAG_CD,
- RESOLVER_ANSWER_FLAG_DO,
- RESOLVER_ANSWER_FLAG_EDNS0,
- RESOLVER_QUERY_EDNS,
- RESOLVER_QUERY_DNSSEC,
- RESOLVER_METRICS_LOADED,
-]
-
MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram(
"manager_request_reconfigure_latency", "Time it takes to change configuration"
)
return dict(pairs)
-def _parse_resolver_metrics(instance_id: KresID, metrics: Any) -> None:
- # Uses private fields in order to translate kresd statistics into prometheus's library internal objects.
- # pylint: disable=protected-access
- # pyright: reportUnknownMemberType=false
-
- sid = str(instance_id)
-
- # response latency histogram
- for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")):
- RESOLVER_RESPONSE_LATENCY.labels(sid)._buckets[i].set(metrics[f"answer.{duration}"])
- # TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721
- # RESOLVER_RESPONSE_LATENCY.labels(str(id))._sum.set(sum)
-
- RESOLVER_REQUEST_TOTAL.labels(sid)._value.set(metrics["request.total"])
- RESOLVER_REQUEST_INTERNAL.labels(sid)._value.set(metrics["request.internal"])
- RESOLVER_REQUEST_UDP.labels(sid)._value.set(metrics["request.udp"])
- RESOLVER_REQUEST_TCP.labels(sid)._value.set(metrics["request.tcp"])
- RESOLVER_REQUEST_DOT.labels(sid)._value.set(metrics["request.dot"])
- RESOLVER_REQUEST_DOH.labels(sid)._value.set(metrics["request.doh"])
- RESOLVER_REQUEST_XDP.labels(sid)._value.set(metrics["request.xdp"])
-
- RESOLVER_ANSWER_TOTAL.labels(sid)._value.set(metrics["answer.total"])
- RESOLVER_ANSWER_CACHED.labels(sid)._value.set(metrics["answer.cached"])
-
- RESOLVER_ANSWER_RCODE_NOERROR.labels(sid)._value.set(metrics["answer.noerror"])
- RESOLVER_ANSWER_RCODE_NODATA.labels(sid)._value.set(metrics["answer.nodata"])
- RESOLVER_ANSWER_RCODE_NXDOMAIN.labels(sid)._value.set(metrics["answer.nxdomain"])
- RESOLVER_ANSWER_RCODE_SERVFAIL.labels(sid)._value.set(metrics["answer.servfail"])
-
- RESOLVER_ANSWER_FLAG_AA.labels(sid)._value.set(metrics["answer.aa"])
- RESOLVER_ANSWER_FLAG_TC.labels(sid)._value.set(metrics["answer.tc"])
- RESOLVER_ANSWER_FLAG_RA.labels(sid)._value.set(metrics["answer.ra"])
- RESOLVER_ANSWER_FLAG_RD.labels(sid)._value.set(metrics["answer.rd"])
- RESOLVER_ANSWER_FLAG_AD.labels(sid)._value.set(metrics["answer.ad"])
- RESOLVER_ANSWER_FLAG_CD.labels(sid)._value.set(metrics["answer.cd"])
- RESOLVER_ANSWER_FLAG_DO.labels(sid)._value.set(metrics["answer.do"])
- RESOLVER_ANSWER_FLAG_EDNS0.labels(sid)._value.set(metrics["answer.edns0"])
-
- RESOLVER_QUERY_EDNS.labels(sid)._value.set(metrics["query.edns"])
- RESOLVER_QUERY_DNSSEC.labels(sid)._value.set(metrics["query.dnssec"])
-
-
-async def _collect_resolver_stats(lazy: bool) -> None:
- ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
- STATS_QUERY = "collect_statistics()"
-
- cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY
- stats_raw = await _command_registered_resolvers(cmd)
-
- for kid, raw in stats_raw.items():
- RESOLVER_METRICS_LOADED.labels(str(id)).set(0)
- try:
- metrics = json.loads(raw[1:-1])
- _parse_resolver_metrics(kid, metrics)
-
- # mark that metrics have been loaded
- RESOLVER_METRICS_LOADED.labels(str(id)).set(1)
- except json.JSONDecodeError:
- logger.warning("Failed to load metrics from resolver instance %d", id)
+def _counter(name: str, description: str, label: Tuple[str, str], value: float) -> CounterMetricFamily:
+    """Create a counter metric family with a single sample labeled by (label name, label value)."""
+    c = CounterMetricFamily(name, description, labels=(label[0],))
+    # add_metric() takes a *sequence* of label values; a bare string would be zipped
+    # per-character against the label names and truncate multi-character instance ids.
+    c.add_metric((label[1],), value)  # type: ignore
+    return c
+
+
+def _gauge(name: str, description: str, label: Tuple[str, str], value: float) -> GaugeMetricFamily:
+    """Create a gauge metric family with a single sample labeled by (label name, label value)."""
+    c = GaugeMetricFamily(name, description, labels=(label[0],))
+    # add_metric() takes a *sequence* of label values; a bare string would be zipped
+    # per-character against the label names and truncate multi-character instance ids.
+    c.add_metric((label[1],), value)  # type: ignore
+    return c
+
+
+def _histogram(
+    name: str, description: str, label: Tuple[str, str], buckets: List[Tuple[str, int]], sum_value: float
+) -> HistogramMetricFamily:
+    """Create a histogram metric family with one labeled sample from pre-aggregated buckets.
+
+    :param buckets: cumulative (upper-bound, count) pairs, last bound being "+Inf"
+    :param sum_value: total of all observed values (seconds)
+    """
+    c = HistogramMetricFamily(name, description, labels=(label[0],))
+    # add_metric() takes a *sequence* of label values; a bare string would be zipped
+    # per-character against the label names and truncate multi-character instance ids.
+    c.add_metric((label[1],), buckets, sum_value=sum_value)  # type: ignore
+    return c
+
+
+class ResolverCollector:
+    """Custom prometheus collector translating raw kresd statistics into metric families.
+
+    Instead of mutating stateful Counter/Gauge objects, the raw per-instance JSON
+    statistics are stashed via set_stats() and converted into fresh metric families
+    each time the registry scrapes collect().
+    """
+
+    def __init__(self) -> None:
+        # Raw JSON statistics string per resolver instance id; None means
+        # "nothing to report" and makes collect() yield no resolver metrics.
+        self._stats_raw: Optional[Dict[KresID, str]] = None
+
+    def set_stats(self, stats_raw: Optional[Dict[KresID, str]]) -> None:
+        """Store (or clear, when None) the raw statistics exposed by the next collect()."""
+        self._stats_raw = stats_raw
+
+    def collect(self) -> Generator[Metric, None, None]:
+        # Invoked by prometheus_client's registry on every scrape.
+        if self._stats_raw is None:
+            return
+
+        for kid, raw in self._stats_raw.items():
+            success = False
+            try:
+                # Strip the first and last character of the payload before parsing
+                # (the resolver wraps the JSON object in an extra delimiter pair).
+                metrics: Dict[str, int] = json.loads(raw[1:-1])
+                yield from self._parse_resolver_metrics(kid, metrics)
+                success = True
+            except json.JSONDecodeError:
+                logger.warning("Failed to load metrics from resolver instance %s: failed to parse statistics", str(kid))
+            except KeyError as e:
+                # _parse_resolver_metrics reads statistics by key; a missing key aborts
+                # this instance's metrics but not the whole scrape.
+                logger.warning(
+                    "Failed to load metrics from resolver instance %s: attempted to read missing statistic %s",
+                    str(kid),
+                    str(e),
+                )
+
+            # Always report, per instance, whether its metrics could be loaded.
+            yield _gauge(
+                "resolver_metrics_loaded",
+                "0 if metrics from resolver instance were not loaded, otherwise 1",
+                label=("instance_id", str(kid)),
+                value=int(success),
+            )
+
+    def _parse_resolver_metrics(self, instance_id: KresID, metrics: Any) -> Generator[Metric, None, None]:
+        """Yield one metric family per kresd statistic, labeled with this instance's id.
+
+        Raises KeyError when an expected statistic is absent from `metrics`
+        (handled by collect()).
+        """
+        sid = str(instance_id)
+
+        # response latency histogram
+        # Map the resolver's named latency buckets onto prometheus upper bounds (seconds).
+        BUCKET_NAMES_IN_RESOLVER = ("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")
+        BUCKET_NAMES_PROMETHEUS = ("0.001", "0.01", "0.05", "0.1", "0.25", "0.5", "1.0", "1.5", "+Inf")
+        yield _histogram(
+            "resolver_response_latency",
+            "Time it takes to respond to queries in seconds",
+            label=("instance_id", sid),
+            buckets=[
+                (bnp, metrics[f"answer.{duration}"])
+                for bnp, duration in zip(BUCKET_NAMES_PROMETHEUS, BUCKET_NAMES_IN_RESOLVER)
+            ],
+            # resolver reports the sum in milliseconds; prometheus convention is seconds
+            sum_value=metrics["answer.sum_ms"] / 1_000,
+        )
+
+        yield _counter(
+            "resolver_request_total",
+            "total number of DNS requests (including internal client requests)",
+            label=("instance_id", sid),
+            value=metrics["request.total"],
+        )
+        yield _counter(
+            "resolver_request_internal",
+            "number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)",
+            label=("instance_id", sid),
+            value=metrics["request.internal"],
+        )
+        yield _counter(
+            "resolver_request_udp",
+            "number of external requests received over plain UDP (RFC 1035)",
+            label=("instance_id", sid),
+            value=metrics["request.udp"],
+        )
+        yield _counter(
+            "resolver_request_tcp",
+            "number of external requests received over plain TCP (RFC 1035)",
+            label=("instance_id", sid),
+            value=metrics["request.tcp"],
+        )
+        yield _counter(
+            "resolver_request_dot",
+            "number of external requests received over DNS-over-TLS (RFC 7858)",
+            label=("instance_id", sid),
+            value=metrics["request.dot"],
+        )
+        yield _counter(
+            "resolver_request_doh",
+            "number of external requests received over DNS-over-HTTP (RFC 8484)",
+            label=("instance_id", sid),
+            value=metrics["request.doh"],
+        )
+        yield _counter(
+            "resolver_request_xdp",
+            "number of external requests received over plain UDP via an AF_XDP socket",
+            label=("instance_id", sid),
+            value=metrics["request.xdp"],
+        )
+        yield _counter(
+            "resolver_answer_total",
+            "total number of answered queries",
+            label=("instance_id", sid),
+            value=metrics["answer.total"],
+        )
+        yield _counter(
+            "resolver_answer_cached",
+            "number of queries answered from cache",
+            label=("instance_id", sid),
+            value=metrics["answer.cached"],
+        )
+        yield _counter(
+            "resolver_answer_rcode_noerror",
+            "number of NOERROR answers",
+            label=("instance_id", sid),
+            value=metrics["answer.noerror"],
+        )
+        yield _counter(
+            "resolver_answer_rcode_nodata",
+            "number of NOERROR answers without any data",
+            label=("instance_id", sid),
+            value=metrics["answer.nodata"],
+        )
+        yield _counter(
+            "resolver_answer_rcode_nxdomain",
+            "number of NXDOMAIN answers",
+            label=("instance_id", sid),
+            value=metrics["answer.nxdomain"],
+        )
+        yield _counter(
+            "resolver_answer_rcode_servfail",
+            "number of SERVFAIL answers",
+            label=("instance_id", sid),
+            value=metrics["answer.servfail"],
+        )
+        yield _counter(
+            "resolver_answer_flag_aa",
+            "number of authoritative answers",
+            label=("instance_id", sid),
+            value=metrics["answer.aa"],
+        )
+        yield _counter(
+            "resolver_answer_flag_tc",
+            "number of truncated answers",
+            label=("instance_id", sid),
+            value=metrics["answer.tc"],
+        )
+        yield _counter(
+            "resolver_answer_flag_ra",
+            "number of answers with recursion available flag",
+            label=("instance_id", sid),
+            value=metrics["answer.ra"],
+        )
+        # NOTE(review): "flags_rd" (plural) is inconsistent with the other flag metrics,
+        # but matches the pre-existing metric name — kept for backward compatibility.
+        yield _counter(
+            "resolver_answer_flags_rd",
+            "number of recursion desired (in answer!)",
+            label=("instance_id", sid),
+            value=metrics["answer.rd"],
+        )
+        yield _counter(
+            "resolver_answer_flag_ad",
+            "number of authentic data (DNSSEC) answers",
+            label=("instance_id", sid),
+            value=metrics["answer.ad"],
+        )
+        yield _counter(
+            "resolver_answer_flag_cd",
+            "number of checking disabled (DNSSEC) answers",
+            label=("instance_id", sid),
+            value=metrics["answer.cd"],
+        )
+        yield _counter(
+            "resolver_answer_flag_do",
+            "number of DNSSEC answer OK",
+            label=("instance_id", sid),
+            value=metrics["answer.do"],
+        )
+        yield _counter(
+            "resolver_answer_flag_edns0",
+            "number of answers with EDNS0 present",
+            label=("instance_id", sid),
+            value=metrics["answer.edns0"],
+        )
+        yield _counter(
+            "resolver_query_edns",
+            "number of queries with EDNS present",
+            label=("instance_id", sid),
+            value=metrics["query.edns"],
+        )
+        yield _counter(
+            "resolver_query_dnssec",
+            "number of queries with DNSSEC DO=1",
+            label=("instance_id", sid),
+            value=metrics["query.dnssec"],
+        )
+
+
+# Module-level singleton: registered once with the default prometheus registry so
+# every scrape pulls whatever stats were last handed to it via set_stats().
+_RESOLVER_COLLECTOR = ResolverCollector()
+REGISTRY.register(_RESOLVER_COLLECTOR)  # type: ignore
def unregister_resolver_metrics_for(subprocess: Subprocess) -> None:
"""
Cancel metric collection from resolver subprocess
"""
- sid = str(subprocess.id)
- for metric in _ALL_RESOLVER_METRICS:
- metric.remove(sid)
-
del _REGISTERED_RESOLVERS[subprocess.id]
"""
Register resolver subprocess for metric collection
"""
- sid = str(subprocess.id)
- for metric in _ALL_RESOLVER_METRICS:
- metric.labels(sid)
-
_REGISTERED_RESOLVERS[subprocess.id] = subprocess
Collects metrics from everything, returns data string in Prometheus format.
"""
- if config.monitoring.state != "manager-only":
- await _collect_resolver_stats(config.monitoring.state == "lazy")
+ try:
+ if config.monitoring.state != "manager-only":
+ lazy = config.monitoring.state == "lazy"
+ ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
+ STATS_QUERY = "collect_statistics()"
+
+ cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY
+ stats_raw = await _command_registered_resolvers(cmd)
+ _RESOLVER_COLLECTOR.set_stats(stats_raw)
- return exposition.generate_latest()
+ return exposition.generate_latest() # type: ignore
+ finally:
+ # after the report has been generated, clean everything
+ _RESOLVER_COLLECTOR.set_stats(None)