+import asyncio
import json
-from typing import Any, Coroutine
+import logging
+from typing import Any, Awaitable, Callable, Dict, List, Tuple, TypeVar, Union
-from prometheus_client import Counter, Histogram, exposition
+from prometheus_client import Counter, Gauge, Histogram, exposition # type: ignore
from knot_resolver_manager.datamodel.config_schema import KresConfig
from knot_resolver_manager.kres_id import KresID
-from knot_resolver_manager.kres_manager import KresManager
+from knot_resolver_manager.kresd_controller.interface import Subprocess
-KRESD_RESPONSE_LATENCY = Histogram(
- "kresd_response_latency",
+logger = logging.getLogger(__name__)
+
+RESOLVER_RESPONSE_LATENCY = Histogram(
+ "resolver_response_latency",
"Time it takes to respond to queries in seconds",
buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")],
labelnames=["instance_id"],
)
-KRESD_REQUEST_TOTAL = Counter(
- "kresd_request_total",
+RESOLVER_REQUEST_TOTAL = Counter(
+ "resolver_request_total",
"total number of DNS requests (including internal client requests)",
labelnames=["instance_id"],
)
-KRESD_REQUEST_INTERNAL = Counter(
- "kresd_request_internal",
+RESOLVER_REQUEST_INTERNAL = Counter(
+ "resolver_request_internal",
"number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)",
labelnames=["instance_id"],
)
-KRESD_REQUEST_UDP = Counter(
- "kresd_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"]
+RESOLVER_REQUEST_UDP = Counter(
+ "resolver_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"]
)
-KRESD_REQUEST_TCP = Counter(
- "kresd_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"]
+RESOLVER_REQUEST_TCP = Counter(
+ "resolver_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"]
)
-KRESD_REQUEST_DOT = Counter(
- "kresd_request_dot", "number of external requests received over DNS-over-TLS (RFC 7858)", labelnames=["instance_id"]
+RESOLVER_REQUEST_DOT = Counter(
+ "resolver_request_dot",
+ "number of external requests received over DNS-over-TLS (RFC 7858)",
+ labelnames=["instance_id"],
)
-KRESD_REQUEST_DOH = Counter(
- "kresd_request_doh",
+RESOLVER_REQUEST_DOH = Counter(
+ "resolver_request_doh",
"number of external requests received over DNS-over-HTTP (RFC 8484)",
labelnames=["instance_id"],
)
-KRESD_REQUEST_XDP = Counter(
- "kresd_request_xdp",
+RESOLVER_REQUEST_XDP = Counter(
+ "resolver_request_xdp",
"number of external requests received over plain UDP via an AF_XDP socket",
labelnames=["instance_id"],
)
-KRESD_ANSWER_TOTAL = Counter("kresd_answer_total", "total number of answered queries", labelnames=["instance_id"])
-KRESD_ANSWER_CACHED = Counter(
- "kresd_answer_cached", "number of queries answered from cache", labelnames=["instance_id"]
+RESOLVER_ANSWER_TOTAL = Counter("resolver_answer_total", "total number of answered queries", labelnames=["instance_id"])
+RESOLVER_ANSWER_CACHED = Counter(
+ "resolver_answer_cached", "number of queries answered from cache", labelnames=["instance_id"]
)
-KRESD_ANSWER_RCODE_NOERROR = Counter(
- "kresd_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_NOERROR = Counter(
+ "resolver_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"]
)
-KRESD_ANSWER_RCODE_NODATA = Counter(
- "kresd_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_NODATA = Counter(
+ "resolver_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"]
)
-KRESD_ANSWER_RCODE_NXDOMAIN = Counter(
- "kresd_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_NXDOMAIN = Counter(
+ "resolver_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"]
)
-KRESD_ANSWER_RCODE_SERVFAIL = Counter(
- "kresd_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_SERVFAIL = Counter(
+ "resolver_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"]
)
-KRESD_ANSWER_FLAG_AA = Counter("kresd_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"])
-KRESD_ANSWER_FLAG_TC = Counter("kresd_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"])
-KRESD_ANSWER_FLAG_RA = Counter(
- "kresd_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"]
+RESOLVER_ANSWER_FLAG_AA = Counter(
+ "resolver_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"]
+)
+RESOLVER_ANSWER_FLAG_TC = Counter("resolver_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"])
+RESOLVER_ANSWER_FLAG_RA = Counter(
+ "resolver_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"]
+)
+RESOLVER_ANSWER_FLAG_RD = Counter(
+ "resolver_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"]
+)
+RESOLVER_ANSWER_FLAG_AD = Counter(
+ "resolver_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"]
)
-KRESD_ANSWER_FLAG_RD = Counter(
- "kresd_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"]
+RESOLVER_ANSWER_FLAG_CD = Counter(
+ "resolver_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"]
)
-KRESD_ANSWER_FLAG_AD = Counter(
- "kresd_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_FLAG_DO = Counter("resolver_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"])
+RESOLVER_ANSWER_FLAG_EDNS0 = Counter(
+ "resolver_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"]
)
-KRESD_ANSWER_FLAG_CD = Counter(
- "kresd_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"]
+
+RESOLVER_QUERY_EDNS = Counter("resolver_query_edns", "number of queries with EDNS present", labelnames=["instance_id"])
+RESOLVER_QUERY_DNSSEC = Counter(
+ "resolver_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"]
)
-KRESD_ANSWER_FLAG_DO = Counter("kresd_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"])
-KRESD_ANSWER_FLAG_EDNS0 = Counter(
- "kresd_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"]
+
+RESOLVER_METRICS_LOADED = Gauge(
+ "resolver_metrics_loaded",
+ "0 if metrics from resolver instance were not loaded, otherwise 1",
+ labelnames=["instance_id"],
+)
+
+
+_ALL_RESOLVER_METRICS: List[Union[Counter, Gauge, Histogram]] = [
+ RESOLVER_RESPONSE_LATENCY,
+ RESOLVER_REQUEST_TOTAL,
+ RESOLVER_REQUEST_INTERNAL,
+ RESOLVER_REQUEST_UDP,
+ RESOLVER_REQUEST_TCP,
+ RESOLVER_REQUEST_DOT,
+ RESOLVER_REQUEST_DOH,
+ RESOLVER_REQUEST_XDP,
+ RESOLVER_ANSWER_TOTAL,
+ RESOLVER_ANSWER_CACHED,
+ RESOLVER_ANSWER_RCODE_NOERROR,
+ RESOLVER_ANSWER_RCODE_NODATA,
+ RESOLVER_ANSWER_RCODE_NXDOMAIN,
+ RESOLVER_ANSWER_RCODE_SERVFAIL,
+ RESOLVER_ANSWER_FLAG_AA,
+ RESOLVER_ANSWER_FLAG_TC,
+ RESOLVER_ANSWER_FLAG_RA,
+ RESOLVER_ANSWER_FLAG_RD,
+ RESOLVER_ANSWER_FLAG_AD,
+ RESOLVER_ANSWER_FLAG_CD,
+ RESOLVER_ANSWER_FLAG_DO,
+ RESOLVER_ANSWER_FLAG_EDNS0,
+ RESOLVER_QUERY_EDNS,
+ RESOLVER_QUERY_DNSSEC,
+ RESOLVER_METRICS_LOADED,
+]
+
+MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram(
+ "manager_request_reconfigure_latency", "Time it takes to change configuration"
)
-KRESD_QUERY_EDNS = Counter("kresd_query_edns", "number of queries with EDNS present", labelnames=["instance_id"])
-KRESD_QUERY_DNSSEC = Counter("kresd_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"])
+_REGISTERED_RESOLVERS: Dict[KresID, Subprocess] = {}
+
-MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram("manager_request_reconfigure_latency", "Time it takes to change configuration")
+T = TypeVar("T")
-def async_timing_histogram(metric: Histogram):
- def decorator(func: Coroutine[Any, Any, Any]):
- async def wrapper(*args, **kwargs):
+def async_timing_histogram(metric: Histogram) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
+ """
+ Decorator which can be used to report duration on async functions
+ """
+
+ def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
+ async def wrapper(*args: Any, **kwargs: Any) -> T:
with metric.time():
res = await func(*args, **kwargs)
return res
+
return wrapper
+
return decorator
-def _generate_instance_metrics(instance_id: KresID, metrics: Any) -> None:
+async def _command_registered_resolvers(cmd: str) -> Dict[KresID, str]:
+ async def single_pair(sub: Subprocess) -> Tuple[KresID, str]:
+ return sub.id, await sub.command(cmd)
+
+ pairs = await asyncio.gather(*(single_pair(inst) for inst in _REGISTERED_RESOLVERS.values()))
+ return dict(pairs)
+
+
+def _parse_resolver_metrics(instance_id: KresID, metrics: Any) -> None:
# Uses private fields in order to translate kresd statistics into prometheus's library internal objects.
# pylint: disable=protected-access
+ # pyright: reportUnknownMemberType=false
sid = str(instance_id)
# response latency histogram
for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")):
- KRESD_RESPONSE_LATENCY.labels(str(sid))._buckets[i].set(metrics[f"answer.{duration}"])
+ RESOLVER_RESPONSE_LATENCY.labels(sid)._buckets[i].set(metrics[f"answer.{duration}"])
# TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721
- # KRESD_RESPONSE_LATENCY.labels(str(id))._sum.set(sum)
-
- KRESD_REQUEST_TOTAL.labels(str(sid))._value.set(metrics["request.total"])
- KRESD_REQUEST_INTERNAL.labels(str(sid))._value.set(metrics["request.internal"])
- KRESD_REQUEST_UDP.labels(str(sid))._value.set(metrics["request.udp"])
- KRESD_REQUEST_TCP.labels(str(sid))._value.set(metrics["request.tcp"])
- KRESD_REQUEST_DOT.labels(str(sid))._value.set(metrics["request.dot"])
- KRESD_REQUEST_DOH.labels(str(sid))._value.set(metrics["request.doh"])
- KRESD_REQUEST_XDP.labels(str(sid))._value.set(metrics["request.xdp"])
-
- KRESD_ANSWER_TOTAL.labels(str(sid))._value.set(metrics["answer.total"])
- KRESD_ANSWER_CACHED.labels(str(sid))._value.set(metrics["answer.cached"])
-
- KRESD_ANSWER_RCODE_NOERROR.labels(str(sid))._value.set(metrics["answer.noerror"])
- KRESD_ANSWER_RCODE_NODATA.labels(str(sid))._value.set(metrics["answer.nodata"])
- KRESD_ANSWER_RCODE_NXDOMAIN.labels(str(sid))._value.set(metrics["answer.nxdomain"])
- KRESD_ANSWER_RCODE_SERVFAIL.labels(str(sid))._value.set(metrics["answer.servfail"])
-
- KRESD_ANSWER_FLAG_AA.labels(str(sid))._value.set(metrics["answer.aa"])
- KRESD_ANSWER_FLAG_TC.labels(str(sid))._value.set(metrics["answer.tc"])
- KRESD_ANSWER_FLAG_RA.labels(str(sid))._value.set(metrics["answer.ra"])
- KRESD_ANSWER_FLAG_RD.labels(str(sid))._value.set(metrics["answer.rd"])
- KRESD_ANSWER_FLAG_AD.labels(str(sid))._value.set(metrics["answer.ad"])
- KRESD_ANSWER_FLAG_CD.labels(str(sid))._value.set(metrics["answer.cd"])
- KRESD_ANSWER_FLAG_DO.labels(str(sid))._value.set(metrics["answer.do"])
- KRESD_ANSWER_FLAG_EDNS0.labels(str(sid))._value.set(metrics["answer.edns0"])
-
- KRESD_QUERY_EDNS.labels(str(sid))._value.set(metrics["query.edns"])
- KRESD_QUERY_DNSSEC.labels(str(sid))._value.set(metrics["query.dnssec"])
-
-
-async def collect(_config: KresConfig, manager: KresManager) -> bytes:
+ # RESOLVER_RESPONSE_LATENCY.labels(str(id))._sum.set(sum)
+
+ RESOLVER_REQUEST_TOTAL.labels(sid)._value.set(metrics["request.total"])
+ RESOLVER_REQUEST_INTERNAL.labels(sid)._value.set(metrics["request.internal"])
+ RESOLVER_REQUEST_UDP.labels(sid)._value.set(metrics["request.udp"])
+ RESOLVER_REQUEST_TCP.labels(sid)._value.set(metrics["request.tcp"])
+ RESOLVER_REQUEST_DOT.labels(sid)._value.set(metrics["request.dot"])
+ RESOLVER_REQUEST_DOH.labels(sid)._value.set(metrics["request.doh"])
+ RESOLVER_REQUEST_XDP.labels(sid)._value.set(metrics["request.xdp"])
+
+ RESOLVER_ANSWER_TOTAL.labels(sid)._value.set(metrics["answer.total"])
+ RESOLVER_ANSWER_CACHED.labels(sid)._value.set(metrics["answer.cached"])
+
+ RESOLVER_ANSWER_RCODE_NOERROR.labels(sid)._value.set(metrics["answer.noerror"])
+ RESOLVER_ANSWER_RCODE_NODATA.labels(sid)._value.set(metrics["answer.nodata"])
+ RESOLVER_ANSWER_RCODE_NXDOMAIN.labels(sid)._value.set(metrics["answer.nxdomain"])
+ RESOLVER_ANSWER_RCODE_SERVFAIL.labels(sid)._value.set(metrics["answer.servfail"])
+
+ RESOLVER_ANSWER_FLAG_AA.labels(sid)._value.set(metrics["answer.aa"])
+ RESOLVER_ANSWER_FLAG_TC.labels(sid)._value.set(metrics["answer.tc"])
+ RESOLVER_ANSWER_FLAG_RA.labels(sid)._value.set(metrics["answer.ra"])
+ RESOLVER_ANSWER_FLAG_RD.labels(sid)._value.set(metrics["answer.rd"])
+ RESOLVER_ANSWER_FLAG_AD.labels(sid)._value.set(metrics["answer.ad"])
+ RESOLVER_ANSWER_FLAG_CD.labels(sid)._value.set(metrics["answer.cd"])
+ RESOLVER_ANSWER_FLAG_DO.labels(sid)._value.set(metrics["answer.do"])
+ RESOLVER_ANSWER_FLAG_EDNS0.labels(sid)._value.set(metrics["answer.edns0"])
+
+ RESOLVER_QUERY_EDNS.labels(sid)._value.set(metrics["query.edns"])
+ RESOLVER_QUERY_DNSSEC.labels(sid)._value.set(metrics["query.dnssec"])
+
+
+async def _collect_resolver_stats(lazy: bool) -> None:
+ ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
+ STATS_QUERY = "collect_statistics()"
+
+ cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY
+ stats_raw = await _command_registered_resolvers(cmd)
+
+ for kid, raw in stats_raw.items():
+ RESOLVER_METRICS_LOADED.labels(str(id)).set(0)
+ try:
+ metrics = json.loads(raw[1:-1])
+ _parse_resolver_metrics(kid, metrics)
+
+ # mark that metrics have been loaded
+ RESOLVER_METRICS_LOADED.labels(str(id)).set(1)
+ except json.JSONDecodeError:
+ logger.warning("Failed to load metrics from resolver instance %d", id)
+
+
+def unregister_resolver_metrics_for(subprocess: Subprocess) -> None:
"""
- Collects metrics from everything, returns data string in Prometheus format.
+ Cancel metric collection from resolver subprocess
"""
+ sid = str(subprocess.id)
+ for metric in _ALL_RESOLVER_METRICS:
+ metric.remove(sid)
- ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
- STATS_QUERY = "collect_statistics()"
+ del _REGISTERED_RESOLVERS[subprocess.id]
+
+
+def register_resolver_metrics_for(subprocess: Subprocess) -> None:
+ """
+ Register resolver subprocess for metric collection
+ """
+ sid = str(subprocess.id)
+ for metric in _ALL_RESOLVER_METRICS:
+ metric.labels(sid)
- cmd = ON_DEMAND_STATS_QUERY
- stats_raw = await manager.command_all(cmd)
+ _REGISTERED_RESOLVERS[subprocess.id] = subprocess
+
+
+async def report_stats(config: KresConfig) -> bytes:
+ """
+ Collects metrics from everything, returns data string in Prometheus format.
+ """
- for id, raw in stats_raw.items():
- metrics = json.loads(raw[1:-1])
- _generate_instance_metrics(id, metrics)
+ if config.monitoring.state != "manager-only":
+ await _collect_resolver_stats(config.monitoring.state == "lazy")
return exposition.generate_latest()