From: Vasek Sraier Date: Mon, 7 Feb 2022 11:41:31 +0000 (+0100) Subject: manager: monitoring: implementation cleanup X-Git-Tag: v6.0.0a1~43^2~5 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=99e0d646bb11808ceaca99c73b72acdb6fde1f16;p=thirdparty%2Fknot-resolver.git manager: monitoring: implementation cleanup --- diff --git a/manager/knot_resolver_manager/datamodel/config_schema.py b/manager/knot_resolver_manager/datamodel/config_schema.py index a0d9dc1c9..fbd9ee7be 100644 --- a/manager/knot_resolver_manager/datamodel/config_schema.py +++ b/manager/knot_resolver_manager/datamodel/config_schema.py @@ -11,6 +11,7 @@ from knot_resolver_manager.datamodel.dnssec_schema import DnssecSchema from knot_resolver_manager.datamodel.forward_zone import ForwardZoneSchema from knot_resolver_manager.datamodel.logging_config import LoggingSchema from knot_resolver_manager.datamodel.lua_schema import LuaSchema +from knot_resolver_manager.datamodel.monitoring_schema import MonitoringSchema from knot_resolver_manager.datamodel.network_schema import NetworkSchema from knot_resolver_manager.datamodel.options_schema import OptionsSchema from knot_resolver_manager.datamodel.policy_schema import PolicySchema @@ -71,6 +72,7 @@ class KresConfig(SchemaNode): dnssec: Disable DNSSEC, enable with defaults or set new configuration. dns64: Disable DNS64 (RFC 6147), enable with defaults or set new configuration. logging: Logging and debugging configuration. + monitoring: Metrics exposisition configuration (Prometheus, Graphite) lua: Custom Lua configuration. """ @@ -87,6 +89,7 @@ class KresConfig(SchemaNode): dnssec: Union[bool, DnssecSchema] = True dns64: Union[bool, Dns64Schema] = False logging: LoggingSchema = LoggingSchema() + monitoring: MonitoringSchema = MonitoringSchema() lua: LuaSchema = LuaSchema() _PREVIOUS_SCHEMA = Raw @@ -104,6 +107,7 @@ class KresConfig(SchemaNode): dnssec: Union[Literal[False], DnssecSchema] dns64: Union[Literal[False], Dns64Schema] logging: LoggingSchema + monitoring: MonitoringSchema lua: LuaSchema def _dnssec(self, obj: Raw) -> Union[Literal[False], DnssecSchema]: diff --git a/manager/knot_resolver_manager/datamodel/monitoring_schema.py b/manager/knot_resolver_manager/datamodel/monitoring_schema.py new file mode 100644 index 000000000..660737abc --- /dev/null +++ b/manager/knot_resolver_manager/datamodel/monitoring_schema.py @@ -0,0 +1,23 @@ +from typing import Optional + +from typing_extensions import Literal + +from knot_resolver_manager.utils.modelling import SchemaNode + + +class GraphiteSchema(SchemaNode): + endpoint: str + prefix: str + interval_sec: int + tcp: bool + + +class MonitoringSchema(SchemaNode): + """ + --- + state: configures, whether statistics module will be loaded into resolver + graphite: optionally configures where should graphite metrics be sent to + """ + + state: Literal["manager-only", "lazy", "always"] = "always" + graphite: Optional[GraphiteSchema] = None diff --git a/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 b/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 index ff89cf811..08a85ab05 100644 --- a/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 +++ b/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 @@ -53,29 +53,5 @@ {{ cfg.lua.script }} {% endif %} --- static config used for manager's needs -local ffi = require('ffi') -local id = os.getenv('SYSTEMD_INSTANCE') -if not id then - log_err(ffi.C.LOG_GRP_SYSTEM, 'environment variable $SYSTEMD_INSTANCE not set, which should not have been possible due to running under manager') -else - -- Bind to control socket in CWD (= rundir in config) - -- FIXME replace with relative path after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/720 - local path = '{{ cwd }}/control/'..id - log_warn(ffi.C.LOG_GRP_SYSTEM, 'path = ' .. path) - local ok, err = pcall(net.listen, path, nil, { kind = 'control' }) - if not ok then - log_warn(ffi.C.LOG_GRP_NETWORK, 'bind to '..path..' failed '..err) - end -end - -function collect_lazy_statistics() - if stats == nil then - modules.load('stats') - end - - return tojson(stats.list()) -end -function collect_statistics() - return tojson(stats.list()) -end +-- manager's monitoring configuration +{% include "monitoring.lua.j2" %} \ No newline at end of file diff --git a/manager/knot_resolver_manager/datamodel/templates/monitoring.lua.j2 b/manager/knot_resolver_manager/datamodel/templates/monitoring.lua.j2 new file mode 100644 index 000000000..31374d0ef --- /dev/null +++ b/manager/knot_resolver_manager/datamodel/templates/monitoring.lua.j2 @@ -0,0 +1,33 @@ +--- control socket location +local ffi = require('ffi') +local id = os.getenv('SYSTEMD_INSTANCE') +if not id then + log_err(ffi.C.LOG_GRP_SYSTEM, 'environment variable $SYSTEMD_INSTANCE not set, which should not have been possible due to running under manager') +else + -- Bind to control socket in CWD (= rundir in config) + -- FIXME replace with relative path after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/720 + local path = '{{ cwd }}/control/'..id + log_warn(ffi.C.LOG_GRP_SYSTEM, 'path = ' .. path) + local ok, err = pcall(net.listen, path, nil, { kind = 'control' }) + if not ok then + log_warn(ffi.C.LOG_GRP_NETWORK, 'bind to '..path..' failed '..err) + end +end + +{% if cfg.monitoring.state == "always" %} +modules.load('stats') +{% endif %} + +--- function used for statistics collection +function collect_lazy_statistics() + if stats == nil then + modules.load('stats') + end + + return tojson(stats.list()) +end + +--- function used for statistics collection +function collect_statistics() + return tojson(stats.list()) +end diff --git a/manager/knot_resolver_manager/kres_manager.py b/manager/knot_resolver_manager/kres_manager.py index 5404d253d..7353c87cb 100644 --- a/manager/knot_resolver_manager/kres_manager.py +++ b/manager/knot_resolver_manager/kres_manager.py @@ -3,7 +3,7 @@ import logging import sys from asyncio.futures import Future from subprocess import SubprocessError -from typing import Dict, List, Optional, Tuple +from typing import List, Optional import knot_resolver_manager.kresd_controller from knot_resolver_manager import kres_id @@ -16,6 +16,7 @@ from knot_resolver_manager.kresd_controller.interface import ( SubprocessStatus, SubprocessType, ) +from knot_resolver_manager.statistics import register_resolver_metrics_for, unregister_resolver_metrics_for from knot_resolver_manager.utils.functional import Result from knot_resolver_manager.utils.types import NoneType @@ -78,14 +79,17 @@ class KresManager: async def _spawn_new_worker(self, config: KresConfig) -> None: subprocess = await self._controller.create_subprocess(config, SubprocessType.KRESD, kres_id.alloc()) await subprocess.start() + + register_resolver_metrics_for(subprocess) self._workers.append(subprocess) async def _stop_a_worker(self) -> None: if len(self._workers) == 0: raise IndexError("Can't stop a kresd when there are no running") - kresd = self._workers.pop() - await kresd.stop() + subprocess = self._workers.pop() + unregister_resolver_metrics_for(subprocess) + await subprocess.stop() async def _collect_already_running_children(self) -> None: for subp in await self._controller.get_all_running_instances(): @@ -123,13 +127,6 @@ class KresManager: await self._gc.stop() self._gc = None - async def command_all(self, cmd: str) -> Dict[kres_id.KresID, str]: - async def single_pair(sub: Subprocess) -> Tuple[kres_id.KresID, str]: - return sub.id, await sub.command(cmd) - - pairs = await asyncio.gather(*(single_pair(inst) for inst in self._workers)) - return dict(pairs) - async def validate_config(self, _old: KresConfig, new: KresConfig) -> Result[NoneType, str]: async with self._manager_lock: logger.debug("Testing the new config with a canary process") diff --git a/manager/knot_resolver_manager/kresd_controller/interface.py b/manager/knot_resolver_manager/kresd_controller/interface.py index c44b6acc9..e68d82599 100644 --- a/manager/knot_resolver_manager/kresd_controller/interface.py +++ b/manager/knot_resolver_manager/kresd_controller/interface.py @@ -89,8 +89,10 @@ class Subprocess: finally: if writer is not None: writer.close() + + # proper closing of the socket is only implemented in later versions of python if sys.version_info.minor >= 7: - await writer.wait_closed() + await writer.wait_closed() # type: ignore class SubprocessStatus(Enum): diff --git a/manager/knot_resolver_manager/server.py b/manager/knot_resolver_manager/server.py index 245bbe496..d216e08ea 100644 --- a/manager/knot_resolver_manager/server.py +++ b/manager/knot_resolver_manager/server.py @@ -61,13 +61,10 @@ class Server: # This is top-level class containing pretty much everything. Instead of global # variables, we use instance attributes. That's why there are so many and it's # ok. - def __init__(self, store: ConfigStore, config_path: Optional[Path], manager: KresManager): + def __init__(self, store: ConfigStore, config_path: Optional[Path]): # config store & server dynamic reconfiguration self.config_store = store - # stats - self._manager = manager - # HTTP server self.app = Application(middlewares=[error_handler]) self.runner = AppRunner(self.app) @@ -164,7 +161,7 @@ class Server: async def _handler_metrics(self, _request: web.Request) -> web.Response: return web.Response( - body=await statistics.collect(self.config_store.get(), self._manager), + body=await statistics.report_stats(self.config_store.get()), content_type="text/plain", charset="utf8", ) @@ -371,7 +368,7 @@ async def start_server(config: Union[Path, ParsedTree] = DEFAULT_MANAGER_CONFIG_ # At this point, all backend functionality-providing components are initialized. It's therefore save to start # the API server. - server = Server(config_store, config if isinstance(config, Path) else None, manager) + server = Server(config_store, config if isinstance(config, Path) else None) await server.start() logger.info(f"Manager fully initialized and running in {round(time() - start_time, 3)} seconds") diff --git a/manager/knot_resolver_manager/statistics.py b/manager/knot_resolver_manager/statistics.py index 060985cdd..0675be25b 100644 --- a/manager/knot_resolver_manager/statistics.py +++ b/manager/knot_resolver_manager/statistics.py @@ -1,154 +1,255 @@ +import asyncio import json -from typing import Any, Coroutine +import logging +from typing import Any, Awaitable, Callable, Dict, List, Tuple, TypeVar, Union -from prometheus_client import Counter, Histogram, exposition +from prometheus_client import Counter, Gauge, Histogram, exposition # type: ignore from knot_resolver_manager.datamodel.config_schema import KresConfig from knot_resolver_manager.kres_id import KresID -from knot_resolver_manager.kres_manager import KresManager +from knot_resolver_manager.kresd_controller.interface import Subprocess -KRESD_RESPONSE_LATENCY = Histogram( - "kresd_response_latency", +logger = logging.getLogger(__name__) + +RESOLVER_RESPONSE_LATENCY = Histogram( + "resolver_response_latency", "Time it takes to respond to queries in seconds", buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")], labelnames=["instance_id"], ) -KRESD_REQUEST_TOTAL = Counter( - "kresd_request_total", +RESOLVER_REQUEST_TOTAL = Counter( + "resolver_request_total", "total number of DNS requests (including internal client requests)", labelnames=["instance_id"], ) -KRESD_REQUEST_INTERNAL = Counter( - "kresd_request_internal", +RESOLVER_REQUEST_INTERNAL = Counter( + "resolver_request_internal", "number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)", labelnames=["instance_id"], ) -KRESD_REQUEST_UDP = Counter( - "kresd_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"] +RESOLVER_REQUEST_UDP = Counter( + "resolver_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"] ) -KRESD_REQUEST_TCP = Counter( - "kresd_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"] +RESOLVER_REQUEST_TCP = Counter( + "resolver_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"] ) -KRESD_REQUEST_DOT = Counter( - "kresd_request_dot", "number of external requests received over DNS-over-TLS (RFC 7858)", labelnames=["instance_id"] +RESOLVER_REQUEST_DOT = Counter( + "resolver_request_dot", + "number of external requests received over DNS-over-TLS (RFC 7858)", + labelnames=["instance_id"], ) -KRESD_REQUEST_DOH = Counter( - "kresd_request_doh", +RESOLVER_REQUEST_DOH = Counter( + "resolver_request_doh", "number of external requests received over DNS-over-HTTP (RFC 8484)", labelnames=["instance_id"], ) -KRESD_REQUEST_XDP = Counter( - "kresd_request_xdp", +RESOLVER_REQUEST_XDP = Counter( + "resolver_request_xdp", "number of external requests received over plain UDP via an AF_XDP socket", labelnames=["instance_id"], ) -KRESD_ANSWER_TOTAL = Counter("kresd_answer_total", "total number of answered queries", labelnames=["instance_id"]) -KRESD_ANSWER_CACHED = Counter( - "kresd_answer_cached", "number of queries answered from cache", labelnames=["instance_id"] +RESOLVER_ANSWER_TOTAL = Counter("resolver_answer_total", "total number of answered queries", labelnames=["instance_id"]) +RESOLVER_ANSWER_CACHED = Counter( + "resolver_answer_cached", "number of queries answered from cache", labelnames=["instance_id"] ) -KRESD_ANSWER_RCODE_NOERROR = Counter( - "kresd_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"] +RESOLVER_ANSWER_RCODE_NOERROR = Counter( + "resolver_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"] ) -KRESD_ANSWER_RCODE_NODATA = Counter( - "kresd_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"] +RESOLVER_ANSWER_RCODE_NODATA = Counter( + "resolver_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"] ) -KRESD_ANSWER_RCODE_NXDOMAIN = Counter( - "kresd_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"] +RESOLVER_ANSWER_RCODE_NXDOMAIN = Counter( + "resolver_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"] ) -KRESD_ANSWER_RCODE_SERVFAIL = Counter( - "kresd_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"] +RESOLVER_ANSWER_RCODE_SERVFAIL = Counter( + "resolver_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"] ) -KRESD_ANSWER_FLAG_AA = Counter("kresd_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"]) -KRESD_ANSWER_FLAG_TC = Counter("kresd_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"]) -KRESD_ANSWER_FLAG_RA = Counter( - "kresd_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"] +RESOLVER_ANSWER_FLAG_AA = Counter( + "resolver_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"] +) +RESOLVER_ANSWER_FLAG_TC = Counter("resolver_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"]) +RESOLVER_ANSWER_FLAG_RA = Counter( + "resolver_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"] +) +RESOLVER_ANSWER_FLAG_RD = Counter( + "resolver_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"] +) +RESOLVER_ANSWER_FLAG_AD = Counter( + "resolver_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"] ) -KRESD_ANSWER_FLAG_RD = Counter( - "kresd_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"] +RESOLVER_ANSWER_FLAG_CD = Counter( + "resolver_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"] ) -KRESD_ANSWER_FLAG_AD = Counter( - "kresd_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"] +RESOLVER_ANSWER_FLAG_DO = Counter("resolver_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"]) +RESOLVER_ANSWER_FLAG_EDNS0 = Counter( + "resolver_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"] ) -KRESD_ANSWER_FLAG_CD = Counter( - "kresd_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"] + +RESOLVER_QUERY_EDNS = Counter("resolver_query_edns", "number of queries with EDNS present", labelnames=["instance_id"]) +RESOLVER_QUERY_DNSSEC = Counter( + "resolver_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"] ) -KRESD_ANSWER_FLAG_DO = Counter("kresd_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"]) -KRESD_ANSWER_FLAG_EDNS0 = Counter( - "kresd_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"] + +RESOLVER_METRICS_LOADED = Gauge( + "resolver_metrics_loaded", + "0 if metrics from resolver instance were not loaded, otherwise 1", + labelnames=["instance_id"], +) + + +_ALL_RESOLVER_METRICS: List[Union[Counter, Gauge, Histogram]] = [ + RESOLVER_RESPONSE_LATENCY, + RESOLVER_REQUEST_TOTAL, + RESOLVER_REQUEST_INTERNAL, + RESOLVER_REQUEST_UDP, + RESOLVER_REQUEST_TCP, + RESOLVER_REQUEST_DOT, + RESOLVER_REQUEST_DOH, + RESOLVER_REQUEST_XDP, + RESOLVER_ANSWER_TOTAL, + RESOLVER_ANSWER_CACHED, + RESOLVER_ANSWER_RCODE_NOERROR, + RESOLVER_ANSWER_RCODE_NODATA, + RESOLVER_ANSWER_RCODE_NXDOMAIN, + RESOLVER_ANSWER_RCODE_SERVFAIL, + RESOLVER_ANSWER_FLAG_AA, + RESOLVER_ANSWER_FLAG_TC, + RESOLVER_ANSWER_FLAG_RA, + RESOLVER_ANSWER_FLAG_RD, + RESOLVER_ANSWER_FLAG_AD, + RESOLVER_ANSWER_FLAG_CD, + RESOLVER_ANSWER_FLAG_DO, + RESOLVER_ANSWER_FLAG_EDNS0, + RESOLVER_QUERY_EDNS, + RESOLVER_QUERY_DNSSEC, + RESOLVER_METRICS_LOADED, +] + +MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram( + "manager_request_reconfigure_latency", "Time it takes to change configuration" ) -KRESD_QUERY_EDNS = Counter("kresd_query_edns", "number of queries with EDNS present", labelnames=["instance_id"]) -KRESD_QUERY_DNSSEC = Counter("kresd_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"]) +_REGISTERED_RESOLVERS: Dict[KresID, Subprocess] = {} + -MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram("manager_request_reconfigure_latency", "Time it takes to change configuration") +T = TypeVar("T") -def async_timing_histogram(metric: Histogram): - def decorator(func: Coroutine[Any, Any, Any]): - async def wrapper(*args, **kwargs): +def async_timing_histogram(metric: Histogram) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: + """ + Decorator which can be used to report duration on async functions + """ + + def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: + async def wrapper(*args: Any, **kwargs: Any) -> T: with metric.time(): res = await func(*args, **kwargs) return res + return wrapper + return decorator -def _generate_instance_metrics(instance_id: KresID, metrics: Any) -> None: +async def _command_registered_resolvers(cmd: str) -> Dict[KresID, str]: + async def single_pair(sub: Subprocess) -> Tuple[KresID, str]: + return sub.id, await sub.command(cmd) + + pairs = await asyncio.gather(*(single_pair(inst) for inst in _REGISTERED_RESOLVERS.values())) + return dict(pairs) + + +def _parse_resolver_metrics(instance_id: KresID, metrics: Any) -> None: # Uses private fields in order to translate kresd statistics into prometheus's library internal objects. # pylint: disable=protected-access + # pyright: reportUnknownMemberType=false sid = str(instance_id) # response latency histogram for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")): - KRESD_RESPONSE_LATENCY.labels(str(sid))._buckets[i].set(metrics[f"answer.{duration}"]) + RESOLVER_RESPONSE_LATENCY.labels(sid)._buckets[i].set(metrics[f"answer.{duration}"]) # TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721 - # KRESD_RESPONSE_LATENCY.labels(str(id))._sum.set(sum) - - KRESD_REQUEST_TOTAL.labels(str(sid))._value.set(metrics["request.total"]) - KRESD_REQUEST_INTERNAL.labels(str(sid))._value.set(metrics["request.internal"]) - KRESD_REQUEST_UDP.labels(str(sid))._value.set(metrics["request.udp"]) - KRESD_REQUEST_TCP.labels(str(sid))._value.set(metrics["request.tcp"]) - KRESD_REQUEST_DOT.labels(str(sid))._value.set(metrics["request.dot"]) - KRESD_REQUEST_DOH.labels(str(sid))._value.set(metrics["request.doh"]) - KRESD_REQUEST_XDP.labels(str(sid))._value.set(metrics["request.xdp"]) - - KRESD_ANSWER_TOTAL.labels(str(sid))._value.set(metrics["answer.total"]) - KRESD_ANSWER_CACHED.labels(str(sid))._value.set(metrics["answer.cached"]) - - KRESD_ANSWER_RCODE_NOERROR.labels(str(sid))._value.set(metrics["answer.noerror"]) - KRESD_ANSWER_RCODE_NODATA.labels(str(sid))._value.set(metrics["answer.nodata"]) - KRESD_ANSWER_RCODE_NXDOMAIN.labels(str(sid))._value.set(metrics["answer.nxdomain"]) - KRESD_ANSWER_RCODE_SERVFAIL.labels(str(sid))._value.set(metrics["answer.servfail"]) - - KRESD_ANSWER_FLAG_AA.labels(str(sid))._value.set(metrics["answer.aa"]) - KRESD_ANSWER_FLAG_TC.labels(str(sid))._value.set(metrics["answer.tc"]) - KRESD_ANSWER_FLAG_RA.labels(str(sid))._value.set(metrics["answer.ra"]) - KRESD_ANSWER_FLAG_RD.labels(str(sid))._value.set(metrics["answer.rd"]) - KRESD_ANSWER_FLAG_AD.labels(str(sid))._value.set(metrics["answer.ad"]) - KRESD_ANSWER_FLAG_CD.labels(str(sid))._value.set(metrics["answer.cd"]) - KRESD_ANSWER_FLAG_DO.labels(str(sid))._value.set(metrics["answer.do"]) - KRESD_ANSWER_FLAG_EDNS0.labels(str(sid))._value.set(metrics["answer.edns0"]) - - KRESD_QUERY_EDNS.labels(str(sid))._value.set(metrics["query.edns"]) - KRESD_QUERY_DNSSEC.labels(str(sid))._value.set(metrics["query.dnssec"]) - - -async def collect(_config: KresConfig, manager: KresManager) -> bytes: + # RESOLVER_RESPONSE_LATENCY.labels(str(id))._sum.set(sum) + + RESOLVER_REQUEST_TOTAL.labels(sid)._value.set(metrics["request.total"]) + RESOLVER_REQUEST_INTERNAL.labels(sid)._value.set(metrics["request.internal"]) + RESOLVER_REQUEST_UDP.labels(sid)._value.set(metrics["request.udp"]) + RESOLVER_REQUEST_TCP.labels(sid)._value.set(metrics["request.tcp"]) + RESOLVER_REQUEST_DOT.labels(sid)._value.set(metrics["request.dot"]) + RESOLVER_REQUEST_DOH.labels(sid)._value.set(metrics["request.doh"]) + RESOLVER_REQUEST_XDP.labels(sid)._value.set(metrics["request.xdp"]) + + RESOLVER_ANSWER_TOTAL.labels(sid)._value.set(metrics["answer.total"]) + RESOLVER_ANSWER_CACHED.labels(sid)._value.set(metrics["answer.cached"]) + + RESOLVER_ANSWER_RCODE_NOERROR.labels(sid)._value.set(metrics["answer.noerror"]) + RESOLVER_ANSWER_RCODE_NODATA.labels(sid)._value.set(metrics["answer.nodata"]) + RESOLVER_ANSWER_RCODE_NXDOMAIN.labels(sid)._value.set(metrics["answer.nxdomain"]) + RESOLVER_ANSWER_RCODE_SERVFAIL.labels(sid)._value.set(metrics["answer.servfail"]) + + RESOLVER_ANSWER_FLAG_AA.labels(sid)._value.set(metrics["answer.aa"]) + RESOLVER_ANSWER_FLAG_TC.labels(sid)._value.set(metrics["answer.tc"]) + RESOLVER_ANSWER_FLAG_RA.labels(sid)._value.set(metrics["answer.ra"]) + RESOLVER_ANSWER_FLAG_RD.labels(sid)._value.set(metrics["answer.rd"]) + RESOLVER_ANSWER_FLAG_AD.labels(sid)._value.set(metrics["answer.ad"]) + RESOLVER_ANSWER_FLAG_CD.labels(sid)._value.set(metrics["answer.cd"]) + RESOLVER_ANSWER_FLAG_DO.labels(sid)._value.set(metrics["answer.do"]) + RESOLVER_ANSWER_FLAG_EDNS0.labels(sid)._value.set(metrics["answer.edns0"]) + + RESOLVER_QUERY_EDNS.labels(sid)._value.set(metrics["query.edns"]) + RESOLVER_QUERY_DNSSEC.labels(sid)._value.set(metrics["query.dnssec"]) + + +async def _collect_resolver_stats(lazy: bool) -> None: + ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()" + STATS_QUERY = "collect_statistics()" + + cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY + stats_raw = await _command_registered_resolvers(cmd) + + for kid, raw in stats_raw.items(): + RESOLVER_METRICS_LOADED.labels(str(id)).set(0) + try: + metrics = json.loads(raw[1:-1]) + _parse_resolver_metrics(kid, metrics) + + # mark that metrics have been loaded + RESOLVER_METRICS_LOADED.labels(str(id)).set(1) + except json.JSONDecodeError: + logger.warning("Failed to load metrics from resolver instance %d", id) + + +def unregister_resolver_metrics_for(subprocess: Subprocess) -> None: """ - Collects metrics from everything, returns data string in Prometheus format. + Cancel metric collection from resolver subprocess """ + sid = str(subprocess.id) + for metric in _ALL_RESOLVER_METRICS: + metric.remove(sid) - ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()" - STATS_QUERY = "collect_statistics()" + del _REGISTERED_RESOLVERS[subprocess.id] + + +def register_resolver_metrics_for(subprocess: Subprocess) -> None: + """ + Register resolver subprocess for metric collection + """ + sid = str(subprocess.id) + for metric in _ALL_RESOLVER_METRICS: + metric.labels(sid) - cmd = ON_DEMAND_STATS_QUERY - stats_raw = await manager.command_all(cmd) + _REGISTERED_RESOLVERS[subprocess.id] = subprocess + + +async def report_stats(config: KresConfig) -> bytes: + """ + Collects metrics from everything, returns data string in Prometheus format. + """ - for id, raw in stats_raw.items(): - metrics = json.loads(raw[1:-1]) - _generate_instance_metrics(id, metrics) + if config.monitoring.state != "manager-only": + await _collect_resolver_stats(config.monitoring.state == "lazy") return exposition.generate_latest() diff --git a/manager/setup.py b/manager/setup.py index 8f3e04f1a..3512111b4 100644 --- a/manager/setup.py +++ b/manager/setup.py @@ -22,6 +22,7 @@ install_requires = \ 'PyYAML>=5.4.1', 'aiohttp>=3.6.12', 'click>=7.1.2', + 'prometheus-client>=0.6', 'pydbus>=0.6.0', 'requests>=2.25.1', 'typing-extensions>=3.7.2']