]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
manager: monitoring: implementation cleanup
authorVasek Sraier <git@vakabus.cz>
Mon, 7 Feb 2022 11:41:31 +0000 (12:41 +0100)
committerAleš Mrázek <ales.mrazek@nic.cz>
Fri, 8 Apr 2022 14:17:54 +0000 (16:17 +0200)
manager/knot_resolver_manager/datamodel/config_schema.py
manager/knot_resolver_manager/datamodel/monitoring_schema.py [new file with mode: 0644]
manager/knot_resolver_manager/datamodel/templates/config.lua.j2
manager/knot_resolver_manager/datamodel/templates/monitoring.lua.j2 [new file with mode: 0644]
manager/knot_resolver_manager/kres_manager.py
manager/knot_resolver_manager/kresd_controller/interface.py
manager/knot_resolver_manager/server.py
manager/knot_resolver_manager/statistics.py
manager/setup.py

index a0d9dc1c974ddce5dcd7b7276e94b57108328852..fbd9ee7bed698d19ae16e802255a70ddad197596 100644 (file)
@@ -11,6 +11,7 @@ from knot_resolver_manager.datamodel.dnssec_schema import DnssecSchema
 from knot_resolver_manager.datamodel.forward_zone import ForwardZoneSchema
 from knot_resolver_manager.datamodel.logging_config import LoggingSchema
 from knot_resolver_manager.datamodel.lua_schema import LuaSchema
+from knot_resolver_manager.datamodel.monitoring_schema import MonitoringSchema
 from knot_resolver_manager.datamodel.network_schema import NetworkSchema
 from knot_resolver_manager.datamodel.options_schema import OptionsSchema
 from knot_resolver_manager.datamodel.policy_schema import PolicySchema
@@ -71,6 +72,7 @@ class KresConfig(SchemaNode):
         dnssec: Disable DNSSEC, enable with defaults or set new configuration.
         dns64: Disable DNS64 (RFC 6147), enable with defaults or set new configuration.
         logging: Logging and debugging configuration.
+        monitoring: Metrics exposisition configuration (Prometheus, Graphite)
         lua: Custom Lua configuration.
         """
 
@@ -87,6 +89,7 @@ class KresConfig(SchemaNode):
         dnssec: Union[bool, DnssecSchema] = True
         dns64: Union[bool, Dns64Schema] = False
         logging: LoggingSchema = LoggingSchema()
+        monitoring: MonitoringSchema = MonitoringSchema()
         lua: LuaSchema = LuaSchema()
 
     _PREVIOUS_SCHEMA = Raw
@@ -104,6 +107,7 @@ class KresConfig(SchemaNode):
     dnssec: Union[Literal[False], DnssecSchema]
     dns64: Union[Literal[False], Dns64Schema]
     logging: LoggingSchema
+    monitoring: MonitoringSchema
     lua: LuaSchema
 
     def _dnssec(self, obj: Raw) -> Union[Literal[False], DnssecSchema]:
diff --git a/manager/knot_resolver_manager/datamodel/monitoring_schema.py b/manager/knot_resolver_manager/datamodel/monitoring_schema.py
new file mode 100644 (file)
index 0000000..660737a
--- /dev/null
@@ -0,0 +1,23 @@
+from typing import Optional
+
+from typing_extensions import Literal
+
+from knot_resolver_manager.utils.modelling import SchemaNode
+
+
+class GraphiteSchema(SchemaNode):
+    endpoint: str
+    prefix: str
+    interval_sec: int
+    tcp: bool
+
+
+class MonitoringSchema(SchemaNode):
+    """
+    ---
+    state: configures, whether statistics module will be loaded into resolver
+    graphite: optionally configures where should graphite metrics be sent to
+    """
+
+    state: Literal["manager-only", "lazy", "always"] = "always"
+    graphite: Optional[GraphiteSchema] = None
index ff89cf811f8e67710651a8a7d8fd20feb00ba2cc..08a85ab055c6fb7c6e8255e020019f277d22ec5c 100644 (file)
 {{ cfg.lua.script }}
 {% endif %}
 
--- static config used for manager's needs
-local ffi = require('ffi')
-local id = os.getenv('SYSTEMD_INSTANCE')
-if not id then
-       log_err(ffi.C.LOG_GRP_SYSTEM, 'environment variable $SYSTEMD_INSTANCE not set, which should not have been possible due to running under manager')
-else
-       -- Bind to control socket in CWD (= rundir in config)
-    -- FIXME replace with relative path after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/720
-       local path = '{{ cwd }}/control/'..id
-    log_warn(ffi.C.LOG_GRP_SYSTEM, 'path = ' .. path)
-       local ok, err = pcall(net.listen, path, nil, { kind = 'control' })
-       if not ok then
-               log_warn(ffi.C.LOG_GRP_NETWORK, 'bind to '..path..' failed '..err)
-       end
-end
-
-function collect_lazy_statistics()
-       if stats == nil then
-               modules.load('stats')
-       end
-
-       return tojson(stats.list())
-end
-function collect_statistics()
-       return tojson(stats.list())
-end
+-- manager's monitoring configuration
+{% include "monitoring.lua.j2" %}
\ No newline at end of file
diff --git a/manager/knot_resolver_manager/datamodel/templates/monitoring.lua.j2 b/manager/knot_resolver_manager/datamodel/templates/monitoring.lua.j2
new file mode 100644 (file)
index 0000000..31374d0
--- /dev/null
@@ -0,0 +1,33 @@
+--- control socket location
+local ffi = require('ffi')
+local id = os.getenv('SYSTEMD_INSTANCE')
+if not id then
+       log_err(ffi.C.LOG_GRP_SYSTEM, 'environment variable $SYSTEMD_INSTANCE not set, which should not have been possible due to running under manager')
+else
+       -- Bind to control socket in CWD (= rundir in config)
+    -- FIXME replace with relative path after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/720
+       local path = '{{ cwd }}/control/'..id
+    log_warn(ffi.C.LOG_GRP_SYSTEM, 'path = ' .. path)
+       local ok, err = pcall(net.listen, path, nil, { kind = 'control' })
+       if not ok then
+               log_warn(ffi.C.LOG_GRP_NETWORK, 'bind to '..path..' failed '..err)
+       end
+end
+
+{% if cfg.monitoring.state == "always" %}
+modules.load('stats')
+{% endif %}
+
+--- function used for statistics collection
+function collect_lazy_statistics()
+       if stats == nil then
+               modules.load('stats')
+       end
+
+       return tojson(stats.list())
+end
+
+--- function used for statistics collection
+function collect_statistics()
+       return tojson(stats.list())
+end
index 5404d253df2b5fcd6bb9c537d0497c70414ff1d8..7353c87cbbe9fa3aa69c98a139b7b2ffc5a6d2b5 100644 (file)
@@ -3,7 +3,7 @@ import logging
 import sys
 from asyncio.futures import Future
 from subprocess import SubprocessError
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional
 
 import knot_resolver_manager.kresd_controller
 from knot_resolver_manager import kres_id
@@ -16,6 +16,7 @@ from knot_resolver_manager.kresd_controller.interface import (
     SubprocessStatus,
     SubprocessType,
 )
+from knot_resolver_manager.statistics import register_resolver_metrics_for, unregister_resolver_metrics_for
 from knot_resolver_manager.utils.functional import Result
 from knot_resolver_manager.utils.types import NoneType
 
@@ -78,14 +79,17 @@ class KresManager:
     async def _spawn_new_worker(self, config: KresConfig) -> None:
         subprocess = await self._controller.create_subprocess(config, SubprocessType.KRESD, kres_id.alloc())
         await subprocess.start()
+
+        register_resolver_metrics_for(subprocess)
         self._workers.append(subprocess)
 
     async def _stop_a_worker(self) -> None:
         if len(self._workers) == 0:
             raise IndexError("Can't stop a kresd when there are no running")
 
-        kresd = self._workers.pop()
-        await kresd.stop()
+        subprocess = self._workers.pop()
+        unregister_resolver_metrics_for(subprocess)
+        await subprocess.stop()
 
     async def _collect_already_running_children(self) -> None:
         for subp in await self._controller.get_all_running_instances():
@@ -123,13 +127,6 @@ class KresManager:
         await self._gc.stop()
         self._gc = None
 
-    async def command_all(self, cmd: str) -> Dict[kres_id.KresID, str]:
-        async def single_pair(sub: Subprocess) -> Tuple[kres_id.KresID, str]:
-            return sub.id, await sub.command(cmd)
-
-        pairs = await asyncio.gather(*(single_pair(inst) for inst in self._workers))
-        return dict(pairs)
-
     async def validate_config(self, _old: KresConfig, new: KresConfig) -> Result[NoneType, str]:
         async with self._manager_lock:
             logger.debug("Testing the new config with a canary process")
index c44b6acc9f9f37e2fcd4ed3f8210b5df7dd608ae..e68d825995fdf5b138108789d848b32a2dfabdb8 100644 (file)
@@ -89,8 +89,10 @@ class Subprocess:
         finally:
             if writer is not None:
                 writer.close()
+
+                # proper closing of the socket is only implemented in later versions of python
                 if sys.version_info.minor >= 7:
-                    await writer.wait_closed()
+                    await writer.wait_closed()  # type: ignore
 
 
 class SubprocessStatus(Enum):
index 245bbe496d8d4018d1ed628ac5722df6cc442623..d216e08ea73a620736cad5d3816e0f755f651bbb 100644 (file)
@@ -61,13 +61,10 @@ class Server:
     # This is top-level class containing pretty much everything. Instead of global
     # variables, we use instance attributes. That's why there are so many and it's
     # ok.
-    def __init__(self, store: ConfigStore, config_path: Optional[Path], manager: KresManager):
+    def __init__(self, store: ConfigStore, config_path: Optional[Path]):
         # config store & server dynamic reconfiguration
         self.config_store = store
 
-        # stats
-        self._manager = manager
-
         # HTTP server
         self.app = Application(middlewares=[error_handler])
         self.runner = AppRunner(self.app)
@@ -164,7 +161,7 @@ class Server:
 
     async def _handler_metrics(self, _request: web.Request) -> web.Response:
         return web.Response(
-            body=await statistics.collect(self.config_store.get(), self._manager),
+            body=await statistics.report_stats(self.config_store.get()),
             content_type="text/plain",
             charset="utf8",
         )
@@ -371,7 +368,7 @@ async def start_server(config: Union[Path, ParsedTree] = DEFAULT_MANAGER_CONFIG_
 
     # At this point, all backend functionality-providing components are initialized. It's therefore save to start
     # the API server.
-    server = Server(config_store, config if isinstance(config, Path) else None, manager)
+    server = Server(config_store, config if isinstance(config, Path) else None)
     await server.start()
     logger.info(f"Manager fully initialized and running in {round(time() - start_time, 3)} seconds")
 
index 060985cdd2b2d44c6f9a47a6755f4a650907a7d0..0675be25b42db51a6b32e01b2858acbc4d37f22a 100644 (file)
+import asyncio
 import json
-from typing import Any, Coroutine
+import logging
+from typing import Any, Awaitable, Callable, Dict, List, Tuple, TypeVar, Union
 
-from prometheus_client import Counter, Histogram, exposition
+from prometheus_client import Counter, Gauge, Histogram, exposition  # type: ignore
 
 from knot_resolver_manager.datamodel.config_schema import KresConfig
 from knot_resolver_manager.kres_id import KresID
-from knot_resolver_manager.kres_manager import KresManager
+from knot_resolver_manager.kresd_controller.interface import Subprocess
 
-KRESD_RESPONSE_LATENCY = Histogram(
-    "kresd_response_latency",
+logger = logging.getLogger(__name__)
+
+RESOLVER_RESPONSE_LATENCY = Histogram(
+    "resolver_response_latency",
     "Time it takes to respond to queries in seconds",
     buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")],
     labelnames=["instance_id"],
 )
-KRESD_REQUEST_TOTAL = Counter(
-    "kresd_request_total",
+RESOLVER_REQUEST_TOTAL = Counter(
+    "resolver_request_total",
     "total number of DNS requests (including internal client requests)",
     labelnames=["instance_id"],
 )
-KRESD_REQUEST_INTERNAL = Counter(
-    "kresd_request_internal",
+RESOLVER_REQUEST_INTERNAL = Counter(
+    "resolver_request_internal",
     "number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)",
     labelnames=["instance_id"],
 )
-KRESD_REQUEST_UDP = Counter(
-    "kresd_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"]
+RESOLVER_REQUEST_UDP = Counter(
+    "resolver_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"]
 )
-KRESD_REQUEST_TCP = Counter(
-    "kresd_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"]
+RESOLVER_REQUEST_TCP = Counter(
+    "resolver_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"]
 )
-KRESD_REQUEST_DOT = Counter(
-    "kresd_request_dot", "number of external requests received over DNS-over-TLS (RFC 7858)", labelnames=["instance_id"]
+RESOLVER_REQUEST_DOT = Counter(
+    "resolver_request_dot",
+    "number of external requests received over DNS-over-TLS (RFC 7858)",
+    labelnames=["instance_id"],
 )
-KRESD_REQUEST_DOH = Counter(
-    "kresd_request_doh",
+RESOLVER_REQUEST_DOH = Counter(
+    "resolver_request_doh",
     "number of external requests received over DNS-over-HTTP (RFC 8484)",
     labelnames=["instance_id"],
 )
-KRESD_REQUEST_XDP = Counter(
-    "kresd_request_xdp",
+RESOLVER_REQUEST_XDP = Counter(
+    "resolver_request_xdp",
     "number of external requests received over plain UDP via an AF_XDP socket",
     labelnames=["instance_id"],
 )
-KRESD_ANSWER_TOTAL = Counter("kresd_answer_total", "total number of answered queries", labelnames=["instance_id"])
-KRESD_ANSWER_CACHED = Counter(
-    "kresd_answer_cached", "number of queries answered from cache", labelnames=["instance_id"]
+RESOLVER_ANSWER_TOTAL = Counter("resolver_answer_total", "total number of answered queries", labelnames=["instance_id"])
+RESOLVER_ANSWER_CACHED = Counter(
+    "resolver_answer_cached", "number of queries answered from cache", labelnames=["instance_id"]
 )
 
-KRESD_ANSWER_RCODE_NOERROR = Counter(
-    "kresd_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_NOERROR = Counter(
+    "resolver_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"]
 )
-KRESD_ANSWER_RCODE_NODATA = Counter(
-    "kresd_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_NODATA = Counter(
+    "resolver_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"]
 )
-KRESD_ANSWER_RCODE_NXDOMAIN = Counter(
-    "kresd_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_NXDOMAIN = Counter(
+    "resolver_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"]
 )
-KRESD_ANSWER_RCODE_SERVFAIL = Counter(
-    "kresd_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_RCODE_SERVFAIL = Counter(
+    "resolver_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"]
 )
 
-KRESD_ANSWER_FLAG_AA = Counter("kresd_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"])
-KRESD_ANSWER_FLAG_TC = Counter("kresd_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"])
-KRESD_ANSWER_FLAG_RA = Counter(
-    "kresd_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"]
+RESOLVER_ANSWER_FLAG_AA = Counter(
+    "resolver_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"]
+)
+RESOLVER_ANSWER_FLAG_TC = Counter("resolver_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"])
+RESOLVER_ANSWER_FLAG_RA = Counter(
+    "resolver_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"]
+)
+RESOLVER_ANSWER_FLAG_RD = Counter(
+    "resolver_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"]
+)
+RESOLVER_ANSWER_FLAG_AD = Counter(
+    "resolver_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"]
 )
-KRESD_ANSWER_FLAG_RD = Counter(
-    "kresd_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"]
+RESOLVER_ANSWER_FLAG_CD = Counter(
+    "resolver_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"]
 )
-KRESD_ANSWER_FLAG_AD = Counter(
-    "kresd_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"]
+RESOLVER_ANSWER_FLAG_DO = Counter("resolver_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"])
+RESOLVER_ANSWER_FLAG_EDNS0 = Counter(
+    "resolver_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"]
 )
-KRESD_ANSWER_FLAG_CD = Counter(
-    "kresd_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"]
+
+RESOLVER_QUERY_EDNS = Counter("resolver_query_edns", "number of queries with EDNS present", labelnames=["instance_id"])
+RESOLVER_QUERY_DNSSEC = Counter(
+    "resolver_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"]
 )
-KRESD_ANSWER_FLAG_DO = Counter("kresd_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"])
-KRESD_ANSWER_FLAG_EDNS0 = Counter(
-    "kresd_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"]
+
+RESOLVER_METRICS_LOADED = Gauge(
+    "resolver_metrics_loaded",
+    "0 if metrics from resolver instance were not loaded, otherwise 1",
+    labelnames=["instance_id"],
+)
+
+
+_ALL_RESOLVER_METRICS: List[Union[Counter, Gauge, Histogram]] = [
+    RESOLVER_RESPONSE_LATENCY,
+    RESOLVER_REQUEST_TOTAL,
+    RESOLVER_REQUEST_INTERNAL,
+    RESOLVER_REQUEST_UDP,
+    RESOLVER_REQUEST_TCP,
+    RESOLVER_REQUEST_DOT,
+    RESOLVER_REQUEST_DOH,
+    RESOLVER_REQUEST_XDP,
+    RESOLVER_ANSWER_TOTAL,
+    RESOLVER_ANSWER_CACHED,
+    RESOLVER_ANSWER_RCODE_NOERROR,
+    RESOLVER_ANSWER_RCODE_NODATA,
+    RESOLVER_ANSWER_RCODE_NXDOMAIN,
+    RESOLVER_ANSWER_RCODE_SERVFAIL,
+    RESOLVER_ANSWER_FLAG_AA,
+    RESOLVER_ANSWER_FLAG_TC,
+    RESOLVER_ANSWER_FLAG_RA,
+    RESOLVER_ANSWER_FLAG_RD,
+    RESOLVER_ANSWER_FLAG_AD,
+    RESOLVER_ANSWER_FLAG_CD,
+    RESOLVER_ANSWER_FLAG_DO,
+    RESOLVER_ANSWER_FLAG_EDNS0,
+    RESOLVER_QUERY_EDNS,
+    RESOLVER_QUERY_DNSSEC,
+    RESOLVER_METRICS_LOADED,
+]
+
+MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram(
+    "manager_request_reconfigure_latency", "Time it takes to change configuration"
 )
 
-KRESD_QUERY_EDNS = Counter("kresd_query_edns", "number of queries with EDNS present", labelnames=["instance_id"])
-KRESD_QUERY_DNSSEC = Counter("kresd_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"])
+_REGISTERED_RESOLVERS: Dict[KresID, Subprocess] = {}
+
 
-MANAGER_REQUEST_RECONFIGURE_LATENCY = Histogram("manager_request_reconfigure_latency", "Time it takes to change configuration")
+T = TypeVar("T")
 
 
-def async_timing_histogram(metric: Histogram):
-    def decorator(func: Coroutine[Any, Any, Any]):
-        async def wrapper(*args, **kwargs):
+def async_timing_histogram(metric: Histogram) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
+    """
+    Decorator which can be used to report duration on async functions
+    """
+
+    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
+        async def wrapper(*args: Any, **kwargs: Any) -> T:
             with metric.time():
                 res = await func(*args, **kwargs)
                 return res
+
         return wrapper
+
     return decorator
 
 
-def _generate_instance_metrics(instance_id: KresID, metrics: Any) -> None:
+async def _command_registered_resolvers(cmd: str) -> Dict[KresID, str]:
+    async def single_pair(sub: Subprocess) -> Tuple[KresID, str]:
+        return sub.id, await sub.command(cmd)
+
+    pairs = await asyncio.gather(*(single_pair(inst) for inst in _REGISTERED_RESOLVERS.values()))
+    return dict(pairs)
+
+
+def _parse_resolver_metrics(instance_id: KresID, metrics: Any) -> None:
     # Uses private fields in order to translate kresd statistics into prometheus's library internal objects.
     # pylint: disable=protected-access
+    # pyright: reportUnknownMemberType=false
 
     sid = str(instance_id)
 
     # response latency histogram
     for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")):
-        KRESD_RESPONSE_LATENCY.labels(str(sid))._buckets[i].set(metrics[f"answer.{duration}"])
+        RESOLVER_RESPONSE_LATENCY.labels(sid)._buckets[i].set(metrics[f"answer.{duration}"])
     # TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721
-    # KRESD_RESPONSE_LATENCY.labels(str(id))._sum.set(sum)
-
-    KRESD_REQUEST_TOTAL.labels(str(sid))._value.set(metrics["request.total"])
-    KRESD_REQUEST_INTERNAL.labels(str(sid))._value.set(metrics["request.internal"])
-    KRESD_REQUEST_UDP.labels(str(sid))._value.set(metrics["request.udp"])
-    KRESD_REQUEST_TCP.labels(str(sid))._value.set(metrics["request.tcp"])
-    KRESD_REQUEST_DOT.labels(str(sid))._value.set(metrics["request.dot"])
-    KRESD_REQUEST_DOH.labels(str(sid))._value.set(metrics["request.doh"])
-    KRESD_REQUEST_XDP.labels(str(sid))._value.set(metrics["request.xdp"])
-
-    KRESD_ANSWER_TOTAL.labels(str(sid))._value.set(metrics["answer.total"])
-    KRESD_ANSWER_CACHED.labels(str(sid))._value.set(metrics["answer.cached"])
-
-    KRESD_ANSWER_RCODE_NOERROR.labels(str(sid))._value.set(metrics["answer.noerror"])
-    KRESD_ANSWER_RCODE_NODATA.labels(str(sid))._value.set(metrics["answer.nodata"])
-    KRESD_ANSWER_RCODE_NXDOMAIN.labels(str(sid))._value.set(metrics["answer.nxdomain"])
-    KRESD_ANSWER_RCODE_SERVFAIL.labels(str(sid))._value.set(metrics["answer.servfail"])
-
-    KRESD_ANSWER_FLAG_AA.labels(str(sid))._value.set(metrics["answer.aa"])
-    KRESD_ANSWER_FLAG_TC.labels(str(sid))._value.set(metrics["answer.tc"])
-    KRESD_ANSWER_FLAG_RA.labels(str(sid))._value.set(metrics["answer.ra"])
-    KRESD_ANSWER_FLAG_RD.labels(str(sid))._value.set(metrics["answer.rd"])
-    KRESD_ANSWER_FLAG_AD.labels(str(sid))._value.set(metrics["answer.ad"])
-    KRESD_ANSWER_FLAG_CD.labels(str(sid))._value.set(metrics["answer.cd"])
-    KRESD_ANSWER_FLAG_DO.labels(str(sid))._value.set(metrics["answer.do"])
-    KRESD_ANSWER_FLAG_EDNS0.labels(str(sid))._value.set(metrics["answer.edns0"])
-
-    KRESD_QUERY_EDNS.labels(str(sid))._value.set(metrics["query.edns"])
-    KRESD_QUERY_DNSSEC.labels(str(sid))._value.set(metrics["query.dnssec"])
-
-
-async def collect(_config: KresConfig, manager: KresManager) -> bytes:
+    # RESOLVER_RESPONSE_LATENCY.labels(str(id))._sum.set(sum)
+
+    RESOLVER_REQUEST_TOTAL.labels(sid)._value.set(metrics["request.total"])
+    RESOLVER_REQUEST_INTERNAL.labels(sid)._value.set(metrics["request.internal"])
+    RESOLVER_REQUEST_UDP.labels(sid)._value.set(metrics["request.udp"])
+    RESOLVER_REQUEST_TCP.labels(sid)._value.set(metrics["request.tcp"])
+    RESOLVER_REQUEST_DOT.labels(sid)._value.set(metrics["request.dot"])
+    RESOLVER_REQUEST_DOH.labels(sid)._value.set(metrics["request.doh"])
+    RESOLVER_REQUEST_XDP.labels(sid)._value.set(metrics["request.xdp"])
+
+    RESOLVER_ANSWER_TOTAL.labels(sid)._value.set(metrics["answer.total"])
+    RESOLVER_ANSWER_CACHED.labels(sid)._value.set(metrics["answer.cached"])
+
+    RESOLVER_ANSWER_RCODE_NOERROR.labels(sid)._value.set(metrics["answer.noerror"])
+    RESOLVER_ANSWER_RCODE_NODATA.labels(sid)._value.set(metrics["answer.nodata"])
+    RESOLVER_ANSWER_RCODE_NXDOMAIN.labels(sid)._value.set(metrics["answer.nxdomain"])
+    RESOLVER_ANSWER_RCODE_SERVFAIL.labels(sid)._value.set(metrics["answer.servfail"])
+
+    RESOLVER_ANSWER_FLAG_AA.labels(sid)._value.set(metrics["answer.aa"])
+    RESOLVER_ANSWER_FLAG_TC.labels(sid)._value.set(metrics["answer.tc"])
+    RESOLVER_ANSWER_FLAG_RA.labels(sid)._value.set(metrics["answer.ra"])
+    RESOLVER_ANSWER_FLAG_RD.labels(sid)._value.set(metrics["answer.rd"])
+    RESOLVER_ANSWER_FLAG_AD.labels(sid)._value.set(metrics["answer.ad"])
+    RESOLVER_ANSWER_FLAG_CD.labels(sid)._value.set(metrics["answer.cd"])
+    RESOLVER_ANSWER_FLAG_DO.labels(sid)._value.set(metrics["answer.do"])
+    RESOLVER_ANSWER_FLAG_EDNS0.labels(sid)._value.set(metrics["answer.edns0"])
+
+    RESOLVER_QUERY_EDNS.labels(sid)._value.set(metrics["query.edns"])
+    RESOLVER_QUERY_DNSSEC.labels(sid)._value.set(metrics["query.dnssec"])
+
+
+async def _collect_resolver_stats(lazy: bool) -> None:
+    ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
+    STATS_QUERY = "collect_statistics()"
+
+    cmd = ON_DEMAND_STATS_QUERY if lazy else STATS_QUERY
+    stats_raw = await _command_registered_resolvers(cmd)
+
+    for kid, raw in stats_raw.items():
+        RESOLVER_METRICS_LOADED.labels(str(id)).set(0)
+        try:
+            metrics = json.loads(raw[1:-1])
+            _parse_resolver_metrics(kid, metrics)
+
+            # mark that metrics have been loaded
+            RESOLVER_METRICS_LOADED.labels(str(id)).set(1)
+        except json.JSONDecodeError:
+            logger.warning("Failed to load metrics from resolver instance %d", id)
+
+
+def unregister_resolver_metrics_for(subprocess: Subprocess) -> None:
     """
-    Collects metrics from everything, returns data string in Prometheus format.
+    Cancel metric collection from resolver subprocess
     """
+    sid = str(subprocess.id)
+    for metric in _ALL_RESOLVER_METRICS:
+        metric.remove(sid)
 
-    ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
-    STATS_QUERY = "collect_statistics()"
+    del _REGISTERED_RESOLVERS[subprocess.id]
+
+
+def register_resolver_metrics_for(subprocess: Subprocess) -> None:
+    """
+    Register resolver subprocess for metric collection
+    """
+    sid = str(subprocess.id)
+    for metric in _ALL_RESOLVER_METRICS:
+        metric.labels(sid)
 
-    cmd = ON_DEMAND_STATS_QUERY
-    stats_raw = await manager.command_all(cmd)
+    _REGISTERED_RESOLVERS[subprocess.id] = subprocess
+
+
+async def report_stats(config: KresConfig) -> bytes:
+    """
+    Collects metrics from everything, returns data string in Prometheus format.
+    """
 
-    for id, raw in stats_raw.items():
-        metrics = json.loads(raw[1:-1])
-        _generate_instance_metrics(id, metrics)
+    if config.monitoring.state != "manager-only":
+        await _collect_resolver_stats(config.monitoring.state == "lazy")
 
     return exposition.generate_latest()
index 8f3e04f1aa7037912e41a367ceccd22fca4a3e50..3512111b40255fee2fb10fe6b1dcc7ac7ecb6249 100644 (file)
@@ -22,6 +22,7 @@ install_requires = \
  'PyYAML>=5.4.1',
  'aiohttp>=3.6.12',
  'click>=7.1.2',
+ 'prometheus-client>=0.6',
  'pydbus>=0.6.0',
  'requests>=2.25.1',
  'typing-extensions>=3.7.2']