]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
manager: exportings prometheus metrics FIXME
authorVasek Sraier <git@vakabus.cz>
Sun, 6 Feb 2022 18:10:17 +0000 (19:10 +0100)
committerAleš Mrázek <ales.mrazek@nic.cz>
Fri, 8 Apr 2022 14:17:54 +0000 (16:17 +0200)
manager/knot_resolver_manager/datamodel/config_schema.py
manager/knot_resolver_manager/datamodel/templates/config.lua.j2
manager/knot_resolver_manager/kres_id.py
manager/knot_resolver_manager/kres_manager.py
manager/knot_resolver_manager/kresd_controller/interface.py
manager/knot_resolver_manager/server.py
manager/knot_resolver_manager/statistics.py [new file with mode: 0644]
manager/poetry.lock
manager/pyproject.toml

index 55c33711d5b7c9e36df16d006e5e5008d7dd156c..a0d9dc1c974ddce5dcd7b7276e94b57108328852 100644 (file)
@@ -1,5 +1,4 @@
 import os
-import pathlib
 import sys
 from typing import Dict, Optional, Union
 
index de5d64fdf844925e6c7ba5e863ed51ce75d1cdc5..ff89cf811f8e67710651a8a7d8fd20feb00ba2cc 100644 (file)
@@ -68,3 +68,14 @@ else
                log_warn(ffi.C.LOG_GRP_NETWORK, 'bind to '..path..' failed '..err)
        end
 end
+
+function collect_lazy_statistics()
+       if stats == nil then
+               modules.load('stats')
+       end
+
+       return tojson(stats.list())
+end
+function collect_statistics()
+       return tojson(stats.list())
+end
index 946b58d0a0b7962f35cc6b19275deb7907bb6b14..5455076b42731182819cd30ef962acf74e4e3c51 100644 (file)
@@ -24,6 +24,9 @@ class KresID:
         else:
             return self._repr
 
+    def __repr__(self) -> str:
+        return f"KresID({self})"
+
     def __hash__(self) -> int:
         return self._id
 
index 821fe5bf6bb2addb643ae95c7587c56dd36be801..9a66082a2f4361c83db473e5491fe61056fa5343 100644 (file)
@@ -3,7 +3,7 @@ import logging
 import sys
 from asyncio.futures import Future
 from subprocess import SubprocessError
-from typing import List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import knot_resolver_manager.kresd_controller
 from knot_resolver_manager import kres_id
@@ -124,6 +124,13 @@ class KresManager:
         await self._gc.stop()
         self._gc = None
 
+    async def command_all(self, cmd: str) -> Dict[kres_id.KresID, str]:
+        async def single_pair(sub: Subprocess) -> Tuple[kres_id.KresID, str]:
+            return sub.id, await sub.command(cmd)
+
+        pairs = await asyncio.gather(*(single_pair(inst) for inst in self._workers))
+        return dict(pairs)
+
     async def validate_config(self, _old: KresConfig, new: KresConfig) -> Result[NoneType, str]:
         async with self._manager_lock:
             logger.debug("Testing the new config with a canary process")
index 9293d22ddd1dda897ef09085658e2b73c5fe0d6d..c44b6acc9f9f37e2fcd4ed3f8210b5df7dd608ae 100644 (file)
@@ -1,5 +1,7 @@
+import asyncio
+import sys
 from enum import Enum, auto
-from typing import Dict, Iterable
+from typing import Dict, Iterable, Optional
 
 from knot_resolver_manager.constants import kresd_config_file
 from knot_resolver_manager.datamodel.config_schema import KresConfig
@@ -66,6 +68,30 @@ class Subprocess:
     def id(self) -> KresID:
         raise NotImplementedError()
 
+    async def command(self, cmd: str) -> str:
+        reader: asyncio.StreamReader
+        writer: Optional[asyncio.StreamWriter] = None
+        try:
+            reader, writer = await asyncio.open_unix_connection(f"./control/{self.id}")
+
+            # drop prompt
+            _ = await reader.read(2)
+
+            # write command
+            writer.write(cmd.encode("utf8"))
+            writer.write(b"\n")
+            await writer.drain()
+
+            # read result
+            result_bytes = await reader.readline()
+            return result_bytes.decode("utf8")[:-1]  # strip trailing newline
+
+        finally:
+            if writer is not None:
+                writer.close()
+                if sys.version_info.minor >= 7:
+                    await writer.wait_closed()
+
 
 class SubprocessStatus(Enum):
     RUNNING = auto()
index ed3253002cbff33cb2bfeaa08581377ec7aa353a..d1f520d5fd77ef77f27459c1a3defdb65ffb89af 100644 (file)
@@ -14,7 +14,7 @@ from aiohttp.web_app import Application
 from aiohttp.web_response import json_response
 from aiohttp.web_runner import AppRunner, TCPSite, UnixSite
 
-from knot_resolver_manager import log
+from knot_resolver_manager import log, statistics
 from knot_resolver_manager.compat import asyncio as asyncio_compat
 from knot_resolver_manager.config_store import ConfigStore
 from knot_resolver_manager.constants import DEFAULT_MANAGER_CONFIG_FILE
@@ -61,10 +61,13 @@ class Server:
     # This is top-level class containing pretty much everything. Instead of global
     # variables, we use instance attributes. That's why there are so many and it's
     # ok.
-    def __init__(self, store: ConfigStore, config_path: Optional[Path]):
+    def __init__(self, store: ConfigStore, config_path: Optional[Path], manager: KresManager):
         # config store & server dynamic reconfiguration
         self.config_store = store
 
+        # stats
+        self._manager = manager
+
         # HTTP server
         self.app = Application(middlewares=[error_handler])
         self.runner = AppRunner(self.app)
@@ -158,6 +161,13 @@ class Server:
         # return success
         return web.Response()
 
+    async def _handler_metrics(self, _request: web.Request) -> web.Response:
+        return web.Response(
+            body=await statistics.collect(self.config_store.get(), self._manager),
+            content_type="text/plain",
+            charset="utf8",
+        )
+
     async def _handler_schema(self, _request: web.Request) -> web.Response:
         return web.json_response(KresConfig.json_schema(), headers={"Access-Control-Allow-Origin": "*"})
 
@@ -207,6 +217,7 @@ class Server:
                 web.post("/stop", self._handler_stop),
                 web.get("/schema", self._handler_schema),
                 web.get("/schema/ui", self._handle_view_schema),
+                web.get("/metrics", self._handler_metrics),
             ]
         )
 
@@ -359,7 +370,7 @@ async def start_server(config: Union[Path, ParsedTree] = DEFAULT_MANAGER_CONFIG_
 
     # At this point, all backend functionality-providing components are initialized. It's therefore save to start
     # the API server.
-    server = Server(config_store, config if isinstance(config, Path) else None)
+    server = Server(config_store, config if isinstance(config, Path) else None, manager)
     await server.start()
     logger.info(f"Manager fully initialized and running in {round(time() - start_time, 3)} seconds")
 
diff --git a/manager/knot_resolver_manager/statistics.py b/manager/knot_resolver_manager/statistics.py
new file mode 100644 (file)
index 0000000..653d588
--- /dev/null
@@ -0,0 +1,142 @@
+import json
+from typing import Any
+
+from prometheus_client import Counter, Histogram, exposition
+
+from knot_resolver_manager.datamodel.config_schema import KresConfig
+from knot_resolver_manager.kres_id import KresID
+from knot_resolver_manager.kres_manager import KresManager
+
+KRESD_RESPONSE_LATENCY = Histogram(
+    "kresd_response_latency",
+    "Time it takes to respond to queries in seconds",
+    buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")],
+    labelnames=["instance_id"],
+)
+KRESD_REQUEST_TOTAL = Counter(
+    "kresd_request_total",
+    "total number of DNS requests (including internal client requests)",
+    labelnames=["instance_id"],
+)
+KRESD_REQUEST_INTERNAL = Counter(
+    "kresd_request_internal",
+    "number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)",
+    labelnames=["instance_id"],
+)
+KRESD_REQUEST_UDP = Counter(
+    "kresd_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"]
+)
+KRESD_REQUEST_TCP = Counter(
+    "kresd_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"]
+)
+KRESD_REQUEST_DOT = Counter(
+    "kresd_request_dot", "number of external requests received over DNS-over-TLS (RFC 7858)", labelnames=["instance_id"]
+)
+KRESD_REQUEST_DOH = Counter(
+    "kresd_request_doh",
+    "number of external requests received over DNS-over-HTTP (RFC 8484)",
+    labelnames=["instance_id"],
+)
+KRESD_REQUEST_XDP = Counter(
+    "kresd_request_xdp",
+    "number of external requests received over plain UDP via an AF_XDP socket",
+    labelnames=["instance_id"],
+)
+KRESD_ANSWER_TOTAL = Counter("kresd_answer_total", "total number of answered queries", labelnames=["instance_id"])
+KRESD_ANSWER_CACHED = Counter(
+    "kresd_answer_cached", "number of queries answered from cache", labelnames=["instance_id"]
+)
+
+KRESD_ANSWER_RCODE_NOERROR = Counter(
+    "kresd_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"]
+)
+KRESD_ANSWER_RCODE_NODATA = Counter(
+    "kresd_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"]
+)
+KRESD_ANSWER_RCODE_NXDOMAIN = Counter(
+    "kresd_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"]
+)
+KRESD_ANSWER_RCODE_SERVFAIL = Counter(
+    "kresd_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"]
+)
+
+KRESD_ANSWER_FLAG_AA = Counter("kresd_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"])
+KRESD_ANSWER_FLAG_TC = Counter("kresd_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"])
+KRESD_ANSWER_FLAG_RA = Counter(
+    "kresd_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"]
+)
+KRESD_ANSWER_FLAG_RD = Counter(
+    "kresd_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"]
+)
+KRESD_ANSWER_FLAG_AD = Counter(
+    "kresd_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"]
+)
+KRESD_ANSWER_FLAG_CD = Counter(
+    "kresd_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"]
+)
+KRESD_ANSWER_FLAG_DO = Counter("kresd_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"])
+KRESD_ANSWER_FLAG_EDNS0 = Counter(
+    "kresd_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"]
+)
+
+KRESD_QUERY_EDNS = Counter("kresd_query_edns", "number of queries with EDNS present", labelnames=["instance_id"])
+KRESD_QUERY_DNSSEC = Counter("kresd_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"])
+
+
+def _generate_instance_metrics(instance_id: KresID, metrics: Any) -> None:
+    # Uses private fields in order to translate kresd statistics into prometheus's library internal objects.
+    # pylint: disable=protected-access
+
+    sid = str(instance_id)
+
+    # response latency histogram
+    for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")):
+        KRESD_RESPONSE_LATENCY.labels(str(sid))._buckets[i].set(metrics[f"answer.{duration}"])
+    # TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721
+    # KRESD_RESPONSE_LATENCY.labels(str(id))._sum.set(sum)
+
+    KRESD_REQUEST_TOTAL.labels(str(sid))._value.set(metrics["request.total"])
+    KRESD_REQUEST_INTERNAL.labels(str(sid))._value.set(metrics["request.internal"])
+    KRESD_REQUEST_UDP.labels(str(sid))._value.set(metrics["request.udp"])
+    KRESD_REQUEST_TCP.labels(str(sid))._value.set(metrics["request.tcp"])
+    KRESD_REQUEST_DOT.labels(str(sid))._value.set(metrics["request.dot"])
+    KRESD_REQUEST_DOH.labels(str(sid))._value.set(metrics["request.doh"])
+    KRESD_REQUEST_XDP.labels(str(sid))._value.set(metrics["request.xdp"])
+
+    KRESD_ANSWER_TOTAL.labels(str(sid))._value.set(metrics["answer.total"])
+    KRESD_ANSWER_CACHED.labels(str(sid))._value.set(metrics["answer.cached"])
+
+    KRESD_ANSWER_RCODE_NOERROR.labels(str(sid))._value.set(metrics["answer.noerror"])
+    KRESD_ANSWER_RCODE_NODATA.labels(str(sid))._value.set(metrics["answer.nodata"])
+    KRESD_ANSWER_RCODE_NXDOMAIN.labels(str(sid))._value.set(metrics["answer.nxdomain"])
+    KRESD_ANSWER_RCODE_SERVFAIL.labels(str(sid))._value.set(metrics["answer.servfail"])
+
+    KRESD_ANSWER_FLAG_AA.labels(str(sid))._value.set(metrics["answer.aa"])
+    KRESD_ANSWER_FLAG_TC.labels(str(sid))._value.set(metrics["answer.tc"])
+    KRESD_ANSWER_FLAG_RA.labels(str(sid))._value.set(metrics["answer.ra"])
+    KRESD_ANSWER_FLAG_RD.labels(str(sid))._value.set(metrics["answer.rd"])
+    KRESD_ANSWER_FLAG_AD.labels(str(sid))._value.set(metrics["answer.ad"])
+    KRESD_ANSWER_FLAG_CD.labels(str(sid))._value.set(metrics["answer.cd"])
+    KRESD_ANSWER_FLAG_DO.labels(str(sid))._value.set(metrics["answer.do"])
+    KRESD_ANSWER_FLAG_EDNS0.labels(str(sid))._value.set(metrics["answer.edns0"])
+
+    KRESD_QUERY_EDNS.labels(str(sid))._value.set(metrics["query.edns"])
+    KRESD_QUERY_DNSSEC.labels(str(sid))._value.set(metrics["query.dnssec"])
+
+
+async def collect(_config: KresConfig, manager: KresManager) -> bytes:
+    """
+    Collects metrics from everything, returns data string in Prometheus format.
+    """
+
+    ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()"
+    STATS_QUERY = "collect_statistics()"
+
+    cmd = ON_DEMAND_STATS_QUERY
+    stats_raw = await manager.command_all(cmd)
+
+    for id, raw in stats_raw.items():
+        metrics = json.loads(raw[1:-1])
+        _generate_instance_metrics(id, metrics)
+
+    return exposition.generate_latest()
index c1b66e62e0f77f7d6b2e255e3e4591231037d807..bb670f61175ce9e60af6d2582bd49833cac85479 100644 (file)
@@ -681,6 +681,17 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 [package.dependencies]
 importlib-metadata = {version = ">=1.7.0,<2.0.0", markers = "python_version >= \"2.7\" and python_version < \"2.8\" or python_version >= \"3.5\" and python_version < \"3.8\""}
 
+[[package]]
+name = "prometheus-client"
+version = "0.6.0"
+description = "Python client for the Prometheus monitoring system."
+category = "main"
+optional = false
+python-versions = "*"
+
+[package.extras]
+twisted = ["twisted"]
+
 [[package]]
 name = "ptyprocess"
 version = "0.7.0"
@@ -1268,7 +1279,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.6.8"
-content-hash = "421fc5a54b8b87ecf0af518886c50b59be12ef7d6fc84bfbbfadbc8328a8a359"
+content-hash = "d623052939bf87e455b7193e41aa06582ecc2db154da2cb5c2c77a183876186f"
 
 [metadata.files]
 aiohttp = [
@@ -1962,6 +1973,9 @@ poetry-core = [
     {file = "poetry-core-1.0.7.tar.gz", hash = "sha256:98c11c755a16ef6c5673c22ca94a3802a7df4746a0853a70b6fae8b9f5cac206"},
     {file = "poetry_core-1.0.7-py2.py3-none-any.whl", hash = "sha256:4f8a7f5390d772f42c4c4c3f188e6424b802cb4b57466c6633a1b9ac36f18a43"},
 ]
+prometheus-client = [
+    {file = "prometheus_client-0.6.0.tar.gz", hash = "sha256:1b38b958750f66f208bcd9ab92a633c0c994d8859c831f7abc1f46724fcee490"},
+]
 ptyprocess = [
     {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
     {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"},
index 329845321b2ecc25ca085b8df18171a766666ebb..5b29ba5677c06aefe2d1e21329b6d9c07351e16c 100644 (file)
@@ -17,6 +17,7 @@ click = "^7.1.2"
 PyYAML = "^5.4.1"
 requests = "^2.25.1"
 typing-extensions = ">=3.7.2"
+prometheus-client = "^0.6"
 
 [tool.poetry.dev-dependencies]
 pytest-cov = "^2.11.1"