From: Vasek Sraier Date: Sun, 6 Feb 2022 18:10:17 +0000 (+0100) Subject: manager: exportings prometheus metrics FIXME X-Git-Tag: v6.0.0a1~43^2~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7d66c650926f35f00ef264394e7e5b940c32e274;p=thirdparty%2Fknot-resolver.git manager: exportings prometheus metrics FIXME --- diff --git a/manager/knot_resolver_manager/datamodel/config_schema.py b/manager/knot_resolver_manager/datamodel/config_schema.py index 55c33711d..a0d9dc1c9 100644 --- a/manager/knot_resolver_manager/datamodel/config_schema.py +++ b/manager/knot_resolver_manager/datamodel/config_schema.py @@ -1,5 +1,4 @@ import os -import pathlib import sys from typing import Dict, Optional, Union diff --git a/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 b/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 index de5d64fdf..ff89cf811 100644 --- a/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 +++ b/manager/knot_resolver_manager/datamodel/templates/config.lua.j2 @@ -68,3 +68,14 @@ else log_warn(ffi.C.LOG_GRP_NETWORK, 'bind to '..path..' failed '..err) end end + +function collect_lazy_statistics() + if stats == nil then + modules.load('stats') + end + + return tojson(stats.list()) +end +function collect_statistics() + return tojson(stats.list()) +end diff --git a/manager/knot_resolver_manager/kres_id.py b/manager/knot_resolver_manager/kres_id.py index 946b58d0a..5455076b4 100644 --- a/manager/knot_resolver_manager/kres_id.py +++ b/manager/knot_resolver_manager/kres_id.py @@ -24,6 +24,9 @@ class KresID: else: return self._repr + def __repr__(self) -> str: + return f"KresID({self})" + def __hash__(self) -> int: return self._id diff --git a/manager/knot_resolver_manager/kres_manager.py b/manager/knot_resolver_manager/kres_manager.py index 821fe5bf6..9a66082a2 100644 --- a/manager/knot_resolver_manager/kres_manager.py +++ b/manager/knot_resolver_manager/kres_manager.py @@ -3,7 +3,7 @@ import logging import sys from asyncio.futures import Future from subprocess import SubprocessError -from typing import List, Optional +from typing import Dict, List, Optional, Tuple import knot_resolver_manager.kresd_controller from knot_resolver_manager import kres_id @@ -124,6 +124,13 @@ class KresManager: await self._gc.stop() self._gc = None + async def command_all(self, cmd: str) -> Dict[kres_id.KresID, str]: + async def single_pair(sub: Subprocess) -> Tuple[kres_id.KresID, str]: + return sub.id, await sub.command(cmd) + + pairs = await asyncio.gather(*(single_pair(inst) for inst in self._workers)) + return dict(pairs) + async def validate_config(self, _old: KresConfig, new: KresConfig) -> Result[NoneType, str]: async with self._manager_lock: logger.debug("Testing the new config with a canary process") diff --git a/manager/knot_resolver_manager/kresd_controller/interface.py b/manager/knot_resolver_manager/kresd_controller/interface.py index 9293d22dd..c44b6acc9 100644 --- a/manager/knot_resolver_manager/kresd_controller/interface.py +++ b/manager/knot_resolver_manager/kresd_controller/interface.py @@ -1,5 +1,7 @@ +import asyncio +import sys from enum import Enum, auto -from typing import Dict, Iterable +from typing import Dict, Iterable, Optional from knot_resolver_manager.constants import kresd_config_file from knot_resolver_manager.datamodel.config_schema import KresConfig @@ -66,6 +68,30 @@ class Subprocess: def id(self) -> KresID: raise NotImplementedError() + async def command(self, cmd: str) -> str: + reader: asyncio.StreamReader + writer: Optional[asyncio.StreamWriter] = None + try: + reader, writer = await asyncio.open_unix_connection(f"./control/{self.id}") + + # drop prompt + _ = await reader.read(2) + + # write command + writer.write(cmd.encode("utf8")) + writer.write(b"\n") + await writer.drain() + + # read result + result_bytes = await reader.readline() + return result_bytes.decode("utf8")[:-1] # strip trailing newline + + finally: + if writer is not None: + writer.close() + if sys.version_info.minor >= 7: + await writer.wait_closed() + class SubprocessStatus(Enum): RUNNING = auto() diff --git a/manager/knot_resolver_manager/server.py b/manager/knot_resolver_manager/server.py index ed3253002..d1f520d5f 100644 --- a/manager/knot_resolver_manager/server.py +++ b/manager/knot_resolver_manager/server.py @@ -14,7 +14,7 @@ from aiohttp.web_app import Application from aiohttp.web_response import json_response from aiohttp.web_runner import AppRunner, TCPSite, UnixSite -from knot_resolver_manager import log +from knot_resolver_manager import log, statistics from knot_resolver_manager.compat import asyncio as asyncio_compat from knot_resolver_manager.config_store import ConfigStore from knot_resolver_manager.constants import DEFAULT_MANAGER_CONFIG_FILE @@ -61,10 +61,13 @@ class Server: # This is top-level class containing pretty much everything. Instead of global # variables, we use instance attributes. That's why there are so many and it's # ok. - def __init__(self, store: ConfigStore, config_path: Optional[Path]): + def __init__(self, store: ConfigStore, config_path: Optional[Path], manager: KresManager): # config store & server dynamic reconfiguration self.config_store = store + # stats + self._manager = manager + # HTTP server self.app = Application(middlewares=[error_handler]) self.runner = AppRunner(self.app) @@ -158,6 +161,13 @@ class Server: # return success return web.Response() + async def _handler_metrics(self, _request: web.Request) -> web.Response: + return web.Response( + body=await statistics.collect(self.config_store.get(), self._manager), + content_type="text/plain", + charset="utf8", + ) + async def _handler_schema(self, _request: web.Request) -> web.Response: return web.json_response(KresConfig.json_schema(), headers={"Access-Control-Allow-Origin": "*"}) @@ -207,6 +217,7 @@ class Server: web.post("/stop", self._handler_stop), web.get("/schema", self._handler_schema), web.get("/schema/ui", self._handle_view_schema), + web.get("/metrics", self._handler_metrics), ] ) @@ -359,7 +370,7 @@ async def start_server(config: Union[Path, ParsedTree] = DEFAULT_MANAGER_CONFIG_ # At this point, all backend functionality-providing components are initialized. It's therefore save to start # the API server. - server = Server(config_store, config if isinstance(config, Path) else None) + server = Server(config_store, config if isinstance(config, Path) else None, manager) await server.start() logger.info(f"Manager fully initialized and running in {round(time() - start_time, 3)} seconds") diff --git a/manager/knot_resolver_manager/statistics.py b/manager/knot_resolver_manager/statistics.py new file mode 100644 index 000000000..653d5888c --- /dev/null +++ b/manager/knot_resolver_manager/statistics.py @@ -0,0 +1,142 @@ +import json +from typing import Any + +from prometheus_client import Counter, Histogram, exposition + +from knot_resolver_manager.datamodel.config_schema import KresConfig +from knot_resolver_manager.kres_id import KresID +from knot_resolver_manager.kres_manager import KresManager + +KRESD_RESPONSE_LATENCY = Histogram( + "kresd_response_latency", + "Time it takes to respond to queries in seconds", + buckets=[0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, float("inf")], + labelnames=["instance_id"], +) +KRESD_REQUEST_TOTAL = Counter( + "kresd_request_total", + "total number of DNS requests (including internal client requests)", + labelnames=["instance_id"], +) +KRESD_REQUEST_INTERNAL = Counter( + "kresd_request_internal", + "number of internal requests generated by Knot Resolver (e.g. DNSSEC trust anchor updates)", + labelnames=["instance_id"], +) +KRESD_REQUEST_UDP = Counter( + "kresd_request_udp", "number of external requests received over plain UDP (RFC 1035)", labelnames=["instance_id"] +) +KRESD_REQUEST_TCP = Counter( + "kresd_request_tcp", "number of external requests received over plain TCP (RFC 1035)", labelnames=["instance_id"] +) +KRESD_REQUEST_DOT = Counter( + "kresd_request_dot", "number of external requests received over DNS-over-TLS (RFC 7858)", labelnames=["instance_id"] +) +KRESD_REQUEST_DOH = Counter( + "kresd_request_doh", + "number of external requests received over DNS-over-HTTP (RFC 8484)", + labelnames=["instance_id"], +) +KRESD_REQUEST_XDP = Counter( + "kresd_request_xdp", + "number of external requests received over plain UDP via an AF_XDP socket", + labelnames=["instance_id"], +) +KRESD_ANSWER_TOTAL = Counter("kresd_answer_total", "total number of answered queries", labelnames=["instance_id"]) +KRESD_ANSWER_CACHED = Counter( + "kresd_answer_cached", "number of queries answered from cache", labelnames=["instance_id"] +) + +KRESD_ANSWER_RCODE_NOERROR = Counter( + "kresd_answer_rcode_noerror", "number of NOERROR answers", labelnames=["instance_id"] +) +KRESD_ANSWER_RCODE_NODATA = Counter( + "kresd_answer_rcode_nodata", "number of NOERROR answers without any data", labelnames=["instance_id"] +) +KRESD_ANSWER_RCODE_NXDOMAIN = Counter( + "kresd_answer_rcode_nxdomain", "number of NXDOMAIN answers", labelnames=["instance_id"] +) +KRESD_ANSWER_RCODE_SERVFAIL = Counter( + "kresd_answer_rcode_servfail", "number of SERVFAIL answers", labelnames=["instance_id"] +) + +KRESD_ANSWER_FLAG_AA = Counter("kresd_answer_flag_aa", "number of authoritative answers", labelnames=["instance_id"]) +KRESD_ANSWER_FLAG_TC = Counter("kresd_answer_flag_tc", "number of truncated answers", labelnames=["instance_id"]) +KRESD_ANSWER_FLAG_RA = Counter( + "kresd_answer_flag_ra", "number of answers with recursion available flag", labelnames=["instance_id"] +) +KRESD_ANSWER_FLAG_RD = Counter( + "kresd_answer_flags_rd", "number of recursion desired (in answer!)", labelnames=["instance_id"] +) +KRESD_ANSWER_FLAG_AD = Counter( + "kresd_answer_flag_ad", "number of authentic data (DNSSEC) answers", labelnames=["instance_id"] +) +KRESD_ANSWER_FLAG_CD = Counter( + "kresd_answer_flag_cd", "number of checking disabled (DNSSEC) answers", labelnames=["instance_id"] +) +KRESD_ANSWER_FLAG_DO = Counter("kresd_answer_flag_do", "number of DNSSEC answer OK", labelnames=["instance_id"]) +KRESD_ANSWER_FLAG_EDNS0 = Counter( + "kresd_answer_flag_edns0", "number of answers with EDNS0 present", labelnames=["instance_id"] +) + +KRESD_QUERY_EDNS = Counter("kresd_query_edns", "number of queries with EDNS present", labelnames=["instance_id"]) +KRESD_QUERY_DNSSEC = Counter("kresd_query_dnssec", "number of queries with DNSSEC DO=1", labelnames=["instance_id"]) + + +def _generate_instance_metrics(instance_id: KresID, metrics: Any) -> None: + # Uses private fields in order to translate kresd statistics into prometheus's library internal objects. + # pylint: disable=protected-access + + sid = str(instance_id) + + # response latency histogram + for i, duration in enumerate(("1ms", "10ms", "50ms", "100ms", "250ms", "500ms", "1000ms", "1500ms", "slow")): + KRESD_RESPONSE_LATENCY.labels(str(sid))._buckets[i].set(metrics[f"answer.{duration}"]) + # TODO add sum after fixing https://gitlab.nic.cz/knot/knot-resolver/-/issues/721 + # KRESD_RESPONSE_LATENCY.labels(str(id))._sum.set(sum) + + KRESD_REQUEST_TOTAL.labels(str(sid))._value.set(metrics["request.total"]) + KRESD_REQUEST_INTERNAL.labels(str(sid))._value.set(metrics["request.internal"]) + KRESD_REQUEST_UDP.labels(str(sid))._value.set(metrics["request.udp"]) + KRESD_REQUEST_TCP.labels(str(sid))._value.set(metrics["request.tcp"]) + KRESD_REQUEST_DOT.labels(str(sid))._value.set(metrics["request.dot"]) + KRESD_REQUEST_DOH.labels(str(sid))._value.set(metrics["request.doh"]) + KRESD_REQUEST_XDP.labels(str(sid))._value.set(metrics["request.xdp"]) + + KRESD_ANSWER_TOTAL.labels(str(sid))._value.set(metrics["answer.total"]) + KRESD_ANSWER_CACHED.labels(str(sid))._value.set(metrics["answer.cached"]) + + KRESD_ANSWER_RCODE_NOERROR.labels(str(sid))._value.set(metrics["answer.noerror"]) + KRESD_ANSWER_RCODE_NODATA.labels(str(sid))._value.set(metrics["answer.nodata"]) + KRESD_ANSWER_RCODE_NXDOMAIN.labels(str(sid))._value.set(metrics["answer.nxdomain"]) + KRESD_ANSWER_RCODE_SERVFAIL.labels(str(sid))._value.set(metrics["answer.servfail"]) + + KRESD_ANSWER_FLAG_AA.labels(str(sid))._value.set(metrics["answer.aa"]) + KRESD_ANSWER_FLAG_TC.labels(str(sid))._value.set(metrics["answer.tc"]) + KRESD_ANSWER_FLAG_RA.labels(str(sid))._value.set(metrics["answer.ra"]) + KRESD_ANSWER_FLAG_RD.labels(str(sid))._value.set(metrics["answer.rd"]) + KRESD_ANSWER_FLAG_AD.labels(str(sid))._value.set(metrics["answer.ad"]) + KRESD_ANSWER_FLAG_CD.labels(str(sid))._value.set(metrics["answer.cd"]) + KRESD_ANSWER_FLAG_DO.labels(str(sid))._value.set(metrics["answer.do"]) + KRESD_ANSWER_FLAG_EDNS0.labels(str(sid))._value.set(metrics["answer.edns0"]) + + KRESD_QUERY_EDNS.labels(str(sid))._value.set(metrics["query.edns"]) + KRESD_QUERY_DNSSEC.labels(str(sid))._value.set(metrics["query.dnssec"]) + + +async def collect(_config: KresConfig, manager: KresManager) -> bytes: + """ + Collects metrics from everything, returns data string in Prometheus format. + """ + + ON_DEMAND_STATS_QUERY = "collect_lazy_statistics()" + STATS_QUERY = "collect_statistics()" + + cmd = ON_DEMAND_STATS_QUERY + stats_raw = await manager.command_all(cmd) + + for id, raw in stats_raw.items(): + metrics = json.loads(raw[1:-1]) + _generate_instance_metrics(id, metrics) + + return exposition.generate_latest() diff --git a/manager/poetry.lock b/manager/poetry.lock index c1b66e62e..bb670f611 100644 --- a/manager/poetry.lock +++ b/manager/poetry.lock @@ -681,6 +681,17 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.dependencies] importlib-metadata = {version = ">=1.7.0,<2.0.0", markers = "python_version >= \"2.7\" and python_version < \"2.8\" or python_version >= \"3.5\" and python_version < \"3.8\""} +[[package]] +name = "prometheus-client" +version = "0.6.0" +description = "Python client for the Prometheus monitoring system." +category = "main" +optional = false +python-versions = "*" + +[package.extras] +twisted = ["twisted"] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -1268,7 +1279,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [metadata] lock-version = "1.1" python-versions = "^3.6.8" -content-hash = "421fc5a54b8b87ecf0af518886c50b59be12ef7d6fc84bfbbfadbc8328a8a359" +content-hash = "d623052939bf87e455b7193e41aa06582ecc2db154da2cb5c2c77a183876186f" [metadata.files] aiohttp = [ @@ -1962,6 +1973,9 @@ poetry-core = [ {file = "poetry-core-1.0.7.tar.gz", hash = "sha256:98c11c755a16ef6c5673c22ca94a3802a7df4746a0853a70b6fae8b9f5cac206"}, {file = "poetry_core-1.0.7-py2.py3-none-any.whl", hash = "sha256:4f8a7f5390d772f42c4c4c3f188e6424b802cb4b57466c6633a1b9ac36f18a43"}, ] +prometheus-client = [ + {file = "prometheus_client-0.6.0.tar.gz", hash = "sha256:1b38b958750f66f208bcd9ab92a633c0c994d8859c831f7abc1f46724fcee490"}, +] ptyprocess = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, diff --git a/manager/pyproject.toml b/manager/pyproject.toml index 329845321..5b29ba567 100644 --- a/manager/pyproject.toml +++ b/manager/pyproject.toml @@ -17,6 +17,7 @@ click = "^7.1.2" PyYAML = "^5.4.1" requests = "^2.25.1" typing-extensions = ">=3.7.2" +prometheus-client = "^0.6" [tool.poetry.dev-dependencies] pytest-cov = "^2.11.1"