From f9a88887bb5b9e5e184ce3cc49e6d91f792f45b8 Mon Sep 17 00:00:00 2001 From: Otto Moerbeek Date: Thu, 12 Jun 2025 11:07:10 +0200 Subject: [PATCH] docs plus some cleanup Signed-off-by: Otto Moerbeek --- docs/manpages/sdig.1.rst | 3 +- pdns/dnsmessage.proto | 2 +- pdns/ednsoptions.hh | 1 + pdns/protozero-trace.hh | 7 +- .../recursordist/docs/lua-config/protobuf.rst | 3 + pdns/recursordist/docs/performance.rst | 234 ++++++++++++++++++ pdns/recursordist/pdns_recursor.cc | 9 +- pdns/recursordist/rec-eventtrace.cc | 7 +- pdns/recursordist/rec-main.cc | 5 +- pdns/recursordist/rec-main.hh | 2 +- pdns/recursordist/rec-rust-lib/table.py | 5 +- pdns/recursordist/rec-tcp.cc | 2 +- pdns/recursordist/rec_channel_rec.cc | 2 +- pdns/recursordist/test-protozero-trace.cc | 2 +- pdns/sdig.cc | 8 +- 15 files changed, 265 insertions(+), 27 deletions(-) diff --git a/docs/manpages/sdig.1.rst b/docs/manpages/sdig.1.rst index cd3b53ae52..5da95be9dd 100644 --- a/docs/manpages/sdig.1.rst +++ b/docs/manpages/sdig.1.rst @@ -59,7 +59,8 @@ opcode *OPNUM* Use opcode *OPNUM* instead of 0 (Query). For example, ``sdig 192.0.2.1 53 example.com SOA opcode 4`` sends a ``NOTIFY``. cookie *COOKIE* if *COOKIE* is ``-`` send a random client cookie. Otherwise send the given cookie, which should be a hex string received from a server earlier. - +otid *TraceID* + include a TraceID and an empty SpanID value into the EDNS data. If TraceID is ``-``, a random TraceID is generated, otherwise it is a hex string. Examples -------- diff --git a/pdns/dnsmessage.proto b/pdns/dnsmessage.proto index 57215b85e3..cfc721f2f2 100644 --- a/pdns/dnsmessage.proto +++ b/pdns/dnsmessage.proto @@ -192,7 +192,7 @@ message PBDNSMessage { optional uint32 outgoingQueries = 27; // Number of outgoing queries used to answer the query optional uint32 headerFlags = 28; // Flags field in wire format, 16 bits used optional uint32 ednsVersion = 29; // EDNS version and flags in wire format, see https://www.rfc-editor.org/rfc/rfc6891.html#section-6.1.3 - optional bytes openTelemetryData = 30; // Protobuf encoded OPen Telemetry Data, see https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto + optional bytes openTelemetryData = 30; // Protobuf encoded Open Telemetry Data, see https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto } message PBDNSMessageList { diff --git a/pdns/ednsoptions.hh b/pdns/ednsoptions.hh index 101d1e03b0..9ec8a5b81f 100644 --- a/pdns/ednsoptions.hh +++ b/pdns/ednsoptions.hh @@ -24,6 +24,7 @@ struct EDNSOptionCode { + // Temporary code assigned for OpenTelemetry TraceID and SpanID enum EDNSOptionCodeEnum : uint16_t {NSID=3, DAU=5, DHU=6, N3U=7, ECS=8, EXPIRE=9, COOKIE=10, TCPKEEPALIVE=11, PADDING=12, CHAIN=13, KEYTAG=14, EXTENDEDERROR=15, OTTRACEID=65500, OTSPANID=65501}; }; diff --git a/pdns/protozero-trace.hh b/pdns/protozero-trace.hh index 66b87b4c2b..632879fd99 100644 --- a/pdns/protozero-trace.hh +++ b/pdns/protozero-trace.hh @@ -246,7 +246,7 @@ inline void clear(SpanID& span) inline void fill(TraceID& trace, const std::string& data) { if (data.size() != trace.size()) { - throw std::runtime_error("TracID size mismatch"); + throw std::runtime_error("TraceID size mismatch"); } memcpy(trace.data(), data.data(), trace.size()); } @@ -254,7 +254,7 @@ inline void fill(TraceID& trace, const std::string& data) inline void fill(SpanID& span, const std::string& data) { if (data.size() != span.size()) { - throw std::runtime_error("TracID size mismatch"); + throw std::runtime_error("SpanID size mismatch"); } memcpy(span.data(), data.data(), span.size()); } @@ -671,8 +671,9 @@ struct TracesData return data; } - static TracesData boilerPlate(std::string&& service, std::vector&& spans) + static TracesData boilerPlate(std::string&& service, std::string&& req, std::vector&& spans) { + spans.at(0).attributes.push_back({"value", {req}}); return TracesData{ .resource_spans = {pdns::trace::ResourceSpans{.resource = {.attributes = {{"service.name", {{std::move(service)}}}}}, .scope_spans = {{.spans = std::move(spans)}}}}}; } diff --git a/pdns/recursordist/docs/lua-config/protobuf.rst b/pdns/recursordist/docs/lua-config/protobuf.rst index ec2e4216cd..eaecdd8c1b 100644 --- a/pdns/recursordist/docs/lua-config/protobuf.rst +++ b/pdns/recursordist/docs/lua-config/protobuf.rst @@ -8,6 +8,9 @@ In addition, messages related to responses contain the name, type, class and rda Finally, if a RPZ or custom Lua policy has been applied, response messages also contain the applied policy name and some tags. This is particularly useful to detect and act on infected hosts. +The Protobuf loggging stream can also include event information in a proprietary format and (starting with version 5.3.0) OpenTelemetry Trace data. +See :ref:`event-tracing` for more information. + Configuring Protocol Buffer logs -------------------------------- Protobuf export to a server is enabled using the ``protobufServer()`` directive: diff --git a/pdns/recursordist/docs/performance.rst b/pdns/recursordist/docs/performance.rst index 0b44f11801..f5e3cdfa4c 100644 --- a/pdns/recursordist/docs/performance.rst +++ b/pdns/recursordist/docs/performance.rst @@ -302,6 +302,8 @@ Measuring performance The PowerDNS Recursor exposes many :doc:`metrics ` that can be graphed and monitored. +.. _event-tracing: + Event Tracing ------------- Event tracing is an experimental feature introduced in version 4.6.0 that allows following the internals of processing queries in more detail. @@ -309,6 +311,7 @@ Event tracing is an experimental feature introduced in version 4.6.0 that allows In certain spots in the resolving process event records are created that contain an identification of the event, a timestamp, potentially a value and an indication if this was the start or the end of an event. This is relevant for events that describe stages in the resolving process. At this point in time event logs of queries can be exported using a protobuf log or they can be written to the log file. +Starting with version 5.3.0, the same information can also be exported as OpenTelemetry Traces data. Note that this is an experimental feature that will change in upcoming releases. @@ -406,3 +409,234 @@ An example where various Lua related events can be seen: There is no packet cache hit, so SyncRes is called which does a couple of outgoing queries. +OpenTelemetry Traces +^^^^^^^^^^^^^^^^^^^^ + +OpenTelemetry Traces can be generated by setting :ref:`setting-yaml-recursor.event_trace_enabled` or by using the :doc:`rec_control ` subcommand ``set-event-trace-enabled``. +:program:`Recursor` will set the ``openTelemetryData`` field of ``dnsmessage.proto`` messages crated to contain Protobuf encoded data. +The encoding used is defined in https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto. + +The example decoder in https://github.com/PowerDNS/pdns/blob/master/contrib/ProtobufLogger.py can show a JSON representation of the data and optionally submit the data (in Protobuf format) to an OpenTelemetry trace collector using a POST to a URL given on the command line. + +An example, encoded in JSON: + +.. code-block:: json + + { + "resource_spans": [ + { + "resource": { + "attributes": [ + { + "key": "service.name", + "value": { + "string_value": "rec" + } + } + ] + }, + "scope_spans": [ + { + "spans": [ + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "JXJ4ddrswYQ=", + "parent_span_id": "AAAAAAAAAAA=", + "name": "RecRequest", + "start_time_unix_nano": "1749561792313272000", + "end_time_unix_nano": "1749561792750566000" + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "MQHynlz1HY8=", + "parent_span_id": "JXJ4ddrswYQ=", + "name": "ReqRecv", + "start_time_unix_nano": "1749561792313272000", + "end_time_unix_nano": "1749561792750560000", + "attributes": [ + { + "key": "value", + "value": { + "string_value": "www.powerdns.net/A" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "NZy+TnYqDHA=", + "parent_span_id": "JXJ4ddrswYQ=", + "name": "PCacheCheck", + "start_time_unix_nano": "1749561792313363000", + "end_time_unix_nano": "1749561792313365000", + "attributes": [ + { + "key": "value", + "value": { + "bool_value": false + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "Bo/RN3dz9fs=", + "parent_span_id": "JXJ4ddrswYQ=", + "name": "SyncRes", + "start_time_unix_nano": "1749561792313428000", + "end_time_unix_nano": "1749561792750302000", + "attributes": [ + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "xNMXyT7mWjA=", + "parent_span_id": "Bo/RN3dz9fs=", + "name": "AuthRequest", + "start_time_unix_nano": "1749561792313485000", + "end_time_unix_nano": "1749561792571098000", + "attributes": [ + { + "key": "value", + "value": { + "string_value": "net/A" + } + }, + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "p/JA/yx/qE0=", + "parent_span_id": "Bo/RN3dz9fs=", + "name": "AuthRequest", + "start_time_unix_nano": "1749561792572165000", + "end_time_unix_nano": "1749561792596116000", + "attributes": [ + { + "key": "value", + "value": { + "string_value": "powerdns.net/A" + } + }, + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "CJOTDQJtrCo=", + "parent_span_id": "Bo/RN3dz9fs=", + "name": "AuthRequest", + "start_time_unix_nano": "1749561792596318000", + "end_time_unix_nano": "1749561792610541000", + "attributes": [ + { + "key": "value", + "value": { + "string_value": "net/DNSKEY" + } + }, + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "/WEYJlz7JO4=", + "parent_span_id": "Bo/RN3dz9fs=", + "name": "AuthRequest", + "start_time_unix_nano": "1749561792611791000", + "end_time_unix_nano": "1749561792709927000", + "attributes": [ + { + "key": "value", + "value": { + "string_value": "www.powerdns.net/A" + } + }, + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "XMYoT4nXjBo=", + "parent_span_id": "Bo/RN3dz9fs=", + "name": "AuthRequest", + "start_time_unix_nano": "1749561792710051000", + "end_time_unix_nano": "1749561792730321000", + "attributes": [ + { + "key": "value", + "Value": { + "string_value": "powerdns.net/DNSKEY" + } + }, + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "wVWnDBsGOIo=", + "parent_span_id": "Bo/RN3dz9fs=", + "name": "AuthRequest", + "start_time_unix_nano": "1749561792730960000", + "end_time_unix_nano": "1749561792750073000", + "attributes": [ + { + "key": "value", + "value": { + "string_value": "powerdns.net/A" + } + }, + { + "key": "value", + "value": { + "int_value": "0" + } + } + ] + }, + { + "trace_id": "hYxA7fc6b/0KnrVRZTZPiw==", + "span_id": "qllduzHSRbQ=", + "parent_span_id": "JXJ4ddrswYQ=", + "name": "AnswerSent", + "start_time_unix_nano": "1749561792750557000", + "end_time_unix_nano": "1749561792750557000" + } + ] + } + ] + } + ] + } diff --git a/pdns/recursordist/pdns_recursor.cc b/pdns/recursordist/pdns_recursor.cc index 6a38e8f003..3935356607 100644 --- a/pdns/recursordist/pdns_recursor.cc +++ b/pdns/recursordist/pdns_recursor.cc @@ -1815,10 +1815,7 @@ void startDoResolve(void* arg) // NOLINT(readability-function-cognitive-complexi tcpGuard.setHandled(); } - if (resolver.d_eventTrace.enabled()) { - resolver.d_eventTrace.add(RecEventTrace::AnswerSent); - resolver.d_eventTrace.add(RecEventTrace::CustomEvent, comboWriter->d_mdp.d_qname.toLogString() + '/' + QType(comboWriter->d_mdp.d_qtype).toString(), false, -1U); // XXX - } + resolver.d_eventTrace.add(RecEventTrace::AnswerSent); // Now do the per query changing part of the protobuf message if (t_protobufServers.servers && !(luaconfsLocal->protobufExportConfig.taggedOnly && appliedPolicy.getName().empty() && comboWriter->d_policyTags.empty())) { @@ -1879,7 +1876,7 @@ void startDoResolve(void* arg) // NOLINT(readability-function-cognitive-complexi } if (resolver.d_eventTrace.enabled() && SyncRes::eventTraceEnabled(SyncRes::event_trace_to_ot)) { resolver.d_otTrace.close(); - auto otTrace = pdns::trace::TracesData::boilerPlate("rec", resolver.d_eventTrace.convertToOT(resolver.d_otTrace)); + auto otTrace = pdns::trace::TracesData::boilerPlate("rec", comboWriter->d_mdp.d_qname.toLogString() + '/' + QType(comboWriter->d_mdp.d_qtype).toString(), resolver.d_eventTrace.convertToOT(resolver.d_otTrace)); string otData = otTrace.encode(); pbMessage.setOpenTelemetryData(otData); } @@ -2345,7 +2342,7 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr eventTrace.add(RecEventTrace::AnswerSent); if (t_protobufServers.servers && logResponse && (!luaconfsLocal->protobufExportConfig.taggedOnly || (pbData && pbData->d_tagged))) { - protobufLogResponse(dnsheader, luaconfsLocal, pbData, tval, false, source, destination, mappedSource, ednssubnet, uniqueId, requestorId, deviceId, deviceName, meta, eventTrace, otTrace, policyTags); + protobufLogResponse(qname, qtype, dnsheader, luaconfsLocal, pbData, tval, false, source, destination, mappedSource, ednssubnet, uniqueId, requestorId, deviceId, deviceName, meta, eventTrace, otTrace, policyTags); } if (eventTrace.enabled() && SyncRes::eventTraceEnabled(SyncRes::event_trace_to_log)) { diff --git a/pdns/recursordist/rec-eventtrace.cc b/pdns/recursordist/rec-eventtrace.cc index fd4d4d4294..7b5797bfbb 100644 --- a/pdns/recursordist/rec-eventtrace.cc +++ b/pdns/recursordist/rec-eventtrace.cc @@ -87,7 +87,6 @@ std::vector RecEventTrace::convertToOT(const Span& span) cons size_t index = 0; for (const auto& event : d_events) { - cerr << index << ' ' << event.toString () << ' ' << event.d_matching << endl; if (event.d_start) { // It's an open event Span work{ @@ -113,11 +112,7 @@ std::vector RecEventTrace::convertToOT(const Span& span) cons } else { // It's a close event - if (event.d_matching == -1U) { - auto& work = ret.at(0); - addValue(event, work); - } - else if (ids.find(event.d_matching) != ids.end()) { + if (ids.find(event.d_matching) != ids.end()) { auto& work = ret.at(ids.at(event.d_matching)); addValue(event, work); work.end_time_unix_nano = static_cast(event.d_ts + diff); diff --git a/pdns/recursordist/rec-main.cc b/pdns/recursordist/rec-main.cc index 1175550642..e494f768bc 100644 --- a/pdns/recursordist/rec-main.cc +++ b/pdns/recursordist/rec-main.cc @@ -594,7 +594,8 @@ void protobufLogResponse(pdns::ProtoZero::RecMessage& message) } } -void protobufLogResponse(const struct dnsheader* header, LocalStateHolder& luaconfsLocal, +void protobufLogResponse(const DNSName& qname, QType qtype, + const struct dnsheader* header, LocalStateHolder& luaconfsLocal, const RecursorPacketCache::OptPBData& pbData, const struct timeval& tval, bool tcp, const ComboAddress& source, const ComboAddress& destination, const ComboAddress& mappedSource, @@ -665,7 +666,7 @@ void protobufLogResponse(const struct dnsheader* header, LocalStateHolder& luaconfsLocal, +void protobufLogResponse(const DNSName& qname, QType qtype, const struct dnsheader* header, LocalStateHolder& luaconfsLocal, const RecursorPacketCache::OptPBData& pbData, const struct timeval& tv, bool tcp, const ComboAddress& source, const ComboAddress& destination, const ComboAddress& mappedSource, const EDNSSubnetOpts& ednssubnet, diff --git a/pdns/recursordist/rec-rust-lib/table.py b/pdns/recursordist/rec-rust-lib/table.py index bafbeba43a..cf1cc25c68 100644 --- a/pdns/recursordist/rec-rust-lib/table.py +++ b/pdns/recursordist/rec-rust-lib/table.py @@ -995,10 +995,11 @@ This file can be used to serve data authoritatively using :ref:`setting-export-e 'section' : 'recursor', 'type' : LType.Uint64, 'default' : '0', - 'help' : 'If set, event traces are collected and send out via protobuf logging (1), logfile (2) or both(3)', + 'help' : 'If set, event traces are collected and send out via protobuf logging (1), logfile (2), opentelemetry trace data (4) or a combination', 'doc' : ''' Enable the recording and logging of ref:`event traces`. This is an experimental feature and subject to change. -Possible values are 0: (disabled), 1 (add information to protobuf logging messages) and 2 (write to log) and 3 (both). +Possible values are 0: (disabled), 1 (add information to protobuf logging messages), 2 (write to log), 4 (output OpenTelemetry Trace data in protobuf logging messages). Values can be added to get multiple types of logging simultaneously. +For example, 6 means: write to log and output OpenTelemetry Trace data in the protobuf stream. ''', 'versionadded': '4.6.0', 'runtime': 'set-event-trace-enabled', diff --git a/pdns/recursordist/rec-tcp.cc b/pdns/recursordist/rec-tcp.cc index 6989f7331e..0c59266e0e 100644 --- a/pdns/recursordist/rec-tcp.cc +++ b/pdns/recursordist/rec-tcp.cc @@ -474,7 +474,7 @@ static void doProcessTCPQuestion(std::unique_ptr& comboWriter, s if (t_protobufServers.servers && comboWriter->d_logResponse && (!luaconfsLocal->protobufExportConfig.taggedOnly || (pbData && pbData->d_tagged))) { struct timeval tval{ 0, 0}; - protobufLogResponse(dnsheader, luaconfsLocal, pbData, tval, true, comboWriter->d_source, comboWriter->d_destination, comboWriter->d_mappedSource, comboWriter->d_ednssubnet, comboWriter->d_uuid, comboWriter->d_requestorId, comboWriter->d_deviceId, comboWriter->d_deviceName, comboWriter->d_meta, comboWriter->d_eventTrace, comboWriter->d_otTrace, comboWriter->d_policyTags); + protobufLogResponse(qname, qtype, dnsheader, luaconfsLocal, pbData, tval, true, comboWriter->d_source, comboWriter->d_destination, comboWriter->d_mappedSource, comboWriter->d_ednssubnet, comboWriter->d_uuid, comboWriter->d_requestorId, comboWriter->d_deviceId, comboWriter->d_deviceName, comboWriter->d_meta, comboWriter->d_eventTrace, comboWriter->d_otTrace, comboWriter->d_policyTags); } if (comboWriter->d_eventTrace.enabled() && SyncRes::eventTraceEnabled(SyncRes::event_trace_to_log)) { diff --git a/pdns/recursordist/rec_channel_rec.cc b/pdns/recursordist/rec_channel_rec.cc index 293ae1dbca..1371094ee1 100644 --- a/pdns/recursordist/rec_channel_rec.cc +++ b/pdns/recursordist/rec_channel_rec.cc @@ -1894,7 +1894,7 @@ static RecursorControlChannel::Answer help() "set-minimum-ttl value set minimum-ttl-override\n" "set-carbon-server set a carbon server for telemetry\n" "set-dnssec-log-bogus SETTING enable (SETTING=yes) or disable (SETTING=no) logging of DNSSEC validation failures\n" - "set-event-trace-enabled SETTING set logging of event trace messages, 0 = disabled, 1 = protobuf, 2 = log file, 3 = both\n" + "set-event-trace-enabled SETTING set logging of event trace messages, 0 = disabled, 1 = protobuf, 2 = log file, 4 = opentelemetry traces, combinations allowed by adding\n" "show-yaml [file] show yaml config derived from old-style config\n" "trace-regex [regex file] emit resolution trace for matching queries (no arguments clears tracing)\n" "top-largeanswer-remotes show top remotes receiving large answers\n" diff --git a/pdns/recursordist/test-protozero-trace.cc b/pdns/recursordist/test-protozero-trace.cc index d50715c81a..2ecf860103 100644 --- a/pdns/recursordist/test-protozero-trace.cc +++ b/pdns/recursordist/test-protozero-trace.cc @@ -206,7 +206,7 @@ BOOST_AUTO_TEST_CASE(traces2) std::string data; protozero::pbf_writer writer{data}; traces.encode(writer); -#if 1 +#if 0 std::ofstream x("x"); x << data; #endif diff --git a/pdns/sdig.cc b/pdns/sdig.cc index 319a9dce96..151770d35d 100644 --- a/pdns/sdig.cc +++ b/pdns/sdig.cc @@ -95,8 +95,8 @@ static void fillPacket(vector& packet, const string& q, const string& t opts.emplace_back(EDNSOptionCode::COOKIE, cookieOpt.makeOptString()); } if (otids) { - opts.emplace_back(EDNSOptionCode::OTTRACEID, std::string_view(reinterpret_cast(otids->first.data()), otids->first.size())); - opts.emplace_back(EDNSOptionCode::OTSPANID, std::string_view(reinterpret_cast(otids->second.data()), otids->second.size())); + opts.emplace_back(EDNSOptionCode::OTTRACEID, std::string_view(reinterpret_cast(otids->first.data()), otids->first.size())); // NOLINT + opts.emplace_back(EDNSOptionCode::OTSPANID, std::string_view(reinterpret_cast(otids->second.data()), otids->second.size())); // NOLINT } pw.addOpt(bufsize, 0, dnssec ? EDNSOpts::DNSSECOK : 0, opts); pw.commit(); @@ -370,6 +370,10 @@ try { } else { auto traceIDStr = makeBytesFromHex(traceIDArg); + if (traceIDStr.size() > traceid.size()) { + cerr << "Maximum length of TraceID is " << traceid.size() << " bytes" << endl; + exit(EXIT_FAILURE); + } traceIDStr.resize(traceid.size()); pdns::trace::fill(traceid, traceIDStr); } -- 2.47.2