From: Otto Moerbeek Date: Tue, 13 Aug 2024 07:56:54 +0000 (+0200) Subject: rec: distinguish OS imposed limits from app imposed limits, specifically on chains X-Git-Tag: rec-5.2.0-alpha1~140^2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F14554%2Fhead;p=thirdparty%2Fpdns.git rec: distinguish OS imposed limits from app imposed limits, specifically on chains --- diff --git a/pdns/recursordist/RECURSOR-MIB.txt b/pdns/recursordist/RECURSOR-MIB.txt index b64afadc99..ba743382dd 100644 --- a/pdns/recursordist/RECURSOR-MIB.txt +++ b/pdns/recursordist/RECURSOR-MIB.txt @@ -63,6 +63,9 @@ rec MODULE-IDENTITY REVISION "202405230000Z" DESCRIPTION "Added metrics for maximum chain length and weight" + REVISION "202408130000Z" + DESCRIPTION "Added metric for chain limits reached" + ::= { powerdns 2 } powerdns OBJECT IDENTIFIER ::= { enterprises 43315 } @@ -1269,6 +1272,14 @@ maxChainWeight OBJECT-TYPE "Maximum chain weight" ::= { stats 150 } +chainLimits OBJECT-TYPE + SYNTAX Counter64 + MAX-ACCESS read-only + STATUS current + DESCRIPTION + "Chain limits reached" + ::= { stats 151 } + --- --- Traps / Notifications --- @@ -1466,7 +1477,8 @@ recGroup OBJECT-GROUP nodEvents, udrEvents, maxChainLength, - maxChainWeight + maxChainWeight, + chainLimits } STATUS current DESCRIPTION "Objects conformance group for PowerDNS Recursor" diff --git a/pdns/recursordist/docs/metrics.rst b/pdns/recursordist/docs/metrics.rst index 2f3e1f8ff7..8f1fef53db 100644 --- a/pdns/recursordist/docs/metrics.rst +++ b/pdns/recursordist/docs/metrics.rst @@ -262,6 +262,10 @@ case-mismatches ^^^^^^^^^^^^^^^ counts the number of mismatches in character case since starting +chain-limits +^^^^^^^^^^^^ +counts the number of times a chain limit (size or age) has been hit + chain-resends ^^^^^^^^^^^^^ number of queries chained to existing outstanding query diff --git a/pdns/recursordist/lwres.hh b/pdns/recursordist/lwres.hh index 216d44c90c..58d43535e3 100644 --- a/pdns/recursordist/lwres.hh +++ b/pdns/recursordist/lwres.hh @@ -69,9 +69,15 @@ public: Success = 1, PermanentError = 2 /* not transport related */, OSLimitError = 3, - Spoofed = 4 /* Spoofing attempt (too many near-misses) */ + Spoofed = 4, /* Spoofing attempt (too many near-misses) */ + ChainLimitError = 5, }; + [[nodiscard]] static bool isLimitError(Result res) + { + return res == Result::OSLimitError || res == Result::ChainLimitError; + } + vector d_records; int d_rcode{0}; bool d_validpacket{false}; diff --git a/pdns/recursordist/pdns_recursor.cc b/pdns/recursordist/pdns_recursor.cc index 6f9b2839a1..e636cf57a8 100644 --- a/pdns/recursordist/pdns_recursor.cc +++ b/pdns/recursordist/pdns_recursor.cc @@ -308,12 +308,12 @@ LWResult::Result asendto(const void* data, size_t len, int /* flags */, *fileDesc = -1; // gets used in waitEvent / sendEvent later on auto currentChainSize = chain.first->key->authReqChain.size(); if (g_maxChainLength > 0 && currentChainSize >= g_maxChainLength) { - return LWResult::Result::OSLimitError; + return LWResult::Result::ChainLimitError; } assert(uSec(chain.first->key->creationTime) != 0); // NOLINT auto age = now - chain.first->key->creationTime; if (uSec(age) > static_cast(1000) * authWaitTimeMSec(g_multiTasker) * 2 / 3) { - return LWResult::Result::OSLimitError; + return LWResult::Result::ChainLimitError; } chain.first->key->authReqChain.insert(qid); // we can chain auto maxLength = t_Counters.at(rec::Counter::maxChainLength); diff --git a/pdns/recursordist/rec-snmp.cc b/pdns/recursordist/rec-snmp.cc index f433f38374..f6e91ec455 100644 --- a/pdns/recursordist/rec-snmp.cc +++ b/pdns/recursordist/rec-snmp.cc @@ -205,6 +205,7 @@ static const oid10 nodEventsOID = {RECURSOR_STATS_OID, 147}; static const oid10 udrEventsOID = {RECURSOR_STATS_OID, 148}; static const oid10 maxChainLengthOID = {RECURSOR_STATS_OID, 149}; static const oid10 maxChainWeightOID = {RECURSOR_STATS_OID, 150}; +static const oid10 chainLimitsOID = {RECURSOR_STATS_OID, 151}; static std::unordered_map s_statsMap; @@ -435,6 +436,7 @@ RecursorSNMPAgent::RecursorSNMPAgent(const std::string& name, const std::string& registerCounter64Stat("non-resolving-nameserver-entries", nonResolvingNameserverEntriesOID); registerCounter64Stat("maintenance-usec", maintenanceUSecOID); registerCounter64Stat("maintenance-calls", maintenanceCallsOID); + registerCounter64Stat("chain-limits", chainLimitsOID); #define RCODE(num) registerCounter64Stat("auth-" + RCode::to_short_s(num) + "-answers", rcode##num##AnswersOID) // NOLINT(cppcoreguidelines-macro-usage) RCODE(0); diff --git a/pdns/recursordist/rec-tcounters.hh b/pdns/recursordist/rec-tcounters.hh index bc74b05d57..fdbce78d91 100644 --- a/pdns/recursordist/rec-tcounters.hh +++ b/pdns/recursordist/rec-tcounters.hh @@ -98,6 +98,7 @@ enum class Counter : uint8_t udrCount, maxChainLength, maxChainWeight, + chainLimits, numberOfCounters }; diff --git a/pdns/recursordist/rec_channel_rec.cc b/pdns/recursordist/rec_channel_rec.cc index a2b040cb33..b46fed7684 100644 --- a/pdns/recursordist/rec_channel_rec.cc +++ b/pdns/recursordist/rec_channel_rec.cc @@ -1568,6 +1568,7 @@ static void registerAllStats1() addGetStat("max-chain-length", [] { return g_Counters.max(rec::Counter::maxChainLength); }); addGetStat("max-chain-weight", [] { return g_Counters.max(rec::Counter::maxChainWeight); }); + addGetStat("chain-limits", [] { return g_Counters.sum(rec::Counter::chainLimits); }); /* make sure that the ECS stats are properly initialized */ SyncRes::clearECSStats(); diff --git a/pdns/recursordist/syncres.cc b/pdns/recursordist/syncres.cc index 04589a29b3..5f15cc1731 100644 --- a/pdns/recursordist/syncres.cc +++ b/pdns/recursordist/syncres.cc @@ -1609,7 +1609,7 @@ LWResult::Result SyncRes::asyncresolveWrapper(const ComboAddress& address, bool ret = asyncresolve(address, sendQname, type, doTCP, sendRDQuery, EDNSLevel, now, srcmask, ctx, d_outgoingProtobufServers, d_frameStreamServers, luaconfsLocal->outgoingProtobufExportConfig.exportTypes, res, chained); } - if (ret == LWResult::Result::PermanentError || ret == LWResult::Result::OSLimitError || ret == LWResult::Result::Spoofed) { + if (ret == LWResult::Result::PermanentError || LWResult::isLimitError(ret) || ret == LWResult::Result::Spoofed) { break; // transport error, nothing to learn here } @@ -5477,6 +5477,11 @@ bool SyncRes::doResolveAtThisIP(const std::string& prefix, const DNSName& qname, LOG(prefix << qname << ": Hit a local resource limit resolving" << (doTCP ? " over TCP" : "") << ", probable error: " << stringerror() << endl); t_Counters.at(rec::Counter::resourceLimits)++; } + else if (resolveret == LWResult::Result::ChainLimitError) { + /* Chain resource limit reached */ + LOG(prefix << qname << ": Hit a chain limit resolving" << (doTCP ? " over TCP" : "")); + t_Counters.at(rec::Counter::chainLimits)++; + } else { /* LWResult::Result::PermanentError */ t_Counters.at(rec::Counter::unreachables)++; @@ -5487,7 +5492,7 @@ bool SyncRes::doResolveAtThisIP(const std::string& prefix, const DNSName& qname, // don't account for resource limits, they are our own fault // And don't throttle when the IP address is on the dontThrottleNetmasks list or the name is part of dontThrottleNames - if (resolveret != LWResult::Result::OSLimitError && !chained && !dontThrottle) { + if (!LWResult::isLimitError(resolveret) && !chained && !dontThrottle) { uint32_t responseUsec = 1000000; // 1 sec for non-timeout cases // Use the actual time if we saw a timeout if (resolveret == LWResult::Result::Timeout) { diff --git a/pdns/recursordist/ws-recursor.cc b/pdns/recursordist/ws-recursor.cc index 342d54c430..c2a745ac2d 100644 --- a/pdns/recursordist/ws-recursor.cc +++ b/pdns/recursordist/ws-recursor.cc @@ -1257,6 +1257,10 @@ const std::map MetricDefinitionStorage::d_metrics {"max-chain-weight", MetricDefinition(PrometheusMetricType::counter, "Maximum chain weight")}, + + {"chain-limits", + MetricDefinition(PrometheusMetricType::counter, + "Chain limits reached")}, }; constexpr bool CHECK_PROMETHEUS_METRICS = false; diff --git a/regression-tests.recursor-dnssec/test_SNMP.py b/regression-tests.recursor-dnssec/test_SNMP.py index baa087548d..27b31ffbe3 100644 --- a/regression-tests.recursor-dnssec/test_SNMP.py +++ b/regression-tests.recursor-dnssec/test_SNMP.py @@ -21,7 +21,7 @@ class TestSNMP(RecursorTest): """ def _checkStatsValues(self, results): - count = 150 + count = 151 for i in list(range(1, count)): oid = self._snmpOID + '.1.' + str(i) + '.0' self.assertTrue(oid in results)