From: Otto Moerbeek
Date: Tue, 8 Nov 2022 10:10:54 +0000 (+0100)
Subject: Introducing TCounters
X-Git-Tag: dnsdist-1.8.0-rc1~165^2~6
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7d3d2f4f5a441537748f9e271260c5f9a1f35ca2;p=thirdparty%2Fpdns.git

Introducing TCounters

This is a mostly lockless (and not using atomics) way to keep track of counters and other metrics. Atomic values are more expensive than you would think (especially if your platform has no native atomic support for your data type), and using locking all the time for often-updated counters is very expensive as well.

The idea for `TCounters` is based on https://github.com/ahupowerdns/mcounter but addresses the issues raised in https://github.com/ahupowerdns/mcounter/issues/3

Templates are used; the application has to provide a specific class to hold the values and enums to index these values. The application-specific class also has to provide a `merge()` method to merge two instances of the application-specific data. For counters that is simple: just add them. Averages (or histograms) require a bit more work. This is demonstrated in `rec-tcounters.{cc,hh}`.

At the end of a body of work the application's threads should call the `updateSnap()` function. If a certain amount of time has passed since the last call, a thread-local snapshot of the thread-local data is created in a thread-safe way. The class that collects the aggregated values reads (also in a thread-safe way) from the snapshot values in each thread.

Updates of individual counters are done on thread-local data, potentially many times per second. The snaps contain a consistent set of the values and are taken by default once per 100ms, so relatively seldom. By using the snap mechanism, the aggregate values computed are based on internally consistent counter values (as long as related counters are updated from the same thread). A (small) drawback is that the values computed might be a bit out of date. The snapshot approach was suggested by @wojas.

This PR demonstrates `TCounters` for a few Recursor metrics: simple counters and double-typed average values. For the latter, weights are kept, so that the average of averages can be computed in a proper way.
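For illustration only (not part of the commit): a minimal, self-contained C++ sketch of the thread-local counter plus snapshot idea described above. All names here (`Values`, `ThreadData`, `aggregate`, `g_registry`) are made up for this sketch and are not the actual `TCounters` API; the real implementation lives in `tcounters.hh` and `rec-tcounters.{cc,hh}` in the diff below, and handles details (timed snapshots, merge-on-thread-exit, histograms and weighted averages) that are omitted here.

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

enum class Counter : size_t { queries, servfails, numberOfCounters };

struct Values
{
  std::array<uint64_t, static_cast<size_t>(Counter::numberOfCounters)> counts{};
  // merge() simply adds plain counters; averages would also need to carry weights
  Values& merge(const Values& rhs)
  {
    for (size_t i = 0; i < counts.size(); ++i) {
      counts.at(i) += rhs.counts.at(i);
    }
    return *this;
  }
};

struct ThreadData
{
  Values local;        // updated without locks or atomics, only by the owning thread
  std::mutex snapLock; // guards the snapshot copy
  Values snap;         // consistent copy, read by the aggregating side
  void updateSnap()    // called by the owning thread every now and then
  {
    std::lock_guard<std::mutex> guard(snapLock);
    snap = local;
  }
};

std::mutex g_registryLock;
std::vector<std::shared_ptr<ThreadData>> g_registry; // one entry per thread

thread_local std::shared_ptr<ThreadData> t_data = [] {
  auto ptr = std::make_shared<ThreadData>();
  std::lock_guard<std::mutex> guard(g_registryLock);
  g_registry.push_back(ptr);
  return ptr;
}();

uint64_t aggregate(Counter counter) // merge the snapshots of all threads
{
  Values total;
  std::lock_guard<std::mutex> reg(g_registryLock);
  for (const auto& data : g_registry) {
    std::lock_guard<std::mutex> guard(data->snapLock);
    total.merge(data->snap);
  }
  return total.counts.at(static_cast<size_t>(counter));
}

int main()
{
  auto worker = [] {
    for (int i = 0; i < 100000; ++i) {
      // hot path: thread-local memory only, no locks, no atomics
      ++t_data->local.counts.at(static_cast<size_t>(Counter::queries));
      if (i % 1000 == 0) {
        t_data->updateSnap(); // snapshots are taken relatively seldom
      }
    }
    t_data->updateSnap(); // publish the final state before exiting
  };
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back(worker);
  }
  for (auto& thr : threads) {
    thr.join();
  }
  std::cout << "queries: " << aggregate(Counter::queries) << std::endl;
}
```

The point of the sketch is the trade-off named above: the per-update path touches only thread-local data, locks are taken only when a snapshot is copied or the snapshots are merged, and the aggregated result is internally consistent but may lag the live counters slightly.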
--- diff --git a/pdns/histogram.hh b/pdns/histogram.hh index 7cfbcac343..931841eb5d 100644 --- a/pdns/histogram.hh +++ b/pdns/histogram.hh @@ -21,6 +21,7 @@ */ #pragma once +#include #include #include #include @@ -35,8 +36,10 @@ namespace pdns // By convention, we are using microsecond units struct Bucket { - const std::string d_name; - const uint64_t d_boundary{0}; + Bucket(std::string name, uint64_t boundary, uint64_t val) : + d_name(std::move(name)), d_boundary(boundary), d_count(val) {} + std::string d_name; + uint64_t d_boundary{0}; mutable uint64_t d_count{0}; }; @@ -147,9 +150,22 @@ public: d_sum += d; } + BaseHistogram& operator+=(const BaseHistogram& rhs) + { + assert(d_name == rhs.d_name); + assert(d_buckets.size() == rhs.d_buckets.size()); + for (size_t bucket = 0; bucket < d_buckets.size(); ++bucket) { + assert(d_buckets[bucket].d_name == rhs.d_buckets[bucket].d_name); + assert(d_buckets[bucket].d_boundary == rhs.d_buckets[bucket].d_boundary); + d_buckets[bucket].d_count += rhs.d_buckets[bucket].d_count; + } + d_sum += rhs.d_sum; + return *this; + } + private: std::vector d_buckets; - const std::string d_name; + std::string d_name; mutable SumType d_sum{0}; std::vector to125(uint64_t start, int num) diff --git a/pdns/lwres.cc b/pdns/lwres.cc index 7cc80f4e59..aa1c7889c1 100644 --- a/pdns/lwres.cc +++ b/pdns/lwres.cc @@ -466,7 +466,7 @@ static LWResult::Result asyncresolve(const ComboAddress& ip, const DNSName& doma if(!doTCP) { int queryfd; if (ip.sin4.sin_family==AF_INET6) { - g_stats.ipv6queries++; + t_Counters.at(rec::Counter::ipv6queries)++; } ret = asendto((const char*)&*vpacket.begin(), vpacket.size(), 0, ip, qid, domain, type, &queryfd); @@ -624,7 +624,7 @@ static LWResult::Result asyncresolve(const ComboAddress& ip, const DNSName& doma lwr->d_rcode = RCode::FormErr; lwr->d_validpacket = false; - g_stats.serverParseError++; + t_Counters.at(rec::Counter::serverParseError)++; if(outgoingLoggers) { logIncomingResponse(outgoingLoggers, context ? 
context->d_initialRequestId : boost::none, uuid, ip, domain, type, qid, doTCP, dnsOverTLS, srcmask, len, lwr->d_rcode, lwr->d_records, queryTime, exportTypes); @@ -637,7 +637,7 @@ static LWResult::Result asyncresolve(const ComboAddress& ip, const DNSName& doma g_slogout->info(Logr::Notice, "Unknown error parsing packet from remote server", "server", Logging::Loggable(ip))); } - g_stats.serverParseError++; + t_Counters.at(rec::Counter::serverParseError)++; out: if (!lwr->d_rcode) { diff --git a/pdns/pdns_recursor.cc b/pdns/pdns_recursor.cc index ee826d784d..b88c1987e3 100644 --- a/pdns/pdns_recursor.cc +++ b/pdns/pdns_recursor.cc @@ -338,7 +338,7 @@ LWResult::Result arecvfrom(PacketBuffer& packet, int flags, const ComboAddress& "nearmisslimit", Logging::Loggable(nearMissLimit), "qname", Logging::Loggable(domain), "from", Logging::Loggable(fromaddr))); - g_stats.spoofCount++; + t_Counters.at(rec::Counter::spoofCount)++; return LWResult::Result::Spoofed; } @@ -366,13 +366,13 @@ static void updateResponseStats(int res, const ComboAddress& remote, unsigned in if (query && t_servfailqueryring) // packet cache t_servfailqueryring->push_back({*query, qtype}); } - g_stats.servFails++; + ++t_Counters.at(rec::Counter::servFails); break; case RCode::NXDomain: - g_stats.nxDomains++; + ++t_Counters.at(rec::Counter::nxDomains); break; case RCode::NoError: - g_stats.noErrors++; + t_Counters.at(rec::Counter::noErrors)++; break; } } @@ -505,7 +505,7 @@ static PolicyResult handlePolicyHit(const DNSFilterEngine::Policy& appliedPolicy case DNSFilterEngine::PolicyKind::Drop: tcpGuard.setDropOnIdle(); - ++g_stats.policyDrops; + ++t_Counters.at(rec::Counter::policyDrops); return PolicyResult::Drop; case DNSFilterEngine::PolicyKind::NXDOMAIN: @@ -594,7 +594,7 @@ static void sendNODLookup(Logr::log_t nodlogger, const DNSName& dname) if (g_logCommonErrors) { nodlogger->v(10)->error(Logr::Error, "DNSName too long", "Unable to send NOD lookup"); } - ++g_stats.nodLookupsDroppedOversize; + ++t_Counters.at(rec::Counter::nodLookupsDroppedOversize); return; } nodlogger->v(10)->info(Logr::Debug, "Sending NOD lookup", "nodqname", Logging::Loggable(qname)); @@ -721,7 +721,7 @@ int getFakeAAAARecords(const DNSName& qname, ComboAddress prefix, vector& ret) auto log = g_slog->withName("dns64")->withValues("method", Logging::Loggable("getPTR")); int rcode = directResolve(DNSName(newquery), QType::PTR, QClass::IN, ret, t_pdl, log); - g_stats.dns64prefixanswers++; + t_Counters.at(rec::Counter::dns64prefixanswers)++; return rcode; } @@ -952,14 +952,14 @@ void startDoResolve(void* p) // Does the requestor want DNSSEC records? if (edo.d_extFlags & EDNSOpts::DNSSECOK) { DNSSECOK = true; - g_stats.dnssecQueries++; + t_Counters.at(rec::Counter::dnssecQueries)++; } if (dc->d_mdp.d_header.cd) { /* Per rfc6840 section 5.9, "When processing a request with the Checking Disabled (CD) bit set, a resolver SHOULD attempt to return all response data, even data that has failed DNSSEC validation. */ - ++g_stats.dnssecCheckDisabledQueries; + ++t_Counters.at(rec::Counter::dnssecCheckDisabledQueries); } if (dc->d_mdp.d_header.ad) { /* Per rfc6840 section 5.7, "the AD bit in a query as a signal @@ -967,7 +967,7 @@ void startDoResolve(void* p) value of the AD bit in the response. This allows a requester to indicate that it understands the AD bit without also requesting DNSSEC data via the DO bit. 
*/ - ++g_stats.dnssecAuthenticDataQueries; + ++t_Counters.at(rec::Counter::dnssecAuthenticDataQueries); } } else { @@ -1165,7 +1165,7 @@ void startDoResolve(void* p) } catch (const ImmediateQueryDropException& e) { // XXX We need to export a protobuf message (and do a NOD lookup) if requested! - g_stats.policyDrops++; + t_Counters.at(rec::Counter::policyDrops)++; SLOG(g_log << Logger::Debug << "Dropping query because of a filtering policy " << makeLoginfo(dc) << endl, sr.d_slog->info(Logr::Debug, "Dropping query because of a filtering policy")); return; @@ -1286,7 +1286,7 @@ void startDoResolve(void* p) if (res == -1) { pw.getHeader()->rcode = RCode::ServFail; // no commit here, because no record - g_stats.servFails++; + ++t_Counters.at(rec::Counter::servFails); } else { pw.getHeader()->rcode = res; @@ -1584,7 +1584,7 @@ void startDoResolve(void* p) #endif /* NOD_ENABLED */ if (variableAnswer || sr.wasVariable()) { - g_stats.variableResponses++; + t_Counters.at(rec::Counter::variableResponses)++; } if (t_protobufServers.servers && !(luaconfsLocal->protobufExportConfig.taggedOnly && appliedPolicy.getName().empty() && dc->d_policyTags.empty())) { @@ -1753,19 +1753,19 @@ void startDoResolve(void* p) } } - g_stats.answers(spentUsec); - g_stats.cumulativeAnswers(spentUsec); + t_Counters.at(rec::Histogram::answers)(spentUsec); + t_Counters.at(rec::Histogram::cumulativeAnswers)(spentUsec); double newLat = spentUsec; newLat = min(newLat, g_networkTimeoutMsec * 1000.0); // outliers of several minutes exist.. - g_stats.avgLatencyUsec = (1.0 - 1.0 / g_latencyStatSize) * g_stats.avgLatencyUsec + newLat / g_latencyStatSize; + t_Counters.at(rec::DoubleWAvgCounter::avgLatencyUsec).addToRollingAvg(newLat, g_latencyStatSize); // no worries, we do this for packet cache hits elsewhere if (spentUsec >= sr.d_totUsec) { uint64_t ourtime = spentUsec - sr.d_totUsec; - g_stats.ourtime(ourtime); + t_Counters.at(rec::Histogram::ourtime)(ourtime); newLat = ourtime; // usec - g_stats.avgLatencyOursUsec = (1.0 - 1.0 / g_latencyStatSize) * g_stats.avgLatencyOursUsec + newLat / g_latencyStatSize; + t_Counters.at(rec::DoubleWAvgCounter::avgLatencyOursUsec).addToRollingAvg(newLat, g_latencyStatSize); } #ifdef NOD_ENABLED @@ -1807,9 +1807,13 @@ void startDoResolve(void* p) sr.d_slog->info(Logr::Error, "Any other exception in a resolver context")); } + if (g_regressionTestMode) { + t_Counters.updateSnap(g_regressionTestMode); + } runTaskOnce(g_logCommonErrors); - g_stats.maxMThreadStackUsage = max(MT->getMaxStackUsage(), g_stats.maxMThreadStackUsage.load()); + t_Counters.at(rec::Counter::maxMThreadStackUsage) = max(MT->getMaxStackUsage(), t_Counters.at(rec::Counter::maxMThreadStackUsage)); + t_Counters.updateSnap(g_regressionTestMode); } void getQNameAndSubnet(const std::string& question, DNSName* dnsname, uint16_t* qtype, uint16_t* qclass, @@ -1916,19 +1920,20 @@ bool checkForCacheHit(bool qnameParsed, unsigned int tag, const string& data, } } - g_stats.packetCacheHits++; - SyncRes::s_queries++; + t_Counters.at(rec::Counter::packetCacheHits)++; + t_Counters.at(rec::Counter::syncresqueries)++; // XXX ageDNSPacket(response, age); if (response.length() >= sizeof(struct dnsheader)) { const struct dnsheader* dh = reinterpret_cast(response.data()); updateResponseStats(dh->rcode, source, response.length(), 0, 0); } - g_stats.avgLatencyUsec = (1.0 - 1.0 / g_latencyStatSize) * g_stats.avgLatencyUsec + 0.0; // we assume 0 usec - g_stats.avgLatencyOursUsec = (1.0 - 1.0 / g_latencyStatSize) * g_stats.avgLatencyOursUsec + 0.0; // we 
assume 0 usec + // we assume 0 usec + t_Counters.at(rec::DoubleWAvgCounter::avgLatencyUsec).addToRollingAvg(0.0, g_latencyStatSize); + t_Counters.at(rec::DoubleWAvgCounter::avgLatencyOursUsec).addToRollingAvg(0.0, g_latencyStatSize); #if 0 // XXX changes behaviour compared to old code! - g_stats.answers(0); - g_stats.ourtime(0); + t_Counters.at(rec::Counter::answers)(0); + t_Counters.at(rec::Counter::ourtime)(0); #endif } @@ -1977,14 +1982,15 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr double delta = (diff.tv_sec * 1000 + diff.tv_usec / 1000.0); if (delta > 1000.0) { - g_stats.tooOldDrops++; + t_Counters.at(rec::Counter::tooOldDrops)++; return nullptr; } } - ++g_stats.qcounter; + ++t_Counters.at(rec::Counter::qcounter); + if (fromaddr.sin4.sin_family == AF_INET6) - g_stats.ipv6qcounter++; + t_Counters.at(rec::Counter::ipv6qcounter)++; string response; const dnsheader_aligned headerdata(question.data()); @@ -2142,7 +2148,8 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr struct timeval now; Utility::gettimeofday(&now, nullptr); uint64_t spentUsec = uSec(now - tv); - g_stats.cumulativeAnswers(spentUsec); + t_Counters.at(rec::Histogram::cumulativeAnswers)(spentUsec); + t_Counters.updateSnap(g_regressionTestMode); return 0; } } @@ -2162,7 +2169,7 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr SLOG(g_log << Logger::Notice << RecThreadInfo::id() << " [" << MT->getTid() << "/" << MT->numProcesses() << "] DROPPED question from " << source.toStringWithPort() << (source != fromaddr ? " (via " + fromaddr.toStringWithPort() + ")" : "") << " based on policy" << endl, g_slogudpin->info(Logr::Notice, "Dropped question based on policy", "source", Logging::Loggable(source), "remote", Logging::Loggable(fromaddr))); } - g_stats.policyDrops++; + t_Counters.at(rec::Counter::policyDrops)++; return 0; } } @@ -2174,7 +2181,7 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr g_slogudpin->info(Logr::Notice, "Dropping UDP NOTIFY, zone not matched by allow-notify-for", "source", Logging::Loggable(source), "remote", Logging::Loggable(fromaddr))); } - g_stats.zoneDisallowedNotify++; + t_Counters.at(rec::Counter::zoneDisallowedNotify)++; return 0; } @@ -2197,7 +2204,7 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr SLOG(g_log << Logger::Notice << RecThreadInfo::id() << " [" << MT->getTid() << "/" << MT->numProcesses() << "] DROPPED question from " << source.toStringWithPort() << (source != fromaddr ? 
" (via " + fromaddr.toStringWithPort() + ")" : "") << ", over capacity" << endl, g_slogudpin->info(Logr::Notice, "Dropped question, over capacity", "source", Logging::Loggable(source), "remote", Logging::Loggable(fromaddr))); - g_stats.overCapacityDrops++; + t_Counters.at(rec::Counter::overCapacityDrops)++; return 0; } @@ -2270,7 +2277,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) firstQuery = false; if (msgh.msg_flags & MSG_TRUNC) { - g_stats.truncatedDrops++; + t_Counters.at(rec::Counter::truncatedDrops)++; if (!g_quiet) { SLOG(g_log << Logger::Error << "Ignoring truncated query from " << fromaddr.toString() << endl, g_slogudpin->info(Logr::Error, "Ignoring truncated query", "remote", Logging::Loggable(fromaddr))); @@ -2284,7 +2291,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) bool tcp; ssize_t used = parseProxyHeader(data, proxyProto, source, destination, tcp, proxyProtocolValues); if (used <= 0) { - ++g_stats.proxyProtocolInvalidCount; + ++t_Counters.at(rec::Counter::proxyProtocolInvalidCount); if (!g_quiet) { SLOG(g_log << Logger::Error << "Ignoring invalid proxy protocol (" << std::to_string(len) << ", " << std::to_string(used) << ") query from " << fromaddr.toStringWithPort() << endl, g_slogudpin->info(Logr::Error, "Ignoring invalid proxy protocol query", "length", Logging::Loggable(len), @@ -2298,7 +2305,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogudpin->info(Logr::Error, "Proxy protocol header in UDP packet is larger than proxy-protocol-maximum-size", "used", Logging::Loggable(used), "remote", Logging::Loggable(fromaddr))); } - ++g_stats.proxyProtocolInvalidCount; + ++t_Counters.at(rec::Counter::proxyProtocolInvalidCount); return; } @@ -2306,7 +2313,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) } else if (len > 512) { /* we only allow UDP packets larger than 512 for those with a proxy protocol header */ - g_stats.truncatedDrops++; + t_Counters.at(rec::Counter::truncatedDrops)++; if (!g_quiet) { SLOG(g_log << Logger::Error << "Ignoring truncated query from " << fromaddr.toStringWithPort() << endl, g_slogudpin->info(Logr::Error, "Ignoring truncated query", "remote", Logging::Loggable(fromaddr))); @@ -2315,7 +2322,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) } if (data.size() < sizeof(dnsheader)) { - g_stats.ignoredCount++; + t_Counters.at(rec::Counter::ignoredCount)++; if (!g_quiet) { SLOG(g_log << Logger::Error << "Ignoring too-short (" << std::to_string(data.size()) << ") query from " << fromaddr.toString() << endl, g_slogudpin->info(Logr::Error, "Ignoring too-short query", "length", Logging::Loggable(data.size()), @@ -2344,7 +2351,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogudpin->info(Logr::Error, "Dropping UDP query, address not matched by allow-from", "source", Logging::Loggable(mappedSource))); } - g_stats.unauthorizedUDP++; + t_Counters.at(rec::Counter::unauthorizedUDP)++; return; } @@ -2355,7 +2362,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogudpin->info(Logr::Error, "Dropping UDP query can't deal with port 0", "remote", Logging::Loggable(fromaddr))); } - g_stats.clientParseError++; // not quite the best place to put it, but needs to go somewhere + t_Counters.at(rec::Counter::clientParseError)++; // not quite the best place to put it, but needs to go somewhere return; } @@ -2364,21 +2371,21 @@ static void 
handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) const dnsheader* dh = headerdata.get(); if (dh->qr) { - g_stats.ignoredCount++; + t_Counters.at(rec::Counter::ignoredCount)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Ignoring answer from " << fromaddr.toString() << " on server socket!" << endl, g_slogudpin->info(Logr::Error, "Ignoring answer on server socket", "remote", Logging::Loggable(fromaddr))); } } else if (dh->opcode != Opcode::Query && dh->opcode != Opcode::Notify) { - g_stats.ignoredCount++; + t_Counters.at(rec::Counter::ignoredCount)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Ignoring unsupported opcode " << Opcode::to_s(dh->opcode) << " from " << fromaddr.toString() << " on server socket!" << endl, g_slogudpin->info(Logr::Error, "Ignoring unsupported opcode server socket", "remote", Logging::Loggable(fromaddr), "opcode", Logging::Loggable(Opcode::to_s(dh->opcode)))); } } else if (dh->qdcount == 0) { - g_stats.emptyQueriesCount++; + t_Counters.at(rec::Counter::emptyQueriesCount)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Ignoring empty (qdcount == 0) query from " << fromaddr.toString() << " on server socket!" << endl, g_slogudpin->info(Logr::Error, "Ignoring empty (qdcount == 0) query on server socket!", "remote", Logging::Loggable(fromaddr))); @@ -2393,7 +2400,7 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) "source", Logging::Loggable(mappedSource))); } - g_stats.sourceDisallowedNotify++; + t_Counters.at(rec::Counter::sourceDisallowedNotify)++; return; } } @@ -2435,14 +2442,14 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) } } catch (const MOADNSException& mde) { - g_stats.clientParseError++; + t_Counters.at(rec::Counter::clientParseError)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Unable to parse packet from remote UDP client " << fromaddr.toString() << ": " << mde.what() << endl, g_slogudpin->error(Logr::Error, mde.what(), "Unable to parse packet from remote UDP client", "remote", Logging::Loggable(fromaddr), "exception", Logging::Loggable("MOADNSException"))); } } catch (const std::runtime_error& e) { - g_stats.clientParseError++; + t_Counters.at(rec::Counter::clientParseError)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Unable to parse packet from remote UDP client " << fromaddr.toString() << ": " << e.what() << endl, g_slogudpin->error(Logr::Error, e.what(), "Unable to parse packet from remote UDP client", "remote", Logging::Loggable(fromaddr), "exception", Logging::Loggable("std::runtime_error"))); @@ -2452,12 +2459,13 @@ static void handleNewUDPQuestion(int fd, FDMultiplexer::funcparam_t& var) else { // cerr< targetLoad) { - ++g_stats.rebalancedQueries; + ++t_Counters.at(rec::Counter::rebalancedQueries); do { worker = (worker + 1) % RecThreadInfo::numWorkers(); } while (load[worker] > targetLoad); @@ -2637,7 +2645,7 @@ void distributeAsyncFunction(const string& packet, const pipefunc_t& func) unsigned int hash = hashQuestion(reinterpret_cast(packet.data()), packet.length(), g_disthashseed, ok); if (!ok) { // hashQuestion does detect invalid names, so we might as well punt here instead of in the worker thread - g_stats.ignoredCount++; + t_Counters.at(rec::Counter::ignoredCount)++; throw MOADNSException("too-short (" + std::to_string(packet.length()) + ") or invalid name"); } unsigned int target = selectWorker(hash); @@ -2655,7 +2663,7 @@ void distributeAsyncFunction(const string& packet, const pipefunc_t& func) } while 
(newTarget == target); if (!trySendingQueryToWorker(newTarget, tmsg)) { - g_stats.queryPipeFullDrops++; + t_Counters.at(rec::Counter::queryPipeFullDrops)++; delete tmsg; } } @@ -2675,7 +2683,7 @@ static void doResends(MT_t::waiters_t::iterator& iter, const std::shared_ptrfd = -1; r->id = *i; MT->sendEvent(r, &content); - g_stats.chainResends++; + t_Counters.at(rec::Counter::chainResends)++; } } @@ -2694,7 +2702,7 @@ static void handleUDPServerResponse(int fd, FDMultiplexer::funcparam_t& var) if (len < 0) ; // cerr<<"Error on fd "<info(Logr::Error, "Unable to parse packet from remote UDP server", "from", Logging::Loggable(fromaddr))); @@ -2736,7 +2744,7 @@ static void handleUDPServerResponse(int fd, FDMultiplexer::funcparam_t& var) pident->domain = DNSName(reinterpret_cast(packet.data()), len, 12, false, &pident->type); // don't copy this from above - we need to do the actual read } catch (std::exception& e) { - g_stats.serverParseError++; // won't be fed to lwres.cc, so we have to increment + t_Counters.at(rec::Counter::serverParseError)++; // won't be fed to lwres.cc, so we have to increment SLOG(g_log << Logger::Warning << "Error in packet from remote nameserver " << fromaddr.toStringWithPort() << ": " << e.what() << endl, g_slogudpin->error(Logr::Warning, "Error in packet from remote nameserver", "from", Logging::Loggable(fromaddr))); return; @@ -2770,7 +2778,7 @@ retryWithName: goto retryWithName; // note that this only passes on an error, lwres will still reject the packet } } - g_stats.unexpectedCount++; // if we made it here, it really is an unexpected answer + t_Counters.at(rec::Counter::unexpectedCount)++; // if we made it here, it really is an unexpected answer if (g_logCommonErrors) { SLOG(g_log << Logger::Warning << "Discarding unexpected packet from " << fromaddr.toStringWithPort() << ": " << (pident->domain.empty() ? 
"" : pident->domain.toString()) << ", " << pident->type << ", " << MT->d_waiters.size() << " waiters" << endl, g_slogudpin->info(Logr::Warning, "Discarding unexpected packet", "from", Logging::Loggable(fromaddr), diff --git a/pdns/rec_channel_rec.cc b/pdns/rec_channel_rec.cc index 52c93bf418..cae909fd02 100644 --- a/pdns/rec_channel_rec.cc +++ b/pdns/rec_channel_rec.cc @@ -1149,7 +1149,7 @@ static uint64_t doGetMallocated() return 0; } -static StatsMap toStatsMap(const string& name, const pdns::AtomicHistogram& histogram) +static StatsMap toStatsMap(const string& name, const pdns::Histogram& histogram) { const auto& data = histogram.getCumulativeBuckets(); const string pbasename = getPrometheusName(name); @@ -1169,7 +1169,7 @@ static StatsMap toStatsMap(const string& name, const pdns::AtomicHistogram& hist return entries; } -static StatsMap toStatsMap(const string& name, const pdns::AtomicHistogram& histogram4, const pdns::AtomicHistogram& histogram6) +static StatsMap toStatsMap(const string& name, const pdns::Histogram& histogram4, const pdns::Histogram& histogram6) { const string pbasename = getPrometheusName(name); StatsMap entries; @@ -1199,13 +1199,14 @@ static StatsMap toStatsMap(const string& name, const pdns::AtomicHistogram& hist return entries; } -static StatsMap toAuthRCodeStatsMap(const string& name, const std::array& v) +static StatsMap toAuthRCodeStatsMap(const string& name) { const string pbasename = getPrometheusName(name); StatsMap entries; uint8_t n = 0; - for (const auto& entry : v) { + auto rcodes = g_Counters.sum(rec::RCode::auth).rcodeCounters; + for (const auto& entry : rcodes) { const auto key = RCode::to_short_s(n); std::string pname = pbasename + "{rcode=\"" + key + "\"}"; entries.emplace("auth-" + key + "-answers", StatsMapEntry{pname, std::to_string(entry)}); @@ -1313,9 +1314,9 @@ static StatsMap toRemoteLoggerStatsMap(const string& name) static void registerAllStats1() { - addGetStat("questions", &g_stats.qcounter); - addGetStat("ipv6-questions", &g_stats.ipv6qcounter); - addGetStat("tcp-questions", &g_stats.tcpqcounter); + addGetStat("questions", [] { return g_Counters.sum(rec::Counter::qcounter); }); + addGetStat("ipv6-questions", [] { return g_Counters.sum(rec::Counter::ipv6qcounter); }); + addGetStat("tcp-questions", [] { return g_Counters.sum(rec::Counter::tcpqcounter); }); addGetStat("cache-hits", doGetCacheHits); addGetStat("cache-misses", doGetCacheMisses); @@ -1339,63 +1340,63 @@ static void registerAllStats1() addGetStat("malloc-bytes", doGetMallocated); - addGetStat("servfail-answers", &g_stats.servFails); - addGetStat("nxdomain-answers", &g_stats.nxDomains); - addGetStat("noerror-answers", &g_stats.noErrors); - - addGetStat("unauthorized-udp", &g_stats.unauthorizedUDP); - addGetStat("unauthorized-tcp", &g_stats.unauthorizedTCP); - addGetStat("source-disallowed-notify", &g_stats.sourceDisallowedNotify); - addGetStat("zone-disallowed-notify", &g_stats.zoneDisallowedNotify); - addGetStat("tcp-client-overflow", &g_stats.tcpClientOverflow); - - addGetStat("client-parse-errors", &g_stats.clientParseError); - addGetStat("server-parse-errors", &g_stats.serverParseError); - addGetStat("too-old-drops", &g_stats.tooOldDrops); - addGetStat("truncated-drops", &g_stats.truncatedDrops); - addGetStat("query-pipe-full-drops", &g_stats.queryPipeFullDrops); - - addGetStat("answers0-1", []() { return g_stats.answers.getCount(0); }); - addGetStat("answers1-10", []() { return g_stats.answers.getCount(1); }); - addGetStat("answers10-100", []() { return 
g_stats.answers.getCount(2); }); - addGetStat("answers100-1000", []() { return g_stats.answers.getCount(3); }); - addGetStat("answers-slow", []() { return g_stats.answers.getCount(4); }); - - addGetStat("x-ourtime0-1", []() { return g_stats.ourtime.getCount(0); }); - addGetStat("x-ourtime1-2", []() { return g_stats.ourtime.getCount(1); }); - addGetStat("x-ourtime2-4", []() { return g_stats.ourtime.getCount(2); }); - addGetStat("x-ourtime4-8", []() { return g_stats.ourtime.getCount(3); }); - addGetStat("x-ourtime8-16", []() { return g_stats.ourtime.getCount(4); }); - addGetStat("x-ourtime16-32", []() { return g_stats.ourtime.getCount(5); }); - addGetStat("x-ourtime-slow", []() { return g_stats.ourtime.getCount(6); }); - - addGetStat("auth4-answers0-1", []() { return g_stats.auth4Answers.getCount(0); }); - addGetStat("auth4-answers1-10", []() { return g_stats.auth4Answers.getCount(1); }); - addGetStat("auth4-answers10-100", []() { return g_stats.auth4Answers.getCount(2); }); - addGetStat("auth4-answers100-1000", []() { return g_stats.auth4Answers.getCount(3); }); - addGetStat("auth4-answers-slow", []() { return g_stats.auth4Answers.getCount(4); }); - - addGetStat("auth6-answers0-1", []() { return g_stats.auth6Answers.getCount(0); }); - addGetStat("auth6-answers1-10", []() { return g_stats.auth6Answers.getCount(1); }); - addGetStat("auth6-answers10-100", []() { return g_stats.auth6Answers.getCount(2); }); - addGetStat("auth6-answers100-1000", []() { return g_stats.auth6Answers.getCount(3); }); - addGetStat("auth6-answers-slow", []() { return g_stats.auth6Answers.getCount(4); }); - - addGetStat("qa-latency", []() { return round(g_stats.avgLatencyUsec.load()); }); - addGetStat("x-our-latency", []() { return round(g_stats.avgLatencyOursUsec.load()); }); - addGetStat("unexpected-packets", &g_stats.unexpectedCount); - addGetStat("case-mismatches", &g_stats.caseMismatchCount); - addGetStat("spoof-prevents", &g_stats.spoofCount); - - addGetStat("nsset-invalidations", &g_stats.nsSetInvalidations); - - addGetStat("resource-limits", &g_stats.resourceLimits); - addGetStat("over-capacity-drops", &g_stats.overCapacityDrops); - addGetStat("policy-drops", &g_stats.policyDrops); - addGetStat("no-packet-error", &g_stats.noPacketError); - addGetStat("ignored-packets", &g_stats.ignoredCount); - addGetStat("empty-queries", &g_stats.emptyQueriesCount); - addGetStat("max-mthread-stack", &g_stats.maxMThreadStackUsage); + addGetStat("servfail-answers", [] { return g_Counters.sum(rec::Counter::servFails); }); + addGetStat("nxdomain-answers", [] { return g_Counters.sum(rec::Counter::nxDomains); }); + addGetStat("noerror-answers", [] { return g_Counters.sum(rec::Counter::noErrors); }); + + addGetStat("unauthorized-udp", [] { return g_Counters.sum(rec::Counter::unauthorizedUDP); }); + addGetStat("unauthorized-tcp", [] { return g_Counters.sum(rec::Counter::unauthorizedTCP); }); + addGetStat("source-disallowed-notify", [] { return g_Counters.sum(rec::Counter::sourceDisallowedNotify); }); + addGetStat("zone-disallowed-notify", [] { return g_Counters.sum(rec::Counter::zoneDisallowedNotify); }); + addGetStat("tcp-client-overflow", [] { return g_Counters.sum(rec::Counter::tcpClientOverflow); }); + + addGetStat("client-parse-errors", [] { return g_Counters.sum(rec::Counter::clientParseError); }); + addGetStat("server-parse-errors", [] { return g_Counters.sum(rec::Counter::serverParseError); }); + addGetStat("too-old-drops", [] { return g_Counters.sum(rec::Counter::tooOldDrops); }); + addGetStat("truncated-drops", [] { return 
g_Counters.sum(rec::Counter::truncatedDrops); }); + addGetStat("query-pipe-full-drops", [] { return g_Counters.sum(rec::Counter::queryPipeFullDrops); }); + + addGetStat("answers0-1", []() { return g_Counters.sum(rec::Histogram::answers).getCount(0); }); + addGetStat("answers1-10", []() { return g_Counters.sum(rec::Histogram::answers).getCount(1); }); + addGetStat("answers10-100", []() { return g_Counters.sum(rec::Histogram::answers).getCount(2); }); + addGetStat("answers100-1000", []() { return g_Counters.sum(rec::Histogram::answers).getCount(3); }); + addGetStat("answers-slow", []() { return g_Counters.sum(rec::Histogram::answers).getCount(4); }); + + addGetStat("x-ourtime0-1", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(0); }); + addGetStat("x-ourtime1-2", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(1); }); + addGetStat("x-ourtime2-4", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(2); }); + addGetStat("x-ourtime4-8", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(3); }); + addGetStat("x-ourtime8-16", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(4); }); + addGetStat("x-ourtime16-32", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(5); }); + addGetStat("x-ourtime-slow", []() { return g_Counters.sum(rec::Histogram::ourtime).getCount(6); }); + + addGetStat("auth4-answers0-1", []() { return g_Counters.sum(rec::Histogram::auth4Answers).getCount(0); }); + addGetStat("auth4-answers1-10", []() { return g_Counters.sum(rec::Histogram::auth4Answers).getCount(1); }); + addGetStat("auth4-answers10-100", []() { return g_Counters.sum(rec::Histogram::auth4Answers).getCount(2); }); + addGetStat("auth4-answers100-1000", []() { return g_Counters.sum(rec::Histogram::auth4Answers).getCount(3); }); + addGetStat("auth4-answers-slow", []() { return g_Counters.sum(rec::Histogram::auth4Answers).getCount(4); }); + + addGetStat("auth6-answers0-1", []() { return g_Counters.sum(rec::Histogram::auth6Answers).getCount(0); }); + addGetStat("auth6-answers1-10", []() { return g_Counters.sum(rec::Histogram::auth6Answers).getCount(1); }); + addGetStat("auth6-answers10-100", []() { return g_Counters.sum(rec::Histogram::auth6Answers).getCount(2); }); + addGetStat("auth6-answers100-1000", []() { return g_Counters.sum(rec::Histogram::auth6Answers).getCount(3); }); + addGetStat("auth6-answers-slow", []() { return g_Counters.sum(rec::Histogram::auth6Answers).getCount(4); }); + + addGetStat("qa-latency", []() { return round(g_Counters.avg(rec::DoubleWAvgCounter::avgLatencyUsec)); }); + addGetStat("x-our-latency", []() { return round(g_Counters.avg(rec::DoubleWAvgCounter::avgLatencyOursUsec)); }); + addGetStat("unexpected-packets", [] { return g_Counters.sum(rec::Counter::unexpectedCount); }); + addGetStat("case-mismatches", [] { return g_Counters.sum(rec::Counter::caseMismatchCount); }); + addGetStat("spoof-prevents", [] { return g_Counters.sum(rec::Counter::spoofCount); }); + + addGetStat("nsset-invalidations", [] { return g_Counters.sum(rec::Counter::nsSetInvalidations); }); + + addGetStat("resource-limits", [] { return g_Counters.sum(rec::Counter::resourceLimits); }); + addGetStat("over-capacity-drops", [] { return g_Counters.sum(rec::Counter::overCapacityDrops); }); + addGetStat("policy-drops", [] { return g_Counters.sum(rec::Counter::policyDrops); }); + addGetStat("no-packet-error", [] { return g_Counters.sum(rec::Counter::noPacketError); }); + addGetStat("ignored-packets", [] { return 
g_Counters.sum(rec::Counter::ignoredCount); }); + addGetStat("empty-queries", [] { return g_Counters.sum(rec::Counter::emptyQueriesCount); }); + addGetStat("max-mthread-stack", [] { return g_Counters.max(rec::Counter::maxMThreadStackUsage); }); addGetStat("negcache-entries", getNegCacheSize); addGetStat("throttle-entries", SyncRes::getThrottledServersSize); @@ -1406,22 +1407,22 @@ static void registerAllStats1() addGetStat("concurrent-queries", getConcurrentQueries); addGetStat("security-status", &g_security_status); - addGetStat("outgoing-timeouts", &SyncRes::s_outgoingtimeouts); - addGetStat("outgoing4-timeouts", &SyncRes::s_outgoing4timeouts); - addGetStat("outgoing6-timeouts", &SyncRes::s_outgoing6timeouts); - addGetStat("auth-zone-queries", &SyncRes::s_authzonequeries); - addGetStat("tcp-outqueries", &SyncRes::s_tcpoutqueries); - addGetStat("dot-outqueries", &SyncRes::s_dotoutqueries); - addGetStat("all-outqueries", &SyncRes::s_outqueries); - addGetStat("ipv6-outqueries", &g_stats.ipv6queries); - addGetStat("throttled-outqueries", &SyncRes::s_throttledqueries); - addGetStat("dont-outqueries", &SyncRes::s_dontqueries); - addGetStat("qname-min-fallback-success", &SyncRes::s_qnameminfallbacksuccess); - addGetStat("throttled-out", &SyncRes::s_throttledqueries); - addGetStat("unreachables", &SyncRes::s_unreachables); + addGetStat("outgoing-timeouts", [] { return g_Counters.sum(rec::Counter::outgoingtimeouts); }); + addGetStat("outgoing4-timeouts", [] { return g_Counters.sum(rec::Counter::outgoing4timeouts); }); + addGetStat("outgoing6-timeouts", [] { return g_Counters.sum(rec::Counter::outgoing6timeouts); }); + addGetStat("auth-zone-queries", [] { return g_Counters.sum(rec::Counter::authzonequeries); }); + addGetStat("tcp-outqueries", [] { return g_Counters.sum(rec::Counter::tcpoutqueries); }); + addGetStat("dot-outqueries", [] { return g_Counters.sum(rec::Counter::dotoutqueries); }); + addGetStat("all-outqueries", [] { return g_Counters.sum(rec::Counter::outqueries); }); + addGetStat("ipv6-outqueries", [] { return g_Counters.sum(rec::Counter::ipv6queries); }); + addGetStat("throttled-outqueries", [] { return g_Counters.sum(rec::Counter::throttledqueries); }); + addGetStat("dont-outqueries", [] { return g_Counters.sum(rec::Counter::dontqueries); }); + addGetStat("qname-min-fallback-success", [] { return g_Counters.sum(rec::Counter::qnameminfallbacksuccess); }); + addGetStat("throttled-out", [] { return g_Counters.sum(rec::Counter::throttledqueries); }); + addGetStat("unreachables", [] { return g_Counters.sum(rec::Counter::unreachables); }); addGetStat("ecs-queries", &SyncRes::s_ecsqueries); addGetStat("ecs-responses", &SyncRes::s_ecsresponses); - addGetStat("chain-resends", &g_stats.chainResends); + addGetStat("chain-resends", [] { return g_Counters.sum(rec::Counter::chainResends); }); addGetStat("tcp-clients", [] { return TCPConnection::getCurrentConnections(); }); #ifdef __linux__ @@ -1437,17 +1438,17 @@ static void registerAllStats1() addGetStat("udp6-in-csum-errors", [] { return udp6ErrorStats("udp6-in-csum-errors"); }); #endif - addGetStat("edns-ping-matches", &g_stats.ednsPingMatches); - addGetStat("edns-ping-mismatches", &g_stats.ednsPingMismatches); - addGetStat("dnssec-queries", &g_stats.dnssecQueries); + addGetStat("edns-ping-matches", [] { return g_Counters.sum(rec::Counter::ednsPingMatches); }); + addGetStat("edns-ping-mismatches", [] { return g_Counters.sum(rec::Counter::ednsPingMismatches); }); + addGetStat("dnssec-queries", [] { return 
g_Counters.sum(rec::Counter::dnssecQueries); }); - addGetStat("dnssec-authentic-data-queries", &g_stats.dnssecAuthenticDataQueries); - addGetStat("dnssec-check-disabled-queries", &g_stats.dnssecCheckDisabledQueries); + addGetStat("dnssec-authentic-data-queries", [] { return g_Counters.sum(rec::Counter::dnssecAuthenticDataQueries); }); + addGetStat("dnssec-check-disabled-queries", [] { return g_Counters.sum(rec::Counter::dnssecCheckDisabledQueries); }); - addGetStat("variable-responses", &g_stats.variableResponses); + addGetStat("variable-responses", [] { return g_Counters.sum(rec::Counter::variableResponses); }); - addGetStat("noping-outqueries", &g_stats.noPingOutQueries); - addGetStat("noedns-outqueries", &g_stats.noEdnsOutQueries); + addGetStat("noping-outqueries", [] { return g_Counters.sum(rec::Counter::noPingOutQueries); }); + addGetStat("noedns-outqueries", [] { return g_Counters.sum(rec::Counter::noEdnsOutQueries); }); addGetStat("uptime", calculateUptime); addGetStat("real-memory-usage", [] { return getRealMemoryUsage(string()); }); @@ -1471,7 +1472,7 @@ static void registerAllStats1() addGetStat("memory-allocated", [] { return g_mtracer->getTotAllocated(string()); }); #endif - addGetStat("dnssec-validations", &g_stats.dnssecValidations); + addGetStat("dnssec-validations", [] { return g_Counters.sum(rec::Counter::dnssecValidations); }); addGetStat("dnssec-result-insecure", &g_stats.dnssecResults[vState::Insecure]); addGetStat("dnssec-result-secure", &g_stats.dnssecResults[vState::Secure]); addGetStat("dnssec-result-bogus", []() { @@ -1538,17 +1539,17 @@ static void registerAllStats1() addGetStat("policy-result-truncate", &g_stats.policyResults[DNSFilterEngine::PolicyKind::Truncate]); addGetStat("policy-result-custom", &g_stats.policyResults[DNSFilterEngine::PolicyKind::Custom]); - addGetStat("rebalanced-queries", &g_stats.rebalancedQueries); + addGetStat("rebalanced-queries", [] { return g_Counters.sum(rec::Counter::rebalancedQueries); }); - addGetStat("proxy-protocol-invalid", &g_stats.proxyProtocolInvalidCount); + addGetStat("proxy-protocol-invalid", [] { return g_Counters.sum(rec::Counter::proxyProtocolInvalidCount); }); - addGetStat("nod-lookups-dropped-oversize", &g_stats.nodLookupsDroppedOversize); + addGetStat("nod-lookups-dropped-oversize", [] { return g_Counters.sum(rec::Counter::nodLookupsDroppedOversize); }); addGetStat("taskqueue-pushed", []() { return getTaskPushes(); }); addGetStat("taskqueue-expired", []() { return getTaskExpired(); }); addGetStat("taskqueue-size", []() { return getTaskSize(); }); - addGetStat("dns64-prefix-answers", &g_stats.dns64prefixanswers); + addGetStat("dns64-prefix-answers", [] { return g_Counters.sum(rec::Counter::dns64prefixanswers); }); addGetStat("almost-expired-pushed", []() { return getAlmostExpiredTasksPushed(); }); addGetStat("almost-expired-run", []() { return getAlmostExpiredTasksRun(); }); @@ -1556,8 +1557,8 @@ static void registerAllStats1() addGetStat("idle-tcpout-connections", getCurrentIdleTCPConnections); - addGetStat("maintenance-usec", &g_stats.maintenanceUsec); - addGetStat("maintenance-calls", &g_stats.maintenanceCalls); + addGetStat("maintenance-usec", [] { return g_Counters.sum(rec::Counter::maintenanceUsec); }); + addGetStat("maintenance-calls", [] { return g_Counters.sum(rec::Counter::maintenanceCalls); }); /* make sure that the ECS stats are properly initialized */ SyncRes::clearECSStats(); @@ -1571,10 +1572,10 @@ static void registerAllStats1() } addGetStat("cumul-clientanswers", []() { - return 
toStatsMap(g_stats.cumulativeAnswers.getName(), g_stats.cumulativeAnswers); + return toStatsMap(t_Counters.at(rec::Histogram::cumulativeAnswers).getName(), g_Counters.sum(rec::Histogram::cumulativeAnswers)); }); addGetStat("cumul-authanswers", []() { - return toStatsMap(g_stats.cumulativeAuth4Answers.getName(), g_stats.cumulativeAuth4Answers, g_stats.cumulativeAuth6Answers); + return toStatsMap(t_Counters.at(rec::Histogram::cumulativeAuth4Answers).getName(), g_Counters.sum(rec::Histogram::cumulativeAuth4Answers), g_Counters.sum(rec::Histogram::cumulativeAuth6Answers)); }); addGetStat("policy-hits", []() { return toRPZStatsMap("policy-hits", g_stats.policyHits); @@ -1583,7 +1584,7 @@ static void registerAllStats1() return toProxyMappingStatsMap("proxy-mapping-total"); }); addGetStat("auth-rcode-answers", []() { - return toAuthRCodeStatsMap("auth-rcode-answers", g_stats.authRCode); + return toAuthRCodeStatsMap("auth-rcode-answers"); }); addGetStat("remote-logger-count", []() { return toRemoteLoggerStatsMap("remote-logger-count"); diff --git a/pdns/recursordist/Makefile.am b/pdns/recursordist/Makefile.am index 629976e0d1..0371e08c3f 100644 --- a/pdns/recursordist/Makefile.am +++ b/pdns/recursordist/Makefile.am @@ -171,6 +171,7 @@ pdns_recursor_SOURCES = \ rec-protozero.cc rec-protozero.hh \ rec-snmp.hh rec-snmp.cc \ rec-taskqueue.cc rec-taskqueue.hh \ + rec-tcounters.cc rec-tcounters.hh \ rec-tcp.cc \ rec-tcpout.cc rec-tcpout.hh \ rec-zonetocache.cc rec-zonetocache.hh \ @@ -201,6 +202,7 @@ pdns_recursor_SOURCES = \ svc-records.cc svc-records.hh \ syncres.cc syncres.hh \ taskqueue.cc taskqueue.hh \ + tcounters.hh \ tcpiohandler.cc tcpiohandler.hh \ threadname.hh threadname.cc \ tsigverifier.cc tsigverifier.hh \ @@ -294,6 +296,7 @@ testrunner_SOURCES = \ rcpgenerator.cc \ rec-eventtrace.cc rec-eventtrace.hh \ rec-taskqueue.cc rec-taskqueue.hh \ + rec-tcounters.cc rec-tcounters.hh \ rec-zonetocache.cc rec-zonetocache.hh \ recpacketcache.cc recpacketcache.hh \ recursor_cache.cc recursor_cache.hh \ @@ -334,6 +337,7 @@ testrunner_SOURCES = \ test-packetcache_hh.cc \ test-rcpgenerator_cc.cc \ test-rec-taskqueue.cc \ + test-rec-tcounters_cc.cc \ test-rec-zonetocache.cc \ test-recpacketcache_cc.cc \ test-recursorcache_cc.cc \ diff --git a/pdns/recursordist/rec-main.cc b/pdns/recursordist/rec-main.cc index 415b7c8338..a74515ff57 100644 --- a/pdns/recursordist/rec-main.cc +++ b/pdns/recursordist/rec-main.cc @@ -62,6 +62,7 @@ thread_local FrameStreamServersInfo t_nodFrameStreamServersInfo; string g_programname = "pdns_recursor"; string g_pidfname; RecursorControlChannel g_rcc; // only active in the handler thread +bool g_regressionTestMode; #ifdef NOD_ENABLED bool g_nodEnabled; @@ -997,9 +998,16 @@ static void doStats(void) auto log = g_slog->withName("stats"); - if (g_stats.qcounter && (cacheHits + cacheMisses) && SyncRes::s_queries && SyncRes::s_outqueries) { + auto qcounter = g_Counters.sum(rec::Counter::qcounter); + auto syncresqueries = g_Counters.sum(rec::Counter::syncresqueries); + auto outqueries = g_Counters.sum(rec::Counter::outqueries); + auto throttledqueries = g_Counters.sum(rec::Counter::throttledqueries); + auto tcpoutqueries = g_Counters.sum(rec::Counter::tcpoutqueries); + auto dotoutqueries = g_Counters.sum(rec::Counter::dotoutqueries); + auto outgoingtimeouts = g_Counters.sum(rec::Counter::outgoingtimeouts); + if (qcounter > 0 && (cacheHits + cacheMisses) > 0 && syncresqueries > 0 && outqueries > 0) { if (!g_slogStructured) { - g_log << Logger::Notice << "stats: " << g_stats.qcounter << 
" questions, " << cacheSize << " cache entries, " << negCacheSize << " negative entries, " << ratePercentage(cacheHits, cacheHits + cacheMisses) << "% cache hits" << endl; + g_log << Logger::Notice << "stats: " << qcounter << " questions, " << cacheSize << " cache entries, " << negCacheSize << " negative entries, " << ratePercentage(cacheHits, cacheHits + cacheMisses) << "% cache hits" << endl; g_log << Logger::Notice << "stats: cache contended/acquired " << rc_stats.first << '/' << rc_stats.second << " = " << r << '%' << endl; g_log << Logger::Notice << "stats: throttle map: " @@ -1010,18 +1018,18 @@ static void doStats(void) << SyncRes::getNonResolvingNSSize() << ", saved-parentsets: " << SyncRes::getSaveParentsNSSetsSize() << endl; - g_log << Logger::Notice << "stats: outpacket/query ratio " << ratePercentage(SyncRes::s_outqueries, SyncRes::s_queries) << "%"; - g_log << Logger::Notice << ", " << ratePercentage(SyncRes::s_throttledqueries, SyncRes::s_outqueries + SyncRes::s_throttledqueries) << "% throttled" << endl; - g_log << Logger::Notice << "stats: " << SyncRes::s_tcpoutqueries << "/" << SyncRes::s_dotoutqueries << "/" << getCurrentIdleTCPConnections() << " outgoing tcp/dot/idle connections, " << broadcastAccFunction(pleaseGetConcurrentQueries) << " queries running, " << SyncRes::s_outgoingtimeouts << " outgoing timeouts " << endl; + g_log << Logger::Notice << "stats: outpacket/query ratio " << ratePercentage(outqueries, syncresqueries) << "%"; + g_log << Logger::Notice << ", " << ratePercentage(throttledqueries, outqueries + throttledqueries) << "% throttled" << endl; + g_log << Logger::Notice << "stats: " << tcpoutqueries << "/" << dotoutqueries << "/" << getCurrentIdleTCPConnections() << " outgoing tcp/dot/idle connections, " << broadcastAccFunction(pleaseGetConcurrentQueries) << " queries running, " << outgoingtimeouts << " outgoing timeouts " << endl; - g_log << Logger::Notice << "stats: " << pcSize << " packet cache entries, " << ratePercentage(pcHits, g_stats.qcounter) << "% packet cache hits" << endl; + g_log << Logger::Notice << "stats: " << pcSize << " packet cache entries, " << ratePercentage(pcHits, qcounter) << "% packet cache hits" << endl; g_log << Logger::Notice << "stats: tasks pushed/expired/queuesize: " << taskPushes << '/' << taskExpired << '/' << taskSize << endl; } else { const string m = "Periodic statistics report"; log->info(Logr::Info, m, - "questions", Logging::Loggable(g_stats.qcounter), + "questions", Logging::Loggable(qcounter), "cache-entries", Logging::Loggable(cacheSize), "negcache-entries", Logging::Loggable(negCacheSize), "record-cache-hitratio-perc", Logging::Loggable(ratePercentage(cacheHits, cacheHits + cacheMisses)), @@ -1035,17 +1043,17 @@ static void doStats(void) "edns-entries", Logging::Loggable(SyncRes::getEDNSStatusesSize()), "non-resolving-nameserver-entries", Logging::Loggable(SyncRes::getNonResolvingNSSize()), "saved-parent-ns-sets-entries", Logging::Loggable(SyncRes::getSaveParentsNSSetsSize()), - "outqueries-per-query", Logging::Loggable(ratePercentage(SyncRes::s_outqueries, SyncRes::s_queries))); + "outqueries-per-query", Logging::Loggable(ratePercentage(outqueries, syncresqueries))); log->info(Logr::Info, m, - "throttled-queries-perc", Logging::Loggable(ratePercentage(SyncRes::s_throttledqueries, SyncRes::s_outqueries + SyncRes::s_throttledqueries)), - "tcp-outqueries", Logging::Loggable(SyncRes::s_tcpoutqueries), - "dot-outqueries", Logging::Loggable(SyncRes::s_dotoutqueries), + "throttled-queries-perc", 
Logging::Loggable(ratePercentage(throttledqueries, outqueries + throttledqueries)), + "tcp-outqueries", Logging::Loggable(tcpoutqueries), + "dot-outqueries", Logging::Loggable(dotoutqueries), "idle-tcpout-connections", Logging::Loggable(getCurrentIdleTCPConnections()), "concurrent-queries", Logging::Loggable(broadcastAccFunction(pleaseGetConcurrentQueries)), - "outgoing-timeouts", Logging::Loggable(SyncRes::s_outgoingtimeouts)); + "outgoing-timeouts", Logging::Loggable(outgoingtimeouts)); log->info(Logr::Info, m, "packetcache-entries", Logging::Loggable(pcSize), - "packetcache-hitratio-perc", Logging::Loggable(ratePercentage(pcHits, g_stats.qcounter)), + "packetcache-hitratio-perc", Logging::Loggable(ratePercentage(pcHits, qcounter)), "taskqueue-pushed", Logging::Loggable(taskPushes), "taskqueue-expired", Logging::Loggable(taskExpired), "taskqueue-size", Logging::Loggable(taskSize)); @@ -1060,12 +1068,12 @@ static void doStats(void) } time_t now = time(0); if (lastOutputTime && lastQueryCount && now != lastOutputTime) { - SLOG(g_log << Logger::Notice << "stats: " << (g_stats.qcounter - lastQueryCount) / (now - lastOutputTime) << " qps (average over " << (now - lastOutputTime) << " seconds)" << endl, - log->info(Logr::Info, "Periodic QPS report", "qps", Logging::Loggable((g_stats.qcounter - lastQueryCount) / (now - lastOutputTime)), + SLOG(g_log << Logger::Notice << "stats: " << (qcounter - lastQueryCount) / (now - lastOutputTime) << " qps (average over " << (now - lastOutputTime) << " seconds)" << endl, + log->info(Logr::Info, "Periodic QPS report", "qps", Logging::Loggable((qcounter - lastQueryCount) / (now - lastOutputTime)), "averagedOver", Logging::Loggable(now - lastOutputTime))); } lastOutputTime = now; - lastQueryCount = g_stats.qcounter; + lastQueryCount = qcounter; } else if (statsWanted) { SLOG(g_log << Logger::Notice << "stats: no stats yet!" 
<< endl, @@ -1354,6 +1362,7 @@ static int serviceMain(int argc, char* argv[], Logr::log_t log) g_log.setName(g_programname); g_log.disableSyslog(::arg().mustDo("disable-syslog")); g_log.setTimestamps(::arg().mustDo("log-timestamp")); + g_regressionTestMode = ::arg().mustDo("devonly-regression-test-mode"); if (!::arg()["logging-facility"].empty()) { int val = logFacilityToLOG(::arg().asNum("logging-facility")); @@ -1950,6 +1959,9 @@ static int serviceMain(int argc, char* argv[], Logr::log_t log) disableStats(StatComponent::RecControl, ::arg()["stats-rec-control-disabled-list"]); disableStats(StatComponent::SNMP, ::arg()["stats-snmp-disabled-list"]); + // Run before any thread doing stats related things + registerAllStats(); + if (::arg().mustDo("snmp-agent")) { #ifdef HAVE_NET_SNMP string setting = ::arg()["snmp-daemon-socket"]; @@ -2120,6 +2132,7 @@ static void houseKeeping(void*) struct timeval now; Utility::gettimeofday(&now); + t_Counters.updateSnap(now, g_regressionTestMode); // Below are the tasks that run for every recursorThread, including handler and taskThread if (t_packetCache) { @@ -2225,7 +2238,10 @@ static void houseKeeping(void*) const unsigned int minRootRefreshInterval = 10; static PeriodicTask rootUpdateTask{"rootUpdateTask", std::max(SyncRes::s_maxcachettl * 8 / 10, minRootRefreshInterval)}; rootUpdateTask.runIfDue(now, [now, &log, minRootRefreshInterval]() { - int res = SyncRes::getRootNS(now, nullptr, 0, log); + int res = 0; + if (!g_regressionTestMode) { + res = SyncRes::getRootNS(now, nullptr, 0, log); + } if (res == 0) { // Success, go back to the defaut period rootUpdateTask.setPeriod(std::max(SyncRes::s_maxcachettl * 8 / 10, minRootRefreshInterval)); @@ -2292,6 +2308,7 @@ static void houseKeeping(void*) } }); } + t_Counters.updateSnap(g_regressionTestMode); t_running = false; } catch (const PDNSException& ae) { @@ -2311,6 +2328,7 @@ static void houseKeeping(void*) static void recursorThread() { auto log = g_slog->withName("runtime"); + t_Counters.updateSnap(true); try { auto& threadInfo = RecThreadInfo::self(); { @@ -2443,8 +2461,6 @@ static void recursorThread() } } - registerAllStats(); - if (threadInfo.isHandler()) { t_fdm->addReadFD(g_rcc.d_fd, handleRCC); // control channel } @@ -2457,7 +2473,6 @@ static void recursorThread() time_t last_carbon = 0, last_lua_maintenance = 0; time_t carbonInterval = ::arg().asNum("carbon-interval"); time_t luaMaintenanceInterval = ::arg().asNum("lua-maintenance-interval"); - s_counter.store(0); // used to periodically execute certain tasks #ifdef HAVE_SYSTEMD if (threadInfo.isHandler()) { @@ -2467,7 +2482,6 @@ static void recursorThread() sd_notify(0, "READY=1"); } #endif - while (!RecursorControlChannel::stop) { while (MT->schedule(&g_now)) ; // MTasker letting the mthreads do their thing @@ -2481,8 +2495,8 @@ static void recursorThread() if (!threadInfo.isTaskThread()) { struct timeval stop; Utility::gettimeofday(&stop); - g_stats.maintenanceUsec += uSec(stop - start); - ++g_stats.maintenanceCalls; + t_Counters.at(rec::Counter::maintenanceUsec) += uSec(stop - start); + ++t_Counters.at(rec::Counter::maintenanceCalls); } } @@ -2526,8 +2540,8 @@ static void recursorThread() last_lua_maintenance = g_now.tv_sec; struct timeval stop; Utility::gettimeofday(&stop); - g_stats.maintenanceUsec += uSec(stop - start); - ++g_stats.maintenanceCalls; + t_Counters.at(rec::Counter::maintenanceUsec) += uSec(stop - start); + ++t_Counters.at(rec::Counter::maintenanceCalls); } } } @@ -2587,6 +2601,8 @@ int main(int argc, char** argv) #else 
::arg().set("stack-size", "stack size per mthread") = "200000"; #endif + // This mode forces metrics snap updates and dsiable root-refresh, to get consistent counters + ::arg().setSwitch("devonly-regression-test-mode", "internal use only") = "no"; ::arg().set("soa-minimum-ttl", "Don't change") = "0"; ::arg().set("no-shuffle", "Don't change") = "off"; ::arg().set("local-port", "port to listen on") = "53"; diff --git a/pdns/recursordist/rec-main.hh b/pdns/recursordist/rec-main.hh index 6c231f7fbe..cb3787a8ae 100644 --- a/pdns/recursordist/rec-main.hh +++ b/pdns/recursordist/rec-main.hh @@ -154,6 +154,7 @@ struct DNSComboWriter extern thread_local unique_ptr t_fdm; extern uint16_t g_minUdpSourcePort; extern uint16_t g_maxUdpSourcePort; +extern bool g_regressionTestMode; // you can ask this class for a UDP socket to send a query from // this socket is not yours, don't even think about deleting it diff --git a/pdns/recursordist/rec-tcounters.cc b/pdns/recursordist/rec-tcounters.cc new file mode 100644 index 0000000000..b40815a8cc --- /dev/null +++ b/pdns/recursordist/rec-tcounters.cc @@ -0,0 +1,156 @@ +/* + * This file is part of PowerDNS or dnsdist. + * Copyright -- PowerDNS.COM B.V. and its contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In addition, for the avoidance of any doubt, permission is granted to + * link this program with OpenSSL and to (re)distribute the binaries + * produced as the result of such linking. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "rec-tcounters.hh" + +#include + +namespace rec +{ + +Counters& Counters::merge(const Counters& data) +{ + // Counters are simply added + for (size_t i = 0; i < uint64Count.size(); i++) { + uint64Count.at(i) += data.uint64Count.at(i); + } + // Averages: take weight into account + for (size_t i = 0; i < doubleWAvg.size(); i++) { + auto& lhs = doubleWAvg.at(i); + const auto& rhs = data.doubleWAvg.at(i); + auto weight = lhs.weight + rhs.weight; + auto avg = lhs.avg * static_cast(lhs.weight) + rhs.avg * static_cast(rhs.weight); + avg = weight == 0 ? 
0 : avg / weight; + lhs.avg = avg; + lhs.weight = weight; + } + // Rcode Counters are simply added + for (size_t i = 0; i < auth.rcodeCounters.size(); i++) { + auth.rcodeCounters.at(i) += data.auth.rcodeCounters.at(i); + } + // Histograms counts are added by += operator on Histograms + for (size_t i = 0; i < histograms.size(); i++) { + histograms.at(i) += data.histograms.at(i); + } + + return *this; +} + +std::string Counters::toString() const +{ + std::ostringstream stream; + + for (auto element : uint64Count) { + stream << element << ' '; + } + stream << std::endl; + for (auto element : doubleWAvg) { + stream << '(' << element.avg << ' ' << element.weight << ')'; + } + stream << " RCodes: "; + for (auto element : auth.rcodeCounters) { + stream << element << ' '; + } + stream << "Histograms: "; + for (const auto& element : histograms) { + stream << element.getName() << ": NYI"; + } + stream << std::endl; + return stream.str(); +} + +} + +// Compile with: +// c++ -DTEST_TCOUNTER_TIMING -Wall -std=c++17 -O2 rec-tcounters.cc -pthread + +#if TEST_TCOUNTER_TIMING + +#include +#include +#include +#include +#include + +rec::GlobalCounters g_counters; +thread_local rec::TCounters t_counters(g_counters); + +std::atomic atomicCounter; + +size_t iterations; + +void atomicThread() +{ + for (size_t i = 0; i < iterations; i++) { + ++atomicCounter; + } +} + +void tcounterThread() +{ + for (size_t i = 0; i < iterations; i++) { + ++t_counters.at(rec::Counter::qcounter); + if (i % 100 == 0) { + t_counters.updateSnap(); + } + } +} + +int main(int argc, char* argv[]) +{ + size_t threads = std::atoi(argv[1]); + iterations = std::atoi(argv[2]); + + std::cout << "Starting " << threads << " threads doing " << iterations << " iterations using atomics" << std::endl; + std::vector thr; + thr.resize(threads); + + timeval start; + gettimeofday(&start, nullptr); + for (size_t i = 0; i < threads; i++) { + thr[i] = std::thread(atomicThread); + } + for (size_t i = 0; i < threads; i++) { + thr[i].join(); + } + timeval stop; + gettimeofday(&stop, nullptr); + timeval diff; + timersub(&stop, &start, &diff); + auto elapsed = (diff.tv_sec + diff.tv_usec / 1e6); + std::cout << "Sum is " << atomicCounter << " elapsed is " << elapsed << std::endl; + + std::cout << "Now doing the same with tcounters" << std::endl; + gettimeofday(&start, nullptr); + for (size_t i = 0; i < threads; i++) { + thr[i] = std::thread(tcounterThread); + } + for (size_t i = 0; i < threads; i++) { + thr[i].join(); + } + gettimeofday(&stop, nullptr); + timersub(&stop, &start, &diff); + elapsed = (diff.tv_sec + diff.tv_usec / 1e6); + std::cout << "Sum is " << g_counters.sum(rec::Counter::qcounter) << " elapsed is " << elapsed << std::endl; +} + +#endif diff --git a/pdns/recursordist/rec-tcounters.hh b/pdns/recursordist/rec-tcounters.hh new file mode 100644 index 0000000000..d315192b38 --- /dev/null +++ b/pdns/recursordist/rec-tcounters.hh @@ -0,0 +1,219 @@ +/* + * This file is part of PowerDNS or dnsdist. + * Copyright -- PowerDNS.COM B.V. and its contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In addition, for the avoidance of any doubt, permission is granted to + * link this program with OpenSSL and to (re)distribute the binaries + * produced as the result of such linking. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#pragma once + +#include "tcounters.hh" + +#include + +#include "histogram.hh" + +namespace rec +{ + +// Simple counters +enum class Counter : uint8_t +{ + syncresqueries, + outgoingtimeouts, + outgoing4timeouts, + outgoing6timeouts, + throttledqueries, + dontqueries, + qnameminfallbacksuccess, + authzonequeries, + outqueries, + tcpoutqueries, + dotoutqueries, + unreachables, + servFails, + nxDomains, + noErrors, + qcounter, + ipv6qcounter, + tcpqcounter, + unauthorizedUDP, // when this is increased, qcounter isn't + unauthorizedTCP, // when this is increased, qcounter isn't + sourceDisallowedNotify, // when this is increased, qcounter is also + zoneDisallowedNotify, // when this is increased, qcounter is also + policyDrops, + tcpClientOverflow, + clientParseError, + serverParseError, + tooOldDrops, + truncatedDrops, + queryPipeFullDrops, + unexpectedCount, + caseMismatchCount, + spoofCount, + resourceLimits, + overCapacityDrops, + ipv6queries, + chainResends, + nsSetInvalidations, + ednsPingMatches, + ednsPingMismatches, + noPingOutQueries, + noEdnsOutQueries, + packetCacheHits, + noPacketError, + ignoredCount, + emptyQueriesCount, + dnssecQueries, + dnssecAuthenticDataQueries, + dnssecCheckDisabledQueries, + variableResponses, + maxMThreadStackUsage, + dnssecValidations, // should be the sum of all dnssecResult* stats + rebalancedQueries, + proxyProtocolInvalidCount, + nodLookupsDroppedOversize, + dns64prefixanswers, + maintenanceUsec, + maintenanceCalls, + + numberOfCounters +}; + +// double avegares times, weighted according to how many packets they processed +enum class DoubleWAvgCounter : uint8_t +{ + avgLatencyUsec, + avgLatencyOursUsec, + numberOfCounters +}; + +// An RCode histogram +enum class RCode : uint8_t +{ + auth, + numberOfCounters +}; + +// A few other histograms +enum class Histogram : uint8_t +{ + answers, + auth4Answers, + auth6Answers, + ourtime, + cumulativeAnswers, + cumulativeAuth4Answers, + cumulativeAuth6Answers, + + numberOfCounters +}; + +struct Counters +{ + // An aray of simple counters + std::array(Counter::numberOfCounters)> uint64Count{}; + + struct WeightedAverage + { + double avg{}; + uint64_t weight{}; + + void add(double value) + { + avg = value; + ++weight; + } + + void addToRollingAvg(double value, uint64_t rollsize) + { + add((1.0 - 1.0 / static_cast(rollsize)) * avg + value / static_cast(rollsize)); + } + }; + // And an array of weighted averaged values + std::array(DoubleWAvgCounter::numberOfCounters)> doubleWAvg{}; + + struct RCodeCounters + { + RCodeCounters& operator+=(const RCodeCounters& rhs) + { + for (size_t i = 0; i < rcodeCounters.size(); i++) { + rcodeCounters.at(i) += rhs.rcodeCounters.at(i); + } + return *this; + } + static const size_t numberoOfRCodes = 16; + std::array rcodeCounters; + }; + // An RCodes histogram + RCodeCounters auth{}; + + std::array(Histogram::numberOfCounters)> histograms = { + pdns::Histogram{"answers", {1000, 10000, 100000, 1000000}}, + pdns::Histogram{"auth4answers", {1000, 10000, 100000, 1000000}}, + pdns::Histogram{"auth6answers", {1000, 
10000, 100000, 1000000}}, + pdns::Histogram{"ourtime", {1000, 2000, 4000, 8000, 16000, 32000}}, + pdns::Histogram{"cumul-clientanswers-", 10, 19}, + pdns::Histogram{"cumul-authanswers-", 1000, 13}, + pdns::Histogram{"cumul-authanswers-", 1000, 13}}; + + Counters() + { + for (auto& elem : uint64Count) { + elem = 0; + } + // doubleWAvg has a default ct that initializes + for (auto& elem : auth.rcodeCounters) { + elem = 0; + } + } + + // Merge a set of counters into an existing set of counters. For simple counters, that will be additions + // for averages, we should take the weights into account. Histograms need to sum all individual counts. + Counters& merge(const Counters& data); + + // The following accessors select the rightcounter type based on the index type + uint64_t& at(Counter index) + { + return uint64Count.at(static_cast(index)); + } + + WeightedAverage& at(DoubleWAvgCounter index) + { + return doubleWAvg.at(static_cast(index)); + } + + RCodeCounters& at(RCode index) + { + // We only have a single RCode indexed Histogram, so no need to select a specific one + return auth; + } + + pdns::Histogram& at(Histogram index) + { + return histograms.at(static_cast(index)); + } + + // Mainly for debugging purposes + [[nodiscard]] std::string toString() const; +}; + +// The application specific types, one for thread local, one for the aggregator +using TCounters = pdns::TLocalCounters; +using GlobalCounters = pdns::GlobalCounters; +} diff --git a/pdns/recursordist/rec-tcp.cc b/pdns/recursordist/rec-tcp.cc index e1bc143b6e..a0b1972e06 100644 --- a/pdns/recursordist/rec-tcp.cc +++ b/pdns/recursordist/rec-tcp.cc @@ -229,7 +229,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) SLOG(g_log << Logger::Error << "Unable to consume proxy protocol header in packet from TCP client " << conn->d_remote.toStringWithPort() << endl, g_slogtcpin->info(Logr::Error, "Unable to consume proxy protocol header in packet from TCP client", "remote", Logging::Loggable(conn->d_remote))); } - ++g_stats.proxyProtocolInvalidCount; + ++t_Counters.at(rec::Counter::proxyProtocolInvalidCount); return; } else if (remaining < 0) { @@ -250,7 +250,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) SLOG(g_log << Logger::Error << "Unable to parse proxy protocol header in packet from TCP client " << conn->d_remote.toStringWithPort() << endl, g_slogtcpin->info(Logr::Error, "Unable to parse proxy protocol header in packet from TCP client", "remote", Logging::Loggable(conn->d_remote))); } - ++g_stats.proxyProtocolInvalidCount; + ++t_Counters.at(rec::Counter::proxyProtocolInvalidCount); return; } else if (static_cast(used) > g_proxyProtocolMaximumSize) { @@ -258,7 +258,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) SLOG(g_log << Logger::Error << "Proxy protocol header in packet from TCP client " << conn->d_remote.toStringWithPort() << " is larger than proxy-protocol-maximum-size (" << used << "), dropping" << endl, g_slogtcpin->info(Logr::Error, "Proxy protocol header in packet from TCP client is larger than proxy-protocol-maximum-size", "remote", Logging::Loggable(conn->d_remote), "size", Logging::Loggable(used))); } - ++g_stats.proxyProtocolInvalidCount; + ++t_Counters.at(rec::Counter::proxyProtocolInvalidCount); return; } @@ -278,7 +278,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogtcpin->info(Logr::Error, "Dropping TCP query, address not matched by allow-from", "remote", 
Logging::Loggable(conn->d_remote))); } - ++g_stats.unauthorizedTCP; + ++t_Counters.at(rec::Counter::unauthorizedTCP); return; } @@ -348,7 +348,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) dc = std::make_unique(conn->data, g_now, t_pdl); } catch (const MOADNSException& mde) { - g_stats.clientParseError++; + t_Counters.at(rec::Counter::clientParseError)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Unable to parse packet from TCP client " << conn->d_remote.toStringWithPort() << endl, g_slogtcpin->info(Logr::Error, "Unable to parse packet from TCP client", "remte", Logging::Loggable(conn->d_remote))); @@ -471,13 +471,13 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) SLOG(g_log << Logger::Notice << RecThreadInfo::id() << " [" << MT->getTid() << "/" << MT->numProcesses() << "] DROPPED TCP question from " << dc->d_source.toStringWithPort() << (dc->d_source != dc->d_remote ? " (via " + dc->d_remote.toStringWithPort() + ")" : "") << " based on policy" << endl, g_slogtcpin->info(Logr::Info, "Dropped TCP question based on policy", "remote", Logging::Loggable(conn->d_remote), "source", Logging::Loggable(dc->d_source))); } - g_stats.policyDrops++; + t_Counters.at(rec::Counter::policyDrops)++; return; } } if (dc->d_mdp.d_header.qr) { - g_stats.ignoredCount++; + t_Counters.at(rec::Counter::ignoredCount)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Ignoring answer from TCP client " << dc->getRemote() << " on server socket!" << endl, g_slogtcpin->info(Logr::Error, "Ignoring answer from TCP client on server socket", "remote", Logging::Loggable(dc->getRemote()))); @@ -485,7 +485,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) return; } if (dc->d_mdp.d_header.opcode != Opcode::Query && dc->d_mdp.d_header.opcode != Opcode::Notify) { - g_stats.ignoredCount++; + t_Counters.at(rec::Counter::ignoredCount)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Ignoring unsupported opcode " << Opcode::to_s(dc->d_mdp.d_header.opcode) << " from TCP client " << dc->getRemote() << " on server socket!" << endl, g_slogtcpin->info(Logr::Error, "Ignoring unsupported opcode from TCP client", "remote", Logging::Loggable(dc->getRemote()), "opcode", Logging::Loggable(Opcode::to_s(dc->d_mdp.d_header.opcode)))); @@ -495,7 +495,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) return; } else if (dh->qdcount == 0) { - g_stats.emptyQueriesCount++; + t_Counters.at(rec::Counter::emptyQueriesCount)++; if (g_logCommonErrors) { SLOG(g_log << Logger::Error << "Ignoring empty (qdcount == 0) query from " << dc->getRemote() << " on server socket!" 
<< endl, g_slogtcpin->info(Logr::Error, "Ignoring empty (qdcount == 0) query on server socket", "remote", Logging::Loggable(dc->getRemote()))); @@ -506,8 +506,9 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) } else { // We have read a proper query - ++g_stats.qcounter; - ++g_stats.tcpqcounter; + //++t_Counters.at(rec::Counter::qcounter); + ++t_Counters.at(rec::Counter::qcounter); + ++t_Counters.at(rec::Counter::tcpqcounter); if (dc->d_mdp.d_header.opcode == Opcode::Notify) { if (!t_allowNotifyFrom || !t_allowNotifyFrom->match(dc->d_mappedSource)) { @@ -516,7 +517,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogtcpin->info(Logr::Error, "Dropping TCP NOTIFY, address not matched by allow-notify-from", "source", Logging::Loggable(dc->d_mappedSource))); } - g_stats.sourceDisallowedNotify++; + t_Counters.at(rec::Counter::sourceDisallowedNotify)++; return; } @@ -526,7 +527,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogtcpin->info(Logr::Error, "Dropping TCP NOTIFY, zone not matched by allow-notify-for", "source", Logging::Loggable(dc->d_mappedSource), "zone", Logging::Loggable(qname))); } - g_stats.zoneDisallowedNotify++; + t_Counters.at(rec::Counter::zoneDisallowedNotify)++; return; } } @@ -555,7 +556,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) struct timeval now; Utility::gettimeofday(&now, nullptr); uint64_t spentUsec = uSec(now - start); - g_stats.cumulativeAnswers(spentUsec); + t_Counters.at(rec::Histogram::cumulativeAnswers)(spentUsec); dc->d_eventTrace.add(RecEventTrace::AnswerSent); if (t_protobufServers.servers && dc->d_logResponse && !(luaconfsLocal->protobufExportConfig.taggedOnly && pbData && !pbData->d_tagged)) { @@ -571,6 +572,7 @@ static void handleRunningTCPQuestion(int fd, FDMultiplexer::funcparam_t& var) g_slogtcpin->info(Logr::Info, dc->d_eventTrace.toString())); // More fancy? } tcpGuard.keep(); + t_Counters.updateSnap(g_regressionTestMode); return; } // cache hit } // query opcode @@ -618,7 +620,7 @@ void handleNewTCPQuestion(int fd, FDMultiplexer::funcparam_t&) int newsock = accept(fd, (struct sockaddr*)&addr, &addrlen); if (newsock >= 0) { if (MT->numProcesses() > g_maxMThreads) { - g_stats.overCapacityDrops++; + t_Counters.at(rec::Counter::overCapacityDrops)++; try { closesocket(newsock); } @@ -646,7 +648,7 @@ void handleNewTCPQuestion(int fd, FDMultiplexer::funcparam_t&) SLOG(g_log << Logger::Error << "[" << MT->getTid() << "] dropping TCP query from " << mappedSource.toString() << ", address neither matched by allow-from nor proxy-protocol-from" << endl, g_slogtcpin->info(Logr::Error, "dropping TCP query address neither matched by allow-from nor proxy-protocol-from", "source", Logging::Loggable(mappedSource))); - g_stats.unauthorizedTCP++; + t_Counters.at(rec::Counter::unauthorizedTCP)++; try { closesocket(newsock); } @@ -658,7 +660,7 @@ void handleNewTCPQuestion(int fd, FDMultiplexer::funcparam_t&) } if (g_maxTCPPerClient && t_tcpClientCounts->count(addr) && (*t_tcpClientCounts)[addr] >= g_maxTCPPerClient) { - g_stats.tcpClientOverflow++; + t_Counters.at(rec::Counter::tcpClientOverflow)++; try { closesocket(newsock); // don't call TCPConnection::closeAndCleanup here - did not enter it in the counts yet! 
} diff --git a/pdns/recursordist/tcounters.hh b/pdns/recursordist/tcounters.hh new file mode 120000 index 0000000000..7bf637d71b --- /dev/null +++ b/pdns/recursordist/tcounters.hh @@ -0,0 +1 @@ +../tcounters.hh \ No newline at end of file diff --git a/pdns/recursordist/test-rec-tcounters_cc.cc b/pdns/recursordist/test-rec-tcounters_cc.cc new file mode 100644 index 0000000000..a86cab0d34 --- /dev/null +++ b/pdns/recursordist/test-rec-tcounters_cc.cc @@ -0,0 +1,149 @@ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_NO_MAIN + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include +#include +#include "rec-tcounters.hh" + +static rec::GlobalCounters global; +static thread_local rec::TCounters tlocal(global); + +BOOST_AUTO_TEST_SUITE(test_rec_tcounters_cc) + +BOOST_AUTO_TEST_CASE(destruct) +{ + global.reset(); + + const size_t count = 100000; + std::thread thread1([] { + for (size_t i = 0; i < count; i++) { + ++tlocal.at(rec::Counter::servFails); + } + }); + std::thread thread2([] { + for (size_t i = 0; i < count; i++) { + ++tlocal.at(rec::Counter::nxDomains); + } + }); + thread1.join(); + thread2.join(); + BOOST_CHECK_EQUAL(global.sum(rec::Counter::servFails), count); + BOOST_CHECK_EQUAL(global.sum(rec::Counter::nxDomains), count); +} + +BOOST_AUTO_TEST_CASE(update_fast) +{ + global.reset(); + + std::atomic done{}; + + const size_t count = 10000000; + std::thread thread1([&done] { + for (size_t i = 0; i < count; i++) { + ++tlocal.at(rec::Counter::servFails); + ++tlocal.at(rec::Counter::nxDomains); + tlocal.at(rec::DoubleWAvgCounter::avgLatencyUsec).add(1.1); + if (random() % 10000 == 0) { + tlocal.updateSnap(); + } + } + done++; + }); + std::thread thread2([&done] { + for (size_t i = 0; i < count / 2; i++) { + ++tlocal.at(rec::Counter::servFails); + ++tlocal.at(rec::Counter::nxDomains); + tlocal.at(rec::DoubleWAvgCounter::avgLatencyUsec).add(2.2); + if (random() % 10000 == 0) { + tlocal.updateSnap(); + } + } + done++; + }); + std::thread thread3([&done] { + while (done < 2) { + auto counts = global.aggregatedSnap(); + BOOST_CHECK_EQUAL(counts.uint64Count[0], counts.uint64Count[1]); + auto avg = counts.at(rec::DoubleWAvgCounter::avgLatencyUsec).avg; + BOOST_CHECK(avg == 0.0 || (avg >= 1.1 && avg <= 2.2)); + } + }); + thread1.join(); + thread2.join(); + thread3.join(); + BOOST_CHECK_EQUAL(global.sum(rec::Counter::servFails), count + count / 2); + BOOST_CHECK_EQUAL(global.sum(rec::Counter::nxDomains), count + count / 2); + auto avg = global.avg(rec::DoubleWAvgCounter::avgLatencyUsec); + BOOST_CHECK(avg >= 1.1 && avg <= 2.2); +} + +BOOST_AUTO_TEST_CASE(update_with_sleep) +{ + + global.reset(); + + std::atomic done{}; + + const size_t count = 1000000; + std::thread thread1([&done] { + for (size_t i = 0; i < count; i++) { + ++tlocal.at(rec::Counter::servFails); + ++tlocal.at(rec::Counter::nxDomains); + tlocal.at(rec::DoubleWAvgCounter::avgLatencyUsec).add(1.1); + if (random() % 10000 == 0) { + tlocal.updateSnap(); + } + struct timespec interval + { + 0, random() % 5000 + }; + nanosleep(&interval, nullptr); + } + done++; + }); + std::thread thread2([&done] { + for (size_t i = 0; i < count / 2; i++) { + ++tlocal.at(rec::Counter::servFails); + ++tlocal.at(rec::Counter::nxDomains); + tlocal.at(rec::DoubleWAvgCounter::avgLatencyUsec).add(2.2); + if (random() % 10000 == 0) { + tlocal.updateSnap(); + } + struct timespec interval + { + 0, random() % 999 + }; + nanosleep(&interval, nullptr); + } + done++; + }); + std::thread thread3([&done] { + while (done < 2) { + auto counts = 
global.aggregatedSnap(); + BOOST_CHECK_EQUAL(counts.uint64Count[0], counts.uint64Count[1]); + auto avg = counts.at(rec::DoubleWAvgCounter::avgLatencyUsec).avg; + // std::cerr << avg << std::endl; + BOOST_CHECK(avg == 0.0 || (avg >= 1.1 && avg <= 2.2)); + struct timespec interval + { + 0, random() % 10000 + }; + nanosleep(&interval, nullptr); + } + }); + thread1.join(); + thread2.join(); + thread3.join(); + BOOST_CHECK_EQUAL(global.sum(rec::Counter::servFails), count + count / 2); + BOOST_CHECK_EQUAL(global.sum(rec::Counter::nxDomains), count + count / 2); + auto avg = global.avg(rec::DoubleWAvgCounter::avgLatencyUsec); + BOOST_CHECK(avg >= 1.1 && avg <= 2.2); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/pdns/recursordist/test-syncres_cc.cc b/pdns/recursordist/test-syncres_cc.cc index ae2c062132..4d47fcb0fc 100644 --- a/pdns/recursordist/test-syncres_cc.cc +++ b/pdns/recursordist/test-syncres_cc.cc @@ -29,10 +29,6 @@ ArgvMap& arg() return theArg; } -void primeRootNSZones(DNSSECMode, unsigned int) -{ -} - BaseLua4::~BaseLua4() { } diff --git a/pdns/syncres.cc b/pdns/syncres.cc index 5fe1dbf69b..039ae29801 100644 --- a/pdns/syncres.cc +++ b/pdns/syncres.cc @@ -38,6 +38,9 @@ #include "validate-recursor.hh" #include "rec-taskqueue.hh" +rec::GlobalCounters g_Counters; +thread_local rec::TCounters t_Counters(g_Counters); + template class fails_t : public boost::noncopyable { @@ -427,18 +430,6 @@ unsigned int SyncRes::s_serverdownthrottletime; unsigned int SyncRes::s_nonresolvingnsmaxfails; unsigned int SyncRes::s_nonresolvingnsthrottletime; unsigned int SyncRes::s_ecscachelimitttl; -pdns::stat_t SyncRes::s_authzonequeries; -pdns::stat_t SyncRes::s_queries; -pdns::stat_t SyncRes::s_outgoingtimeouts; -pdns::stat_t SyncRes::s_outgoing4timeouts; -pdns::stat_t SyncRes::s_outgoing6timeouts; -pdns::stat_t SyncRes::s_outqueries; -pdns::stat_t SyncRes::s_tcpoutqueries; -pdns::stat_t SyncRes::s_dotoutqueries; -pdns::stat_t SyncRes::s_throttledqueries; -pdns::stat_t SyncRes::s_dontqueries; -pdns::stat_t SyncRes::s_qnameminfallbacksuccess; -pdns::stat_t SyncRes::s_unreachables; pdns::stat_t SyncRes::s_ecsqueries; pdns::stat_t SyncRes::s_ecsresponses; std::map SyncRes::s_ecsResponsesBySubnetSize4; @@ -486,11 +477,11 @@ static inline std::string fmtfloat(const char* fmt, double f) static inline void accountAuthLatency(uint64_t usec, int family) { if (family == AF_INET) { - g_stats.auth4Answers(usec); - g_stats.cumulativeAuth4Answers(usec); + t_Counters.at(rec::Histogram::auth4Answers)(usec); + t_Counters.at(rec::Histogram::cumulativeAuth4Answers)(usec); } else { - g_stats.auth6Answers(usec); - g_stats.cumulativeAuth6Answers(usec); + t_Counters.at(rec::Histogram::auth6Answers)(usec); + t_Counters.at(rec::Histogram::cumulativeAuth6Answers)(usec); } } @@ -694,7 +685,7 @@ int SyncRes::beginResolve(const DNSName &qname, const QType qtype, QClass qclass { d_eventTrace.add(RecEventTrace::SyncRes); vState state = vState::Indeterminate; - s_queries++; + t_Counters.at(rec::Counter::syncresqueries)++; d_wasVariable=false; d_wasOutOfBand=false; d_cutStates.clear(); @@ -726,7 +717,7 @@ int SyncRes::beginResolve(const DNSName &qname, const QType qtype, QClass qclass if (shouldValidate()) { if (d_queryValidationState != vState::Indeterminate) { - g_stats.dnssecValidations++; + t_Counters.at(rec::Counter::dnssecValidations)++; } auto xdnssec = g_xdnssec.getLocal(); if (xdnssec->check(qname)) { @@ -993,7 +984,7 @@ int SyncRes::AuthDomain::getRecords(const DNSName& qname, const QType qtype, std bool 
SyncRes::doOOBResolve(const AuthDomain& domain, const DNSName &qname, const QType qtype, vector&ret, int& res) { d_authzonequeries++; - s_authzonequeries++; + t_Counters.at(rec::Counter::authzonequeries)++; res = domain.getRecords(qname, qtype, ret); return true; @@ -1516,7 +1507,7 @@ LWResult::Result SyncRes::asyncresolveWrapper(const ComboAddress& ip, bool ednsM for (int tries = 0; tries < 2; ++tries) { if (mode == EDNSStatus::NOEDNS) { - g_stats.noEdnsOutQueries++; + t_Counters.at(rec::Counter::noEdnsOutQueries)++; EDNSLevel = 0; // level != mode } else if (ednsMANDATORY || mode != EDNSStatus::NOEDNS) { @@ -1772,7 +1763,7 @@ int SyncRes::doResolve(const DNSName &qname, const QType qtype, vector(lwr.d_rcode)); if (fromCache) *fromCache = true; @@ -3242,7 +3233,7 @@ void SyncRes::handlePolicyHit(const std::string& prefix, const DNSName& qname, c return; case DNSFilterEngine::PolicyKind::Drop: - ++g_stats.policyDrops; + ++t_Counters.at(rec::Counter::policyDrops); throw ImmediateQueryDropException(); case DNSFilterEngine::PolicyKind::NXDOMAIN: @@ -3417,12 +3408,14 @@ bool SyncRes::throttledOrBlocked(const std::string& prefix, const ComboAddress& { if (isThrottled(d_now.tv_sec, remoteIP)) { LOG(prefix<match(&remoteIP)) { @@ -3432,14 +3425,14 @@ bool SyncRes::throttledOrBlocked(const std::string& prefix, const ComboAddress& auto it = getBestAuthZone(&forwardCandidate); if (it == t_sstorage.domainmap->end()) { LOG(prefix<second.d_servers; if (std::find(ips.cbegin(), ips.cend(), remoteIP) == ips.cend()) { LOG(prefix<(lwr.d_rcode)); if (!dontThrottle) { auto dontThrottleNames = g_dontThrottleNames.getLocal(); @@ -5216,12 +5209,12 @@ bool SyncRes::doResolveAtThisIP(const std::string& prefix, const DNSName& qname, LOG(prefix<push_back(remoteIP); @@ -5229,14 +5222,14 @@ bool SyncRes::doResolveAtThisIP(const std::string& prefix, const DNSName& qname, else if (resolveret == LWResult::Result::OSLimitError) { /* OS resource limit reached */ LOG(prefix<doAgeCache(d_now.tv_sec, auth, QType::NS, 10)) - g_stats.nsSetInvalidations++; + t_Counters.at(rec::Counter::nsSetInvalidations)++; } return -1; } diff --git a/pdns/syncres.hh b/pdns/syncres.hh index 904089aa6d..66b2508d95 100644 --- a/pdns/syncres.hh +++ b/pdns/syncres.hh @@ -53,6 +53,7 @@ #include "tcpiohandler.hh" #include "rec-eventtrace.hh" #include "logr.hh" +#include "rec-tcounters.hh" #ifdef HAVE_CONFIG_H #include "config.h" @@ -458,18 +459,6 @@ public: static thread_local ThreadLocalStorage t_sstorage; - static pdns::stat_t s_queries; - static pdns::stat_t s_outgoingtimeouts; - static pdns::stat_t s_outgoing4timeouts; - static pdns::stat_t s_outgoing6timeouts; - static pdns::stat_t s_throttledqueries; - static pdns::stat_t s_dontqueries; - static pdns::stat_t s_qnameminfallbacksuccess; - static pdns::stat_t s_authzonequeries; - static pdns::stat_t s_outqueries; - static pdns::stat_t s_tcpoutqueries; - static pdns::stat_t s_dotoutqueries; - static pdns::stat_t s_unreachables; static pdns::stat_t s_ecsqueries; static pdns::stat_t s_ecsresponses; static std::map s_ecsResponsesBySubnetSize4; @@ -760,79 +749,17 @@ struct PacketIDBirthdayCompare }; extern std::unique_ptr g_recCache; +extern rec::GlobalCounters g_Counters; +extern thread_local rec::TCounters t_Counters; + struct RecursorStats { - pdns::stat_t servFails; - pdns::stat_t nxDomains; - pdns::stat_t noErrors; - pdns::AtomicHistogram answers; - pdns::AtomicHistogram auth4Answers; - pdns::AtomicHistogram auth6Answers; - pdns::AtomicHistogram ourtime; - pdns::AtomicHistogram cumulativeAnswers; - 
pdns::AtomicHistogram cumulativeAuth4Answers; - pdns::AtomicHistogram cumulativeAuth6Answers; - pdns::stat_t_trait avgLatencyUsec; - pdns::stat_t_trait avgLatencyOursUsec; - pdns::stat_t qcounter; // not increased for unauth packets - pdns::stat_t ipv6qcounter; - pdns::stat_t tcpqcounter; - pdns::stat_t unauthorizedUDP; // when this is increased, qcounter isn't - pdns::stat_t unauthorizedTCP; // when this is increased, qcounter isn't - pdns::stat_t sourceDisallowedNotify; // when this is increased, qcounter is also - pdns::stat_t zoneDisallowedNotify; // when this is increased, qcounter is also - pdns::stat_t policyDrops; - pdns::stat_t tcpClientOverflow; - pdns::stat_t clientParseError; - pdns::stat_t serverParseError; - pdns::stat_t tooOldDrops; - pdns::stat_t truncatedDrops; - pdns::stat_t queryPipeFullDrops; - pdns::stat_t unexpectedCount; - pdns::stat_t caseMismatchCount; - pdns::stat_t spoofCount; - pdns::stat_t resourceLimits; - pdns::stat_t overCapacityDrops; - pdns::stat_t ipv6queries; - pdns::stat_t chainResends; - pdns::stat_t nsSetInvalidations; - pdns::stat_t ednsPingMatches; - pdns::stat_t ednsPingMismatches; - pdns::stat_t noPingOutQueries, noEdnsOutQueries; - pdns::stat_t packetCacheHits; - pdns::stat_t noPacketError; - pdns::stat_t ignoredCount; - pdns::stat_t emptyQueriesCount; time_t startupTime{time(nullptr)}; - pdns::stat_t dnssecQueries; - pdns::stat_t dnssecAuthenticDataQueries; - pdns::stat_t dnssecCheckDisabledQueries; - pdns::stat_t variableResponses; - pdns::stat_t maxMThreadStackUsage; - pdns::stat_t dnssecValidations; // should be the sum of all dnssecResult* stats + // XXX Convert counter below to be part of rec::Counters std::map dnssecResults; std::map xdnssecResults; std::map policyResults; LockGuarded> policyHits; - pdns::stat_t rebalancedQueries{0}; - pdns::stat_t proxyProtocolInvalidCount{0}; - pdns::stat_t nodLookupsDroppedOversize{0}; - pdns::stat_t dns64prefixanswers{0}; - pdns::stat_t maintenanceUsec{0}; - pdns::stat_t maintenanceCalls{0}; - std::array authRCode; - - RecursorStats() : - answers("answers", { 1000, 10000, 100000, 1000000 }), - auth4Answers("auth4answers", { 1000, 10000, 100000, 1000000 }), - auth6Answers("auth6answers", { 1000, 10000, 100000, 1000000 }), - ourtime("ourtime", { 1000, 2000, 4000, 8000, 16000, 32000 }), - cumulativeAnswers("cumul-clientanswers-", 10, 19), - // These two will be merged when outputting - cumulativeAuth4Answers("cumul-authanswers-", 1000, 13), - cumulativeAuth6Answers("cumul-authanswers-", 1000, 13) - { - } }; //! represents a running TCP/IP client session diff --git a/pdns/tcounters.hh b/pdns/tcounters.hh new file mode 100644 index 0000000000..c6956c68cb --- /dev/null +++ b/pdns/tcounters.hh @@ -0,0 +1,260 @@ +/* + * This file is part of PowerDNS or dnsdist. + * Copyright -- PowerDNS.COM B.V. and its contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In addition, for the avoidance of any doubt, permission is granted to + * link this program with OpenSSL and to (re)distribute the binaries + * produced as the result of such linking. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#pragma once + +#include +#include +#include +#include + +#include "lock.hh" + +namespace pdns +{ +// We keep three sets of related counters: +// +// 1. The current counters (thread-local, updated individually by thread code very often) +// 2. The snapshot counters (thread local, updated by thread code in one single mutex protected copy) +// 3. The history counters (global) to keep track of the counters of deleted threads + +// We have two main classes: one that holds the thread local counters +// (both current and snapshot ones) and one that aggregates the +// values for all threads and keeps the history counters. + +// The thread local current counters are the ones updated by +// performance critical code. Every once in a while, all values in the +// current counters are copied to the snapshot thread local copies in +// a thread safe way. + +// The snapshot counters are aggregated by the GlobalCounters +// class, as these can be accessed safely from multiple threads. + +// Make sure to call the thread local tlocal.updateSnap() once +// in a while. This will fill the snapshot values for that thread if some +// time has passed since the last snap update. + +// To get an aggregate value, call globals.sum(counter1) or +// globals.avg(counter2), or any aggregation function. If multiple +// counters need to be collected in a consistent way: +// auto data = globals.aggregatedSnap(); +// +// Note that the aggregate values can mix somewhat older thread-local +// with newer thread-local info from another thread. So it is possible +// to see the following: +// +// If thread T1 increments "received" and then passes the packet to +// thread T2 that increments "processed", it may happen that the value +// of "processed" observed by sum() is higher than "received", as T1 +// might not have called updateSnap() yet while T2 did. To avoid this +// inconsistency, be careful to update related counters in a single +// thread only.
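As a minimal sketch of how an application might wire these templates up (the app namespace, the Counter enum and handleOneQuery() below are hypothetical illustrations; only pdns::GlobalCounters, pdns::TLocalCounters, at(), updateSnap(), sum() and aggregatedSnap() come from this header):

#include "tcounters.hh"

#include <array>
#include <cstddef>
#include <cstdint>

namespace app
{
enum class Counter : uint8_t
{
  queries,
  errors,
  numberOfCounters
};

struct Counters
{
  std::array<uint64_t, static_cast<size_t>(Counter::numberOfCounters)> values{};

  // Required by the templates: how to fold one set of counters into another.
  // For plain counters simple addition is enough.
  Counters& merge(const Counters& rhs)
  {
    for (size_t i = 0; i < values.size(); i++) {
      values.at(i) += rhs.values.at(i);
    }
    return *this;
  }

  uint64_t& at(Counter index)
  {
    return values.at(static_cast<size_t>(index));
  }
};

using GlobalCounters = pdns::GlobalCounters<Counters>;
using TCounters = pdns::TLocalCounters<Counters>;
} // namespace app

// One global aggregator plus one thread-local instance per thread.
app::GlobalCounters g_counters;
thread_local app::TCounters t_counters(g_counters);

void handleOneQuery()
{
  ++t_counters.at(app::Counter::queries); // cheap thread-local increment, no atomics or locks
  t_counters.updateSnap();                // copies to the snapshot at most once per interval (100 ms by default)
}

// A metrics/export thread can then read aggregated values:
//   auto total = g_counters.sum(app::Counter::queries);
//   auto snap = g_counters.aggregatedSnap(); // internally consistent view across all counters

For weighted averages, merge() has to do more than add: as in Counters::merge() in rec-tcounters.cc earlier in this diff, the combined average is (avg1 * weight1 + avg2 * weight2) / (weight1 + weight2), so that an average of averages stays correct.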
+ +// For an example of the use of these templates, see rec-tcounters.hh + +template +class TLocalCounters; + +template +class GlobalCounters +{ +public: + // Register a thread local set of values + void subscribe(TLocalCounters* ptr) + { + auto lock = d_guarded.lock(); + lock->d_instances.emplace(ptr); + } + + // Unregister, typically done when a thread exits + void unsubscribe(TLocalCounters* ptr, const Counters& data) + { + auto lock = d_guarded.lock(); + lock->d_instances.erase(ptr); + lock->d_history.merge(data); + } + + // Two ways of computing aggregated values for a specific counter: simple additions of all thread data, or taking weighted averages into account + template + auto sum(Enum index); + template + auto avg(Enum index); + template + auto max(Enum index); + + // Aggregate all counter data for all threads + Counters aggregatedSnap(); + + // Reset history + void reset() + { + auto lock = d_guarded.lock(); + lock->d_history = Counters(); + } + +private: + struct Guarded + { + // We have x instances, normally one per thread + std::set*> d_instances; + // If an instance gets deleted because its thread is cleaned up, the values + // are accumulated in d_history + Counters d_history; + }; + LockGuarded d_guarded; +}; + +template +class TLocalCounters +{ +public: + static const suseconds_t defaultSnapUpdatePeriodus = 100000; + TLocalCounters(GlobalCounters& collector, timeval interval = timeval{0, defaultSnapUpdatePeriodus}) : + d_collector(collector), d_interval(interval) + { + collector.subscribe(this); + } + + ~TLocalCounters() + { + d_collector.unsubscribe(this, d_current); + } + + TLocalCounters(const TLocalCounters&) = delete; + TLocalCounters(TLocalCounters&&) = delete; + TLocalCounters& operator=(const TLocalCounters&) = delete; + TLocalCounters& operator=(TLocalCounters&&) = delete; + + template + auto& at(Enum index) + { + return d_current.at(index); + } + + template + auto snapAt(Enum index) + { + return d_snapshot.lock()->at(index); + } + + [[nodiscard]] Counters getSnap() + { + return *(d_snapshot.lock()); + } + + bool updateSnap(const timeval& tv_now, bool force = false) + { + timeval tv_diff{}; + + if (!force) { + timersub(&tv_now, &d_last, &tv_diff); + } + if (force || timercmp(&tv_diff, &d_interval, >=)) { + // It's a copy + *(d_snapshot.lock()) = d_current; + d_last = tv_now; + return true; + } + return false; + } + + bool updateSnap(bool force = false) + { + timeval tv_now{}; + + if (!force) { + gettimeofday(&tv_now, nullptr); + } + return updateSnap(tv_now, force); + } + +private: + GlobalCounters& d_collector; + Counters d_current; + LockGuarded d_snapshot; + timeval d_last{0, 0}; + const timeval d_interval; +}; + +// Sum for a specific index +// In the future we might the move the specifics of computing an aggregated value to the +// app specific Counters class +template +template +auto GlobalCounters::sum(Enum index) +{ + auto lock = d_guarded.lock(); + auto sum = lock->d_history.at(index); + for (const auto& instance : lock->d_instances) { + sum += instance->snapAt(index); + } + return sum; +} + +// Average for a specific index +// In the future we might the move the specifics of computing an aggregated value to the +// app specific Counters class +template +template +auto GlobalCounters::avg(Enum index) +{ + auto lock = d_guarded.lock(); + auto wavg = lock->d_history.at(index); + auto sum = wavg.avg * wavg.weight; + auto count = wavg.weight; + for (const auto& instance : lock->d_instances) { + auto val = instance->snapAt(index); + count += val.weight; + sum 
+= val.avg * val.weight; + } + return count > 0 ? sum / count : 0; +} + +// Max for a specific index +// In the future we might the move the specifics of computing an aggregated value to the +// app specific Counters class +template +template +auto GlobalCounters::max(Enum index) +{ + auto lock = d_guarded.lock(); + uint64_t max = 0; // ignore history + for (const auto& instance : lock->d_instances) { + max = std::max(instance->snapAt(index), max); + } + return max; +} + +// Get a consistent snap of *all* aggregated values +template +Counters GlobalCounters::aggregatedSnap() +{ + auto lock = d_guarded.lock(); + Counters ret = lock->d_history; + for (const auto& instance : lock->d_instances) { + auto snap = instance->getSnap(); + ret.merge(snap); + } + return ret; +} + +} diff --git a/regression-tests.recursor-dnssec/recursortests.py b/regression-tests.recursor-dnssec/recursortests.py index 2576b0955b..165208b792 100644 --- a/regression-tests.recursor-dnssec/recursortests.py +++ b/regression-tests.recursor-dnssec/recursortests.py @@ -39,7 +39,6 @@ class RecursorTest(AssertEqualDNSMessageMixin, unittest.TestCase): _confdir = 'recursor' - _recursorStartupDelay = 2.0 _recursorPort = 5300 _recursor = None @@ -649,11 +648,11 @@ distributor-threads={threads}""".format(confdir=confdir, raise AssertionError('%s failed (%d)' % (recursorcmd, cls._recursor.returncode)) @classmethod - def wipeRecursorCache(cls, confdir): + def wipeRecursorCache(cls, confdir, name='.$'): rec_controlCmd = [os.environ['RECCONTROL'], '--config-dir=%s' % confdir, 'wipe-cache', - '.$'] + name] try: subprocess.check_output(rec_controlCmd, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: diff --git a/regression-tests.recursor-dnssec/test_AggressiveNSECCache.py b/regression-tests.recursor-dnssec/test_AggressiveNSECCache.py index a4640176c6..808bbdd3d8 100644 --- a/regression-tests.recursor-dnssec/test_AggressiveNSECCache.py +++ b/regression-tests.recursor-dnssec/test_AggressiveNSECCache.py @@ -11,6 +11,7 @@ class AggressiveNSECCacheBase(RecursorTest): _wsTimeout = 10 _wsPassword = 'secretpassword' _apiKey = 'secretapikey' + #_recursorStartupDelay = 4.0 _config_template = """ dnssec=validate aggressive-nsec-cache-size=10000 @@ -19,12 +20,14 @@ class AggressiveNSECCacheBase(RecursorTest): webserver-address=127.0.0.1 webserver-password=%s api-key=%s + devonly-regression-test-mode """ % (_wsPort, _wsPassword, _apiKey) @classmethod def wipe(cls): confdir = os.path.join('configs', cls._confdir) - cls.wipeRecursorCache(confdir) + # Only wipe examples, as wiping the root triggers root NS refreshes + cls.wipeRecursorCache(confdir, "example$") def getMetric(self, name): headers = {'x-api-key': self._apiKey} @@ -42,8 +45,8 @@ class AggressiveNSECCacheBase(RecursorTest): self.assertTrue(False) def testNoData(self): - self.wipe() + # first we query a non-existent type, to get the NSEC in our cache entries = self.getMetric('aggressive-nsec-cache-entries') res = self.sendQuery('host1.secure.example.', 'TXT') @@ -72,8 +75,8 @@ class AggressiveNSECCacheNSEC(AggressiveNSECCacheBase): # we can't use the same tests for NSEC and NSEC3 because the hashed NSEC3s # do not deny the same names than the non-hashed NSECs do def testNXD(self): - self.wipe() + # first we query a non-existent name, to get the needed NSECs (name + widcard) in our cache entries = self.getMetric('aggressive-nsec-cache-entries') hits = self.getMetric('aggressive-nsec-cache-nsec-hits') @@ -100,8 +103,8 @@ class AggressiveNSECCacheNSEC(AggressiveNSECCacheBase): 
self.assertGreater(self.getMetric('aggressive-nsec-cache-nsec-hits'), hits) def testWildcard(self): - self.wipe() + # first we query a non-existent name, but for which a wildcard matches, # to get the NSEC in our cache res = self.sendQuery('test1.wildcard.secure.example.', 'A') @@ -231,6 +234,7 @@ class AggressiveNSECCacheNSEC3(AggressiveNSECCacheBase): raise AssertionError('%s failed (%d): %s' % (pdnsutilCmd, e.returncode, e.output)) def testNXD(self): + self.wipe() # first we query a non-existent name, to get the needed NSEC3s in our cache res = self.sendQuery('host2.secure.example.', 'TXT') @@ -250,6 +254,7 @@ class AggressiveNSECCacheNSEC3(AggressiveNSECCacheBase): self.assertEqual(nbQueries, self.getMetric('all-outqueries')) def testWildcard(self): + self.wipe() # first let's get the SOA and wildcard NSEC in our cache by asking a name that matches the wildcard # but a type that does not exist @@ -297,6 +302,8 @@ class AggressiveNSECCacheNSEC3(AggressiveNSECCacheBase): self.assertEqual(nbQueries, self.getMetric('all-outqueries')) def test_OptOut(self): + self.wipe() + # query a name in an opt-out zone res = self.sendQuery('ns2.optout.example.', 'A') self.assertRcodeEqual(res, dns.rcode.NXDOMAIN) diff --git a/regression-tests.recursor-dnssec/test_RootNXTrust.py b/regression-tests.recursor-dnssec/test_RootNXTrust.py index 62f6537e2c..b9d1b0b156 100644 --- a/regression-tests.recursor-dnssec/test_RootNXTrust.py +++ b/regression-tests.recursor-dnssec/test_RootNXTrust.py @@ -46,6 +46,7 @@ webserver-port=%d webserver-address=127.0.0.1 webserver-password=%s api-key=%s +devonly-regression-test-mode """ % (_wsPort, _wsPassword, _apiKey) def testRootNXTrust(self): @@ -94,6 +95,7 @@ webserver-port=%d webserver-address=127.0.0.1 webserver-password=%s api-key=%s +devonly-regression-test-mode """ % (_wsPort, _wsPassword, _apiKey) def testRootNXTrust(self): diff --git a/regression-tests.recursor-dnssec/test_SimpleDoT.py b/regression-tests.recursor-dnssec/test_SimpleDoT.py index 0994a47ec9..1026001d9e 100644 --- a/regression-tests.recursor-dnssec/test_SimpleDoT.py +++ b/regression-tests.recursor-dnssec/test_SimpleDoT.py @@ -12,6 +12,7 @@ class testSimpleDoT(RecursorTest): _config_template = """ dnssec=validate dot-to-auth-names=powerdns.com +devonly-regression-test-mode """ _roothints = None diff --git a/regression-tests.recursor-dnssec/test_SimpleForwardOverDoT.py b/regression-tests.recursor-dnssec/test_SimpleForwardOverDoT.py index 95e2fbca04..d839d43084 100644 --- a/regression-tests.recursor-dnssec/test_SimpleForwardOverDoT.py +++ b/regression-tests.recursor-dnssec/test_SimpleForwardOverDoT.py @@ -12,6 +12,7 @@ class testSimpleForwardOverDoT(RecursorTest): _config_template = """ dnssec=validate forward-zones-recurse=.=9.9.9.9:853 +devonly-regression-test-mode """ @classmethod