From: Remi Gacogne Date: Wed, 25 Nov 2020 16:19:09 +0000 (+0100) Subject: dnsdist: Fix dynamic block metrics collection X-Git-Tag: rec-4.5.0-alpha1~92^2~5 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=59b37d25af56d7088144214c5e499d571e7017f8;p=thirdparty%2Fpdns.git dnsdist: Fix dynamic block metrics collection --- diff --git a/pdns/dnsdist-dynblocks.hh b/pdns/dnsdist-dynblocks.hh index 5e23d90025..8ebe2cfe85 100644 --- a/pdns/dnsdist-dynblocks.hh +++ b/pdns/dnsdist-dynblocks.hh @@ -314,8 +314,6 @@ public: d_beQuiet = quiet; } - void purgeExpired(const struct timespec& now); - private: bool checkIfQueryTypeMatches(const Rings::Query& query); @@ -369,26 +367,34 @@ private: bool d_beQuiet{false}; }; -class DynBlockRulesMetricsCache +class DynBlockMaintenance { public: - DynBlockRulesMetricsCache(size_t topN, unsigned int validity): d_validityPeriod(validity), d_topN(topN) - { - } + static void run(); - std::map>> getTopNetmasks(); - std::map>> getTopSuffixes(); - void invalidate(); - void setParameters(size_t topN, unsigned int validity); + /* return the (cached) number of hits per second for the top offenders, averaged over 60s */ + static std::map>> getHitsForTopNetmasks(); + static std::map>> getHitsForTopSuffixes(); + + /* get the top offenders based on the current value of the counters */ + static std::map>> getTopNetmasks(); + static std::map>> getTopSuffixes(); + static void purgeExpired(const struct timespec& now); private: - std::map>> d_cachedNetmasks; - std::map>> d_cachedSuffixes; - std::mutex d_mutex; - time_t d_netmasksValidUntil{0}; - time_t d_suffixesValidUntil{0}; - unsigned int d_validityPeriod{0}; - size_t d_topN{0}; -}; + static void collectMetrics(); + static void generateMetrics(); -extern DynBlockRulesMetricsCache g_dynBlocksMetricsCache; + struct MetricsSnapshot + { + std::map>> nmgData; + std::map>> smtData; + }; + + static std::mutex s_topsMutex; + // need N+1 datapoints to be able to do the diff after a 
collection point has been reached + static std::list s_metricsData; + static std::map>> s_topNMGsByReason; + static std::map>> s_topSMTsByReason; + static size_t s_topN; +}; diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index dcd56b2201..4b5b63a5cd 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -668,21 +668,21 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp) addRulesToPrometheusOutput(output, g_cachehitresprulactions); addRulesToPrometheusOutput(output, g_selfansweredresprulactions); - output << "# HELP dnsdist_dynblocks_nmg_top_offenders " << "Top offenders blocked by Dynamic Blocks (netmasks)" << "\n"; - output << "# TYPE dnsdist_dynblocks_nmg_top_offenders " << "gauge" << "\n"; - auto topNetmasksByReason = g_dynBlocksMetricsCache.getTopNetmasks(); + output << "# HELP dnsdist_dynblocks_nmg_top_offenders_hits_per_second " << "Number of hits per second blocked by Dynamic Blocks (netmasks) for the top offenders, averaged over the last 60s" << "\n"; + output << "# TYPE dnsdist_dynblocks_nmg_top_offenders_hits_per_second " << "gauge" << "\n"; + auto topNetmasksByReason = DynBlockMaintenance::getHitsForTopNetmasks(); for (const auto& entry : topNetmasksByReason) { for (const auto& netmask : entry.second) { - output << "dnsdist_dynblocks_nmg_top_offenders{reason=\"" << entry.first << "\",netmask=\"" << netmask.first.toString() << "\"} " << netmask.second << "\n"; + output << "dnsdist_dynblocks_nmg_top_offenders_hits_per_second{reason=\"" << entry.first << "\",netmask=\"" << netmask.first.toString() << "\"} " << netmask.second << "\n"; } } - output << "# HELP dnsdist_dynblocks_smt_top_offenders " << "Top offenders blocked by Dynamic Blocks (suffixes)" << "\n"; - output << "# TYPE dnsdist_dynblocks_smt_top_offenders " << "gauge" << "\n"; - auto topSuffixesByReason = g_dynBlocksMetricsCache.getTopSuffixes(); + output << "# HELP dnsdist_dynblocks_smt_top_offenders_hits_per_second " << "Number of this per second 
blocked by Dynamic Blocks (suffixes) for the top offenders, averaged over the last 60s" << "\n"; + output << "# TYPE dnsdist_dynblocks_smt_top_offenders_hits_per_second " << "gauge" << "\n"; + auto topSuffixesByReason = DynBlockMaintenance::getHitsForTopSuffixes(); for (const auto& entry : topSuffixesByReason) { for (const auto& suffix : entry.second) { - output << "dnsdist_dynblocks_smt_top_offenders{reason=\"" << entry.first << "\",suffix=\"" << suffix.first.toString() << "\"} " << suffix.second << "\n"; + output << "dnsdist_dynblocks_smt_top_offenders_hits_per_second{reason=\"" << entry.first << "\",suffix=\"" << suffix.first.toString() << "\"} " << suffix.second << "\n"; } } diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index cd7b0b9396..e414ddbd29 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -47,6 +47,7 @@ #include "dnsdist.hh" #include "dnsdist-cache.hh" #include "dnsdist-console.hh" +#include "dnsdist-dynblocks.hh" #include "dnsdist-ecs.hh" #include "dnsdist-healthchecks.hh" #include "dnsdist-lua.hh" @@ -1569,18 +1570,18 @@ static void maintThread() size_t counter = 0; int32_t secondsToWaitLog = 0; - for(;;) { + for (;;) { sleep(interval); { std::lock_guard lock(g_luamutex); auto f = g_lua.readVariable > >("maintenance"); - if(f) { + if (f) { try { (*f)(); secondsToWaitLog = 0; } - catch(std::exception &e) { + catch(const std::exception &e) { if (secondsToWaitLog <= 0) { infolog("Error during execution of maintenance function: %s", e.what()); secondsToWaitLog = 61; @@ -1631,11 +1632,16 @@ static void maintThread() } counter = 0; } - - // ponder pruning g_dynblocks of expired entries here } } +static void dynBlockMaintenanceThread() +{ + setThreadName("dnsdist/dynBloc"); + + DynBlockMaintenance::run(); +} + static void secPollThread() { setThreadName("dnsdist/secpoll"); @@ -2412,6 +2418,9 @@ try thread healththread(healthChecksThread); + thread dynBlockMaintThread(dynBlockMaintenanceThread); + dynBlockMaintThread.detach(); + if 
(!g_secPollSuffix.empty()) { thread secpollthread(secPollThread); secpollthread.detach(); diff --git a/pdns/dnsdistdist/dnsdist-dynblocks.cc b/pdns/dnsdistdist/dnsdist-dynblocks.cc index 65ae642a24..a24bd90615 100644 --- a/pdns/dnsdistdist/dnsdist-dynblocks.cc +++ b/pdns/dnsdistdist/dnsdist-dynblocks.cc @@ -338,7 +338,7 @@ void DynBlockRulesGroup::processResponseRules(counts_t& counts, StatNode& root, } } -void DynBlockRulesGroup::purgeExpired(const struct timespec& now) +void DynBlockMaintenance::purgeExpired(const struct timespec& now) { { auto blocks = g_dynblockNMG.getLocal(); @@ -375,94 +375,287 @@ void DynBlockRulesGroup::purgeExpired(const struct timespec& now) } } -std::map>> DynBlockRulesMetricsCache::getTopNetmasks() +std::map>> DynBlockMaintenance::getTopNetmasks() { std::map>> results; - if (d_topN == 0) { + if (s_topN == 0) { return results; } - time_t now = time(nullptr); - { - std::lock_guard lock(d_mutex); - if (now < d_netmasksValidUntil) { - return d_cachedNetmasks; - } - - auto blocks = g_dynblockNMG.getLocal(); - for (const auto& entry : *blocks) { - auto& topsForReason = results[entry.second.reason]; - if (topsForReason.size() < d_topN || topsForReason.front().second < entry.second.blocks) { - auto newEntry = std::make_pair(entry.first, entry.second.blocks.load()); + auto blocks = g_dynblockNMG.getLocal(); + for (const auto& entry : *blocks) { + auto& topsForReason = results[entry.second.reason]; + if (topsForReason.size() < s_topN || topsForReason.front().second < entry.second.blocks) { + auto newEntry = std::make_pair(entry.first, entry.second.blocks.load()); - if (topsForReason.size() >= d_topN) { - topsForReason.pop_front(); - } - - topsForReason.insert(std::lower_bound(topsForReason.begin(), topsForReason.end(), newEntry, [](const std::pair& a, const std::pair& b) { - return a.second < b.second; - }), - newEntry); + if (topsForReason.size() >= s_topN) { + topsForReason.pop_front(); } + + 
topsForReason.insert(std::lower_bound(topsForReason.begin(), topsForReason.end(), newEntry, [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }), + newEntry); } - d_cachedNetmasks = results; - d_netmasksValidUntil = time(nullptr) + d_validityPeriod; } return results; } -std::map>> DynBlockRulesMetricsCache::getTopSuffixes() +std::map>> DynBlockMaintenance::getTopSuffixes() { std::map>> results; - if (d_topN == 0) { + if (s_topN == 0) { return results; } - time_t now = time(nullptr); + auto blocks = g_dynblockSMT.getLocal(); + blocks->visit([&results](const SuffixMatchTree& node) { + auto& topsForReason = results[node.d_value.reason]; + if (topsForReason.size() < DynBlockMaintenance::s_topN || topsForReason.front().second < node.d_value.blocks) { + auto newEntry = std::make_pair(node.d_value.domain, node.d_value.blocks.load()); + + if (topsForReason.size() >= DynBlockMaintenance::s_topN) { + topsForReason.pop_front(); + } + + topsForReason.insert(std::lower_bound(topsForReason.begin(), topsForReason.end(), newEntry, [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }), + newEntry); + } + }); + + return results; +} + +struct DynBlockEntryStat +{ + size_t sum; + unsigned int lastSeenValue{0}; +}; + +std::mutex DynBlockMaintenance::s_topsMutex; +std::list DynBlockMaintenance::s_metricsData; +std::map>> DynBlockMaintenance::s_topNMGsByReason; +std::map>> DynBlockMaintenance::s_topSMTsByReason; +size_t DynBlockMaintenance::s_topN{20}; + +void DynBlockMaintenance::collectMetrics() +{ + MetricsSnapshot snapshot; + snapshot.smtData = getTopSuffixes(); + snapshot.nmgData = getTopNetmasks(); + { - std::lock_guard lock(d_mutex); - if (now < d_suffixesValidUntil) { - return d_cachedSuffixes; + std::lock_guard lock(s_topsMutex); + if (s_metricsData.size() >= 7) { + s_metricsData.pop_front(); } + s_metricsData.push_back(std::move(snapshot)); + } +} - auto blocks = g_dynblockSMT.getLocal(); - blocks->visit([&results, 
this](const SuffixMatchTree& node) { - auto& topsForReason = results[node.d_value.reason]; - if (topsForReason.size() < d_topN || topsForReason.front().second < node.d_value.blocks) { - auto newEntry = std::make_pair(node.d_value.domain, node.d_value.blocks.load()); +void DynBlockMaintenance::generateMetrics() +{ + if (s_metricsData.empty()) { + return; + } + + /* do NMG */ + std::map> nm; + for (const auto& reason : s_metricsData.front().nmgData) { + auto& reasonStat = nm[reason.first]; + + /* prepare the counters by scanning the oldest entry (N+1) */ + for (const auto& entry : reason.second) { + auto& stat = reasonStat[entry.first]; + stat.sum = 0; + stat.lastSeenValue = entry.second; + } + } - if (topsForReason.size() >= d_topN) { - topsForReason.pop_front(); + /* scan all the N entries, updating the counters */ + bool first = true; + for (const auto& snap : s_metricsData) { + if (first) { + first = false; + continue; + } + + auto& nmgData = snap.nmgData; + for (const auto& reason : nmgData) { + auto& reasonStat = nm[reason.first]; + for (const auto& entry : reason.second) { + auto& stat = reasonStat[entry.first]; + if (entry.second < stat.lastSeenValue) { + /* it wrapped, or we did not have a last value */ + stat.sum += entry.second; } + else { + stat.sum += entry.second - stat.lastSeenValue; + } + stat.lastSeenValue = entry.second; + } + } + } - topsForReason.insert(std::lower_bound(topsForReason.begin(), topsForReason.end(), newEntry, [](const std::pair& a, const std::pair& b) { + /* now we need to get the top N entries (for each "reason") based on our counters (sum of the last N entries) */ + std::map>> topNMGs; + { + for (const auto& reason : nm) { + auto& topsForReason = topNMGs[reason.first]; + for (const auto& entry : reason.second) { + if (topsForReason.size() < s_topN || topsForReason.front().second < entry.second.sum) { + /* Note that this is a gauge, so we need to divide by the number of elapsed seconds */ + auto newEntry = std::pair(entry.first, 
std::round(entry.second.sum / 60.0)); + if (topsForReason.size() >= s_topN) { + topsForReason.pop_front(); + } + + topsForReason.insert(std::lower_bound(topsForReason.begin(), topsForReason.end(), newEntry, [](const std::pair& a, const std::pair& b) { return a.second < b.second; }), - newEntry); + newEntry); + } } - }); - d_cachedSuffixes = results; - d_suffixesValidUntil = time(nullptr) + d_validityPeriod; + } } - return results; + /* do SMT */ + std::map> smt; + for (const auto& reason : s_metricsData.front().smtData) { + auto& reasonStat = smt[reason.first]; + + /* prepare the counters by scanning the oldest entry (N+1) */ + for (const auto& entry : reason.second) { + auto& stat = reasonStat[entry.first]; + stat.sum = 0; + stat.lastSeenValue = entry.second; + } + } + + /* scan all the N entries, updating the counters */ + first = true; + for (const auto& snap : s_metricsData) { + if (first) { + first = false; + continue; + } + + auto& smtData = snap.smtData; + for (const auto& reason : smtData) { + auto& reasonStat = smt[reason.first]; + for (const auto& entry : reason.second) { + auto& stat = reasonStat[entry.first]; + if (entry.second < stat.lastSeenValue) { + /* it wrapped, or we did not have a last value */ + stat.sum = entry.second; + } + else { + stat.sum = entry.second - stat.lastSeenValue; + } + stat.lastSeenValue = entry.second; + } + } + } + + /* now we need to get the top N entries (for each "reason") based on our counters (sum of the last N entries) */ + std::map>> topSMTs; + { + for (const auto& reason : smt) { + auto& topsForReason = topSMTs[reason.first]; + for (const auto& entry : reason.second) { + if (topsForReason.size() < s_topN || topsForReason.front().second < entry.second.sum) { + /* Note that this is a gauge, so we need to divide by the number of elapsed seconds */ + auto newEntry = std::pair(entry.first, std::round(entry.second.sum / 60.0)); + if (topsForReason.size() >= s_topN) { + topsForReason.pop_front(); + } + + 
topsForReason.insert(std::lower_bound(topsForReason.begin(), topsForReason.end(), newEntry, [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }), + newEntry); + } + } + } + } + + { + std::lock_guard lock(s_topsMutex); + s_topNMGsByReason = std::move(topNMGs); + s_topSMTsByReason = std::move(topSMTs); + } } -DynBlockRulesMetricsCache g_dynBlocksMetricsCache(20, 60); +void DynBlockMaintenance::run() +{ + /* alright, so the main idea is to: + 1/ clean up the NMG and SMT from expired entries from time to time + 2/ generate metrics that can be used in the API and prometheus endpoints + */ + + static const time_t expiredPurgeInterval = 300; + static const time_t metricsCollectionInterval = 10; + static const time_t metricsGenerationInterval = 60; + + time_t now = time(nullptr); + time_t nextExpiredPurge = now + expiredPurgeInterval; + time_t nextMetricsCollect = now + metricsCollectionInterval; + time_t nextMetricsGeneration = now + metricsGenerationInterval; + + while (true) { + time_t sleepDelay = (nextExpiredPurge - now); + sleepDelay = std::min(sleepDelay, (nextMetricsCollect - now)); + sleepDelay = std::min(sleepDelay, (nextMetricsGeneration - now)); + + sleep(sleepDelay); + + try { + now = time(nullptr); + if (now >= nextMetricsCollect) { + /* every ten seconds we store the top N entries */ + collectMetrics(); + + now = time(nullptr); + nextMetricsCollect = now + metricsCollectionInterval; + } + + if (now >= nextMetricsGeneration) { + generateMetrics(); + + now = time(nullptr); + /* every minute we compute the averaged top N entries of the last 60 seconds, + and update the cached entry. 
*/ + nextMetricsGeneration = now + metricsGenerationInterval; + } + + if (now >= nextExpiredPurge) { + struct timespec tspec; + gettime(&tspec); + purgeExpired(tspec); + + now = time(nullptr); + nextExpiredPurge = now + expiredPurgeInterval; + } + } + catch (const std::exception& e) { + warnlog("Error in the dynamic block maintenance thread: %s", e.what()); + } + catch (...) { + } + } +} -void DynBlockRulesMetricsCache::invalidate() +std::map>> DynBlockMaintenance::getHitsForTopNetmasks() { - std::lock_guard lock(d_mutex); - d_netmasksValidUntil = 0; - d_suffixesValidUntil = 0; - d_cachedNetmasks.clear(); - d_cachedSuffixes.clear(); + std::lock_guard lock(s_topsMutex); + return s_topNMGsByReason; } -void DynBlockRulesMetricsCache::setParameters(size_t topN, unsigned int validity) +std::map>> DynBlockMaintenance::getHitsForTopSuffixes() { - d_validityPeriod = validity; - d_topN = topN; - invalidate(); + std::lock_guard lock(s_topsMutex); + return s_topSMTsByReason; } diff --git a/pdns/dnsdistdist/test-dnsdistdynblocks_hh.cc b/pdns/dnsdistdist/test-dnsdistdynblocks_hh.cc index d950fac2c2..3fbf322da6 100644 --- a/pdns/dnsdistdist/test-dnsdistdynblocks_hh.cc +++ b/pdns/dnsdistdist/test-dnsdistdynblocks_hh.cc @@ -714,8 +714,7 @@ BOOST_AUTO_TEST_CASE(test_DynBlockRulesMetricsCache_GetTopN) { /* now we ask for the top 20 offenders for each reason */ StopWatch sw; sw.start(); - DynBlockRulesMetricsCache cache(20, 1); - auto top = cache.getTopNetmasks(); + auto top = DynBlockMaintenance::getTopNetmasks(); BOOST_REQUIRE_EQUAL(top.size(), 1U); auto offenders = top.at(reason); BOOST_REQUIRE_EQUAL(offenders.size(), 20U); @@ -728,7 +727,7 @@ BOOST_AUTO_TEST_CASE(test_DynBlockRulesMetricsCache_GetTopN) { struct timespec expired = now; expired.tv_sec += blockDuration + 1; - dbrg.purgeExpired(expired); + DynBlockMaintenance::purgeExpired(expired); BOOST_CHECK_EQUAL(g_dynblockNMG.getLocal()->size(), 0U); } @@ -771,8 +770,7 @@ 
BOOST_AUTO_TEST_CASE(test_DynBlockRulesMetricsCache_GetTopN) { /* now we ask for the top 20 offenders for each reason */ StopWatch sw; sw.start(); - DynBlockRulesMetricsCache cache(20, 1); - auto top = cache.getTopSuffixes(); + auto top = DynBlockMaintenance::getTopSuffixes(); BOOST_REQUIRE_EQUAL(top.size(), 1U); auto suffixes = top.at(reason); BOOST_REQUIRE_EQUAL(suffixes.size(), 20U); @@ -785,10 +783,11 @@ BOOST_AUTO_TEST_CASE(test_DynBlockRulesMetricsCache_GetTopN) { struct timespec expired = now; expired.tv_sec += blockDuration + 1; - dbrg.purgeExpired(expired); + DynBlockMaintenance::purgeExpired(expired); BOOST_CHECK(g_dynblockSMT.getLocal()->getNodes().empty()); } +#define BENCH_DYNBLOCKS #ifdef BENCH_DYNBLOCKS { /* now insert 1M names */ @@ -827,14 +826,13 @@ BOOST_AUTO_TEST_CASE(test_DynBlockRulesMetricsCache_GetTopN) { cerr<<"added 1000000 entries in "<size(), 1000000U); sw.start(); - DynBlockRulesMetricsCache cache(20, 1); - auto top = cache.getTopNetmasks(); + auto top = DynBlockMaintenance::getTopNetmasks(); cerr<<"scanned "<size()<<" entries in "<size(), 0U); }