From 7219311873b518a915b7b49aa8cd4373fa80ca43 Mon Sep 17 00:00:00 2001 From: Miod Vallat Date: Wed, 18 Dec 2024 15:11:23 +0100 Subject: [PATCH] Implement optional interval and minimumFailure options in if*up. --- docs/lua-records/functions.rst | 4 + pdns/lua-record.cc | 65 ++++++++-- regression-tests.auth-py/authtests.py | 23 ++++ regression-tests.auth-py/test_LuaRecords.py | 124 +++++++++++++++++++- 4 files changed, 202 insertions(+), 14 deletions(-) diff --git a/docs/lua-records/functions.rst b/docs/lua-records/functions.rst index 2bd42a3d5b..a078ee4200 100644 --- a/docs/lua-records/functions.rst +++ b/docs/lua-records/functions.rst @@ -65,6 +65,8 @@ Record creation functions - ``backupSelector``: used to pick the address(es) from all addresses if all addresses are down. Choices include 'pickclosest', 'random', 'hashed', 'all' (default 'random'). - ``source``: Source address to check from - ``timeout``: Maximum time in seconds that you allow the check to take (default 2) + - ``interval``: Time interval between two checks, in seconds. Defaults to :ref:`setting-lua-health-checks-interval` if not specified. + - ``minimumFailures``: The number of unsuccessful checks in a row required to mark the address as down. Defaults to 1 if not specified, i.e. report as down on the first unsuccessful check. .. function:: ifurlup(url, addresses[, options]) @@ -86,9 +88,11 @@ Record creation functions - ``backupSelector``: used to pick the address from all addresses if all addresses are down. Choices include 'pickclosest', 'random', 'hashed', 'all' (default 'random'). - ``source``: Source address to check from - ``timeout``: Maximum time in seconds that you allow the check to take (default 2) + - ``interval``: Time interval between two checks, in seconds. Defaults to :ref:`setting-lua-health-checks-interval` if not specified. - ``stringmatch``: check ``url`` for this string, only declare 'up' if found - ``useragent``: Set the HTTP "User-Agent" header in the requests. By default it is set to "PowerDNS Authoritative Server" - ``byteslimit``: Limit the maximum download size to ``byteslimit`` bytes (default 0 meaning no limit). + - ``minimumFailures``: The number of unsuccessful checks in a row required to mark the address as down. Defaults to 1 if not specified, i.e. report as down on the first unsuccessful check. An example of a list of address sets: diff --git a/pdns/lua-record.cc b/pdns/lua-record.cc index b64482464f..1cb8355b46 100644 --- a/pdns/lua-record.cc +++ b/pdns/lua-record.cc @@ -29,8 +29,8 @@ ponder netmask tree from file for huge number of netmasks - unify ifurlup/ifportup - add attribute for certificate check + add attribute for certificate check in genericIfUp + add list of current monitors expire them too? @@ -73,10 +73,14 @@ private: std::atomic status{false}; /* current weight */ std::atomic weight{0}; - /* first check ? */ + /* first check? */ std::atomic first{true}; + /* number of successive checks returning failure */ + std::atomic failures{0}; /* last time the status was accessed */ std::atomic lastAccess{0}; + /* last time the status was modified */ + std::atomic lastStatusUpdate{0}; }; public: @@ -91,7 +95,7 @@ public: int isUp(const CheckDesc& cd); private: - void checkURL(const CheckDesc& cd, const bool status, const bool first = false) + void checkURL(const CheckDesc& cd, const bool status, const bool first) // NOLINT(readability-identifier-length) { setThreadName("pdns/lua-c-url"); @@ -154,7 +158,7 @@ private: setDown(cd); } } - void checkTCP(const CheckDesc& cd, const bool status, const bool first = false) { + void checkTCP(const CheckDesc& cd, const bool status, const bool first) { // NOLINT(readability-identifier-length) setThreadName("pdns/lua-c-tcp"); try { int timeout = 2; @@ -192,19 +196,46 @@ private: std::chrono::system_clock::time_point checkStart = std::chrono::system_clock::now(); std::vector> results; std::vector toDelete; + time_t interval{g_luaHealthChecksInterval}; { // make sure there's no insertion auto statuses = d_statuses.read_lock(); for (auto& it: *statuses) { auto& desc = it.first; auto& state = it.second; + time_t checkInterval{0}; + auto lastAccess = std::chrono::system_clock::from_time_t(state->lastAccess); + + if (desc.opts.count("interval") != 0) { + checkInterval = std::atoi(desc.opts.at("interval").c_str()); + if (checkInterval != 0) { + interval = std::gcd(interval, checkInterval); + } + } + + if (not state->first) { + time_t nextCheckSecond = state->lastStatusUpdate; + if (checkInterval != 0) { + nextCheckSecond += checkInterval; + } + else { + nextCheckSecond += g_luaHealthChecksInterval; + } + if (checkStart < std::chrono::system_clock::from_time_t(nextCheckSecond)) { + continue; // too early + } + } if (desc.url.empty()) { // TCP results.push_back(std::async(std::launch::async, &IsUpOracle::checkTCP, this, desc, state->status.load(), state->first.load())); } else { // URL results.push_back(std::async(std::launch::async, &IsUpOracle::checkURL, this, desc, state->status.load(), state->first.load())); } - if (std::chrono::system_clock::from_time_t(state->lastAccess) < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) { + // Give it a chance to run at least once. + // If minimumFailures * interval > lua-health-checks-expire-delay, then a down status will never get reported. + // This is unlikely to be a problem in practice due to the default value of the expire delay being one hour. + if (not state->first && + lastAccess < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) { toDelete.push_back(desc); } } @@ -223,7 +254,7 @@ private: // set thread name again, in case std::async surprised us by doing work in this thread setThreadName("pdns/luaupcheck"); - std::this_thread::sleep_until(checkStart + std::chrono::seconds(g_luaHealthChecksInterval)); + std::this_thread::sleep_until(checkStart + std::chrono::seconds(interval)); } } @@ -237,9 +268,23 @@ private: { auto statuses = d_statuses.write_lock(); auto& state = (*statuses)[cd]; - state->status = status; - if (state->first) { - state->first = false; + state->lastStatusUpdate = time(nullptr); + state->first = false; + if (status) { + state->failures = 0; + state->status = true; + } else { + unsigned int minimumFailures = 1; + if (cd.opts.count("minimumFailures") != 0) { + unsigned int value = std::atoi(cd.opts.at("minimumFailures").c_str()); + if (value != 0) { + minimumFailures = std::max(minimumFailures, value); + } + } + // Since `status' was set to false at constructor time, we need to + // recompute its value unconditionally to expose "down, but not enough + // times yet" targets as up. + state->status = ++state->failures < minimumFailures; } } diff --git a/regression-tests.auth-py/authtests.py b/regression-tests.auth-py/authtests.py index 4256cce9af..9e9e9c999e 100644 --- a/regression-tests.auth-py/authtests.py +++ b/regression-tests.auth-py/authtests.py @@ -493,6 +493,29 @@ options { raise AssertionError("RRset not found in answer\n%s" % "\n".join(([ans.to_text() for ans in msg.answer]))) + def assertNoneRRsetInAnswer(self, msg, rrsets): + """Asserts that none of the supplied rrsets exist (without comparing TTL) + in the answer section of msg + + @param msg: the dns.message.Message to check + @param rrsets: an array of dns.rrset.RRset object""" + + if not isinstance(msg, dns.message.Message): + raise TypeError("msg is not a dns.message.Message") + + found = False + for rrset in rrsets: + if not isinstance(rrset, dns.rrset.RRset): + raise TypeError("rrset is not a dns.rrset.RRset") + for ans in msg.answer: + if ans.match(rrset.name, rrset.rdclass, rrset.rdtype, 0, None): + if ans == rrset: + found = True + + if found: + raise AssertionError("RRset incorrectly found in answer\n%s" % + "\n".join(([ans.to_text() for ans in msg.answer]))) + def assertMatchingRRSIGInAnswer(self, msg, coveredRRset, keys=None): """Looks for coveredRRset in the answer section and if there is an RRSIG RRset that covers that RRset. If keys is not None, this function will also try to diff --git a/regression-tests.auth-py/test_LuaRecords.py b/regression-tests.auth-py/test_LuaRecords.py index 04a105ce93..213f1767df 100644 --- a/regression-tests.auth-py/test_LuaRecords.py +++ b/regression-tests.auth-py/test_LuaRecords.py @@ -38,7 +38,7 @@ class FakeHTTPServer(BaseHTTPRequestHandler): def do_HEAD(self): self._set_headers() -class TestLuaRecords(AuthTest): +class BaseLuaTest(AuthTest): _config_template = """ geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb edns-subnet-processing=yes @@ -102,6 +102,16 @@ usa-ext IN LUA A ( ";include('config') " "return ifurlup('http://www.lua.org:8080/', " "{{EUEips, USAips}}, settings) ") +usa-unreachable IN LUA A ( ";settings={{stringmatch='Programming in Lua', minimumFailures=2}} " + "USAips={{'{prefix}.103', '192.168.42.105'}}" + "return ifurlup('http://www.lua.org:8080/', " + "USAips, settings) ") + +usa-slowcheck IN LUA A ( ";settings={{stringmatch='Programming in Lua', interval=8}} " + "USAips={{'{prefix}.103', '192.168.42.105'}}" + "return ifurlup('http://www.lua.org:8080/', " + "USAips, settings) ") + mix.ifurlup IN LUA A ("ifurlup('http://www.other.org:8080/ping.json', " "{{ '192.168.42.101', '{prefix}.101' }}, " "{{ stringmatch='pong' }}) ") @@ -189,7 +199,7 @@ whitespace IN LUA TXT "'foo" "bar'" @classmethod def setUpClass(cls): - super(TestLuaRecords, cls).setUpClass() + super(BaseLuaTest, cls).setUpClass() cls._web_rrsets = [dns.rrset.from_text('web1.example.org.', 0, dns.rdataclass.IN, 'A', '{prefix}.101'.format(prefix=cls._PREFIX)), @@ -199,6 +209,8 @@ whitespace IN LUA TXT "'foo" "bar'" '{prefix}.103'.format(prefix=cls._PREFIX)) ] +class TestLuaRecords(BaseLuaTest): + def testPickRandom(self): """ Basic pickrandom() test with a set of A records @@ -440,7 +452,7 @@ whitespace IN LUA TXT "'foo" "bar'" self.assertRcodeEqual(res, dns.rcode.NOERROR) self.assertAnyRRsetInAnswer(res, all_rrs) - # the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here + # the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here time.sleep(3) res = self.sendUDPQuery(query) self.assertRcodeEqual(res, dns.rcode.NOERROR) @@ -468,7 +480,7 @@ whitespace IN LUA TXT "'foo" "bar'" self.assertRcodeEqual(res, dns.rcode.NOERROR) self.assertAnyRRsetInAnswer(res, all_rrs) - # the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here + # the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here time.sleep(3) res = self.sendUDPQuery(query) self.assertRcodeEqual(res, dns.rcode.NOERROR) @@ -1161,6 +1173,110 @@ lua-health-checks-interval=1 def testWhitespace(self): return TestLuaRecords.testWhitespace(self, False) +class TestLuaRecordsSlowTimeouts(BaseLuaTest): + # This configuration is similar to BaseLuaTest, but the health check + # interval is increased to 5 seconds. + _config_template = """ +geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb +edns-subnet-processing=yes +launch=bind geoip +any-to-tcp=no +enable-lua-records +lua-records-insert-whitespace=yes +lua-health-checks-interval=5 +""" + + def testIfurlupMinimumFailures(self): + """ + Simple ifurlup() test with minimumFailures option set. + """ + reachable = [ + '{prefix}.103'.format(prefix=self._PREFIX) + ] + unreachable = ['192.168.42.105'] + ips = reachable + unreachable + all_rrs = [] + reachable_rrs = [] + unreachable_rrs = [] + for ip in ips: + rr = dns.rrset.from_text('usa-unreachable.example.org.', 0, dns.rdataclass.IN, 'A', ip) + all_rrs.append(rr) + if ip in reachable: + reachable_rrs.append(rr) + else: + unreachable_rrs.append(rr) + + query = dns.message.make_query('usa-unreachable.example.org', 'A') + res = self.sendUDPQuery(query) + self.assertRcodeEqual(res, dns.rcode.NOERROR) + self.assertAnyRRsetInAnswer(res, all_rrs) + + # The above request being sent at time T, the following events occur: + # T+00: results computed using backupSelector as no data available yet + # T+00: checker thread starts + # T+02: 192.168.42.105 found down, first time, still kept up + # T+05: checker thread wakes up, decides to skip 192.168.42.105 check, + # as its last update time was T+02, hence no check until T+07 + # T+10: checker thread wakes up + # T+12: 192.168.42.105 found down, second time, finally marked down + + # Due to minimumFailures set, there should be no error yet. + time.sleep(5) + res = self.sendUDPQuery(query) + self.assertRcodeEqual(res, dns.rcode.NOERROR) + self.assertAnyRRsetInAnswer(res, all_rrs) + + # Wait for another check. At this point the checker thread should have + # reached the minimumFailures threshold and mark the unreachable IP + # as such. + time.sleep(8) + res = self.sendUDPQuery(query) + self.assertRcodeEqual(res, dns.rcode.NOERROR) + self.assertAnyRRsetInAnswer(res, reachable_rrs) + self.assertNoneRRsetInAnswer(res, unreachable_rrs) + + def testIfurlupInterval(self): + """ + Simple ifurlup() test with interval option set. + """ + reachable = [ + '{prefix}.103'.format(prefix=self._PREFIX) + ] + unreachable = ['192.168.42.105'] + ips = reachable + unreachable + all_rrs = [] + reachable_rrs = [] + unreachable_rrs = [] + for ip in ips: + rr = dns.rrset.from_text('usa-slowcheck.example.org.', 0, dns.rdataclass.IN, 'A', ip) + all_rrs.append(rr) + if ip in reachable: + reachable_rrs.append(rr) + else: + unreachable_rrs.append(rr) + + query = dns.message.make_query('usa-slowcheck.example.org', 'A') + res = self.sendUDPQuery(query) + self.assertRcodeEqual(res, dns.rcode.NOERROR) + self.assertAnyRRsetInAnswer(res, all_rrs) + + # the timeout in the LUA health checker is 5 second, but usa-slowcheck + # uses 8 seconds, which forces the thread to run every second (gcd + # of 5 and 8). + time.sleep(6) + + res = self.sendUDPQuery(query) + self.assertRcodeEqual(res, dns.rcode.NOERROR) + # due to minimumFailures set, there should be no error yet + self.assertAnyRRsetInAnswer(res, all_rrs) + + # At this point the check should have fired. + time.sleep(3) + res = self.sendUDPQuery(query) + self.assertRcodeEqual(res, dns.rcode.NOERROR) + self.assertAnyRRsetInAnswer(res, reachable_rrs) + self.assertNoneRRsetInAnswer(res, unreachable_rrs) + if __name__ == '__main__': unittest.main() exit(0) -- 2.47.2