ponder netmask tree from file for huge number of netmasks
- unify ifurlup/ifportup
- add attribute for certificate check
+ add attribute for certificate check in genericIfUp
+
add list of current monitors
expire them too?
std::atomic<bool> status{false};
/* current weight */
std::atomic<int> weight{0};
- /* first check ? */
+ /* first check? */
std::atomic<bool> first{true};
+ /* number of successive checks returning failure */
+ std::atomic<unsigned int> failures{0};
/* last time the status was accessed */
std::atomic<time_t> lastAccess{0};
+ /* last time the status was modified */
+ std::atomic<time_t> lastStatusUpdate{0};
};
public:
int isUp(const CheckDesc& cd);
private:
- void checkURL(const CheckDesc& cd, const bool status, const bool first = false)
+ void checkURL(const CheckDesc& cd, const bool status, const bool first) // NOLINT(readability-identifier-length)
{
setThreadName("pdns/lua-c-url");
setDown(cd);
}
}
- void checkTCP(const CheckDesc& cd, const bool status, const bool first = false) {
+ void checkTCP(const CheckDesc& cd, const bool status, const bool first) { // NOLINT(readability-identifier-length)
setThreadName("pdns/lua-c-tcp");
try {
int timeout = 2;
std::chrono::system_clock::time_point checkStart = std::chrono::system_clock::now();
std::vector<std::future<void>> results;
std::vector<CheckDesc> toDelete;
+ time_t interval{g_luaHealthChecksInterval};
{
// make sure there's no insertion
auto statuses = d_statuses.read_lock();
for (auto& it: *statuses) {
auto& desc = it.first;
auto& state = it.second;
+ time_t checkInterval{0};
+ auto lastAccess = std::chrono::system_clock::from_time_t(state->lastAccess);
+
+ if (desc.opts.count("interval") != 0) {
+ checkInterval = std::atoi(desc.opts.at("interval").c_str());
+ if (checkInterval != 0) {
+ interval = std::gcd(interval, checkInterval);
+ }
+ }
+
+ if (not state->first) {
+ time_t nextCheckSecond = state->lastStatusUpdate;
+ if (checkInterval != 0) {
+ nextCheckSecond += checkInterval;
+ }
+ else {
+ nextCheckSecond += g_luaHealthChecksInterval;
+ }
+ if (checkStart < std::chrono::system_clock::from_time_t(nextCheckSecond)) {
+ continue; // too early
+ }
+ }
if (desc.url.empty()) { // TCP
results.push_back(std::async(std::launch::async, &IsUpOracle::checkTCP, this, desc, state->status.load(), state->first.load()));
} else { // URL
results.push_back(std::async(std::launch::async, &IsUpOracle::checkURL, this, desc, state->status.load(), state->first.load()));
}
- if (std::chrono::system_clock::from_time_t(state->lastAccess) < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) {
+ // Give it a chance to run at least once.
+ // If minimumFailures * interval > lua-health-checks-expire-delay, then a down status will never get reported.
+ // This is unlikely to be a problem in practice due to the default value of the expire delay being one hour.
+ if (not state->first &&
+ lastAccess < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) {
toDelete.push_back(desc);
}
}
// set thread name again, in case std::async surprised us by doing work in this thread
setThreadName("pdns/luaupcheck");
- std::this_thread::sleep_until(checkStart + std::chrono::seconds(g_luaHealthChecksInterval));
+ std::this_thread::sleep_until(checkStart + std::chrono::seconds(interval));
}
}
{
auto statuses = d_statuses.write_lock();
auto& state = (*statuses)[cd];
- state->status = status;
- if (state->first) {
- state->first = false;
+ state->lastStatusUpdate = time(nullptr);
+ state->first = false;
+ if (status) {
+ state->failures = 0;
+ state->status = true;
+ } else {
+ unsigned int minimumFailures = 1;
+ if (cd.opts.count("minimumFailures") != 0) {
+ unsigned int value = std::atoi(cd.opts.at("minimumFailures").c_str());
+ if (value != 0) {
+ minimumFailures = std::max(minimumFailures, value);
+ }
+ }
+ // Since `status' was set to false at constructor time, we need to
+ // recompute its value unconditionally to expose "down, but not enough
+ // times yet" targets as up.
+ state->status = ++state->failures < minimumFailures;
}
}
def do_HEAD(self):
self._set_headers()
-class TestLuaRecords(AuthTest):
+class BaseLuaTest(AuthTest):
_config_template = """
geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb
edns-subnet-processing=yes
"return ifurlup('http://www.lua.org:8080/', "
"{{EUEips, USAips}}, settings) ")
+usa-unreachable IN LUA A ( ";settings={{stringmatch='Programming in Lua', minimumFailures=2}} "
+ "USAips={{'{prefix}.103', '192.168.42.105'}}"
+ "return ifurlup('http://www.lua.org:8080/', "
+ "USAips, settings) ")
+
+usa-slowcheck IN LUA A ( ";settings={{stringmatch='Programming in Lua', interval=8}} "
+ "USAips={{'{prefix}.103', '192.168.42.105'}}"
+ "return ifurlup('http://www.lua.org:8080/', "
+ "USAips, settings) ")
+
mix.ifurlup IN LUA A ("ifurlup('http://www.other.org:8080/ping.json', "
"{{ '192.168.42.101', '{prefix}.101' }}, "
"{{ stringmatch='pong' }}) ")
@classmethod
def setUpClass(cls):
- super(TestLuaRecords, cls).setUpClass()
+ super(BaseLuaTest, cls).setUpClass()
cls._web_rrsets = [dns.rrset.from_text('web1.example.org.', 0, dns.rdataclass.IN, 'A',
'{prefix}.101'.format(prefix=cls._PREFIX)),
'{prefix}.103'.format(prefix=cls._PREFIX))
]
+class TestLuaRecords(BaseLuaTest):
+
def testPickRandom(self):
"""
Basic pickrandom() test with a set of A records
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)
- # the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here
+ # the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)
- # the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here
+ # the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
def testWhitespace(self):
return TestLuaRecords.testWhitespace(self, False)
+class TestLuaRecordsSlowTimeouts(BaseLuaTest):
+ # This configuration is similar to BaseLuaTest, but the health check
+ # interval is increased to 5 seconds.
+ _config_template = """
+geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb
+edns-subnet-processing=yes
+launch=bind geoip
+any-to-tcp=no
+enable-lua-records
+lua-records-insert-whitespace=yes
+lua-health-checks-interval=5
+"""
+
+ def testIfurlupMinimumFailures(self):
+ """
+ Simple ifurlup() test with minimumFailures option set.
+ """
+ reachable = [
+ '{prefix}.103'.format(prefix=self._PREFIX)
+ ]
+ unreachable = ['192.168.42.105']
+ ips = reachable + unreachable
+ all_rrs = []
+ reachable_rrs = []
+ unreachable_rrs = []
+ for ip in ips:
+ rr = dns.rrset.from_text('usa-unreachable.example.org.', 0, dns.rdataclass.IN, 'A', ip)
+ all_rrs.append(rr)
+ if ip in reachable:
+ reachable_rrs.append(rr)
+ else:
+ unreachable_rrs.append(rr)
+
+ query = dns.message.make_query('usa-unreachable.example.org', 'A')
+ res = self.sendUDPQuery(query)
+ self.assertRcodeEqual(res, dns.rcode.NOERROR)
+ self.assertAnyRRsetInAnswer(res, all_rrs)
+
+ # The above request being sent at time T, the following events occur:
+ # T+00: results computed using backupSelector as no data available yet
+ # T+00: checker thread starts
+ # T+02: 192.168.42.105 found down, first time, still kept up
+ # T+05: checker thread wakes up, decides to skip 192.168.42.105 check,
+ # as its last update time was T+02, hence no check until T+07
+ # T+10: checker thread wakes up
+ # T+12: 192.168.42.105 found down, second time, finally marked down
+
+ # Because minimumFailures is set, there should be no error yet.
+ time.sleep(5)
+ res = self.sendUDPQuery(query)
+ self.assertRcodeEqual(res, dns.rcode.NOERROR)
+ self.assertAnyRRsetInAnswer(res, all_rrs)
+
+ # Wait for another check. At this point the checker thread should have
+ # reached the minimumFailures threshold and marked the unreachable IP
+ # as such.
+ time.sleep(8)
+ res = self.sendUDPQuery(query)
+ self.assertRcodeEqual(res, dns.rcode.NOERROR)
+ self.assertAnyRRsetInAnswer(res, reachable_rrs)
+ self.assertNoneRRsetInAnswer(res, unreachable_rrs)
+
+ def testIfurlupInterval(self):
+ """
+ Simple ifurlup() test with interval option set.
+ """
+ reachable = [
+ '{prefix}.103'.format(prefix=self._PREFIX)
+ ]
+ unreachable = ['192.168.42.105']
+ ips = reachable + unreachable
+ all_rrs = []
+ reachable_rrs = []
+ unreachable_rrs = []
+ for ip in ips:
+ rr = dns.rrset.from_text('usa-slowcheck.example.org.', 0, dns.rdataclass.IN, 'A', ip)
+ all_rrs.append(rr)
+ if ip in reachable:
+ reachable_rrs.append(rr)
+ else:
+ unreachable_rrs.append(rr)
+
+ query = dns.message.make_query('usa-slowcheck.example.org', 'A')
+ res = self.sendUDPQuery(query)
+ self.assertRcodeEqual(res, dns.rcode.NOERROR)
+ self.assertAnyRRsetInAnswer(res, all_rrs)
+
+ # the health check interval is 5 seconds, but usa-slowcheck uses an
+ # interval of 8 seconds, which forces the checker thread to wake up
+ # every second (gcd of 5 and 8).
+ time.sleep(6)
+
+ res = self.sendUDPQuery(query)
+ self.assertRcodeEqual(res, dns.rcode.NOERROR)
+ # there should be no error yet (NOTE(review): usa-slowcheck sets interval, not minimumFailures -- confirm the intended reason)
+ self.assertAnyRRsetInAnswer(res, all_rrs)
+
+ # At this point the check should have fired.
+ time.sleep(3)
+ res = self.sendUDPQuery(query)
+ self.assertRcodeEqual(res, dns.rcode.NOERROR)
+ self.assertAnyRRsetInAnswer(res, reachable_rrs)
+ self.assertNoneRRsetInAnswer(res, unreachable_rrs)
+
if __name__ == '__main__':
unittest.main()
exit(0)