From: Petr Špaček Date: Tue, 8 Oct 2019 12:56:12 +0000 (+0200) Subject: watchdog rewrite X-Git-Tag: v4.3.0~24^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fenvironments%2Fobs-knot-resolver-bs4hbr%2Fdeployments%2F282;p=thirdparty%2Fknot-resolver.git watchdog rewrite The watchdog module now can be loaded without systemd, has customisable callbacks, and can do real DNS queries and check their results. --- diff --git a/daemon/lua/meson.build b/daemon/lua/meson.build index 255532114..bd652141c 100644 --- a/daemon/lua/meson.build +++ b/daemon/lua/meson.build @@ -9,11 +9,6 @@ lua_config = configuration_data() lua_config.set('keyfile_default', keyfile_default) lua_config.set('etc_dir', etc_dir) lua_config.set('unmanaged', managed_ta ? 'false' : 'true') -if libsystemd.found() and libsystemd.version().version_compare('>=183') - lua_config.set('sd_watchdog', 'modules.load(\'sd_watchdog\')') -else - lua_config.set('sd_watchdog', '') -endif trust_anchors = configure_file( input: 'trust_anchors.lua.in', diff --git a/daemon/lua/sandbox.lua.in b/daemon/lua/sandbox.lua.in index cbe020d31..b3b939472 100644 --- a/daemon/lua/sandbox.lua.in +++ b/daemon/lua/sandbox.lua.in @@ -415,7 +415,6 @@ setfenv(0, _G) -- Load default modules trust_anchors = require('trust_anchors') -@sd_watchdog@ modules.load('ta_update') modules.load('ta_signal_query') modules.load('policy') @@ -425,6 +424,7 @@ modules.load('detect_time_jump') modules.load('ta_sentinel') modules.load('edns_keepalive') modules.load('refuse_nord') +modules.load('watchdog') -- Load keyfile_default trust_anchors.add_file('@keyfile_default@', @unmanaged@) diff --git a/doc/modules.rst b/doc/modules.rst index c6a60f5c6..10e4a1b0e 100644 --- a/doc/modules.rst +++ b/doc/modules.rst @@ -37,3 +37,4 @@ Modules .. include:: ../modules/edns_keepalive/README.rst .. include:: ../modules/experimental_dot_auth/README.rst .. include:: ../modules/refuse_nord/README.rst +.. 
include:: ../modules/watchdog/README.rst diff --git a/modules/meson.build b/modules/meson.build index b0bcc4dbb..215475a70 100644 --- a/modules/meson.build +++ b/modules/meson.build @@ -15,6 +15,7 @@ lua_mod_src = [ # add lua modules without separate meson.build files('ta_sentinel/ta_sentinel.lua'), files('ta_signal_query/ta_signal_query.lua'), files('ta_update/ta_update.lua'), + files('watchdog/watchdog.lua'), files('workarounds/workarounds.lua'), ] @@ -52,9 +53,6 @@ subdir('policy') subdir('refuse_nord') subdir('stats') subdir('view') -if libsystemd.found() and libsystemd.version().version_compare('>=183') - subdir('sd_watchdog') -endif # install lua modules foreach mod : lua_mod_src diff --git a/modules/sd_watchdog/README.rst b/modules/sd_watchdog/README.rst deleted file mode 100644 index e4301af57..000000000 --- a/modules/sd_watchdog/README.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _mod-bogus_log: - -Systemd watchdog ----------------- - -This module is loaded by default when compiled with systemd. It enables the use -systemd watchdog to restart the process in case it stops responding. The -upstream systemd unit files are configured to use this feature, which is turned -on with the ``WatchdogSec=`` directive in the service file. diff --git a/modules/sd_watchdog/meson.build b/modules/sd_watchdog/meson.build deleted file mode 100644 index ad0b134d8..000000000 --- a/modules/sd_watchdog/meson.build +++ /dev/null @@ -1,15 +0,0 @@ -# C module: sd_watchdog - -sd_watchdog_src = files([ - 'sd_watchdog.c', -]) -c_src_lint += sd_watchdog_src - -sd_watchdog_mod = shared_module( - 'sd_watchdog', - sd_watchdog_src, - include_directories: mod_inc_dir, - name_prefix: '', - install: true, - install_dir: modules_dir, -) diff --git a/modules/sd_watchdog/sd_watchdog.c b/modules/sd_watchdog/sd_watchdog.c deleted file mode 100644 index 3aeb7b051..000000000 --- a/modules/sd_watchdog/sd_watchdog.c +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (C) Knot Resolver contributors. 
Licensed under GNU GPLv3 or - * (at your option) any later version. See COPYING for text of the license. - * - * sd_watchdog module implements support for systemd watchdog supervision */ - -#include -#include - -#include "lib/module.h" - -struct watchdog_config { - bool enabled; - uint64_t timeout_usec; - uv_timer_t timer; -}; - -static void keepalive_ping(uv_timer_t *timer) -{ - // NOTE: in the future, some sanity checks could be used here - // It is generally recommended to ignore the return value of this call. - sd_notify(0, "WATCHDOG=1"); -} - -KR_EXPORT -int sd_watchdog_init(struct kr_module *module) -{ - struct watchdog_config *conf = calloc(1, sizeof(*conf)); - if (!conf) { - return kr_error(ENOMEM); - } - module->data = conf; - - /* Check if watchdog is enabled */ - int ret = sd_watchdog_enabled(1, &conf->timeout_usec); - if (ret < 0) { - kr_log_error("[sd_watchdog] error: %s\n", strerror(abs(ret))); - return kr_error(ret); - } - conf->enabled = ret > 0; - if (!conf->enabled) { - kr_log_verbose("[sd_watchdog] disabled (not required)\n"); - return kr_ok(); - } - - uint64_t delay_ms = (conf->timeout_usec / 1000) / 2; - if (delay_ms == 0) { - kr_log_error("[sd_watchdog] error: WatchdogSec= must be at least 2ms!\n"); - return kr_error(ENOTSUP); - } - - uv_loop_t *loop = uv_default_loop(); - uv_timer_init(loop, &conf->timer); - ret = uv_timer_start(&conf->timer, keepalive_ping, delay_ms, delay_ms); - if (ret != 0) { - kr_log_error("[sd_watchdog] error: failed to start uv_timer: %s\n", - uv_strerror(ret)); - conf->timer.loop = NULL; - return kr_error(ret); - } - - kr_log_verbose("[sd_watchdog] enabled (repeat: %"PRIu64" ms, timeout: %"PRIu64" ms)\n", - delay_ms, conf->timeout_usec / 1000); - - return kr_ok(); -} - -KR_EXPORT -int sd_watchdog_deinit(struct kr_module *module) -{ - struct watchdog_config *conf = module->data; - if (conf && conf->timer.loop == uv_default_loop()) { /* normal state */ - int ret = uv_timer_stop(&conf->timer); - if (ret != 0) { - 
kr_log_error("[sd_watchdog] error: failed to stop uv_timer: %s\n", - uv_strerror(ret)); - } - /* We have a problem: UV timer can't be closed immediately, - * but as soon as we return from _deinit(), we get dlclose() - * so no function from this module may be usable anymore. */ - conf->timer.data = conf; - uv_close((uv_handle_t *)&conf->timer, kr_uv_free_cb); - } else { /* watchdog might be just disabled */ - free(conf); - } - return kr_ok(); -} - -KR_MODULE_EXPORT(sd_watchdog) diff --git a/modules/watchdog/README.rst b/modules/watchdog/README.rst new file mode 100644 index 000000000..04341e59c --- /dev/null +++ b/modules/watchdog/README.rst @@ -0,0 +1,41 @@ +.. _mod-watchdog: + +Watchdog +-------- + +This module cooperates with Systemd watchdog to restart the process in case +the internal event loop gets stuck. The upstream Systemd unit files are configured +to use this feature, which is turned on with the ``WatchdogSec=`` directive +in the service file. + +As an optional feature, this module can also do an internal DNS query to check if resolver +answers correctly. To use this feature you must configure DNS name and type to query for: + +.. code-block:: lua + + watchdog.config({ qname = 'nic.cz.', qtype = kres.type.A }) + +Each single query from watchdog must result in answer with +RCODE = NOERROR or NXDOMAIN. Any other result will terminate the resolver +(with exit code 69) to allow the supervisor process to do cleanup and restart +the resolver. + +It is recommended to use a name with a very short TTL to make sure the watchdog +is testing all parts of resolver and not only its cache. Obviously this check +makes sense only when used with very reliable domains; otherwise a failure +on authoritative side will shutdown resolver! + +`WatchdogSec` specifies deadline for supervisor when the process will be killed. +Watchdog queries are executed each `WatchdogSec / 2` seconds. 
+This implies that **half** of `WatchdogSec` interval must be long enough for
+normal DNS query to succeed, so do not forget to add two or three seconds
+for random network timeouts etc.
+
+The module is loaded by default. If you'd like to disable it you can unload it:
+
+.. code-block:: lua
+
+   modules.unload('watchdog')
+
+Beware that unloading the module without disabling watchdog feature in supervisor
+will lead to infinite restart loop.
diff --git a/modules/watchdog/watchdog.lua b/modules/watchdog/watchdog.lua
new file mode 100644
index 000000000..2251a4141
--- /dev/null
+++ b/modules/watchdog/watchdog.lua
@@ -0,0 +1,158 @@
+local ffi = require('ffi')
+
+ffi.cdef([[
+	int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
+	int sd_notify(int unset_environment, const char *state);
+]])
+
+local watchdog = {}
+local private = {}
+
+-- Default success callback does nothing, so that DNS checks configured via
+-- watchdog.config() also work when watchdog.init() did not find systemd.
+function private.ok_callback()
+end
+
+-- Success callback used with systemd: report that the resolver is alive.
+local function sd_signal_ok()
+	ffi.C.sd_notify(0, 'WATCHDOG=1')
+end
+
+-- Failure callback: log the reason and exit so the supervisor can restart us.
+function private.fail_callback()
+	log('[watchdog] TERMINATING resolver, supervisor is expected to restart it')
+	os.exit(69)  -- unclean exit code = EX_UNAVAILABLE
+end
+
+-- logging
+-- Returns a function which installs a trace logger on the given request;
+-- every traced message is formatted and appended to logbuf.
+local function add_tracer(logbuf)
+	return function (req)
+		local function qrylogger(qry, src, msg)
+			local req_uid = (qry and qry.request and qry.request.uid) or 0
+			local qry_uid = (qry and qry.uid) or 0
+			local logline = string.format("[%05u.%02u][%s] %s", req_uid, qry_uid, ffi.string(src), ffi.string(msg))
+			table.insert(logbuf, logline)
+			if verbose() then  -- without this message would be missing in verbose log
+				ffi.C.kr_log_qverbose_impl(qry, src, msg)
+			end
+		end
+		req.trace_log = ffi.cast('trace_log_f', qrylogger)
+	end
+end
+
+-- Returns a function which checks the answer to the watchdog query:
+-- RCODE NOERROR or NXDOMAIN calls private.ok_callback(), anything else dumps
+-- the collected trace log and the answer and calls private.fail_callback().
+local function check_answer(logbuf)
+	return function (pkt, req)
+		req.trace_log:free()  -- release the callback created by add_tracer()
+		local rcode = pkt:rcode()
+		if rcode == kres.rcode.NOERROR or rcode == kres.rcode.NXDOMAIN then
+			private.ok_callback()
+			return
+		end
+		log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
+		-- log() is used with printf-style formats in this file, so the trace
+		-- must be passed as an argument: it may contain '%' characters
+		log('%s', table.concat(logbuf, ''))
+		log('[watchdog] problematic answer:\n%s', pkt)
+		-- failure! quit immediately to allow process supervisor to restart us
+		private.fail_callback()
+	end
+end
+private.check_answer_callback = check_answer
+
+-- Periodic check: run the configured DNS query, or signal success right away
+-- when no query name is configured.
+local function timer()
+	local logbuf = {}
+	-- fire watchdog query
+	if private.qname and private.qtype then
+		if verbose() then
+			log('[watchdog] starting watchdog query %s %s', private.qname, private.qtype)
+		end
+		resolve(private.qname,
+			private.qtype,
+			kres.class.IN,
+			{'TRACE'},
+			private.check_answer_callback(logbuf),
+			add_tracer(logbuf))
+	else
+		private.ok_callback()
+	end
+end
+
+-- Read or change the configuration.
+-- Without an argument it only returns the current (private) state.
+-- cfg.interval is the check period in ms (defaults to the previous value or
+-- 10000), cfg.qname and cfg.qtype select the optional DNS query.
+function watchdog.config(cfg)
+	-- read only
+	if not cfg then
+		return private
+	end
+
+	local interval = tonumber(cfg.interval or private.interval or 10000)
+	if not interval or interval < 1 then
+		error('[watchdog] interval must be >= 1 ms')
+	end
+	private.interval = interval
+
+	-- qname = nil will disable DNS queries
+	private.qname = cfg.qname
+	private.qtype = cfg.qtype or kres.type.A
+
+	-- restart timers
+	watchdog.deinit()
+	private.event = event.recurrent(private.interval, timer)
+	return private
+end
+
+-- automatically enable watchdog if it is configured in systemd
+function watchdog.init()
+	if private.event then
+		error('[watchdog] module is already loaded')
+	end
+	local timeoutptr = ffi.new('uint64_t[1]')
+	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
+	if not systemd_present then
+		if verbose() then
+			log('[watchdog] systemd library not detected')
+		end
+		return
+	end
+	private.ok_callback = sd_signal_ok
+	if ret < 0 then
+		-- error() does not format its message (its 2nd argument is a level)
+		error(string.format('[watchdog] %s', ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
+	elseif ret == 0 then
+		if verbose() then
+			log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
+		end
+		return
+	end
+	local timeout = tonumber(timeoutptr[0]) / 1000  -- convert to ms
+	local interval = timeout / 2  -- halve interval to make sure we are never late
+	if interval < 1 then
+		-- only logged here; watchdog.config() below raises the actual error
+		log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
+			tonumber(timeoutptr[0]))
+	end
+	watchdog.config({ interval = interval })
+	if verbose() then
+		log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
+			private.interval, timeout)
+	end
+end
+
+-- Stop the periodic check, if it is running.
+function watchdog.deinit()
+	if private.event then
+		event.cancel(private.event)
+		private.event = nil
+	end
+end
+
+return watchdog
diff --git a/systemd/kresd@.service.in b/systemd/kresd@.service.in
index 19531743b..d048782a3 100644
--- a/systemd/kresd@.service.in
+++ b/systemd/kresd@.service.in
@@ -12,6 +12,7 @@ ExecStart=@sbin_dir@/kresd --config=@etc_dir@/kresd.conf
 User=@user@
 TimeoutStopSec=10s
 WatchdogSec=10s
+RestartForceExitStatus=69
 Restart=on-abnormal
 LimitNOFILE=1048576
 Sockets=kresd.socket