lua_config.set('keyfile_default', keyfile_default)
lua_config.set('etc_dir', etc_dir)
lua_config.set('unmanaged', managed_ta ? 'false' : 'true')
-if libsystemd.found() and libsystemd.version().version_compare('>=183')
- lua_config.set('sd_watchdog', 'modules.load(\'sd_watchdog\')')
-else
- lua_config.set('sd_watchdog', '')
-endif
trust_anchors = configure_file(
input: 'trust_anchors.lua.in',
-- Load default modules
trust_anchors = require('trust_anchors')
-@sd_watchdog@
modules.load('ta_update')
modules.load('ta_signal_query')
modules.load('policy')
modules.load('ta_sentinel')
modules.load('edns_keepalive')
modules.load('refuse_nord')
+modules.load('watchdog')
-- Load keyfile_default
trust_anchors.add_file('@keyfile_default@', @unmanaged@)
.. include:: ../modules/edns_keepalive/README.rst
.. include:: ../modules/experimental_dot_auth/README.rst
.. include:: ../modules/refuse_nord/README.rst
+.. include:: ../modules/watchdog/README.rst
files('ta_sentinel/ta_sentinel.lua'),
files('ta_signal_query/ta_signal_query.lua'),
files('ta_update/ta_update.lua'),
+ files('watchdog/watchdog.lua'),
files('workarounds/workarounds.lua'),
]
subdir('refuse_nord')
subdir('stats')
subdir('view')
-if libsystemd.found() and libsystemd.version().version_compare('>=183')
- subdir('sd_watchdog')
-endif
# install lua modules
foreach mod : lua_mod_src
+++ /dev/null
-.. _mod-bogus_log:
-
-Systemd watchdog
-----------------
-
-This module is loaded by default when compiled with systemd. It enables the use
-systemd watchdog to restart the process in case it stops responding. The
-upstream systemd unit files are configured to use this feature, which is turned
-on with the ``WatchdogSec=`` directive in the service file.
+++ /dev/null
-# C module: sd_watchdog
-
-sd_watchdog_src = files([
- 'sd_watchdog.c',
-])
-c_src_lint += sd_watchdog_src
-
-sd_watchdog_mod = shared_module(
- 'sd_watchdog',
- sd_watchdog_src,
- include_directories: mod_inc_dir,
- name_prefix: '',
- install: true,
- install_dir: modules_dir,
-)
+++ /dev/null
-/* Copyright (C) Knot Resolver contributors. Licensed under GNU GPLv3 or
- * (at your option) any later version. See COPYING for text of the license.
- *
- * sd_watchdog module implements support for systemd watchdog supervision */
-
-#include <systemd/sd-daemon.h>
-#include <uv.h>
-
-#include "lib/module.h"
-
-struct watchdog_config {
- bool enabled;
- uint64_t timeout_usec;
- uv_timer_t timer;
-};
-
-static void keepalive_ping(uv_timer_t *timer)
-{
- // NOTE: in the future, some sanity checks could be used here
- // It is generally recommended to ignore the return value of this call.
- sd_notify(0, "WATCHDOG=1");
-}
-
-KR_EXPORT
-int sd_watchdog_init(struct kr_module *module)
-{
- struct watchdog_config *conf = calloc(1, sizeof(*conf));
- if (!conf) {
- return kr_error(ENOMEM);
- }
- module->data = conf;
-
- /* Check if watchdog is enabled */
- int ret = sd_watchdog_enabled(1, &conf->timeout_usec);
- if (ret < 0) {
- kr_log_error("[sd_watchdog] error: %s\n", strerror(abs(ret)));
- return kr_error(ret);
- }
- conf->enabled = ret > 0;
- if (!conf->enabled) {
- kr_log_verbose("[sd_watchdog] disabled (not required)\n");
- return kr_ok();
- }
-
- uint64_t delay_ms = (conf->timeout_usec / 1000) / 2;
- if (delay_ms == 0) {
- kr_log_error("[sd_watchdog] error: WatchdogSec= must be at least 2ms!\n");
- return kr_error(ENOTSUP);
- }
-
- uv_loop_t *loop = uv_default_loop();
- uv_timer_init(loop, &conf->timer);
- ret = uv_timer_start(&conf->timer, keepalive_ping, delay_ms, delay_ms);
- if (ret != 0) {
- kr_log_error("[sd_watchdog] error: failed to start uv_timer: %s\n",
- uv_strerror(ret));
- conf->timer.loop = NULL;
- return kr_error(ret);
- }
-
- kr_log_verbose("[sd_watchdog] enabled (repeat: %"PRIu64" ms, timeout: %"PRIu64" ms)\n",
- delay_ms, conf->timeout_usec / 1000);
-
- return kr_ok();
-}
-
-KR_EXPORT
-int sd_watchdog_deinit(struct kr_module *module)
-{
- struct watchdog_config *conf = module->data;
- if (conf && conf->timer.loop == uv_default_loop()) { /* normal state */
- int ret = uv_timer_stop(&conf->timer);
- if (ret != 0) {
- kr_log_error("[sd_watchdog] error: failed to stop uv_timer: %s\n",
- uv_strerror(ret));
- }
- /* We have a problem: UV timer can't be closed immediately,
- * but as soon as we return from _deinit(), we get dlclose()
- * so no function from this module may be usable anymore. */
- conf->timer.data = conf;
- uv_close((uv_handle_t *)&conf->timer, kr_uv_free_cb);
- } else { /* watchdog might be just disabled */
- free(conf);
- }
- return kr_ok();
-}
-
-KR_MODULE_EXPORT(sd_watchdog)
--- /dev/null
+.. _mod-watchdog:
+
+Watchdog
+--------
+
+This module cooperates with Systemd watchdog to restart the process in case
+the internal event loop gets stuck. The upstream Systemd unit files are configured
+to use this feature, which is turned on with the ``WatchdogSec=`` directive
+in the service file.
+
+As an optional feature, this module can also send an internal DNS query to check whether
+the resolver answers correctly. To use this feature you must configure the DNS name and
+type to query for:
+
+.. code-block:: lua
+
+ watchdog.config({ qname = 'nic.cz.', qtype = kres.type.A })
+
+Each single query from watchdog must result in answer with
+RCODE = NOERROR or NXDOMAIN. Any other result will terminate the resolver
+(with exit code 69) to allow the supervisor process to do cleanup and restart
+the resolver.
+
+It is recommended to use a name with a very short TTL to make sure the watchdog
+is testing all parts of the resolver and not only its cache. Obviously this check
+makes sense only when used with very reliable domains; otherwise a failure
+on the authoritative side will shut down the resolver!
+
+``WatchdogSec`` specifies the deadline after which the supervisor will kill the process.
+Watchdog queries are executed every ``WatchdogSec / 2`` seconds.
+This implies that **half** of the ``WatchdogSec`` interval must be long enough for
+a normal DNS query to succeed, so do not forget to add two or three seconds
+for random network timeouts etc.
+
+The module is loaded by default. If you'd like to disable it you can unload it:
+
+.. code-block:: lua
+
+ modules.unload('watchdog')
+
+Beware that unloading the module without disabling the watchdog feature in the supervisor
+will lead to an infinite restart loop.
--- /dev/null
+local ffi = require('ffi')
+
+ffi.cdef([[
+ int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
+ int sd_notify(int unset_environment, const char *state);
+]])
+
+local watchdog = {}
+local private = {}
+
+-- Send the systemd watchdog keepalive notification ("WATCHDOG=1").
+local sd_signal_ok = function ()
+	ffi.C.sd_notify(0, 'WATCHDOG=1')
+end
+
+--- Terminate the resolver process after a failed watchdog check.
+-- Logs a message first and then exits with status 69 (EX_UNAVAILABLE).
+private.fail_callback = function ()
+	local exit_code = 69 -- unclean exit code = EX_UNAVAILABLE
+	log('[watchdog] TERMINATING resolver, supervisor is expected to restart it')
+	os.exit(exit_code)
+end
+
+-- logging
+--- Build a request initializer which records the query trace into `logbuf`.
+-- The returned function installs a `trace_log_f` FFI callback on the request;
+-- each traced message is appended to `logbuf` as one formatted line.
+-- @param logbuf table (array) which receives the formatted trace lines
+-- @return function(req) suitable as the init callback passed to resolve()
+local function add_tracer(logbuf)
+	return function (req)
+		local function qrylogger(qry, src, msg)
+			-- cdata pointers are always truthy in LuaJIT, even when NULL,
+			-- so NULL must be detected by an explicit comparison with nil
+			local req_uid, qry_uid = 0, 0
+			if qry ~= nil then
+				qry_uid = qry.uid or 0
+				if qry.request ~= nil then
+					req_uid = qry.request.uid or 0
+				end
+			end
+			local logline = string.format("[%05u.%02u][%s] %s", req_uid, qry_uid, ffi.string(src), ffi.string(msg))
+			table.insert(logbuf, logline)
+			if verbose() then -- without this message would be missing in verbose log
+				ffi.C.kr_log_qverbose_impl(qry, src, msg)
+			end
+		end
+		-- the callback object is a finite resource; check_answer() frees it
+		req.trace_log = ffi.cast('trace_log_f', qrylogger)
+	end
+end
+
+--- Build the completion callback for one watchdog query.
+-- @param logbuf table (array) of trace lines collected for this query
+-- @return function(pkt, req) which frees the trace callback and then either
+--         signals success (RCODE NOERROR or NXDOMAIN) or logs the collected
+--         trace together with the answer and calls private.fail_callback()
+local function check_answer(logbuf)
+	return function (pkt, req)
+		req.trace_log:free()
+		local rcode = pkt:rcode()
+		if rcode == kres.rcode.NOERROR or rcode == kres.rcode.NXDOMAIN then
+			-- ok_callback is installed by init() only when systemd was detected
+			if private.ok_callback then
+				private.ok_callback()
+			end
+			return
+		end
+		log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
+		-- pass the trace as an argument: log() treats its first parameter
+		-- as a format string and the trace may contain '%' characters
+		log('%s', table.concat(logbuf, ''))
+		log('[watchdog] problematic answer:\n%s', pkt)
+		-- failure! quit immediately to allow process supervisor to restart us
+		private.fail_callback()
+	end
+end
+private.check_answer_callback = check_answer
+
+--- Periodic watchdog tick, scheduled by watchdog.config().
+-- With qname/qtype configured it fires a traced DNS query whose result is
+-- judged by private.check_answer_callback; otherwise it signals success directly.
+local function timer()
+	if not (private.qname and private.qtype) then
+		-- no DNS check configured; ok_callback is installed by init() only
+		-- when systemd was detected, so it may legitimately be missing
+		if private.ok_callback then
+			private.ok_callback()
+		end
+		return
+	end
+	if verbose() then
+		log('[watchdog] starting watchdog query %s %s', private.qname, private.qtype)
+	end
+	-- fire watchdog query; logbuf is shared by the tracer and the answer check
+	local logbuf = {}
+	resolve(private.qname,
+		private.qtype,
+		kres.class.IN,
+		{'TRACE'},
+		private.check_answer_callback(logbuf),
+		add_tracer(logbuf))
+end
+
+--- Read or change the watchdog configuration.
+-- Called without a table it only returns the current settings.
+-- @param cfg optional table with fields `interval` (ms), `qname` and `qtype`
+-- @return the table holding the module's private state
+function watchdog.config(cfg)
+	if not cfg then
+		-- read only
+		return private
+	end
+
+	local requested = cfg.interval or private.interval or 10000
+	local interval = tonumber(requested)
+	if interval == nil or interval < 1 then
+		error('[watchdog] interval must be >= 1 ms')
+	end
+	private.interval = interval
+
+	-- qname = nil will disable DNS queries
+	private.qname = cfg.qname
+	private.qtype = cfg.qtype or kres.type.A
+
+	-- restart timers: cancel the old event before scheduling a new one
+	watchdog.deinit()
+	private.event = event.recurrent(interval, timer)
+	return private
+end
+
+--- Module entry point: automatically enable the watchdog if it is configured
+-- in systemd. Returns silently when the systemd library is not available or
+-- the watchdog is not requested; raises an error when the module is already
+-- loaded or sd_watchdog_enabled() reports a failure.
+function watchdog.init()
+	if private.event then
+		error('[watchdog] module is already loaded')
+	end
+	local timeoutptr = ffi.new('uint64_t[1]')
+	-- pcall: the call fails with a Lua error when the symbol cannot be resolved
+	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
+	if not systemd_present then
+		if verbose() then
+			log('[watchdog] systemd library not detected')
+		end
+		return
+	end
+	private.ok_callback = sd_signal_ok
+	if ret < 0 then
+		-- error() takes (message, level), not printf-style arguments,
+		-- so the message has to be formatted explicitly
+		-- NOTE(review): confirm knot_strerror() expects the absolute value here
+		error(string.format('[watchdog] %s', ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
+	elseif ret == 0 then
+		if verbose() then
+			log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
+		end
+		return
+	end
+	local timeout = tonumber(timeoutptr[0]) / 1000 -- convert to ms
+	local interval = timeout / 2 -- halve interval to make sure we are never late
+	if interval < 1 then
+		-- only logged here; watchdog.config() below rejects the interval
+		log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
+			tonumber(timeoutptr[0]))
+	end
+	watchdog.config({ interval = interval })
+	if verbose() then
+		log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
+			private.interval, timeout)
+	end
+end
+
+--- Stop the watchdog: cancel the recurrent timer event, if one is scheduled.
+function watchdog.deinit()
+	local scheduled = private.event
+	if not scheduled then
+		return
+	end
+	event.cancel(scheduled)
+	private.event = nil
+end
+
+return watchdog
User=@user@
TimeoutStopSec=10s
WatchdogSec=10s
+RestartForceExitStatus=69
Restart=on-abnormal
LimitNOFILE=1048576
Sockets=kresd.socket