]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
watchdog rewrite obs-knot-resolver-bs4hbr/deployments/282
authorPetr Špaček <petr.spacek@nic.cz>
Tue, 8 Oct 2019 12:56:12 +0000 (14:56 +0200)
committerPetr Špaček <petr.spacek@nic.cz>
Tue, 8 Oct 2019 15:24:22 +0000 (17:24 +0200)
The watchdog module now can be loaded without systemd, has customisable
callbacks, and can do real DNS queries and check their results.

daemon/lua/meson.build
daemon/lua/sandbox.lua.in
doc/modules.rst
modules/meson.build
modules/sd_watchdog/README.rst [deleted file]
modules/sd_watchdog/meson.build [deleted file]
modules/sd_watchdog/sd_watchdog.c [deleted file]
modules/watchdog/README.rst [new file with mode: 0644]
modules/watchdog/watchdog.lua [new file with mode: 0644]
systemd/kresd@.service.in

index 2555321149735e031f415f8cf402c4d381bed08c..bd652141c44e227204f0cb7e0b1457935be27fcf 100644 (file)
@@ -9,11 +9,6 @@ lua_config = configuration_data()
 lua_config.set('keyfile_default', keyfile_default)
 lua_config.set('etc_dir', etc_dir)
 lua_config.set('unmanaged', managed_ta ? 'false' : 'true')
-if libsystemd.found() and libsystemd.version().version_compare('>=183')
-  lua_config.set('sd_watchdog', 'modules.load(\'sd_watchdog\')')
-else
-  lua_config.set('sd_watchdog', '')
-endif
 
 trust_anchors = configure_file(
   input: 'trust_anchors.lua.in',
index cbe020d31808db4e888e241f36a8dd7b7636099e..b3b9394727b45741ad08bf286917216151dfc0ef 100644 (file)
@@ -415,7 +415,6 @@ setfenv(0, _G)
 
 -- Load default modules
 trust_anchors = require('trust_anchors')
-@sd_watchdog@
 modules.load('ta_update')
 modules.load('ta_signal_query')
 modules.load('policy')
@@ -425,6 +424,7 @@ modules.load('detect_time_jump')
 modules.load('ta_sentinel')
 modules.load('edns_keepalive')
 modules.load('refuse_nord')
+modules.load('watchdog')
 
 -- Load keyfile_default
 trust_anchors.add_file('@keyfile_default@', @unmanaged@)
index c6a60f5c6464fa4e7025606d9fbc3b255b05104b..10e4a1b0e506ec45bbbd0c44fd72d6e20bf81e32 100644 (file)
@@ -37,3 +37,4 @@ Modules
 .. include:: ../modules/edns_keepalive/README.rst
 .. include:: ../modules/experimental_dot_auth/README.rst
 .. include:: ../modules/refuse_nord/README.rst
+.. include:: ../modules/watchdog/README.rst
index b0bcc4dbb0efa0b9b8a6a0df8f82e93de8b47c6c..215475a7093b52e95d26f5636efc11df910e61f3 100644 (file)
@@ -15,6 +15,7 @@ lua_mod_src = [  # add lua modules without separate meson.build
   files('ta_sentinel/ta_sentinel.lua'),
   files('ta_signal_query/ta_signal_query.lua'),
   files('ta_update/ta_update.lua'),
+  files('watchdog/watchdog.lua'),
   files('workarounds/workarounds.lua'),
 ]
 
@@ -52,9 +53,6 @@ subdir('policy')
 subdir('refuse_nord')
 subdir('stats')
 subdir('view')
-if libsystemd.found() and libsystemd.version().version_compare('>=183')
-  subdir('sd_watchdog')
-endif
 
 # install lua modules
 foreach mod : lua_mod_src
diff --git a/modules/sd_watchdog/README.rst b/modules/sd_watchdog/README.rst
deleted file mode 100644 (file)
index e4301af..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-.. _mod-bogus_log:
-
-Systemd watchdog
-----------------
-
-This module is loaded by default when compiled with systemd. It enables the use
-systemd watchdog to restart the process in case it stops responding.  The
-upstream systemd unit files are configured to use this feature, which is turned
-on with the ``WatchdogSec=`` directive in the service file.
diff --git a/modules/sd_watchdog/meson.build b/modules/sd_watchdog/meson.build
deleted file mode 100644 (file)
index ad0b134..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-# C module: sd_watchdog
-
-sd_watchdog_src = files([
-  'sd_watchdog.c',
-])
-c_src_lint += sd_watchdog_src
-
-sd_watchdog_mod = shared_module(
-  'sd_watchdog',
-  sd_watchdog_src,
-  include_directories: mod_inc_dir,
-  name_prefix: '',
-  install: true,
-  install_dir: modules_dir,
-)
diff --git a/modules/sd_watchdog/sd_watchdog.c b/modules/sd_watchdog/sd_watchdog.c
deleted file mode 100644 (file)
index 3aeb7b0..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (C) Knot Resolver contributors. Licensed under GNU GPLv3 or
- * (at your option) any later version. See COPYING for text of the license.
- *
- * sd_watchdog module implements support for systemd watchdog supervision */
-
-#include <systemd/sd-daemon.h>
-#include <uv.h>
-
-#include "lib/module.h"
-
-struct watchdog_config {
-       bool enabled;
-       uint64_t timeout_usec;
-       uv_timer_t timer;
-};
-
-static void keepalive_ping(uv_timer_t *timer)
-{
-       // NOTE: in the future, some sanity checks could be used here
-       // It is generally recommended to ignore the return value of this call.
-       sd_notify(0, "WATCHDOG=1");
-}
-
-KR_EXPORT
-int sd_watchdog_init(struct kr_module *module)
-{
-       struct watchdog_config *conf = calloc(1, sizeof(*conf));
-       if (!conf) {
-               return kr_error(ENOMEM);
-       }
-       module->data = conf;
-
-       /* Check if watchdog is enabled */
-       int ret = sd_watchdog_enabled(1, &conf->timeout_usec);
-       if (ret < 0) {
-               kr_log_error("[sd_watchdog] error: %s\n", strerror(abs(ret)));
-               return kr_error(ret);
-       }
-       conf->enabled = ret > 0;
-       if (!conf->enabled) {
-               kr_log_verbose("[sd_watchdog] disabled (not required)\n");
-               return kr_ok();
-       }
-
-       uint64_t delay_ms = (conf->timeout_usec / 1000) / 2;
-       if (delay_ms == 0) {
-               kr_log_error("[sd_watchdog] error: WatchdogSec= must be at least 2ms!\n");
-               return kr_error(ENOTSUP);
-       }
-
-       uv_loop_t *loop = uv_default_loop();
-       uv_timer_init(loop, &conf->timer);
-       ret = uv_timer_start(&conf->timer, keepalive_ping, delay_ms, delay_ms);
-       if (ret != 0) {
-               kr_log_error("[sd_watchdog] error: failed to start uv_timer: %s\n",
-                               uv_strerror(ret));
-               conf->timer.loop = NULL;
-               return kr_error(ret);
-       }
-
-       kr_log_verbose("[sd_watchdog] enabled (repeat: %"PRIu64" ms, timeout: %"PRIu64" ms)\n",
-               delay_ms, conf->timeout_usec / 1000);
-
-       return kr_ok();
-}
-
-KR_EXPORT
-int sd_watchdog_deinit(struct kr_module *module)
-{
-       struct watchdog_config *conf = module->data;
-       if (conf && conf->timer.loop == uv_default_loop()) { /* normal state */
-               int ret = uv_timer_stop(&conf->timer);
-               if (ret != 0) {
-                       kr_log_error("[sd_watchdog] error: failed to stop uv_timer: %s\n",
-                                       uv_strerror(ret));
-               }
-               /* We have a problem: UV timer can't be closed immediately,
-                * but as soon as we return from _deinit(), we get dlclose()
-                * so no function from this module may be usable anymore. */
-               conf->timer.data = conf;
-               uv_close((uv_handle_t *)&conf->timer, kr_uv_free_cb);
-       } else { /* watchdog might be just disabled */
-               free(conf);
-       }
-       return kr_ok();
-}
-
-KR_MODULE_EXPORT(sd_watchdog)
diff --git a/modules/watchdog/README.rst b/modules/watchdog/README.rst
new file mode 100644 (file)
index 0000000..04341e5
--- /dev/null
@@ -0,0 +1,41 @@
+.. _mod-watchdog:
+
+Watchdog
+--------
+
+This module cooperates with Systemd watchdog to restart the process in case
+the internal event loop gets stuck. The upstream Systemd unit files are configured
+to use this feature, which is turned on with the ``WatchdogSec=`` directive
+in the service file.
+
+As an optional feature, this module can also do an internal DNS query to check if the resolver
+answers correctly. To use this feature you must configure the DNS name and type to query for:
+
+.. code-block:: lua
+
+       watchdog.config({ qname = 'nic.cz.', qtype = kres.type.A })
+
+Each single query from the watchdog must result in an answer with
+RCODE = NOERROR or NXDOMAIN. Any other result will terminate the resolver
+(with exit code 69) to allow the supervisor process to do cleanup and restart
+the resolver.
+
+It is recommended to use a name with a very short TTL to make sure the watchdog
+is testing all parts of the resolver and not only its cache. Obviously this check
+makes sense only when used with very reliable domains; otherwise a failure
+on the authoritative side will shut down the resolver!
+
+``WatchdogSec`` specifies the deadline after which the supervisor will kill the unresponsive process.
+Watchdog queries are executed every ``WatchdogSec / 2`` seconds.
+This implies that **half** of the ``WatchdogSec`` interval must be long enough for
+a normal DNS query to succeed, so do not forget to add two or three seconds
+for random network timeouts etc.
+
+The module is loaded by default. If you'd like to disable it you can unload it:
+
+.. code-block:: lua
+
+   modules.unload('watchdog')
+
+Beware that unloading the module without disabling the watchdog feature in the supervisor
+will lead to an infinite restart loop.
diff --git a/modules/watchdog/watchdog.lua b/modules/watchdog/watchdog.lua
new file mode 100644 (file)
index 0000000..2251a41
--- /dev/null
@@ -0,0 +1,135 @@
+local ffi = require('ffi')
+
+ffi.cdef([[
+       int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
+       int sd_notify(int unset_environment, const char *state);
+]])
+
+local watchdog = {}
+local private = {}
+
+-- Report liveness to the systemd watchdog by sending the WATCHDOG=1 state.
+-- The return value of sd_notify() is intentionally not inspected.
+local function sd_signal_ok()
+       local state = 'WATCHDOG=1'
+       ffi.C.sd_notify(0, state)
+end
+
+-- Default failure handler: log the termination notice and leave the process
+-- with status 69 so that the exit is reported as unclean.
+private.fail_callback = function ()
+       local exit_code = 69  -- unclean exit code = EX_UNAVAILABLE
+       log('[watchdog] TERMINATING resolver, supervisor is expected to restart it')
+       os.exit(exit_code)
+end
+
+-- logging
+-- Returns a function that installs a trace callback on the request it is
+-- given. Every trace message is formatted and appended to logbuf; in verbose
+-- mode it is also passed on to kr_log_qverbose_impl().
+local function add_tracer(logbuf)
+       return function (req)
+               req.trace_log = ffi.cast('trace_log_f', function (qry, src, msg)
+                       -- both identifiers fall back to 0 when they are not available
+                       local req_uid = 0
+                       if qry and qry.request then
+                               req_uid = qry.request.uid or 0
+                       end
+                       local qry_uid = 0
+                       if qry then
+                               qry_uid = qry.uid or 0
+                       end
+                       logbuf[#logbuf + 1] = string.format("[%05u.%02u][%s] %s",
+                               req_uid, qry_uid, ffi.string(src), ffi.string(msg))
+                       if verbose() then  -- without this message would be missing in verbose log
+                               ffi.C.kr_log_qverbose_impl(qry, src, msg)
+                       end
+               end)
+       end
+end
+
+-- Returns the completion callback for a watchdog query. RCODE NOERROR and
+-- NXDOMAIN are treated as success; any other RCODE logs the collected trace
+-- and the answer, then invokes the failure callback.
+local function check_answer(logbuf)
+       return function (pkt, req)
+               req.trace_log:free()  -- the trace callback is not needed anymore
+               local healthy = pkt:rcode() == kres.rcode.NOERROR
+                       or pkt:rcode() == kres.rcode.NXDOMAIN
+               if not healthy then
+                       log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
+                       log(table.concat(logbuf, ''))
+                       log('[watchdog] problematic answer:\n%s', pkt)
+                       -- failure! quit immediately to allow process supervisor to restart us
+                       private.fail_callback()
+                       return
+               end
+               private.ok_callback()
+       end
+end
+private.check_answer_callback = check_answer
+
+-- Body of the recurrent timer: fire the configured watchdog query, or report
+-- success right away when no query name/type is configured.
+local function timer()
+       local name, rrtype = private.qname, private.qtype
+       if not (name and rrtype) then
+               private.ok_callback()
+               return
+       end
+       -- fire watchdog query
+       if verbose() then
+               log('[watchdog] starting watchdog query %s %s', name, rrtype)
+       end
+       local logbuf = {}  -- shared by the tracer and the answer check
+       local on_answer = private.check_answer_callback(logbuf)
+       local init_request = add_tracer(logbuf)
+       resolve(name, rrtype, kres.class.IN, {'TRACE'}, on_answer, init_request)
+end
+
+-- Read or change the watchdog configuration.
+-- Without an argument the settings table is returned untouched. Otherwise
+-- cfg.interval (ms), cfg.qname and cfg.qtype are applied, the recurrent timer
+-- is restarted and the settings table is returned.
+-- Raises an error when the resulting interval is not a number >= 1.
+function watchdog.config(cfg)
+       if cfg then
+               local period_ms = tonumber(cfg.interval or private.interval or 10000)
+               if period_ms == nil or period_ms < 1 then
+                       error('[watchdog] interval must be >= 1 ms')
+               end
+               private.interval = period_ms
+
+               -- qname = nil will disable DNS queries
+               private.qname = cfg.qname
+               private.qtype = cfg.qtype or kres.type.A
+
+               -- restart timers
+               watchdog.deinit()
+               private.event = event.recurrent(period_ms, timer)
+       end
+       return private
+end
+
+-- automatically enable watchdog if it is configured in systemd
+-- Probes sd_watchdog_enabled(): when the systemd library cannot be called or
+-- the watchdog is not requested, the module stays idle. Otherwise the check
+-- is scheduled at half of the timeout reported by systemd.
+-- Raises an error if the module is already running or if the probe fails.
+function watchdog.init()
+       if private.event then
+               error('[watchdog] module is already loaded')
+       end
+       local timeoutptr = ffi.new('uint64_t[1]')
+       -- pcall guards against the symbol being unavailable in this process
+       local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
+       if not systemd_present then
+               if verbose() then
+                       log('[watchdog] systemd library not detected')
+               end
+               return
+       end
+       private.ok_callback = sd_signal_ok
+       if ret < 0 then
+               -- error() does not format its arguments (the second one is the
+               -- stack level), so the message has to be built explicitly
+               error(string.format('[watchdog] %s', ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
+       elseif ret == 0 then
+               if verbose() then
+                       log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
+               end
+               return
+       end
+       local timeout = tonumber(timeoutptr[0]) / 1000  -- convert to ms
+       local interval = timeout / 2  -- halve interval to make sure we are never late
+       if interval < 1 then
+               -- only logged here; watchdog.config() below rejects such an interval
+               log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
+                       tonumber(timeoutptr[0]))
+       end
+       watchdog.config({ interval = interval })
+       if verbose() then
+               log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
+                       private.interval, timeout)
+       end
+end
+
+-- Stop the recurrent watchdog timer, if one is running.
+function watchdog.deinit()
+       local running = private.event
+       if not running then
+               return
+       end
+       event.cancel(running)
+       private.event = nil
+end
+
+return watchdog
index 19531743b21383203bac8feb9d3b26ce2bd34352..d048782a3ac3d648fa343a611e69c7e64c219cae 100644 (file)
@@ -12,6 +12,7 @@ ExecStart=@sbin_dir@/kresd --config=@etc_dir@/kresd.conf
 User=@user@
 TimeoutStopSec=10s
 WatchdogSec=10s
+RestartForceExitStatus=69
 Restart=on-abnormal
 LimitNOFILE=1048576
 Sockets=kresd.socket