From 7c242178fee993571f58622fc66a53533a0ef3b9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vladim=C3=ADr=20=C4=8Cun=C3=A1t?= Date: Thu, 15 Apr 2021 12:07:09 +0200 Subject: [PATCH] lib/utils kr_fail(): simple rate limit for forking Default 5 minutes (but off). Randomize the delays +-25%. --- daemon/lua/kres-gen.lua | 2 +- daemon/lua/kres-gen.sh | 2 +- doc/config-debugging.rst | 14 +++++++++----- lib/utils.c | 20 +++++++++++++++++--- lib/utils.h | 16 +++++++++++----- meson.build | 6 +++++- 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/daemon/lua/kres-gen.lua b/daemon/lua/kres-gen.lua index e520790f1..b0100d726 100644 --- a/daemon/lua/kres-gen.lua +++ b/daemon/lua/kres-gen.lua @@ -302,7 +302,7 @@ struct kr_server_selection { kr_layer_t kr_layer_t_static; _Bool kr_dbg_assumption_abort; -_Bool kr_dbg_assumption_fork; +int kr_dbg_assumption_fork; typedef int32_t (*kr_stale_cb)(int32_t ttl, const knot_dname_t *owner, uint16_t type, const struct kr_query *qry); diff --git a/daemon/lua/kres-gen.sh b/daemon/lua/kres-gen.sh index 9263d4ee4..1a2b58df6 100755 --- a/daemon/lua/kres-gen.sh +++ b/daemon/lua/kres-gen.sh @@ -135,7 +135,7 @@ EOF printf " kr_layer_t kr_layer_t_static; _Bool kr_dbg_assumption_abort; -_Bool kr_dbg_assumption_fork; +int kr_dbg_assumption_fork; " printf " diff --git a/doc/config-debugging.rst b/doc/config-debugging.rst index 9de4295fd..f7eb6e888 100644 --- a/doc/config-debugging.rst +++ b/doc/config-debugging.rst @@ -10,21 +10,25 @@ process is out of the scope of this documentation, but some tips can be found Kresd uses *assumptions*, which are checks that should always pass and indicate some weird or unexpected state if they don't. In such cases, they show up in -the log as errors. By default, the process recovers from those states, but the +the log as errors. By default, the process recovers from those states if possible, but the behaviour can be changed with the following options to aid further debugging. .. envvar:: debugging.assumption_abort = false|true - :return: boolean (default: false) + :return: boolean (default: false in meson's release mode, true otherwise) Allow the process to be aborted in case it encounters a failed assumption. + (Some critical conditions always lead to abortion, regardless of settings.) -.. envvar:: debugging.assumption_fork = true|false +.. envvar:: debugging.assumption_fork = milliseconds - :return: boolean (default: true) + :return: int (default: 5 minutes in meson's release mode, 0 otherwise) If a proccess should be aborted, it can be done in two ways. When this is - set to true (default), a child is forked and aborted to obtain a coredump, + set to nonzero (default), a child is forked and aborted to obtain a coredump, while the parent process recovers and keeps running. This can be useful to debug a rare issue that occurs in production, since it doesn't affect the main process. + + As the dumping can be costly, the value is a lower bound on delay between + consecutive coredumps of each process. It is randomized by +-25% each time. diff --git a/lib/utils.c b/lib/utils.c index 85c929932..b3e9b549e 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -39,7 +39,7 @@ /* Logging & debugging */ bool kr_verbose_status = false; bool kr_dbg_assumption_abort = DBG_ASSUMPTION_ABORT; -bool kr_dbg_assumption_fork = DBG_ASSUMPTION_FORK; +int kr_dbg_assumption_fork = DBG_ASSUMPTION_FORK; void kr_fail(bool is_fatal, const char *expr, const char *func, const char *file, int line) { @@ -50,8 +50,22 @@ void kr_fail(bool is_fatal, const char *expr, const char *func, const char *file if (is_fatal || (kr_dbg_assumption_abort && !kr_dbg_assumption_fork)) abort(); - else if (kr_dbg_assumption_abort && kr_dbg_assumption_fork) - fork() == 0 ? abort() : (void)0; + else if (!kr_dbg_assumption_abort || !kr_dbg_assumption_fork) + return; + // We want to fork and abort the child, unless rate-limited. + static uint64_t limited_until = 0; + const uint64_t now = kr_now(); + if (now < limited_until) + return; + if (kr_dbg_assumption_fork > 0) { + // Add jitter +- 25%; in other words: 75% + uniform(0,50%). + // Motivation: if a persistent problem starts happening, desynchronize + // coredumps from different instances as they're not cheap. + limited_until = now + kr_dbg_assumption_fork * 3 / 4 + + kr_dbg_assumption_fork * kr_rand_bytes(1) / 256 / 2; + } + if (fork() == 0) + abort(); } /* diff --git a/lib/utils.h b/lib/utils.h index 588405000..fa53c1a58 100644 --- a/lib/utils.h +++ b/lib/utils.h @@ -64,13 +64,19 @@ typedef void (*trace_log_f)(const struct kr_request *request, const char *msg); /** Whether kr_assume() checks should abort. */ KR_EXPORT extern bool kr_dbg_assumption_abort; -/** Whether kr_assume() should fork the process before issuing abort (if configured). +/** How often kr_assume() should fork the process before issuing abort (if configured). * - * This can be useful for debugging rare edge-cases in production. When both - * kr_debug_assumption_abort and kr_debug_assumption_fork are set to true, it is + * This can be useful for debugging rare edge-cases in production. + * if (kr_debug_assumption_abort && kr_debug_assumption_fork), it is * possible to both obtain a coredump (from forked child) and recover from the - * non-fatal error in the parent process. */ -KR_EXPORT extern bool kr_dbg_assumption_fork; + * non-fatal error in the parent process. + * + * == 0 (false): no forking + * > 0: minimum delay between forks + * (in milliseconds, each instance separately, randomized +-25%) + * < 0: no rate-limiting (not recommended) + */ +KR_EXPORT extern int kr_dbg_assumption_fork; /** Use kr_require() and kr_assume() instead of directly this function. */ KR_EXPORT KR_COLD void kr_fail(bool is_fatal, const char* expr, const char *func, diff --git a/meson.build b/meson.build index 809fecc27..73b73d962 100644 --- a/meson.build +++ b/meson.build @@ -179,7 +179,11 @@ conf_data.set('ENABLE_XDP', xdp.to_int()) conf_data.set('ENABLE_CAP_NG', capng.found().to_int()) conf_data.set('ENABLE_DOH2', nghttp2.found().to_int()) conf_data.set('DBG_ASSUMPTION_ABORT', get_option('debug').to_int()) -conf_data.set('DBG_ASSUMPTION_FORK', (not get_option('debug')).to_int()) +if get_option('debug') + conf_data.set('DBG_ASSUMPTION_FORK', '0') +else + conf_data.set('DBG_ASSUMPTION_FORK', '(5 * 60 * 1000) /* five minutes */') +endif kresconfig = configure_file( output: 'kresconfig.h', -- 2.47.2