From efc08a6491c731a8d809c280d99570ffa2228888 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vladim=C3=ADr=20=C4=8Cun=C3=A1t?= Date: Wed, 12 Sep 2018 14:59:46 +0200 Subject: [PATCH] cache: improve out-of-disk condition When suspect SIGBUS happens, print helpful error and try to remove the cache, so that the service might work again if auto-restarted. Theoretically we could longjmp() out of the SIGBUS handler, but that would be rather messy, so let the process die. --- NEWS | 8 +++++ daemon/main.c | 90 ++++++++++++++++++++++++++++++++++++------------- lib/cache/api.c | 14 +++++++- lib/cache/api.h | 6 ++++ 4 files changed, 93 insertions(+), 25 deletions(-) diff --git a/NEWS b/NEWS index feae4a9df..a29c77f8c 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,11 @@ +Knot Resolver 3.x.y (2018-mm-dd) +================================ + +Improvements +------------ +- cache: handle out-of-space SIGBUS slightly better (#197) + + Knot Resolver 3.0.0 (2018-08-20) ================================ diff --git a/daemon/main.c b/daemon/main.c index 5568e3476..a414b2627 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -15,21 +15,23 @@ */ #include +#include +#include +#include #include #include #include -#include -#include +#include + #include -#include -#include -#include -#include -#include #ifdef HAS_SYSTEMD #include #endif +#include +#include +#include +#include #include "lib/defines.h" #include "lib/resolve.h" #include "lib/dnssec.h" @@ -300,6 +302,38 @@ static void signal_handler(uv_signal_t *handle, int signum) uv_signal_stop(handle); } +/** SIGBUS -> attempt to remove the overflowing cache file and terminate via _exit(); never returns. Restricted to async-signal-safe calls (write, unlink, _exit). */ +static void sigbus_handler(int sig, siginfo_t *siginfo, void *ptr) +{ + /* We can't safely assume that printf-like functions work, but write() is OK. + * See POSIX for the safe functions, e.g. 
2017 version just above this link: + * http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_04 + */ + #define WRITE_ERR(err_charray) \ + (void)write(STDERR_FILENO, err_charray, sizeof(err_charray) - 1) /* minus one: do not emit the terminating NUL */ + const char msg_typical[] = + "\nSIGBUS received; this is most likely due to filling up the filesystem where cache resides.\n", + msg_unknown[] = "\nSIGBUS received, cause unknown.\n", + msg_deleted[] = "Cache file deleted.\n", + msg_del_fail[] = "Cache file deletion failed.\n", + msg_final[] = "kresd can not recover reliably by itself, exiting.\n"; + if (siginfo->si_code != BUS_ADRERR) { + WRITE_ERR(msg_unknown); + goto end; + } + WRITE_ERR(msg_typical); + if (!kr_cache_emergency_file_to_remove) goto end; + if (unlink(kr_cache_emergency_file_to_remove)) { + WRITE_ERR(msg_del_fail); + } else { + WRITE_ERR(msg_deleted); + } +end: + WRITE_ERR(msg_final); + _exit(128 - sig); /*< regular return from OS-raised SIGBUS can't work anyway; NOTE(review): shells report signal deaths as 128+sig, confirm the subtraction is intended */ + #undef WRITE_ERR +} + /** Split away port from the address. */ static const char *set_addr(char *addr, int *port) { @@ -700,28 +734,36 @@ int main(int argc, char **argv) goto cleanup; } - /* Workaround for https://github.com/libuv/libuv/issues/45 - * (Write after ECONNRESET crash.) */ - if (ret == 0 && signal(SIGPIPE, SIG_IGN) == SIG_ERR) { - kr_log_error("[system] can't block SIGPIPE signal: %s\n", - strerror(errno)); - ret = EXIT_FAILURE; - } + /* Catch some signals. 
*/ - if (ret != 0) { + loop = uv_default_loop(); + uv_signal_t sigint, sigterm; + if (true) ret = uv_signal_init(loop, &sigint); + if (!ret) ret = uv_signal_init(loop, &sigterm); + if (!ret) ret = uv_signal_start(&sigint, signal_handler, SIGINT); + if (!ret) ret = uv_signal_start(&sigterm, signal_handler, SIGTERM); + /* Block SIGPIPE; see https://github.com/libuv/libuv/issues/45 */ + if (!ret && signal(SIGPIPE, SIG_IGN) == SIG_ERR) ret = errno; + if (!ret) { + /* Catching SIGBUS via uv_signal_* can't work; see: + * https://github.com/libuv/libuv/pull/1987 */ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = sigbus_handler; + sa.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &sa, NULL)) { + ret = errno; + } + } + if (ret) { + kr_log_error("[system] failed to set up signal handlers: %s\n", + strerror(abs(ret))); /* ret holds the failure: negative from uv_signal_*, positive errno from signal()/sigaction() */ + ret = EXIT_FAILURE; goto cleanup; } - engine_set_moduledir(&engine, args.moduledir); - - /* Block signals. */ - loop = uv_default_loop(); - uv_signal_t sigint, sigterm; - uv_signal_init(loop, &sigint); - uv_signal_init(loop, &sigterm); - uv_signal_start(&sigint, signal_handler, SIGINT); - uv_signal_start(&sigterm, signal_handler, SIGTERM); /* Start the scripting engine */ + engine_set_moduledir(&engine, args.moduledir); worker->loop = loop; loop->data = worker; diff --git a/lib/cache/api.c b/lib/cache/api.c index 34b89b9d5..df9b43915 100644 --- a/lib/cache/api.c +++ b/lib/cache/api.c @@ -130,10 +130,20 @@ int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct k cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX; /* Check cache ABI version */ kr_cache_make_checkpoint(cache); - (void) assert_right_version(cache); + (void)assert_right_version(cache); + + char *fpath; + ret = asprintf(&fpath, "%s/data.mdb", opts->path); + if (ret > 0) { + kr_cache_emergency_file_to_remove = fpath; + } else { + assert(false); /* non-critical, but still */ + } return 0; } +const char *kr_cache_emergency_file_to_remove = NULL; + #define 
cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db) @@ -143,6 +153,8 @@ void kr_cache_close(struct kr_cache *cache) cache_op(cache, close); cache->db = NULL; } + free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove); + kr_cache_emergency_file_to_remove = NULL; } int kr_cache_sync(struct kr_cache *cache) diff --git a/lib/cache/api.h b/lib/cache/api.h index 84d0f07db..61bf796a9 100644 --- a/lib/cache/api.h +++ b/lib/cache/api.h @@ -65,6 +65,12 @@ struct kr_cache KR_EXPORT int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm); +/** + * Path to cache file to remove on critical out-of-space error. (do NOT modify it) Set by kr_cache_open(), freed and reset to NULL by kr_cache_close(); NULL when unset. + */ +KR_EXPORT extern +const char *kr_cache_emergency_file_to_remove; + /** * Close persistent cache. * @note This doesn't clear the data, just closes the connection to the database. -- 2.47.3