]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
cache: improve out-of-disk condition
authorVladimír Čunát <vladimir.cunat@nic.cz>
Wed, 12 Sep 2018 12:59:46 +0000 (14:59 +0200)
committerVladimír Čunát <vladimir.cunat@nic.cz>
Fri, 14 Sep 2018 06:48:29 +0000 (08:48 +0200)
When a suspected out-of-disk SIGBUS happens, print a helpful error message and
try to remove the cache file, so that the service might work again if auto-restarted.
Theoretically we could longjmp() out of the SIGBUS handler,
but that would be rather messy, so let the process die.

NEWS
daemon/main.c
lib/cache/api.c
lib/cache/api.h

diff --git a/NEWS b/NEWS
index feae4a9df7776d4e88addd15ea2353a88e3dc977..a29c77f8c73d69424475d54eb010605a708fc32f 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,11 @@
+Knot Resolver 3.x.y (2018-mm-dd)
+================================
+
+Improvements
+------------
+- cache: handle out-of-space SIGBUS slightly better (#197)
+
+
 Knot Resolver 3.0.0 (2018-08-20)
 ================================
 
index 5568e34763e50b183be58c0ed6a36a36b729929d..a414b2627c2180cbfe3945e203d266e7de9e52fa 100644 (file)
  */
 
 #include <arpa/inet.h>
+#include <assert.h>
+#include <getopt.h>
+#include <libgen.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
-#include <getopt.h>
-#include <libgen.h>
+#include <unistd.h>
+
 #include <uv.h>
-#include <assert.h>
-#include <contrib/cleanup.h>
-#include <contrib/ucw/mempool.h>
-#include <contrib/ccan/asprintf/asprintf.h>
-#include <libknot/error.h>
 #ifdef HAS_SYSTEMD
 #include <systemd/sd-daemon.h>
 #endif
+#include <libknot/error.h>
 
+#include <contrib/cleanup.h>
+#include <contrib/ucw/mempool.h>
+#include <contrib/ccan/asprintf/asprintf.h>
 #include "lib/defines.h"
 #include "lib/resolve.h"
 #include "lib/dnssec.h"
@@ -300,6 +302,38 @@ static void signal_handler(uv_signal_t *handle, int signum)
        uv_signal_stop(handle);
 }
 
+/** SIGBUS -> attempt to remove the overflowing cache file and abort.
+ *
+ * Reports the situation on stderr, unlinks the file named by
+ * kr_cache_emergency_file_to_remove (if set and si_code is BUS_ADRERR),
+ * and always terminates the process via _exit(); it never returns.
+ *
+ * @param sig     signal number being handled (used for the exit status)
+ * @param siginfo signal details; only si_code is inspected
+ * @param ptr     unused context pointer
+ */
+static void sigbus_handler(int sig, siginfo_t *siginfo, void *ptr)
+{
+       (void)ptr; /* unused */
+       /* We can't safely assume that printf-like functions work, but write() is OK.
+        * See POSIX for the safe functions, e.g. 2017 version just above this link:
+        * http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_04
+        */
+       /* sizeof() of a char array counts its terminating NUL; don't write that byte. */
+       #define WRITE_ERR(err_charray) \
+               (void)write(STDERR_FILENO, err_charray, sizeof(err_charray) - 1)
+       const char msg_typical[] =
+               "\nSIGBUS received; this is most likely due to filling up the filesystem where cache resides.\n",
+               msg_unknown[] = "\nSIGBUS received, cause unknown.\n",
+               msg_deleted[] = "Cache file deleted.\n",
+               msg_del_fail[] = "Cache file deletion failed.\n",
+               msg_final[] = "kresd can not recover reliably by itself, exiting.\n";
+       if (siginfo->si_code != BUS_ADRERR) {
+               WRITE_ERR(msg_unknown);
+               goto end;
+       }
+       WRITE_ERR(msg_typical);
+       /* No cache path recorded -> nothing to delete. */
+       if (!kr_cache_emergency_file_to_remove) goto end;
+       if (unlink(kr_cache_emergency_file_to_remove)) {
+               WRITE_ERR(msg_del_fail);
+       } else {
+               WRITE_ERR(msg_deleted);
+       }
+end:
+       WRITE_ERR(msg_final);
+       /* NOTE(review): shells conventionally report 128 + signum for signal deaths;
+        * 128 - sig is kept unchanged here -- confirm whether it is intentional. */
+       _exit(128 - sig); /*< regular return from OS-raised SIGBUS can't work anyway */
+       #undef WRITE_ERR
+}
+
 /** Split away port from the address. */
 static const char *set_addr(char *addr, int *port)
 {
@@ -700,28 +734,36 @@ int main(int argc, char **argv)
                goto cleanup;
        }
 
-       /* Workaround for https://github.com/libuv/libuv/issues/45
-        * (Write after ECONNRESET crash.) */
-       if (ret == 0 && signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
-               kr_log_error("[system] can't block SIGPIPE signal: %s\n",
-                               strerror(errno));
-               ret = EXIT_FAILURE;
-       }
+       /* Catch some signals. */
 
-       if (ret != 0) {
+       loop = uv_default_loop();
+       uv_signal_t sigint, sigterm;
+       if (true) ret = uv_signal_init(loop, &sigint);
+       if (!ret) ret = uv_signal_init(loop, &sigterm);
+       if (!ret) ret = uv_signal_start(&sigint, signal_handler, SIGINT);
+       if (!ret) ret = uv_signal_start(&sigterm, signal_handler, SIGTERM);
+       /* Block SIGPIPE; see https://github.com/libuv/libuv/issues/45 */
+       if (!ret && signal(SIGPIPE, SIG_IGN) == SIG_ERR) ret = errno;
+       if (!ret) {
+               /* Catching SIGBUS via uv_signal_* can't work; see:
+                * https://github.com/libuv/libuv/pull/1987 */
+               struct sigaction sa;
+               memset(&sa, 0, sizeof(sa));
+               sa.sa_sigaction = sigbus_handler;
+               sa.sa_flags = SA_SIGINFO;
+               if (sigaction(SIGBUS, &sa, NULL)) {
+                       ret = errno;
+               }
+       }
+       if (ret) {
+               kr_log_error("[system] failed to set up signal handlers: %s\n",
+                               strerror(abs(errno)));
+               ret = EXIT_FAILURE;
                goto cleanup;
        }
 
-       engine_set_moduledir(&engine, args.moduledir);
-       
-       /* Block signals. */
-       loop = uv_default_loop();
-       uv_signal_t sigint, sigterm;
-       uv_signal_init(loop, &sigint);
-       uv_signal_init(loop, &sigterm);
-       uv_signal_start(&sigint, signal_handler, SIGINT);
-       uv_signal_start(&sigterm, signal_handler, SIGTERM);
        /* Start the scripting engine */
+       engine_set_moduledir(&engine, args.moduledir);
        worker->loop = loop;
        loop->data = worker;
 
index 34b89b9d5aea86fd828f5eb39da36fb1d3716bbc..df9b439152fdcb8f7085aada6258a98eef4112b1 100644 (file)
@@ -130,10 +130,20 @@ int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct k
        cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX;
        /* Check cache ABI version */
        kr_cache_make_checkpoint(cache);
-       (void) assert_right_version(cache);
+       (void)assert_right_version(cache);
+
+       char *fpath;
+       ret = asprintf(&fpath, "%s/data.mdb", opts->path);
+       if (ret > 0) {
+               kr_cache_emergency_file_to_remove = fpath;
+       } else {
+               assert(false); /* non-critical, but still */
+       }
        return 0;
 }
 
+const char *kr_cache_emergency_file_to_remove = NULL;
+
 
 #define cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db)
 
@@ -143,6 +153,8 @@ void kr_cache_close(struct kr_cache *cache)
                cache_op(cache, close);
                cache->db = NULL;
        }
+       free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove);
+       kr_cache_emergency_file_to_remove = NULL;
 }
 
 int kr_cache_sync(struct kr_cache *cache)
index 84d0f07dbe0a0cbc5656f16f42ee09e59b590a84..61bf796a94d8467140f1bc99a9ecd0bf4481a11b 100644 (file)
@@ -65,6 +65,12 @@ struct kr_cache
 KR_EXPORT
 int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm);
 
+/**
+ * Path to cache file to remove on critical out-of-space error. (do NOT modify it)
+ *
+ * NULL when no path is recorded.  kr_cache_open() stores a heap-allocated
+ * "<opts->path>/data.mdb" string here; kr_cache_close() frees it and resets
+ * the pointer to NULL.
+ */
+KR_EXPORT extern
+const char *kr_cache_emergency_file_to_remove;
+
 /**
  * Close persistent cache.
  * @note This doesn't clear the data, just closes the connection to the database.