]> git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
daemon/defer: add coredump-period docs-develop-defe-yjbb02/deployments/6342
authorLukáš Ondráček <lukas.ondracek@nic.cz>
Wed, 5 Mar 2025 13:59:43 +0000 (14:59 +0100)
committerLukáš Ondráček <lukas.ondracek@nic.cz>
Wed, 5 Mar 2025 14:54:23 +0000 (15:54 +0100)
daemon/defer.c
daemon/defer.h
daemon/lua/kres-gen-33.lua
daemon/main.c
daemon/ratelimiting.c
doc/_static/config.schema.json
doc/user/config-defer.rst
lib/utils.h
python/knot_resolver/datamodel/defer_schema.py
python/knot_resolver/datamodel/templates/defer.lua.j2

index 941a01c3b97201f00b0df472bdd53f4c5b90c8a2..d10728512d037bd6e29711cc2ea4a3b8f468de95 100644 (file)
@@ -77,9 +77,11 @@ struct defer {
        kru_price_t max_decay;
        uint32_t log_period;
        uint32_t hard_timeout;
+       uint32_t coredump_period;
        int cpus;
        bool using_avx2;
        _Atomic uint32_t log_time;
+       _Atomic uint32_t coredump_time;
        _Alignas(64) uint8_t kru[];
 };
 struct defer *defer = NULL;
@@ -459,17 +461,9 @@ static inline void process_single_deferred(void)
                VERBOSE_LOG("    BREAK (timeout)\n");
 
                // notice logging according to log-period
-               const uint32_t time_now = kr_now();
-               uint32_t log_time_orig = atomic_load_explicit(&defer->log_time, memory_order_relaxed);
-               if (defer->log_period) {
-                       while (time_now - log_time_orig + 1024 >= defer->log_period + 1024) {
-                               if (atomic_compare_exchange_weak_explicit(&defer->log_time, &log_time_orig, time_now,
-                                               memory_order_relaxed, memory_order_relaxed)) {
-                                       kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n",
-                                                       kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0);
-                                       break;
-                               }
-                       }
+               if (kr_log_period(defer->log_period, &defer->log_time)) {
+                       kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n",
+                                       kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0);
                }
 
                break_query(ctx, ETIME);
@@ -678,16 +672,22 @@ static void defer_alarm(int signum)
 
        if (rest_to_timeout_ms <= 0) {
                defer_charge(elapsed, &defer_sample_state.addr, defer_sample_state.stream);
+               bool coredump = kr_log_period(defer->coredump_period, &defer->coredump_time);
                SIGSAFE_LOG(KR_STRADDR_MAXLEN + 100,
-                       "Host %r used %f s of cpu time continuously, interrupting kresd.\n",
-                       &defer_sample_state.addr.ip, elapsed / 1000000000.0);
-               abort();
+                       "Host %r used %f s of cpu time continuously, interrupting kresd (%s).\n",
+                       &defer_sample_state.addr.ip, elapsed / 1000000000.0,
+                       coredump ? "abort" : "exit");
+               if (coredump) {
+                       abort();
+               } else {
+                       _exit(EXIT_FAILURE);
+               }
        }
        alarm((rest_to_timeout_ms + 999) / 1000);
 }
 
 /// Initialize shared memory, queues. To be called from Lua.
-int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus)
+int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus)
        // TODO possibly remove cpus; not needed
 {
        defer_initialized = true;
@@ -707,6 +707,7 @@ int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout
                .max_decay = MAX_DECAY,
                .log_period = log_period,
                .hard_timeout = hard_timeout,
+               .coredump_period = coredump_period,
                .cpus = cpus,
                .using_avx2 = using_avx2(),
        };
@@ -722,6 +723,7 @@ int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout
                        sizeof(header.max_decay) +
                        sizeof(header.log_period) +
                        sizeof(header.hard_timeout) +
+                       sizeof(header.coredump_period) +
                        sizeof(header.cpus),
                "detected padding with undefined data inside mmapped header");
 
@@ -738,7 +740,8 @@ int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout
                        goto fail;
                }
 
-               defer->log_time = kr_now() - log_period;
+               defer->log_time = kr_log_period_init(log_period);
+               defer->coredump_time = kr_log_period_init(coredump_period);
 
                ret = mmapped_init_continue(&defer_mmapped);
                if (ret != 0) goto fail;
index e09b87fdf7d96018e99df98916a81f1c1075f7b5..57ad13d3922b73c512a5666e32cf53250946e41f 100644 (file)
@@ -9,7 +9,7 @@
 
 /// Initialize defer, incl. shared memory with KRU, excl. idle.
 KR_EXPORT
-int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus);
+int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus);
 
 /// Initialize idle and SIGALRM handler.
 int defer_init_idle(uv_loop_t *loop);
index f779002798fa985e5b316fa5ef6a4aba8d95fb61..9e602472668d6a7886d2f32833b8853ee34a84a5 100644 (file)
@@ -620,7 +620,7 @@ struct qr_task *worker_resolve_start(knot_pkt_t *, struct kr_qflags);
 int zi_zone_import(const zi_config_t);
 _Bool ratelimiting_request_begin(struct kr_request *);
 int ratelimiting_init(const char *, size_t, uint32_t, uint32_t, uint16_t, uint32_t, _Bool);
-int defer_init(const char *, uint32_t, uint32_t, int);
+int defer_init(const char *, uint32_t, uint32_t, uint32_t, int);
 void defer_set_price_factor16(struct kr_request *, uint32_t);
 struct engine {
        char _stub[];
index 5f0717d112a68eb37ab82221b5d5a697b2fe23ef..773bbbde5e666abd120d126749bb750bfd373034 100644 (file)
@@ -618,7 +618,7 @@ int main(int argc, char **argv)
 
        if (!defer_initialized) {
                kr_log_warning(SYSTEM, "Prioritization not initialized from Lua, using hardcoded default.\n");
-               ret = defer_init("defer", 1, 0, 1);
+               ret = defer_init("defer", 1, 0, 0, 1);
                if (ret) {
                        ret = EXIT_FAILURE;
                        goto cleanup;
index d182658e49b5aa090ea49a8dac23440eb6605036..f42a7469210d8bf542dc332b37db88e76015d6fd 100644 (file)
@@ -90,7 +90,7 @@ int ratelimiting_init(const char *mmap_file, size_t capacity, uint32_t instant_l
                        goto fail;
                }
 
-               ratelimiting->log_time = kr_now() - log_period;
+               ratelimiting->log_time = kr_log_period_init(log_period);
 
                for (size_t i = 0; i < V4_PREFIXES_CNT; i++) {
                        ratelimiting->v4_prices[i] = base_price / V4_RATE_MULT[i];
@@ -175,18 +175,11 @@ bool ratelimiting_request_begin(struct kr_request *req)
                        ((ratelimiting->slip == 1) ? true : false);
 
        // logging
-       uint32_t log_time_orig = atomic_load_explicit(&ratelimiting->log_time, memory_order_relaxed);
-       if (ratelimiting->log_period) {
-               while (time_now - log_time_orig + 1024 >= ratelimiting->log_period + 1024) {
-                       if (atomic_compare_exchange_weak_explicit(&ratelimiting->log_time, &log_time_orig, time_now,
-                                       memory_order_relaxed, memory_order_relaxed)) {
-                               kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n",
-                                               kr_straddr(req->qsource.addr), limited_prefix,
-                                               ratelimiting->dry_run ? "dry-run, " : "",
-                                               tc ? "truncated" : "dropped");
-                               break;
-                       }
-               }
+       if (kr_log_period(ratelimiting->log_period, &ratelimiting->log_time)) {
+               kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n",
+                               kr_straddr(req->qsource.addr), limited_prefix,
+                               ratelimiting->dry_run ? "dry-run, " : "",
+                               tc ? "truncated" : "dropped");
        }
 
        req->ratelimited = true; // we set this even on dry_run
index c54d5e085f9e141d39ce532aefcc6ae905539deb..a22a92832ac0bcdb22da943f9be03d1567e743ce 100644 (file)
                     "pattern": "^(\\d+)(us|ms|s|m|h|d)$",
                     "description": "If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.",
                     "default": "0s"
+                },
+                "coredump-period": {
+                    "type": "string",
+                    "pattern": "^(\\d+)(us|ms|s|m|h|d)$",
+                    "description": "Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them.",
+                    "default": "10m"
                 }
             },
             "default": {
                 "enabled": false,
                 "log_period": "0s",
-                "hard_timeout": "0s"
+                "hard_timeout": "0s",
+                "coredump_period": "10m"
             }
         },
         "lua": {
index 2baafa958a283cfe6cc9d5b667d9f09384b12769..41bb4e373798f596b8f38e7743a73555ea1ab623 100644 (file)
@@ -67,8 +67,27 @@ The limits can be adjusted for different packet origins using :option:`price-fac
     it is crucial to use :ref:`multiple workers <config-multiple-workers>`
     as those data are shared between them and disappear with the last one.
 
-    A continuous work on a single request usually takes under 1 ms. (TODO check)
-    Set the timeout at least to several seconds to avoid random crashes. (TODO or more?)
+    Continuous work on a single request usually takes under 1 ms.
+    Set the timeout to 1s or higher to avoid random crashes.
+
+.. option:: defer/coredump-period: <time ms|s|m|h|d>
+
+    :default: 10m
+
+    Minimal time between two coredumps caused by :option:`hard-timeout <defer/hard-timeout: <time ms|s|m|h|d>`,
+    or ``0s`` to disable them.
+
+    If kresd is to be terminated due to :option:`hard-timeout <defer/hard-timeout: <time ms|s|m|h|d>`,
+    it calls ``abort``, which might cause a coredump to be generated, and disables this behaviour
+    for the duration of :option:`coredump-period <defer/coredump-period: <time ms|s|m|h|d>`.
+    Subsequent terminations within that period call just ``_exit``, so that kresd is terminated without a coredump.
+
+    The timestamp of the last abort is stored along with other defer data
+    in the memory shared between workers, which disappears with the last one;
+    it is thus necessary to use :ref:`multiple workers <config-multiple-workers>`
+    to keep the data alive during a restart.
+    Otherwise, :option:`coredump-period <defer/coredump-period: <time ms|s|m|h|d>` has no effect
+    and coredumps are always enabled.
 
 
 Implementation details
index b6b350c8d8b3724411dbc361a2b9d83a749870e3..d7596a67ba2c223451420029b3df84de1ca62ffa 100644 (file)
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <dirent.h>
+#include <stdatomic.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
@@ -628,3 +629,22 @@ static inline const  knot_dname_t * knot_dname_next_label(const knot_dname_t *dn
 }
 #endif
 
+/* Determine whether to perform an action (e.g. logging) limited to once per `period` ms: returns true only if about `period` ms passed since *last_time, which is then set to the current time; always false when period is 0. */
+static inline bool kr_log_period(uint32_t period, _Atomic uint32_t *last_time) {
+       const uint32_t time_now = kr_now(); // 32 bits are sufficient here
+       uint32_t last_time_orig = atomic_load_explicit(last_time, memory_order_relaxed);
+       if (period) {
+               while (time_now - last_time_orig + 1024 >= period + 1024) { // unsigned (modular) comparison; NOTE(review): the +1024 on both sides presumably tolerates time_now being slightly behind *last_time -- confirm
+                       if (atomic_compare_exchange_weak_explicit(last_time, &last_time_orig, time_now,
+                                       memory_order_relaxed, memory_order_relaxed)) { // on CAS failure last_time_orig receives the current value and the loop condition is re-evaluated
+                               return true; // this caller stored time_now into *last_time and may perform the action
+                       }
+               }
+       }
+       return false;
+}
+
+/* Initialize last_time for kr_log_period: returns the current time minus `period`, i.e. a timestamp one full period in the past. */
+static inline uint32_t kr_log_period_init(uint32_t period) {
+       return kr_now() - period; // unsigned subtraction; wraps modulo 2^32 if period exceeds the current time value
+}
index 8ec7cf8080a4de37c06653c3937ec19c9e5ff1dd..3154e1044ed2bca61f74652290bf238de6300231 100644 (file)
@@ -10,8 +10,10 @@ class DeferSchema(ConfigSchema):
     enabled: Use request prioritization.
     log_period: Minimal time between two log messages, or '0s' to disable.
     hard_timeout: If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.
+    coredump_period: Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them.
     """
 
     enabled: bool = False
     log_period: TimeUnit = TimeUnit("0s")
     hard_timeout: TimeUnit = TimeUnit("0s")
+    coredump_period: TimeUnit = TimeUnit("10m")
index 18549aaf90b92ddb9dfdbb57a90008de6b018e3d..e43a12fb0b58965debdddc4c11f10e02e37cf5d7 100644 (file)
@@ -5,7 +5,8 @@ assert(C.defer_init(
        '{{ cfg.rundir }}/defer',
        {{ cfg.defer.log_period.millis() }},
        {{ cfg.defer.hard_timeout.millis() }},
+       {{ cfg.defer.coredump_period.millis() }},
        {{ cfg.workers }}) == 0)
 {% else %}
-assert(C.defer_init(nil, 0, 0, 0) == 0)
+assert(C.defer_init(nil, 0, 0, 0, 0) == 0)
 {%- endif %}