From: Lukáš Ondráček Date: Wed, 5 Mar 2025 13:59:43 +0000 (+0100) Subject: daemon/defer: add coredump-period X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d831b7e756afa1e7c2e4e90b127e7d075bb7e5ad;p=thirdparty%2Fknot-resolver.git daemon/defer: add coredump-period --- diff --git a/daemon/defer.c b/daemon/defer.c index 941a01c3b..d10728512 100644 --- a/daemon/defer.c +++ b/daemon/defer.c @@ -77,9 +77,11 @@ struct defer { kru_price_t max_decay; uint32_t log_period; uint32_t hard_timeout; + uint32_t coredump_period; int cpus; bool using_avx2; _Atomic uint32_t log_time; + _Atomic uint32_t coredump_time; _Alignas(64) uint8_t kru[]; }; struct defer *defer = NULL; @@ -459,17 +461,9 @@ static inline void process_single_deferred(void) VERBOSE_LOG(" BREAK (timeout)\n"); // notice logging according to log-period - const uint32_t time_now = kr_now(); - uint32_t log_time_orig = atomic_load_explicit(&defer->log_time, memory_order_relaxed); - if (defer->log_period) { - while (time_now - log_time_orig + 1024 >= defer->log_period + 1024) { - if (atomic_compare_exchange_weak_explicit(&defer->log_time, &log_time_orig, time_now, - memory_order_relaxed, memory_order_relaxed)) { - kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n", - kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0); - break; - } - } + if (kr_log_period(defer->log_period, &defer->log_time)) { + kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n", + kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0); } break_query(ctx, ETIME); @@ -678,16 +672,22 @@ static void defer_alarm(int signum) if (rest_to_timeout_ms <= 0) { defer_charge(elapsed, &defer_sample_state.addr, defer_sample_state.stream); + bool coredump = kr_log_period(defer->coredump_period, &defer->coredump_time); SIGSAFE_LOG(KR_STRADDR_MAXLEN + 100, - "Host %r used %f s of cpu time continuously, interrupting kresd.\n", - &defer_sample_state.addr.ip, elapsed / 1000000000.0); - abort(); + "Host %r used %f s of cpu time continuously, interrupting kresd (%s).\n", + &defer_sample_state.addr.ip, elapsed / 1000000000.0, + coredump ? "abort" : "exit"); + if (coredump) { + abort(); + } else { + _exit(EXIT_FAILURE); + } } alarm((rest_to_timeout_ms + 999) / 1000); } /// Initialize shared memory, queues. To be called from Lua. -int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus) +int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus) // TODO possibly remove cpus; not needed { defer_initialized = true; @@ -707,6 +707,7 @@ int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout .max_decay = MAX_DECAY, .log_period = log_period, .hard_timeout = hard_timeout, + .coredump_period = coredump_period, .cpus = cpus, .using_avx2 = using_avx2(), }; @@ -722,6 +723,7 @@ int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout sizeof(header.max_decay) + sizeof(header.log_period) + sizeof(header.hard_timeout) + + sizeof(header.coredump_period) + sizeof(header.cpus), "detected padding with undefined data inside mmapped header"); @@ -738,7 +740,8 @@ int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout goto fail; } - defer->log_time = kr_now() - log_period; + defer->log_time = kr_log_period_init(log_period); + defer->coredump_time = kr_log_period_init(coredump_period); ret = mmapped_init_continue(&defer_mmapped); if (ret != 0) goto fail; diff --git a/daemon/defer.h b/daemon/defer.h index e09b87fdf..57ad13d39 100644 --- a/daemon/defer.h +++ b/daemon/defer.h @@ -9,7 +9,7 @@ /// Initialize defer, incl. shared memory with KRU, excl. idle. KR_EXPORT -int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus); +int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus); /// Initialize idle and SIGALRM handler. int defer_init_idle(uv_loop_t *loop); diff --git a/daemon/lua/kres-gen-33.lua b/daemon/lua/kres-gen-33.lua index f77900279..9e6024726 100644 --- a/daemon/lua/kres-gen-33.lua +++ b/daemon/lua/kres-gen-33.lua @@ -620,7 +620,7 @@ struct qr_task *worker_resolve_start(knot_pkt_t *, struct kr_qflags); int zi_zone_import(const zi_config_t); _Bool ratelimiting_request_begin(struct kr_request *); int ratelimiting_init(const char *, size_t, uint32_t, uint32_t, uint16_t, uint32_t, _Bool); -int defer_init(const char *, uint32_t, uint32_t, int); +int defer_init(const char *, uint32_t, uint32_t, uint32_t, int); void defer_set_price_factor16(struct kr_request *, uint32_t); struct engine { char _stub[]; diff --git a/daemon/main.c b/daemon/main.c index 5f0717d11..773bbbde5 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -618,7 +618,7 @@ int main(int argc, char **argv) if (!defer_initialized) { kr_log_warning(SYSTEM, "Prioritization not initialized from Lua, using hardcoded default.\n"); - ret = defer_init("defer", 1, 0, 1); + ret = defer_init("defer", 1, 0, 0, 1); if (ret) { ret = EXIT_FAILURE; goto cleanup; diff --git a/daemon/ratelimiting.c b/daemon/ratelimiting.c index d182658e4..f42a74692 100644 --- a/daemon/ratelimiting.c +++ b/daemon/ratelimiting.c @@ -90,7 +90,7 @@ int ratelimiting_init(const char *mmap_file, size_t capacity, uint32_t instant_l goto fail; } - ratelimiting->log_time = kr_now() - log_period; + ratelimiting->log_time = kr_log_period_init(log_period); for (size_t i = 0; i < V4_PREFIXES_CNT; i++) { ratelimiting->v4_prices[i] = base_price / V4_RATE_MULT[i]; @@ -175,18 +175,11 @@ bool ratelimiting_request_begin(struct kr_request *req) ((ratelimiting->slip == 1) ? true : false); // logging - uint32_t log_time_orig = atomic_load_explicit(&ratelimiting->log_time, memory_order_relaxed); - if (ratelimiting->log_period) { - while (time_now - log_time_orig + 1024 >= ratelimiting->log_period + 1024) { - if (atomic_compare_exchange_weak_explicit(&ratelimiting->log_time, &log_time_orig, time_now, - memory_order_relaxed, memory_order_relaxed)) { - kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n", - kr_straddr(req->qsource.addr), limited_prefix, - ratelimiting->dry_run ? "dry-run, " : "", - tc ? "truncated" : "dropped"); - break; - } - } + if (kr_log_period(ratelimiting->log_period, &ratelimiting->log_time)) { + kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n", + kr_straddr(req->qsource.addr), limited_prefix, + ratelimiting->dry_run ? "dry-run, " : "", + tc ? "truncated" : "dropped"); } req->ratelimited = true; // we set this even on dry_run diff --git a/doc/_static/config.schema.json b/doc/_static/config.schema.json index c54d5e085..a22a92832 100644 --- a/doc/_static/config.schema.json +++ b/doc/_static/config.schema.json @@ -1757,12 +1757,19 @@ "pattern": "^(\\d+)(us|ms|s|m|h|d)$", "description": "If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.", "default": "0s" + }, + "coredump-period": { + "type": "string", + "pattern": "^(\\d+)(us|ms|s|m|h|d)$", + "description": "Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them.", + "default": "10m" } }, "default": { "enabled": false, "log_period": "0s", - "hard_timeout": "0s" + "hard_timeout": "0s", + "coredump_period": "10m" } }, "lua": { diff --git a/doc/user/config-defer.rst b/doc/user/config-defer.rst index 2baafa958..41bb4e373 100644 --- a/doc/user/config-defer.rst +++ b/doc/user/config-defer.rst @@ -67,8 +67,27 @@ The limits can be adjusted for different packet origins using :option:`price-fac it is crucial to use :ref:`multiple workers ` as those data are shared between them and disappear with the last one. - A continuous work on a single request usually takes under 1 ms. (TODO check) - Set the timeout at least to several seconds to avoid random crashes. (TODO or more?) + A continuous work on a single request usually takes under 1 ms. + Set the timeout to 1s or higher values to avoid random crashes. + +.. option:: defer/coredump-period: