kru_price_t max_decay;
uint32_t log_period;
uint32_t hard_timeout;
+ uint32_t coredump_period;
int cpus;
bool using_avx2;
_Atomic uint32_t log_time;
+ _Atomic uint32_t coredump_time;
_Alignas(64) uint8_t kru[];
};
struct defer *defer = NULL;
VERBOSE_LOG(" BREAK (timeout)\n");
// notice logging according to log-period
- const uint32_t time_now = kr_now();
- uint32_t log_time_orig = atomic_load_explicit(&defer->log_time, memory_order_relaxed);
- if (defer->log_period) {
- while (time_now - log_time_orig + 1024 >= defer->log_period + 1024) {
- if (atomic_compare_exchange_weak_explicit(&defer->log_time, &log_time_orig, time_now,
- memory_order_relaxed, memory_order_relaxed)) {
- kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n",
- kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0);
- break;
- }
- }
+ if (kr_log_period(defer->log_period, &defer->log_time)) {
+ kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n",
+ kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0);
}
break_query(ctx, ETIME);
if (rest_to_timeout_ms <= 0) {
defer_charge(elapsed, &defer_sample_state.addr, defer_sample_state.stream);
+ bool coredump = kr_log_period(defer->coredump_period, &defer->coredump_time);
SIGSAFE_LOG(KR_STRADDR_MAXLEN + 100,
- "Host %r used %f s of cpu time continuously, interrupting kresd.\n",
- &defer_sample_state.addr.ip, elapsed / 1000000000.0);
- abort();
+ "Host %r used %f s of cpu time continuously, interrupting kresd (%s).\n",
+ &defer_sample_state.addr.ip, elapsed / 1000000000.0,
+ coredump ? "abort" : "exit");
+ if (coredump) {
+ abort();
+ } else {
+ _exit(EXIT_FAILURE);
+ }
}
alarm((rest_to_timeout_ms + 999) / 1000);
}
/// Initialize shared memory, queues. To be called from Lua.
-int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus)
+int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus)
// TODO possibly remove cpus; not needed
{
defer_initialized = true;
.max_decay = MAX_DECAY,
.log_period = log_period,
.hard_timeout = hard_timeout,
+ .coredump_period = coredump_period,
.cpus = cpus,
.using_avx2 = using_avx2(),
};
sizeof(header.max_decay) +
sizeof(header.log_period) +
sizeof(header.hard_timeout) +
+ sizeof(header.coredump_period) +
sizeof(header.cpus),
"detected padding with undefined data inside mmapped header");
goto fail;
}
- defer->log_time = kr_now() - log_period;
+ defer->log_time = kr_log_period_init(log_period);
+ defer->coredump_time = kr_log_period_init(coredump_period);
ret = mmapped_init_continue(&defer_mmapped);
if (ret != 0) goto fail;
/// Initialize defer, incl. shared memory with KRU, excl. idle.
KR_EXPORT
-int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus);
+int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus);
/// Initialize idle and SIGALRM handler.
int defer_init_idle(uv_loop_t *loop);
int zi_zone_import(const zi_config_t);
_Bool ratelimiting_request_begin(struct kr_request *);
int ratelimiting_init(const char *, size_t, uint32_t, uint32_t, uint16_t, uint32_t, _Bool);
-int defer_init(const char *, uint32_t, uint32_t, int);
+int defer_init(const char *, uint32_t, uint32_t, uint32_t, int);
void defer_set_price_factor16(struct kr_request *, uint32_t);
struct engine {
char _stub[];
if (!defer_initialized) {
kr_log_warning(SYSTEM, "Prioritization not initialized from Lua, using hardcoded default.\n");
- ret = defer_init("defer", 1, 0, 1);
+ ret = defer_init("defer", 1, 0, 0, 1);
if (ret) {
ret = EXIT_FAILURE;
goto cleanup;
goto fail;
}
- ratelimiting->log_time = kr_now() - log_period;
+ ratelimiting->log_time = kr_log_period_init(log_period);
for (size_t i = 0; i < V4_PREFIXES_CNT; i++) {
ratelimiting->v4_prices[i] = base_price / V4_RATE_MULT[i];
((ratelimiting->slip == 1) ? true : false);
// logging
- uint32_t log_time_orig = atomic_load_explicit(&ratelimiting->log_time, memory_order_relaxed);
- if (ratelimiting->log_period) {
- while (time_now - log_time_orig + 1024 >= ratelimiting->log_period + 1024) {
- if (atomic_compare_exchange_weak_explicit(&ratelimiting->log_time, &log_time_orig, time_now,
- memory_order_relaxed, memory_order_relaxed)) {
- kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n",
- kr_straddr(req->qsource.addr), limited_prefix,
- ratelimiting->dry_run ? "dry-run, " : "",
- tc ? "truncated" : "dropped");
- break;
- }
- }
+ if (kr_log_period(ratelimiting->log_period, &ratelimiting->log_time)) {
+ kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n",
+ kr_straddr(req->qsource.addr), limited_prefix,
+ ratelimiting->dry_run ? "dry-run, " : "",
+ tc ? "truncated" : "dropped");
}
req->ratelimited = true; // we set this even on dry_run
"pattern": "^(\\d+)(us|ms|s|m|h|d)$",
"description": "If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.",
"default": "0s"
+ },
+ "coredump-period": {
+ "type": "string",
+ "pattern": "^(\\d+)(us|ms|s|m|h|d)$",
+ "description": "Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them.",
+ "default": "10m"
}
},
"default": {
"enabled": false,
"log_period": "0s",
- "hard_timeout": "0s"
+ "hard_timeout": "0s",
+ "coredump_period": "10m"
}
},
"lua": {
it is crucial to use :ref:`multiple workers <config-multiple-workers>`
as those data are shared between them and disappear with the last one.
- A continuous work on a single request usually takes under 1 ms. (TODO check)
- Set the timeout at least to several seconds to avoid random crashes. (TODO or more?)
+ Continuous work on a single request usually takes under 1 ms.
+ Set the timeout to 1 s or more to avoid random crashes.
+
+.. option:: defer/coredump-period: <time ms|s|m|h|d>
+
+   :default: 10m
+
+ Minimal time between two coredumps caused by :option:`hard-timeout <defer/hard-timeout: <time ms|s|m|h|d>>`,
+ or ``0s`` to disable them.
+
+ If kresd is to be terminated due to :option:`hard-timeout <defer/hard-timeout: <time ms|s|m|h|d>>`,
+ it calls ``abort``, which may generate a coredump, and this behaviour is then disabled
+ for the duration of :option:`coredump-period <defer/coredump-period: <time ms|s|m|h|d>>`.
+ Subsequent terminations within that period call just ``_exit``, so kresd terminates without a coredump.
+
+ The timestamp of the last ``abort`` is stored along with other defer data
+ in the memory shared between workers, which disappears with the last one;
+ :ref:`multiple workers <config-multiple-workers>` are thus needed
+ to keep the data alive across restarts.
+ Otherwise, :option:`coredump-period <defer/coredump-period: <time ms|s|m|h|d>>` has no effect
+ and coredumps are always enabled.
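
A minimal configuration sketch for illustration; the YAML key names are assumed to follow the kebab-case properties in the JSON schema above, and the values are examples only:

.. code-block:: yaml

   defer:
     enabled: true
     hard-timeout: 2s
     coredump-period: 10m
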
Implementation details
#pragma once
#include <dirent.h>
+#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
}
#endif
+/* Determine whether to perform an action (e.g. logging) that is limited to once
+ * per `period` ms: returns true if at least `period` ms have elapsed since *last_time
+ * and this caller won the race to update *last_time; a zero period disables the action. */
+static inline bool kr_log_period(uint32_t period, _Atomic uint32_t *last_time) {
+	const uint32_t time_now = kr_now(); // 32 bits are sufficient here
+	uint32_t last_time_orig = atomic_load_explicit(last_time, memory_order_relaxed);
+	if (period) {
+		// The +1024 on both sides makes the unsigned comparison treat a *last_time
+		// slightly ahead of time_now (e.g. just written by another worker) as
+		// "not elapsed yet" instead of as a huge wrapped-around difference.
+		while (time_now - last_time_orig + 1024 >= period + 1024) {
+ if (atomic_compare_exchange_weak_explicit(last_time, &last_time_orig, time_now,
+ memory_order_relaxed, memory_order_relaxed)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/* Initialize last_time for kr_log_period. */
+static inline uint32_t kr_log_period_init(uint32_t period) {
+ return kr_now() - period;
+}
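
A minimal usage sketch of these helpers; the module state, period value, and message below are hypothetical, while kr_log_notice() and the SYSTEM log group are used as in the hunks above:

static _Atomic uint32_t overload_log_time;           // shared "last logged" timestamp
static const uint32_t overload_log_period = 1000;    // at most one notice per second

static void overload_log_init(void)
{
	// Pre-date the timestamp so the very first event is logged immediately.
	overload_log_time = kr_log_period_init(overload_log_period);
}

static void overload_notice(size_t queued)
{
	// Only one caller per period gets true here; the rest stay silent.
	if (kr_log_period(overload_log_period, &overload_log_time))
		kr_log_notice(SYSTEM, "queue overloaded (%zu requests queued)\n", queued);
}
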
enabled: Use request prioritization.
log_period: Minimal time between two log messages, or '0s' to disable.
hard_timeout: If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.
+ coredump_period: Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them.
"""
enabled: bool = False
log_period: TimeUnit = TimeUnit("0s")
hard_timeout: TimeUnit = TimeUnit("0s")
+ coredump_period: TimeUnit = TimeUnit("10m")
'{{ cfg.rundir }}/defer',
{{ cfg.defer.log_period.millis() }},
{{ cfg.defer.hard_timeout.millis() }},
+ {{ cfg.defer.coredump_period.millis() }},
{{ cfg.workers }}) == 0)
{% else %}
-assert(C.defer_init(nil, 0, 0, 0) == 0)
+assert(C.defer_init(nil, 0, 0, 0, 0) == 0)
{%- endif %}