From: Arran Cudbard-Bell Date: Mon, 27 Apr 2026 18:01:35 +0000 (-0400) Subject: log: bypass per-thread pool once shutdown has freed it X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f3d4c64b48227ee53ca4074f80fbc405537bfdeb;p=thirdparty%2Ffreeradius-server.git log: bypass per-thread pool once shutdown has freed it `fr_atexit_thread_trigger_all()` runs each registered thread destructor on the calling (main) thread, including `_fr_log_pool_free` for every worker that ever logged. That frees the underlying pool chunk but can't reset the `_Thread_local fr_log_pool` slot in any thread other than main, so threads spawned outside our schedule (librdkafka's bg threads, perl, etc.) keep a dangling pointer. The next log call from those threads (typically the "Terminating instance" debug line that librdkafka emits inside `rd_kafka_destroy()` during `mod_detach`) hands the dead pointer to `talloc_new` and aborts with "Bad talloc magic value". Add `fr_log_disable_pools()` and call it from radiusd right after the trigger. `fr_log_pool_init()` short-circuits to NULL once set, so the TLS read is skipped and downstream `talloc_new(NULL)` allocates a top-level chunk for the duration of the line. Relaxed atomic because the flag is a single-writer signal with no other state to synchronise through it. --- diff --git a/src/bin/radiusd.c b/src/bin/radiusd.c index 634094a10ca..4f4111fccfe 100644 --- a/src/bin/radiusd.c +++ b/src/bin/radiusd.c @@ -1122,6 +1122,18 @@ cleanup: */ (void) fr_schedule_destroy(&sc); + /* + * Threads not managed by the schedule (e.g. librdkafka's + * bg threads) hold a `_Thread_local fr_log_pool` we can't + * clear from this thread. `fr_atexit_thread_trigger_all` + * below frees the underlying chunks, leaving those slots + * dangling. Flip the logger into pool-less mode first so + * any log line that races the trigger - or fires after it - + * goes through `talloc_new(NULL)` instead of dereferencing + * a freed pool. + */ + fr_log_disable_pools(); + /* * Ensure all thread local memory is cleaned up * before we start cleaning up global resources. diff --git a/src/lib/util/log.c b/src/lib/util/log.c index 7e98206f6e8..5d77294f360 100644 --- a/src/lib/util/log.c +++ b/src/lib/util/log.c @@ -29,6 +29,7 @@ RCSID("$Id$") #include #include +#include #ifdef HAVE_FEATURES_H # include #endif @@ -41,6 +42,23 @@ int fr_debug_lvl = 0; static _Thread_local TALLOC_CTX *fr_log_pool; +/** Latched once shutdown has freed every thread's log pool + * + * `fr_atexit_thread_trigger_all()` runs every registered thread destructor + * on the calling (main) thread, so it frees the log pool memory for threads + * whose TLS slot it can't reach (librdkafka's bg threads, anything spawned + * by a third-party library that bypasses our schedule). Those threads + * still hold the now-dangling pointer in their `_Thread_local fr_log_pool`, + * and will hand it to `talloc_new` on the next log call - "Bad talloc magic + * value" abort. + * + * Once set, `fr_log_pool_init()` ignores the TLS slot entirely and returns + * NULL; downstream `talloc_new(NULL)` / `talloc_asprintf(NULL, ...)` calls + * just allocate top-level chunks for the duration of the log line. No + * pooling, no TLS, safe from any thread. + */ +static atomic_bool log_pools_disabled; + static uint32_t location_indent = 30; static fr_event_list_t *log_el; //!< Event loop we use for process logging data. @@ -303,17 +321,38 @@ static int _fr_log_pool_free(void *arg) return 0; } +/** Disable per-thread log pools for the rest of the process lifetime + * + * Call this from the main thread immediately after + * `fr_atexit_thread_trigger_all()`, which frees every other thread's log + * pool but can't reset their `_Thread_local` slot. After this returns, + * subsequent `fr_log` calls fall back to `talloc_new(NULL)` instead of + * touching the (now dangling) TLS pool pointer. + */ +void fr_log_disable_pools(void) +{ + atomic_store_explicit(&log_pools_disabled, true, memory_order_relaxed); +} + /** talloc ctx to use when composing log messages * * Functions must ensure that they allocate a new ctx from the one returned * here, and that this ctx is freed before the function returns. * - * @return talloc pool to use for scratch space. + * @return talloc pool to use for scratch space, or NULL if pools have been + * disabled - callers must tolerate a NULL return. */ TALLOC_CTX *fr_log_pool_init(void) { TALLOC_CTX *pool; + /* + * Post-trigger of every thread atexit handler the TLS slot + * may be a dangling pointer for any thread we don't own + * (librdkafka's bg threads etc.) - skip the pool entirely. + */ + if (unlikely(atomic_load_explicit(&log_pools_disabled, memory_order_relaxed))) return NULL; + pool = fr_log_pool; if (unlikely(!pool)) { if (fr_atexit_is_exiting()) return NULL; /* No new pools if we're exiting */ diff --git a/src/lib/util/log.h b/src/lib/util/log.h index 44b844242d8..da96f254584 100644 --- a/src/lib/util/log.h +++ b/src/lib/util/log.h @@ -196,6 +196,8 @@ int fr_log_close(fr_log_t *log) CC_HINT(nonnull); TALLOC_CTX *fr_log_pool_init(void); +void fr_log_disable_pools(void); + int fr_log_global_init(fr_event_list_t *el, bool daemonize) CC_HINT(nonnull); void fr_log_global_free(void);