From 0d90b4d29db3e9086e193001f8412163eb80bb70 Mon Sep 17 00:00:00 2001
From: Arran Cudbard-Bell
Date: Mon, 27 Apr 2026 14:01:35 -0400
Subject: [PATCH] atexit: skip TLS-cached pools once shutdown has freed them

`fr_atexit_thread_trigger_all()` runs each registered thread destructor
on the calling (main) thread, including ones that free `_Thread_local`
caches owned by threads we don't manage (librdkafka's bg threads, perl,
etc.).  We can free their pool chunks but can't reset another thread's
TLS slot, so the next call from those threads dereferences a dangling
pointer and aborts with "Bad talloc magic value" - first seen in
`_kafka_log_cb` from the "Terminating instance" debug line emitted
inside `rd_kafka_destroy()` during `mod_detach`.

Add `fr_atexit_thread_local_disable_alloc()` /
`fr_atexit_thread_local_alloc_disabled()` in atexit.c as the single
source of truth, called once from radiusd *before* the trigger.
TLS-pool initialisers consult it before reading their slot and fall
back to `talloc_*(NULL, ...)` when set:

- log.c:fr_log_pool_init returns NULL
- sbuff.c:sbuff_scratch_init returns NULL

Other TLS pools registered the same way (md4/md5/hmac_*, strerror,
talloc autofree) can opt in as crashes surface; the single flag means
the fix is one extra check at the top of each initialiser.
---
 src/lib/util/atexit.c | 51 +++++++++++++++++++++++++++++++++++++++++++
 src/lib/util/atexit.h |  4 ++++
 src/lib/util/log.c    |  8 +++----
 src/lib/util/sbuff.c  |  9 +++++++-
 4 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/src/lib/util/atexit.c b/src/lib/util/atexit.c
index 0d01316db71..47f0261a68d 100644
--- a/src/lib/util/atexit.c
+++ b/src/lib/util/atexit.c
@@ -30,6 +30,8 @@ RCSID("$Id$")
 #include
 #include
 
+#include <stdatomic.h>
+
 #ifdef HAVE_PTHREADS
 #endif
@@ -76,6 +78,24 @@ static fr_atexit_list_t *fr_atexit_global = NULL;
 static bool is_exiting;
 static _Thread_local bool thread_is_exiting;
 
+/** Latched once main has decided no more thread-local pools should be handed out
+ *
+ * `fr_atexit_thread_trigger_all()` runs every registered thread destructor on
+ * the calling (main) thread, including ones that free `_Thread_local` pools
+ * owned by threads we don't manage (librdkafka's bg threads, perl, etc.).  We
+ * can free the underlying chunks but can't reset another thread's TLS slot,
+ * so those threads keep a dangling pointer in their per-thread cache.  Code
+ * that lazily allocates a TLS-cached pool (log.c, sbuff.c, strerror.c, ...)
+ * checks this flag *before* reading its TLS slot and falls back to
+ * `talloc_*(NULL, ...)` (or returns NULL) when set, side-stepping the
+ * dangling pointer entirely.
+ *
+ * Set once, never cleared.  Relaxed atomic because this is a one-shot signal
+ * with no other state synchronised through it - readers tolerate observing
+ * the old value briefly.
+ */
+static atomic_bool thread_local_alloc_disabled;
+
 /** Call the exit handler
  *
  */
@@ -403,6 +423,27 @@ do_threads:
 }
 
+/** Disable lazy allocation of thread-local caches for the rest of the process
+ *
+ * Call this from main *before* `fr_atexit_thread_trigger_all()`.  After it
+ * returns, every TLS-pool initialiser that consults
+ * `fr_atexit_thread_local_alloc_disabled()` falls back to the un-cached path,
+ * so `_Thread_local` slots in other threads aren't read after the trigger
+ * frees their backing memory.
+ */
+void fr_atexit_thread_local_disable_alloc(void)
+{
+	atomic_store_explicit(&thread_local_alloc_disabled, true, memory_order_relaxed);
+}
+
+/** Has @ref fr_atexit_thread_local_disable_alloc been called yet
+ *
+ */
+bool fr_atexit_thread_local_alloc_disabled(void)
+{
+	return atomic_load_explicit(&thread_local_alloc_disabled, memory_order_relaxed);
+}
+
 /** Return whether we're currently in the teardown phase
  *
  * When this function returns true no more thread local or global
@@ -602,6 +643,16 @@ int fr_atexit_thread_trigger_all(void)
 	fr_atexit_list_t	*list;
 	unsigned int		count = 0;
 
+	/*
+	 *	Disable TLS-cached pool handouts before we start freeing
+	 *	any of them.  Threads we don't manage (librdkafka's bg
+	 *	threads, etc.) keep dangling pointers in their
+	 *	`_Thread_local` slots once the chunks here are reaped, so
+	 *	any TLS-pool initialiser that consults the flag falls back
+	 *	to `talloc_*(NULL, ...)` from now on.
+	 */
+	fr_atexit_thread_local_disable_alloc();
+
 	/*
 	 *	Iterate over the list of thread local
 	 *	destructor lists running the
diff --git a/src/lib/util/atexit.h b/src/lib/util/atexit.h
index 6bff98f54c5..0a43d8a321f 100644
--- a/src/lib/util/atexit.h
+++ b/src/lib/util/atexit.h
@@ -67,6 +67,10 @@
 int fr_atexit_trigger(bool uctx_scope, fr_atexit_t func, void const *uctx);
 
 bool fr_atexit_is_exiting(void);
 
+void fr_atexit_thread_local_disable_alloc(void);
+
+bool fr_atexit_thread_local_alloc_disabled(void);
+
 #ifdef HAVE_PTHREADS
 /*
  *	Because GCC only added support in 2013 *sigh*
diff --git a/src/lib/util/log.c b/src/lib/util/log.c
index 5d77294f360..d1a9cc9f883 100644
--- a/src/lib/util/log.c
+++ b/src/lib/util/log.c
@@ -347,11 +347,11 @@ TALLOC_CTX *fr_log_pool_init(void)
 	TALLOC_CTX	*pool;
 
 	/*
-	 *	Post-trigger of every thread atexit handler the TLS slot
-	 *	may be a dangling pointer for any thread we don't own
-	 *	(librdkafka's bg threads etc.) - skip the pool entirely.
+	 *	Once main has signalled shutdown the TLS slot may be a
+	 *	dangling pointer for any thread we don't own (librdkafka's
+	 *	bg threads etc.) - skip the pool entirely.
 	 */
-	if (unlikely(atomic_load_explicit(&log_pools_disabled, memory_order_relaxed))) return NULL;
+	if (unlikely(fr_atexit_thread_local_alloc_disabled())) return NULL;
 
 	pool = fr_log_pool;
 	if (unlikely(!pool)) {
diff --git a/src/lib/util/sbuff.c b/src/lib/util/sbuff.c
index 44331d59436..7f1132b4c65 100644
--- a/src/lib/util/sbuff.c
+++ b/src/lib/util/sbuff.c
@@ -1542,7 +1542,14 @@ static inline CC_HINT(always_inline) int sbuff_scratch_init(TALLOC_CTX **out)
 {
 	TALLOC_CTX	*scratch;
 
-	if (sbuff_scratch_freed) {
+	/*
+	 *	Once main has signalled shutdown the TLS slot may be a
+	 *	dangling pointer on threads we don't own; skip the scratch
+	 *	cache and let callers allocate at top level instead.  The
+	 *	TLS-local `sbuff_scratch_freed` is left in place for the
+	 *	per-thread teardown path on FR-managed threads.
+	 */
+	if (sbuff_scratch_freed || fr_atexit_thread_local_alloc_disabled()) {
 		*out = NULL;
 		return 0;
 	}
-- 
2.47.3
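
P.S. For the other TLS pools called out above (md4/md5/hmac_*, strerror,
talloc autofree), opting in later is the same one extra check.  A minimal
sketch of the pattern, assuming the fr_atexit_thread_local() registration
macro and the int-returning destructor convention the existing initialisers
use - my_pool, _my_pool_free() and the pool size are hypothetical, only
fr_atexit_thread_local_alloc_disabled() comes from this patch:

#include <talloc.h>
#include <freeradius-devel/util/atexit.h>

static _Thread_local TALLOC_CTX *my_pool;	/* hypothetical per-thread cache */

/** Free the cached pool and clear this thread's slot (per-thread teardown) */
static int _my_pool_free(void *arg)
{
	my_pool = NULL;
	return talloc_free(arg);
}

/** Return this thread's cached pool, or NULL once shutdown is signalled */
static TALLOC_CTX *my_pool_init(void)
{
	TALLOC_CTX *pool;

	/*
	 *	The one extra check: once main has latched the flag the
	 *	TLS slot may be dangling on threads we don't own, so don't
	 *	read it - callers fall back to talloc_*(NULL, ...) instead.
	 */
	if (unlikely(fr_atexit_thread_local_alloc_disabled())) return NULL;

	pool = my_pool;
	if (unlikely(!pool)) {
		pool = talloc_pool(NULL, 4096);	/* size is illustrative only */
		if (unlikely(!pool)) return NULL;

		fr_atexit_thread_local(my_pool, _my_pool_free, pool);
	}

	return pool;
}

Callers then treat a NULL return the way log.c/sbuff.c callers already do:
allocate against the NULL (top-level) talloc context and free when done.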