From 148eb5875fb7e6c46c0a9eac486dcb7b3bca931d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 6 Nov 2024 11:21:45 +0100 Subject: [PATCH] DEBUG: wdt: better detect apparently locked up threads and warn about them In order to help users detect when threads are behaving abnormally, let's try to emit a warning when one is no longer making any progress. This will allow catching faulty situations more accurately, instead of occasionally triggering just after the long task. It will also let users know that there is something wrong with their configuration, and inspect the call trace to figure out whether they're using excessively long rules or Lua for example (the usual warnings about lua-load vs lua-load-per-thread are still reported). The warning will only be emitted for threads not yet marked as stuck so as not to interfere with panic dumps and avoid sending a warning just before a panic. However, a tainted flag (0x2000) is set when this happens. --- include/haproxy/bug.h | 1 + src/debug.c | 2 +- src/wdt.c | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/haproxy/bug.h b/include/haproxy/bug.h index 556d8167ee..e708bcedf9 100644 --- a/include/haproxy/bug.h +++ b/include/haproxy/bug.h @@ -421,6 +421,7 @@ enum tainted_flags { TAINTED_LUA_STUCK = 0x00000400, /* stuck in a Lua context */ TAINTED_LUA_STUCK_SHARED = 0x00000800, /* stuck in a shared Lua context */ TAINTED_MEM_TRIMMING_STUCK = 0x00001000, /* stuck while trimming memory */ + TAINTED_WARN_BLOCKED_TRAFFIC = 0x00002000, /* emitted a warning about blocked traffic */ }; /* this is a bit field made of TAINTED_*, and is declared in haproxy.c */ diff --git a/src/debug.c b/src/debug.c index 4d79ef17c0..3feff5f098 100644 --- a/src/debug.c +++ b/src/debug.c @@ -738,7 +738,7 @@ void ha_stuck_warning(int thr) struct buffer buf; ullong n, p; - if (get_tainted() & TAINTED_PANIC) { + if (mark_tainted(TAINTED_WARN_BLOCKED_TRAFFIC) & TAINTED_PANIC) { /* a panic dump is already in 
progress, let's not disturb it, * we'll be called via signal DEBUGSIG. By returning we may be * able to leave a current signal handler (e.g. WDT) so that diff --git a/src/wdt.c b/src/wdt.c index fd07d7fb78..a28923b333 100644 --- a/src/wdt.c +++ b/src/wdt.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -38,6 +39,7 @@ */ static struct { timer_t timer; + uint prev_ctxsw; } per_thread_wd_ctx[MAX_THREADS]; /* Setup (or ping) the watchdog timer for thread . Returns non-zero on @@ -106,10 +108,18 @@ void wdt_handler(int sig, siginfo_t *si, void *arg) * scheduler is still alive by setting the TH_FL_STUCK flag * that the scheduler clears when switching to the next task. * If it's already set, then it's our second call with no - * progress and the thread is dead. + * progress and the thread is dead. However, if we figure + * that the scheduler made no progress since last time, we'll + * at least emit a warning. */ if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) { + uint prev_ctxsw; + _HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK); + prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[tid].prev_ctxsw); + if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw) + ha_stuck_warning(thr); + HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw); goto update_and_leave; } -- 2.39.5