BUG/MEDIUM: wdt: improve stuck task detection accuracy

author Willy Tarreau <w@1wt.eu>

Wed, 1 Oct 2025 06:28:54 +0000 (08:28 +0200)

committer Willy Tarreau <w@1wt.eu>

Wed, 1 Oct 2025 08:18:53 +0000 (10:18 +0200)
author Willy Tarreau <w@1wt.eu>
Wed, 1 Oct 2025 06:28:54 +0000 (08:28 +0200)
committer Willy Tarreau <w@1wt.eu>
Wed, 1 Oct 2025 08:18:53 +0000 (10:18 +0200)
diff --git a/src/wdt.c b/src/wdt.c

index 4290d6115966c6e4984c6198e9bae207c13520de..a18745f0ff0a4c136004d0023f2c207414e91318 100644 (file)
--- a/src/wdt.c
+++ b/src/wdt.c
@@ -41,6 +41,7 @@
   */
  static struct {
         timer_t timer;
+       uint64_t stuck_start; /* CPU time when the scheduler was last marked as possibly stuck */
  } per_thread_wd_ctx[MAX_THREADS];
  
  /* warn about stuck tasks after this delay (ns) */
@@ -89,14 +90,6 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
  
                 tgrp = ha_thread_info[thr].tgid;
                 thr_bit = ha_thread_info[thr].ltid_bit;
-               p = ha_thread_ctx[thr].prev_cpu_time;
-               n = now_cpu_time_thread(thr);
-
-               /* not yet reached the deadline of 1 sec,
-                * or p wasn't initialized yet
-                */
-               if (!p)
-                       goto update_and_leave;
  
                 if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) ||
                     (_HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp-1].threads_harmless) & thr_bit)) {
@@ -109,38 +102,21 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
                         goto update_and_leave;
                 }
  
-               /* So the thread indeed appears locked up. In order to be
-                * certain that we're not witnessing an exceptional spike of
-                * CPU usage due to a configuration issue (like running tens
-                * of thousands of tasks in a single loop), we'll check if the
-                * scheduler is still alive by setting the TH_FL_STUCK flag
-                * that the scheduler clears when switching to the next task.
-                * If it's already set, then it's our second call with no
-                * progress and the thread is dead. However, if we figure
-                * that the scheduler made no progress since last time, we'll
-                * at least emit a warning.
+               /* check whether the scheduler is still running. The first time
+                * we check, we mark it as possibly stuck to challenge it, we
+                * store the last date where we did this, and we quit. On next
+                * wakeup, if it has not moved, we'll wake up the suspicious
+                * thread which will perform its own date checks. This way we
+                * avoid complex computations in a possibly unrelated thread
+                * and don't wake another thread up as long as everything's OK.
                  */
-               if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
-                       /* after one second it's clear that we're stuck */
-                       if (n - p >= 1000000000ULL) {
-                               _HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
-                               goto update_and_leave;
-                       }
-                       else if (n - p < (ullong)wdt_warn_blocked_traffic_ns) {
-                               /* if we haven't crossed the warning boundary,
-                                * let's just refresh the reporting thread's timer.
-                                */
-                               goto update_and_leave;
-                       }
+               if (is_sched_alive(thr)) {
+                       n = now_cpu_time_thread(thr);
+                       _HA_ATOMIC_STORE(&per_thread_wd_ctx[thr].stuck_start, n);
+                       goto update_and_leave;
                 }
  
-               /* OK so we've crossed the warning boundary and possibly the
-                * panic one as well. This may only be reported by the original
-                * thread. Let's fall back to the common code below which will
-                * possibly bounce to the reporting thread, which will then
-                * check the ctxsw count and decide whether to do nothing, to
-                * warn, or either panic.
-                */
+               /* Suspiciously didn't change: fall through to target thread signaling */
                 break;
  
  #if defined(USE_THREAD) && defined(SI_TKILL) /* Linux uses this */
@@ -165,8 +141,8 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
         }
  
         /* Right here, we either got a bounce from another thread's WDT to
-        * report a crossed period, or we noticed it for the current thread.
-        * For other threads, we're bouncing.
+        * report a suspiciously stuck scheduler, or we noticed it for the
+        * current thread. For other threads, we're bouncing.
          */
  #ifdef USE_THREAD
         if (thr != tid) {
@@ -175,23 +151,52 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
         }
  #endif
  
-       /* Now the interesting things begin. We're on the thread of interest.
-        * Its timer was at least as large as the warning threshold since poll
-        * was left. If it was at least as high as the panic threshold, we also
-        * have TH_FL_STUCK, which now proves that nothing is happening since
-        * the scheduler clears it for each task. We can still recheck whether
-        * the scheduler looks alive and get away with all of this if we've got
-        * a proof that it's making forward progress. If stuck, we have to die,
-        * otherwise we just send a warning. In short, is_sched_alive() serves
-        * as a ping to detect the warning condition while TH_FL_STUCK works
-        * the same but for a panic condition.
+       /* OK here we're on the target thread (thr==tid). It was reported that
+        * the scheduler was not moving. This might have changed since, if we
+        * got that from another thread. Otherwise we'll run time checks to
+        * verify the situation, and possibly the need to warn or panic.
+        */
+       n = now_cpu_time();
+
+       if (is_sched_alive(thr)) {
+               _HA_ATOMIC_STORE(&per_thread_wd_ctx[thr].stuck_start, n);
+               goto update_and_leave;
+       }
+
+       /* check when we saw last activity (in CPU time) */
+       p = ha_thread_ctx[thr].prev_cpu_time;
+
+       /* p not yet initialized (e.g. signal received during early boot) */
+       if (!p)
+               goto update_and_leave;
+
+       /* check the most recent known activity */
+       if (p < per_thread_wd_ctx[thr].stuck_start)
+               p = per_thread_wd_ctx[thr].stuck_start;
+
+       /* if we haven't crossed the warning boundary, let's just refresh the
+        * reporting thread's timer.
+        */
+       if (n - p < (ullong)wdt_warn_blocked_traffic_ns)
+               goto update_and_leave;
+
+       /* The thread indeed appears locked up, it hasn't made any progress
+        * for at least the configured warning time. If it crosses the second,
+        * we'll mark it with TH_FL_STUCK so that the next call will panic.
+        * Doing so still permits exceptionally long operations to mark
+        * themselves as under control and not stuck to avoid the panic.
+        * Otherwise we just emit a warning, and this one doesn't consider
+        * TH_FL_STUCK (i.e. a slow code path must always be reported to the
+        * user, even if under control).
          */
         if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_STUCK)
                 ha_panic();
  
-       if (!is_sched_alive(thr))
-               ha_stuck_warning();
+       /* after one second it's clear that we're stuck */
+       if (n - p >= 1000000000ULL)
+               _HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
  
+       ha_stuck_warning();
         /* let's go on */
  
   update_and_leave:
author	Willy Tarreau <w@1wt.eu>
	Wed, 1 Oct 2025 06:28:54 +0000 (08:28 +0200)
committer	Willy Tarreau <w@1wt.eu>
	Wed, 1 Oct 2025 08:18:53 +0000 (10:18 +0200)