]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
MEDIUM: wdt: always make the faulty thread report its own warnings
authorWilly Tarreau <w@1wt.eu>
Thu, 17 Apr 2025 06:42:36 +0000 (08:42 +0200)
committerWilly Tarreau <w@1wt.eu>
Thu, 17 Apr 2025 14:25:47 +0000 (16:25 +0200)
Warnings remain tricky to deal with, especially for other threads as
they require some inter-thread synchronization that doesn't cope very
well with other parallel activities such as "show threads" for example.

However there is nothing that forces us to handle them this way. The
panic for example is already handled by bouncing the WDT signal to the
faulty thread.

This commit rearranges the WDT handler to make a better used of this
existing signal bouncing feature of the WDT handler so that it's no
longer limited to panics but can also deal with warnings. In order not
to bounce on all wakeups, we only bounce when there is a suspicion,
that is, when the warning timer has been crossed. We'll let the target
thread verify the stuck flag and context switch count by itself to
decide whether or not to panic, warn, or just do nothing and update
the counters.

As a bonus, now all warning traces look the same regardless of the
reporting thread:

   call trace(16):
   |       0x6bc733 <01 00 00 e8 6d e6 de ff]: ha_dump_backtrace+0x73/0x309 > main-0x2570
   |       0x6bd37a <00 00 00 e8 d6 fb ff ff]: ha_thread_dump_fill+0xda/0x104 > ha_thread_dump_one
   |       0x6bd625 <00 00 00 e8 7b fc ff ff]: ha_stuck_warning+0xc5/0x19e > ha_thread_dump_fill
   |       0x7b2b60 <64 8b 3b e8 00 aa f0 ff]: wdt_handler+0x1f0/0x212 > ha_stuck_warning
   | 0x7fd7e2cef3a0 <00 00 00 00 0f 1f 40 00]: libpthread:+0x123a0
   | 0x7ffc6af9e634 <85 a6 00 00 00 0f 01 f9]: linux-vdso:__vdso_gettimeofday+0x34/0x2b0
   |       0x6bad74 <7c 24 10 e8 9c 01 df ff]: sc_conn_io_cb+0x9fa4 > main-0x2400
   |       0x67c457 <89 f2 4c 89 e6 41 ff d0]: main+0x1cf147
   |       0x67d401 <48 89 df e8 8f ed ff ff]: cli_io_handler+0x191/0xb38 > main+0x1cee80
   |       0x6dd605 <40 48 8b 45 60 ff 50 18]: task_process_applet+0x275/0xce9

src/wdt.c

index 16af0a914393c45554b0ca744693910378bbf1f9..dc33f9621c82dd25cdb75f5ad616880657ab0a95 100644 (file)
--- a/src/wdt.c
+++ b/src/wdt.c
@@ -65,6 +65,7 @@ int wdt_ping(int thr)
 void wdt_handler(int sig, siginfo_t *si, void *arg)
 {
        unsigned long long n, p;
+       uint prev_ctxsw, curr_ctxsw;
        ulong thr_bit;
        int thr, tgrp;
 
@@ -123,27 +124,23 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
                 * at least emit a warning.
                 */
                if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
-                       uint prev_ctxsw;
-
-                       prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[thr].prev_ctxsw);
-
-                       /* only after one second it's clear we're stuck */
+                       /* after one second it's clear that we're stuck */
                        if (n - p >= 1000000000ULL)
                                _HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
-
-                       /* have we crossed the warning boundary ? If so we note were we
-                        * where, and second time called from the same place will trigger
-                        * a warning (unless already stuck).
-                        */
-                       if (n - p >= (ullong)wdt_warn_blocked_traffic_ns) {
-                               uint curr_ctxsw = HA_ATOMIC_LOAD(&activity[thr].ctxsw);
-
-                               if (curr_ctxsw == prev_ctxsw)
-                                       ha_stuck_warning(thr);
-                               HA_ATOMIC_STORE(&per_thread_wd_ctx[thr].prev_ctxsw, curr_ctxsw);
+                       else if (n - p < (ullong)wdt_warn_blocked_traffic_ns) {
+                               /* if we haven't crossed the warning boundary,
+                                * let's just refresh the reporting thread's timer.
+                                */
+                               goto update_and_leave;
                        }
 
-                       goto update_and_leave;
+                       /* OK so we've crossed the warning boundary and possibly the
+                        * panic one as well. This may only be reported by the original
+                        * thread. Let's fall back to the common code below which will
+                        * possibly bounce to the reporting thread, which will then
+                        * check the ctxsw count and decide whether to do nothing, to
+                        * warn, or either panic.
+                        */
                }
 
                /* No doubt now, there's no hop to recover, die loudly! */
@@ -170,24 +167,38 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
                return;
        }
 
-       /* By default we terminate. If we're not on the victim thread, better
-        * bounce the signal there so that we produce a cleaner stack trace
-        * with the other thread interrupted exactly where it was running and
-        * the current one not involved in this.
+       /* Right here, we either got a bounce from another thread's WDT to
+        * report a crossed period, or we noticed it for the current thread.
+        * For other threads, we're bouncing.
         */
 #ifdef USE_THREAD
-       if (thr != tid)
+       if (thr != tid) {
                ha_tkill(thr, sig);
-       else
+               goto leave;
+       }
 #endif
+
+       /* Now the interesting things begin. The timer was at least as large
+        * as the warning threshold. If the stuck bit was set, we must now
+        * panic. Otherwise we're checking if we're still context-switching
+        * or not and we'll either warn if not, or just update the ctxsw
+        * counter to check next time.
+        */
+       if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_STUCK)
                ha_panic();
 
-       _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
-       return;
+       prev_ctxsw = per_thread_wd_ctx[tid].prev_ctxsw;
+       curr_ctxsw = activity[tid].ctxsw;
+
+       if (curr_ctxsw == prev_ctxsw)
+               ha_stuck_warning(tid);
+       else
+               per_thread_wd_ctx[tid].prev_ctxsw = curr_ctxsw;
+       /* let's go on */
 
  update_and_leave:
        wdt_ping(thr);
-
+ leave:
        _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
 }