void wdt_handler(int sig, siginfo_t *si, void *arg)
{
unsigned long long n, p;
+ uint prev_ctxsw, curr_ctxsw;
ulong thr_bit;
int thr, tgrp;
* at least emit a warning.
*/
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
- uint prev_ctxsw;
-
- prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[thr].prev_ctxsw);
-
- /* only after one second it's clear we're stuck */
+ /* after one second it's clear that we're stuck */
if (n - p >= 1000000000ULL)
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
-
- /* have we crossed the warning boundary ? If so we note were we
- * where, and second time called from the same place will trigger
- * a warning (unless already stuck).
- */
- if (n - p >= (ullong)wdt_warn_blocked_traffic_ns) {
- uint curr_ctxsw = HA_ATOMIC_LOAD(&activity[thr].ctxsw);
-
- if (curr_ctxsw == prev_ctxsw)
- ha_stuck_warning(thr);
- HA_ATOMIC_STORE(&per_thread_wd_ctx[thr].prev_ctxsw, curr_ctxsw);
+ else if (n - p < (ullong)wdt_warn_blocked_traffic_ns) {
+ /* if we haven't crossed the warning boundary,
+ * let's just refresh the reporting thread's timer.
+ */
+ goto update_and_leave;
}
- goto update_and_leave;
+ /* OK so we've crossed the warning boundary and possibly the
+ * panic one as well. This may only be reported by the original
+ * thread. Let's fall back to the common code below which will
+ * possibly bounce to the reporting thread, which will then
+ * check the ctxsw count and decide whether to do nothing, to
+ * warn, or to panic.
+ */
}
/* No doubt now, there's no hop to recover, die loudly! */
return;
}
- /* By default we terminate. If we're not on the victim thread, better
- * bounce the signal there so that we produce a cleaner stack trace
- * with the other thread interrupted exactly where it was running and
- * the current one not involved in this.
+ /* Right here, we either got a bounce from another thread's WDT to
+ * report a crossed period, or we noticed it for the current thread.
+ * If it concerns another thread, we bounce the signal to that thread.
*/
#ifdef USE_THREAD
- if (thr != tid)
+ if (thr != tid) {
ha_tkill(thr, sig);
- else
+ goto leave;
+ }
#endif
+
+ /* Now the interesting things begin. The timer was at least as large
+ * as the warning threshold. If the stuck bit was set, we must now
+ * panic. Otherwise we're checking if we're still context-switching
+ * or not and we'll either warn if not, or just update the ctxsw
+ * counter to check next time.
+ */
+ if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_STUCK)
ha_panic();
- _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
- return;
+ prev_ctxsw = per_thread_wd_ctx[tid].prev_ctxsw;
+ curr_ctxsw = activity[tid].ctxsw;
+
+ if (curr_ctxsw == prev_ctxsw)
+ ha_stuck_warning(tid);
+ else
+ per_thread_wd_ctx[tid].prev_ctxsw = curr_ctxsw;
+ /* let's go on */
update_and_leave:
wdt_ping(thr);
-
+ leave:
_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
}