From: Willy Tarreau Date: Wed, 22 May 2019 05:06:44 +0000 (+0200) Subject: MINOR: threads: add a "stuck" flag to the thread_info struct X-Git-Tag: v2.0-dev4~23 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e6a02fa65a;p=thirdparty%2Fhaproxy.git MINOR: threads: add a "stuck" flag to the thread_info struct This flag is constantly cleared by the scheduler and will be set by the watchdog timer to detect stuck threads. It is also set by the "show threads" command so that it is easy to spot if the situation has evolved between two subsequent calls : if the first "show threads" shows no stuck thread and the second one shows such a stuck thread, it indicates that this thread didn't manage to make any forward progress since the previous call, which is extremely suspicious. --- diff --git a/doc/management.txt b/doc/management.txt index 9baea96b3a..09ae26e8a0 100644 --- a/doc/management.txt +++ b/doc/management.txt @@ -2525,11 +2525,18 @@ show threads an advanced dump mechanism involving thread signals is used so that each thread can dump its own state in turn. Without this option, the thread processing the command shows all its details but the other ones are less - detailed. A stat ('*') is displayed in front of the thread handling the - command. The output format is purposely not documented so that it can easily - evolve as new needs are identified, without having to maintain any backwards - compatibility, and just like with "show activity", the values are only - meaningful with the code at hand. + detailed. A star ('*') is displayed in front of the thread handling the + command. A right angle bracket ('>') may also be displayed in front of + threads which didn't make any progress since last invocation of this command, + indicating a bug in the code which must absolutely be reported. When this + happens between two threads it usually indicates a deadlock. If a thread is + alone, it's a different bug like a corrupted list. In all cases the process + needs is not fully functional anymore and needs to be restarted. + + The output format is purposely not documented so that it can easily evolve as + new needs are identified, without having to maintain any form of backwards + compatibility, and just like with "show activity", the values are meaningless + without the code at hand. show tls-keys [id|*] Dump all loaded TLS ticket keys references. The TLS ticket key reference ID diff --git a/include/common/hathreads.h b/include/common/hathreads.h index 77bd617aa8..32099094c0 100644 --- a/include/common/hathreads.h +++ b/include/common/hathreads.h @@ -38,6 +38,10 @@ * only one thread is enabled, it equals 1. */ +/* thread info flags, for thread_info[].flags */ +#define TI_FL_STUCK 0x00000001 + + #ifndef USE_THREAD #define MAX_THREADS 1 @@ -57,6 +61,7 @@ extern struct thread_info { uint64_t prev_cpu_time; /* previous per thread CPU time */ uint64_t prev_mono_time; /* previous system wide monotonic time */ unsigned int idle_pct; /* idle to total ratio over last sample (percent) */ + unsigned int flags; /* thread info flags, TI_FL_* */ /* pad to cache line (64B) */ char __pad[0]; /* unused except to check remaining room */ char __end[0] __attribute__((aligned(64))); @@ -405,6 +410,7 @@ extern struct thread_info { uint64_t prev_cpu_time; /* previous per thread CPU time */ uint64_t prev_mono_time; /* previous system wide monotonic time */ unsigned int idle_pct; /* idle to total ratio over last sample (percent) */ + unsigned int flags; /* thread info flags, TI_FL_* */ /* pad to cache line (64B) */ char __pad[0]; /* unused except to check remaining room */ char __end[0] __attribute__((aligned(64))); diff --git a/src/debug.c b/src/debug.c index e47b85c83a..b34df72018 100644 --- a/src/debug.c +++ b/src/debug.c @@ -33,18 +33,20 @@ * optionally extra info for the current thread. The dump will be appended to * the buffer, so the caller is responsible for preliminary initializing it. * The calling thread ID needs to be passed in to display a star - * in front of the calling thread's line (usually it's tid). + * in front of the calling thread's line (usually it's tid). Any stuck thread + * is also prefixed with a '>'. */ void ha_thread_dump(struct buffer *buf, int thr, int calling_tid) { unsigned long thr_bit = 1UL << thr; unsigned long long p = thread_info[thr].prev_cpu_time; unsigned long long n = now_cpu_time_thread(&thread_info[thr]); + int stuck = !!(thread_info[thr].flags & TI_FL_STUCK); chunk_appendf(buf, - "%c Thread %-2u: act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n" - " fdcache=%d prof=%d", - (thr == calling_tid) ? '*' : ' ', thr + 1, + "%c%cThread %-2u: act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n" + " stuck=%d fdcache=%d prof=%d", + (thr == calling_tid) ? '*' : ' ', stuck ? '>' : ' ', thr + 1, !!(active_tasks_mask & thr_bit), !!(global_tasks_mask & thr_bit), !eb_is_empty(&task_per_thread[thr].timers), @@ -52,6 +54,7 @@ void ha_thread_dump(struct buffer *buf, int thr, int calling_tid) !LIST_ISEMPTY(&task_per_thread[thr].task_list), task_per_thread[thr].task_list_size, task_per_thread[thr].rqueue_size, + stuck, !!(fd_cache_mask & thr_bit), !!(task_profiling_mask & thr_bit)); @@ -467,6 +470,12 @@ void debug_handler(int sig, siginfo_t *si, void *arg) else ha_thread_relax(); } + + /* mark the current thread as stuck to detect it upon next invocation + * if it didn't move. + */ + if (!((threads_harmless_mask|sleeping_thread_mask) & tid_bit)) + ti->flags |= TI_FL_STUCK; } static int init_debug_per_thread() diff --git a/src/task.c b/src/task.c index afdd2b85e4..5cf7d74e2e 100644 --- a/src/task.c +++ b/src/task.c @@ -281,6 +281,8 @@ void process_runnable_tasks() struct task *t; int max_processed; + ti->flags &= ~TI_FL_STUCK; // this thread is still running + if (!(active_tasks_mask & tid_bit)) { activity[tid].empty_rq++; return; @@ -372,6 +374,7 @@ void process_runnable_tasks() __ha_barrier_atomic_store(); __task_remove_from_tasklet_list(t); + ti->flags &= ~TI_FL_STUCK; // this thread is still running activity[tid].ctxsw++; ctx = t->context; process = t->process;