From 57230d7f229cdf1caab90daef3b07be3e32e34c5 Mon Sep 17 00:00:00 2001 From: "W.C.A. Wijngaards" Date: Thu, 1 Sep 2022 15:14:20 +0200 Subject: [PATCH] - Fix to log a verbose message at operational notice level if a thread is not responding, to stats requests. It is logged with thread identifiers. --- config.h.in | 9 +++++++ configure | 70 ++++++++++++++++++++++++++++++++++++++++++++++++- configure.ac | 6 ++++- daemon/stats.c | 26 ++++++++++++++++++ daemon/worker.c | 3 +++ daemon/worker.h | 4 +++ doc/Changelog | 3 +++ util/tube.c | 42 +++++++++++++++++++++++++++++ util/tube.h | 8 ++++++ 9 files changed, 169 insertions(+), 2 deletions(-) diff --git a/config.h.in b/config.h.in index 2a5214803..2caecf30d 100644 --- a/config.h.in +++ b/config.h.in @@ -298,6 +298,9 @@ /* Define to 1 if you have the `getrlimit' function. */ #undef HAVE_GETRLIMIT +/* Define to 1 if you have the `gettid' function. */ +#undef HAVE_GETTID + /* Define to 1 if you have the `glob' function. */ #undef HAVE_GLOB @@ -806,12 +809,18 @@ /* Shared data */ #undef SHARE_DIR +/* The size of `pthread_t', as computed by sizeof. */ +#undef SIZEOF_PTHREAD_T + /* The size of `size_t', as computed by sizeof. */ #undef SIZEOF_SIZE_T /* The size of `time_t', as computed by sizeof. */ #undef SIZEOF_TIME_T +/* The size of `unsigned long', as computed by sizeof. */ +#undef SIZEOF_UNSIGNED_LONG + /* define if (v)snprintf does not return length needed, (but length used) */ #undef SNPRINTF_RET_BROKEN diff --git a/configure b/configure index cd44265fb..c1f244e4f 100755 --- a/configure +++ b/configure @@ -15256,6 +15256,74 @@ cat >>confdefs.h <<_ACEOF _ACEOF +# The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long" >&5 +$as_echo_n "checking size of unsigned long... 
" >&6; } +if ${ac_cv_sizeof_unsigned_long+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long))" "ac_cv_sizeof_unsigned_long" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_long" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned long) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_long=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long" >&5 +$as_echo "$ac_cv_sizeof_unsigned_long" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_UNSIGNED_LONG $ac_cv_sizeof_unsigned_long +_ACEOF + + +if test x_$ub_have_pthreads != x_yes; then + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of pthread_t" >&5 +$as_echo_n "checking size of pthread_t... 
" >&6; } +if ${ac_cv_sizeof_pthread_t+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (pthread_t))" "ac_cv_sizeof_pthread_t" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_pthread_t" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (pthread_t) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_pthread_t=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_pthread_t" >&5 +$as_echo "$ac_cv_sizeof_pthread_t" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_PTHREAD_T $ac_cv_sizeof_pthread_t +_ACEOF + + +fi # add option to disable the evil rpath @@ -20594,7 +20662,7 @@ if test "$ac_res" != no; then : fi -for ac_func in tzset sigprocmask fcntl getpwnam endpwent getrlimit setrlimit setsid chroot kill chown sleep usleep random srandom recvmsg sendmsg writev socketpair glob initgroups strftime localtime_r setusercontext _beginthreadex endservent endprotoent fsync shmget accept4 getifaddrs if_nametoindex poll +for ac_func in tzset sigprocmask fcntl getpwnam endpwent getrlimit setrlimit setsid chroot kill chown sleep usleep random srandom recvmsg sendmsg writev socketpair glob initgroups strftime localtime_r setusercontext _beginthreadex endservent endprotoent fsync shmget accept4 getifaddrs if_nametoindex poll gettid do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 63902646e..a2b1a8201 100644 --- a/configure.ac +++ b/configure.ac @@ -487,6 +487,10 @@ AC_INCLUDES_DEFAULT #endif ]) AC_CHECK_SIZEOF(size_t) +AC_CHECK_SIZEOF([unsigned long]) +if test x_$ub_have_pthreads != x_yes; then + AC_CHECK_SIZEOF(pthread_t) +fi # add option to disable the evil rpath ACX_ARG_RPATH @@ -1644,7 +1648,7 @@ 
AC_LINK_IFELSE([AC_LANG_PROGRAM([ AC_MSG_RESULT(no)) AC_SEARCH_LIBS([setusercontext], [util]) -AC_CHECK_FUNCS([tzset sigprocmask fcntl getpwnam endpwent getrlimit setrlimit setsid chroot kill chown sleep usleep random srandom recvmsg sendmsg writev socketpair glob initgroups strftime localtime_r setusercontext _beginthreadex endservent endprotoent fsync shmget accept4 getifaddrs if_nametoindex poll]) +AC_CHECK_FUNCS([tzset sigprocmask fcntl getpwnam endpwent getrlimit setrlimit setsid chroot kill chown sleep usleep random srandom recvmsg sendmsg writev socketpair glob initgroups strftime localtime_r setusercontext _beginthreadex endservent endprotoent fsync shmget accept4 getifaddrs if_nametoindex poll gettid]) AC_CHECK_FUNCS([setresuid],,[AC_CHECK_FUNCS([setreuid])]) AC_CHECK_FUNCS([setresgid],,[AC_CHECK_FUNCS([setregid])]) diff --git a/daemon/stats.c b/daemon/stats.c index 57c428271..02348aada 100644 --- a/daemon/stats.c +++ b/daemon/stats.c @@ -69,6 +69,10 @@ #ifdef HAVE_SSL #include #endif +#include /* DEBUG */ + +/** How long to wait for threads to transmit statistics, in msec. 
*/ +#define STATS_THREAD_WAIT 60000 /** add timers and the values do not overflow or become negative */ static void @@ -380,6 +384,28 @@ void server_stats_obtain(struct worker* worker, struct worker* who, worker_send_cmd(who, worker_cmd_stats); else worker_send_cmd(who, worker_cmd_stats_noreset); verbose(VERB_ALGO, "wait for stats reply"); + if(tube_wait_timeout(worker->cmd, STATS_THREAD_WAIT) == 0) { + verbose(VERB_OPS, "no response from thread %d" +#ifdef HAVE_GETTID + " LWP %u" +#endif +#if defined(HAVE_PTHREAD) && defined(SIZEOF_PTHREAD_T) && defined(SIZEOF_UNSIGNED_LONG) +# if SIZEOF_PTHREAD_T == SIZEOF_UNSIGNED_LONG + " pthread 0x%lx" +# endif +#endif + , + who->thread_num +#ifdef HAVE_GETTID + , (unsigned)who->thread_tid +#endif +#if defined(HAVE_PTHREAD) && defined(SIZEOF_PTHREAD_T) && defined(SIZEOF_UNSIGNED_LONG) +# if SIZEOF_PTHREAD_T == SIZEOF_UNSIGNED_LONG + , (unsigned long)*((unsigned long*)&who->thr_id) +# endif +#endif + ); + } if(!tube_read_msg(worker->cmd, &reply, &len, 0)) fatal_exit("failed to read stats over cmd channel"); if(len != (uint32_t)sizeof(*s)) diff --git a/daemon/worker.c b/daemon/worker.c index 010c4dc0a..4a06f53ec 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -1903,6 +1903,9 @@ worker_init(struct worker* worker, struct config_file *cfg, struct dt_env* dtenv = &worker->dtenv; #else void* dtenv = NULL; +#endif +#ifdef HAVE_GETTID + worker->thread_tid = gettid(); #endif worker->need_to_exit = 0; worker->base = comm_base_create(do_sigs); diff --git a/daemon/worker.h b/daemon/worker.h index 3887d0405..3fb52abd9 100644 --- a/daemon/worker.h +++ b/daemon/worker.h @@ -86,6 +86,10 @@ struct worker { struct daemon* daemon; /** thread id */ ub_thread_type thr_id; +#ifdef HAVE_GETTID + /** thread tid, the LWP id. 
*/
+	pid_t thread_tid;
+#endif
 	/** pipe, for commands for this worker */
 	struct tube* cmd;
 	/** the event base this worker works with */
diff --git a/doc/Changelog b/doc/Changelog
index b4abf6f0d..8af4c6e0d 100644
--- a/doc/Changelog
+++ b/doc/Changelog
@@ -3,6 +3,9 @@
 	  work on OpenBSD.
 	- Slow down log frequency of write wait failures.
 	- Fix to set out of file descriptor warning to operational verbosity.
+	- Fix to log a verbose message at operational notice level if a
+	  thread is not responding, to stats requests. It is logged with
+	  thread identifiers.
 
 31 August 2022: Wouter
 	- Fix to avoid process wide fcntl calls mixed with nonblocking
diff --git a/util/tube.c b/util/tube.c
index 40556e720..43455feef 100644
--- a/util/tube.c
+++ b/util/tube.c
@@ -424,6 +424,28 @@ int tube_wait(struct tube* tube)
 	return pollit(tube->sr, NULL);
 }
 
+int tube_wait_timeout(struct tube* tube, int msec)
+{
+	/* wait until tube->sr is readable, for at most msec milliseconds */
+	struct timeval t;
+	int fd = tube->sr;
+	fd_set r;
+	/* an fd_set cannot hold such an fd, FD_SET would write out of bounds */
+	if(fd < 0 || fd >= (int)FD_SETSIZE)
+		return -1;
+	t.tv_sec = msec/1000;
+	t.tv_usec = (msec%1000)*1000;
+#ifndef S_SPLINT_S
+	FD_ZERO(&r);
+	FD_SET(FD_SET_T fd, &r);
+#endif
+	while(select(fd+1, &r, NULL, NULL, &t) == -1) {
+		if(errno != EAGAIN && errno != EINTR)
+			return -1; /* a real error, not an interruption */
+	}
+	return FD_ISSET(fd, &r) ? 1 : 0; /* 0 means it timed out */
+}
+
 int tube_read_fd(struct tube* tube)
 {
 	return tube->sr;
@@ -649,6 +671,26 @@ int tube_wait(struct tube* tube)
 	return 1;
 }
 
+int tube_wait_timeout(struct tube* tube, int msec)
+{
+	/* block on eventhandle, for at most msec milliseconds */
+	DWORD res = WSAWaitForMultipleEvents(
+		1 /* one event in array */,
+		&tube->event /* the event to wait for, our pipe signal */,
+		0 /* wait for all events is false */,
+		msec /* wait for timeout */,
+		0 /* we are not alertable for IO completion routines */
+		);
+	if(res == WSA_WAIT_TIMEOUT) {
+		return 0;
+	}
+	if(res == WSA_WAIT_FAILED || res == WSA_WAIT_IO_COMPLETION) {
+		/* wait failed, or unexpected since we were not alertable */
+		return -1;
+	}
+	return 1;
+}
+
 int tube_read_fd(struct tube* ATTR_UNUSED(tube))
 {
 	/* nothing sensible on Windows */
diff --git 
a/util/tube.h b/util/tube.h index 5b1fdb8e8..5e4fb8644 100644 --- a/util/tube.h +++ b/util/tube.h @@ -204,6 +204,14 @@ int tube_poll(struct tube* tube); */ int tube_wait(struct tube* tube); +/** + * Wait until there is data to read on the tube, or the timeout expires. + * @param tube: the tube to wait on. + * @param msec: timeout in milliseconds. + * @return 1 if there is something to read within the timeout, 0 if the + * timeout expired first, -1 on failure. */ +int tube_wait_timeout(struct tube* tube, int msec); + /** * Get FD that is readable when new information arrives. * @param tube -- 2.47.3