From: Martin Schwenke Date: Mon, 19 Feb 2024 10:42:11 +0000 (+1100) Subject: ctdb-scripts: Add service_stats_command variable to NFS checks X-Git-Tag: tdb-1.4.11~302 X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=5b7d17d44d9abdf2f3e15b225bd53b6d21ac41dd;p=thirdparty%2Fsamba.git ctdb-scripts: Add service_stats_command variable to NFS checks When monitoring an RPC service, the rpcinfo command might time out even though the service is making progress. In this case, it is just slow, so counting the timeout as a failure and potentially restarting the service will not help. The problem is determining if a service is making progress. Add a new NFS checks service_stats_command. This command is intended to run a statistics command. The output is naively compared using cmp(1). If the output changes then rpcinfo failures are converted to successes. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script index 6935ad9fadc..867cc72d50c 100755 --- a/ctdb/config/events/legacy/60.nfs.script +++ b/ctdb/config/events/legacy/60.nfs.script @@ -80,6 +80,13 @@ nfs_check_services() # traces of threads that have not exited, since # they may be stuck doing I/O; # no default, see also function program_stack_traces() +# * service_stats_cmd - command to retrieve statistics for given service; +# if this is set and RPC checks fail (or +# $service_check_cmd fails), then statistics are +# compared (using cmp) to see if the service is +# making progress or is truly hung; +# no default, failed service does not double-check +# failure using statistics # # Quoting in values is not preserved # @@ -103,6 +110,7 @@ nfs_check_service() service_start_cmd="" service_check_cmd="" service_debug_cmd="" + service_stats_cmd="" # Eval line-by-line. Expands variable references in values. # Also allows variable name checking, which seems useful. @@ -113,7 +121,8 @@ nfs_check_service() family=* | version=* | \ unhealthy_after=* | restart_every=* | \ service_stop_cmd=* | service_start_cmd=* | \ - service_check_cmd=* | service_debug_cmd=*) + service_check_cmd=* | service_debug_cmd=* | \ + service_stats_cmd=*) eval "$_line" ;; @@ -144,6 +153,30 @@ nfs_check_service() fi fi + if [ -n "$service_stats_cmd" ]; then + # If configured, always update stats, + # regardless of RPC status... + + # shellcheck disable=SC2154 + # script_state_dir set by ctdb_setup_state_dir + _curr="${script_state_dir}/stats_${_progname}.out" + _prev="${_curr}.prev" + + if [ -f "$_curr" ]; then + mv -f "$_curr" "$_prev" + fi + eval "$service_stats_cmd" >"$_curr" 2>&1 + + if ! $_ok && + ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then + # Stats always implicitly change on + # the first monitor event, since + # previous stats don't exists... + echo "WARNING: statistics changed but ${_err}" + _ok=true + fi + fi + if $_ok; then if [ $unhealthy_after -ne 1 ] || [ $restart_every -ne 0 ]; then diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh new file mode 100755 index 00000000000..860436328d4 --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "NFS RPC service down, stats change, 10 iterations" + +setup + +cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <"$_rc_file" + printf 'WARNING: statistics changed but %s\n' \ + "$_rpc_check_out" >>"$_out" + elif [ $unhealthy_after -gt 0 ] && [ "$_numfails" -ge $unhealthy_after ]; then _unhealthy=true echo 1 >"$_rc_file" @@ -411,9 +441,17 @@ EOF fi if [ -n "$_rpc_service" ]; then if rpcinfo -T tcp localhost "$_rpc_service" \ - >/dev/null 2>&1 ; then + >/dev/null 2>&1; then _iterate_failcount=0 + elif nfs_stats_check_changed \ + "$_rpc_service" "$_iteration"; then + _iterate_failcount=-1 else + # -1 above is a special case of 0: + # hack, unhack ;-) + if [ $_iterate_failcount -eq -1 ]; then + _iterate_failcount=0 + fi _iterate_failcount=$((_iterate_failcount + 1)) fi rpc_set_service_failure_response \