]> git.ipfire.org Git - thirdparty/samba.git/commitdiff
ctdb-scripts: Add service_stats_command variable to NFS checks
authorMartin Schwenke <mschwenke@ddn.com>
Mon, 19 Feb 2024 10:42:11 +0000 (21:42 +1100)
committerMartin Schwenke <martins@samba.org>
Tue, 25 Jun 2024 03:16:37 +0000 (03:16 +0000)
When monitoring an RPC service, the rpcinfo command might time out
even though the service is making progress.  In this case, it is just
slow, so counting the timeout as a failure and potentially restarting
the service will not help.  The problem is determining if a service is
making progress.

Add a new NFS checks service_stats_command.  This command is intended
to run a statistics command.  The output is naively compared using
cmp(1).  If the output changes then rpcinfo failures are converted to
successes.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/events/legacy/60.nfs.script
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh

index 6935ad9fadcef375c97202418cc30c49a6a6b703..867cc72d50c5a27746e04f10b041063004cac0ac 100755 (executable)
@@ -80,6 +80,13 @@ nfs_check_services()
 #                        traces of threads that have not exited, since
 #                        they may be stuck doing I/O;
 #                        no default, see also function program_stack_traces()
+# * service_stats_cmd  - command to retrieve statistics for  given service;
+#                        if this is set and RPC checks fail (or
+#                        $service_check_cmd fails), then statistics are
+#                        compared (using cmp) to see if the service is
+#                        making progress or is truly hung;
+#                        no default, failed service does not double-check
+#                        failure using statistics
 #
 # Quoting in values is not preserved
 #
@@ -103,6 +110,7 @@ nfs_check_service()
                service_start_cmd=""
                service_check_cmd=""
                service_debug_cmd=""
+               service_stats_cmd=""
 
                # Eval line-by-line.  Expands variable references in values.
                # Also allows variable name checking, which seems useful.
@@ -113,7 +121,8 @@ nfs_check_service()
                        family=* | version=* | \
                                unhealthy_after=* | restart_every=* | \
                                service_stop_cmd=* | service_start_cmd=* | \
-                               service_check_cmd=* | service_debug_cmd=*)
+                               service_check_cmd=* | service_debug_cmd=* | \
+                               service_stats_cmd=*)
 
                                eval "$_line"
                                ;;
@@ -144,6 +153,30 @@ nfs_check_service()
                        fi
                fi
 
+               if [ -n "$service_stats_cmd" ]; then
+                       # If configured, always update stats,
+                       # regardless of RPC status...
+
+                       # shellcheck disable=SC2154
+                       # script_state_dir set by ctdb_setup_state_dir
+                       _curr="${script_state_dir}/stats_${_progname}.out"
+                       _prev="${_curr}.prev"
+
+                       if [ -f "$_curr" ]; then
+                               mv -f "$_curr" "$_prev"
+                       fi
+                       eval "$service_stats_cmd" >"$_curr" 2>&1
+
+                       if ! $_ok &&
+                               ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
+                               # Stats always implicitly change on
+                               # the first monitor event, since
+                               # previous stats don't exists...
+                               echo "WARNING: statistics changed but ${_err}"
+                               _ok=true
+                       fi
+               fi
+
                if $_ok; then
                        if [ $unhealthy_after -ne 1 ] ||
                                [ $restart_every -ne 0 ]; then
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
new file mode 100755 (executable)
index 0000000..8604363
--- /dev/null
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="date --rfc-3339=ns | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "nfs" "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
new file mode 100755 (executable)
index 0000000..1bdd29e
--- /dev/null
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats don't change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="echo 'hello world' | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
index 9c614c76bb7cbb9b75978698f750e6b58a13405e..7c3435c8ce031c744a74fc62308d3a7633c12eb2 100644 (file)
@@ -128,6 +128,31 @@ nfs_setup_fake_threads()
        esac
 }
 
+nfs_stats_set_changed()
+{
+       FAKE_NFS_STATS_CHANGED=" $* "
+}
+
+nfs_stats_check_changed()
+{
+       _rpc_service="$1"
+       _iteration="$2"
+
+       _t="$FAKE_NFS_STATS_CHANGED"
+       if [ -z "$_t" ]; then
+               return 1
+       fi
+       if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
+               return 0
+       fi
+       # Statistics always change on the first iteration
+       if [ "$_iteration" -eq 1 ]; then
+               return 0
+       fi
+
+       return 1
+}
+
 guess_output()
 {
        case "$1" in
@@ -304,7 +329,12 @@ $_rpc_service failed RPC check:
 rpcinfo: RPC: Program not registered
 program $_rpc_service${_ver:+ version }${_ver} is not available"
 
-               if [ $unhealthy_after -gt 0 ] &&
+               if [ "$_numfails" -eq -1 ]; then
+                       _unhealthy=false
+                       echo 0 >"$_rc_file"
+                       printf 'WARNING: statistics changed but %s\n' \
+                               "$_rpc_check_out" >>"$_out"
+               elif [ $unhealthy_after -gt 0 ] &&
                        [ "$_numfails" -ge $unhealthy_after ]; then
                        _unhealthy=true
                        echo 1 >"$_rc_file"
@@ -411,9 +441,17 @@ EOF
                fi
                if [ -n "$_rpc_service" ]; then
                        if rpcinfo -T tcp localhost "$_rpc_service" \
-                                  >/dev/null 2>&1 ; then
+                               >/dev/null 2>&1; then
                                _iterate_failcount=0
+                       elif nfs_stats_check_changed \
+                               "$_rpc_service" "$_iteration"; then
+                               _iterate_failcount=-1
                        else
+                               # -1 above is a special case of 0:
+                               # hack, unhack ;-)
+                               if [ $_iterate_failcount -eq -1 ]; then
+                                       _iterate_failcount=0
+                               fi
                                _iterate_failcount=$((_iterate_failcount + 1))
                        fi
                        rpc_set_service_failure_response \