]> git.ipfire.org Git - thirdparty/samba.git/commitdiff
ctdb-scripts: Make initial statistics output empty
authorMartin Schwenke <mschwenke@ddn.com>
Sat, 29 Jun 2024 09:24:25 +0000 (19:24 +1000)
committerMartin Schwenke <martins@samba.org>
Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
This makes initial failure to retrieve statistics less likely to
result in a statistics change.  To help with this, statistics
retrieval stderr now goes to the log - only stdout goes to the file.

This means that the test code for checking statistics changes needs to
be redone to actually run the statistics command and check.  As with
rpcinfo output, this output needs to behave as deterministically in
the test code as it done in the event script.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/events/legacy/60.nfs.script
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh

index bdea3d63fa1f0ceabc5b3988796bf069fe409ade..61e25a5ea0e1cc586bbb07375f7ede73de2f4b0c 100755 (executable)
@@ -164,20 +164,22 @@ nfs_check_service()
 
                        if [ -f "$_curr" ]; then
                                mv -f "$_curr" "$_prev"
+                       else
+                               # Make initial stats empty, so a
+                               # failed attempt to retrieve them on
+                               # service stall is less likely to
+                               # result in a false stats change
+                               : >"$_prev"
                        fi
-                       eval "$service_stats_cmd" >"$_curr" 2>&1
+                       eval "$service_stats_cmd" >"$_curr"
 
                        # Only consider statistics on timeout.  This
                        # is done below by checking if this string is
                        # contained in $_err.
                        _t="rpcinfo: RPC: Timed out"
-
                        if ! $_ok &&
                                [ "${_err#*"${_t}"}" != "$_err" ] &&
                                ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
-                               # Stats always implicitly change on
-                               # the first monitor event, since
-                               # previous stats don't exists...
                                echo "WARNING: statistics changed but ${_err}"
                                _ok=true
                        fi
index b51e20d0740202f668992ea5071a465346ebfa62..7afb906049d53b14e4d0a8e30ee19531edf92eb2 100755 (executable)
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
 service_stats_cmd="date --rfc-3339=ns | grep ."
 EOF
 
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "nfs" "status"
-
 nfs_iterate_test 10 "nfs"
index 6865c9eca9800946febcf2adeefa59d7feba6fea..a2025e9368034f33a9666742c42df31cb3817551 100755 (executable)
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
 service_stats_cmd="echo 'hello world' | grep ."
 EOF
 
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "status"
-
 nfs_iterate_test 10 "nfs"
index 9d96bf1891aafaf9977ccef2f6c21ad25548790d..0bac550c52fdbfd1bdb7f07008c5c9d4ddedf81c 100755 (executable)
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
 service_stats_cmd="date --rfc-3339=ns | grep ."
 EOF
 
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "nfs" "status"
-
 nfs_iterate_test 10 "nfs:TIMEOUT"
index 76013f10eae62a1b12449adf02cecfd5989d20f1..ef94d7fba9b32f3462f056ad30d01867ce185f8b 100755 (executable)
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
 service_stats_cmd="echo 'hello world' | grep ."
 EOF
 
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "status"
-
 nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh
new file mode 100755 (executable)
index 0000000..c291f69
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service timeout, silent stats error, 10 iterations"
+
+# It would be nice to have a non-silent stats error... but that's a
+# bit hard for the current test code to handle.  :-(
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+service_stats_cmd="false"
+EOF
+
+nfs_iterate_test 10 "nfs:TIMEOUT"
index 24f309704243cbe0717477eae2974ac672ad3ebb..dd7035b73030ccd0394f8faff3e21e7a5b322ee1 100644 (file)
@@ -136,29 +136,27 @@ nfs_setup_fake_threads()
        esac
 }
 
-nfs_stats_set_changed()
-{
-       FAKE_NFS_STATS_CHANGED=" $* "
-}
-
 nfs_stats_check_changed()
 {
        _rpc_service="$1"
-       _iteration="$2"
+       _cmd="$2"
 
-       _t="$FAKE_NFS_STATS_CHANGED"
-       if [ -z "$_t" ]; then
+       if [ -z "$_cmd" ]; then
+               # No stats command, statistics don't change...
                return 1
        fi
-       if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
-               return 0
-       fi
-       # Statistics always change on the first iteration
-       if [ "$_iteration" -eq 1 ]; then
-               return 0
+
+       _curr="${CTDB_TEST_TMP_DIR}/${_rpc_service}.stats"
+       _prev="${_curr}.prev"
+
+       : >"$_prev"
+       if [ -e "$_curr" ]; then
+               mv "$_curr" "$_prev"
        fi
 
-       return 1
+       eval "$_cmd" >"$_curr"
+
+       ! diff "$_prev" "$_curr" >/dev/null
 }
 
 rpcinfo_timed_out()
@@ -344,6 +342,7 @@ rpc_set_service_failure_response()
                # Unused, but for completeness, possible future use
                service_check_cmd=""
                service_debug_cmd=""
+               service_stats_cmd=""
 
                # Don't bother syntax checking, eventscript does that...
                . "$_file"
@@ -360,6 +359,17 @@ rpc_set_service_failure_response()
                        esac
                fi
 
+               # It doesn't matter here if the statistics have
+               # changed.  However, this generates the current
+               # statistics, which needs to happen, regardless of
+               # service health, so they can be compared when they
+               # matter...
+               _stats_changed=false
+               if nfs_stats_check_changed \
+                          "$_rpc_service" "$service_stats_cmd"; then
+                       _stats_changed=true
+               fi
+
                _why=""
                _ri_out=$(rpcinfo -T tcp localhost "$_rpc_service" 2>&1)
                # Check exit code separately for readability
@@ -370,9 +380,7 @@ rpc_set_service_failure_response()
                elif rpcinfo_timed_out "$_ri_out"; then
                        _why="Timed out"
 
-                       if nfs_stats_check_changed \
-                               "$_rpc_service" "$_iteration"; then
-
+                       if $_stats_changed; then
                                rpc_failure \
                                        "WARNING: statistics changed but" \
                                        "$_rpc_service" \