ctdb-scripts: Make initial statistics output empty

author Martin Schwenke <mschwenke@ddn.com>

Sat, 29 Jun 2024 09:24:25 +0000 (19:24 +1000)

committer Martin Schwenke <martins@samba.org>

Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
author Martin Schwenke <mschwenke@ddn.com>
Sat, 29 Jun 2024 09:24:25 +0000 (19:24 +1000)
committer Martin Schwenke <martins@samba.org>
Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script

index bdea3d63fa1f0ceabc5b3988796bf069fe409ade..61e25a5ea0e1cc586bbb07375f7ede73de2f4b0c 100755 (executable)
--- a/ctdb/config/events/legacy/60.nfs.script
+++ b/ctdb/config/events/legacy/60.nfs.script
@@ -164,20 +164,22 @@ nfs_check_service()
  
                         if [ -f "$_curr" ]; then
                                 mv -f "$_curr" "$_prev"
+                       else
+                               # Make initial stats empty, so a
+                               # failed attempt to retrieve them on
+                               # service stall is less likely to
+                               # result in a false stats change
+                               : >"$_prev"
                         fi
-                       eval "$service_stats_cmd" >"$_curr" 2>&1
+                       eval "$service_stats_cmd" >"$_curr"
  
                         # Only consider statistics on timeout.  This
                         # is done below by checking if this string is
                         # contained in $_err.
                         _t="rpcinfo: RPC: Timed out"
-
                         if ! $_ok &&
                                 [ "${_err#*"${_t}"}" != "$_err" ] &&
                                 ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
-                               # Stats always implicitly change on
-                               # the first monitor event, since
-                               # previous stats don't exists...
                                 echo "WARNING: statistics changed but ${_err}"
                                 _ok=true
                         fi
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh

index b51e20d0740202f668992ea5071a465346ebfa62..7afb906049d53b14e4d0a8e30ee19531edf92eb2 100755 (executable)
--- a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
  service_stats_cmd="date --rfc-3339=ns | grep ."
  EOF
  
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "nfs" "status"
-
  nfs_iterate_test 10 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh

index 6865c9eca9800946febcf2adeefa59d7feba6fea..a2025e9368034f33a9666742c42df31cb3817551 100755 (executable)
--- a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
  service_stats_cmd="echo 'hello world' | grep ."
  EOF
  
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "status"
-
  nfs_iterate_test 10 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh

index 9d96bf1891aafaf9977ccef2f6c21ad25548790d..0bac550c52fdbfd1bdb7f07008c5c9d4ddedf81c 100755 (executable)
--- a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
  service_stats_cmd="date --rfc-3339=ns | grep ."
  EOF
  
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "nfs" "status"
-
  nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh

index 76013f10eae62a1b12449adf02cecfd5989d20f1..ef94d7fba9b32f3462f056ad30d01867ce185f8b 100755 (executable)
--- a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh
@@ -18,7 +18,4 @@ service_debug_cmd="program_stack_traces nfsd 5"
  service_stats_cmd="echo 'hello world' | grep ."
  EOF
  
-# Test flag to indicate that stats are expected to change
-nfs_stats_set_changed "status"
-
  nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh

new file mode 100755 (executable)

index 0000000..c291f69
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service timeout, silent stats error, 10 iterations"
+
+# It would be nice to have a non-silent stats error... but that's a
+# bit hard for the current test code to handle.  :-(
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+service_stats_cmd="false"
+EOF
+
+nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh

index 24f309704243cbe0717477eae2974ac672ad3ebb..dd7035b73030ccd0394f8faff3e21e7a5b322ee1 100644 (file)
--- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
+++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
@@ -136,29 +136,27 @@ nfs_setup_fake_threads()
         esac
  }
  
-nfs_stats_set_changed()
-{
-       FAKE_NFS_STATS_CHANGED=" $* "
-}
-
  nfs_stats_check_changed()
  {
         _rpc_service="$1"
-       _iteration="$2"
+       _cmd="$2"
  
-       _t="$FAKE_NFS_STATS_CHANGED"
-       if [ -z "$_t" ]; then
+       if [ -z "$_cmd" ]; then
+               # No stats command, statistics don't change...
                 return 1
         fi
-       if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
-               return 0
-       fi
-       # Statistics always change on the first iteration
-       if [ "$_iteration" -eq 1 ]; then
-               return 0
+
+       _curr="${CTDB_TEST_TMP_DIR}/${_rpc_service}.stats"
+       _prev="${_curr}.prev"
+
+       : >"$_prev"
+       if [ -e "$_curr" ]; then
+               mv "$_curr" "$_prev"
         fi
  
-       return 1
+       eval "$_cmd" >"$_curr"
+
+       ! diff "$_prev" "$_curr" >/dev/null
  }
  
  rpcinfo_timed_out()
@@ -344,6 +342,7 @@ rpc_set_service_failure_response()
                 # Unused, but for completeness, possible future use
                 service_check_cmd=""
                 service_debug_cmd=""
+               service_stats_cmd=""
  
                 # Don't bother syntax checking, eventscript does that...
                 . "$_file"
@@ -360,6 +359,17 @@ rpc_set_service_failure_response()
                         esac
                 fi
  
+               # It doesn't matter here if the statistics have
+               # changed.  However, this generates the current
+               # statistics, which needs to happen, regardless of
+               # service health, so they can be compared when they
+               # matter...
+               _stats_changed=false
+               if nfs_stats_check_changed \
+                          "$_rpc_service" "$service_stats_cmd"; then
+                       _stats_changed=true
+               fi
+
                 _why=""
                 _ri_out=$(rpcinfo -T tcp localhost "$_rpc_service" 2>&1)
                 # Check exit code separately for readability
@@ -370,9 +380,7 @@ rpc_set_service_failure_response()
                 elif rpcinfo_timed_out "$_ri_out"; then
                         _why="Timed out"
  
-                       if nfs_stats_check_changed \
-                               "$_rpc_service" "$_iteration"; then
-
+                       if $_stats_changed; then
                                 rpc_failure \
                                         "WARNING: statistics changed but" \
                                         "$_rpc_service" \
author	Martin Schwenke <mschwenke@ddn.com>
	Sat, 29 Jun 2024 09:24:25 +0000 (19:24 +1000)
committer	Martin Schwenke <martins@samba.org>
	Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
ctdb/config/events/legacy/60.nfs.script		patch \| blob \| blame \| history
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh		patch \| blob \| blame \| history
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh		patch \| blob \| blame \| history
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh		patch \| blob \| blame \| history
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh		patch \| blob \| blame \| history
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.119.sh	[new file with mode: 0755]	patch \| blob
ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh		patch \| blob \| blame \| history