ctdb-scripts: Avoid flapping NFS services at startup

author Martin Schwenke <mschwenke@ddn.com>

Sat, 29 Jun 2024 02:25:59 +0000 (12:25 +1000)

committer Martin Schwenke <martins@samba.org>

Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
author Martin Schwenke <mschwenke@ddn.com>
Sat, 29 Jun 2024 02:25:59 +0000 (12:25 +1000)
committer Martin Schwenke <martins@samba.org>
Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script

index 61e25a5ea0e1cc586bbb07375f7ede73de2f4b0c..bc5be241f67c933380ef061a866d982488f19965 100755 (executable)
--- a/ctdb/config/events/legacy/60.nfs.script
+++ b/ctdb/config/events/legacy/60.nfs.script
@@ -193,6 +193,12 @@ nfs_check_service()
                         exit 0
                 fi
  
+               # Don't count immediately after startup
+               if ! ctdb_counter_exists "$_progname"; then
+                       echo "ERROR: $_err"
+                       exit 1
+               fi
+
                 ctdb_counter_incr "$_progname"
                 _failcount=$(ctdb_counter_get "$_progname")
  
diff --git a/ctdb/config/functions b/ctdb/config/functions

index ef79dbf2162d896f8b14cc43ea182a7d98228e37..f8f539ad53f76473bfe7d120038741521016b568 100755 (executable)
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -866,6 +866,11 @@ ctdb_counter_get()
         # shellcheck disable=SC2086
         echo $_val
  }
+ctdb_counter_exists()
+{
+       _ctdb_counter_common "$1"
+       [ -e "$_counter_file" ]
+}
  
  #
  # Fail counter/threshold combination to control warnings and node unhealthy
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh

new file mode 100755 (executable)

index 0000000..71d0f18
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 1 iteration, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 1 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh

new file mode 100755 (executable)

index 0000000..81851a2
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 10 iterations, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 10 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh

index dd7035b73030ccd0394f8faff3e21e7a5b322ee1..fa7797c694533b438c4486eb5b050f6fa20ebbac 100644 (file)
--- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
+++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
@@ -22,6 +22,8 @@ EOF
  
         export RPCNFSDCOUNT
  
+       TEST_RPC_ALL_SERVICES="portmapper nfs mountd rquotad nlockmgr status"
+
         if [ "$1" != "down" ]; then
                 debug <<EOF
  Setting up NFS environment: all RPC services up, NFS managed by CTDB
@@ -40,9 +42,9 @@ EOF
                         ;;
                 esac
  
-               _rpc_services_up \
-                       "portmapper" "nfs" "mountd" "rquotad" \
-                       "nlockmgr" "status"
+               # Intentional word splitting
+               # shellcheck disable=SC2086
+               _rpc_services_up $TEST_RPC_ALL_SERVICES
  
                 nfs_setup_fake_threads "nfsd"
                 nfs_setup_fake_threads "rpc.foobar" # Set the variable to empty
@@ -283,6 +285,35 @@ program ${_rpc_service}${_ver:+ version }${_ver} is not available
  EOF
  }
  
+_rpc_was_healthy_common()
+{
+       _rpc_service="$1"
+
+       _f="rpc.${_rpc_service}.was_healthy"
+       _rpc_was_healthy_file="${CTDB_TEST_TMP_DIR}/${_f}"
+}
+
+_rpc_set_was_healthy()
+{
+       if [ $# -eq 0 ]; then
+               # Intentional word splitting
+               # shellcheck disable=SC2086
+               set -- $TEST_RPC_ALL_SERVICES
+       fi
+
+       for _rpc_service; do
+               _rpc_was_healthy_common "$_rpc_service"
+               touch "$_rpc_was_healthy_file"
+       done
+}
+
+_rpc_check_was_healthy()
+{
+       _rpc_was_healthy_common "$1"
+
+       [ -e "$_rpc_was_healthy_file" ]
+}
+
  # Set the required result for a particular RPC program having failed
  # for a certain number of iterations.  This is probably still a work
  # in progress.  Note that we could hook aggressively
@@ -299,6 +330,7 @@ rpc_set_service_failure_response()
         ok_null
  
         if [ -z "$_rpc_service" ]; then
+               _rpc_set_was_healthy
                 return
         fi
  
@@ -376,6 +408,7 @@ rpc_set_service_failure_response()
                 # shellcheck disable=SC2181
                 if [ $? -eq 0 ]; then
                         echo 0 >"$_failcount_file"
+                       _rpc_set_was_healthy "$_rpc_service"
                         exit # from subshell
                 elif rpcinfo_timed_out "$_ri_out"; then
                         _why="Timed out"
@@ -390,6 +423,10 @@ rpc_set_service_failure_response()
                                 echo 0 >"$_failcount_file"
                                 exit # from subshell
                         fi
+               elif ! _rpc_check_was_healthy "$_rpc_service"; then
+                       echo 1 >"$_rc_file"
+                       rpc_failure "ERROR:" "$_rpc_service" "$_ver" >"$_out"
+                       exit # from subshell
                 fi
  
                 _numfails=$((_numfails + 1))
@@ -407,6 +444,7 @@ rpc_set_service_failure_response()
                                 >"$_out"
                 else
                         _unhealthy=false
+                       _rpc_set_was_healthy "$_rpc_service"
                 fi
  
                 if [ $restart_every -gt 0 ] &&
@@ -480,6 +518,12 @@ program_stack_traces()
  #
  nfs_iterate_test()
  {
+       _initial_monitor_event=false
+       if [ "$1" = "-i" ]; then
+               shift
+               _initial_monitor_event=true
+       fi
+
         _repeats="$1"
         _rpc_service="$2"
         _up_iteration="${3:--1}"
@@ -490,10 +534,6 @@ nfs_iterate_test()
         fi
  
         if [ -n "$_rpc_service" ]; then
-               debug <<EOF
---------------------------------------------------
-EOF
-
                 _action="${_rpc_service#*:}"
                 if [ "$_action" != "$_rpc_service" ]; then
                         _rpc_service="${_rpc_service%:*}"
@@ -501,6 +541,23 @@ EOF
                         _action=""
                 fi
  
+               if ! $_initial_monitor_event; then
+                       cat <<EOF
+--------------------------------------------------
+Running initial monitor event
+
+EOF
+                       # Remember a successful test result...
+                       rpc_set_service_failure_response "$_rpc_service"
+                       # ... and a successful monitor result
+                       simple_test
+               fi
+
+
+               cat <<EOF
+--------------------------------------------------
+EOF
+
                 if [ -n "$_action" ]; then
                         case "$_action" in
                         TIMEOUT)
author	Martin Schwenke <mschwenke@ddn.com>
	Sat, 29 Jun 2024 02:25:59 +0000 (12:25 +1000)
committer	Martin Schwenke <martins@samba.org>
	Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
ctdb/config/events/legacy/60.nfs.script		patch \| blob \| blame \| history
ctdb/config/functions		patch \| blob \| blame \| history
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh	[new file with mode: 0755]	patch \| blob
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh	[new file with mode: 0755]	patch \| blob
ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh		patch \| blob \| blame \| history