From 578dfa576517b10d979c9aef539ac910b2f95381 Mon Sep 17 00:00:00 2001
From: Martin Schwenke
Date: Sat, 29 Jun 2024 12:25:59 +1000
Subject: [PATCH] ctdb-scripts: Avoid flapping NFS services at startup

If an NFS service check is set to, say, unhealthy_after=2 then it will
always switch from the (default startup) unhealthy state to healthy,
even if there is a fatal problem.  If all services/scripts appear OK
then the node will become healthy.  When the counter hits the limit it
will return to unhealthy.  This is misleading.

Instead, never use the counter at startup, until the service becomes
healthy.  This stops services flapping unhealthy-healthy-unhealthy.

A side-effect is that a service that starts in a broken state will
never be restarted to try to fix the problem.  This makes sense.  The
counting and restarting really exist to deal with problems that might
occur under load.  The first monitor events occur before public IPs
are hosted, so there can be no load.  If a service doesn't start
reliably the first time then the admin probably wants to know about it.

nfs_iterate_test() is updated to run an initial monitor event to mark
the services as healthy.  This initialises the counter so it can be
used for the important part of the test.  Passing the -i option avoids
running the extra monitor event, so the first iteration will be the
initial monitor event.
Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/config/events/legacy/60.nfs.script | 6 ++ ctdb/config/functions | 5 ++ .../UNIT/eventscripts/60.nfs.monitor.171.sh | 9 +++ .../UNIT/eventscripts/60.nfs.monitor.172.sh | 9 +++ .../tests/UNIT/eventscripts/scripts/60.nfs.sh | 71 +++++++++++++++++-- 5 files changed, 93 insertions(+), 7 deletions(-) create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script index 61e25a5ea0e..bc5be241f67 100755 --- a/ctdb/config/events/legacy/60.nfs.script +++ b/ctdb/config/events/legacy/60.nfs.script @@ -193,6 +193,12 @@ nfs_check_service() exit 0 fi + # Don't count immediately after startup + if ! ctdb_counter_exists "$_progname"; then + echo "ERROR: $_err" + exit 1 + fi + ctdb_counter_incr "$_progname" _failcount=$(ctdb_counter_get "$_progname") diff --git a/ctdb/config/functions b/ctdb/config/functions index ef79dbf2162..f8f539ad53f 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -866,6 +866,11 @@ ctdb_counter_get() # shellcheck disable=SC2086 echo $_val } +ctdb_counter_exists() +{ + _ctdb_counter_common "$1" + [ -e "$_counter_file" ] +} # # Fail counter/threshold combination to control warnings and node unhealthy diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh new file mode 100755 index 00000000000..71d0f18afb0 --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +. 
"${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "nfs down, 1 iteration, not previously healthy" + +setup + +nfs_iterate_test -i 1 "nfs" diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh new file mode 100755 index 00000000000..81851a265a5 --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "nfs down, 10 iterations, not previously healthy" + +setup + +nfs_iterate_test -i 10 "nfs" diff --git a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh index dd7035b7303..fa7797c6945 100644 --- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh +++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh @@ -22,6 +22,8 @@ EOF export RPCNFSDCOUNT + TEST_RPC_ALL_SERVICES="portmapper nfs mountd rquotad nlockmgr status" + if [ "$1" != "down" ]; then debug <"$_failcount_file" + _rpc_set_was_healthy "$_rpc_service" exit # from subshell elif rpcinfo_timed_out "$_ri_out"; then _why="Timed out" @@ -390,6 +423,10 @@ rpc_set_service_failure_response() echo 0 >"$_failcount_file" exit # from subshell fi + elif ! _rpc_check_was_healthy "$_rpc_service"; then + echo 1 >"$_rc_file" + rpc_failure "ERROR:" "$_rpc_service" "$_ver" >"$_out" + exit # from subshell fi _numfails=$((_numfails + 1)) @@ -407,6 +444,7 @@ rpc_set_service_failure_response() >"$_out" else _unhealthy=false + _rpc_set_was_healthy "$_rpc_service" fi if [ $restart_every -gt 0 ] && @@ -480,6 +518,12 @@ program_stack_traces() # nfs_iterate_test() { + _initial_monitor_event=false + if [ "$1" = "-i" ]; then + shift + _initial_monitor_event=true + fi + _repeats="$1" _rpc_service="$2" _up_iteration="${3:--1}" @@ -490,10 +534,6 @@ nfs_iterate_test() fi if [ -n "$_rpc_service" ]; then - debug <