From 578dfa576517b10d979c9aef539ac910b2f95381 Mon Sep 17 00:00:00 2001
From: Martin Schwenke
Date: Sat, 29 Jun 2024 12:25:59 +1000
Subject: [PATCH] ctdb-scripts: Avoid flapping NFS services at startup

If an NFS service check is set to, say, unhealthy_after=2 then it will
always switch from the (default startup) unhealthy state to healthy,
even if there is a fatal problem.  If all services/scripts appear OK
then the node will become healthy.  When the counter hits the limit it
will return to unhealthy.  This is misleading.

Instead, never use the counter at startup, until the service becomes
healthy.  This stops services flapping unhealthy-healthy-unhealthy.

A side-effect is that a service that starts in a broken state will
never be restarted to try to fix the problem.  This makes sense.  The
counting and restarting really exist to deal with problems that might
occur under load.  The first monitor events occur before public IPs
are hosted, so there can be no load.  If a service doesn't start
reliably the first time then the admin probably wants to know about it.

nfs_iterate_test() is updated to run an initial monitor event to mark
the services as healthy.  This initialises the counter so it can be
used for the important part of the test.  Passing the -i option avoids
running the extra monitor event, so the first iteration will be the
initial monitor event.
Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/config/events/legacy/60.nfs.script | 6 ++ ctdb/config/functions | 5 ++ .../UNIT/eventscripts/60.nfs.monitor.171.sh | 9 +++ .../UNIT/eventscripts/60.nfs.monitor.172.sh | 9 +++ .../tests/UNIT/eventscripts/scripts/60.nfs.sh | 71 +++++++++++++++++-- 5 files changed, 93 insertions(+), 7 deletions(-) create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script index 61e25a5ea0e..bc5be241f67 100755 --- a/ctdb/config/events/legacy/60.nfs.script +++ b/ctdb/config/events/legacy/60.nfs.script @@ -193,6 +193,12 @@ nfs_check_service() exit 0 fi + # Don't count immediately after startup + if ! ctdb_counter_exists "$_progname"; then + echo "ERROR: $_err" + exit 1 + fi + ctdb_counter_incr "$_progname" _failcount=$(ctdb_counter_get "$_progname") diff --git a/ctdb/config/functions b/ctdb/config/functions index ef79dbf2162..f8f539ad53f 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -866,6 +866,11 @@ ctdb_counter_get() # shellcheck disable=SC2086 echo $_val } +ctdb_counter_exists() +{ + _ctdb_counter_common "$1" + [ -e "$_counter_file" ] +} # # Fail counter/threshold combination to control warnings and node unhealthy diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh new file mode 100755 index 00000000000..71d0f18afb0 --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +. 
"${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "nfs down, 1 iteration, not previously healthy" + +setup + +nfs_iterate_test -i 1 "nfs" diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh new file mode 100755 index 00000000000..81851a265a5 --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "nfs down, 10 iterations, not previously healthy" + +setup + +nfs_iterate_test -i 10 "nfs" diff --git a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh index dd7035b7303..fa7797c6945 100644 --- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh +++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh @@ -22,6 +22,8 @@ EOF export RPCNFSDCOUNT + TEST_RPC_ALL_SERVICES="portmapper nfs mountd rquotad nlockmgr status" + if [ "$1" != "down" ]; then debug <"$_failcount_file" + _rpc_set_was_healthy "$_rpc_service" exit # from subshell elif rpcinfo_timed_out "$_ri_out"; then _why="Timed out" @@ -390,6 +423,10 @@ rpc_set_service_failure_response() echo 0 >"$_failcount_file" exit # from subshell fi + elif ! _rpc_check_was_healthy "$_rpc_service"; then + echo 1 >"$_rc_file" + rpc_failure "ERROR:" "$_rpc_service" "$_ver" >"$_out" + exit # from subshell fi _numfails=$((_numfails + 1)) @@ -407,6 +444,7 @@ rpc_set_service_failure_response() >"$_out" else _unhealthy=false + _rpc_set_was_healthy "$_rpc_service" fi if [ $restart_every -gt 0 ] && @@ -480,6 +518,12 @@ program_stack_traces() # nfs_iterate_test() { + _initial_monitor_event=false + if [ "$1" = "-i" ]; then + shift + _initial_monitor_event=true + fi + _repeats="$1" _rpc_service="$2" _up_iteration="${3:--1}" @@ -490,10 +534,6 @@ nfs_iterate_test() fi if [ -n "$_rpc_service" ]; then - debug <