]> git.ipfire.org Git - thirdparty/samba.git/commitdiff
ctdb-scripts: Avoid flapping NFS services at startup
authorMartin Schwenke <mschwenke@ddn.com>
Sat, 29 Jun 2024 02:25:59 +0000 (12:25 +1000)
committerMartin Schwenke <martins@samba.org>
Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
If an NFS service check is set to, say, unhealthy_after=2 then it will
always switch from the (default startup) unhealthy state to healthy,
even if there is a fatal problem.  If all services/scripts appear OK
then the node will become healthy.  When the counter hits the limit it
will return to unhealthy.  This is misleading.

Instead, never use the counter at startup, until the service becomes
healthy.  This stops services flapping unhealthy-healthy-unhealthy.

A side-effect is that a service that starts in a broken state will
never be restarted to try to fix the problem.  This makes sense.  The
counting and restarting really exist to deal with problems that might
occur under load.  The first monitor events occur before public IPs
are hosted, so there can be no load.  If a service doesn't start
reliably the first time then the admin probably wants to know about
it.

nfs_iterate_test() is updated to run an initial monitor event to mark
the services as healthy.  This initialises the counter so it can be
used for the important part of the test.  Passing the -i option avoids
running the extra monitor event, so the first iteration will be the
initial monitor event.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/events/legacy/60.nfs.script
ctdb/config/functions
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh

index 61e25a5ea0e1cc586bbb07375f7ede73de2f4b0c..bc5be241f67c933380ef061a866d982488f19965 100755 (executable)
@@ -193,6 +193,12 @@ nfs_check_service()
                        exit 0
                fi
 
+               # Don't count immediately after startup
+               if ! ctdb_counter_exists "$_progname"; then
+                       echo "ERROR: $_err"
+                       exit 1
+               fi
+
                ctdb_counter_incr "$_progname"
                _failcount=$(ctdb_counter_get "$_progname")
 
index ef79dbf2162d896f8b14cc43ea182a7d98228e37..f8f539ad53f76473bfe7d120038741521016b568 100755 (executable)
@@ -866,6 +866,11 @@ ctdb_counter_get()
        # shellcheck disable=SC2086
        echo $_val
 }
+ctdb_counter_exists()
+{
+       _ctdb_counter_common "$1"
+       [ -e "$_counter_file" ]
+}
 
 #
 # Fail counter/threshold combination to control warnings and node unhealthy
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.171.sh
new file mode 100755 (executable)
index 0000000..71d0f18
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 1 iteration, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 1 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.172.sh
new file mode 100755 (executable)
index 0000000..81851a2
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 10 iterations, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 10 "nfs"
index dd7035b73030ccd0394f8faff3e21e7a5b322ee1..fa7797c694533b438c4486eb5b050f6fa20ebbac 100644 (file)
@@ -22,6 +22,8 @@ EOF
 
        export RPCNFSDCOUNT
 
+       TEST_RPC_ALL_SERVICES="portmapper nfs mountd rquotad nlockmgr status"
+
        if [ "$1" != "down" ]; then
                debug <<EOF
 Setting up NFS environment: all RPC services up, NFS managed by CTDB
@@ -40,9 +42,9 @@ EOF
                        ;;
                esac
 
-               _rpc_services_up \
-                       "portmapper" "nfs" "mountd" "rquotad" \
-                       "nlockmgr" "status"
+               # Intentional word splitting
+               # shellcheck disable=SC2086
+               _rpc_services_up $TEST_RPC_ALL_SERVICES
 
                nfs_setup_fake_threads "nfsd"
                nfs_setup_fake_threads "rpc.foobar" # Set the variable to empty
@@ -283,6 +285,35 @@ program ${_rpc_service}${_ver:+ version }${_ver} is not available
 EOF
 }
 
+_rpc_was_healthy_common()
+{
+       _rpc_service="$1"
+
+       _f="rpc.${_rpc_service}.was_healthy"
+       _rpc_was_healthy_file="${CTDB_TEST_TMP_DIR}/${_f}"
+}
+
+_rpc_set_was_healthy()
+{
+       if [ $# -eq 0 ]; then
+               # Intentional word splitting
+               # shellcheck disable=SC2086
+               set -- $TEST_RPC_ALL_SERVICES
+       fi
+
+       for _rpc_service; do
+               _rpc_was_healthy_common "$_rpc_service"
+               touch "$_rpc_was_healthy_file"
+       done
+}
+
+_rpc_check_was_healthy()
+{
+       _rpc_was_healthy_common "$1"
+
+       [ -e "$_rpc_was_healthy_file" ]
+}
+
 # Set the required result for a particular RPC program having failed
 # for a certain number of iterations.  This is probably still a work
 # in progress.  Note that we could hook aggressively
@@ -299,6 +330,7 @@ rpc_set_service_failure_response()
        ok_null
 
        if [ -z "$_rpc_service" ]; then
+               _rpc_set_was_healthy
                return
        fi
 
@@ -376,6 +408,7 @@ rpc_set_service_failure_response()
                # shellcheck disable=SC2181
                if [ $? -eq 0 ]; then
                        echo 0 >"$_failcount_file"
+                       _rpc_set_was_healthy "$_rpc_service"
                        exit # from subshell
                elif rpcinfo_timed_out "$_ri_out"; then
                        _why="Timed out"
@@ -390,6 +423,10 @@ rpc_set_service_failure_response()
                                echo 0 >"$_failcount_file"
                                exit # from subshell
                        fi
+               elif ! _rpc_check_was_healthy "$_rpc_service"; then
+                       echo 1 >"$_rc_file"
+                       rpc_failure "ERROR:" "$_rpc_service" "$_ver" >"$_out"
+                       exit # from subshell
                fi
 
                _numfails=$((_numfails + 1))
@@ -407,6 +444,7 @@ rpc_set_service_failure_response()
                                >"$_out"
                else
                        _unhealthy=false
+                       _rpc_set_was_healthy "$_rpc_service"
                fi
 
                if [ $restart_every -gt 0 ] &&
@@ -480,6 +518,12 @@ program_stack_traces()
 #
 nfs_iterate_test()
 {
+       _initial_monitor_event=false
+       if [ "$1" = "-i" ]; then
+               shift
+               _initial_monitor_event=true
+       fi
+
        _repeats="$1"
        _rpc_service="$2"
        _up_iteration="${3:--1}"
@@ -490,10 +534,6 @@ nfs_iterate_test()
        fi
 
        if [ -n "$_rpc_service" ]; then
-               debug <<EOF
---------------------------------------------------
-EOF
-
                _action="${_rpc_service#*:}"
                if [ "$_action" != "$_rpc_service" ]; then
                        _rpc_service="${_rpc_service%:*}"
@@ -501,6 +541,23 @@ EOF
                        _action=""
                fi
 
+               if ! $_initial_monitor_event; then
+                       cat <<EOF
+--------------------------------------------------
+Running initial monitor event
+
+EOF
+                       # Remember a successful test result...
+                       rpc_set_service_failure_response "$_rpc_service"
+                       # ... and a successful monitor result
+                       simple_test
+               fi
+
+
+               cat <<EOF
+--------------------------------------------------
+EOF
+
                if [ -n "$_action" ]; then
                        case "$_action" in
                        TIMEOUT)