]> git.ipfire.org Git - thirdparty/samba.git/commitdiff
ctdb-scripts: Only consider statistics on timeout
authorMartin Schwenke <mschwenke@ddn.com>
Sun, 30 Jun 2024 00:35:09 +0000 (10:35 +1000)
committerMartin Schwenke <martins@samba.org>
Tue, 20 Aug 2024 22:50:34 +0000 (22:50 +0000)
Checking statistics is only really relevant to timeouts.  That is, if
an rpcinfo times out it is worth checking if the service making
progress.  If the RPC service is not registered then the statistics
don't need to be checked because they shouldn't be changing.

The 2 previously added tests added to check statistics progress now
behave identically and fail on all iterations.  To support testing
with "timeouts", an optional TIMEOUT flag can now be added to the RPC
service passed to nfs_iterate_test().  2 new tests are added to
exercise the new behaviour.

The 2 new "if" statements in nfs_iterate_test() could be combined.
However, a subsequent commit would split them and would be more
difficult to read.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/events/legacy/60.nfs.script
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh [new file with mode: 0755]
ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
ctdb/tests/UNIT/eventscripts/stubs/rpcinfo

index 246a856bca8957b758c547319fe3d40131ece7af..bdea3d63fa1f0ceabc5b3988796bf069fe409ade 100755 (executable)
@@ -167,7 +167,13 @@ nfs_check_service()
                        fi
                        eval "$service_stats_cmd" >"$_curr" 2>&1
 
+                       # Only consider statistics on timeout.  This
+                       # is done below by checking if this string is
+                       # contained in $_err.
+                       _t="rpcinfo: RPC: Timed out"
+
                        if ! $_ok &&
+                               [ "${_err#*"${_t}"}" != "$_err" ] &&
                                ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
                                # Stats always implicitly change on
                                # the first monitor event, since
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
new file mode 100755 (executable)
index 0000000..9d96bf1
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service timeout, stats change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="date --rfc-3339=ns | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "nfs" "status"
+
+nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh
new file mode 100755 (executable)
index 0000000..76013f1
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service timeout, stats don't change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="echo 'hello world' | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "status"
+
+nfs_iterate_test 10 "nfs:TIMEOUT"
index cabcc17a6cc526440e3a1f2224e75afc4cfc3556..24f309704243cbe0717477eae2974ac672ad3ebb 100644 (file)
@@ -82,6 +82,21 @@ _rpc_services_down()
        FAKE_RPCINFO_SERVICES="$_out"
 }
 
+_rpc_services_timeout()
+{
+       _out=""
+       for _s in $FAKE_RPCINFO_SERVICES; do
+               for _i; do
+                       if [ "$_i" = "${_s%%:*}" ]; then
+                               debug "Marking RPC service \"${_i}\" as TIMEOUT"
+                               _s="${_s}:TIMEOUT"
+                       fi
+               done
+               _out="${_out}${_out:+ }${_s}"
+       done
+       FAKE_RPCINFO_SERVICES="$_out"
+}
+
 _rpc_services_up()
 {
        _out="$FAKE_RPCINFO_SERVICES"
@@ -146,6 +161,11 @@ nfs_stats_check_changed()
        return 1
 }
 
+rpcinfo_timed_out()
+{
+       echo "$1" | grep -q "Timed out"
+}
+
 guess_output()
 {
        case "$1" in
@@ -256,10 +276,11 @@ rpc_failure()
        _err_or_warn="$1"
        _rpc_service="$2"
        _ver="$3"
+       _why="${4:-Program not registered}"
 
        cat <<EOF
 ${_err_or_warn} ${_rpc_service} failed RPC check:
-rpcinfo: RPC: Program not registered
+rpcinfo: RPC: ${_why}
 program ${_rpc_service}${_ver:+ version }${_ver} is not available
 EOF
 }
@@ -339,21 +360,28 @@ rpc_set_service_failure_response()
                        esac
                fi
 
-               if rpcinfo -T tcp localhost "$_rpc_service" \
-                       >/dev/null 2>&1; then
-
+               _why=""
+               _ri_out=$(rpcinfo -T tcp localhost "$_rpc_service" 2>&1)
+               # Check exit code separately for readability
+               # shellcheck disable=SC2181
+               if [ $? -eq 0 ]; then
                        echo 0 >"$_failcount_file"
                        exit # from subshell
-               elif nfs_stats_check_changed \
-                       "$_rpc_service" "$_iteration"; then
+               elif rpcinfo_timed_out "$_ri_out"; then
+                       _why="Timed out"
 
-                       rpc_failure \
-                               "WARNING: statistics changed but" \
-                               "$_rpc_service" \
-                               "$_ver" \
-                               >"$_out"
-                       echo 0 >"$_failcount_file"
-                       exit # from subshell
+                       if nfs_stats_check_changed \
+                               "$_rpc_service" "$_iteration"; then
+
+                               rpc_failure \
+                                       "WARNING: statistics changed but" \
+                                       "$_rpc_service" \
+                                       "$_ver" \
+                                       "$_why" \
+                                       >"$_out"
+                               echo 0 >"$_failcount_file"
+                               exit # from subshell
+                       fi
                fi
 
                _numfails=$((_numfails + 1))
@@ -367,6 +395,7 @@ rpc_set_service_failure_response()
                                "ERROR:" \
                                "$_rpc_service" \
                                "$_ver" \
+                               "$_why" \
                                >"$_out"
                else
                        _unhealthy=false
@@ -379,6 +408,7 @@ rpc_set_service_failure_response()
                                        "WARNING:" \
                                        "$_rpc_service" \
                                        "$_ver" \
+                                       "$_why" \
                                        >"$_out"
                        fi
 
@@ -424,7 +454,8 @@ program_stack_traces()
 #
 # - 1st argument is the number of iterations.
 #
-# - 2nd argument is the NFS/RPC service being tested
+# - 2nd argument is the NFS/RPC service being tested, with optional
+#   TIMEOUT flag
 #
 #   This service is marked down before the 1st iteration.
 #
@@ -454,7 +485,23 @@ nfs_iterate_test()
                debug <<EOF
 --------------------------------------------------
 EOF
-               _rpc_services_down "$_rpc_service"
+
+               _action="${_rpc_service#*:}"
+               if [ "$_action" != "$_rpc_service" ]; then
+                       _rpc_service="${_rpc_service%:*}"
+               else
+                       _action=""
+               fi
+
+               if [ -n "$_action" ]; then
+                       case "$_action" in
+                       TIMEOUT)
+                               _rpc_services_timeout "$_rpc_service"
+                               ;;
+                       esac
+               else
+                       _rpc_services_down "$_rpc_service"
+               fi
        fi
 
        debug <<EOF
index 8732751396e0c34fbb50fd5a203c87befeb99127..86b809411c435a363ba22fcd202340da061487bf 100755 (executable)
@@ -40,6 +40,8 @@ parse_options()
 
 parse_options "$@"
 
+_fail_msg="rpcinfo: RPC: Program not registered"
+
 for i in ${FAKE_RPCINFO_SERVICES}; do
        # This is stupidly cumulative, but needs to happen after the
        # initial split of the list above.
@@ -47,9 +49,16 @@ for i in ${FAKE_RPCINFO_SERVICES}; do
        # Want glob expansion
        # shellcheck disable=SC2086
        set -- $i
-       # $1 = program, $2 = low version, $3 = high version
+       # $1 = program, $2 = low version, $3 = high version, $4 = flag
 
        if [ "$1" = "$p" ]; then
+               case "$4" in
+               TIMEOUT)
+                       _fail_msg="rpcinfo: RPC: Timed out"
+                       break
+                       ;;
+               esac
+
                if [ -n "$v" ]; then
                        if [ "$2" -le "$v" ] && [ "$v" -le "$3" ]; then
                                echo "program ${p} version ${v} ready and waiting"
@@ -68,7 +77,7 @@ for i in ${FAKE_RPCINFO_SERVICES}; do
        fi
 done
 
-echo "rpcinfo: RPC: Program not registered" >&2
+echo "$_fail_msg" >&2
 if [ -n "$v" ]; then
        echo "program ${p} version ${v} is not available"
 else