# traces of threads that have not exited, since
# they may be stuck doing I/O;
# no default, see also function program_stack_traces()
+# * service_stats_cmd - command to retrieve statistics for given service;
+# if this is set and RPC checks fail (or
+# $service_check_cmd fails), then statistics are
+# compared (using cmp) to see if the service is
+# making progress or is truly hung;
+# no default, failed service does not double-check
+# failure using statistics
#
# Quoting in values is not preserved
#
service_start_cmd=""
service_check_cmd=""
service_debug_cmd=""
+ service_stats_cmd=""
# Eval line-by-line. Expands variable references in values.
# Also allows variable name checking, which seems useful.
family=* | version=* | \
unhealthy_after=* | restart_every=* | \
service_stop_cmd=* | service_start_cmd=* | \
- service_check_cmd=* | service_debug_cmd=*)
+ service_check_cmd=* | service_debug_cmd=* | \
+ service_stats_cmd=*)
eval "$_line"
;;
fi
fi
+ if [ -n "$service_stats_cmd" ]; then
+ # If configured, always update stats,
+ # regardless of RPC status...
+
+ # shellcheck disable=SC2154
+ # script_state_dir set by ctdb_setup_state_dir
+ _curr="${script_state_dir}/stats_${_progname}.out"
+ _prev="${_curr}.prev"
+
+ if [ -f "$_curr" ]; then
+ mv -f "$_curr" "$_prev"
+ fi
+ eval "$service_stats_cmd" >"$_curr" 2>&1
+
+ if ! $_ok &&
+ ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
+ # Stats always implicitly change on
+ # the first monitor event, since
+ # previous stats don't exist...
+ echo "WARNING: statistics changed but ${_err}"
+ _ok=true
+ fi
+ fi
+
if $_ok; then
if [ $unhealthy_after -ne 1 ] ||
[ $restart_every -ne 0 ]; then
--- /dev/null
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="date --rfc-3339=ns | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "nfs" "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
--- /dev/null
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats don't change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="echo 'hello world' | grep ."
+EOF
+
+# Test flag to indicate that only "status" stats are expected to
+# change, so "nfs" stats stay unchanged after the first iteration
+nfs_stats_set_changed "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
esac
}
+nfs_stats_set_changed()
+{
+ # Remember the RPC services given as arguments as those whose
+ # fake statistics should be reported as changed. The list is
+ # padded with a space at each end; nfs_stats_check_changed()
+ # returns "unchanged" for every service while it is empty.
+ FAKE_NFS_STATS_CHANGED=" $* "
+}
+
+nfs_stats_check_changed()
+{
+ _rpc_service="$1"
+ _iteration="$2"
+
+ _t="$FAKE_NFS_STATS_CHANGED"
+ if [ -z "$_t" ]; then
+ return 1
+ fi
+ if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
+ return 0
+ fi
+ # Statistics always change on the first iteration
+ if [ "$_iteration" -eq 1 ]; then
+ return 0
+ fi
+
+ return 1
+}
+
guess_output()
{
case "$1" in
rpcinfo: RPC: Program not registered
program $_rpc_service${_ver:+ version }${_ver} is not available"
- if [ $unhealthy_after -gt 0 ] &&
+ if [ "$_numfails" -eq -1 ]; then
+ _unhealthy=false
+ echo 0 >"$_rc_file"
+ printf 'WARNING: statistics changed but %s\n' \
+ "$_rpc_check_out" >>"$_out"
+ elif [ $unhealthy_after -gt 0 ] &&
[ "$_numfails" -ge $unhealthy_after ]; then
_unhealthy=true
echo 1 >"$_rc_file"
fi
if [ -n "$_rpc_service" ]; then
if rpcinfo -T tcp localhost "$_rpc_service" \
- >/dev/null 2>&1 ; then
+ >/dev/null 2>&1; then
_iterate_failcount=0
+ elif nfs_stats_check_changed \
+ "$_rpc_service" "$_iteration"; then
+ _iterate_failcount=-1
else
+ # -1 marks "RPC check failed but statistics changed" and
+ # counts as 0 failures, so reset it before incrementing
+ if [ $_iterate_failcount -eq -1 ]; then
+ _iterate_failcount=0
+ fi
_iterate_failcount=$((_iterate_failcount + 1))
fi
rpc_set_service_failure_response \