From: Martin Schwenke Date: Fri, 3 Mar 2023 06:49:05 +0000 (+1100) Subject: ctdb-scripts: Implement failcount handling with thresholds X-Git-Tag: tevent-0.16.0~251 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8303c3a534fe2d31687d9d6386ba9c8a341c7a06;p=thirdparty%2Fsamba.git ctdb-scripts: Implement failcount handling with thresholds This can be used for simple failure counting, without restarts, as used in the 40.vsftpd event script. That case will subsequently be converted and this functionality can also be used elsewhere. Add documentation to ctdb-script.options(5) to allow parameters that use this to be more easily described. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- diff --git a/ctdb/config/functions b/ctdb/config/functions index 5a7737ec34a..d8f7f57b84c 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -760,6 +760,114 @@ ctdb_counter_get() echo $_val } +# +# Fail counter/threshold combination to control warnings and node unhealthy +# + +_failcount_validate_threshold() +{ + case "$1" in + "") return 1 ;; # A failure that doesn't need a warning + *) + if echo "$1" | grep -qx '[0-9]*'; then + return 0 + fi + + echo "WARNING: ${1} is an invalid threshold in \"${2}\" check" + return 1 + ;; + esac +} + +_failcount_common() +{ + _thing="$1" + + _counter=$(echo "$_thing" | sed -e 's@/@_SLASH_@g' -e 's@ @_@g') +} + +failcount_init() +{ + _thing="$1" + + _failcount_common "$_thing" + + ctdb_counter_init "$_counter" +} + +failcount_reset() +{ + _thing="$1" + + _failcount_common "$_thing" + + _failcount=$(ctdb_counter_get "$_counter") + if [ "$_failcount" -eq 0 ]; then + return + fi + + printf 'NOTICE: %s: no longer failing\n' "$_thing" + ctdb_counter_init "$_counter" +} + +failcount_incr() +{ + _thing="$1" + _thresholds="$2" + _output="$3" + + _failcount_common "$_thing" + + ctdb_counter_incr "$_counter" + _failcount=$(ctdb_counter_get "$_counter") + + case "$_thresholds" in + *:*) + 
_warn_threshold="${_thresholds%:*}" + _unhealthy_threshold="${_thresholds#*:}" + ;; + "") + _warn_threshold=1 + _unhealthy_threshold="" + ;; + *) + _warn_threshold="$_thresholds" + _unhealthy_threshold="" + ;; + esac + + if _failcount_validate_threshold "$_unhealthy_threshold" "$_thing"; then + if [ "$_failcount" -ge "$_unhealthy_threshold" ]; then + printf 'ERROR: %s: fail count %d >= threshold %d\n' \ + "$_thing" \ + "$_failcount" \ + "$_unhealthy_threshold" + # Only print output when exceeding the + # unhealthy threshold + if [ "$_failcount" -eq "$_unhealthy_threshold" ] && \ + [ -n "$_output" ]; then + echo "$_output" + fi + exit 1 + fi + fi + + if _failcount_validate_threshold "$_warn_threshold" "$_thing"; then + if [ "$_failcount" -lt "$_warn_threshold" ]; then + return 0 + fi + fi + + printf 'WARNING: %s: fail count %d >= threshold %d\n' \ + "$_thing" \ + "$_failcount" \ + "$_warn_threshold" + if [ "$_failcount" -eq "$_warn_threshold" ] && [ -n "$_output" ]; then + # Only print output when exceeding the warning threshold + echo "$_output" + fi +} + ######################################################## # ctdb_setup_state_dir diff --git a/ctdb/doc/ctdb-script.options.5.xml b/ctdb/doc/ctdb-script.options.5.xml index 6da272e30a4..3a39d6feb48 100644 --- a/ctdb/doc/ctdb-script.options.5.xml +++ b/ctdb/doc/ctdb-script.options.5.xml @@ -73,6 +73,29 @@ + + Monitoring Thresholds + + + Event scripts can monitor resources or services. When a + problem is detected, it may be better to warn about a problem + rather than to immediately fail monitoring and mark a node as + unhealthy. CTDB provides support for event scripts to do + threshold-based monitoring. + + + + A threshold setting looks like + WARNING_THRESHOLD:ERROR_THRESHOLD. + If the number of problems is ≥ WARNING_THRESHOLD then the + script will log a warning and continue. If the number of + problems is ≥ ERROR_THRESHOLD then the script will log an + error and exit with failure, causing monitoring to fail.
Note + that ERROR_THRESHOLD is optional, and follows the optional + colon (:) separator. + + +