]> git.ipfire.org Git - thirdparty/samba.git/commitdiff
ctdb-scripts: Implement failcount handling with thresholds
authorMartin Schwenke <mschwenke@ddn.com>
Fri, 3 Mar 2023 06:49:05 +0000 (17:49 +1100)
committerAmitay Isaacs <amitay@samba.org>
Tue, 3 Oct 2023 03:53:35 +0000 (03:53 +0000)
This can be used for simple failure counting, without restarts, as
used in the 40.vsftpd event script.  That case will subsequently be
converted and this functionality can also be used elsewhere.

Add documentation to ctdb-script.options(5) to allow parameters that
use this to be more easily described.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/functions
ctdb/doc/ctdb-script.options.5.xml

index 5a7737ec34ab44718c5644b7f1e0f7fc441c6adb..d8f7f57b84c8fb0b3f31612bf402db66a2f83163 100755 (executable)
@@ -760,6 +760,114 @@ ctdb_counter_get()
        echo $_val
 }
 
+#
+# Fail counter/threshold combination to control warnings and node unhealthy
+#
+
+# Check whether a threshold can be used in a numeric comparison.
+#
+# $1: threshold value to check
+# $2: name of the thing being checked, only used in the warning
+#
+# Returns 0 if $1 is a non-empty string consisting only of decimal
+# digits.  Returns 1 silently if $1 is empty, or returns 1 after
+# printing a warning if $1 contains any other character.
+_failcount_validate_threshold()
+{
+       # Match against the whole value in the shell rather than
+       # piping it through a line-based filter, so values containing
+       # newlines, leading dashes or backslashes are judged as given
+       case "$1" in
+       "")
+               # A failure that doesn't need a warning
+               return 1
+               ;;
+       *[!0-9]*)
+               echo "WARNING: ${1} is an invalid threshold in \"${2}\" check"
+               return 1
+               ;;
+       esac
+
+       return 0
+}
+
+_failcount_common()
+{
+       _thing="$1"
+
+       _counter=$(echo "$_thing" | sed -e 's@/@_SLASH_@g' -e 's@ @_@g')
+}
+
+# Initialise the fail counter for a thing.
+#
+# $1: name of the thing being monitored
+failcount_init()
+{
+       # Sets $_thing and $_counter from the given name
+       _failcount_common "$1"
+
+       ctdb_counter_init "$_counter"
+}
+
+# Note that a thing is no longer failing.
+#
+# $1: name of the thing being monitored
+#
+# Does nothing if the fail count for $1 is 0.  Otherwise prints a
+# notice and re-initialises the counter.
+failcount_reset()
+{
+       # Sets $_thing and $_counter from the given name
+       _failcount_common "$1"
+
+       _failcount=$(ctdb_counter_get "$_counter")
+       if ! [ "$_failcount" -eq 0 ]; then
+               printf 'NOTICE: %s: no longer failing\n' "$_thing"
+               ctdb_counter_init "$_counter"
+       fi
+}
+
+# Record a failure for a thing and act on the configured thresholds.
+#
+# $1: name of the thing being monitored
+# $2: thresholds, WARNING_THRESHOLD[:ERROR_THRESHOLD]; if empty then
+#     the warning threshold is 1 and there is no error threshold
+# $3: optional output, printed only when the fail count is equal to
+#     the threshold that has been reached
+#
+# Increments the fail counter for $1.  If the count has reached the
+# error threshold then an error is printed and the script exits with
+# status 1.  Otherwise, if the count has reached the warning
+# threshold then a warning is printed.  Returns 0 without printing
+# anything if the count is below the warning threshold.
+failcount_incr()
+{
+       _thing="$1"
+       _thresholds="$2"
+       _output="$3"
+
+       _failcount_common "$_thing"
+
+       ctdb_counter_incr "$_counter"
+       _failcount=$(ctdb_counter_get "$_counter")
+
+       case "$_thresholds" in
+       *:*)
+               _warn_threshold="${_thresholds%:*}"
+               _unhealthy_threshold="${_thresholds#*:}"
+               ;;
+       "")
+               _warn_threshold=1
+               _unhealthy_threshold=""
+               ;;
+       *)
+               _warn_threshold="$_thresholds"
+               _unhealthy_threshold=""
+               ;;
+       esac
+
+       if _failcount_validate_threshold "$_unhealthy_threshold" "$_thing"; then
+               if [ "$_failcount" -ge "$_unhealthy_threshold" ]; then
+                       printf 'ERROR: %s: fail count %d >= threshold %d\n' \
+                              "$_thing" \
+                              "$_failcount" \
+                              "$_unhealthy_threshold"
+                       # Only print output when the fail count is
+                       # equal to the unhealthy threshold, not on
+                       # every failure beyond it
+                       if [ "$_failcount" -eq "$_unhealthy_threshold" ] && \
+                                  [ -n "$_output" ]; then
+                               echo "$_output"
+                       fi
+                       exit 1
+               fi
+       fi
+
+       if ! _failcount_validate_threshold "$_warn_threshold" "$_thing"; then
+               # An empty or invalid warning threshold can not be
+               # used in the numeric tests and format below.  Use the
+               # default instead, so every failure is still warned
+               # about.
+               _warn_threshold=1
+       fi
+
+       if [ "$_failcount" -lt "$_warn_threshold" ]; then
+               return 0
+       fi
+
+       printf 'WARNING: %s: fail count %d >= threshold %d\n' \
+              "$_thing" \
+              "$_failcount" \
+              "$_warn_threshold"
+       if [ "$_failcount" -eq "$_warn_threshold" ] && [ -n "$_output" ]; then
+               # Only print output when the fail count is equal to
+               # the warning threshold
+               echo "$_output"
+       fi
+}
+
 ########################################################
 
 # ctdb_setup_state_dir <type> <name>
index 6da272e30a46d2bf65cd2060d8e8f5536c971a12..3a39d6feb48eccf7e06dc6719baa54061c045843 100644 (file)
       </para>
     </refsect2>
 
+    <refsect2>
+      <title>Monitoring Thresholds</title>
+
+      <para>
+       Event scripts can monitor resources or services.  When a
+       problem is detected, it may be better to warn about a problem
+       rather than to immediately fail monitoring and mark a node as
+       unhealthy.  CTDB provides support for event scripts to do
+       threshold-based monitoring.
+      </para>
+
+      <para>
+       A threshold setting looks like
+       <parameter>WARNING_THRESHOLD<optional>:ERROR_THRESHOLD</optional></parameter>.
+       If the number of problems is ≥ WARNING_THRESHOLD then the
+       script will log a warning and continue.  If the number of
+       problems is ≥ ERROR_THRESHOLD then the script will log an
+       error and exit with failure, causing monitoring to fail.  Note
+       that ERROR_THRESHOLD is optional, and follows the optional
+       colon (:) separator.
+      </para>
+    </refsect2>
+
   </refsect1>
 
   <refsect1>