MAJOR: contrib/prometheus-exporter: move health check status to labels

author William Dauchy <wdauchy@gmail.com>

Mon, 1 Feb 2021 12:11:51 +0000 (13:11 +0100)

committer Christopher Faulet <cfaulet@haproxy.com>

Mon, 1 Feb 2021 14:16:33 +0000 (15:16 +0100)
author William Dauchy <wdauchy@gmail.com>
Mon, 1 Feb 2021 12:11:51 +0000 (13:11 +0100)
committer Christopher Faulet <cfaulet@haproxy.com>
Mon, 1 Feb 2021 14:16:33 +0000 (15:16 +0100)
diff --git a/contrib/prometheus-exporter/README b/contrib/prometheus-exporter/README

index 5b163d1312446fa1466a74f18ad9e3279400932d..f948ac9b1a82066742b2aae65a1a06bde7f8c2aa 100644 (file)
--- a/contrib/prometheus-exporter/README
+++ b/contrib/prometheus-exporter/README
@@ -73,7 +73,9 @@ exported. Here are examples:
    /metrics?scope=*&scope=               # ==> no metrics will be exported
    /metrics?scope=&scope=global          # ==> global metrics will be exported
  
-* Filtering on servers state
+* How do I prevent my prometheus instance from exploding?
+
+** Filtering on servers state
  
  It is possible to exclude from returned metrics all servers in maintenance mode
  passing the parameter "no-maint" in the query-string. This parameter may help to
@@ -82,6 +84,26 @@ manage dynamic provisionning. Note there is no consistency check on the servers
  state. So, if the state of a server changes while the exporter is running, only
  a part of the metrics for this server will be dumped.
  
+prometheus example config:
+
+For server-template users:
+- <job>
+  params:
+    no-maint:
+    - empty
+
+** Scrape server health checks only
+
+All health check statuses are dumped through `state` label values. If you want
+to scrape server health check statuses while preventing every server metric
+other than server_check_status from being saved, configure prometheus this way:
+
+- <job>
+   metric_relabel_configs:
+   - source_labels: ['__name__']
+     regex: 'haproxy_(process_|frontend_|backend_|server_check_status).*'
+     action: keep
+
  Exported metrics
  ------------------
  
@@ -292,30 +314,3 @@ Exported metrics
  | haproxy_server_used_connections_current            | Current number of connections in use.                                     |
  | haproxy_server_need_connections_current            | Estimated needed number of connections.                                   |
  +----------------------------------------------------+---------------------------------------------------------------------------+
-
-Mapping of health check status :
-
-   0 : HCHK_STATUS_UNKNOWN  (Unknown)
-   1 : HCHK_STATUS_INI      (Initializing)
-
-   4 : HCHK_STATUS_HANA     (Health analyze detected enough consecutive errors)
-
-   5 : HCHK_STATUS_SOCKERR  (Socket error)
-
-   6 : HCHK_STATUS_L4OK     (L4 check passed, for example tcp connect)
-   7 : HCHK_STATUS_L4TOUT   (L4 timeout)
-   8 : HCHK_STATUS_L4CON    (L4 connection problem)
-
-   9 : HCHK_STATUS_L6OK     (L6 check passed)
-  10 : HCHK_STATUS_L6TOUT   (L6 (SSL) timeout)
-  11 : HCHK_STATUS_L6RSP    (L6 invalid response - protocol error)
-
-  12 : HCHK_STATUS_L7TOUT   (L7 (HTTP/SMTP) timeout)
-  13 : HCHK_STATUS_L7RSP    (L7 invalid response - protocol error)
-  15 : HCHK_STATUS_L7OKD    (L7 check passed)
-  16 : HCHK_STATUS_L7OKCD   (L7 check conditionally passed)
-  17 : HCHK_STATUS_L7STS    (L7 response error, for example HTTP 5xx)
-
-  18 : HCHK_STATUS_PROCERR  (External process check failure)
-  19 : HCHK_STATUS_PROCTOUT (External process check timeout)
-  20 : HCHK_STATUS_PROCOK   (External process check passed)
diff --git a/contrib/prometheus-exporter/service-prometheus.c b/contrib/prometheus-exporter/service-prometheus.c

index dbf4c7f396f5a139bb79cc5c30282dc41e8ef2d7..df9c7cfdf293c8e0e2dbf09c37b9d0ddb78c5e57 100644 (file)
--- a/contrib/prometheus-exporter/service-prometheus.c
+++ b/contrib/prometheus-exporter/service-prometheus.c
@@ -18,6 +18,7 @@
  #include <haproxy/applet.h>
  #include <haproxy/backend.h>
  #include <haproxy/cfgparse.h>
+#include <haproxy/check.h>
  #include <haproxy/compression.h>
  #include <haproxy/dns.h>
  #include <haproxy/frontend.h>
@@ -319,7 +320,7 @@ const struct ist promex_st_metric_desc[ST_F_TOTAL_FIELDS] = {
         [ST_F_RATE]           = IST("Current number of sessions per second over last elapsed second."),
         [ST_F_RATE_LIM]       = IST("Configured limit on new sessions per second."),
         [ST_F_RATE_MAX]       = IST("Maximum observed number of sessions per second."),
-       [ST_F_CHECK_STATUS]   = IST("Status of last health check (HCHK_STATUS_* values)."),
+       [ST_F_CHECK_STATUS]   = IST("Status of last health check, per state label value."),
         [ST_F_CHECK_CODE]     = IST("layer5-7 code, if available of the last health check."),
         [ST_F_CHECK_DURATION] = IST("Total duration of the latest server health check, in seconds."),
         [ST_F_HRSP_1XX]       = IST("Total number of HTTP responses."),
@@ -886,6 +887,7 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
         int ret = 1;
         double secs;
         enum promex_srv_state state;
+       const char *check_state;
         int i;
  
         for (;appctx->st2 < ST_F_TOTAL_FIELDS; appctx->st2++) {
@@ -963,8 +965,19 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
                                         case ST_F_CHECK_STATUS:
                                                 if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED)
                                                         goto next_sv;
-                                               val = mkf_u32(FN_OUTPUT, sv->check.status);
-                                               break;
+
+                                               for (i = 0; i < HCHK_STATUS_SIZE; i++) {
+                                                       if (get_check_status_result(i) < CHK_RES_FAILED)
+                                                               continue;
+                                                       val = mkf_u32(FO_STATUS, sv->check.status == i);
+                                                       check_state = get_check_status_info(i);
+                                                       labels[2].name = ist("state");
+                                                       labels[2].value = ist2(check_state, strlen(check_state));
+                                                       if (!promex_dump_metric(appctx, htx, prefix, &promex_st_metrics[appctx->st2],
+                                                                               &val, labels, &out, max))
+                                                               goto full;
+                                               }
+                                               goto next_sv;
                                         case ST_F_CHECK_CODE:
                                                 if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED)
                                                         goto next_sv;
author	William Dauchy <wdauchy@gmail.com>
	Mon, 1 Feb 2021 12:11:51 +0000 (13:11 +0100)
committer	Christopher Faulet <cfaulet@haproxy.com>
	Mon, 1 Feb 2021 14:16:33 +0000 (15:16 +0100)
contrib/prometheus-exporter/README		patch \| blob \| blame \| history
contrib/prometheus-exporter/service-prometheus.c		patch \| blob \| blame \| history