From: Christopher Faulet Date: Thu, 21 Nov 2019 13:35:46 +0000 (+0100) Subject: MINOR: contrib/prometheus-exporter: Add health check status/code in server metrics X-Git-Tag: v2.2-dev1~73 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cf403f32e4bd5217064f8831930b45c5fb37131b;p=thirdparty%2Fhaproxy.git MINOR: contrib/prometheus-exporter: Add health check status/code in server metrics ST_F_CHECK_STATUS and ST_F_CHECK_CODE are now part of exported server metrics: * haproxy_server_check_status * haproxy_server_check_code The health check status is an integer corresponding to HCHK_STATUS value. --- diff --git a/contrib/prometheus-exporter/README b/contrib/prometheus-exporter/README index b19acc1bdb..a9bd5e7a39 100644 --- a/contrib/prometheus-exporter/README +++ b/contrib/prometheus-exporter/README @@ -268,6 +268,8 @@ Exported metrics | haproxy_server_client_aborts_total | Total number of data transfers aborted by the client. | | haproxy_server_server_aborts_total | Total number of data transfers aborted by the server. | | haproxy_server_weight | Service weight. | +| haproxy_server_check_status | Status of last health check, if enabled. (see below for the mapping) | +| haproxy_server_check_code | layer5-7 code, if available of the last health check. | | haproxy_server_check_failures_total | Total number of failed check (Only when the server is up). | | haproxy_server_check_up_down_total | Total number of UP->DOWN transitions. | | haproxy_server_downtime_seconds_total | Total downtime (in seconds) for the service. | @@ -278,3 +280,30 @@ Exported metrics | haproxy_server_idle_connections_current | Current number of idle connections available for reuse. | | haproxy_server_idle_connections_limit | Limit on the number of available idle connections. 
| +----------------------------------------------------+---------------------------------------------------------------------------+ + +Mapping of health check status : + + 0 : HCHK_STATUS_UNKNOWN (Unknown) + 1 : HCHK_STATUS_INI (Initializing) + + 4 : HCHK_STATUS_HANA (Health analyze detected enough consecutive errors) + + 5 : HCHK_STATUS_SOCKERR (Socket error) + + 6 : HCHK_STATUS_L4OK (L4 check passed, for example tcp connect) + 7 : HCHK_STATUS_L4TOUT (L4 timeout) + 8 : HCHK_STATUS_L4CON (L4 connection problem) + + 9 : HCHK_STATUS_L6OK (L6 check passed) + 10 : HCHK_STATUS_L6TOUT (L6 (SSL) timeout) + 11 : HCHK_STATUS_L6RSP (L6 invalid response - protocol error) + + 12 : HCHK_STATUS_L7TOUT (L7 (HTTP/SMTP) timeout) + 13 : HCHK_STATUS_L7RSP (L7 invalid response - protocol error) + 15 : HCHK_STATUS_L7OKD (L7 check passed) + 16 : HCHK_STATUS_L7OKCD (L7 check conditionally passed) + 17 : HCHK_STATUS_L7STS (L7 response error, for example HTTP 5xx) + + 18 : HCHK_STATUS_PROCERR (External process check failure) + 19 : HCHK_STATUS_PROCTOUT (External process check timeout) + 20 : HCHK_STATUS_PROCOK (External process check passed) diff --git a/contrib/prometheus-exporter/service-prometheus.c b/contrib/prometheus-exporter/service-prometheus.c index 0f178eb64e..4cf216a23a 100644 --- a/contrib/prometheus-exporter/service-prometheus.c +++ b/contrib/prometheus-exporter/service-prometheus.c @@ -367,7 +367,7 @@ const int promex_srv_metrics[ST_F_TOTAL_FIELDS] = { [ST_F_WRETR] = ST_F_WREDIS, [ST_F_WREDIS] = ST_F_WREW, [ST_F_STATUS] = ST_F_SCUR, - [ST_F_WEIGHT] = ST_F_CHKFAIL, + [ST_F_WEIGHT] = ST_F_CHECK_STATUS, [ST_F_ACT] = 0, [ST_F_BCK] = 0, [ST_F_CHKFAIL] = ST_F_CHKDOWN, @@ -385,8 +385,8 @@ const int promex_srv_metrics[ST_F_TOTAL_FIELDS] = { [ST_F_RATE] = 0, [ST_F_RATE_LIM] = 0, [ST_F_RATE_MAX] = ST_F_LASTSESS, - [ST_F_CHECK_STATUS] = 0, - [ST_F_CHECK_CODE] = 0, + [ST_F_CHECK_STATUS] = ST_F_CHECK_CODE, + [ST_F_CHECK_CODE] = ST_F_CHKFAIL, [ST_F_CHECK_DURATION] = 0, [ST_F_HRSP_1XX] = 
ST_F_HRSP_2XX, [ST_F_HRSP_2XX] = ST_F_HRSP_3XX, @@ -709,7 +709,7 @@ const struct ist promex_st_metric_desc[ST_F_TOTAL_FIELDS] = { [ST_F_RATE] = IST("Current number of sessions per second over last elapsed second."), [ST_F_RATE_LIM] = IST("Configured limit on new sessions per second."), [ST_F_RATE_MAX] = IST("Maximum observed number of sessions per second."), - [ST_F_CHECK_STATUS] = IST("Status of last health check (If a check is running, the status will be reported, prefixed with '* ')."), + [ST_F_CHECK_STATUS] = IST("Status of last health check (HCHK_STATUS_* values)."), [ST_F_CHECK_CODE] = IST("layer5-7 code, if available of the last health check."), [ST_F_CHECK_DURATION] = IST("Time in ms took to finish last health check."), [ST_F_HRSP_1XX] = IST("Total number of HTTP responses."), @@ -1027,8 +1027,8 @@ const struct ist promex_st_metric_types[ST_F_TOTAL_FIELDS] = { [ST_F_RATE] = IST("untyped"), [ST_F_RATE_LIM] = IST("gauge"), [ST_F_RATE_MAX] = IST("gauge"), - [ST_F_CHECK_STATUS] = IST("untyped"), - [ST_F_CHECK_CODE] = IST("untyped"), + [ST_F_CHECK_STATUS] = IST("gauge"), + [ST_F_CHECK_CODE] = IST("gauge"), [ST_F_CHECK_DURATION] = IST("gauge"), [ST_F_HRSP_1XX] = IST("counter"), [ST_F_HRSP_2XX] = IST("counter"), @@ -2012,6 +2012,16 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx) weight = (sv->cur_eweight * px->lbprm.wmult + px->lbprm.wdiv - 1) / px->lbprm.wdiv; metric = mkf_u32(FN_AVG, weight); break; + case ST_F_CHECK_STATUS: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED) + goto next_sv; + metric = mkf_u32(FN_OUTPUT, sv->check.status); + break; + case ST_F_CHECK_CODE: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED) + goto next_sv; + metric = mkf_u32(FN_OUTPUT, (sv->check.status < HCHK_STATUS_L57DATA) ? 0 : sv->check.code); + break; case ST_F_CHKFAIL: metric = mkf_u64(FN_COUNTER, sv->counters.failed_checks); break;