]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
MINOR: promex: Add agent check status/code/duration metrics
authorChristopher Faulet <cfaulet@haproxy.com>
Thu, 22 May 2025 07:37:09 +0000 (09:37 +0200)
committerChristopher Faulet <cfaulet@haproxy.com>
Thu, 22 May 2025 07:50:10 +0000 (09:50 +0200)
In the Prometheus exporter, the last health check status is already exposed,
with its code and duration in seconds. The server status is also exposed.
But the information about the agent check is not available. This is not
really handy because when a server status is changed because of the agent,
it is not obvious from looking at the Prometheus metrics. Indeed, the server
may be reported as DOWN for instance, while the health check status still
reports a success. Being able to get the agent status in that case could be
valuable.

So now, the last agent check status is exposed, with its code and duration
in seconds. The following metrics can be grabbed now:

  * haproxy_server_agent_status
  * haproxy_server_agent_code
  * haproxy_server_agent_duration_seconds

Note that unlike the other metrics, no per-backend aggregated metric is
exposed.

This patch is related to issue #2983.

addons/promex/README
addons/promex/service-prometheus.c
src/stats-proxy.c

index 44a298848d214b2f8c329c036230715209a29773..61d99de152328817e475dee93c6cd7b35dc6572f 100644 (file)
@@ -389,6 +389,9 @@ listed below. Metrics from extra counters are not listed.
 | haproxy_server_max_connect_time_seconds            |
 | haproxy_server_max_response_time_seconds           |
 | haproxy_server_max_total_time_seconds              |
+| haproxy_server_agent_status                        |
+| haproxy_server_agent_code                          |
+| haproxy_server_agent_duration_seconds              |
 | haproxy_server_internal_errors_total               |
 | haproxy_server_unsafe_idle_connections_current     |
 | haproxy_server_safe_idle_connections_current       |
index dbfb9dcb1db5273f03f3726408e1c9bf999446c5..7c429051dd244c2b0d869f59fac6d92a3c7cc709 100644 (file)
@@ -173,6 +173,8 @@ const struct ist promex_st_metric_desc[ST_I_PX_MAX] = {
        [ST_I_PX_CTIME]          = IST("Avg. connect time for last 1024 successful connections."),
        [ST_I_PX_RTIME]          = IST("Avg. response time for last 1024 successful connections."),
        [ST_I_PX_TTIME]          = IST("Avg. total time for last 1024 successful connections."),
+       [ST_I_PX_AGENT_STATUS]   = IST("Status of last agent check, per state label value."),
+       [ST_I_PX_AGENT_DURATION] = IST("Total duration of the latest server agent check, in seconds."),
        [ST_I_PX_QT_MAX]         = IST("Maximum observed time spent in the queue"),
        [ST_I_PX_CT_MAX]         = IST("Maximum observed time spent waiting for a connection to complete"),
        [ST_I_PX_RT_MAX]         = IST("Maximum observed time spent waiting for a server response"),
@@ -1342,6 +1344,7 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
                                                secs = (double)sv->check.duration / 1000.0;
                                                val = mkf_flt(FN_DURATION, secs);
                                                break;
+
                                        case ST_I_PX_REQ_TOT:
                                                if (px->mode != PR_MODE_HTTP) {
                                                        sv = NULL;
@@ -1364,6 +1367,36 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
                                                labels[lb_idx+1].value = promex_hrsp_code[ctx->field_num - ST_I_PX_HRSP_1XX];
                                                break;
 
+                                       case ST_I_PX_AGENT_STATUS:
+                                               if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED)
+                                                       goto next_sv;
+
+                                               for (; ctx->obj_state < HCHK_STATUS_SIZE; ctx->obj_state++) {
+                                                       if (get_check_status_result(ctx->obj_state) < CHK_RES_FAILED)
+                                                               continue;
+                                                       val = mkf_u32(FO_STATUS, sv->agent.status == ctx->obj_state);
+                                                       check_state = get_check_status_info(ctx->obj_state);
+                                                       labels[lb_idx+1].name = ist("state");
+                                                       labels[lb_idx+1].value = ist(check_state);
+                                                       if (!promex_dump_ts(appctx, prefix, name, desc,
+                                                                           type,
+                                                                           &val, labels, &out, max))
+                                                               goto full;
+                                               }
+                                               ctx->obj_state = 0;
+                                               goto next_sv;
+                                       case ST_I_PX_AGENT_CODE:
+                                               if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED)
+                                                       goto next_sv;
+                                               val = mkf_u32(FN_OUTPUT, (sv->agent.status < HCHK_STATUS_L57DATA) ? 0 : sv->agent.code);
+                                               break;
+                                       case ST_I_PX_AGENT_DURATION:
+                                               if (sv->agent.status < HCHK_STATUS_CHECKED)
+                                                   goto next_sv;
+                                               secs = (double)sv->agent.duration / 1000.0;
+                                               val = mkf_flt(FN_DURATION, secs);
+                                               break;
+
                                        default:
                                                break;
                                }
index 3aaa25237518198e32dc6ce2b3e3fde3989b140a..6e4ad4649fdde0a9646f1b62ac5c437a2c3e427c 100644 (file)
@@ -111,9 +111,9 @@ const struct stat_col stat_cols_px[ST_I_PX_MAX] = {
        [ST_I_PX_CTIME]                         = { .name = "ctime",                       .alt_name = "connect_time_average_seconds",    .desc = "Time spent waiting for a connection to complete, in milliseconds, averaged over the 1024 last requests (backend/server)", .cap = STATS_PX_CAP___BS },
        [ST_I_PX_RTIME]                         = { .name = "rtime",                       .alt_name = "response_time_average_seconds",   .desc = "Time spent waiting for a server response, in milliseconds, averaged over the 1024 last requests (backend/server)", .cap = STATS_PX_CAP___BS },
        [ST_I_PX_TTIME]                         = { .name = "ttime",                       .alt_name = "total_time_average_seconds",      .desc = "Total request+response time (request+queue+connect+response+processing), in milliseconds, averaged over the 1024 last requests (backend/server)", .cap = STATS_PX_CAP___BS },
-       [ST_I_PX_AGENT_STATUS]                  = { .name = "agent_status",                .alt_name = NULL,                              .desc = "Status report of the server's latest agent check, prefixed with '*' if a check is currently in progress" },
-       [ST_I_PX_AGENT_CODE]                    = { .name = "agent_code",                  .alt_name = NULL,                              .desc = "Status code reported by the latest server agent check" },
-       [ST_I_PX_AGENT_DURATION]                = { .name = "agent_duration",              .alt_name = NULL,                              .desc = "Total duration of the latest server agent check, in milliseconds" },
+       [ST_I_PX_AGENT_STATUS]                  = { .name = "agent_status",                .alt_name = "agent_status",                    .desc = "Status report of the server's latest agent check, prefixed with '*' if a check is currently in progress", .cap = STATS_PX_CAP____S },
+       [ST_I_PX_AGENT_CODE]                    = { .name = "agent_code",                  .alt_name = "agent_code",                      .desc = "Status code reported by the latest server agent check", .cap = STATS_PX_CAP____S },
+       [ST_I_PX_AGENT_DURATION]                = { .name = "agent_duration",              .alt_name = "agent_duration_seconds",          .desc = "Total duration of the latest server agent check, in milliseconds", .cap = STATS_PX_CAP____S, },
        [ST_I_PX_CHECK_DESC]                    = { .name = "check_desc",                  .alt_name = NULL,                              .desc = "Textual description of the latest health check report for this server" },
        [ST_I_PX_AGENT_DESC]                    = { .name = "agent_desc",                  .alt_name = NULL,                              .desc = "Textual description of the latest agent check report for this server" },
        [ST_I_PX_CHECK_RISE]                    = { .name = "check_rise",                  .alt_name = NULL,                              .desc = "Number of successful health checks before declaring a server UP (server 'rise' setting)" },