]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
[MEDIUM] stats: report server and backend cumulated downtime
authorKrzysztof Oledzki <ole@ans.pl>
Mon, 22 Oct 2007 14:21:10 +0000 (16:21 +0200)
committerWilly Tarreau <w@1wt.eu>
Mon, 22 Oct 2007 19:36:23 +0000 (21:36 +0200)
Hello,

This patch implements new statistics for SLA calculation by adding a new
field 'Dwntime' with the total down time since restart (both HTTP/CSV) and
extending the status field (HTTP) or inserting a new one (CSV) with a time
showing how long each server/backend has been in its current state. Additionally,
down transitions are also calculated and displayed for backends, so it is
possible to know how many times a selected backend was down, generating the "No
server is available to handle this request." error.

The new information is presented in two different ways:
   - for HTTP: a "human readable form", one of "100000d 23h", "23h 59m" or
      "59m 59s"
   - for CSV: seconds

I believe that seconds resolution is enough.

As there are more columns in the status page I decided to shrink some
names to make more space:
   - Weight -> Wght
   - Check -> Chk
   - Down -> Dwn

Making described changes I also made some improvements and fixed some
small bugs:
   - don't increment s->health above 's->rise + s->fall - 1'. Previously it
     was incremented and then (re)set to 's->rise + s->fall - 1'.
   - do not set server down if it is down already
   - do not set server up if it is up already
   - fix colspan in multiple places (mostly introduced by my previous patch)
   - add missing "status" header to CSV
   - fix order of retries/redispatches in server (CSV)
   - s/Tthen/Then/
   - s/server/backend/ in DATA_ST_PX_BE (dumpstats.c)

Changes from previous version:
  - deal with negative time intervals
  - don't rely on s->state (SRV_RUNNING)
  - little reworked human_time + compacted format (no spaces). If needed it
    can be used in the future for other purposes by optionally making "cnt"
    as an argument
  - leave set_server_down mostly unchanged
  - only little reworked "process_chk: 9"
  - additional fields in CSV are appended to the right
  - fix "SEC" macro
  - named arguments (human_time, be_downtime, srv_downtime)

Hope it is OK. If there are only cosmetic changes needed please feel free
to correct it, however if there are some bigger changes required I would
like to discuss it first or at least to know what exactly was changed
especially since I already put this patch into my production server. :)

Thank you,

Best regards,

  Krzysztof Oledzki

include/common/time.h
include/proto/backend.h
include/proto/server.h
include/types/proxy.h
include/types/server.h
src/backend.c
src/cfgparse.c
src/checks.c
src/dumpstats.c
src/server.c
src/time.c

index 001d63e95469ca5e06c47802d5663a4f00c1c629..ced4b6bd95619d461be70f29e9c5d02265be8e97 100644 (file)
 #include <common/config.h>
 #include <common/standard.h>
 
+#define SEC 1                  /* base unit: one second */
+#define MINUTE (60 * SEC)      /* one minute, expressed in SEC units */
+#define HOUR (60 * MINUTE)     /* one hour, expressed in SEC units */
+#define DAY (24 * HOUR)        /* one day, expressed in SEC units */
+
 /* eternity when exprimed in timeval */
 #ifndef TV_ETERNITY
 #define TV_ETERNITY     (~0UL)
@@ -480,6 +485,7 @@ REGPRM3 static inline struct timeval *__tv_ms_add(struct timeval *tv, const stru
         tv1;                       \
 })
 
+char *human_time(int t, short hz);
 
 #endif /* _COMMON_TIME_H */
 
index 467e756806adf5d52a7df48d9e3cad4c1f4188bc..d70c68faf030c61f46d911b513766ffc22f33b79 100644 (file)
@@ -39,7 +39,7 @@ int srv_redispatch_connect(struct session *t);
 
 void recount_servers(struct proxy *px);
 void recalc_server_map(struct proxy *px);
-
+int be_downtime(struct proxy *px);
 
 /*
  * This function tries to find a running server with free connection slots for
index 2df0763a27656aea8153b8afc8b8489388252b6e..27e4f367d92f4fa200cf822f3c20bdb92f1984a6 100644 (file)
@@ -31,7 +31,7 @@
 
 #include <proto/queue.h>
 
-
+int srv_downtime(struct server *s);
 
 #endif /* _PROTO_SERVER_H */
 
index ced7630e3e2e464e780a474bc9c19ba51c2b1fef..04d116272fb9b5d38ccb3adea82ae075bedab88a 100644 (file)
@@ -111,6 +111,11 @@ struct proxy {
        unsigned int maxconn;                   /* max # of active sessions on the frontend */
        unsigned int fullconn;                  /* #conns on backend above which servers are used at full load */
        struct in_addr except_net, except_mask; /* don't x-forward-for for this address. FIXME: should support IPv6 */
+
+       unsigned down_trans;                    /* up-down transitions */
+       unsigned down_time;                     /* total time the proxy was down */
+       time_t last_change;                     /* last time, when the state was changed */
+
        unsigned failed_conns, failed_resp;     /* failed connect() and responses */
        unsigned retries, redispatches;         /* retried and redispatched connections */
        unsigned denied_req, denied_resp;       /* blocked requests/responses because of security concerns */
index 6e1e810edff0ae86046ed76ef3d1a3558d1bb65f..62179f33d89de96b91ca93df8ffe61f18d564808 100644 (file)
@@ -86,10 +86,14 @@ struct server {
        unsigned wscore;                        /* weight score, used during srv map computation */
 
        unsigned failed_checks, down_trans;     /* failed checks and up-down transitions */
+       unsigned down_time;                     /* total time the server was down */
+       time_t last_change;                     /* last time, when the state was changed */
+
        unsigned failed_conns, failed_resp;     /* failed connect() and responses */
        unsigned retries;                       /* retried connections */
        unsigned failed_secu;                   /* blocked responses because of security concerns */
        unsigned cum_sess;                      /* cumulated number of sessions really sent to this server */
+
        long long bytes_in;                     /* number of bytes transferred from the client to the server */
        long long bytes_out;                    /* number of bytes transferred from the server to the client */
 };
index 6cecb1770911e90d55bb89dc345da3b1819ea126..596d5676cafffdf428815d9e57fcf957b753308e 100644 (file)
@@ -717,6 +717,13 @@ int srv_redispatch_connect(struct session *t)
        return 0;
 }
 
+/*
+ * Returns the cumulated down time of proxy <px>, in seconds: the already
+ * recorded px->down_time, plus the duration of the outage still in progress
+ * when the proxy currently counts neither an active nor a backup server.
+ * When px->last_change is not strictly in the past (eg: the clock went
+ * backwards), the elapsed part is ignored so that the reported total can
+ * never be lower than px->down_time.
+ */
+int be_downtime(struct proxy *px) {
+
+       /* backend is up, or elapsed time would be null/negative: nothing to add */
+       if (px->srv_act || px->srv_bck || px->last_change >= now.tv_sec)
+               return px->down_time;
+
+       /* backend is down since last_change: account for the running outage */
+       return now.tv_sec - px->last_change + px->down_time;
+}
 
 /*
  * Local variables:
index 7483a9c6cb071998c6deb8a44233ed1131295597..7b73e0bee1959ee8ab1ea0a6f8fe8f29987f521b 100644 (file)
@@ -564,7 +564,8 @@ int cfg_parse_listen(const char *file, int linenum, char **args)
                tv_eternity(&curproxy->srvtimeout);
                tv_eternity(&curproxy->contimeout);
                tv_eternity(&curproxy->appsession_timeout);
-               
+
+               curproxy->last_change = now.tv_sec;
                curproxy->id = strdup(args[1]);
                curproxy->cap = rc;
 
@@ -1380,6 +1381,7 @@ int cfg_parse_listen(const char *file, int linenum, char **args)
                LIST_INIT(&newsrv->pendconns);
                do_check = 0;
                newsrv->state = SRV_RUNNING; /* early server setup */
+               newsrv->last_change = now.tv_sec;
                newsrv->id = strdup(args[1]);
 
                /* several ways to check the port component :
index 0cd1a874179886a9d127acfa1e8e9521e2c7a4c0..77e9b513f0ccab60a470ea5afc7989ae3a90f0f3 100644 (file)
@@ -57,9 +57,11 @@ static void set_server_down(struct server *s)
        struct session *sess;
        int xferred;
 
-       s->state &= ~SRV_RUNNING;
-
        if (s->health == s->rise) {
+
+               s->last_change = now.tv_sec;
+               s->state &= ~SRV_RUNNING;
+
                recount_servers(s->proxy);
                s->proxy->map_state |= PR_MAP_RECALC;
 
@@ -92,10 +94,13 @@ static void set_server_down(struct server *s)
 
                Warning("%s", trash);
                send_log(s->proxy, LOG_ALERT, "%s", trash);
-       
+
                if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
-                       Alert("%s '%s' has no server available !\n", proxy_type_str(s->proxy), s->proxy->id);
-                       send_log(s->proxy, LOG_EMERG, "%s %s has no server available !\n", proxy_type_str(s->proxy), s->proxy->id);
+                       s->proxy->last_change = now.tv_sec;
+                       s->proxy->down_trans++;
+
+                       Alert("%s '%s' has no server available!\n", proxy_type_str(s->proxy), s->proxy->id);
+                       send_log(s->proxy, LOG_EMERG, "%s %s has no server available!\n", proxy_type_str(s->proxy), s->proxy->id);
                }
                s->down_trans++;
        }
@@ -455,13 +460,24 @@ void process_chk(struct task *t, struct timeval *next)
                /* there was a test running */
                if (s->result > 0) { /* good server detected */
                        //fprintf(stderr, "process_chk: 9\n");
-                       s->health++; /* was bad, stays for a while */
-                       if (s->health >= s->rise) {
-                               s->state |= SRV_RUNNING;
+
+                       if (s->health < s->rise + s->fall - 1) {
+                               s->health++; /* was bad, stays for a while */
 
                                if (s->health == s->rise) {
                                        int xferred;
 
+                                       if (s->last_change < now.tv_sec)                        // ignore negative times
+                                               s->down_time += now.tv_sec - s->last_change;
+                                       s->last_change = now.tv_sec;
+                                       s->state |= SRV_RUNNING;
+
+                                       if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
+                                               if (s->proxy->last_change < now.tv_sec)         // ignore negative times
+                                                       s->proxy->down_time += now.tv_sec - s->proxy->last_change;
+                                               s->proxy->last_change = now.tv_sec;
+                                       }
+
                                        recount_servers(s->proxy);
                                        s->proxy->map_state |= PR_MAP_RECALC;
 
@@ -493,7 +509,8 @@ void process_chk(struct task *t, struct timeval *next)
                                        send_log(s->proxy, LOG_NOTICE, "%s", trash);
                                }
 
-                               s->health = s->rise + s->fall - 1; /* OK now */
+                               if (s->health >= s->rise)
+                                       s->health = s->rise + s->fall - 1; /* OK now */
                        }
                        s->curfd = -1; /* no check running anymore */
                        fd_delete(fd);
index 02e0518d857fc299e0de94e8c9ebd1a4494fb928..7ec4b565be0a2cf8f08f4bd8c74275f767498d14 100644 (file)
@@ -16,7 +16,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <time.h>
 #include <pwd.h>
 #include <grp.h>
 
@@ -47,6 +46,7 @@
 #include <proto/proto_uxst.h>
 #include <proto/senddata.h>
 #include <proto/session.h>
+#include <proto/server.h>
 
 /* This function parses a "stats" statement in the "global" section. It returns
  * -1 if there is any error, otherwise zero. If it returns -1, it may write an
@@ -186,8 +186,8 @@ int stats_dump_raw(struct session *s, struct uri_auth *uri, int flags)
                             "dreq,dresp,"
                             "ereq,econ,eresp,"
                             "wretr,wredis,"
-                            "weight,act,bck,"
-                            "chkfail,chkdown"
+                            "status,weight,act,bck,"
+                            "chkfail,chkdown,lastchg,downtime,"
                             "\n");
                        
                if (buffer_write_chunk(rep, &msg) != 0)
@@ -371,8 +371,8 @@ int stats_dump_http(struct session *s, struct uri_auth *uri, int flags)
                             "dreq,dresp,"
                             "ereq,econ,eresp,"
                             "wretr,wredis,"
-                            "weight,act,bck,"
-                            "chkfail,chkdown"
+                            "status,weight,act,bck,"
+                            "chkfail,chkdown,lastchg,downtime,"
                             "\n");
                }
                if (buffer_write_chunk(rep, &msg) != 0)
@@ -582,26 +582,26 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                if (flags & STAT_FMT_HTML) {
                        /* print a new table */
                        chunk_printf(&msg, sizeof(trash),
-                                    "<table cols=\"22\" class=\"tbl\" width=\"100%%\">\n"
+                                    "<table cols=\"23\" class=\"tbl\" width=\"100%%\">\n"
                                     "<tr align=\"center\" class=\"titre\">"
                                     "<th colspan=2 class=\"pxname\">%s</th>"
-                                    "<th colspan=18 class=\"empty\"></th>"
+                                    "<th colspan=21 class=\"empty\"></th>"
                                     "</tr>\n"
                                     "<tr align=\"center\" class=\"titre\">"
                                     "<th rowspan=2></th>"
                                     "<th colspan=2>Queue</th><th colspan=4>Sessions</th>"
                                     "<th colspan=2>Bytes</th><th colspan=2>Denied</th>"
                                     "<th colspan=3>Errors</th><th colspan=2>Warnings</th>"
-                                    "<th colspan=6>Server</th>"
+                                    "<th colspan=7>Server</th>"
                                     "</tr>\n"
                                     "<tr align=\"center\" class=\"titre\">"
                                     "<th>Cur</th><th>Max</th><th>Cur</th><th>Max</th>"
                                     "<th>Limit</th><th>Cumul</th><th>In</th><th>Out</th>"
                                     "<th>Req</th><th>Resp</th><th>Req</th><th>Conn</th>"
                                     "<th>Resp</th><th>Retr</th><th>Redis</th>"
-                                    "<th>Status</th><th>Weight</th><th>Act</th>"
-                                    "<th>Bck</th><th>Check</th><th>Down</th></tr>\n"
-                                    "",
+                                    "<th>Status</th><th>Wght</th><th>Act</th>"
+                                    "<th>Bck</th><th>Chk</th><th>Dwn</th><th>Dwntme</th>\n"
+                                    "</tr>",
                                     px->id);
 
                        if (buffer_write_chunk(rep, &msg) != 0)
@@ -632,7 +632,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     /* server status : reflect frontend status */
                                     "<td align=center>%s</td>"
                                     /* rest of server: nothing */
-                                    "<td align=center colspan=5></td></tr>"
+                                    "<td align=center colspan=6></td></tr>"
                                     "",
                                     px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
                                     px->bytes_in, px->bytes_out,
@@ -657,7 +657,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     /* server status : reflect frontend status */
                                     "%s,"
                                     /* rest of server: nothing */
-                                    ",,,,,"
+                                    ",,,,,,,"
                                     "\n",
                                     px->id,
                                     px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
@@ -734,6 +734,11 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     
                                /* status */
                                chunk_printf(&msg, sizeof(trash), "<td nowrap>");
+
+                               if (sv->state & SRV_CHECKED)
+                                       chunk_printf(&msg, sizeof(trash), "%s ",
+                                               human_time(now.tv_sec - sv->last_change, 1));
+
                                chunk_printf(&msg, sizeof(trash),
                                     srv_hlt_st[sv_state],
                                     (sv->state & SRV_RUNNING) ? (sv->health - sv->rise + 1) : (sv->health),
@@ -749,14 +754,17 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     (sv->state & SRV_BACKUP) ? "-" : "Y",
                                     (sv->state & SRV_BACKUP) ? "Y" : "-");
 
-                               /* check failures : unique, fatal */
+                               /* check failures: unique, fatal, down time */
                                if (sv->state & SRV_CHECKED)
                                        chunk_printf(&msg, sizeof(trash),
-                                            "<td align=right>%d</td><td align=right>%d</td></tr>\n",
-                                            sv->failed_checks, sv->down_trans);
+                                            "<td align=right>%d</td><td align=right>%d</td>"
+                                            "<td nowrap align=right>%s</td>"
+                                            "</tr>\n",
+                                            sv->failed_checks, sv->down_trans,
+                                            human_time(srv_downtime(sv), 1));
                                else
                                        chunk_printf(&msg, sizeof(trash),
-                                            "<td colspan=2></td></tr>\n");
+                                            "<td colspan=3></td></tr>\n");
                        } else {
                                static char *srv_hlt_st[5] = { "DOWN,", "DOWN %d/%d,", "UP %d/%d,",
                                                               "UP,", "no check," };
@@ -774,7 +782,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     /* errors : request, connect, response */
                                     ",%d,%d,"
                                     /* warnings: retries, redispatches */
-                                    ",%d,"
+                                    "%d,,"
                                     "",
                                     px->id, sv->id,
                                     sv->nbpend, sv->nbpend_max,
@@ -798,14 +806,15 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     (sv->state & SRV_BACKUP) ? 0 : 1,
                                     (sv->state & SRV_BACKUP) ? 1 : 0);
 
-                               /* check failures : unique, fatal */
+                               /* check failures: unique, fatal; last change, total downtime */
                                if (sv->state & SRV_CHECKED)
                                        chunk_printf(&msg, sizeof(trash),
-                                            "%d,%d,\n",
-                                            sv->failed_checks, sv->down_trans);
+                                            "%d,%d,%d,%d,\n",
+                                            sv->failed_checks, sv->down_trans,
+                                            now.tv_sec - sv->last_change, srv_downtime(sv));
                                else
                                        chunk_printf(&msg, sizeof(trash),
-                                            ",,\n");
+                                            ",,,,\n");
                        }
                        if (buffer_write_chunk(rep, &msg) != 0)
                                return 0;
@@ -847,23 +856,29 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     "<td align=right></td><td align=right>%d</td><td align=right>%d</td>\n"
                                     /* warnings: retries, redispatches */
                                     "<td align=right>%d</td><td align=right>%d</td>"
-                                    /* server status : reflect backend status (up/down) : we display UP
+                                    /* backend status: reflect backend status (up/down): we display UP
                                      * if the backend has known working servers or if it has no server at
-                                     * all (eg: for stats). Tthen we display the total weight, number of
+                                     * all (eg: for stats). Then we display the total weight, number of
                                      * active and backups. */
-                                    "<td align=center>%s</td><td align=center>%d</td>"
-                                    "<td align=center>%d</td><td align=center>%d</td>"
-                                    /* rest of server: nothing */
-                                    "<td align=center colspan=2></td></tr>"
-                                    "",
+                                    "<td align=center nowrap>%s %s</td><td align=center>%d</td>"
+                                    "<td align=center>%d</td><td align=center>%d</td>",
                                     px->nbpend /* or px->totpend ? */, px->nbpend_max,
                                     px->beconn, px->beconn_max, px->fullconn, px->cum_beconn,
                                     px->bytes_in, px->bytes_out,
                                     px->denied_req, px->denied_resp,
                                     px->failed_conns, px->failed_resp,
                                     px->retries, px->redispatches,
+                                    human_time(now.tv_sec - px->last_change, 1),
                                     (px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN",
                                     px->srv_map_sz * gcd, px->srv_act, px->srv_bck);
+
+                               chunk_printf(&msg, sizeof(trash),
+                                    /* rest of backend: nothing, down transitions, total downtime */
+                                    "<td align=center>&nbsp;</td><td align=\"right\">%d</td>"
+                                    "<td align=\"right\" nowrap>%s</td>"
+                                    "</tr>",
+                                    px->down_trans,
+                                    px->srv?human_time(be_downtime(px), 1):"&nbsp;");
                        } else {
                                chunk_printf(&msg, sizeof(trash),
                                     /* pxid, name */
@@ -880,14 +895,15 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     ",%d,%d,"
                                     /* warnings: retries, redispatches */
                                     "%d,%d,"
-                                    /* server status : reflect backend status (up/down) : we display UP
+                                    /* backend status: reflect backend status (up/down): we display UP
                                      * if the backend has known working servers or if it has no server at
-                                     * all (eg: for stats). Tthen we display the total weight, number of
+                                     * all (eg: for stats). Then we display the total weight, number of
                                      * active and backups. */
                                     "%s,"
                                     "%d,%d,%d,"
-                                    /* rest of server: nothing */
-                                    ",,"
+                                    /* rest of backend: nothing, down transitions,
+                                     * last change, total downtime. */
+                                    ",%d,%d,%d,"
                                     "\n",
                                     px->id,
                                     px->nbpend /* or px->totpend ? */, px->nbpend_max,
@@ -897,7 +913,9 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
                                     px->failed_conns, px->failed_resp,
                                     px->retries, px->redispatches,
                                     (px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN",
-                                    px->srv_map_sz * gcd, px->srv_act, px->srv_bck);
+                                    px->srv_map_sz * gcd, px->srv_act, px->srv_bck,
+                                    px->down_trans, now.tv_sec - px->last_change,
+                                    px->srv?be_downtime(px):0);
                        }
                        if (buffer_write_chunk(rep, &msg) != 0)
                                return 0;
index 5b33d114b35e016de59b18119ae7d5c1dfb89de4..8b0fa1490b86b4c774ddb939eda9999d23a93269 100644 (file)
  */
 
 #include <common/config.h>
+#include <common/time.h>
+
 #include <types/backend.h>
 #include <types/proxy.h>
 #include <types/server.h>
 
+/*
+ * Returns the cumulated down time of server <s>, in seconds: the already
+ * recorded s->down_time, plus the duration of the outage still in progress
+ * when the server does not have SRV_RUNNING set. When s->last_change is not
+ * strictly in the past (eg: the clock went backwards), the elapsed part is
+ * ignored so that the reported total can never be lower than s->down_time.
+ */
+int srv_downtime(struct server *s) {
+
+       /* server is up, or elapsed time would be null/negative: nothing to add */
+       if ((s->state & SRV_RUNNING) || s->last_change >= now.tv_sec)
+               return s->down_time;
+
+       /* server is down since last_change: account for the running outage */
+       return now.tv_sec - s->last_change + s->down_time;
+}
 
 /*
  * Local variables:
index b80dca9f49fad8de53bed94310162453c7576550..bb98d98cf1254e2ff977567ec8b5da91136d1f65 100644 (file)
@@ -142,6 +142,40 @@ REGPRM2 int _tv_isgt(const struct timeval *tv1, const struct timeval *tv2)
        return __tv_isgt(tv1, tv2);
 }
 
+/*
+ * Formats duration <t> as a compact, space-less string and returns it.
+ * When <hz> is greater than 1, <t> is first divided by <hz>; the result is
+ * then split using the DAY/HOUR/MINUTE/SEC units. At most two units are
+ * printed, taking the most significant non-zero ones in the order days,
+ * hours, minutes, seconds (eg: "3d4h", "23h59m", "59m59s", "1d5m"). A null
+ * duration gives "0s". "?" is returned when <t> is negative or when <hz>
+ * is not strictly positive. The returned pointer designates a static
+ * buffer, so each call overwrites the result of the previous one.
+ */
+char *human_time(int t, short hz) {
+       /* sized after "24855d23h" (presumably the longest possible output
+        * for a 32-bit int), plus one spare byte.
+        */
+       static char rv[sizeof("24855d23h")+1];
+       char *out = rv;
+       int left = 2;                           /* units we may still print */
+       int days, hours, mins, rem;
+
+       if (unlikely(t < 0 || hz <= 0)) {
+               sprintf(out, "?");
+               return rv;
+       }
+
+       if (unlikely(hz > 1))
+               t /= hz;
+
+       days  = t / DAY;
+       hours = t % DAY / HOUR;
+       mins  = t % HOUR / MINUTE;
+       rem   = t % MINUTE;
+
+       if (days) {
+               out += sprintf(out, "%dd", days);
+               left--;
+       }
+
+       if (left && hours) {
+               out += sprintf(out, "%dh", hours);
+               left--;
+       }
+
+       if (left && mins) {
+               out += sprintf(out, "%dm", mins);
+               left--;
+       }
+
+       /* seconds are shown if a slot remains, and always for a null duration */
+       if ((left && rem) || !t)
+               out += sprintf(out, "%ds", rem / SEC);
+
+       return rv;
+}
+
 /*
  * Local variables:
  *  c-indent-level: 8