]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
MAJOR: checks: rework completely bogus state machine
authorWilly Tarreau <w@1wt.eu>
Fri, 23 Nov 2012 11:47:05 +0000 (12:47 +0100)
committerWilly Tarreau <w@1wt.eu>
Fri, 23 Nov 2012 11:47:05 +0000 (12:47 +0100)
The porting of checks to using connections was totally bogus. Some checks
were considered successful as soon as the connection was established,
regardless of any response. Some errors would be triggered upon recv
if polling was enabled for send or if the send channel was shut down.

Now the behaviour is much better. It would be cleaner to perform the
fd_delete() in wake_srv_chk() and to process failures and timeouts
separately, but this is already a good start.

include/types/server.h
src/checks.c
src/dumpstats.c

index 87ca0be39aa5fd8cd9e00ead9f3a79ab1500ff15..ac7ab7bacd9cb88115fe8c868a789432f721c89e 100644 (file)
@@ -60,6 +60,7 @@
 #define SRV_TPROXY_MASK        0x0700  /* bind to a non-local address to reach this server */
 #define SRV_SEND_PROXY 0x0800  /* this server talks the PROXY protocol */
 #define SRV_NON_STICK  0x1000  /* never add connections allocated to this server to a stick table */
+#define SRV_CHK_RUNNING 0x2000  /* a check is currently running on this server */
 
 /* function which act on servers need to return various errors */
 #define SRV_STATUS_OK       0   /* everything is OK. */
index f5b7f23f7d2c558325eb1f9cdb2c1eb37a93e2d9..24e6cb07e7fb6c96abe9f6a5a4d00a076c1a5a98 100644 (file)
@@ -765,7 +765,7 @@ static void event_srv_chk_w(struct connection *conn)
        int fd = conn->t.sock.fd;
        struct task *t = s->check.task;
 
-       if (conn->flags & (CO_FL_SOCK_WR_SH | CO_FL_DATA_WR_SH | CO_FL_WAIT_DATA | CO_FL_WAIT_WR))
+       if (conn->flags & (CO_FL_SOCK_WR_SH | CO_FL_DATA_WR_SH))
                conn->flags |= CO_FL_ERROR;
 
        if (unlikely(conn->flags & CO_FL_ERROR)) {
@@ -779,7 +779,7 @@ static void event_srv_chk_w(struct connection *conn)
                goto out_error;
        }
 
-       if (conn->flags & CO_FL_HANDSHAKE)
+       if (conn->flags & (CO_FL_HANDSHAKE | CO_FL_WAIT_WR))
                return;
 
        /* here, we know that the connection is established */
@@ -791,25 +791,23 @@ static void event_srv_chk_w(struct connection *conn)
                                set_server_check_status(s, HCHK_STATUS_L4CON, strerror(errno));
                                goto out_wakeup;
                        }
+                       if (s->check.bo->o) {
+                               goto out_incomplete;
+                       }
                }
 
-               if (!s->check.bo->o) {
-                       /* full request sent, we allow up to <timeout.check> if nonzero for a response */
-                       if (s->proxy->timeout.check) {
-                               t->expire = tick_add_ifset(now_ms, s->proxy->timeout.check);
-                               task_queue(t);
-                       }
-                       goto out_nowake;
+               /* full request sent, we allow up to <timeout.check> if nonzero for a response */
+               if (s->proxy->timeout.check) {
+                       t->expire = tick_add_ifset(now_ms, s->proxy->timeout.check);
+                       task_queue(t);
                }
-               goto out_poll;
+               goto out_nowake;
        }
  out_wakeup:
        task_wakeup(t, TASK_WOKEN_IO);
  out_nowake:
        __conn_data_stop_send(conn);   /* nothing more to write */
-       return;
- out_poll:
-       __conn_data_poll_send(conn);
+ out_incomplete:
        return;
  out_error:
        conn->flags |= CO_FL_ERROR;
@@ -839,9 +837,6 @@ static void event_srv_chk_r(struct connection *conn)
        int done;
        unsigned short msglen;
 
-       if (conn->flags & (CO_FL_SOCK_WR_SH | CO_FL_DATA_WR_SH | CO_FL_WAIT_DATA | CO_FL_WAIT_WR))
-               conn->flags |= CO_FL_ERROR;
-
        if (unlikely((s->result & SRV_CHK_FAILED) || (conn->flags & CO_FL_ERROR))) {
                /* in case of TCP only, this tells us if the connection failed */
                if (!(s->result & SRV_CHK_FAILED))
@@ -850,7 +845,7 @@ static void event_srv_chk_r(struct connection *conn)
                goto out_wakeup;
        }
 
-       if (conn->flags & CO_FL_HANDSHAKE)
+       if (conn->flags & (CO_FL_HANDSHAKE | CO_FL_WAIT_RD))
                return;
 
        /* Warning! Linux returns EAGAIN on SO_ERROR if data are still available
@@ -1247,38 +1242,26 @@ static struct task *server_warmup(struct task *t)
  */
 static struct task *process_chk(struct task *t)
 {
-       int attempts = 0;
        struct server *s = t->context;
        struct connection *conn = s->check.conn;
-       int fd;
        int rv;
        int ret;
 
- new_chk:
-       if (attempts++ > 0) {
-               /* we always fail to create a server, let's stop insisting... */
-               while (tick_is_expired(t->expire, now_ms))
-                       t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
-               return t;
-       }
-
-       fd = conn->t.sock.fd;
-       if (fd < 0) {   /* no check currently running */
+       if (!(s->state & SRV_CHK_RUNNING)) {
+               /* no check currently running */
                if (!tick_is_expired(t->expire, now_ms)) /* woke up too early */
                        return t;
 
                /* we don't send any health-checks when the proxy is stopped or when
                 * the server should not be checked.
                 */
-               if (!(s->state & SRV_CHECKED) || s->proxy->state == PR_STSTOPPED || (s->state & SRV_MAINTAIN)) {
-                       while (tick_is_expired(t->expire, now_ms))
-                               t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
-                       return t;
-               }
+               if (!(s->state & SRV_CHECKED) || s->proxy->state == PR_STSTOPPED || (s->state & SRV_MAINTAIN))
+                       goto reschedule;
 
                /* we'll initiate a new check */
                set_server_check_status(s, HCHK_STATUS_START, NULL);
 
+               s->state |= SRV_CHK_RUNNING;
                s->check.bi->p = s->check.bi->data;
                s->check.bi->i = 0;
                s->check.bo->p = s->check.bo->data;
@@ -1337,21 +1320,6 @@ static struct task *process_chk(struct task *t)
 
                switch (ret) {
                case SN_ERR_NONE:
-                       break;
-               case SN_ERR_SRVTO: /* ETIMEDOUT */
-               case SN_ERR_SRVCL: /* ECONNREFUSED, ENETUNREACH, ... */
-                       set_server_check_status(s, HCHK_STATUS_L4CON, strerror(errno));
-                       break;
-               case SN_ERR_PRXCOND:
-               case SN_ERR_RESOURCE:
-               case SN_ERR_INTERNAL:
-                       set_server_check_status(s, HCHK_STATUS_SOCKERR, NULL);
-                       break;
-               }
-
-               if (s->result == SRV_CHK_UNKNOWN) {
-                       /* connection attempt was started */
-
                        /* we allow up to min(inter, timeout.connect) for a connection
                         * to establish but only when timeout.check is set
                         * as it may be to short for a full check otherwise
@@ -1362,12 +1330,22 @@ static struct task *process_chk(struct task *t)
                                int t_con = tick_add(now_ms, s->proxy->timeout.connect);
                                t->expire = tick_first(t->expire, t_con);
                        }
-                       return t;
+                       goto reschedule;
+
+               case SN_ERR_SRVTO: /* ETIMEDOUT */
+               case SN_ERR_SRVCL: /* ECONNREFUSED, ENETUNREACH, ... */
+                       set_server_check_status(s, HCHK_STATUS_L4CON, strerror(errno));
+                       break;
+               case SN_ERR_PRXCOND:
+               case SN_ERR_RESOURCE:
+               case SN_ERR_INTERNAL:
+                       set_server_check_status(s, HCHK_STATUS_SOCKERR, NULL);
+                       break;
                }
 
-               /* here, we have seen a failure */
+               /* here, we have seen a synchronous error, no fd was allocated */
 
-               conn->t.sock.fd = -1; /* report that no check is running anymore */
+               s->state &= ~SRV_CHK_RUNNING;
                if (s->health > s->rise) {
                        s->health--; /* still good */
                        s->counters.failed_checks++;
@@ -1375,7 +1353,6 @@ static struct task *process_chk(struct task *t)
                else
                        set_server_down(s);
 
-               //fprintf(stderr, "process_chk: 7, %lu\n", __tv_to_ms(&s->proxy->timeout.connect));
                /* we allow up to min(inter, timeout.connect) for a connection
                 * to establish but only when timeout.check is set
                 * as it may be to short for a full check otherwise
@@ -1389,7 +1366,6 @@ static struct task *process_chk(struct task *t)
                        if (s->proxy->timeout.check)
                                t->expire = tick_first(t->expire, t_con);
                }
-               goto new_chk;
        }
        else {
                /* there was a test running.
@@ -1397,21 +1373,21 @@ static struct task *process_chk(struct task *t)
                 * which can happen on connect timeout or error.
                 */
                if (s->result == SRV_CHK_UNKNOWN) {
-                       if (conn->flags & CO_FL_CONNECTED) {
-                               /* good TCP connection is enough */
+                       if ((conn->flags & CO_FL_CONNECTED) && !(s->proxy->options2 & PR_O2_CHK_ANY)) {
+                               /* good connection is enough for pure TCP check */
                                if (s->check.use_ssl)
                                        set_server_check_status(s, HCHK_STATUS_L6OK, NULL);
                                else
                                        set_server_check_status(s, HCHK_STATUS_L4OK, NULL);
                        }
-                       else if (conn->flags & CO_FL_WAIT_L4_CONN) {
+                       else if ((conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L4_CONN)) == CO_FL_WAIT_L4_CONN) {
                                /* L4 failed */
                                if (conn->flags & CO_FL_ERROR)
                                        set_server_check_status(s, HCHK_STATUS_L4CON, NULL);
                                else
                                        set_server_check_status(s, HCHK_STATUS_L4TOUT, NULL);
                        }
-                       else if (conn->flags & CO_FL_WAIT_L6_CONN) {
+                       else if ((conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L6_CONN)) == CO_FL_WAIT_L6_CONN) {
                                /* L6 failed */
                                if (conn->flags & CO_FL_ERROR)
                                        set_server_check_status(s, HCHK_STATUS_L6RSP, NULL);
@@ -1437,10 +1413,11 @@ static struct task *process_chk(struct task *t)
 
                                set_server_up(s);
                        }
+
+                       s->state &= ~SRV_CHK_RUNNING;
                        conn_xprt_close(conn);
                        if (conn->ctrl)
-                               fd_delete(fd);
-                       conn->t.sock.fd = -1; /* no check running anymore */
+                               fd_delete(conn->t.sock.fd);
 
                        rv = 0;
                        if (global.spread_checks > 0) {
@@ -1448,7 +1425,6 @@ static struct task *process_chk(struct task *t)
                                rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
                        }
                        t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(s) + rv));
-                       goto new_chk;
                }
                else if ((s->result & SRV_CHK_FAILED) || tick_is_expired(t->expire, now_ms)) {
                        if (!(s->result & SRV_CHK_FAILED)) {
@@ -1469,10 +1445,11 @@ static struct task *process_chk(struct task *t)
                        }
                        else
                                set_server_down(s);
+
+                       s->state &= ~SRV_CHK_RUNNING;
                        conn_xprt_close(conn);
                        if (conn->ctrl)
-                               fd_delete(fd);
-                       conn->t.sock.fd = -1;
+                               fd_delete(conn->t.sock.fd);
 
                        rv = 0;
                        if (global.spread_checks > 0) {
@@ -1480,11 +1457,16 @@ static struct task *process_chk(struct task *t)
                                rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
                        }
                        t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(s) + rv));
-                       goto new_chk;
                }
-               /* if result is unknown and there's no timeout, we have to wait again */
+               else {
+                       /* if result is unknown and there's no timeout, we have to wait again */
+                       s->result = SRV_CHK_UNKNOWN;
+               }
        }
-       s->result = SRV_CHK_UNKNOWN;
+
+ reschedule:
+       while (tick_is_expired(t->expire, now_ms))
+               t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
        return t;
 }
 
index 17bf973c6241fdd39ad16b59c400e7755e2e2fe5..988f94c7fd71a4574721d88f0fbc991699c0cf21 100644 (file)
@@ -2933,7 +2933,7 @@ static int stats_dump_proxy(struct stream_interface *si, struct proxy *px, struc
                                        }
 
                                        chunk_appendf(&trash, "\"><u> %s%s",
-                                               tv_iszero(&sv->check.start)?"":"* ",
+                                               (sv->state & SRV_CHK_RUNNING) ? "" : "* ",
                                                get_check_status_info(sv->check.status));
 
                                        if (sv->check.status >= HCHK_STATUS_L57DATA)