From: Willy Tarreau Date: Mon, 30 Apr 2007 12:37:43 +0000 (+0200) Subject: [BUG] fixed connection establishment detection X-Git-Tag: v1.3.10~24 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6996e15e160f37fa3ebc660b3ea766f3f7019b84;p=thirdparty%2Fhaproxy.git [BUG] fixed connection establishment detection Since the introduction of speculative I/O, it was not always possible to correctly detect a connection establishment. Particularly, in TCP mode, there is no data to send and getsockopt() returns no error. The solution consists in trying a connect() again to get its diagnostic. --- diff --git a/doc/internals/connect-status.txt b/doc/internals/connect-status.txt new file mode 100644 index 0000000000..70bbcc560f --- /dev/null +++ b/doc/internals/connect-status.txt @@ -0,0 +1,28 @@ +Normally, we should use getsockopt(fd, SOL_SOCKET, SO_ERROR) on a pending +connect() to detect whether the connection correctly established or not. + +Unfortunately, getsockopt() does not report the status of a pending connection, +which means that it returns 0 if the connection is still pending. This has to +be expected, because as the name implies it, it only returns errors. + +With the speculative I/O, a new problem was introduced : if we pretend the +socket was indicated as ready and we go to the socket's write() function, +a pending connection will then inevitably be identified as established. + +In fact, there are solutions to this issue : + + - send() returns -EAGAIN if it cannot write, so that as long as there are + pending data in the buffer, we'll be informed about the status of the + connection + + - connect() on an already pending connection will return -1 with errno set to + one of the following values : + - EALREADY : connection already in progress + - EISCONN : connection already established + - anything else will indicate an error. + +=> So instead of using getsockopt() on a pending connection with no data, we + will switch to connect(). This implies that the connection address must be + known within the socket's write() function. + + diff --git a/src/checks.c b/src/checks.c index df16cd1bd6..14f0c8cc88 100644 --- a/src/checks.c +++ b/src/checks.c @@ -110,22 +110,14 @@ static void set_server_down(struct server *s) */ static int event_srv_chk_w(int fd) { - __label__ out_wakeup, out_nowake; + __label__ out_wakeup, out_nowake, out_poll, out_error; struct task *t = fdtab[fd].owner; struct server *s = t->context; - int skerr; - socklen_t lskerr = sizeof(skerr); - skerr = 1; - if (unlikely(fdtab[fd].state == FD_STERROR || - (fdtab[fd].ev & FD_POLL_ERR) || - (getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr) == -1) || - (skerr != 0))) { - /* in case of TCP only, this tells us if the connection failed */ - s->result = -1; - fdtab[fd].state = FD_STERROR; - goto out_wakeup; - } + if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) + goto out_error; + + /* here, we know that the connection is established */ if (s->result != -1) { /* we don't want to mark 'UP' a server on which we detected an error earlier */ @@ -151,19 +143,39 @@ static int event_srv_chk_w(int fd) EV_FD_SET(fd, DIR_RD); /* prepare for reading reply */ goto out_nowake; } - else if (ret == 0 || errno == EAGAIN) { - /* we want some polling to happen first */ - fdtab[fd].ev &= ~FD_POLL_WR; - return 0; - } - else { - s->result = -1; - EV_FD_CLR(fd, DIR_WR); - } + else if (ret == 0 || errno == EAGAIN) + goto out_poll; + else + goto out_error; } else { + /* We have no data to send to check the connection, and + * getsockopt() will not inform us whether the connection + * is still pending. So we'll reuse connect() to check the + * state of the socket. This has the advantage of givig us + * the following info : + * - error + * - connecting (EALREADY, EINPROGRESS) + * - connected (EISCONN, 0) + */ + + struct sockaddr_in sa; + + sa = (s->check_addr.sin_addr.s_addr) ? s->check_addr : s->addr; + sa.sin_port = htons(s->check_port); + + if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0) + errno = 0; + + if (errno == EALREADY || errno == EINPROGRESS) + goto out_poll; + + if (errno && errno != EISCONN) + goto out_error; + /* good TCP connection is enough */ s->result = 1; + goto out_wakeup; } } out_wakeup: @@ -172,6 +184,15 @@ static int event_srv_chk_w(int fd) EV_FD_CLR(fd, DIR_WR); /* nothing more to write */ fdtab[fd].ev &= ~FD_POLL_WR; return 1; + out_poll: + /* The connection is still pending. We'll have to poll it + * before attempting to go further. */ + fdtab[fd].ev &= ~FD_POLL_WR; + return 0; + out_error: + s->result = -1; + fdtab[fd].state = FD_STERROR; + goto out_wakeup; } @@ -221,13 +242,16 @@ static int event_srv_chk_r(int fd) return 0; } - if (((s->proxy->options & PR_O_HTTP_CHK) && - (len >= sizeof("HTTP/1.0 000")) && - !memcmp(reply, "HTTP/1.", 7) && - (reply[9] == '2' || reply[9] == '3')) /* 2xx or 3xx */ - || ((s->proxy->options & PR_O_SSL3_CHK) && (len >= 5) && - (reply[0] == 0x15 || reply[0] == 0x16))) /* alert or handshake */ + if ((s->proxy->options & PR_O_HTTP_CHK) && (len >= sizeof("HTTP/1.0 000")) && + (memcmp(reply, "HTTP/1.", 7) == 0) && (reply[9] == '2' || reply[9] == '3')) { + /* HTTP/1.X 2xx or 3xx */ result = 1; + } + else if ((s->proxy->options & PR_O_SSL3_CHK) && (len >= 5) && + (reply[0] == 0x15 || reply[0] == 0x16)) { + /* SSLv3 alert or handshake */ + result = 1; + } if (result == -1) fdtab[fd].state = FD_STERROR; diff --git a/src/stream_sock.c b/src/stream_sock.c index ce04672efd..e420ace348 100644 --- a/src/stream_sock.c +++ b/src/stream_sock.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -41,7 +42,7 @@ * otherwise. */ int stream_sock_read(int fd) { - __label__ out_wakeup; + __label__ out_eternity, out_wakeup, out_error; struct buffer *b = fdtab[fd].cb[DIR_RD].b; int ret, max, retval; int read_poll = MAX_READ_POLL_LOOPS; @@ -52,21 +53,19 @@ int stream_sock_read(int fd) { retval = 1; - if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) { - /* read/write error */ - b->flags |= BF_READ_ERROR; - fdtab[fd].state = FD_STERROR; - goto out_wakeup; - } - if (unlikely(fdtab[fd].ev & FD_POLL_HUP)) { /* connection closed */ b->flags |= BF_READ_NULL; - goto out_wakeup; + goto out_eternity; + } + else if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) { + goto out_error; } - retval = 0; - while (read_poll-- > 0) { + while (1) { + /* + * 1. compute the maximum block size we can read at once. + */ if (b->l == 0) { /* let's realign the buffer to optimize I/O */ b->r = b->w = b->lr = b->data; max = b->rlim - b->data; @@ -84,11 +83,17 @@ int stream_sock_read(int fd) { max = b->rlim - b->data; } - if (max == 0) { /* not anymore room to store data */ + if (unlikely(max == 0)) { + /* Not anymore room to store data. This should theorically + * never happen, but better safe than sorry ! + */ EV_FD_CLR(fd, DIR_RD); - break; + goto out_eternity; } + /* + * 2. read the largest possible block + */ #ifndef MSG_NOSIGNAL { int skerr; @@ -107,7 +112,6 @@ int stream_sock_read(int fd) { b->r += ret; b->l += ret; b->flags |= BF_PARTIAL_READ; - retval = 1; if (b->r == b->data + BUFSIZE) { b->r = b->data; /* wrap around the buffer */ @@ -115,6 +119,14 @@ int stream_sock_read(int fd) { b->total += ret; + if (b->l == b->rlim - b->data) { + /* The buffer is now full, there's no point in going through + * the loop again. + */ + EV_FD_CLR(fd, DIR_RD); + goto out_eternity; + } + /* generally if we read something smaller than the 1 or 2 MSS, * it means that it's not worth trying to read again. It may * also happen on headers, but the application then can stop @@ -123,44 +135,57 @@ int stream_sock_read(int fd) { if (ret < MIN_RET_FOR_READ_LOOP) break; - if (!read_poll) + if (--read_poll <= 0) break; - /* we hope to read more data or to get a close on next round */ - continue; } else if (ret == 0) { + /* connection closed */ b->flags |= BF_READ_NULL; - retval = 1; // connection closed - break; + goto out_eternity; } else if (errno == EAGAIN) { /* Ignore EAGAIN but inform the poller that there is - * nothing to read left. + * nothing to read left. But we may have done some work + * justifying to notify the task. */ retval = 0; break; } else { - retval = 1; - b->flags |= BF_READ_ERROR; - fdtab[fd].state = FD_STERROR; - break; + goto out_error; } - } /* while (read_poll) */ + } /* while (1) */ + + /* + * The only way to get out of this loop is to have stopped reading + * without any error nor close, either by limiting the number of + * loops, or because of an EAGAIN. We only rearm the timer if we + * have at least read something. + */ - if (b->flags & BF_READ_STATUS) { - out_wakeup: - if (b->rto && EV_FD_ISSET(fd, DIR_RD)) + if (b->flags & BF_PARTIAL_READ) { + if (b->rto) { tv_ms_add(&b->rex, &now, b->rto); - else - tv_eternity(&b->rex); - - task_wakeup(fdtab[fd].owner); + goto out_wakeup; + } + out_eternity: + tv_eternity(&b->rex); } + out_wakeup: + if (b->flags & BF_READ_STATUS) + task_wakeup(fdtab[fd].owner); fdtab[fd].ev &= ~FD_POLL_RD; return retval; + + out_error: + /* There was an error. we must wakeup the task. No need to clear + * the events, the task will do it. + */ + fdtab[fd].state = FD_STERROR; + b->flags |= BF_READ_ERROR; + goto out_eternity; } @@ -171,7 +196,7 @@ int stream_sock_read(int fd) { * otherwise. */ int stream_sock_write(int fd) { - __label__ out_eternity; + __label__ out_eternity, out_wakeup, out_error; struct buffer *b = fdtab[fd].cb[DIR_WR].b; int ret, max, retval; int write_poll = MAX_WRITE_POLL_LOOPS; @@ -181,17 +206,10 @@ int stream_sock_write(int fd) { #endif retval = 1; + if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) + goto out_error; - if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) { - /* read/write error */ - b->flags |= BF_WRITE_ERROR; - fdtab[fd].state = FD_STERROR; - EV_FD_CLR(fd, DIR_WR); - goto out_eternity; - } - - retval = 0; - while (write_poll-- > 0) { + while (1) { if (b->l == 0) { /* let's realign the buffer to optimize I/O */ b->r = b->w = b->lr = b->data; max = 0; @@ -205,24 +223,41 @@ int stream_sock_write(int fd) { if (max == 0) { /* may be we have received a connection acknowledgement in TCP mode without data */ - if (!(b->flags & BF_PARTIAL_WRITE) - && fdtab[fd].state == FD_STCONN) { - int skerr; - socklen_t lskerr = sizeof(skerr); - ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr); - if (ret == -1 || skerr) { - b->flags |= BF_WRITE_ERROR; - fdtab[fd].state = FD_STERROR; - EV_FD_CLR(fd, DIR_WR); - retval = 1; - goto out_eternity; + if (likely(fdtab[fd].state == FD_STCONN)) { + struct session *s = fdtab[fd].owner->context; + + /* We have no data to send to check the connection, and + * getsockopt() will not inform us whether the connection + * is still pending. So we'll reuse connect() to check the + * state of the socket. This has the advantage of givig us + * the following info : + * - error + * - connecting (EALREADY, EINPROGRESS) + * - connected (EISCONN, 0) + */ + if ((connect(fd, (struct sockaddr *)&s->srv_addr, sizeof(s->srv_addr)) == 0)) + errno = 0; + + if (errno == EALREADY || errno == EINPROGRESS) { + retval = 0; + goto out_wakeup; } + + if (errno && errno != EISCONN) + goto out_error; + + /* OK we just need to indicate that we got a connection + * and that we wrote nothing. + */ + b->flags |= BF_WRITE_NULL; + fdtab[fd].state = FD_STREADY; } - b->flags |= BF_WRITE_NULL; - fdtab[fd].state = FD_STREADY; + /* Funny, we were called to write something but there wasn't + * anything. Theorically we cannot get there, but just in case, + * let's disable the write event and pretend we never came there. + */ EV_FD_CLR(fd, DIR_WR); - retval = 1; goto out_eternity; } @@ -246,37 +281,40 @@ int stream_sock_write(int fd) { b->w += ret; b->flags |= BF_PARTIAL_WRITE; - retval = 1; if (b->w == b->data + BUFSIZE) { b->w = b->data; /* wrap around the buffer */ } - if (!write_poll) - break; + if (!b->l) { + EV_FD_CLR(fd, DIR_WR); + goto out_eternity; + } - /* we hope to be able to write more data */ - continue; - } - else if (ret == 0) { - /* nothing written, just pretend we were never called */ - retval = 0; - break; + if (--write_poll <= 0) + break; } - else if (errno == EAGAIN) {/* ignore EAGAIN */ + else if (ret == 0 || errno == EAGAIN) { + /* nothing written, just pretend we were never called + * and wait for the socket to be ready. But we may have + * done some work justifying to notify the task. + */ retval = 0; break; } else { - b->flags |= BF_WRITE_ERROR; - fdtab[fd].state = FD_STERROR; - EV_FD_CLR(fd, DIR_WR); - retval = 1; - goto out_eternity; + goto out_error; } - } /* while (write_poll) */ + } /* while (1) */ - if (b->flags & BF_WRITE_STATUS) { + /* + * The only way to get out of this loop is to have stopped writing + * without any error, either by limiting the number of loops, or + * because of an EAGAIN. We only rearm the timer if we have at least + * written something. + */ + + if (b->flags & BF_PARTIAL_WRITE) { if (b->wto) { tv_ms_add(&b->wex, &now, b->wto); /* FIXME: to prevent the client from expiring read timeouts during writes, @@ -284,16 +322,27 @@ int stream_sock_write(int fd) { * unique one, although that needs some study particularly on full-duplex * TCP connections. */ b->rex = b->wex; + goto out_wakeup; } - else { - out_eternity: - tv_eternity(&b->wex); - } + out_eternity: + tv_eternity(&b->wex); } - task_wakeup(fdtab[fd].owner); + out_wakeup: + if (b->flags & BF_WRITE_STATUS) + task_wakeup(fdtab[fd].owner); fdtab[fd].ev &= ~FD_POLL_WR; return retval; + + out_error: + /* There was an error. we must wakeup the task. No need to clear + * the events, the task will do it. + */ + fdtab[fd].state = FD_STERROR; + b->flags |= BF_WRITE_ERROR; + goto out_eternity; + + }