int fd = conn->t.sock.fd;
fd_insert(fd);
+ /* mark the fd as ready so as not to needlessly poll at the beginning */
+ fd_may_recv(fd);
+ fd_may_send(fd);
fdtab[fd].owner = conn;
fdtab[fd].iocb = conn_fd_handler;
conn->flags |= CO_FL_CTRL_READY;
if ((conn->flags & CO_FL_CTRL_READY) && conn->ctrl) {
unsigned int flags = conn->flags & ~(CO_FL_CURR_RD_ENA | CO_FL_CURR_WR_ENA);
- if (fd_ev_is_set(conn->t.sock.fd, DIR_RD))
+ if (fd_recv_active(conn->t.sock.fd))
flags |= CO_FL_CURR_RD_ENA;
- if (fd_ev_is_set(conn->t.sock.fd, DIR_WR))
+ if (fd_send_active(conn->t.sock.fd))
flags |= CO_FL_CURR_WR_ENA;
conn->flags = flags;
}
* include/proto/fd.h
* File descriptors states.
*
- * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu
+ * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
}
}
+/* Automatically allocates or releases a cache entry for fd <fd> depending on
+ * its new state. This is meant to be used by pollers while processing updates.
+ */
+static inline void fd_alloc_or_release_cache_entry(int fd, int new_state)
+{
+ /* READY and ACTIVE states (the two with both flags set) require a cache entry */
+
+ if (((new_state & (FD_EV_READY_R | FD_EV_ACTIVE_R)) == (FD_EV_READY_R | FD_EV_ACTIVE_R)) ||
+ ((new_state & (FD_EV_READY_W | FD_EV_ACTIVE_W)) == (FD_EV_READY_W | FD_EV_ACTIVE_W))) {
+ fd_alloc_cache_entry(fd);
+ }
+ else {
+ fd_release_cache_entry(fd);
+ }
+}
+
/*
- * Returns non-zero if <fd> is already monitored for events in direction <dir>.
+ * returns the FD's recv state (FD_EV_*)
*/
-static inline int fd_ev_is_set(const int fd, int dir)
+static inline int fd_recv_state(const int fd)
{
- return ((unsigned)fdtab[fd].state >> dir) & FD_EV_STATUS;
+ return ((unsigned)fdtab[fd].state >> (4 * DIR_RD)) & FD_EV_STATUS;
}
-/* Disable processing of events on fd <fd> for direction <dir>. Note: this
- * function was optimized to be used with a constant for <dir>.
+/*
+ * returns true if the FD is active for recv
*/
-static inline void fd_ev_clr(const int fd, int dir)
+static inline int fd_recv_active(const int fd)
{
- unsigned int i = ((unsigned int)fdtab[fd].state) & (FD_EV_STATUS << dir);
- if (i == 0)
- return; /* already disabled */
- fdtab[fd].state ^= i;
- updt_fd(fd); /* need an update entry to change the state */
+ return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_R;
}
-/* Enable polling for events on fd <fd> for direction <dir>. Note: this
- * function was optimized to be used with a constant for <dir>.
+/*
+ * returns true if the FD is ready for recv
*/
-static inline void fd_ev_wai(const int fd, int dir)
+static inline int fd_recv_ready(const int fd)
{
- unsigned int i = ((unsigned int)fdtab[fd].state) & (FD_EV_STATUS << dir);
- if (i == (FD_EV_POLLED << dir))
- return; /* already in desired state */
- fdtab[fd].state ^= i ^ (FD_EV_POLLED << dir);
- updt_fd(fd); /* need an update entry to change the state */
+ return (unsigned)fdtab[fd].state & FD_EV_READY_R;
+}
+
+/*
+ * returns true if the FD is polled for recv
+ */
+static inline int fd_recv_polled(const int fd)
+{
+ return (unsigned)fdtab[fd].state & FD_EV_POLLED_R;
+}
+
+/*
+ * returns the FD's send state (FD_EV_*)
+ */
+static inline int fd_send_state(const int fd)
+{
+ return ((unsigned)fdtab[fd].state >> (4 * DIR_WR)) & FD_EV_STATUS;
+}
+
+/*
+ * returns true if the FD is active for send
+ */
+static inline int fd_send_active(const int fd)
+{
+ return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_W;
}
-/* Enable processing of events on fd <fd> for direction <dir>. Note: this
- * function was optimized to be used with a constant for <dir>.
+/*
+ * returns true if the FD is ready for send
*/
-static inline void fd_ev_set(int fd, int dir)
+static inline int fd_send_ready(const int fd)
{
- unsigned int i = ((unsigned int)fdtab[fd].state) & (FD_EV_STATUS << dir);
+ return (unsigned)fdtab[fd].state & FD_EV_READY_W;
+}
- /* note that we don't care about disabling the polled state when
- * enabling the active state, since it brings no benefit but costs
- * some syscalls.
- */
- if (i & (FD_EV_ACTIVE << dir))
- return; /* already in desired state */
- fdtab[fd].state |= (FD_EV_ACTIVE << dir);
+/*
+ * returns true if the FD is polled for send
+ */
+static inline int fd_send_polled(const int fd)
+{
+ return (unsigned)fdtab[fd].state & FD_EV_POLLED_W;
+}
+
+/* Disable processing recv events on fd <fd> */
+static inline void fd_stop_recv(int fd)
+{
+ if (!((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_R))
+ return; /* already disabled */
+ fdtab[fd].state &= ~FD_EV_ACTIVE_R;
updt_fd(fd); /* need an update entry to change the state */
}
-/* Disable processing of events on fd <fd> for both directions. */
-static inline void fd_ev_rem(const int fd)
+/* Disable processing send events on fd <fd> */
+static inline void fd_stop_send(int fd)
{
- unsigned int i = ((unsigned int)fdtab[fd].state) & FD_EV_CURR_MASK;
- if (i == 0)
+ if (!((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_W))
return; /* already disabled */
- fdtab[fd].state ^= i;
+ fdtab[fd].state &= ~FD_EV_ACTIVE_W;
updt_fd(fd); /* need an update entry to change the state */
}
-/* event manipulation primitives for use by I/O callbacks */
-static inline void fd_want_recv(int fd)
+/* Disable processing of events on fd <fd> for both directions. */
+static inline void fd_stop_both(int fd)
{
- return fd_ev_set(fd, DIR_RD);
+ if (!((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_RW))
+ return; /* already disabled */
+ fdtab[fd].state &= ~FD_EV_ACTIVE_RW;
+ updt_fd(fd); /* need an update entry to change the state */
}
-static inline void fd_stop_recv(int fd)
+/* Report that FD <fd> cannot receive anymore without polling (EAGAIN detected). */
+static inline void fd_cant_recv(const int fd)
{
- return fd_ev_clr(fd, DIR_RD);
+ if (!(((unsigned int)fdtab[fd].state) & FD_EV_READY_R))
+ return; /* already marked as blocked */
+ fdtab[fd].state &= ~FD_EV_READY_R;
+ updt_fd(fd);
}
-static inline void fd_poll_recv(int fd)
+/* Report that FD <fd> can receive again without polling. */
+static inline void fd_may_recv(const int fd)
{
- return fd_ev_wai(fd, DIR_RD);
+ if (((unsigned int)fdtab[fd].state) & FD_EV_READY_R)
+ return; /* already marked as ready */
+ fdtab[fd].state |= FD_EV_READY_R;
+ updt_fd(fd);
}
-static inline void fd_want_send(int fd)
+/* Report that FD <fd> cannot send anymore without polling (EAGAIN detected). */
+static inline void fd_cant_send(const int fd)
{
- return fd_ev_set(fd, DIR_WR);
+ if (!(((unsigned int)fdtab[fd].state) & FD_EV_READY_W))
+ return; /* already marked as blocked */
+ fdtab[fd].state &= ~FD_EV_READY_W;
+ updt_fd(fd);
}
-static inline void fd_stop_send(int fd)
+/* Report that FD <fd> can send again without polling. */
+static inline void fd_may_send(const int fd)
{
- return fd_ev_clr(fd, DIR_WR);
+ if (((unsigned int)fdtab[fd].state) & FD_EV_READY_W)
+ return; /* already marked as ready */
+ fdtab[fd].state |= FD_EV_READY_W;
+ updt_fd(fd);
}
-static inline void fd_poll_send(int fd)
+/* Prepare FD <fd> to try to receive */
+static inline void fd_want_recv(int fd)
{
- return fd_ev_wai(fd, DIR_WR);
+ if (((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_R))
+ return; /* already enabled */
+ fdtab[fd].state |= FD_EV_ACTIVE_R;
+ updt_fd(fd); /* need an update entry to change the state */
}
-static inline void fd_stop_both(int fd)
+/* Prepare FD <fd> to try to send */
+static inline void fd_want_send(int fd)
{
- return fd_ev_rem(fd);
+ if (((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_W))
+ return; /* already enabled */
+ fdtab[fd].state |= FD_EV_ACTIVE_W;
+ updt_fd(fd); /* need an update entry to change the state */
}
/* Prepares <fd> for being polled */
/*
* include/types/fd.h
- * File descriptors states.
+ * File descriptors states - check src/fd.c for explanations.
*
- * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu
+ * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
#define FD_POLL_DATA (FD_POLL_IN | FD_POLL_OUT)
#define FD_POLL_STICKY (FD_POLL_ERR | FD_POLL_HUP)
-/* Event state for an FD in each direction, as found in the 4 lower bits of
- * fdtab[].state, and in the 4 next bits.
- */
#define FD_EV_ACTIVE 1U
+#define FD_EV_READY 2U
#define FD_EV_POLLED 4U
-#define FD_EV_STATUS (FD_EV_ACTIVE | FD_EV_POLLED)
+
+#define FD_EV_STATUS (FD_EV_ACTIVE | FD_EV_POLLED | FD_EV_READY)
#define FD_EV_STATUS_R (FD_EV_STATUS)
-#define FD_EV_STATUS_W (FD_EV_STATUS << 1)
+#define FD_EV_STATUS_W (FD_EV_STATUS << 4)
#define FD_EV_POLLED_R (FD_EV_POLLED)
-#define FD_EV_POLLED_W (FD_EV_POLLED << 1)
+#define FD_EV_POLLED_W (FD_EV_POLLED << 4)
#define FD_EV_POLLED_RW (FD_EV_POLLED_R | FD_EV_POLLED_W)
#define FD_EV_ACTIVE_R (FD_EV_ACTIVE)
-#define FD_EV_ACTIVE_W (FD_EV_ACTIVE << 1)
+#define FD_EV_ACTIVE_W (FD_EV_ACTIVE << 4)
#define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W)
-#define FD_EV_CURR_MASK 0x0FU
-#define FD_EV_PREV_MASK 0xF0U
+#define FD_EV_READY_R (FD_EV_READY)
+#define FD_EV_READY_W (FD_EV_READY << 4)
+#define FD_EV_READY_RW (FD_EV_READY_R | FD_EV_READY_W)
+
+enum fd_states {
+ FD_ST_DISABLED = 0,
+ FD_ST_MUSTPOLL,
+ FD_ST_STOPPED,
+ FD_ST_ACTIVE,
+ FD_ST_ABORT,
+ FD_ST_POLLED,
+ FD_ST_PAUSED,
+ FD_ST_READY
+};
/* info about one given fd */
struct fdtab {
int (*iocb)(int fd); /* I/O handler, returns FD_WAIT_* */
void *owner; /* the connection or listener associated with this fd, NULL if closed */
unsigned int cache; /* position+1 in the FD cache. 0=not in cache. */
- unsigned char state; /* FD state for read and write directions (4+4 bits) */
+ unsigned char state; /* FD state for read and write directions (2*3 bits) */
unsigned char ev; /* event seen in return of poll() : FD_POLL_* */
unsigned char new:1; /* 1 if this fd has just been created */
unsigned char updated:1; /* 1 if this fd is already in the update list */
return;
wait_more_data:
- __conn_data_poll_recv(conn);
+ __conn_data_want_recv(conn);
}
/*
/* update read status if needed */
if (unlikely((f & (CO_FL_DATA_RD_ENA|CO_FL_WAIT_RD)) == (CO_FL_DATA_RD_ENA|CO_FL_WAIT_RD))) {
- fd_poll_recv(c->t.sock.fd);
+ fd_want_recv(c->t.sock.fd);
+ fd_cant_recv(c->t.sock.fd);
f |= CO_FL_CURR_RD_ENA;
}
else if (unlikely((f & (CO_FL_CURR_RD_ENA|CO_FL_DATA_RD_ENA)) == CO_FL_DATA_RD_ENA)) {
/* update write status if needed */
if (unlikely((f & (CO_FL_DATA_WR_ENA|CO_FL_WAIT_WR)) == (CO_FL_DATA_WR_ENA|CO_FL_WAIT_WR))) {
- fd_poll_send(c->t.sock.fd);
+ fd_want_send(c->t.sock.fd);
+ fd_cant_send(c->t.sock.fd);
f |= CO_FL_CURR_WR_ENA;
}
else if (unlikely((f & (CO_FL_CURR_WR_ENA|CO_FL_DATA_WR_ENA)) == CO_FL_DATA_WR_ENA)) {
/* update read status if needed */
if (unlikely((f & (CO_FL_SOCK_RD_ENA|CO_FL_WAIT_RD)) == (CO_FL_SOCK_RD_ENA|CO_FL_WAIT_RD))) {
- fd_poll_recv(c->t.sock.fd);
+ fd_want_recv(c->t.sock.fd);
+ fd_cant_recv(c->t.sock.fd);
f |= CO_FL_CURR_RD_ENA;
}
else if (unlikely((f & (CO_FL_CURR_RD_ENA|CO_FL_SOCK_RD_ENA)) == CO_FL_SOCK_RD_ENA)) {
/* update write status if needed */
if (unlikely((f & (CO_FL_SOCK_WR_ENA|CO_FL_WAIT_WR)) == (CO_FL_SOCK_WR_ENA|CO_FL_WAIT_WR))) {
- fd_poll_send(c->t.sock.fd);
+ fd_want_send(c->t.sock.fd);
+ fd_cant_send(c->t.sock.fd);
f |= CO_FL_CURR_WR_ENA;
}
else if (unlikely((f & (CO_FL_CURR_WR_ENA|CO_FL_SOCK_WR_ENA)) == CO_FL_SOCK_WR_ENA)) {
/*
* FD polling functions for Linux epoll
*
- * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
#endif
/*
- * speculative epoll() poller
+ * Linux epoll() poller
*/
REGPRM2 static void _do_poll(struct poller *p, int exp)
{
/* first, scan the update list to find changes */
for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
fd = fd_updt[updt_idx];
- en = fdtab[fd].state & 15; /* new events */
- eo = fdtab[fd].state >> 4; /* previous events */
-
- if (fdtab[fd].owner && (eo ^ en)) {
- if ((eo ^ en) & FD_EV_POLLED_RW) {
- /* poll status changed */
- if ((en & FD_EV_POLLED_RW) == 0) {
- /* fd removed from poll list */
- opcode = EPOLL_CTL_DEL;
- }
- else if ((eo & FD_EV_POLLED_RW) == 0) {
- /* new fd in the poll list */
- opcode = EPOLL_CTL_ADD;
- }
- else {
- /* fd status changed */
- opcode = EPOLL_CTL_MOD;
- }
+ en = eo = fdtab[fd].state;
- /* construct the epoll events based on new state */
- ev.events = 0;
- if (en & FD_EV_POLLED_R)
- ev.events |= EPOLLIN | EPOLLRDHUP;
+ fdtab[fd].updated = 0;
+ fdtab[fd].new = 0;
- if (en & FD_EV_POLLED_W)
- ev.events |= EPOLLOUT;
+ if (!fdtab[fd].owner)
+ continue;
- ev.data.fd = fd;
- epoll_ctl(epoll_fd, opcode, fd, &ev);
- }
+ if (en & FD_EV_ACTIVE_R) {
+ if (!(en & FD_EV_READY_R))
+ en |= FD_EV_POLLED_R;
+ }
+ else
+ en &= ~FD_EV_POLLED_R;
- fdtab[fd].state = (en << 4) + en; /* save new events */
+ if (en & FD_EV_ACTIVE_W) {
+ if (!(en & FD_EV_READY_W))
+ en |= FD_EV_POLLED_W;
+ }
+ else
+ en &= ~FD_EV_POLLED_W;
- if (!(en & FD_EV_ACTIVE_RW)) {
- /* This fd doesn't use any active entry anymore, we can
- * kill its entry.
- */
- fd_release_cache_entry(fd);
+ if ((eo ^ en) & FD_EV_POLLED_RW) {
+ /* poll status changed */
+ fdtab[fd].state = en;
+
+ if ((en & FD_EV_POLLED_RW) == 0) {
+ /* fd removed from poll list */
+ opcode = EPOLL_CTL_DEL;
+ }
+ else if ((eo & FD_EV_POLLED_RW) == 0) {
+ /* new fd in the poll list */
+ opcode = EPOLL_CTL_ADD;
}
- else if ((en & ~eo) & FD_EV_ACTIVE_RW) {
- /* we need a new cache entry now */
- fd_alloc_cache_entry(fd);
+ else {
+ /* fd status changed */
+ opcode = EPOLL_CTL_MOD;
}
+
+ /* construct the epoll events based on new state */
+ ev.events = 0;
+ if (en & FD_EV_POLLED_R)
+ ev.events |= EPOLLIN | EPOLLRDHUP;
+
+ if (en & FD_EV_POLLED_W)
+ ev.events |= EPOLLOUT;
+
+ ev.data.fd = fd;
+ epoll_ctl(epoll_fd, opcode, fd, &ev);
}
- fdtab[fd].updated = 0;
- fdtab[fd].new = 0;
+
+ fd_alloc_or_release_cache_entry(fd, en);
}
fd_nbupdt = 0;
if (fdtab[fd].iocb) {
int new_updt, old_updt;
- /* Mark the events as speculative before processing
- * them so that if nothing can be done we don't need
- * to poll again.
- */
- if (fdtab[fd].ev & FD_POLL_IN)
- fd_ev_set(fd, DIR_RD);
+ /* Mark the events as ready before processing */
+ if (fdtab[fd].ev & (FD_POLL_IN | FD_POLL_HUP | FD_POLL_ERR))
+ fd_may_recv(fd);
- if (fdtab[fd].ev & FD_POLL_OUT)
- fd_ev_set(fd, DIR_WR);
+ if (fdtab[fd].ev & (FD_POLL_OUT | FD_POLL_ERR))
+ fd_may_send(fd);
- if (fdtab[fd].cache) {
- /* This fd was already scheduled for being called as a speculative I/O */
+ if (fdtab[fd].cache)
continue;
- }
/* Save number of updates to detect creation of new FDs. */
old_updt = fd_nbupdt;
fdtab[fd].new = 0;
fdtab[fd].ev &= FD_POLL_STICKY;
- if ((fdtab[fd].state & FD_EV_STATUS_R) == FD_EV_ACTIVE_R)
+ if ((fdtab[fd].state & FD_EV_STATUS_R) == (FD_EV_READY_R | FD_EV_ACTIVE_R))
fdtab[fd].ev |= FD_POLL_IN;
- if ((fdtab[fd].state & FD_EV_STATUS_W) == FD_EV_ACTIVE_W)
+ if ((fdtab[fd].state & FD_EV_STATUS_W) == (FD_EV_READY_W | FD_EV_ACTIVE_W))
fdtab[fd].ev |= FD_POLL_OUT;
if (fdtab[fd].ev && fdtab[fd].iocb && fdtab[fd].owner)
/* we can remove this update entry if it's the last one and is
* unused, otherwise we don't touch anything.
*/
- if (new_updt == fd_nbupdt && fdtab[fd].state == 0) {
+ if (new_updt == fd_nbupdt && !fd_recv_active(fd) && !fd_send_active(fd)) {
fdtab[fd].updated = 0;
fd_nbupdt--;
}
}
}
}
-
- /* the caller will take care of speculative events */
+ /* the caller will take care of cached events */
}
/*
- * Initialization of the speculative epoll() poller.
+ * Initialization of the epoll() poller.
* Returns 0 in case of failure, non-zero in case of success. If it fails, it
* disables the poller by setting its pref to 0.
*/
}
/*
- * Termination of the speculative epoll() poller.
+ * Termination of the epoll() poller.
* Memory is released and the poller is marked as unselectable.
*/
REGPRM1 static void _do_term(struct poller *p)
/*
* FD polling functions for FreeBSD kqueue()
*
- * Copyright 2000-2008 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Note: not knowing much about kqueue, I had to rely on OpenBSD's detailed man
- * page and to check how it was implemented in lighttpd to understand it better.
- * But it is possible that I got things wrong.
- *
*/
#include <unistd.h>
/* first, scan the update list to find changes */
for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
fd = fd_updt[updt_idx];
- en = fdtab[fd].state & 15; /* new events */
- eo = fdtab[fd].state >> 4; /* previous events */
+ en = eo = fdtab[fd].state;
+
+ fdtab[fd].updated = 0;
+ fdtab[fd].new = 0;
+
+ if (!fdtab[fd].owner)
+ continue;
+
+ if (en & FD_EV_ACTIVE_R) {
+ if (!(en & FD_EV_READY_R))
+ en |= FD_EV_POLLED_R;
+ }
+ else
+ en &= ~FD_EV_POLLED_R;
+
+ if (en & FD_EV_ACTIVE_W) {
+ if (!(en & FD_EV_READY_W))
+ en |= FD_EV_POLLED_W;
+ }
+ else
+ en &= ~FD_EV_POLLED_W;
+
+
+ if ((eo ^ en) & FD_EV_POLLED_RW) {
+ /* poll status changed */
+ fdtab[fd].state = en;
- if (fdtab[fd].owner && (eo ^ en)) {
if ((eo ^ en) & FD_EV_POLLED_R) {
/* read poll status changed */
if (en & FD_EV_POLLED_R) {
changes++;
}
}
-
- fdtab[fd].state = (en << 4) + en; /* save new events */
-
- if (!(en & FD_EV_ACTIVE_RW)) {
- /* This fd doesn't use any active entry anymore, we can
- * kill its entry.
- */
- fd_release_cache_entry(fd);
- }
- else if ((en & ~eo) & FD_EV_ACTIVE_RW) {
- /* we need a new cache entry now */
- fd_alloc_cache_entry(fd);
- }
}
- fdtab[fd].updated = 0;
- fdtab[fd].new = 0;
+
+ fd_alloc_or_release_cache_entry(fd, en);
}
if (changes)
kevent(kqueue_fd, kev, changes, NULL, 0, NULL);
}
if (fdtab[fd].iocb && fdtab[fd].ev) {
- /* Mark the events as speculative before processing
- * them so that if nothing can be done we don't need
- * to poll again.
- */
if (fdtab[fd].ev & FD_POLL_IN)
- fd_ev_set(fd, DIR_RD);
+ fd_may_recv(fd);
if (fdtab[fd].ev & FD_POLL_OUT)
- fd_ev_set(fd, DIR_WR);
+ fd_may_send(fd);
- if (fdtab[fd].cache) {
- /* This fd was already scheduled for being
- * called as a speculative I/O.
- */
+ if (fdtab[fd].cache)
continue;
- }
fdtab[fd].iocb(fd);
}
/*
* FD polling functions for generic poll()
*
- * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
/* first, scan the update list to find changes */
for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
fd = fd_updt[updt_idx];
- en = fdtab[fd].state & 15; /* new events */
- eo = fdtab[fd].state >> 4; /* previous events */
-
- if (fdtab[fd].owner && (eo ^ en)) {
- if ((eo ^ en) & FD_EV_POLLED_RW) {
- /* poll status changed, update the lists */
- if ((eo & ~en) & FD_EV_POLLED_R)
- hap_fd_clr(fd, fd_evts[DIR_RD]);
- else if ((en & ~eo) & FD_EV_POLLED_R)
- hap_fd_set(fd, fd_evts[DIR_RD]);
-
- if ((eo & ~en) & FD_EV_POLLED_W)
- hap_fd_clr(fd, fd_evts[DIR_WR]);
- else if ((en & ~eo) & FD_EV_POLLED_W)
- hap_fd_set(fd, fd_evts[DIR_WR]);
- }
-
- fdtab[fd].state = (en << 4) + en; /* save new events */
+ en = eo = fdtab[fd].state;
- if (!(en & FD_EV_ACTIVE_RW)) {
- /* This fd doesn't use any active entry anymore, we can
- * kill its entry.
- */
- fd_release_cache_entry(fd);
- }
- else if ((en & ~eo) & FD_EV_ACTIVE_RW) {
- /* we need a new cache entry now */
- fd_alloc_cache_entry(fd);
- }
- }
fdtab[fd].updated = 0;
fdtab[fd].new = 0;
+
+ if (!fdtab[fd].owner)
+ continue;
+
+ if (en & FD_EV_ACTIVE_R) {
+ if (!(en & FD_EV_READY_R))
+ en |= FD_EV_POLLED_R;
+ }
+ else
+ en &= ~FD_EV_POLLED_R;
+
+ if (en & FD_EV_ACTIVE_W) {
+ if (!(en & FD_EV_READY_W))
+ en |= FD_EV_POLLED_W;
+ }
+ else
+ en &= ~FD_EV_POLLED_W;
+
+ if ((eo ^ en) & FD_EV_POLLED_RW) {
+ /* poll status changed, update the lists */
+ fdtab[fd].state = en;
+
+ if ((eo & ~en) & FD_EV_POLLED_R)
+ hap_fd_clr(fd, fd_evts[DIR_RD]);
+ else if ((en & ~eo) & FD_EV_POLLED_R)
+ hap_fd_set(fd, fd_evts[DIR_RD]);
+
+ if ((eo & ~en) & FD_EV_POLLED_W)
+ hap_fd_clr(fd, fd_evts[DIR_WR]);
+ else if ((en & ~eo) & FD_EV_POLLED_W)
+ hap_fd_set(fd, fd_evts[DIR_WR]);
+ }
+
+ fd_alloc_or_release_cache_entry(fd, en);
}
fd_nbupdt = 0;
}
if (fdtab[fd].iocb && fdtab[fd].ev) {
- /* Mark the events as speculative before processing
- * them so that if nothing can be done we don't need
- * to poll again.
- */
if (fdtab[fd].ev & FD_POLL_IN)
- fd_ev_set(fd, DIR_RD);
+ fd_may_recv(fd);
if (fdtab[fd].ev & FD_POLL_OUT)
- fd_ev_set(fd, DIR_WR);
+ fd_may_send(fd);
- if (fdtab[fd].cache) {
- /* This fd was already scheduled for being
- * called as a speculative I/O
- */
+ if (fdtab[fd].cache)
continue;
- }
fdtab[fd].iocb(fd);
}
/*
* FD polling functions for generic select()
*
- * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
/* first, scan the update list to find changes */
for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
fd = fd_updt[updt_idx];
- en = fdtab[fd].state & 15; /* new events */
- eo = fdtab[fd].state >> 4; /* previous events */
-
- if (fdtab[fd].owner && (eo ^ en)) {
- if ((eo ^ en) & FD_EV_POLLED_RW) {
- /* poll status changed, update the lists */
- if ((eo & ~en) & FD_EV_POLLED_R)
- FD_CLR(fd, fd_evts[DIR_RD]);
- else if ((en & ~eo) & FD_EV_POLLED_R)
- FD_SET(fd, fd_evts[DIR_RD]);
-
- if ((eo & ~en) & FD_EV_POLLED_W)
- FD_CLR(fd, fd_evts[DIR_WR]);
- else if ((en & ~eo) & FD_EV_POLLED_W)
- FD_SET(fd, fd_evts[DIR_WR]);
- }
-
- fdtab[fd].state = (en << 4) + en; /* save new events */
+ en = eo = fdtab[fd].state;
- if (!(en & FD_EV_ACTIVE_RW)) {
- /* This fd doesn't use any active entry anymore, we can
- * kill its entry.
- */
- fd_release_cache_entry(fd);
- }
- else if ((en & ~eo) & FD_EV_ACTIVE_RW) {
- /* we need a new cache entry now */
- fd_alloc_cache_entry(fd);
- }
- }
fdtab[fd].updated = 0;
fdtab[fd].new = 0;
+
+ if (!fdtab[fd].owner)
+ continue;
+
+ if (en & FD_EV_ACTIVE_R) {
+ if (!(en & FD_EV_READY_R))
+ en |= FD_EV_POLLED_R;
+ }
+ else
+ en &= ~FD_EV_POLLED_R;
+
+ if (en & FD_EV_ACTIVE_W) {
+ if (!(en & FD_EV_READY_W))
+ en |= FD_EV_POLLED_W;
+ }
+ else
+ en &= ~FD_EV_POLLED_W;
+
+ if ((eo ^ en) & FD_EV_POLLED_RW) {
+ /* poll status changed, update the lists */
+ fdtab[fd].state = en;
+
+ if ((eo & ~en) & FD_EV_POLLED_R)
+ FD_CLR(fd, fd_evts[DIR_RD]);
+ else if ((en & ~eo) & FD_EV_POLLED_R)
+ FD_SET(fd, fd_evts[DIR_RD]);
+
+ if ((eo & ~en) & FD_EV_POLLED_W)
+ FD_CLR(fd, fd_evts[DIR_WR]);
+ else if ((en & ~eo) & FD_EV_POLLED_W)
+ FD_SET(fd, fd_evts[DIR_WR]);
+ }
+
+ fd_alloc_or_release_cache_entry(fd, en);
}
fd_nbupdt = 0;
fdtab[fd].ev |= FD_POLL_OUT;
if (fdtab[fd].iocb && fdtab[fd].ev) {
- /* Mark the events as speculative before processing
- * them so that if nothing can be done we don't need
- * to poll again.
- */
if (fdtab[fd].ev & FD_POLL_IN)
- fd_ev_set(fd, DIR_RD);
+ fd_may_recv(fd);
if (fdtab[fd].ev & FD_POLL_OUT)
- fd_ev_set(fd, DIR_WR);
+ fd_may_send(fd);
- if (fdtab[fd].cache) {
- /* This fd was already scheduled for being
- * called as a speculative I/O.
- */
+ if (fdtab[fd].cache)
continue;
- }
fdtab[fd].iocb(fd);
}
/*
* File descriptors management functions.
*
- * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * This code implements "speculative I/O". The principle is to try to perform
- * expected I/O before registering the events in the poller. Each time this
- * succeeds, it saves a possibly expensive system call to set the event. It
- * generally succeeds for all reads after an accept(), and for writes after a
- * connect(). It also improves performance for streaming connections because
- * even if only one side is polled, the other one may react accordingly
- * depending on the fill level of the buffer. This behaviour is also the only
- * one compatible with event-based pollers (eg: EPOLL_ET).
+ * This code implements an events cache for file descriptors. It remembers the
+ * readiness of a file descriptor after a return from poll() and the fact that
+ * an I/O attempt failed on EAGAIN. Events in the cache which are still marked
+ * ready and active are processed just as if they were reported by poll().
*
- * More importantly, it enables I/O operations that are backed by invisible
- * buffers. For example, SSL is able to read a whole socket buffer and not
- * deliver it to the application buffer because it's full. Unfortunately, it
- * won't be reported by a poller anymore until some new activity happens. The
- * only way to call it again thus is to perform speculative I/O as soon as
- * reading on the FD is enabled again.
+ * This serves multiple purposes. First, it significantly improves performance
+ * by avoiding to subscribe to polling unless absolutely necessary, so most
+ * events are processed without polling at all, especially send() which
+ * benefits from the socket buffers. Second, it is the only way to support
+ * edge-triggered pollers (eg: EPOLL_ET). And third, it enables I/O operations
+ * that are backed by invisible buffers. For example, SSL is able to read a
+ * whole socket buffer and not deliver it to the application buffer because
+ * it's full. Unfortunately, it won't be reported by a poller anymore until
+ * some new activity happens. The only way to call it again thus is to keep
+ * this readiness information in the cache and to access it without polling
+ * once the FD is enabled again.
*
- * The speculative I/O uses a list of expected events and a list of updates.
- * Expected events are events that are expected to come and that we must report
- * to the application until it asks to stop or to poll. Updates are new requests
- * for changing an FD state. Updates are the only way to create new events. This
- * is important because it means that the number of speculative events cannot
- * increase between updates and will only grow one at a time while processing
- * updates. All updates must always be processed, though events might be
- * processed by small batches if required.
+ * One interesting feature of the cache is that it maintains the principle
+ * of speculative I/O introduced in haproxy 1.3 : the first time an event is
+ * enabled, the FD is considered as ready so that the I/O attempt is performed
+ * via the cache without polling. And the polling happens only when EAGAIN is
+ * first met. This avoids polling for HTTP requests, especially when the
+ * defer-accept mode is used. It also avoids polling for sending short data
+ * such as requests to servers or short responses to clients.
+ *
+ * The cache consists in a list of active events and a list of updates.
+ * Active events are events that are expected to come and that we must report
+ * to the application until it asks to stop or asks to poll. Updates are new
+ * requests for changing an FD state. Updates are the only way to create new
+ * events. This is important because it means that the number of cached events
+ * cannot increase between updates and will only grow one at a time while
+ * processing updates. All updates must always be processed, though events
+ * might be processed by small batches if required.
*
* There is no direct link between the FD and the updates list. There is only a
* bit in the fdtab[] to indicate than a file descriptor is already present in
*
* It is important to understand that as long as all expected events are
* processed, they might starve the polled events, especially because polled
- * I/O starvation quickly induces more speculative I/O. One solution to this
+ * I/O starvation quickly induces more cached I/O. One solution to this
* consists in only processing a part of the events at once, but one drawback
- * is that unhandled events will still wake the poller up. Using an event-driven
- * poller such as EPOLL_ET will solve this issue though.
+ * is that unhandled events will still wake the poller up. Using an edge-
+ * triggered poller such as EPOLL_ET will solve this issue though.
*
- * A file descriptor has a distinct state for each direction. This state is a
- * combination of two bits :
- * bit 0 = active Y/N : is set if the FD is active, which means that its
- * handler will be called without prior polling ;
- * bit 1 = polled Y/N : is set if the FD was subscribed to polling
+ * Since we do not want to scan all the FD list to find cached I/O events,
+ * we store them in a list consisting in a linear array holding only the FD
+ * indexes right now. Note that a closed FD cannot exist in the cache, because
+ * it is closed by fd_delete() which in turn calls fd_release_cache_entry()
+ * which always removes it from the list.
*
- * It is perfectly valid to have both bits set at a time, which generally means
- * that the FD was reported by polling, was marked active and not yet unpolled.
- * Such a state must not last long to avoid unneeded wakeups.
+ * The FD array has to hold a back reference to the cache. This reference is
+ * always valid unless the FD is not in the cache and is not updated, in which
+ * case the reference points to index 0.
*
- * The state of the FD as of last change is preserved in two other bits. These
- * ones are useful to save a significant amount of system calls during state
- * changes, because there is no need to update the FD status in the system until
- * we're about to call the poller.
+ * The event state for an FD, as found in fdtab[].state, is maintained for each
+ * direction. The state field is built this way, with R bits in the low nibble
+ * and W bits in the high nibble for ease of access and debugging :
*
- * Since we do not want to scan all the FD list to find speculative I/O events,
- * we store them in a list consisting in a linear array holding only the FD
- * indexes right now. Note that a closed FD cannot exist in the spec list,
- * because it is closed by fd_delete() which in turn calls __fd_clo() which
- * always removes it from the list.
+ * 7 6 5 4 3 2 1 0
+ * [ 0 | PW | RW | AW | 0 | PR | RR | AR ]
+ *
+ * A* = active *R = read
+ * P* = polled *W = write
+ * R* = ready
+ *
+ * An FD is marked "active" when there is a desire to use it.
+ * An FD is marked "polled" when it is registered in the polling.
+ * An FD is marked "ready" when it has not faced a new EAGAIN since last wake-up
+ * (it is a cache of the last EAGAIN regardless of polling changes).
+ *
+ * We have 8 possible states for each direction based on these 3 flags :
+ *
+ * +---+---+---+----------+---------------------------------------------+
+ * | P | R | A | State | Description |
+ * +---+---+---+----------+---------------------------------------------+
+ * | 0 | 0 | 0 | DISABLED | No activity desired, not ready. |
+ * | 0 | 0 | 1 | MUSTPOLL | Activity desired via polling. |
+ * | 0 | 1 | 0 | STOPPED | End of activity without polling. |
+ * | 0 | 1 | 1 | ACTIVE | Activity desired without polling. |
+ * | 1 | 0 | 0 | ABORT | Aborted poll(). Not frequently seen. |
+ * | 1 | 0 | 1 | POLLED | FD is being polled. |
+ * | 1 | 1 | 0 | PAUSED | FD was paused while ready (eg: buffer full) |
+ * | 1 | 1 | 1 | READY | FD was marked ready by poll() |
+ * +---+---+---+----------+---------------------------------------------+
+ *
+ * The transitions are pretty simple :
+ * - fd_want_*() : set flag A
+ * - fd_stop_*() : clear flag A
+ * - fd_cant_*() : clear flag R (when facing EAGAIN)
+ * - fd_may_*() : set flag R (upon return from poll())
+ * - sync() : if (A) { if (!R) P := 1 } else { P := 0 }
+ *
+ * The PAUSED, ABORT and MUSTPOLL states are transient for level-triggered
+ * pollers and are fixed by the sync() which happens at the beginning of the
+ * poller. For event-triggered pollers, only the MUSTPOLL state will be
+ * transient and ABORT will lead to PAUSED. The ACTIVE state is the only stable
+ * one which has P != A.
+ *
+ * The READY state is a bit special as activity on the FD might be notified
+ * both by the poller or by the cache. But it is needed for some multi-layer
+ * protocols (eg: SSL) where connection activity is not 100% linked to FD
+ * activity. Also some pollers might prefer to implement it as ACTIVE if
+ * enabling/disabling the FD is cheap. The READY and ACTIVE states are the
+ * two states for which a cache entry is allocated.
*
- * For efficiency reasons, we will store the Read and Write bits interlaced to
- * form a 4-bit field, so that we can simply shift the value right by 0/1 and
- * get what we want :
- * 3 2 1 0
- * Wp Rp Wa Ra
+ * The state transitions look like the diagram below. Only the 4 right states
+ * have polling enabled :
*
- * The FD array has to hold a back reference to the speculative list. This
- * reference is always valid unless the FD if currently being polled and not
- * updated (in which case the reference points to index 0).
+ * (POLLED=0) (POLLED=1)
*
- * We store the FD state in the 4 lower bits of fdtab[fd].state, and save the
- * previous state upon changes in the 4 higher bits, so that changes are easy
- * to spot.
+ * +----------+ sync +-------+
+ * | DISABLED | <----- | ABORT | (READY=0, ACTIVE=0)
+ * +----------+ +-------+
+ * clr | ^ set | ^
+ * | | | |
+ * v | set v | clr
+ * +----------+ sync +--------+
+ * | MUSTPOLL | -----> | POLLED | (READY=0, ACTIVE=1)
+ * +----------+ +--------+
+ * ^ poll | ^
+ * | | |
+ * | EAGAIN v | EAGAIN
+ * +--------+ +-------+
+ * | ACTIVE | | READY | (READY=1, ACTIVE=1)
+ * +--------+ +-------+
+ * clr | ^ set | ^
+ * | | | |
+ * v | set v | clr
+ * +---------+ sync +--------+
+ * | STOPPED | <------ | PAUSED | (READY=1, ACTIVE=0)
+ * +---------+ +--------+
*/
#include <stdio.h>
cur_poller.clo(fd);
fd_release_cache_entry(fd);
- fdtab[fd].state &= ~(FD_EV_CURR_MASK | FD_EV_PREV_MASK);
+ fdtab[fd].state = 0;
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
fdinfo[fd].port_range = NULL;
*/
fdtab[fd].ev &= FD_POLL_STICKY;
- if (e & FD_EV_ACTIVE_R)
+ if ((e & (FD_EV_READY_R | FD_EV_ACTIVE_R)) == (FD_EV_READY_R | FD_EV_ACTIVE_R))
fdtab[fd].ev |= FD_POLL_IN;
- if (e & FD_EV_ACTIVE_W)
+ if ((e & (FD_EV_READY_W | FD_EV_ACTIVE_W)) == (FD_EV_READY_W | FD_EV_ACTIVE_W))
fdtab[fd].ev |= FD_POLL_OUT;
if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev)
if (unlikely(cfd == -1)) {
switch (errno) {
case EAGAIN:
- fd_poll_recv(fd);
+ fd_cant_recv(fd);
return; /* nothing more to accept */
case EINTR:
case ECONNABORTED:
}
if (len < 0) {
- if (errno == EAGAIN) /* connection not closed yet */
+ if (errno == EAGAIN) {
+ /* connection not closed yet */
+ fd_cant_recv(fd);
return -1;
+ }
if (errno == EINTR) /* oops, try again */
continue;
/* other errors indicate a dead connection, fine. */