* The FD array has to hold a back reference to the speculative list. This
* reference is only valid if at least one of the directions is marked SPEC.
*
+ * We store the FD state in the 4 lower bits of fdtab[fd].spec.e, and save the
+ * previous state upon changes in the 4 higher bits, so that changes are easy
+ * to spot.
*/
-#define FD_EV_IN_SL 1
-#define FD_EV_IN_PL 4
+#define FD_EV_IN_SL 1U
+#define FD_EV_IN_PL 4U
#define FD_EV_IDLE 0
#define FD_EV_SPEC (FD_EV_IN_SL)
#define FD_EV_MASK_W (FD_EV_MASK_R << 1)
#define FD_EV_MASK (FD_EV_MASK_W | FD_EV_MASK_R)
+#define FD_EV_MASK_OLD (FD_EV_MASK << 4)
/* This is the minimum number of events successfully processed in speculative
* mode above which we agree to return without checking epoll() (1/2 times).
static int nbspec = 0; // current size of the spec list
static int absmaxevents = 0; // absolute maximum amounts of polled events
-static int fd_created = 0; // fd creation detector, reset upon poll() entry.
static unsigned int *spec_list = NULL; // speculative I/O list
if (unlikely(i != FD_EV_IDLE))
return 0;
// switch to SPEC state and allocate a SPEC entry.
- fd_created++;
alloc_spec_entry(fd);
}
fdtab[fd].spec.e ^= (unsigned int)(FD_EV_IN_SL << dir);
REGPRM1 static void __fd_clo(int fd)
{
release_spec_entry(fd);
- fdtab[fd].spec.e &= ~(FD_EV_MASK);
+ fdtab[fd].spec.e &= ~(FD_EV_MASK | FD_EV_MASK_OLD);
}
/*
*/
REGPRM2 static void _do_poll(struct poller *p, int exp)
{
- static unsigned int last_skipped;
- static unsigned int spec_processed;
- int status, eo;
+ int status, eo, en;
int fd, opcode;
int count;
int spec_idx;
int wait_time;
- int looping = 0;
-
- re_poll_once:
- /* Here we have two options :
- * - either walk the list forwards and hope to match more events
- * - or walk it backwards to minimize the number of changes and
- * to make better use of the cache.
- * Tests have shown that walking backwards improves perf by 0.2%.
- */
-
- status = 0;
+ /* first, update the poll list according to what changed in the spec list */
spec_idx = nbspec;
while (likely(spec_idx > 0)) {
- int done;
-
spec_idx--;
fd = spec_list[spec_idx];
- eo = fdtab[fd].spec.e; /* save old events */
-
- if (looping && --fd_created < 0) {
- /* we were just checking the newly created FDs */
- break;
- }
- /*
- * Process the speculative events.
- *
- * Principle: events which are marked FD_EV_SPEC are processed
- * with their assigned function. If the function returns 0, it
- * means there is nothing doable without polling first. We will
- * then convert the event to a pollable one by assigning them
- * the WAIT status.
+ en = fdtab[fd].spec.e & 15; /* new events */
+ eo = fdtab[fd].spec.e >> 4; /* previous events */
+
+ /* If an fd with a poll bit is present here, it means that it
+ * has last requested a poll, or is leaving from a poll. Given
+ * that an FD is fully in the poll list or in the spec list but
+ * not in both at once, we'll first ensure that if it is
+ * already being polled in one direction and requests a spec
+ * access, then we'll turn that into a polled access in order
+ * to save a syscall which will likely return EAGAIN.
*/
-#ifdef DEBUG_DEV
- if (fdtab[fd].state == FD_STCLOSE) {
- fprintf(stderr,"fd=%d, fdtab[].ev=%x, fdtab[].spec.e=%x, .s=%d, idx=%d\n",
- fd, fdtab[fd].ev, fdtab[fd].spec.e, fdtab[fd].spec.s1, spec_idx);
- }
-#endif
- done = 0;
- fdtab[fd].ev &= FD_POLL_STICKY;
- if ((eo & FD_EV_MASK_R) == FD_EV_SPEC_R) {
- /* The owner is interested in reading from this FD */
- if (fdtab[fd].state != FD_STERROR) {
- /* Pretend there is something to read */
- fdtab[fd].ev |= FD_POLL_IN;
- if (!fdtab[fd].cb[DIR_RD].f(fd))
- fdtab[fd].spec.e ^= (FD_EV_WAIT_R ^ FD_EV_SPEC_R);
- else
- done = 1;
- }
+ if ((en & FD_EV_RW_PL) && (en & FD_EV_RW_SL)) {
+ /* convert SPEC to WAIT if fd already being polled */
+ if ((en & FD_EV_MASK_R) == FD_EV_SPEC_R)
+ en = (en & ~FD_EV_MASK_R) | FD_EV_WAIT_R;
+
+ if ((en & FD_EV_MASK_W) == FD_EV_SPEC_W)
+ en = (en & ~FD_EV_MASK_W) | FD_EV_WAIT_W;
}
- else if ((eo & FD_EV_MASK_R) == FD_EV_STOP_R) {
+
+ if ((en & FD_EV_MASK_R) == FD_EV_STOP_R) {
/* This FD was being polled and is now being removed. */
- fdtab[fd].spec.e &= ~FD_EV_MASK_R;
- }
-
- if ((eo & FD_EV_MASK_W) == FD_EV_SPEC_W) {
- /* The owner is interested in writing to this FD */
- if (fdtab[fd].state != FD_STERROR) {
- /* Pretend there is something to write */
- fdtab[fd].ev |= FD_POLL_OUT;
- if (!fdtab[fd].cb[DIR_WR].f(fd))
- fdtab[fd].spec.e ^= (FD_EV_WAIT_W ^ FD_EV_SPEC_W);
- else
- done = 1;
- }
+ en &= ~FD_EV_MASK_R;
}
- else if ((eo & FD_EV_MASK_W) == FD_EV_STOP_W) {
+
+ if ((en & FD_EV_MASK_W) == FD_EV_STOP_W) {
/* This FD was being polled and is now being removed. */
- fdtab[fd].spec.e &= ~FD_EV_MASK_W;
+ en &= ~FD_EV_MASK_W;
}
- status += done;
- /* one callback might already have closed the fd by itself */
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
-
- /* Now, we will adjust the event in the poll list. Indeed, it
- * is possible that an event which was previously in the poll
- * list now goes out, and the opposite is possible too. We can
- * have opposite changes for READ and WRITE too.
- */
-
- if ((eo ^ fdtab[fd].spec.e) & FD_EV_RW_PL) {
- /* poll status changed*/
- if ((fdtab[fd].spec.e & FD_EV_RW_PL) == 0) {
+ if ((eo ^ en) & FD_EV_RW_PL) {
+ /* poll status changed */
+ if ((en & FD_EV_RW_PL) == 0) {
/* fd removed from poll list */
opcode = EPOLL_CTL_DEL;
}
/* construct the epoll events based on new state */
ev.events = 0;
- if (fdtab[fd].spec.e & FD_EV_WAIT_R)
+ if (en & FD_EV_WAIT_R)
ev.events |= EPOLLIN;
- if (fdtab[fd].spec.e & FD_EV_WAIT_W)
+ if (en & FD_EV_WAIT_W)
ev.events |= EPOLLOUT;
ev.data.fd = fd;
epoll_ctl(epoll_fd, opcode, fd, &ev);
}
+ fdtab[fd].spec.e = (en << 4) + en; /* save new events */
if (!(fdtab[fd].spec.e & FD_EV_RW_SL)) {
/* This fd switched to combinations of either WAIT or
* IDLE. It must be removed from the spec list.
*/
release_spec_entry(fd);
- continue;
}
}
- /* It may make sense to immediately return here if there are enough
- * processed events, without passing through epoll_wait() because we
- * have exactly done a poll.
- * Measures have shown a great performance increase if we call the
- * epoll_wait() only the second time after speculative accesses have
- * succeeded. This reduces the number of unsucessful calls to
- * epoll_wait() by a factor of about 3, and the total number of calls
- * by about 2.
- * However, when we do that after having processed too many events,
- * events waiting in epoll() starve for too long a time and tend to
- * become themselves eligible for speculative polling. So we try to
- * limit this practise to reasonable situations.
- */
-
- spec_processed += status;
-
- if (looping) {
- last_skipped++;
- return;
- }
-
- if (status >= MIN_RETURN_EVENTS && spec_processed < absmaxevents) {
- /* We have processed at least MIN_RETURN_EVENTS, it's worth
- * returning now without checking epoll_wait().
- */
- if (++last_skipped <= 1) {
- tv_update_date(0, 1);
- return;
- }
- }
- last_skipped = 0;
+ /* compute the epoll_wait() timeout */
- if (nbspec || status || run_queue || signal_queue_len) {
- /* Maybe we have processed some events that we must report, or
- * maybe we still have events in the spec list, or there are
+ if (nbspec || run_queue || signal_queue_len) {
+ /* Maybe we still have events in the spec list, or there are
* some tasks left pending in the run_queue, so we must not
- * wait in epoll() otherwise we will delay their delivery by
+ * wait in epoll() otherwise we would delay their delivery by
* the next timeout.
*/
wait_time = 0;
}
}
- /* now let's wait for real events. We normally use maxpollevents as a
- * high limit, unless <nbspec> is already big, in which case we need
- * to compensate for the high number of events processed there.
- */
- fd = MIN(absmaxevents, spec_processed);
- fd = MAX(global.tune.maxpollevents, fd);
- fd = MIN(maxfd, fd);
- /* we want to detect if an accept() will create new speculative FDs here */
- fd_created = 0;
- spec_processed = 0;
+ /* now let's wait for real events */
+ fd = MIN(maxfd, global.tune.maxpollevents);
gettimeofday(&before_poll, NULL);
status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
tv_update_date(wait_time, status);
measure_idle();
+ /* process events */
for (count = 0; count < status; count++) {
int e = epoll_events[count].events;
fd = epoll_events[count].data.fd;
/* it looks complicated but gcc can optimize it away when constants
* have same values.
*/
- DPRINTF(stderr, "%s:%d: fd=%d, ev=0x%08x, e=0x%08x\n",
- __FUNCTION__, __LINE__,
- fd, fdtab[fd].ev, e);
-
fdtab[fd].ev &= FD_POLL_STICKY;
fdtab[fd].ev |=
((e & EPOLLIN ) ? FD_POLL_IN : 0) |
}
}
- if (fd_created) {
- /* we have created some fds, certainly in return of an accept(),
- * and they're marked as speculative. If we can manage to perform
- * a read(), we're almost sure to collect all the request at once
- * and avoid several expensive wakeups. So let's try now. Anyway,
- * if we fail, the tasks are still woken up, and the FD gets marked
- * for poll mode.
+ /* now process speculative events if any */
+
+ /* Here we have two options :
+ * - either walk the list forwards and hope to match more events
+ * - or walk it backwards to minimize the number of changes and
+ * to make better use of the cache.
+ * Tests have shown that walking backwards improves perf by 0.2%.
+ */
+
+ spec_idx = nbspec;
+ while (likely(spec_idx > 0)) {
+ spec_idx--;
+ fd = spec_list[spec_idx];
+ eo = fdtab[fd].spec.e; /* save old events */
+
+ /*
+ * Process the speculative events.
+ *
+ * Principle: events which are marked FD_EV_SPEC are processed
+ * with their assigned function. If the function returns 0, it
+ * means there is nothing doable without polling first. We will
+ * then convert the event to a pollable one by assigning them
+ * the WAIT status.
*/
- looping = 1;
- goto re_poll_once;
+
+ fdtab[fd].ev &= FD_POLL_STICKY;
+ if ((eo & FD_EV_MASK_R) == FD_EV_SPEC_R) {
+ /* The owner is interested in reading from this FD */
+ if (fdtab[fd].state != FD_STERROR) {
+ /* Pretend there is something to read */
+ fdtab[fd].ev |= FD_POLL_IN;
+ if (!fdtab[fd].cb[DIR_RD].f(fd))
+ fdtab[fd].spec.e ^= (FD_EV_WAIT_R ^ FD_EV_SPEC_R);
+ }
+ }
+
+ if ((eo & FD_EV_MASK_W) == FD_EV_SPEC_W) {
+ /* The owner is interested in writing to this FD */
+ if (fdtab[fd].state != FD_STERROR) {
+ /* Pretend there is something to write */
+ fdtab[fd].ev |= FD_POLL_OUT;
+ if (!fdtab[fd].cb[DIR_WR].f(fd))
+ fdtab[fd].spec.e ^= (FD_EV_WAIT_W ^ FD_EV_SPEC_W);
+ }
+ }
+
+ /* one callback might already have closed the fd by itself */
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+
+ if (!(fdtab[fd].spec.e & (FD_EV_RW_SL|FD_EV_RW_PL))) {
+ /* This fd switched to IDLE, it can be removed from the spec list. */
+ release_spec_entry(fd);
+ continue;
+ }
}
+
+	/* in the end, we have processed the polled events above plus any speculative events */
}
/*