]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
* released 1.2.5-pre3 v1.2.5-pre3
author willy tarreau <willy@wtap.(none)>
Sun, 18 Dec 2005 00:09:15 +0000 (01:09 +0100)
committer willy tarreau <willy@wtap.(none)>
Sun, 18 Dec 2005 00:09:15 +0000 (01:09 +0100)
* experimental epoll() support (use temporary '-e' argument)

CHANGELOG
Makefile
TODO
haproxy.c

index f0af36f44bcbe31f36c588c31cedb255892f3f76..fc0b5aba16f19ed131c15dd057d4ce35dd3e9242 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,10 @@
 ChangeLog :
 ===========
 
-2005/04/24
+2005/04/25 : 1.2.5-pre3
+  - experimental epoll() support (use temporary '-e' argument)
+
+2005/04/24 : 1.2.5-pre2
   - implemented the HTTP 303 code for error redirection. This forces the
     browser to fetch the given URI with a GET request. The new keyword for
     this is 'errorloc303', and a new 'errorloc302' keyword has been created
@@ -9,7 +12,7 @@ ChangeLog :
   - added more controls in the parser for valid use of '\x' sequence.
   - few fixes from Alex & Klaus
 
-2005/02/17
+2005/02/17 : 1.2.5-pre1
   - fixed a few errors in the documentation
 
 2005/02/13
index 29a79fcb646226c465db141d3e93defe8d136844..70a21fe470dfaf053c40ee1feb5e4afbb9d1449c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -73,8 +73,8 @@ TARGET_OPTS=$(COPTS.$(TARGET))
 REGEX_OPTS=$(COPTS.$(REGEX))
 CPU_OPTS=$(COPTS.$(CPU))
 
-COPTS=-I. $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS)
-LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX))
+COPTS=-I. $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS)
+LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) $(ADDLIB)
 
 # - use -DSTATTIME=0 to disable statistics, else specify an interval in
 #   milliseconds.
diff --git a/TODO b/TODO
index 047468b3b8f307364eafc133516dddd171104d64..66a4ec6ac661084a4d01c6f00ea4ad047397d416 100644 (file)
--- a/TODO
+++ b/TODO
@@ -139,4 +139,5 @@ Todo for 1.2
 * appcookie
 - weighted round robin
 - option to shutdown(listen_sock) when max connections reached
-
+* epoll
+- replace the event scheduler with an O(log(N)) one
index c5c29177cf188b139a8e97abbc179a707ff9126b..79819e622776b84d4c9c6c57547feff139b321df 100644 (file)
--- a/haproxy.c
+++ b/haproxy.c
@@ -62,6 +62,8 @@
 #include <strings.h>
 #endif
 
+#include <sys/epoll.h>
+
 #include "include/appsession.h"
 
 #define HAPROXY_VERSION "1.2.5"
@@ -575,6 +577,7 @@ struct fdtab {
 /*********************************************************************/
 
 int cfg_maxpconn = 2000;       /* # of simultaneous connections per proxy (-N) */
+int cfg_use_epoll = 0;          /* use epoll() instead of select() ? */
 char *cfg_cfgfile = NULL;      /* configuration file */
 char *progname = NULL;         /* program name */
 int  pid;                      /* current process id */
@@ -607,6 +610,11 @@ fd_set     *ReadEvent,
        *StaticReadEvent,
        *StaticWriteEvent;
 
+/* used by the epoll() emulation of select() */
+fd_set *PrevReadEvent, *PrevWriteEvent;
+struct epoll_event *epoll_events;
+int epoll_fd;
+
 void **pool_session = NULL,
     **pool_buffer   = NULL,
     **pool_fdtab    = NULL,
@@ -803,7 +811,9 @@ void usage(char *name) {
            "        -c check mode : only check config file and exit\n"
            "        -n sets the maximum total # of connections (%d)\n"
            "        -N sets the default, per-proxy maximum # of connections (%d)\n"
-           "        -p writes pids of all children to this file\n\n",
+           "        -p writes pids of all children to this file\n"
+           "        -e tries to use epoll() instead of select()\n"
+           "\n",
            name, DEFAULT_MAXCONN, cfg_maxpconn);
     exit(1);
 }
@@ -1384,6 +1394,20 @@ static inline struct timeval *tv_min(struct timeval *tvmin,
 static inline void fd_delete(int fd) {
     FD_CLR(fd, StaticReadEvent);
     FD_CLR(fd, StaticWriteEvent);
+    if (cfg_use_epoll) {
+       struct epoll_event ev;
+
+       ev.data.fd = fd;
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+           // it's impossible to tell whether it has already
+           // been done.
+           //perror("epoll_ctl(DEL)");
+           //exit(1);
+       }
+
+       FD_CLR(fd, PrevReadEvent);
+       FD_CLR(fd, PrevWriteEvent);
+    }
     close(fd);
     fdtab[fd].state = FD_STCLOSE;
 
@@ -2130,6 +2154,20 @@ int event_srv_write(int fd) {
 void client_retnclose(struct session *s, int len, const char *msg) {
     FD_CLR(s->cli_fd, StaticReadEvent);
     FD_SET(s->cli_fd, StaticWriteEvent);
+    if (cfg_use_epoll) {
+       struct epoll_event ev;
+
+       ev.data.fd = s->cli_fd;
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, s->cli_fd, &ev) < 0) {
+           // it's impossible to tell whether it has already
+           // been done.
+           //perror("epoll_ctl(DEL)");
+           //exit(1);
+       }
+
+       FD_CLR(s->cli_fd, PrevReadEvent);
+       FD_CLR(s->cli_fd, PrevWriteEvent);
+    }
     tv_eternity(&s->crexpire);
     shutdown(s->cli_fd, SHUT_RD);
     s->cli_state = CL_STSHUTR;
@@ -4832,71 +4870,232 @@ void select_loop() {
       next_time = MINTIME(time2, next_time);
 #endif
 
-      if (next_time > 0) {  /* FIXME */
-         /* Convert to timeval */
-         /* to avoid eventual select loops due to timer precision */
-         next_time += SCHEDULER_RESOLUTION;
-         delta.tv_sec  = next_time / 1000; 
-         delta.tv_usec = (next_time % 1000) * 1000;
-      }
-      else if (next_time == 0) { /* allow select to return immediately when needed */
-         delta.tv_sec = delta.tv_usec = 0;
-      }
 
+      if (cfg_use_epoll) {
+         /* use epoll() */
+         int fds, count;
+         int pr, pw, sr, sw;
+         unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
+         struct epoll_event ev;
 
-      /* let's restore fdset state */
+         for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
 
-      readnotnull = 0; writenotnull = 0;
-      for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
-         readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
-         writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
-      }
+             rn = ((int*)StaticReadEvent)[fds];  ro = ((int*)PrevReadEvent)[fds];
+             wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
 
-//     /* just a verification code, needs to be removed for performance */
-//     for (i=0; i<maxfd; i++) {
-//         if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
-//             abort();
-//         if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
-//             abort();
-//         
-//     }
-
-      status = select(maxfd,
-                     readnotnull ? ReadEvent : NULL,
-                     writenotnull ? WriteEvent : NULL,
-                     NULL,
-                     (next_time >= 0) ? &delta : NULL);
-      
-      /* this is an experiment on the separation of the select work */
-      // status  = (readnotnull  ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
-      // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
-      
-      tv_now(&now);
+             if ((ro^rn) | (wo^wn)) {
+                 for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
 
-      if (status > 0) { /* must proceed with events */
+#define WE_KNOW_HOW_FDSET_WORKS
+#ifdef WE_KNOW_HOW_FDSET_WORKS
 
-         int fds;
-         char count;
-         
-         for (fds = 0; (fds << INTBITS) < maxfd; fds++)
-             if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
-                 for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
-                     
-                     /* if we specify read first, the accepts and zero reads will be
-                      * seen first. Moreover, system buffers will be flushed faster.
-                      */
-                     if (fdtab[fd].state == FD_STCLOSE)
+#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+                     pr = (ro >> ((1<<INTBITS)-count)) & 1;
+                     pw = (wo >> ((1<<INTBITS)-count)) & 1;
+                     sr = (rn >> ((1<<INTBITS)-count)) & 1;
+                     sw = (wn >> ((1<<INTBITS)-count)) & 1;
+#else
+                     pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
+                     pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
+                     sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
+                     sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
+#endif
+#else
+                     pr = FD_ISSET(fd, PrevReadEvent);
+                     pw = FD_ISSET(fd, PrevWriteEvent);
+                     sr = FD_ISSET(fd, StaticReadEvent);
+                     sw = FD_ISSET(fd, StaticWriteEvent);
+#endif
+                     if (!((sr^pr) | (sw^pw)))
                          continue;
-                     
-                     if (FD_ISSET(fd, ReadEvent))
-                         fdtab[fd].read(fd);
 
-                     if (FD_ISSET(fd, WriteEvent))
-                         fdtab[fd].write(fd);
+                     ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
+                     ev.data.fd = fd;
+
+                     if ((pr | pw)) {
+                         /* the file-descriptor already exists... */
+                         if ((sr | sw)) {
+                             /* ...and it will still exist */
+                             if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
+                                 perror("epoll_ctl(MOD)");
+                                 exit(1);
+                             }
+                         } else {
+                             /* ...and it will be removed */
+                             if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+                                 perror("epoll_ctl(DEL)");
+                                 exit(1);
+                             }
+                         }
+                     } else {
+                         /* the file-descriptor did not exist, let's add it */
+                         if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+                             perror("epoll_ctl(ADD)");
+                             exit(1);
+                         }
+                     }
                  }
-      }
-      else {
-         //      fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
+                 ((int*)PrevReadEvent)[fds] = rn;
+                 ((int*)PrevWriteEvent)[fds] = wn;
+             }           
+
+#if useless_optimization
+             unsigned a, d, m;  /* add mask, del mask, mod mask */
+
+             a = (rn|wn) & ~(ro|wo);   /* fds to add */
+             d = (ro|wo) & ~(rn|wn);   /* fds to remove, normally none */
+             m = (ro^rn) | (wo^wn);    /* fds which change */
+
+             if (m) {
+                 struct epoll_event ev;
+                 m &= ~(a|d); /* keep only changes, not add/del */
+
+                 if (m) { /* fds which only change */
+                     for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+                         ev.data.fd = fd;
+                         ev.events = 0;
+
+                         if ((FD_ISSET(fd, PrevReadEvent) || FD_ISSET(fd, PrevWriteEvent)) &&
+                             (FD_ISSET(fd, StaticReadEvent) || FD_ISSET(fd, StaticWriteEvent))) {
+                             if (FD_ISSET(fd, StaticReadEvent))
+                                 ev.events |= EPOLLIN;
+                             if (FD_ISSET(fd, StaticWriteEvent))
+                                 ev.events |= EPOLLOUT;
+                             if (ev.events && epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
+                                 perror("epoll_ctl(MOD)");
+                                 exit(1);
+                             }
+                         }
+                     }
+                 }
+
+                 if (a) { /* fds to add */
+                     // printf("a=%08x\n", a);
+                     for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+                         ev.data.fd = fd;
+                         ev.events = 0;
+                         if (!FD_ISSET(fd, PrevReadEvent) && !FD_ISSET(fd, PrevWriteEvent)) {
+                             if (FD_ISSET(fd, StaticReadEvent))
+                                 ev.events |= EPOLLIN;
+                             if (FD_ISSET(fd, StaticWriteEvent))
+                                 ev.events |= EPOLLOUT;
+                             if (ev.events && epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+                                 perror("epoll_ctl(ADD)");
+                                 exit(1);
+                             }
+                         }
+                     }
+                 }
+
+                 if (d) { /* fds to delete */
+                     for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+                         ev.data.fd = fd;
+                         ev.events = 0;
+                         if (FD_ISSET(fd, StaticReadEvent) || FD_ISSET(fd, StaticWriteEvent))
+                             continue;
+                         if (!FD_ISSET(fd, PrevReadEvent) && !FD_ISSET(fd, PrevWriteEvent))
+                             continue;
+                         if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+                             perror("epoll_ctl(DEL)");
+                             exit(1);
+                         }
+                     }
+                 }
+                 ((int*)PrevReadEvent)[fds] = rn;
+                 ((int*)PrevWriteEvent)[fds] = wn;
+             }
+#endif
+         }
+
+         /* now let's wait for events */
+         status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
+         tv_now(&now);
+
+         for (count = 0; count < status; count++) {
+             fd = epoll_events[count].data.fd;
+             
+             if (fdtab[fd].state == FD_STCLOSE)
+                 continue;
+             
+             if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
+                 fdtab[fd].read(fd);
+             
+             if (fdtab[fd].state == FD_STCLOSE)
+                 continue;
+             
+             if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
+                     fdtab[fd].write(fd);
+         }
+      } else {
+         /* use select() */
+
+         if (next_time > 0) {  /* FIXME */
+             /* Convert to timeval */
+             /* to avoid eventual select loops due to timer precision */
+             next_time += SCHEDULER_RESOLUTION;
+             delta.tv_sec  = next_time / 1000; 
+             delta.tv_usec = (next_time % 1000) * 1000;
+         }
+         else if (next_time == 0) { /* allow select to return immediately when needed */
+             delta.tv_sec = delta.tv_usec = 0;
+         }
+
+
+         /* let's restore fdset state */
+
+         readnotnull = 0; writenotnull = 0;
+         for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
+             readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
+             writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
+         }
+
+         //    /* just a verification code, needs to be removed for performance */
+         //    for (i=0; i<maxfd; i++) {
+         //        if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
+         //            abort();
+         //        if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
+         //            abort();
+         //        
+         //    }
+
+         status = select(maxfd,
+                         readnotnull ? ReadEvent : NULL,
+                         writenotnull ? WriteEvent : NULL,
+                         NULL,
+                         (next_time >= 0) ? &delta : NULL);
+      
+         /* this is an experiment on the separation of the select work */
+         // status  = (readnotnull  ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
+         // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
+
+         tv_now(&now);
+
+         if (status > 0) { /* must proceed with events */
+
+             int fds;
+             char count;
+         
+             for (fds = 0; (fds << INTBITS) < maxfd; fds++)
+                 if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
+                     for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+                         
+                         /* if we specify read first, the accepts and zero reads will be
+                          * seen first. Moreover, system buffers will be flushed faster.
+                          */
+                         if (fdtab[fd].state == FD_STCLOSE)
+                             continue;
+                         
+                         if (FD_ISSET(fd, ReadEvent))
+                             fdtab[fd].read(fd);
+                         
+                         if (FD_ISSET(fd, WriteEvent))
+                             fdtab[fd].write(fd);
+                     }
+         }
+         else {
+             //          fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
+         }
       }
   }
 }
@@ -6795,6 +6994,8 @@ void init(int argc, char **argv) {
                display_version();
                exit(0);
            }
+           else if (*flag == 'e')
+               cfg_use_epoll = 1;
            else if (*flag == 'V')
                arg_mode |= MODE_VERBOSE;
            else if (*flag == 'd')
@@ -6897,6 +7098,22 @@ void init(int argc, char **argv) {
                sizeof(fd_set) *
                (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
 
+    if (cfg_use_epoll) {
+       epoll_fd = epoll_create(global.maxsock + 1);
+       if (epoll_fd < 0) {
+           Warning("epoll() is not available. Using select() instead.\n");
+           cfg_use_epoll = 0;
+       } else {
+           epoll_events = (struct epoll_event*) calloc(1, sizeof(struct epoll_event) * global.maxsock);
+           PrevReadEvent = (fd_set *)calloc(1,
+                                            sizeof(fd_set) *
+                                            (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
+           PrevWriteEvent = (fd_set *)calloc(1,
+                                             sizeof(fd_set) *
+                                             (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
+       }
+    }
+
     fdtab = (struct fdtab *)calloc(1,
                sizeof(struct fdtab) * (global.maxsock));
     for (i = 0; i < global.maxsock; i++) {
@@ -7109,6 +7326,8 @@ void deinit(void){
     
     if (ReadEvent)        free(ReadEvent);
     if (WriteEvent)       free(WriteEvent);
+    if (PrevReadEvent)    free(PrevReadEvent);   /* epoll shadow set; was freeing ReadEvent a second time */
+    if (PrevWriteEvent)   free(PrevWriteEvent);  /* epoll shadow set; was freeing WriteEvent a second time */
     if (StaticReadEvent)  free(StaticReadEvent);
     if (StaticWriteEvent) free(StaticWriteEvent);
     if (fdtab)            free(fdtab);