]> git.ipfire.org Git - thirdparty/haproxy.git/commitdiff
[MEDIUM] Spread health checks even more
author Krzysztof Oledzki <ole@ans.pl>
Sun, 14 Oct 2007 21:40:01 +0000 (23:40 +0200)
committer Willy Tarreau <w@1wt.eu>
Mon, 15 Oct 2007 07:33:10 +0000 (09:33 +0200)
When one server appears at the same position in multiple backends, it
receives all the checks from all the backends exactly at the same time
because the health-checks are only spread within a backend but not
globally.

The attached patch implements the per-server start delay in a different way.
Checks are now spread globally - not locally to one backend. It also makes
them start faster - IMHO there is no need to add a 'server->inter' when
calculating the first execution. The calculations were moved from cfgparse.c
to checks.c. There is a new function start_checks(), which is not called
when haproxy is started in MODE_CHECK.

With this patch it is also possible to set a global 'spread-checks'
parameter. It takes a percentage value (1..50, probably something near
5..10 is a good idea) so haproxy randomly adds or removes up to that many
percent of the original interval after each check. My test shows that with
18 backends, 54 servers total and 10000ms/5% it takes about 45m to mix them
completely.

I decided to use the rand/srand pseudo-random number generator. I am aware
it is not recommended for good randomness but a) we do not need a good
random generator here and b) it is probably the most portable one.

include/proto/checks.h
include/types/global.h
src/cfgparse.c
src/checks.c
src/haproxy.c

index 839af55a53c69b3ef3103800bf25199512d4492a..84991757145f01e7f3bd7e3a4030f58ffb20e59a 100644 (file)
@@ -26,6 +26,7 @@
 #include <common/config.h>
 
 void process_chk(struct task *t, struct timeval *next);
+int start_checks();
 
 #endif /* _PROTO_CHECKS_H */
 
index f2de0d94683990e7d363841d2a2ab7aacf695291..340b583d778bee58fae246568b00348b386df59d 100644 (file)
@@ -55,6 +55,7 @@ struct global {
        int rlimit_memmax;      /* default ulimit-d in megs value : 0=unset */
        int mode;
        int last_checks;
+       int spread_checks;
        char *chroot;
        char *pidfile;
        int logfac1, logfac2;
index 19b2ee71adf53aecdcd843067329f0592ad21766..43ed8aa81077e3d0c8e1e5459344836f742d9140 100644 (file)
@@ -451,7 +451,21 @@ int cfg_parse_global(const char *file, int linenum, char **args)
                        Alert("parsing [%s:%d] : too many syslog servers\n", file, linenum);
                        return -1;
                }
-       
+       }
+       else if (!strcmp(args[0], "spread-checks")) {  /* random time between checks (0-50) */
+               if (global.spread_checks != 0) {
+                       Alert("parsing [%s:%d]: spread-checks already specified. Continuing.\n", file, linenum);
+                       return 0;
+               }
+               if (*(args[1]) == 0) {
+                       Alert("parsing [%s:%d]: '%s' expects an integer argument (0..50).\n", file, linenum, args[0]);
+                       return -1;
+               }
+               global.spread_checks = atol(args[1]);
+               if (global.spread_checks < 0 || global.spread_checks > 50) {
+                       Alert("parsing [%s:%d]: 'spread-checks' needs a positive value in range 0..50.\n", file, linenum);
+                       return -1;
+               }
        }
        else {
                Alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "global");
@@ -2261,7 +2275,6 @@ int readcfgfile(const char *file)
        char *args[MAX_LINE_ARGS + 1];
        int arg;
        int cfgerr = 0;
-       int nbchk, mininter;
        int confsect = CFG_NONE;
 
        struct proxy *curproxy = NULL;
@@ -2708,56 +2721,6 @@ int readcfgfile(const char *file)
                        newsrv = newsrv->next;
                }
 
-               /* now we'll start this proxy's health checks if any */
-               /* 1- count the checkers to run simultaneously */
-               nbchk = 0;
-               mininter = 0;
-               newsrv = curproxy->srv;
-               while (newsrv != NULL) {
-                       if (newsrv->state & SRV_CHECKED) {
-                               if (!mininter || mininter > newsrv->inter)
-                                       mininter = newsrv->inter;
-                               nbchk++;
-                       }
-                       newsrv = newsrv->next;
-               }
-
-               /* 2- start them as far as possible from each others while respecting
-                * their own intervals. For this, we will start them after their own
-                * interval added to the min interval divided by the number of servers,
-                * weighted by the server's position in the list.
-                */
-               if (nbchk > 0) {
-                       struct task *t;
-                       int srvpos;
-
-                       newsrv = curproxy->srv;
-                       srvpos = 0;
-                       while (newsrv != NULL) {
-                               /* should this server be checked ? */
-                               if (newsrv->state & SRV_CHECKED) {
-                                       if ((t = pool_alloc2(pool2_task)) == NULL) {
-                                               Alert("parsing [%s:%d] : out of memory.\n", file, linenum);
-                                               return -1;
-                                       }
-               
-                                       t->wq = NULL;
-                                       t->qlist.p = NULL;
-                                       t->state = TASK_IDLE;
-                                       t->process = process_chk;
-                                       t->context = newsrv;
-               
-                                       /* check this every ms */
-                                       tv_ms_add(&t->expire, &now,
-                                                 newsrv->inter + mininter * srvpos / nbchk);
-                                       task_queue(t);
-                                       //task_wakeup(&rq, t);
-                                       srvpos++;
-                               }
-                               newsrv = newsrv->next;
-                       }
-               }
-
                curproxy = curproxy->next;
        }
        if (cfgerr > 0) {
index 62f0c2c711db93cd6fc12069a5de7fc019a8f77d..1d8f2b301c58e1c6e5e9ed5208d53a35570e45cf 100644 (file)
@@ -13,7 +13,9 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include <unistd.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -281,6 +283,7 @@ void process_chk(struct task *t, struct timeval *next)
        struct server *s = t->context;
        struct sockaddr_in sa;
        int fd;
+       int rv;
 
        //fprintf(stderr, "process_chk: task=%p\n", t);
 
@@ -503,8 +506,15 @@ void process_chk(struct task *t, struct timeval *next)
                                set_server_down(s);
                        s->curfd = -1;
                        fd_delete(fd);
+
+                       rv = 0;
+                       if (global.spread_checks > 0) {
+                               rv = s->inter * global.spread_checks / 100;
+                               rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
+                               //fprintf(stderr, "process_chk: (%d+/-%d%%) random=%d\n", s->inter, global.spread_checks, rv);
+                       }
                        while (tv_isle(&t->expire, &now))
-                               tv_ms_add(&t->expire, &t->expire, s->inter);
+                               tv_ms_add(&t->expire, &t->expire, s->inter + rv);
                        goto new_chk;
                }
                /* if result is 0 and there's no timeout, we have to wait again */
@@ -517,6 +527,65 @@ void process_chk(struct task *t, struct timeval *next)
        return;
 }
 
+/*
+ * Create and queue one check task per SRV_CHECKED server across all proxies.
+ * Returns 0 if OK, -1 if error, and prints the error in this case.
+ */
+int start_checks() {
+
+       struct proxy *px;
+       struct server *s;
+       struct task *t;
+       int nbchk=0, mininter=0, srvpos=0;
+
+       /* 1- count the checked servers of all proxies, track their smallest inter */
+       for (px = proxy; px; px = px->next) {
+               for (s = px->srv; s; s = s->next) {
+                       if (!(s->state & SRV_CHECKED))
+                               continue;
+
+                       if (!mininter || mininter > s->inter)
+                               mininter = s->inter;
+
+                       nbchk++;
+               }
+       }
+
+       if (!nbchk)
+               return 0;
+
+       srand((unsigned)time(NULL));
+
+       /*
+        * 2- start them as far as possible from each other. For this, each
+        * check first runs after the smallest interval divided by the number
+        * of checked servers, multiplied by the server's global position.
+        */
+       for (px = proxy; px; px = px->next) {
+               for (s = px->srv; s; s = s->next) {
+                       if (!(s->state & SRV_CHECKED))
+                               continue;
+
+                       if ((t = pool_alloc2(pool2_task)) == NULL) {
+                               Alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id);
+                               return -1;
+                       }
+
+                       t->wq = NULL;
+                       t->qlist.p = NULL;
+                       t->state = TASK_IDLE;
+                       t->process = process_chk;
+                       t->context = s;
+
+                       /* first check expires mininter * srvpos / nbchk ms from now */
+                       tv_ms_add(&t->expire, &now, mininter * srvpos / nbchk);
+                       task_queue(t);
+
+                       srvpos++;
+               }
+       }
+       return 0;
+}
 
 /*
  * Local variables:
index 4437d45b019f5184b5288158a137839db52addcd..7b7a691305806c4bb3045344d04bc480cdb81f24 100644 (file)
@@ -81,6 +81,7 @@
 #include <proto/acl.h>
 #include <proto/backend.h>
 #include <proto/buffers.h>
+#include <proto/checks.h>
 #include <proto/client.h>
 #include <proto/fd.h>
 #include <proto/log.h>
@@ -506,6 +507,7 @@ void init(int argc, char **argv)
                Alert("Error reading configuration file : %s\n", cfg_cfgfile);
                exit(1);
        }
+
        if (have_appsession)
                appsession_init();
 
@@ -514,6 +516,9 @@ void init(int argc, char **argv)
                exit(0);
        }
 
+       if (start_checks() < 0)
+               exit(1);
+
        if (cfg_maxconn > 0)
                global.maxconn = cfg_maxconn;