MAJOR: checks: move health checks changes to set_server_check_status()

author Willy Tarreau <w@1wt.eu>

Tue, 20 May 2014 12:55:13 +0000 (14:55 +0200)

committer Willy Tarreau <w@1wt.eu>

Fri, 23 May 2014 12:29:11 +0000 (14:29 +0200)
author Willy Tarreau <w@1wt.eu>
Tue, 20 May 2014 12:55:13 +0000 (14:55 +0200)
committer Willy Tarreau <w@1wt.eu>
Fri, 23 May 2014 12:29:11 +0000 (14:29 +0200)
diff --git a/src/checks.c b/src/checks.c

index 8d708a3ed9c4bb8f238b5982217faab0d61c4ea5..9875980212b8dbe272fa701b75885a644a5b91a9 100644 (file)
--- a/src/checks.c
+++ b/src/checks.c
@@ -206,7 +206,8 @@ static void check_report_srv_status(struct chunk *msg, struct server *s, struct
  
  /*
   * Set check->status, update check->duration and fill check->result with
- * an adequate CHK_RES_* value.
+ * an adequate CHK_RES_* value. The new check->health is computed based
+ * on the result.
   *
   * Show information in logs about failed health check if server is UP
   * or succeeded health checks if server is DOWN.
@@ -215,6 +216,7 @@ static void set_server_check_status(struct check *check, short status, const cha
  {
         struct server *s = check->server;
         short prev_status = check->status;
+       int report = 0;
  
         if (status == HCHK_STATUS_START) {
                 check->result = CHK_RES_UNKNOWN;        /* no result yet */
@@ -245,71 +247,59 @@ static void set_server_check_status(struct check *check, short status, const cha
         }
  
         /* Failure to connect to the agent as a secondary check should not
-        * cause the server to be marked down. So only log status changes
-        * for HCHK_STATUS_* statuses */
+        * cause the server to be marked down.
+        */
         if ((check->state & CHK_ST_AGENT) && check->status < HCHK_STATUS_L7TOUT)
                 return;
  
-       if (s->proxy->options2 & PR_O2_LOGHCHKS &&
-           ((status != prev_status) ||
-           ((check->health != 0) && (check->result == CHK_RES_FAILED)) ||
-           (((check->health != check->rise + check->fall - 1)) && (check->result >= CHK_RES_PASSED)))) {
-               int health, rise, fall;
-               enum srv_state state;
-
-               chunk_reset(&trash);
-
-               /* FIXME begin: calculate local version of the health/rise/fall/state */
-               health = check->health;
-               rise   = check->rise;
-               fall   = check->fall;
-               state  = s->state;
-
-               switch (check->result) {
-               case CHK_RES_FAILED:
-                       if (health > rise) {
-                               health--; /* still good */
-                       } else {
-                               if (health == rise)
-                                       state = SRV_ST_STOPPED;
+       report = 0;
  
-                               health = 0;
-                       }
-                       break;
+       switch (check->result) {
+       case CHK_RES_FAILED:
+               if (check->health >= check->rise) {
+                       s->counters.failed_checks++;
+                       report = 1;
+                       check->health--;
+                       if (check->health < check->rise)
+                               check->health = 0;
+               }
+               break;
  
-               case CHK_RES_PASSED:
-               case CHK_RES_CONDPASS:
-                       if (health < rise + fall - 1) {
-                               health++; /* was bad, stays for a while */
+       case CHK_RES_PASSED:
+       case CHK_RES_CONDPASS:  /* "condpass" cannot make the first step but it OK after a "passed" */
+               if ((check->health < check->rise + check->fall - 1) &&
+                   (check->result == CHK_RES_PASSED || check->health > 0)) {
+                       report = 1;
+                       check->health++;
  
-                               if (health == rise)
-                                       state = SRV_ST_RUNNING;
+                       if (check->health >= check->rise)
+                               check->health = check->rise + check->fall - 1; /* OK now */
+               }
  
-                               if (health >= rise)
-                                       health = rise + fall - 1; /* OK now */
-                       }
+               /* clear consecutive_errors if observing is enabled */
+               if (s->onerror)
+                       s->consecutive_errors = 0;
+               break;
  
-                       /* clear consecutive_errors if observing is enabled */
-                       if (s->onerror)
-                               s->consecutive_errors = 0;
-                       break;
-               default:
-                       break;
-               }
+       default:
+               break;
+       }
  
-               chunk_appendf(&trash,
+       if (s->proxy->options2 & PR_O2_LOGHCHKS &&
+           (status != prev_status || report)) {
+               chunk_printf(&trash,
                              "Health check for %sserver %s/%s %s%s",
                              s->flags & SRV_F_BACKUP ? "backup " : "",
                              s->proxy->id, s->id,
                              (check->result == CHK_RES_CONDPASS) ? "conditionally ":"",
-                            (check->result >= CHK_RES_PASSED)   ? "succeeded":"failed");
+                            (check->result >= CHK_RES_PASSED)   ? "succeeded" : "failed");
  
                 check_report_srv_status(&trash, s, check, -1);
  
                 chunk_appendf(&trash, ", status: %d/%d %s",
-                            (state != SRV_ST_STOPPED) ? (health - rise + 1) : (health),
-                            (state != SRV_ST_STOPPED) ? (fall) : (rise),
-                            (state != SRV_ST_STOPPED) ? (s->uweight?"UP":"DRAIN"):"DOWN");
+                            (check->health >= check->rise) ? check->health - check->rise + 1 : check->health,
+                            (check->health >= check->rise) ? check->fall : check->rise,
+                            (check->health >= check->rise) ? (s->uweight ? "UP" : "DRAIN") : "DOWN");
  
                 Warning("%s.\n", trash.str);
                 send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str);
@@ -332,6 +322,15 @@ static void check_set_server_down(struct check *check)
         if ((s->admin & SRV_ADMF_MAINT) || s->state == SRV_ST_STOPPED)
                 return;
  
+       /* The agent secondary check should only cause a server to be marked
+        * as down if check->status is HCHK_STATUS_L7STS, which indicates
+        * that the agent returned "fail", "stopped" or "down".
+        * The implication here is that failure to connect to the agent
+        * as a secondary check should not cause the server to be marked
+        * down. */
+       if ((check->state & CHK_ST_AGENT) && check->status != HCHK_STATUS_L7STS)
+               return;
+
         check->health = 0; /* failure */
         s->last_change = now.tv_sec;
         s->state = SRV_ST_STOPPED;
@@ -500,27 +499,6 @@ static void check_set_server_drain(struct check *check)
                 check->health = check->rise + check->fall - 1; /* OK now */
  }
  
-static void check_failed(struct check *check)
-{
-       struct server *s = check->server;
-
-       /* The agent secondary check should only cause a server to be marked
-        * as down if check->status is HCHK_STATUS_L7STS, which indicates
-        * that the agent returned "fail", "stopped" or "down".
-        * The implication here is that failure to connect to the agent
-        * as a secondary check should not cause the server to be marked
-        * down. */
-       if ((check->state & CHK_ST_AGENT) && check->status != HCHK_STATUS_L7STS)
-               return;
-
-       if (check->health > check->rise) {
-               check->health--; /* still good */
-               s->counters.failed_checks++;
-       }
-       else
-               check_set_server_down(check);
-}
-
  /* note: use health_adjust() only, which first checks that the observe mode is
   * enabled.
   */
@@ -577,7 +555,8 @@ void __health_adjust(struct server *s, short status)
                 case HANA_ONERR_FAILCHK:
                 /* simulate a failed health check */
                         set_server_check_status(&s->check, HCHK_STATUS_HANA, trash.str);
-                       check_failed(&s->check);
+                       if (!s->check.health)
+                               check_set_server_down(&s->check);
  
                         break;
  
@@ -585,7 +564,8 @@ void __health_adjust(struct server *s, short status)
                 /* mark server down */
                         s->check.health = s->check.rise;
                         set_server_check_status(&s->check, HCHK_STATUS_HANA, trash.str);
-                       check_set_server_down(&s->check);
+                       if (!s->check.health)
+                               check_set_server_down(&s->check);
  
                         break;
  
@@ -1528,7 +1508,8 @@ static struct task *process_chk(struct task *t)
                 /* here, we have seen a synchronous error, no fd was allocated */
  
                 check->state &= ~CHK_ST_INPROGRESS;
-               check_failed(check);
+               if (!check->health)
+                       check_set_server_down(check);
  
                 /* we allow up to min(inter, timeout.connect) for a connection
                  * to establish but only when timeout.check is set
@@ -1577,21 +1558,20 @@ static struct task *process_chk(struct task *t)
  
                 if (check->result == CHK_RES_FAILED) {
                         /* a failure or timeout detected */
-                       check_failed(check);
+                       if (!check->health && check->server->state != SRV_ST_STOPPED)
+                               check_set_server_down(check);
                 }
                 else if (check->result == CHK_RES_CONDPASS && s->state != SRV_ST_STOPPED && !(s->admin & SRV_ADMF_MAINT)) {
                         /* check is OK but asks for drain mode */
-                       if (check->health < check->rise + check->fall - 1) {
-                               check->health++;
+                       if (check->health >= check->rise && check->server->state != SRV_ST_STOPPING)
                                 check_set_server_drain(check);
-                       }
                 }
                 else if (check->result == CHK_RES_PASSED && !(s->admin & SRV_ADMF_MAINT)) {
                         /* check is OK */
-                       if (check->health < check->rise + check->fall - 1) {
-                               check->health++;
+                       if (check->health >= check->rise &&
+                           (check->server->state == SRV_ST_STOPPING ||
+                            check->server->state == SRV_ST_STOPPED))
                                 check_set_server_up(check);
-                       }
                 }
                 check->state &= ~CHK_ST_INPROGRESS;
author	Willy Tarreau <w@1wt.eu>
	Tue, 20 May 2014 12:55:13 +0000 (14:55 +0200)
committer	Willy Tarreau <w@1wt.eu>
	Fri, 23 May 2014 12:29:11 +0000 (14:29 +0200)