MEDIUM: server: properly support and propagate the maintenance status

author Willy Tarreau <w@1wt.eu>

Fri, 16 May 2014 09:25:16 +0000 (11:25 +0200)

committer Willy Tarreau <w@1wt.eu>

Thu, 22 May 2014 09:27:00 +0000 (11:27 +0200)
author Willy Tarreau <w@1wt.eu>
Fri, 16 May 2014 09:25:16 +0000 (11:25 +0200)
committer Willy Tarreau <w@1wt.eu>
Thu, 22 May 2014 09:27:00 +0000 (11:27 +0200)
diff --git a/include/proto/server.h b/include/proto/server.h

index 277974a96d94446d34c3bd53c4d0512423aaa773..e069f7fc8611c399d577431d0742c5a8aa33f5ea 100644 (file)
--- a/include/proto/server.h
+++ b/include/proto/server.h
@@ -116,6 +116,32 @@ void srv_shutdown_sessions(struct server *srv, int why);
   */
  void srv_shutdown_backup_sessions(struct proxy *px, int why);
  
+/* Appends some information to a message string related to a server going UP or DOWN.
+ * If <forced> is null and the server tracks another one, a "via" information will
+ * be provided to know where the status came from. If xferred is non-negative, some
+ * information about requeued sessions is provided.
+ */
+void srv_adm_append_status(struct chunk *msg, struct server *s, int xferred, int forced);
+
+/* Puts server <s> into maintenance mode, and propagates that status down to all
+ * tracking servers. This does the same action as the CLI's "disable server x".
+ * A log is emitted for all servers that were not yet in maintenance mode.
+ * Health checks are disabled but not agent checks. The server is marked as
+ * being either forced into maintenance by having <mode> set to SRV_ADMF_FMAINT,
+ * or as inheriting the maintenance status by having <mode> set to
+ * SRV_ADMF_IMAINT. Nothing is done if neither flag is set.
+ */
+void srv_adm_set_maint(struct server *s, enum srv_admin mode);
+
+/* Gets server <s> out of maintenance mode, and propagates that status down to
+ * all tracking servers. This does the same action as the CLI's "enable server x".
+ * A log is emitted for all servers that leave maintenance mode. Health checks
+ * are possibly enabled again. The server is marked as leaving forced maintenance
+ * when <mode> is set to SRV_ADMF_FMAINT, or as leaving inherited maintenance
+ * when <mode> is set to SRV_ADMF_IMAINT. Nothing is done if neither flag is set.
+ */
+void srv_adm_set_ready(struct server *s, enum srv_admin mode);
+
  /*
   * Local variables:
   *  c-indent-level: 8
diff --git a/src/cfgparse.c b/src/cfgparse.c

index 5384f4f959c43628f04d2e11419d611f51cb0259..08168a1713f6b2be55f63b5b3f25703eee83e90f 100644 (file)
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -6618,6 +6618,7 @@ out_uri_auth_compat:
  
                                 /* if the other server is forced disabled, we have to do the same here */
                                 if (srv->admin & SRV_ADMF_MAINT) {
+                                       newsrv->admin |= SRV_ADMF_IMAINT;
                                         newsrv->state = SRV_ST_STOPPED;
                                         newsrv->check.health = 0;
                                         newsrv->agent.health = 0;
diff --git a/src/dumpstats.c b/src/dumpstats.c

index fbe1d2ab099e54549dde65f827e376d2cec1805a..d0cd63261071ef353a939a419c8392c14947adfa 100644 (file)
--- a/src/dumpstats.c
+++ b/src/dumpstats.c
@@ -1710,26 +1710,7 @@ static int stats_sock_parse_request(struct stream_interface *si, char *line)
                         if (!sv)
                                 return 1;
  
-                       if (sv->admin & SRV_ADMF_MAINT) {
-                               /* The server is really in maintenance, we can change the server state */
-                               if (sv->track) {
-                                       /* If this server tracks the status of another one,
-                                       * we must restore the good status.
-                                       */
-                                       if (sv->track->state != SRV_ST_STOPPED) {
-                                               set_server_up(&sv->check);
-                                               sv->check.health = sv->check.rise;      /* up, but will fall down at first failure */
-                                       } else {
-                                               sv->admin &= ~SRV_ADMF_FMAINT;
-                                               sv->check.state &= ~CHK_ST_PAUSED;
-                                               set_server_down(&sv->check);
-                                       }
-                               } else {
-                                       set_server_up(&sv->check);
-                                       sv->check.health = sv->check.rise;      /* up, but will fall down at first failure */
-                               }
-                       }
-
+                       srv_adm_set_ready(sv, SRV_ADMF_FMAINT);
                         return 1;
                 }
                 else if (strcmp(args[1], "frontend") == 0) {
@@ -1782,13 +1763,7 @@ static int stats_sock_parse_request(struct stream_interface *si, char *line)
                         if (!sv)
                                 return 1;
  
-                       if (!(sv->admin & SRV_ADMF_MAINT)) {
-                               /* Not already in maintenance, we can change the server state */
-                               sv->admin |= SRV_ADMF_FMAINT;
-                               sv->check.state |= CHK_ST_PAUSED;
-                               set_server_down(&sv->check);
-                       }
-
+                       srv_adm_set_maint(sv, SRV_ADMF_FMAINT);
                         return 1;
                 }
                 else if (strcmp(args[1], "frontend") == 0) {
@@ -2780,7 +2755,7 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in
                         "<i>no check</i>"
                 };
  
-               if ((sv->admin | ref->admin) & SRV_ADMF_MAINT)
+               if (sv->admin & SRV_ADMF_MAINT)
                         chunk_appendf(&trash, "<tr class=\"maintain\">");
                 else
                         chunk_appendf(&trash,
@@ -2915,10 +2890,6 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in
                         chunk_appendf(&trash, "%s ", human_time(now.tv_sec - sv->last_change, 1));
                         chunk_appendf(&trash, "MAINT");
                 }
-               else if (ref != sv && (ref->admin & SRV_ADMF_MAINT)) {
-                       chunk_appendf(&trash, "%s ", human_time(now.tv_sec - ref->last_change, 1));
-                       chunk_appendf(&trash, "MAINT(via)");
-               }
                 else if (ref->check.state & CHK_ST_ENABLED) {
                         chunk_appendf(&trash, "%s ", human_time(now.tv_sec - ref->last_change, 1));
                         chunk_appendf(&trash,
@@ -2975,13 +2946,11 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in
                                       ref->observe ? "/Health Analyses" : "",
                                       ref->counters.down_trans, human_time(srv_downtime(sv), 1));
                 }
-               else if (sv != ref) {
-                       if (sv->admin & SRV_ADMF_MAINT)
-                               chunk_appendf(&trash, "<td class=ac colspan=3></td>");
-                       else
-                               chunk_appendf(&trash,
-                                             "<td class=ac colspan=3><a class=lfsb href=\"#%s/%s\">via %s/%s</a></td>",
-                                             ref->proxy->id, ref->id, ref->proxy->id, ref->id);
+               else if (!(sv->admin & SRV_ADMF_FMAINT) && sv != ref) {
+                       /* tracking a server */
+                       chunk_appendf(&trash,
+                                     "<td class=ac colspan=3><a class=lfsb href=\"#%s/%s\">via %s/%s</a></td>",
+                                     ref->proxy->id, ref->id, ref->proxy->id, ref->id);
                 }
                 else
                         chunk_appendf(&trash, "<td colspan=3></td>");
@@ -3030,10 +2999,10 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in
                               sv->counters.retries, sv->counters.redispatches);
  
                 /* status */
-               if (sv->admin & SRV_ADMF_MAINT)
+               if (sv->admin & SRV_ADMF_IMAINT)
+                       chunk_appendf(&trash, "MAINT (via %s/%s),", ref->proxy->id, ref->id);
+               else if (sv->admin & SRV_ADMF_MAINT)
                         chunk_appendf(&trash, "MAINT,");
-               else if (ref != sv && (ref->admin & SRV_ADMF_MAINT))
-                       chunk_appendf(&trash, "MAINT(via),");
                 else
                         chunk_appendf(&trash,
                                       srv_hlt_st[state],
@@ -4249,32 +4218,17 @@ static int stats_process_http_post(struct stream_interface *si)
                                 else if ((sv = findserver(px, value)) != NULL) {
                                         switch (action) {
                                         case ST_ADM_ACTION_DISABLE:
-                                               if ((px->state != PR_STSTOPPED) && !(sv->admin & SRV_ADMF_MAINT)) {
-                                                       /* Not already in maintenance, we can change the server state */
-                                                       sv->admin |= SRV_ADMF_FMAINT;
-                                                       sv->check.state |= CHK_ST_PAUSED;
-                                                       set_server_down(&sv->check);
+                                               if ((px->state != PR_STSTOPPED) && !(sv->admin & SRV_ADMF_FMAINT)) {
                                                         altered_servers++;
                                                         total_servers++;
+                                                       srv_adm_set_maint(sv, SRV_ADMF_FMAINT);
                                                 }
                                                 break;
                                         case ST_ADM_ACTION_ENABLE:
-                                               if ((px->state != PR_STSTOPPED) && (sv->admin & SRV_ADMF_MAINT)) {
-                                                       /* Already in maintenance, we can change the server state.
-                                                        * If this server tracks the status of another one,
-                                                        * we must restore the good status.
-                                                        */
-                                                       if (!sv->track || (sv->track->state != SRV_ST_STOPPED)) {
-                                                               set_server_up(&sv->check);
-                                                               sv->check.health = sv->check.rise;      /* up, but will fall down at first failure */
-                                                       }
-                                                       else {
-                                                               sv->admin &= ~SRV_ADMF_FMAINT;
-                                                               sv->check.state &= ~CHK_ST_PAUSED;
-                                                               set_server_down(&sv->check);
-                                                       }
+                                               if ((px->state != PR_STSTOPPED) && (sv->admin & SRV_ADMF_FMAINT)) {
                                                         altered_servers++;
                                                         total_servers++;
+                                                       srv_adm_set_ready(sv, SRV_ADMF_FMAINT);
                                                 }
                                                 break;
                                         case ST_ADM_ACTION_STOP:
diff --git a/src/server.c b/src/server.c

index 07d243694e127dd65e8601d088879beca3d61eee..f0fb0a76dc771b92174124e27e16ee36a3420dd1 100644 (file)
--- a/src/server.c
+++ b/src/server.c
@@ -186,6 +186,242 @@ void srv_shutdown_backup_sessions(struct proxy *px, int why)
                         srv_shutdown_sessions(srv, why);
  }
  
+/* Appends to <msg> some details about server <s> going UP or DOWN. When <forced>
+ * is zero and <s> tracks another server, " via <proxy>/<server>" naming the tracked
+ * server is appended. When <xferred> is non-negative, the proxy's active and backup
+ * server counts plus session/queue figures (including <xferred>) are appended too.
+ */
+void srv_adm_append_status(struct chunk *msg, struct server *s, int xferred, int forced)
+{
+       if (!forced && s->track)
+               chunk_appendf(msg, " via %s/%s",
+                       s->track->proxy->id, s->track->id);
+
+       if (xferred >= 0) {
+               if (s->state == SRV_ST_STOPPED)
+                       chunk_appendf(msg, ". %d active and %d backup servers left.%s"
+                               " %d sessions active, %d requeued, %d remaining in queue",
+                               s->proxy->srv_act, s->proxy->srv_bck,
+                               (s->proxy->srv_bck && !s->proxy->srv_act) ? " Running on backup." : "",
+                               s->cur_sess, xferred, s->nbpend);
+               else
+                       chunk_appendf(msg, ". %d active and %d backup servers online.%s"
+                               " %d sessions requeued, %d total in queue",
+                               s->proxy->srv_act, s->proxy->srv_bck,
+                               (s->proxy->srv_bck && !s->proxy->srv_act) ? " Running on backup." : "",
+                               xferred, s->nbpend);
+       }
+}
+
+/* Puts server <s> into maintenance mode, and propagates that status down to all
+ * tracking servers. This does the same action as the CLI's "disable server x".
+ * A log is emitted for all servers that were not yet in maintenance mode.
+ * Health checks are paused but not agent checks. The server is marked as
+ * being either forced into maintenance by having <mode> set to SRV_ADMF_FMAINT,
+ * or as inheriting it by having <mode> set to SRV_ADMF_IMAINT (always the case
+ * for tracking servers). Nothing is done if <mode> is zero or already set on <s>.
+ */
+void srv_adm_set_maint(struct server *s, enum srv_admin mode)
+{
+       struct check *check = &s->check;
+       struct server *srv;
+       int xferred;
+
+       if (!mode)
+               return;
+
+       /* stop descending the tracking chain once a server already carries <mode> */
+       if (s->admin & mode)
+               return;
+
+       s->admin |= mode;
+
+       if (s->check.state & CHK_ST_ENABLED) {
+               s->check.state |= CHK_ST_PAUSED;
+               check->health = 0;
+       }
+
+       if (s->state == SRV_ST_STOPPED) {       /* server was already down */
+               if (!(s->admin & ~mode & SRV_ADMF_MAINT)) {
+                       chunk_printf(&trash,
+                                    "%sServer %s/%s was DOWN and now enters maintenance",
+                                    s->flags & SRV_F_BACKUP ? "Backup " : "", s->proxy->id, s->id);
+
+                       srv_adm_append_status(&trash, s, -1, (mode & SRV_ADMF_FMAINT));
+
+                       Warning("%s.\n", trash.str);
+                       send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str);
+               }
+       }
+       else {  /* server was still running */
+               int srv_was_stopping = (s->state == SRV_ST_STOPPING);
+               int prev_srv_count = s->proxy->srv_bck + s->proxy->srv_act;
+
+               check->health = 0; /* failure */
+               s->last_change = now.tv_sec;
+               s->state = SRV_ST_STOPPED;
+               if (s->proxy->lbprm.set_server_status_down)
+                       s->proxy->lbprm.set_server_status_down(s);
+
+               if (s->onmarkeddown & HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS)
+                       srv_shutdown_sessions(s, SN_ERR_DOWN);
+
+               /* we might have sessions queued on this server and waiting for
+                * a connection. Those which are redispatchable will be queued
+                * to another server or to the proxy itself.
+                */
+               xferred = pendconn_redistribute(s);
+
+               chunk_printf(&trash,
+                            "%sServer %s/%s is going DOWN for maintenance",
+                            s->flags & SRV_F_BACKUP ? "Backup " : "",
+                            s->proxy->id, s->id);
+
+               srv_adm_append_status(&trash, s, xferred, (mode & SRV_ADMF_FMAINT));
+
+               Warning("%s.\n", trash.str);
+               send_log(s->proxy, srv_was_stopping ? LOG_NOTICE : LOG_ALERT, "%s.\n", trash.str);
+
+               if (prev_srv_count && s->proxy->srv_bck == 0 && s->proxy->srv_act == 0)
+                       set_backend_down(s->proxy);
+
+               s->counters.down_trans++;
+       }
+
+       for (srv = s->trackers; srv; srv = srv->tracknext)
+               srv_adm_set_maint(srv, SRV_ADMF_IMAINT);
+}
+
+/* Gets server <s> out of maintenance mode, and propagates that status down to
+ * all tracking servers. This does the same action as the CLI's "enable server x".
+ * A log is emitted for all servers that leave maintenance mode. Health checks
+ * are possibly enabled again. <mode> is SRV_ADMF_FMAINT to leave forced maintenance
+ * or SRV_ADMF_IMAINT to leave inherited maintenance; nothing is done if neither is
+ * set. If another maintenance flag remains on <s>, only a log is emitted.
+ */
+void srv_adm_set_ready(struct server *s, enum srv_admin mode)
+{
+       struct check *check = &s->check;
+       struct server *srv;
+       int xferred = -1;
+
+       if (!mode)
+               return;
+
+       /* stop descending the tracking chain once <mode> is no longer set on a server */
+       if (!(s->admin & mode))
+               return;
+
+       s->admin &= ~mode;
+
+       if (s->admin & SRV_ADMF_MAINT) {
+               /* remaining in maintenance mode, let's inform precisely about the
+                * situation.
+                */
+
+               if (s->admin & SRV_ADMF_FMAINT) {
+                       chunk_printf(&trash,
+                                    "%sServer %s/%s remains in forced maintenance",
+                                    s->flags & SRV_F_BACKUP ? "Backup " : "",
+                                    s->proxy->id, s->id);
+               }
+               else {
+                       chunk_printf(&trash,
+                                    "%sServer %s/%s is leaving forced maintenance but remains in maintenance",
+                                    s->flags & SRV_F_BACKUP ? "Backup " : "",
+                                    s->proxy->id, s->id);
+
+                       if (s->track) /* normally it's mandatory here */
+                               chunk_appendf(&trash, " via %s/%s",
+                                             s->track->proxy->id, s->track->id);
+               }
+
+               Warning("%s.\n", trash.str);
+               send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str);
+               return;
+       }
+
+       /* OK here we're leaving maintenance, we have many things to check,
+        * because the server might possibly be coming back up depending on
+        * its state. In practice, leaving maintenance means that we should
+        * immediately turn to UP (more or less the slowstart) under the
+        * following conditions :
+        *   - server is neither checked nor tracked
+        *   - server tracks another server which is not checked
+        *   - server tracks another server which is already up
+        * Which sums up as something simpler :
+        * "either the server's or the tracked server's checks are disabled or up".
+        * Otherwise we only re-enable health checks.
+        */
+
+       if (s->check.state & CHK_ST_ENABLED) {
+               s->check.state &= ~CHK_ST_PAUSED;
+               check->health = check->rise; /* start OK but check immediately */
+       }
+
+       if ((!s->track &&
+            (!(s->agent.state & CHK_ST_ENABLED) || (s->agent.health >= s->agent.rise)) &&
+            (!(s->check.state & CHK_ST_ENABLED) || (s->check.health >= s->check.rise))) ||
+           (s->track &&
+            (!(s->track->agent.state & CHK_ST_ENABLED) || (s->track->agent.health >= s->track->agent.rise)) &&
+            (!(s->track->check.state & CHK_ST_ENABLED) || (s->track->check.health >= s->track->check.rise)))) {
+
+               if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
+                       if (s->proxy->last_change < now.tv_sec)         // ignore negative times
+                               s->proxy->down_time += now.tv_sec - s->proxy->last_change;
+                       s->proxy->last_change = now.tv_sec;
+               }
+
+               if (s->last_change < now.tv_sec)                        // ignore negative times
+                       s->down_time += now.tv_sec - s->last_change;
+               s->last_change = now.tv_sec;
+
+               s->state = SRV_ST_STARTING;
+               if (s->slowstart > 0)
+                       task_schedule(s->warmup, tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20))));
+               else
+                       s->state = SRV_ST_RUNNING;
+
+               server_recalc_eweight(s);
+
+               /* If the server is set with "on-marked-up shutdown-backup-sessions",
+                * and it's not a backup server and its effective weight is > 0,
+                * then it can accept new connections, so we shut down all sessions
+                * on all backup servers.
+                */
+               if ((s->onmarkedup & HANA_ONMARKEDUP_SHUTDOWNBACKUPSESSIONS) &&
+                   !(s->flags & SRV_F_BACKUP) && s->eweight)
+                       srv_shutdown_backup_sessions(s->proxy, SN_ERR_UP);
+
+               /* check if we can handle some connections queued at the proxy. We
+                * will take as many as we can handle.
+                */
+               xferred = pendconn_grab_from_px(s);
+       }
+
+       if (mode & SRV_ADMF_FMAINT) {
+               chunk_printf(&trash,
+                            "%sServer %s/%s is %s (leaving forced maintenance)",
+                            s->flags & SRV_F_BACKUP ? "Backup " : "",
+                            s->proxy->id, s->id,
+                            (s->state == SRV_ST_STOPPED) ? "DOWN" : "UP");
+       }
+       else {
+               chunk_printf(&trash,
+                            "%sServer %s/%s is %s (leaving maintenance)",
+                            s->flags & SRV_F_BACKUP ? "Backup " : "",
+                            s->proxy->id, s->id,
+                            (s->state == SRV_ST_STOPPED) ? "DOWN" : "UP");
+               srv_adm_append_status(&trash, s, xferred, 0);
+       }
+
+       Warning("%s.\n", trash.str);
+       send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str);
+
+       for (srv = s->trackers; srv; srv = srv->tracknext)
+               srv_adm_set_ready(srv, SRV_ADMF_IMAINT);
+}
+
  /* Note: must not be declared <const> as its list will be overwritten.
   * Please take care of keeping this list alphabetically sorted, doing so helps
   * all code contributors.
author	Willy Tarreau <w@1wt.eu>
	Fri, 16 May 2014 09:25:16 +0000 (11:25 +0200)
committer	Willy Tarreau <w@1wt.eu>
	Thu, 22 May 2014 09:27:00 +0000 (11:27 +0200)
include/proto/server.h		patch \| blob \| blame \| history
src/cfgparse.c		patch \| blob \| blame \| history
src/dumpstats.c		patch \| blob \| blame \| history
src/server.c		patch \| blob \| blame \| history