From: Amos Jeffries <squid3@treenet.co.nz>
Date: Thu, 17 Jul 2008 12:38:06 +0000 (+1200)
Subject: Author: Mark Nottingham <mnot@pobox.com>
X-Git-Tag: SQUID_3_1_0_1~49^2~138
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=32a47e3eefac17ab6d209a357d0f77c2ef452580;p=thirdparty%2Fsquid.git

Author: Mark Nottingham <mnot@pobox.com>
Bug #2376: Round-Robin becomes unbalanced when a peer dies and comes back

When a peer goes down and then comes back, its round-robin counters aren't
reset, causing it to get a disproportionate amount of traffic until it "catches
up" with the rest of the peers in the round-robin pool.

If it was down for load-related issues, this has the effect of making it more
likely that it will go down again, because it's temporarily handling the load
of the entire pool.

Normally, this isn't a concern, because the number of requests that it can get
out-of-step is relatively small (bounded to how many requests it can be given
before it is considered down -- is this 10 in all cases, or are there corner
cases?), but in an accelerator case where the origin has a process-based
request-handling model, or back-end processes are CPU-intensive, it is.

This patch resets the counters each time a peer changes state.
---

diff --git a/src/cache_cf.cc b/src/cache_cf.cc
index 0422705688..60ae59b223 100644
--- a/src/cache_cf.cc
+++ b/src/cache_cf.cc
@@ -1863,7 +1863,7 @@ parse_peer(peer ** head)
 
     *head = p;
 
-    peerClearRR(p);
+    peerClearRRStart();
 }
 
 static void
diff --git a/src/neighbors.cc b/src/neighbors.cc
index 2336375e18..9418187cd8 100644
--- a/src/neighbors.cc
+++ b/src/neighbors.cc
@@ -384,19 +384,67 @@ getWeightedRoundRobinParent(HttpRequest * request)
     return q;
 }
 
-/* This gets called every 5 minutes to clear the round-robin counter. */
+/**
+ * This gets called every 5 minutes to clear the round-robin counter.
+ * The exact timing is an arbitrary default, set on estimate timing of a
+ * large number of requests in a high-performance environment during the
+ * period. The larger the number of requests between cycled resets the
+ * more balanced the operations.
+ *
+ \param data	unused.
+ \todo Make the reset timing a selectable parameter in squid.conf
+ */
+static void 
+peerClearRRLoop(void *data)
+{
+    peerClearRR();
+    eventAdd("peerClearRR", peerClearRRLoop, data, 5 * 60.0, 0);
+}
+
+/**
+ * This gets called on startup and restart to kick off the peer round-robin
+ * maintenance event. It ensures that no matter how many times its called
+ * no more than one event is scheduled.
+ */
 void
-peerClearRR(void *data)
+peerClearRRStart(void)
 {
-    peer *p = (peer *)data;
-    p->rr_count -= p->rr_lastcount;
+    static int event_added = 0;
+    if (!event_added) {
+        peerClearRRLoop(NULL);
+    }
+}
 
-    if (p->rr_count < 0)
+/**
+ * Called whenever the round-robin counters need to be reset to a sane state.
+ * So far those times are:
+ \item On startup and reconfigure - to set the counters to sane initial settings.
+ \item When a peer has revived from dead, to prevent the revived peer being
+ *     flooded with requests which it has 'missed' during the down period.
+ */
+void
+peerClearRR()
+{
+    peer *p = NULL;
+    for (p = Config.peers; p; p = p->next) {
         p->rr_count = 0;
+    }
+}
 
-    p->rr_lastcount = p->rr_count;
+/**
+ * Perform all actions when a peer is detected revived.
+ */
+void
+peerAlive(peer *p)
+{
+    if (p->stats.logged_state == PEER_DEAD && p->tcp_up) {
+        debugs(15, 1, "Detected REVIVED " << neighborTypeStr(p) << ": " << p->name);
+        p->stats.logged_state = PEER_ALIVE;
+        peerClearRR();
+    }
 
-    eventAdd("peerClearRR", peerClearRR, p, 5 * 60.0, 0);
+    p->stats.last_reply = squid_curtime;
+    p->stats.probe_start = 0;
 }
 
 peer *
@@ -857,13 +905,7 @@ peerNoteDigestLookup(HttpRequest * request, peer * p, lookup_t lookup)
 static void
 neighborAlive(peer * p, const MemObject * mem, const icp_common_t * header)
 {
-    if (p->stats.logged_state == PEER_DEAD && p->tcp_up) {
-        debugs(15, 1, "Detected REVIVED " << neighborTypeStr(p) << ": " << p->name);
-        p->stats.logged_state = PEER_ALIVE;
-    }
-
-    p->stats.last_reply = squid_curtime;
-    p->stats.probe_start = 0;
+    peerAlive(p);
     p->stats.pings_acked++;
 
     if ((icp_opcode) header->opcode <= ICP_END)
@@ -901,14 +943,7 @@ neighborUpdateRtt(peer * p, MemObject * mem)
 static void
 neighborAliveHtcp(peer * p, const MemObject * mem, const htcpReplyData * htcp)
 {
-    if (p->stats.logged_state == PEER_DEAD && p->tcp_up) {
-        debugs(15, 1, "Detected REVIVED " << neighborTypeStr(p) << ": " << p->name);
-        p->stats.logged_state = PEER_ALIVE;
-    }
-
-    p->stats.last_reply = squid_curtime;
-    p->stats.probe_start = 0;
-    p->stats.pings_acked++;
+    peerAlive(p);
     p->htcp.counts[htcp->hit ? 1 : 0]++;
     p->htcp.version = htcp->version;
 }
@@ -1317,13 +1352,13 @@ peerConnectSucceded(peer * p)
 {
     if (!p->tcp_up) {
         debugs(15, 2, "TCP connection to " << p->host << "/" << p->http_port << " succeded");
-        debugs(15, 1, "Detected REVIVED " << neighborTypeStr(p) << ": " << p->name);
-        p->stats.logged_state = PEER_ALIVE;
+        p->tcp_up = PEER_TCP_MAGIC_COUNT; // NP: so peerAlive(p) works properly.
+        peerAlive(p);
 	if (!p->n_addresses)
 	    ipcache_nbgethostbyname(p->host, peerDNSConfigure, p);
     }
-
-    p->tcp_up = PEER_TCP_MAGIC_COUNT;
+    else
+        p->tcp_up = PEER_TCP_MAGIC_COUNT;
 }
 
 static void
diff --git a/src/protos.h b/src/protos.h
index b3603b0da2..313358bace 100644
--- a/src/protos.h
+++ b/src/protos.h
@@ -390,7 +390,8 @@ SQUIDCEXTERN peer *peerFindByNameAndPort(const char *, unsigned short);
 SQUIDCEXTERN peer *getDefaultParent(HttpRequest * request);
 SQUIDCEXTERN peer *getRoundRobinParent(HttpRequest * request);
 SQUIDCEXTERN peer *getWeightedRoundRobinParent(HttpRequest * request);
-SQUIDCEXTERN void peerClearRR(void *);
+SQUIDCEXTERN void peerClearRRStart(void);
+SQUIDCEXTERN void peerClearRR(void);
 SQUIDCEXTERN peer *getAnyParent(HttpRequest * request);
 SQUIDCEXTERN lookup_t peerDigestLookup(peer * p, HttpRequest * request);
 SQUIDCEXTERN peer *neighborsDigestSelect(HttpRequest * request);
diff --git a/src/structs.h b/src/structs.h
index 95b2b33de6..852d3a2fd3 100644
--- a/src/structs.h
+++ b/src/structs.h
@@ -963,7 +963,6 @@ struct peer
     IPAddress addresses[10];
     int n_addresses;
     int rr_count;
-    int rr_lastcount;
     peer *next;
     int test_fd;