From: Eduard Bagdasaryan <eduard.bagdasaryan@measurement-factory.com>
Date: Mon, 10 Jul 2023 12:07:53 +0000 (+0000)
Subject: Handle helper program startup failure as its death (#1395)
X-Git-Tag: SQUID_7_0_1~400
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=654835f01f2c65fcb62979a055c2a59e78b7265d;p=thirdparty%2Fsquid.git

Handle helper program startup failure as its death (#1395)

Squid quits when started helper programs die too fast without responding
to requests[^1] because such a Squid instance is unlikely to provide
acceptable service (and a full restart may actually fix the problem or,
at the very least, is more likely to bring the needed admin attention).

The same logic now applies when Squid fails to start a helper (i.e. when
ipcCreate() fails). There is no conceptual difference between those two
kinds of failures as far as helper handling code is concerned, so we now
treat them the same way.

Without these changes, helper start failures may result in an unusable
(but running) Squid instance, especially if no helpers can be started at
all, because new transactions get stuck waiting in the queue until
clients timeout. Such persistent ipcCreate() failures may be caused, for
example, by its fork() hitting an overcommit memory limits.

[^1]: The actual condition excludes cases where at least startup=N
helpers are still running. That exclusion and other helper failure
handling details are problematic, but adjusting that code is outside
_this_ fix scope: Here, we only apply _existing_ handling logic to a
missed case.
---

diff --git a/src/helper.cc b/src/helper.cc
index ca940e578b..c83cb53dd4 100644
--- a/src/helper.cc
+++ b/src/helper.cc
@@ -251,6 +251,8 @@ helperOpenServers(helper * hlp)
 
     assert(nargs <= HELPER_MAX_ARGS);
 
+    int successfullyStarted = 0;
+
     for (k = 0; k < need_new; ++k) {
         getCurrentTime();
         rfd = wfd = -1;
@@ -268,6 +270,7 @@ helperOpenServers(helper * hlp)
             continue;
         }
 
+        ++successfullyStarted;
         ++ hlp->childs.n_running;
         ++ hlp->childs.n_active;
         srv = new helper_server;
@@ -317,6 +320,13 @@ helperOpenServers(helper * hlp)
         comm_read(srv->readPipe, srv->rbuf, srv->rbuf_sz - 1, call);
     }
 
+    // Call handleFewerServers() before hlp->last_restart is updated because
+    // that method uses last_restart to measure the delay since previous start.
+    // TODO: Refactor last_restart code to measure failure frequency rather than
+    // detecting a helper #X failure that is being close to the helper #Y start.
+    if (successfullyStarted < need_new)
+        hlp->handleFewerServers(false);
+
     hlp->last_restart = squid_curtime;
     safe_free(shortname);
     safe_free(procname);
@@ -376,6 +386,8 @@ helperStatefulOpenServers(statefulhelper * hlp)
 
     assert(nargs <= HELPER_MAX_ARGS);
 
+    int successfullyStarted = 0;
+
     for (int k = 0; k < need_new; ++k) {
         getCurrentTime();
         int rfd = -1;
@@ -395,6 +407,7 @@ helperStatefulOpenServers(statefulhelper * hlp)
             continue;
         }
 
+        ++successfullyStarted;
         ++ hlp->childs.n_running;
         ++ hlp->childs.n_active;
         helper_stateful_server *srv = new helper_stateful_server;
@@ -436,6 +449,13 @@ helperStatefulOpenServers(statefulhelper * hlp)
         comm_read(srv->readPipe, srv->rbuf, srv->rbuf_sz - 1, call);
     }
 
+    // Call handleFewerServers() before hlp->last_restart is updated because
+    // that method uses last_restart to measure the delay since previous start.
+    // TODO: Refactor last_restart code to measure failure frequency rather than
+    // detecting a helper #X failure that is being close to the helper #Y start.
+    if (successfullyStarted < need_new)
+        hlp->handleFewerServers(false);
+
     hlp->last_restart = squid_curtime;
     safe_free(shortname);
     safe_free(procname);
@@ -838,21 +858,35 @@ helper::handleKilledServer(HelperServerBase *srv, bool &needsNewServers)
         --childs.n_active;
         debugs(84, DBG_CRITICAL, "WARNING: " << id_name << " #" << srv->index << " exited");
 
-        if (childs.needNew() > 0) {
-            debugs(80, DBG_IMPORTANT, "Too few " << id_name << " processes are running (need " << childs.needNew() << "/" << childs.n_max << ")");
+        handleFewerServers(srv->stats.replies >= 1);
 
-            if (childs.n_active < childs.n_startup && last_restart > squid_curtime - 30) {
-                if (srv->stats.replies < 1)
-                    fatalf("The %s helpers are crashing too rapidly, need help!\n", id_name);
-                else
-                    debugs(80, DBG_CRITICAL, "ERROR: The " << id_name << " helpers are crashing too rapidly, need help!");
-            }
+        if (childs.needNew() > 0) {
             srv->flags.shutdown = true;
             needsNewServers = true;
         }
     }
 }
 
+void
+helper::handleFewerServers(const bool madeProgress)
+{
+    const auto needNew = childs.needNew();
+
+    if (!needNew)
+        return; // some server(s) have died, but we still have enough
+
+    debugs(80, DBG_IMPORTANT, "Too few " << id_name << " processes are running (need " << needNew << "/" << childs.n_max << ")" <<
+           Debug::Extra << "active processes: " << childs.n_active <<
+           Debug::Extra << "processes configured to start at (re)configuration: " << childs.n_startup);
+
+    if (childs.n_active < childs.n_startup && last_restart > squid_curtime - 30) {
+        if (madeProgress)
+            debugs(80, DBG_CRITICAL, "ERROR: The " << id_name << " helpers are crashing too rapidly, need help!");
+        else
+            fatalf("The %s helpers are crashing too rapidly, need help!", id_name);
+    }
+}
+
 void
 helper_server::HelperServerClosed(helper_server *srv)
 {
diff --git a/src/helper.h b/src/helper.h
index 5f01ded6c2..7156488775 100644
--- a/src/helper.h
+++ b/src/helper.h
@@ -103,6 +103,11 @@ public:
     /// \param needsNewServers true if new servers must started, false otherwise
     void handleKilledServer(HelperServerBase *srv, bool &needsNewServers);
 
+    /// Reacts to unexpected server death(s), including a failure to start server(s)
+    /// and an unexpected exit of a previously started server. \sa handleKilledServer()
+    /// \param madeProgress whether the died server(s) responded to any requests
+    void handleFewerServers(bool madeProgress);
+
 public:
     wordlist *cmdline;
     dlink_list servers;