]> git.ipfire.org Git - thirdparty/squid.git/commitdiff
Automatically revive hopeless kids on reconfigure and after a timeout.
author Alex Rousskov <rousskov@measurement-factory.com>
Wed, 2 Aug 2017 22:13:27 +0000 (16:13 -0600)
committer Alex Rousskov <rousskov@measurement-factory.com>
Thu, 3 Aug 2017 00:06:11 +0000 (18:06 -0600)
Squid declares kids with "repeated, frequent failures" as hopeless.
Hopeless kids were not automatically restarted. In the absence of
automated recovery, admins were forced to restart the whole Squid
instance (after fixing the underlying problem that led to those kid
failures). Squid instance restarts hurt users.

In many cases, the underlying kid-killing problem is temporary, and
Squid can eventually fully recover without any admin involvement.

Squid now automatically restarts a hopeless kid after a configurable
downtime (a new hopeless_kid_revival_delay directive with a 60-minute
default value).

Also restart all hopeless kids upon receiving a reconfiguration signal.

Also avoid sending signals to non-running kids, fixing an old minor bug.

src/SquidConfig.h
src/cf.data.pre
src/ipc/Kid.cc
src/ipc/Kid.h
src/ipc/Kids.cc
src/ipc/Kids.h
src/main.cc
src/tools.cc

index f42acdaeb87a71825325427c795cf27e499e6d7c..d152d4c9b3689e3fc3b57ed79ee8e0a7e9ca8869 100644 (file)
@@ -96,6 +96,7 @@ public:
     time_t positiveDnsTtl;
     time_t shutdownLifetime;
     time_t backgroundPingRate;
+    time_t hopelessKidRevivalDelay; ///< hopeless_kid_revival_delay
 
     struct {
         time_t read;
index 600f59590abed5da6d635b409f63d17e1db15dd2..2aab4b19c1664b60ee501e54f03abfe6151479b4 100644 (file)
@@ -473,6 +473,26 @@ DOC_START
        CAP_IPC_LOCK capability, or equivalent.
 DOC_END
 
+NAME: hopeless_kid_revival_delay
+COMMENT: time-units
+TYPE: time_t
+LOC: Config.hopelessKidRevivalDelay
+DEFAULT: 1 hour
+DOC_START
+       Normally, when a kid process dies, Squid immediately restarts the
+       kid. A kid experiencing frequent deaths is marked as "hopeless" for
+       the duration specified by this directive. Hopeless kids are not
+       automatically restarted until that duration elapses.
+
+       Currently, zero values are not supported because they result in
+       misconfigured SMP Squid instances running forever, endlessly
+       restarting each dying kid. To effectively disable hopeless kid
+       revival, set the delay to a huge value (e.g., 1 year).
+
+       Reconfiguration also clears all hopeless kid designations, allowing
+       for manual revival of hopeless kids.
+DOC_END
+
 COMMENT_START
  OPTIONS FOR AUTHENTICATION
  -----------------------------------------------------------------------------
index 9f650f47f3fc9f491a8aef991fec113da28f498e..2e0ea4e2752d94cea87c947d1cbf5102439559c5 100644 (file)
@@ -11,6 +11,7 @@
 #include "squid.h"
 #include "globals.h"
 #include "ipc/Kid.h"
+#include "SquidConfig.h"
 
 #include <ctime>
 #if HAVE_SYS_WAIT_H
@@ -45,8 +46,9 @@ void Kid::start(pid_t cpid)
     assert(cpid > 0);
 
     isRunning = true;
+    stopTime = 0;
     pid = cpid;
-    time(&startTime);
+    startTime = squid_curtime;
 }
 
 /// called when kid terminates, sets exiting status
@@ -57,15 +59,39 @@ Kid::stop(PidStatus const theExitStatus)
     assert(startTime != 0);
 
     isRunning = false;
+    stopTime = squid_curtime;
+    status = theExitStatus;
 
-    time_t stop_time;
-    time(&stop_time);
-    if ((stop_time - startTime) < fastFailureTimeLimit)
+    if ((stopTime - startTime) < fastFailureTimeLimit)
         ++badFailures;
     else
         badFailures = 0; // the failures are not "frequent" [any more]
 
-    status = theExitStatus;
+    reportStopped(); // after all state changes
+}
+
+/// describes a recently stopped kid
+void
+Kid::reportStopped() const
+{
+    if (calledExit()) {
+        syslog(LOG_NOTICE,
+               "Squid Parent: %s process %d exited with status %d",
+               theName.termedBuf(), pid, exitStatus());
+    } else if (signaled()) {
+        syslog(LOG_NOTICE,
+               "Squid Parent: %s process %d exited due to signal %d with status %d",
+               theName.termedBuf(), pid, termSignal(), exitStatus());
+    } else {
+        syslog(LOG_NOTICE, "Squid Parent: %s process %d exited",
+               theName.termedBuf(), pid);
+    }
+
+    if (hopeless() && Config.hopelessKidRevivalDelay) {
+        syslog(LOG_NOTICE, "Squid Parent: %s process %d will not be restarted for %ld "
+               "seconds due to repeated, frequent failures",
+               theName.termedBuf(), pid, Config.hopelessKidRevivalDelay);
+    }
 }
 
 /// returns true if tracking of kid is stopped
@@ -147,3 +173,9 @@ const String& Kid::name() const
     return theName;
 }
 
+time_t
+Kid::deathDuration() const // time elapsed between stopTime and squid_curtime
+{
+    return squid_curtime > stopTime ? squid_curtime - stopTime : 0; // zero when squid_curtime is not later than stopTime
+}
+
index d3f5d18dd365922acc7ff4e7cc4d07e4bd6c52df..186bf20163b7344696a3d614f4901e25006475de 100644 (file)
@@ -47,6 +47,12 @@ public:
     /// whether the failures are "repeated and frequent"
     bool hopeless() const;
 
+    /// forgets all past failures, ensuring that we are not hopeless()
+    void forgetFailures() { badFailures = 0; }
+
+    /// \returns the time since process termination
+    time_t deathDuration() const;
+
     /// returns true if the process terminated normally
     bool calledExit() const;
 
@@ -72,6 +78,8 @@ public:
     const String& name() const;
 
 private:
+    void reportStopped() const;
+
     // Information preserved across restarts
     String theName; ///< process name
     int badFailures; ///< number of "repeated frequent" failures
@@ -79,6 +87,7 @@ private:
     // Information specific to a running or stopped kid
     pid_t  pid; ///< current (for a running kid) or last (for stopped kid) PID
     time_t startTime; ///< last start time
+    time_t stopTime = 0; ///< last termination time
     bool   isRunning; ///< whether the kid is assumed to be alive
     PidStatus status; ///< exit status of a stopped kid
 };
index 7cb5704ea5511ad132572d822bb221d5224c5330..e599bb28b09f6f9edd9a93387e0cfc9c2e95cc4a 100644 (file)
@@ -82,6 +82,35 @@ bool Kids::allHopeless() const
     return true;
 }
 
+void
+Kids::forgetAllFailures() // calls forgetFailures() on every kid in storage
+{
+    for (auto &kid: storage)
+        kid.forgetFailures();
+}
+
+time_t
+Kids::forgetOldFailures() // forgets failures of hopeless kids dead for at least Config.hopelessKidRevivalDelay
+{
+    time_t nextCheckDelay = 0; // smallest remaining wait among still-hopeless kids; zero means none found so far
+    for (auto &kid: storage) {
+        if (!kid.hopeless())
+            continue; // only hopeless kids are considered
+
+        const auto deathDuration = kid.deathDuration(); // protect from time changes
+        if (Config.hopelessKidRevivalDelay <= deathDuration) {
+            kid.forgetFailures(); // this kid will be revived now
+            continue;
+        }
+
+        const auto remainingDeathTime = Config.hopelessKidRevivalDelay - deathDuration;
+        assert(remainingDeathTime > 0); // follows from the failed <= comparison above
+        if (remainingDeathTime < nextCheckDelay || !nextCheckDelay)
+            nextCheckDelay = remainingDeathTime; // keep the shortest remaining wait
+    }
+    return nextCheckDelay; // still zero if there were no still-hopeless kids
+}
+
 /// whether all kids called exited happy
 bool Kids::allExitedHappy() const
 {
index 23788bf33a8ddd7195dd59f654f5fe070ccad01d..58b46fc9313f039d4401a8f782421fc9c550c059 100644 (file)
@@ -36,6 +36,13 @@ public:
     /// whether all kids are hopeless
     bool allHopeless() const;
 
+    /// forgets all failures in all kids
+    void forgetAllFailures();
+
+    /// forgets all failures in hopeless kids that were dead for a long time
+    /// \returns seconds till the next check (zero if there are no hopeless kids left)
+    time_t forgetOldFailures();
+
     /// whether all kids called exited happy
     bool allExitedHappy() const;
 
index 6f3e321f578e19234c3a8cf91fd7bcaae175c08b..f481f92e6d1f4972b58b089c5e4b0779b8cf1166 100644 (file)
@@ -153,12 +153,14 @@ static int malloc_debug_level = 0;
 static volatile int do_reconfigure = 0;
 static volatile int do_rotate = 0;
 static volatile int do_shutdown = 0;
+static volatile int do_revive_kids = 0;
 static volatile int shutdown_status = EXIT_SUCCESS;
 static volatile int do_handle_stopped_child = 0;
 
 static int RotateSignal = -1;
 static int ReconfigureSignal = -1;
 static int ShutdownSignal = -1;
+static int ReviveKidsSignal = -1;
 
 static void mainRotate(void);
 static void mainReconfigureStart(void);
@@ -735,6 +737,19 @@ reconfigure(int sig)
 #endif
 }
 
+void
+master_revive_kids(int sig) // signal handler; registered for SIGALRM in watch_child()
+{
+    ReviveKidsSignal = sig; // a positive value makes masterSignaled() return true
+    do_revive_kids = true; // checked by masterCheckAndBroadcastSignals()
+
+#if !_SQUID_WINDOWS_
+#if !HAVE_SIGACTION
+    signal(sig, master_revive_kids); // registers this handler again when HAVE_SIGACTION is not set
+#endif
+#endif
+}
+
 /// Shutdown signal handler for master process
 void
 master_shutdown(int sig)
@@ -1774,32 +1789,87 @@ mainStartScript(const char *prog)
     }
 }
 
-#endif /* _SQUID_WINDOWS_ */
+/// Initiates shutdown sequence. Shutdown ends when the last running kid stops.
+static void
+masterShutdownStart()
+{
+    if (AvoidSignalAction("shutdown", do_shutdown))
+        return; // AvoidSignalAction() told us to skip this action
+    debugs(1, 2, "received shutdown command");
+    shutting_down = 1;
+}
 
-#if !_SQUID_WINDOWS_
+/// Initiates reconfiguration sequence. See also: masterReconfigureFinish().
 static void
-masterCheckAndBroadcastSignals()
+masterReconfigureStart()
 {
-    // if (do_reconfigure)
-    //     TODO: hot-reconfiguration of the number of kids and PID file location
+    if (AvoidSignalAction("reconfiguration", do_reconfigure))
+        return; // AvoidSignalAction() told us to skip this action
+    debugs(1, 2, "received reconfiguration command");
+    reconfiguring = 1; // reset by masterReconfigureFinish()
+    TheKids.forgetAllFailures(); // clears every kid's failures so that none stays hopeless()
+    // TODO: hot-reconfiguration of the number of kids, kids revival delay,
+    // PID file location, etc.
+}
 
+/// Ends reconfiguration sequence started by masterReconfigureStart().
+static void
+masterReconfigureFinish()
+{
+    reconfiguring = 0; // clears the flag set by masterReconfigureStart()
+}
+
+/// Reacts to the kid revival alarm (flagged by master_revive_kids() via do_revive_kids).
+static void
+masterReviveKids()
+{
+    if (AvoidSignalAction("kids revival", do_revive_kids))
+        return; // AvoidSignalAction() told us to skip this action
+    debugs(1, 2, "woke up after ~" << Config.hopelessKidRevivalDelay << "s");
+    // Nothing to do here: actual revival happens elsewhere in the main loop.
+    // The alarm was needed just to wake us up so that we do a loop iteration.
+}
+
+static void
+masterCheckAndBroadcastSignals() // acts on pending do_* flags, then passes recorded signals to BroadcastSignalIfAny()
+{
     if (do_shutdown)
-        shutting_down = 1;
+        masterShutdownStart();
+    if (do_reconfigure)
+        masterReconfigureStart();
+    if (do_revive_kids)
+        masterReviveKids();
+
+    // emulate multi-step reconfiguration assumed by AvoidSignalAction()
+    if (reconfiguring)
+        masterReconfigureFinish();
 
     BroadcastSignalIfAny(DebugSignal);
     BroadcastSignalIfAny(RotateSignal);
     BroadcastSignalIfAny(ReconfigureSignal);
     BroadcastSignalIfAny(ShutdownSignal);
+    ReviveKidsSignal = -1; // alarms are not broadcast to kids; just mark this one as handled
+}
+
+/// Maintains the following invariant: An alarm should be scheduled when and
+/// only when there are hopeless kid(s) that cannot be immediately revived.
+static void
+masterMaintainKidRevivalSchedule()
+{
+    const auto nextCheckDelay = TheKids.forgetOldFailures(); // zero if there are no still-hopeless kids
+    assert(nextCheckDelay >= 0);
+    (void)alarm(static_cast<unsigned int>(nextCheckDelay)); // resets or cancels; NOTE(review): a delay above UINT_MAX would be truncated by this cast
+    if (nextCheckDelay)
+        debugs(1, 2, "will recheck hopeless kids in " << nextCheckDelay << " seconds");
+}
-#endif
 
 static inline bool
 masterSignaled()
 {
-    return (DebugSignal > 0 || RotateSignal > 0 || ReconfigureSignal > 0 || ShutdownSignal > 0);
+    return (DebugSignal > 0 || RotateSignal > 0 || ReconfigureSignal > 0 ||
+            ShutdownSignal > 0 || ReviveKidsSignal > 0); // true while any of these holds a recorded (positive) signal number
 }
 
-#if !_SQUID_WINDOWS_
 /// makes the caller a daemon process running in the background
 static void
 GoIntoBackground()
@@ -1815,6 +1885,23 @@ GoIntoBackground()
     }
     // child, running as a background daemon (or a failed-to-fork parent)
 }
+
+static void
+masterExit() // logs the reason (if any) and terminates the process; every path calls exit()
+{
+    if (TheKids.someSignaled(SIGINT) || TheKids.someSignaled(SIGTERM)) {
+        syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown");
+        exit(EXIT_FAILURE);
+    }
+
+    if (TheKids.allHopeless()) {
+        syslog(LOG_ALERT, "Exiting due to repeated, frequent failures");
+        exit(EXIT_FAILURE);
+    }
+
+    exit(EXIT_SUCCESS); // neither of the failure conditions above applied
+}
+
 #endif /* !_SQUID_WINDOWS_ */
 
 static void
@@ -1830,6 +1917,12 @@ watch_child(char *argv[])
 
     int nullfd;
 
+    // TODO: zero values are not supported because they result in
+    // misconfigured SMP Squid instances running forever, endlessly
+    // restarting each dying kid.
+    if (Config.hopelessKidRevivalDelay <= 0)
+        throw TexcHere("hopeless_kid_revival_delay must be positive");
+
     enter_suid();
 
     openlog(APP_SHORTNAME, LOG_PID | LOG_NDELAY | LOG_CONS, LOG_LOCAL4);
@@ -1892,6 +1985,7 @@ watch_child(char *argv[])
     squid_signal(SIGHUP, reconfigure, 0);
 
     squid_signal(SIGTERM, master_shutdown, 0);
+    squid_signal(SIGALRM, master_revive_kids, 0);
     squid_signal(SIGINT, master_shutdown, 0);
 #ifdef SIGTTIN
     squid_signal(SIGTTIN, master_shutdown, 0);
@@ -1904,6 +1998,8 @@ watch_child(char *argv[])
     }
     TheKids.init();
 
+    configured_once = 1;
+
     syslog(LOG_NOTICE, "Squid Parent: will start %d kids", (int)TheKids.count());
 
     // keep [re]starting kids until it is time to quit
@@ -1951,33 +2047,16 @@ watch_child(char *argv[])
             waitFlag = WNOHANG;
         PidStatus status;
         pid = WaitForAnyPid(status, waitFlag);
+        getCurrentTime();
 
         // check for a stopped kid
-        Kid* kid = pid > 0 ? TheKids.find(pid) : NULL;
-        if (kid) {
+        if (Kid *kid = pid > 0 ? TheKids.find(pid) : nullptr)
             kid->stop(status);
-            if (kid->calledExit()) {
-                syslog(LOG_NOTICE,
-                       "Squid Parent: %s process %d exited with status %d",
-                       kid->name().termedBuf(),
-                       kid->getPid(), kid->exitStatus());
-            } else if (kid->signaled()) {
-                syslog(LOG_NOTICE,
-                       "Squid Parent: %s process %d exited due to signal %d with status %d",
-                       kid->name().termedBuf(),
-                       kid->getPid(), kid->termSignal(), kid->exitStatus());
-            } else {
-                syslog(LOG_NOTICE, "Squid Parent: %s process %d exited",
-                       kid->name().termedBuf(), kid->getPid());
-            }
-            if (kid->hopeless()) {
-                syslog(LOG_NOTICE, "Squid Parent: %s process %d will not"
-                       " be restarted due to repeated, frequent failures",
-                       kid->name().termedBuf(), kid->getPid());
-            }
-        } else if (pid > 0) {
+        else if (pid > 0)
             syslog(LOG_NOTICE, "Squid Parent: unknown child process %d exited", pid);
-        }
+
+        masterCheckAndBroadcastSignals();
+        masterMaintainKidRevivalSchedule();
 
         if (!TheKids.someRunning() && !TheKids.shouldRestartSome()) {
             leave_suid();
@@ -1985,21 +2064,8 @@ watch_child(char *argv[])
             // RegisteredRunner::startShutdown which promises a loop iteration.
             RunRegisteredHere(RegisteredRunner::finishShutdown);
             enter_suid();
-
-            if (TheKids.someSignaled(SIGINT) || TheKids.someSignaled(SIGTERM)) {
-                syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown");
-                exit(EXIT_FAILURE);
-            }
-
-            if (TheKids.allHopeless()) {
-                syslog(LOG_ALERT, "Exiting due to repeated, frequent failures");
-                exit(EXIT_FAILURE);
-            }
-
-            exit(EXIT_SUCCESS);
+            masterExit();
         }
-
-        masterCheckAndBroadcastSignals();
     }
 
     /* NOTREACHED */
index 8f78023f0640ebe8adaf41e0d187c2aa2f760de6..44a0397d483effce32bbf8bc7faf156f1a462a13 100644 (file)
@@ -365,8 +365,9 @@ BroadcastSignalIfAny(int& sig)
     if (sig > 0) {
         if (IamMasterProcess()) {
             for (int i = TheKids.count() - 1; i >= 0; --i) {
-                Kid& kid = TheKids.get(i);
-                kill(kid.getPid(), sig);
+                const auto &kid = TheKids.get(i);
+                if (kid.running())
+                    kill(kid.getPid(), sig);
             }
         }
         sig = -1;