From: Alex Rousskov Date: Wed, 2 Aug 2017 22:13:27 +0000 (-0600) Subject: Automatically revive hopeless kids on reconfigure and after a timeout. X-Git-Tag: M-staged-PR71~66 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=00e2479;p=thirdparty%2Fsquid.git Automatically revive hopeless kids on reconfigure and after a timeout. Squid declares kids with "repeated, frequent failures" as hopeless. Hopeless kids were not automatically restarted. In the absence of automated recovery, admins were forced to restart the whole Squid instance (after fixing the underlying problem that led to those kid failures). Squid instance restarts hurt users. In many cases, the underlying kid-killing problem is temporary, and Squid can eventually fully recover without any admin involvement. Squid now automatically restarts a hopeless kid after a configurable downtime (a new hopeless_kid_revival_delay directive with a 60 minute default value). Also restart all hopeless kids upon receiving a reconfiguration signal. Also avoid sending signals to non-running kids, fixing an old minor bug. --- diff --git a/src/SquidConfig.h b/src/SquidConfig.h index f42acdaeb8..d152d4c9b3 100644 --- a/src/SquidConfig.h +++ b/src/SquidConfig.h @@ -96,6 +96,7 @@ public: time_t positiveDnsTtl; time_t shutdownLifetime; time_t backgroundPingRate; + time_t hopelessKidRevivalDelay; ///< hopeless_kid_revival_delay struct { time_t read; diff --git a/src/cf.data.pre b/src/cf.data.pre index 600f59590a..2aab4b19c1 100644 --- a/src/cf.data.pre +++ b/src/cf.data.pre @@ -473,6 +473,26 @@ DOC_START CAP_IPC_LOCK capability, or equivalent. DOC_END +NAME: hopeless_kid_revival_delay +COMMENT: time-units +TYPE: time_t +LOC: Config.hopelessKidRevivalDelay +DEFAULT: 1 hour +DOC_START + Normally, when a kid process dies, Squid immediately restarts the + kid. A kid experiencing frequent deaths is marked as "hopeless" for + the duration specified by this directive. Hopeless kids are not + automatically restarted. 
+ + Currently, zero values are not supported because they result in + misconfigured SMP Squid instances running forever, endlessly + restarting each dying kid. To effectively disable hopeless kids + revival, set the delay to a huge value (e.g., 1 year). + + Reconfiguration also clears all hopeless kids designations, allowing + for manual revival of hopeless kids. +DOC_END + COMMENT_START OPTIONS FOR AUTHENTICATION ----------------------------------------------------------------------------- diff --git a/src/ipc/Kid.cc b/src/ipc/Kid.cc index 9f650f47f3..2e0ea4e275 100644 --- a/src/ipc/Kid.cc +++ b/src/ipc/Kid.cc @@ -11,6 +11,7 @@ #include "squid.h" #include "globals.h" #include "ipc/Kid.h" +#include "SquidConfig.h" #include <ctime> #if HAVE_SYS_WAIT_H @@ -45,8 +46,9 @@ void Kid::start(pid_t cpid) assert(cpid > 0); isRunning = true; + stopTime = 0; pid = cpid; - time(&startTime); + startTime = squid_curtime; } /// called when kid terminates, sets exiting status @@ -57,15 +59,39 @@ Kid::stop(PidStatus const theExitStatus) assert(startTime != 0); isRunning = false; + stopTime = squid_curtime; + status = theExitStatus; - time_t stop_time; - time(&stop_time); - if ((stop_time - startTime) < fastFailureTimeLimit) + if ((stopTime - startTime) < fastFailureTimeLimit) ++badFailures; else badFailures = 0; // the failures are not "frequent" [any more] - status = theExitStatus; + reportStopped(); // after all state changes +} + +/// describes a recently stopped kid +void +Kid::reportStopped() const +{ + if (calledExit()) { + syslog(LOG_NOTICE, + "Squid Parent: %s process %d exited with status %d", + theName.termedBuf(), pid, exitStatus()); + } else if (signaled()) { + syslog(LOG_NOTICE, + "Squid Parent: %s process %d exited due to signal %d with status %d", + theName.termedBuf(), pid, termSignal(), exitStatus()); + } else { + syslog(LOG_NOTICE, "Squid Parent: %s process %d exited", + theName.termedBuf(), pid); + } + + if (hopeless() && Config.hopelessKidRevivalDelay) { + 
syslog(LOG_NOTICE, "Squid Parent: %s process %d will not be restarted for %ld " + "seconds due to repeated, frequent failures", + theName.termedBuf(), pid, Config.hopelessKidRevivalDelay); + } } /// returns true if tracking of kid is stopped @@ -147,3 +173,9 @@ const String& Kid::name() const return theName; } +time_t +Kid::deathDuration() const +{ + return squid_curtime > stopTime ? squid_curtime - stopTime : 0; +} + diff --git a/src/ipc/Kid.h b/src/ipc/Kid.h index d3f5d18dd3..186bf20163 100644 --- a/src/ipc/Kid.h +++ b/src/ipc/Kid.h @@ -47,6 +47,12 @@ public: /// whether the failures are "repeated and frequent" bool hopeless() const; + /// forgets all past failures, ensuring that we are not hopeless() + void forgetFailures() { badFailures = 0; } + + /// \returns the time since process termination + time_t deathDuration() const; + /// returns true if the process terminated normally bool calledExit() const; @@ -72,6 +78,8 @@ public: const String& name() const; private: + void reportStopped() const; + // Information preserved across restarts String theName; ///< process name int badFailures; ///< number of "repeated frequent" failures @@ -79,6 +87,7 @@ private: // Information specific to a running or stopped kid pid_t pid; ///< current (for a running kid) or last (for stopped kid) PID time_t startTime; ///< last start time + time_t stopTime = 0; ///< last termination time bool isRunning; ///< whether the kid is assumed to be alive PidStatus status; ///< exit status of a stopped kid }; diff --git a/src/ipc/Kids.cc b/src/ipc/Kids.cc index 7cb5704ea5..e599bb28b0 100644 --- a/src/ipc/Kids.cc +++ b/src/ipc/Kids.cc @@ -82,6 +82,35 @@ bool Kids::allHopeless() const return true; } +void +Kids::forgetAllFailures() +{ + for (auto &kid: storage) + kid.forgetFailures(); +} + +time_t +Kids::forgetOldFailures() +{ + time_t nextCheckDelay = 0; + for (auto &kid: storage) { + if (!kid.hopeless()) + continue; + + const auto deathDuration = kid.deathDuration(); // protect from time 
changes + if (Config.hopelessKidRevivalDelay <= deathDuration) { + kid.forgetFailures(); // this kid will be revived now + continue; + } + + const auto remainingDeathTime = Config.hopelessKidRevivalDelay - deathDuration; + assert(remainingDeathTime > 0); + if (remainingDeathTime < nextCheckDelay || !nextCheckDelay) + nextCheckDelay = remainingDeathTime; + } + return nextCheckDelay; // still zero if there were no still-hopeless kids +} + /// whether all kids called exited happy bool Kids::allExitedHappy() const { diff --git a/src/ipc/Kids.h b/src/ipc/Kids.h index 23788bf33a..58b46fc931 100644 --- a/src/ipc/Kids.h +++ b/src/ipc/Kids.h @@ -36,6 +36,13 @@ public: /// whether all kids are hopeless bool allHopeless() const; + /// forgets all failures in all kids + void forgetAllFailures(); + + /// forgets all failures in hopeless kids that were dead for a long time + /// \returns seconds till the next check (zero if there are no hopeless kids left) + time_t forgetOldFailures(); + /// whether all kids called exited happy bool allExitedHappy() const; diff --git a/src/main.cc b/src/main.cc index 6f3e321f57..f481f92e6d 100644 --- a/src/main.cc +++ b/src/main.cc @@ -153,12 +153,14 @@ static int malloc_debug_level = 0; static volatile int do_reconfigure = 0; static volatile int do_rotate = 0; static volatile int do_shutdown = 0; +static volatile int do_revive_kids = 0; static volatile int shutdown_status = EXIT_SUCCESS; static volatile int do_handle_stopped_child = 0; static int RotateSignal = -1; static int ReconfigureSignal = -1; static int ShutdownSignal = -1; +static int ReviveKidsSignal = -1; static void mainRotate(void); static void mainReconfigureStart(void); @@ -735,6 +737,19 @@ reconfigure(int sig) #endif } +void +master_revive_kids(int sig) +{ + ReviveKidsSignal = sig; + do_revive_kids = true; + +#if !_SQUID_WINDOWS_ +#if !HAVE_SIGACTION + signal(sig, master_revive_kids); +#endif +#endif +} + /// Shutdown signal handler for master process void master_shutdown(int 
sig) @@ -1774,32 +1789,87 @@ mainStartScript(const char *prog) } } -#endif /* _SQUID_WINDOWS_ */ +/// Initiates shutdown sequence. Shutdown ends when the last running kid stops. +static void +masterShutdownStart() +{ + if (AvoidSignalAction("shutdown", do_shutdown)) + return; + debugs(1, 2, "received shutdown command"); + shutting_down = 1; +} -#if !_SQUID_WINDOWS_ +/// Initiates reconfiguration sequence. See also: masterReconfigureFinish(). static void -masterCheckAndBroadcastSignals() +masterReconfigureStart() { - // if (do_reconfigure) - // TODO: hot-reconfiguration of the number of kids and PID file location + if (AvoidSignalAction("reconfiguration", do_reconfigure)) + return; + debugs(1, 2, "received reconfiguration command"); + reconfiguring = 1; + TheKids.forgetAllFailures(); + // TODO: hot-reconfiguration of the number of kids, kids revival delay, + // PID file location, etc. +} +/// Ends reconfiguration sequence started by masterReconfigureStart(). +static void +masterReconfigureFinish() +{ + reconfiguring = 0; +} + +/// Reacts to the kid revival alarm. 
+static void +masterReviveKids() +{ + if (AvoidSignalAction("kids revival", do_revive_kids)) + return; + debugs(1, 2, "woke up after ~" << Config.hopelessKidRevivalDelay << "s"); + // nothing to do here -- actual revival happens elsewhere in the main loop + // the alarm was needed just to wake us up so that we do a loop iteration +} + +static void +masterCheckAndBroadcastSignals() +{ if (do_shutdown) - shutting_down = 1; + masterShutdownStart(); + if (do_reconfigure) + masterReconfigureStart(); + if (do_revive_kids) + masterReviveKids(); + + // emulate multi-step reconfiguration assumed by AvoidSignalAction() + if (reconfiguring) + masterReconfigureFinish(); BroadcastSignalIfAny(DebugSignal); BroadcastSignalIfAny(RotateSignal); BroadcastSignalIfAny(ReconfigureSignal); BroadcastSignalIfAny(ShutdownSignal); + ReviveKidsSignal = -1; // alarms are not broadcasted +} + +/// Maintains the following invariant: An alarm should be scheduled when and +/// only when there are hopeless kid(s) that cannot be immediately revived. 
+static void +masterMaintainKidRevivalSchedule() +{ + const auto nextCheckDelay = TheKids.forgetOldFailures(); + assert(nextCheckDelay >= 0); + (void)alarm(static_cast<unsigned int>(nextCheckDelay)); // resets or cancels + if (nextCheckDelay) + debugs(1, 2, "will recheck hopeless kids in " << nextCheckDelay << " seconds"); } -#endif static inline bool masterSignaled() { - return (DebugSignal > 0 || RotateSignal > 0 || ReconfigureSignal > 0 || ShutdownSignal > 0); + return (DebugSignal > 0 || RotateSignal > 0 || ReconfigureSignal > 0 || + ShutdownSignal > 0 || ReviveKidsSignal > 0); } -#if !_SQUID_WINDOWS_ /// makes the caller a daemon process running in the background static void GoIntoBackground() @@ -1815,6 +1885,23 @@ GoIntoBackground() } // child, running as a background daemon (or a failed-to-fork parent) } + +static void +masterExit() +{ + if (TheKids.someSignaled(SIGINT) || TheKids.someSignaled(SIGTERM)) { + syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown"); + exit(EXIT_FAILURE); + } + + if (TheKids.allHopeless()) { + syslog(LOG_ALERT, "Exiting due to repeated, frequent failures"); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} + #endif /* !_SQUID_WINDOWS_ */ static void @@ -1830,6 +1917,12 @@ watch_child(char *argv[]) int nullfd; + // TODO: zero values are not supported because they result in + // misconfigured SMP Squid instances running forever, endlessly + // restarting each dying kid. 
+ if (Config.hopelessKidRevivalDelay <= 0) + throw TexcHere("hopeless_kid_revival_delay must be positive"); + enter_suid(); openlog(APP_SHORTNAME, LOG_PID | LOG_NDELAY | LOG_CONS, LOG_LOCAL4); @@ -1892,6 +1985,7 @@ watch_child(char *argv[]) squid_signal(SIGHUP, reconfigure, 0); squid_signal(SIGTERM, master_shutdown, 0); + squid_signal(SIGALRM, master_revive_kids, 0); squid_signal(SIGINT, master_shutdown, 0); #ifdef SIGTTIN squid_signal(SIGTTIN, master_shutdown, 0); @@ -1904,6 +1998,8 @@ watch_child(char *argv[]) } TheKids.init(); + configured_once = 1; + syslog(LOG_NOTICE, "Squid Parent: will start %d kids", (int)TheKids.count()); // keep [re]starting kids until it is time to quit @@ -1951,33 +2047,16 @@ watch_child(char *argv[]) waitFlag = WNOHANG; PidStatus status; pid = WaitForAnyPid(status, waitFlag); + getCurrentTime(); // check for a stopped kid - Kid* kid = pid > 0 ? TheKids.find(pid) : NULL; - if (kid) { + if (Kid *kid = pid > 0 ? TheKids.find(pid) : nullptr) kid->stop(status); - if (kid->calledExit()) { - syslog(LOG_NOTICE, - "Squid Parent: %s process %d exited with status %d", - kid->name().termedBuf(), - kid->getPid(), kid->exitStatus()); - } else if (kid->signaled()) { - syslog(LOG_NOTICE, - "Squid Parent: %s process %d exited due to signal %d with status %d", - kid->name().termedBuf(), - kid->getPid(), kid->termSignal(), kid->exitStatus()); - } else { - syslog(LOG_NOTICE, "Squid Parent: %s process %d exited", - kid->name().termedBuf(), kid->getPid()); - } - if (kid->hopeless()) { - syslog(LOG_NOTICE, "Squid Parent: %s process %d will not" - " be restarted due to repeated, frequent failures", - kid->name().termedBuf(), kid->getPid()); - } - } else if (pid > 0) { + else if (pid > 0) syslog(LOG_NOTICE, "Squid Parent: unknown child process %d exited", pid); - } + + masterCheckAndBroadcastSignals(); + masterMaintainKidRevivalSchedule(); if (!TheKids.someRunning() && !TheKids.shouldRestartSome()) { leave_suid(); @@ -1985,21 +2064,8 @@ watch_child(char 
*argv[]) // RegisteredRunner::startShutdown which promises a loop iteration. RunRegisteredHere(RegisteredRunner::finishShutdown); enter_suid(); - - if (TheKids.someSignaled(SIGINT) || TheKids.someSignaled(SIGTERM)) { - syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown"); - exit(EXIT_FAILURE); - } - - if (TheKids.allHopeless()) { - syslog(LOG_ALERT, "Exiting due to repeated, frequent failures"); - exit(EXIT_FAILURE); - } - - exit(EXIT_SUCCESS); + masterExit(); } - - masterCheckAndBroadcastSignals(); } /* NOTREACHED */ diff --git a/src/tools.cc b/src/tools.cc index 8f78023f06..44a0397d48 100644 --- a/src/tools.cc +++ b/src/tools.cc @@ -365,8 +365,9 @@ BroadcastSignalIfAny(int& sig) if (sig > 0) { if (IamMasterProcess()) { for (int i = TheKids.count() - 1; i >= 0; --i) { - Kid& kid = TheKids.get(i); - kill(kid.getPid(), sig); + const auto &kid = TheKids.get(i); + if (kid.running()) + kill(kid.getPid(), sig); } } sig = -1;