From: Dmitry Kurochkin Date: Wed, 30 Mar 2011 19:39:14 +0000 (-0600) Subject: If a worker process crashes during shutdown, dump core and prevent restarts. X-Git-Tag: take06~27^2~49 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=ca4b9ee6b684afffb9c4d96b28a166627eee7722;p=thirdparty%2Fsquid.git If a worker process crashes during shutdown, dump core and prevent restarts. Before the change, if a worker process crashes during shutdown, death() handler would exit with code 1, and master process would restart the worker. Now workers send SIGUSR1 to master when shutting down. When master process gets the SIGUSR1 signal, it stops restarting workers. SIGUSR1 is already used for log rotation, but it is fine to use SIGUSR1 for master process shutdown notifications because master is never responsible for both log rotation and kid restarts. Terminate with abort(3) instead of exit(3) to leave a core dump if Squid worker crashes during shutdown. Also the patch fixes potential infinite loop in master process. Master used to finish only when all kids exited with success, or all kids are hopeless, or all kids were killed by a signal, but when some kids are hopeless and others were killed, the master process would not exit. After the change, master exits when there are no running kids and no kids should be restarted. Add syslog notice if kid becomes hopeless. --- diff --git a/src/ipc/Kid.cc b/src/ipc/Kid.cc index 1768d0e7af..5fe8849b46 100644 --- a/src/ipc/Kid.cc +++ b/src/ipc/Kid.cc @@ -6,6 +6,7 @@ */ #include "config.h" +#include "globals.h" #include "ipc/Kid.h" #if HAVE_SYS_WAIT_H @@ -64,6 +65,18 @@ bool Kid::running() const return isRunning; } +/// returns true if master process should restart this kid +bool Kid::shouldRestart() const +{ + return !(running() || + exitedHappy() || + hopeless() || + shutting_down || + signaled(SIGKILL) || // squid -k kill + signaled(SIGINT) || // unexpected forced shutdown + signaled(SIGTERM)); // unexpected forced shutdown +} + /// returns current pid for a running kid and last pid for a stopped kid pid_t Kid::getPid() const { diff --git a/src/ipc/Kid.h b/src/ipc/Kid.h index fbdf189dd3..184f2391e4 100644 --- a/src/ipc/Kid.h +++ b/src/ipc/Kid.h @@ -40,6 +40,9 @@ public: /// returns true if tracking of kid is stopped bool running() const; + /// returns true if master should restart this kid + bool shouldRestart() const; + /// returns current pid for a running kid and last pid for a stopped kid pid_t getPid() const; diff --git a/src/ipc/Kids.cc b/src/ipc/Kids.cc index 1f1af2e4d6..836e97bad8 100644 --- a/src/ipc/Kids.cc +++ b/src/ipc/Kids.cc @@ -80,14 +80,34 @@ bool Kids::allExitedHappy() const return true; } -/// whether all kids died from a given signal -bool Kids::allSignaled(int sgnl) const +/// whether some kids died from a given signal +bool Kids::someSignaled(const int sgnl) const { for (size_t i = 0; i < storage.size(); ++i) { - if (!storage[i].signaled(sgnl)) - return false; + if (storage[i].signaled(sgnl)) + return true; } - return true; + return false; +} + +/// whether some kids are running +bool Kids::someRunning() const +{ + for (size_t i = 0; i < storage.size(); ++i) { + if (storage[i].running()) + return true; + } + return false; +} + +/// whether some kids should be restarted by master +bool Kids::shouldRestartSome() const +{ + for (size_t i = 0; i < storage.size(); ++i) { + if (storage[i].shouldRestart()) + return true; + } + return false; } /// returns the number of kids diff --git a/src/ipc/Kids.h b/src/ipc/Kids.h index bfa3735a28..f6369423f0 100644 --- a/src/ipc/Kids.h +++ b/src/ipc/Kids.h @@ -36,8 +36,14 @@ public: /// whether all kids called exited happy bool allExitedHappy() const; - /// whether all kids died from a given signal - bool allSignaled(int sgnl) const; + /// whether some kids died from a given signal + bool someSignaled(const int sgnl) const; + + /// whether some kids are running + bool someRunning() const; + + /// whether some kids should be restarted by master + bool shouldRestartSome() const; /// returns the number of kids size_t count() const; diff --git a/src/main.cc b/src/main.cc index 1c2d8ca772..ffc2332068 100644 --- a/src/main.cc +++ b/src/main.cc @@ -614,14 +614,24 @@ shut_down(int sig) shutdown_status = 1; #endif + + const pid_t ppid = getppid(); + + if (ppid > 1) { + // notify master that we are shutting down + if (kill(ppid, SIGUSR1) < 0) + debugs(1, DBG_IMPORTANT, "Failed to send SIGUSR1 to master process," + " pid " << ppid << ": " << xstrerror()); + } + #ifndef _SQUID_MSWIN_ #if KILL_PARENT_OPT - if (getppid() > 1) { - debugs(1, 1, "Killing master process, pid " << getppid()); + if (ppid > 1) { + debugs(1, DBG_IMPORTANT, "Killing master process, pid " << ppid); - if (kill(getppid(), sig) < 0) - debugs(1, 1, "kill " << getppid() << ": " << xstrerror()); + if (kill(ppid, sig) < 0) + debugs(1, DBG_IMPORTANT, "kill " << ppid << ": " << xstrerror()); } #endif /* KILL_PARENT_OPT */ @@ -1682,6 +1692,9 @@ watch_child(char *argv[]) dup2(nullfd, 2); } + // handle shutdown notifications from kids + squid_signal(SIGUSR1, sig_shutdown, SA_RESTART); + if (Config.workers > 128) { syslog(LOG_ALERT, "Suspiciously high workers value: %d", Config.workers); @@ -1696,7 +1709,7 @@ watch_child(char *argv[]) // start each kid that needs to be [re]started; once for (int i = TheKids.count() - 1; i >= 0; --i) { Kid& kid = TheKids.get(i); - if (kid.hopeless() || kid.exitedHappy() || kid.running()) + if (!kid.shouldRestart()) continue; if ((pid = fork()) == 0) { @@ -1742,6 +1755,11 @@ watch_child(char *argv[]) } else { syslog(LOG_NOTICE, "Squid Parent: child process %d exited", kid->getPid()); } + if (kid->hopeless()) { + syslog(LOG_NOTICE, "Squid Parent: child process %d will not" + " be restarted due to repeated, frequent failures", + kid->getPid()); + } } else { syslog(LOG_NOTICE, "Squid Parent: unknown child process %d exited", pid); } @@ -1752,24 +1770,20 @@ watch_child(char *argv[]) while ((pid = waitpid(-1, &status, WNOHANG)) > 0); #endif - if (TheKids.allExitedHappy()) { - exit(0); - } + if (!TheKids.someRunning() && !TheKids.shouldRestartSome()) { + if (TheKids.someSignaled(SIGINT) || TheKids.someSignaled(SIGTERM)) { + syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown"); + exit(1); + } - if (TheKids.allHopeless()) { - syslog(LOG_ALERT, "Exiting due to repeated, frequent failures"); - exit(1); - } + if (TheKids.allHopeless()) { + syslog(LOG_ALERT, "Exiting due to repeated, frequent failures"); + exit(1); + } - if (TheKids.allSignaled(SIGKILL)) { exit(0); } - if (TheKids.allSignaled(SIGINT) || TheKids.allSignaled(SIGTERM)) { - syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown"); - exit(1); - } - squid_signal(SIGINT, SIG_DFL, SA_RESTART); sleep(3); } diff --git a/src/protos.h b/src/protos.h index acd7411f3b..bac9cfca70 100644 --- a/src/protos.h +++ b/src/protos.h @@ -561,6 +561,7 @@ SQUIDCEXTERN void safeunlink(const char *path, int quiet); void death(int sig); void sigusr2_handle(int sig); void sig_child(int sig); +void sig_shutdown(int sig); ///< handles shutdown notifications from kids SQUIDCEXTERN void leave_suid(void); SQUIDCEXTERN void enter_suid(void); SQUIDCEXTERN void no_suid(void); diff --git a/src/tools.cc b/src/tools.cc index 64af91020a..7f9353f78e 100644 --- a/src/tools.cc +++ b/src/tools.cc @@ -397,9 +397,6 @@ death(int sig) puts(dead_msg()); } - if (shutting_down) - exit(1); - abort(); } @@ -595,6 +592,12 @@ sig_child(int sig) #endif } +void +sig_shutdown(int sig) +{ + shutting_down = 1; +} + const char * getMyHostname(void) {