Unix MPMs: Shut down the server more quickly when child processes are slow to exit.

author Jeff Trawick <trawick@apache.org>

Sat, 18 Sep 2004 00:33:56 +0000 (00:33 +0000)

committer Jeff Trawick <trawick@apache.org>

Sat, 18 Sep 2004 00:33:56 +0000 (00:33 +0000)
author Jeff Trawick <trawick@apache.org>
Sat, 18 Sep 2004 00:33:56 +0000 (00:33 +0000)
committer Jeff Trawick <trawick@apache.org>
Sat, 18 Sep 2004 00:33:56 +0000 (00:33 +0000)
diff --git a/CHANGES b/CHANGES

index b995a527b92aa7d0258320369bbb62fb3ed7f500..a0e2201f7708d6e16671c5f437a83c200f981842 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -2,6 +2,9 @@ Changes with Apache 2.1.0-dev
  
    [Remove entries to the current 2.0 section below, when backported]
  
+  *) Unix MPMs: Shut down the server more quickly when child processes are
+     slow to exit.  [Joe Orton, Jeff Trawick]
+
    *) mod_info: Added listing of the Request Hooks and added more build 
       information like 'httpd -V' contains. Changed output to XHTML. 
       [Paul Querna]
diff --git a/server/mpm_common.c b/server/mpm_common.c

index 101a0af7793c5fd89f641c5a45ac42d7cd578817..e4c6de073ad24b5a05de58d081c2d9f1b086e380 100644 (file)
--- a/server/mpm_common.c
+++ b/server/mpm_common.c
@@ -61,22 +61,55 @@
  #ifdef AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
  void ap_reclaim_child_processes(int terminate)
  {
-    int i;
-    long int waittime = 1024 * 16;      /* in usecs */
+    apr_time_t waittime = 1024 * 16;
      apr_status_t waitret;
-    int tries;
+    int i;
      int not_dead_yet;
      int max_daemons;
+    apr_time_t starttime = apr_time_now();
+    /* this table of actions and elapsed times tells what action is taken
+     * at which elapsed time from starting the reclaim
+     */
+    struct {
+        enum {DO_NOTHING, SEND_SIGTERM, SEND_SIGKILL, GIVEUP} action;
+        apr_time_t action_time;
+    } action_table[] = {
+        {DO_NOTHING, 0}, /* dummy entry for iterations where we reap
+                          * children but take no action against
+                          * stragglers
+                          */
+        {SEND_SIGTERM, apr_time_from_sec(3)},
+        {SEND_SIGTERM, apr_time_from_sec(5)},
+        {SEND_SIGTERM, apr_time_from_sec(7)},
+        {SEND_SIGKILL, apr_time_from_sec(9)},
+        {GIVEUP,       apr_time_from_sec(10)}
+    };
+    int cur_action;      /* index of action we decided to take this
+                          * iteration
+                          */
+    int next_action = 1; /* index of first real action */
  
      ap_mpm_query(AP_MPMQ_MAX_DAEMON_USED, &max_daemons);
  
-    for (tries = terminate ? 4 : 1; tries <= 9; ++tries) {
-        /* don't want to hold up progress any more than
-         * necessary, but we need to allow children a few moments to exit.
-         * Set delay with an exponential backoff.
-         */
+    do {
          apr_sleep(waittime);
+        /* don't let waittime get longer than 1 second; otherwise, we don't
+         * react quickly to the last child exiting, and taking action can
+         * be delayed
+         */
          waittime = waittime * 4;
+        if (waittime > apr_time_from_sec(1)) {
+            waittime = apr_time_from_sec(1);
+        }
+
+        /* see what action to take, if any */
+        if (action_table[next_action].action_time <= apr_time_now() - starttime) {
+            cur_action = next_action;
+            ++next_action;
+        }
+        else {
+            cur_action = 0; /* nothing to do */
+        }
  
          /* now see who is done */
          not_dead_yet = 0;
@@ -95,32 +128,28 @@ void ap_reclaim_child_processes(int terminate)
              }
  
              ++not_dead_yet;
-            switch (tries) {
-            case 1:     /*  16ms */
-            case 2:     /*  82ms */
-            case 3:     /* 344ms */
-            case 4:     /*  16ms */
+            switch(action_table[cur_action].action) {
+            case DO_NOTHING:
                  break;
-
-            case 5:     /*  82ms */
-            case 6:     /* 344ms */
-            case 7:     /* 1.4sec */
+                
+            case SEND_SIGTERM:
                  /* ok, now it's being annoying */
                  ap_log_error(APLOG_MARK, APLOG_WARNING,
                               0, ap_server_conf,
-                             "child process %ld still did not exit, "
+                             "child process %" APR_PID_T_FMT
+                             " still did not exit, "
                               "sending a SIGTERM",
-                             (long)pid);
+                             pid);
                  kill(pid, SIGTERM);
                  break;
-
-            case 8:     /*  6 sec */
-                /* die child scum */
+                
+            case SEND_SIGKILL:
                  ap_log_error(APLOG_MARK, APLOG_ERR,
                               0, ap_server_conf,
-                             "child process %ld still did not exit, "
+                             "child process %" APR_PID_T_FMT
+                             "  still did not exit, "
                               "sending a SIGKILL",
-                             (long)pid);
+                             pid);
  #ifndef BEOS
                  kill(pid, SIGKILL);
  #else
@@ -133,8 +162,8 @@ void ap_reclaim_child_processes(int terminate)
                  kill_thread(pid);
  #endif
                  break;
-
-            case 9:     /* 14 sec */
+                
+            case GIVEUP:
                  /* gave it our best shot, but alas...  If this really
                   * is a child we are trying to kill and it really hasn't
                   * exited, we will likely fail to bind to the port
@@ -142,9 +171,10 @@ void ap_reclaim_child_processes(int terminate)
                   */
                  ap_log_error(APLOG_MARK, APLOG_ERR,
                               0, ap_server_conf,
-                             "could not make child process %ld exit, "
+                             "could not make child process %" APR_PID_T_FMT
+                             " exit, "
                               "attempting to continue anyway",
-                             (long)pid);
+                             pid);
                  break;
              }
          }
@@ -153,11 +183,8 @@ void ap_reclaim_child_processes(int terminate)
          apr_proc_other_child_refresh_all(APR_OC_REASON_RESTART);
  #endif
  
-        if (!not_dead_yet) {
-            /* nothing left to wait for */
-            break;
-        }
-    }
+    } while (not_dead_yet > 0 &&
+             action_table[cur_action].action != GIVEUP);
  }
  #endif /* AP_MPM_WANT_RECLAIM_CHILD_PROCESSES */
author	Jeff Trawick <trawick@apache.org>
	Sat, 18 Sep 2004 00:33:56 +0000 (00:33 +0000)
committer	Jeff Trawick <trawick@apache.org>
	Sat, 18 Sep 2004 00:33:56 +0000 (00:33 +0000)
CHANGES		patch | blob | blame | history
server/mpm_common.c		patch | blob | blame | history