mpm_event, mpm_worker: Remain active amidst prevalent child process resource shortages.

author Jeff Trawick <trawick@apache.org>

Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)

committer Jeff Trawick <trawick@apache.org>

Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
author Jeff Trawick <trawick@apache.org>
Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
committer Jeff Trawick <trawick@apache.org>
Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
diff --git a/CHANGES b/CHANGES

index 38f7db0921e9724bf232c6f4bbd408c98f64d0c3..661ecbc3ce8bcd690340cff7c5b7ceb99c0cf9f3 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,9 @@
                                                           -*- coding: utf-8 -*-
  Changes with Apache 2.5.0
  
+  *) mpm_event, mpm_worker: Remain active amidst prevalent child process
+     resource shortages.  [Jeff Trawick]
+
    *) mpm_event, mpm_worker: Fix cases where the spawn rate wasn't reduced
       after child process resource shortages.  [Jeff Trawick]
  
diff --git a/docs/log-message-tags/next-number b/docs/log-message-tags/next-number

index e2eb34935940782d6b4bac5fd7cfe7bcf42dd102..da898df886b32db114de8bf649e4c0fe86f08782 100644 (file)
--- a/docs/log-message-tags/next-number
+++ b/docs/log-message-tags/next-number
@@ -1 +1 @@
-2324
+2326
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c

index 2c137cd74868e912914672958a08027a42d4b340..da447f774b99359f86d2869e393e65fa67d00cbd 100644 (file)
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -183,6 +183,7 @@ static apr_uint32_t lingering_count = 0;    /* Number of connections in lingerin
  static apr_uint32_t suspended_count = 0;    /* Number of suspended connections */
  static apr_uint32_t clogged_count = 0;      /* Number of threads processing ssl conns */
  static int resource_shortage = 0;
+static int had_healthy_child = 0;
  static fd_queue_t *worker_queue;
  static fd_queue_info_t *worker_queue_info;
  static int mpm_state = AP_MPMQ_STARTING;
@@ -2403,6 +2404,7 @@ static void perform_idle_server_maintenance(void)
          int any_dying_threads = 0;
          int any_dead_threads = 0;
          int all_dead_threads = 1;
+        int child_threads_active = 0;
  
          if (i >= retained->max_daemons_limit
              && totally_free_length == retained->idle_spawn_rate)
@@ -2438,6 +2440,7 @@ static void perform_idle_server_maintenance(void)
                  }
                  if (status >= SERVER_READY && status < SERVER_GRACEFUL) {
                      ++active_thread_count;
+                    ++child_threads_active;
                  }
              }
          }
@@ -2464,6 +2467,9 @@ static void perform_idle_server_maintenance(void)
              }
              ++free_length;
          }
+        else if (child_threads_active == threads_per_child) {
+            had_healthy_child = 1;
+        }
          /* XXX if (!ps->quiescing)     is probably more reliable  GLA */
          if (!any_dying_threads) {
              last_non_dead = i;
@@ -2472,21 +2478,23 @@ static void perform_idle_server_maintenance(void)
      }
  
      if (retained->sick_child_detected) {
-        if (active_thread_count > 0) {
-            /* some child processes appear to be working.  don't kill the
-             * whole server.
+        if (had_healthy_child) {
+            /* Assume this is a transient error, even though it may not be.  Leave
+             * the server up in case it is able to serve some requests or the
+             * problem will be resolved.
               */
              retained->sick_child_detected = 0;
          }
          else {
-            /* looks like a basket case.  give up.
+            /* looks like a basket case, as no child ever fully initialized; give up.
               */
              shutdown_pending = 1;
              child_fatal = 1;
              ap_log_error(APLOG_MARK, APLOG_ALERT, 0,
-                         ap_server_conf, APLOGNO(00483)
-                         "No active workers found..."
-                         " Apache is exiting!");
+                         ap_server_conf, APLOGNO(02324)
+                         "A resource shortage or other unrecoverable failure "
+                         "was encountered before any child process initialized "
+                         "successfully... httpd is exiting!");
              /* the child already logged the failure details */
              return;
          }
diff --git a/server/mpm/worker/worker.c b/server/mpm/worker/worker.c

index 5a279c9e0ae68facff59d4b91ca521570f2bc96d..147f807d17dfd2d1bb2b6a6523c8a6889255acbd 100644 (file)
--- a/server/mpm/worker/worker.c
+++ b/server/mpm/worker/worker.c
@@ -129,6 +129,7 @@ static int listener_may_exit = 0;
  static int requests_this_child;
  static int num_listensocks = 0;
  static int resource_shortage = 0;
+static int had_healthy_child = 0;
  static fd_queue_t *worker_queue;
  static fd_queue_info_t *worker_queue_info;
  static int mpm_state = AP_MPMQ_STARTING;
@@ -1473,6 +1474,7 @@ static void perform_idle_server_maintenance(void)
          int any_dying_threads = 0;
          int any_dead_threads = 0;
          int all_dead_threads = 1;
+        int child_threads_active = 0;
  
          if (i >= retained->max_daemons_limit && totally_free_length == retained->idle_spawn_rate)
              /* short cut if all active processes have been examined and
@@ -1507,6 +1509,7 @@ static void perform_idle_server_maintenance(void)
                  }
                  if (status >= SERVER_READY && status < SERVER_GRACEFUL) {
                      ++active_thread_count;
+                    ++child_threads_active;
                  }
              }
          }
@@ -1532,6 +1535,9 @@ static void perform_idle_server_maintenance(void)
              }
              ++free_length;
          }
+        else if (child_threads_active == threads_per_child) {
+            had_healthy_child = 1;
+        }
          /* XXX if (!ps->quiescing)     is probably more reliable  GLA */
          if (!any_dying_threads) {
              last_non_dead = i;
@@ -1540,21 +1546,23 @@ static void perform_idle_server_maintenance(void)
      }
  
      if (retained->sick_child_detected) {
-        if (active_thread_count > 0) {
-            /* some child processes appear to be working.  don't kill the
-             * whole server.
+        if (had_healthy_child) {
+            /* Assume this is a transient error, even though it may not be.  Leave
+             * the server up in case it is able to serve some requests or the
+             * problem will be resolved.
               */
              retained->sick_child_detected = 0;
          }
          else {
-            /* looks like a basket case.  give up.
+            /* looks like a basket case, as no child ever fully initialized; give up.
               */
              shutdown_pending = 1;
              child_fatal = 1;
              ap_log_error(APLOG_MARK, APLOG_ALERT, 0,
-                         ap_server_conf, APLOGNO(00285)
-                         "No active workers found..."
-                         " Apache is exiting!");
+                         ap_server_conf, APLOGNO(02325)
+                         "A resource shortage or other unrecoverable failure "
+                         "was encountered before any child process initialized "
+                         "successfully... httpd is exiting!");
              /* the child already logged the failure details */
              return;
          }
author	Jeff Trawick <trawick@apache.org>
	Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
committer	Jeff Trawick <trawick@apache.org>
	Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
CHANGES		patch | blob | blame | history
docs/log-message-tags/next-number		patch | blob | blame | history
server/mpm/event/event.c		patch | blob | blame | history
server/mpm/worker/worker.c		patch | blob | blame | history