git.ipfire.org Git - thirdparty/apache/httpd.git/commitdiff
mpm_event, mpm_worker: Remain active amidst prevalent child process
author Jeff Trawick <trawick@apache.org>
Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
committer Jeff Trawick <trawick@apache.org>
Thu, 19 Jul 2012 21:31:52 +0000 (21:31 +0000)
resource shortages.

This is a somewhat different direction than r168182 ("transient thread
creation errors shouldn't take down the whole server").

r168182: If APEXIT_CHILDSICK is received and there aren't any
         active children at the time, exit.

Now:     If APEXIT_CHILDSICK is received and we never successfully
         initialized a child, exit.

The issue seen with the r168182 handling is that it is rather easy
to be left with no active child processes (which causes the server
to exit completely) during a resource shortage that lasts for some
measurable period of time, as contrasted with a resource shortage
that results in only a handful of allocation failures.

Now the server will remain active, though as long as the resource
shortage exists, children may continually fail and the parent will
try once per second to create a replacement.  The existing logic
to reduce the spawn rate after such errors will prevent the
parent from trying to create children more rapidly.

git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1363557 13f79535-47bb-0310-9956-ffa450edef68

CHANGES
docs/log-message-tags/next-number
server/mpm/event/event.c
server/mpm/worker/worker.c

diff --git a/CHANGES b/CHANGES
index 38f7db0921e9724bf232c6f4bbd408c98f64d0c3..661ecbc3ce8bcd690340cff7c5b7ceb99c0cf9f3 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,9 @@
                                                          -*- coding: utf-8 -*-
 Changes with Apache 2.5.0
 
+  *) mpm_event, mpm_worker: Remain active amidst prevalent child process
+     resource shortages.  [Jeff Trawick]
+
   *) mpm_event, mpm_worker: Fix cases where the spawn rate wasn't reduced
      after child process resource shortages.  [Jeff Trawick]
 
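diff --git a/docs/log-message-tags/next-number b/docs/log-message-tags/next-number
index e2eb34935940782d6b4bac5fd7cfe7bcf42dd102..da898df886b32db114de8bf649e4c0fe86f08782 100644 (file)
--- a/docs/log-message-tags/next-number
+++ b/docs/log-message-tags/next-number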
@@ -1 +1 @@
-2324
+2326
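diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index 2c137cd74868e912914672958a08027a42d4b340..da447f774b99359f86d2869e393e65fa67d00cbd 100644 (file)
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c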
@@ -183,6 +183,7 @@ static apr_uint32_t lingering_count = 0;    /* Number of connections in lingerin
 static apr_uint32_t suspended_count = 0;    /* Number of suspended connections */
 static apr_uint32_t clogged_count = 0;      /* Number of threads processing ssl conns */
 static int resource_shortage = 0;
+static int had_healthy_child = 0;
 static fd_queue_t *worker_queue;
 static fd_queue_info_t *worker_queue_info;
 static int mpm_state = AP_MPMQ_STARTING;
@@ -2403,6 +2404,7 @@ static void perform_idle_server_maintenance(void)
         int any_dying_threads = 0;
         int any_dead_threads = 0;
         int all_dead_threads = 1;
+        int child_threads_active = 0;
 
         if (i >= retained->max_daemons_limit
             && totally_free_length == retained->idle_spawn_rate)
@@ -2438,6 +2440,7 @@ static void perform_idle_server_maintenance(void)
                 }
                 if (status >= SERVER_READY && status < SERVER_GRACEFUL) {
                     ++active_thread_count;
+                    ++child_threads_active;
                 }
             }
         }
@@ -2464,6 +2467,9 @@ static void perform_idle_server_maintenance(void)
             }
             ++free_length;
         }
+        else if (child_threads_active == threads_per_child) {
+            had_healthy_child = 1;
+        }
         /* XXX if (!ps->quiescing)     is probably more reliable  GLA */
         if (!any_dying_threads) {
             last_non_dead = i;
@@ -2472,21 +2478,23 @@ static void perform_idle_server_maintenance(void)
     }
 
     if (retained->sick_child_detected) {
-        if (active_thread_count > 0) {
-            /* some child processes appear to be working.  don't kill the
-             * whole server.
+        if (had_healthy_child) {
+            /* Assume this is a transient error, even though it may not be.  Leave
+             * the server up in case it is able to serve some requests or the
+             * problem will be resolved.
              */
             retained->sick_child_detected = 0;
         }
         else {
-            /* looks like a basket case give up.
+            /* looks like a basket case, as no child ever fully initialized; give up.
              */
             shutdown_pending = 1;
             child_fatal = 1;
             ap_log_error(APLOG_MARK, APLOG_ALERT, 0,
-                         ap_server_conf, APLOGNO(00483)
-                         "No active workers found..."
-                         " Apache is exiting!");
+                         ap_server_conf, APLOGNO(02324)
+                         "A resource shortage or other unrecoverable failure "
+                         "was encountered before any child process initialized "
+                         "successfully... httpd is exiting!");
             /* the child already logged the failure details */
             return;
         }
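diff --git a/server/mpm/worker/worker.c b/server/mpm/worker/worker.c
index 5a279c9e0ae68facff59d4b91ca521570f2bc96d..147f807d17dfd2d1bb2b6a6523c8a6889255acbd 100644 (file)
--- a/server/mpm/worker/worker.c
+++ b/server/mpm/worker/worker.c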
@@ -129,6 +129,7 @@ static int listener_may_exit = 0;
 static int requests_this_child;
 static int num_listensocks = 0;
 static int resource_shortage = 0;
+static int had_healthy_child = 0;
 static fd_queue_t *worker_queue;
 static fd_queue_info_t *worker_queue_info;
 static int mpm_state = AP_MPMQ_STARTING;
@@ -1473,6 +1474,7 @@ static void perform_idle_server_maintenance(void)
         int any_dying_threads = 0;
         int any_dead_threads = 0;
         int all_dead_threads = 1;
+        int child_threads_active = 0;
 
         if (i >= retained->max_daemons_limit && totally_free_length == retained->idle_spawn_rate)
             /* short cut if all active processes have been examined and
@@ -1507,6 +1509,7 @@ static void perform_idle_server_maintenance(void)
                 }
                 if (status >= SERVER_READY && status < SERVER_GRACEFUL) {
                     ++active_thread_count;
+                    ++child_threads_active;
                 }
             }
         }
@@ -1532,6 +1535,9 @@ static void perform_idle_server_maintenance(void)
             }
             ++free_length;
         }
+        else if (child_threads_active == threads_per_child) {
+            had_healthy_child = 1;
+        }
         /* XXX if (!ps->quiescing)     is probably more reliable  GLA */
         if (!any_dying_threads) {
             last_non_dead = i;
@@ -1540,21 +1546,23 @@ static void perform_idle_server_maintenance(void)
     }
 
     if (retained->sick_child_detected) {
-        if (active_thread_count > 0) {
-            /* some child processes appear to be working.  don't kill the
-             * whole server.
+        if (had_healthy_child) {
+            /* Assume this is a transient error, even though it may not be.  Leave
+             * the server up in case it is able to serve some requests or the
+             * problem will be resolved.
              */
             retained->sick_child_detected = 0;
         }
         else {
-            /* looks like a basket case give up.
+            /* looks like a basket case, as no child ever fully initialized; give up.
              */
             shutdown_pending = 1;
             child_fatal = 1;
             ap_log_error(APLOG_MARK, APLOG_ALERT, 0,
-                         ap_server_conf, APLOGNO(00285)
-                         "No active workers found..."
-                         " Apache is exiting!");
+                         ap_server_conf, APLOGNO(02325)
+                         "A resource shortage or other unrecoverable failure "
+                         "was encountered before any child process initialized "
+                         "successfully... httpd is exiting!");
             /* the child already logged the failure details */
             return;
         }