master: Replace per-process idle_kill timeout with per-service

author Timo Sirainen <timo.sirainen@open-xchange.com>

Thu, 23 Mar 2023 14:07:26 +0000 (16:07 +0200)

committer Timo Sirainen <timo.sirainen@open-xchange.com>

Thu, 11 May 2023 09:15:05 +0000 (12:15 +0300)
author Timo Sirainen <timo.sirainen@open-xchange.com>
Thu, 23 Mar 2023 14:07:26 +0000 (16:07 +0200)
committer Timo Sirainen <timo.sirainen@open-xchange.com>
Thu, 11 May 2023 09:15:05 +0000 (12:15 +0300)
diff --git a/src/master/service-monitor.c b/src/master/service-monitor.c

index b035d742b6abd2390074425a081dfb1923f8f9e2..ddfea6085119d4afeae69fa022e39246f7b0c3aa 100644 (file)
--- a/src/master/service-monitor.c
+++ b/src/master/service-monitor.c
@@ -34,30 +34,67 @@ static void service_status_more(struct service_process *process,
                                 const struct master_status *status);
  static void service_monitor_listen_start_force(struct service *service);
  
-static void service_process_kill_idle(struct service_process *process)
+static void service_process_idle_kill_timeout(struct service_process *process)
  {
-       struct service *service = process->service;
         struct master_status status;
  
-       i_assert(process->available_count == service->client_limit);
+       service_error(process->service, "Process %s is ignoring idle SIGINT",
+                     dec2str(process->pid));
+
+       /* assume this process is busy */
+       i_zero(&status);
+       service_status_more(process, &status);
+       process->available_count = 0;
+}
  
+static void service_kill_idle(struct service *service)
+{
+       /* Kill processes which have idled over idle_kill seconds.
+
+          (It's actually not important which processes get killed. A better
+          way could be to kill the oldest processes since they might have to
+          be restarted anyway soon due to reaching service_count, but we'd
+          have to use priority queue for tracking that, which is more
+          expensive and probably not worth it.) */
+       if (service->idle_processes_head == NULL) {
+               /* no idling processes to kill */
+               timeout_remove(&service->to_idle);
+               return;
+       }
         if (service->process_avail <= service->set->process_min_avail) {
                 /* we don't have any extra idling processes anymore. */
-               timeout_remove(&process->to_idle);
-       } else if (process->last_kill_sent > process->last_status_update+1) {
-               service_error(service, "Process %s is ignoring idle SIGINT",
-                             dec2str(process->pid));
-
-               /* assume this process is busy */
-               i_zero(&status);
-               service_status_more(process, &status);
-               process->available_count = 0;
-       } else {
+               timeout_remove(&service->to_idle);
+               return;
+       }
+       time_t newest_time_to_kill = ioloop_time - service->idle_kill;
+       unsigned int max_processes_to_kill =
+               service->process_avail - service->set->process_min_avail;
+       for (; max_processes_to_kill > 0; max_processes_to_kill--) {
+               struct service_process *process = service->idle_processes_head;
+
+               if (process->to_idle_kill != NULL) {
+                       /* already tried to kill all the idle processes */
+                       break;
+               }
+               if (process->idle_start > newest_time_to_kill)
+                       break;
+
+               i_assert(process->available_count == service->client_limit);
                 if (kill(process->pid, SIGINT) < 0 && errno != ESRCH) {
                         service_error(service, "kill(%s, SIGINT) failed: %m",
                                       dec2str(process->pid));
                 }
                 process->last_kill_sent = ioloop_time;
+               process->to_idle_kill =
+                       timeout_add(service->idle_kill * 1000,
+                                   service_process_idle_kill_timeout, process);
+
+               /* Move it to the end of the list, so it's not tried to be
+                  killed again. */
+               DLLIST2_REMOVE(&service->idle_processes_head,
+                              &service->idle_processes_tail, process);
+               DLLIST2_APPEND(&service->idle_processes_head,
+                              &service->idle_processes_tail, process);
         }
  }
  
@@ -79,8 +116,6 @@ static void service_status_more(struct service_process *process,
         process->total_count +=
                 process->available_count - status->available_count;
  
-       timeout_remove(&process->to_idle);
-
         if (status->available_count != 0)
                 return;
  
@@ -120,16 +155,13 @@ static void service_check_idle(struct service_process *process)
         process->idle_start = ioloop_time;
  
         if (service->process_avail > service->set->process_min_avail &&
-           process->to_idle == NULL &&
+           service->to_idle == NULL &&
             service->idle_kill != UINT_MAX) {
-               /* we have more processes than we really need.
-                  add a bit of randomness so that we don't send the
-                  signal to all of them at once */
-               process->to_idle =
-                       timeout_add((service->idle_kill * 1000) +
-                                   i_rand_limit(100) * 10,
-                                   service_process_kill_idle,
-                                   process);
+               /* We have more processes than we really need. Start a timer
+                  to trigger idle_kill. */
+               service->to_idle =
+                       timeout_add(service->idle_kill * 1000,
+                                   service_kill_idle, service);
         }
  }
  
@@ -177,6 +209,8 @@ service_status_input_one(struct service *service,
                 return;
         }
         process->last_status_update = ioloop_time;
+       /* process worked a bit - it may have ignored idle-kill signal */
+       timeout_remove(&process->to_idle_kill);
  
         /* first status notification */
         timeout_remove(&process->to_status);
@@ -600,6 +634,7 @@ void service_monitor_stop(struct service *service)
  
         timeout_remove(&service->to_throttle);
         timeout_remove(&service->to_prefork);
+       timeout_remove(&service->to_idle);
  }
  
  void service_monitor_stop_close(struct service *service)
diff --git a/src/master/service-process.c b/src/master/service-process.c

index 296cc3e3e95422c4e8a20583ec5dc84e2830e668..0de515572b339f1a38038447493a81e427a237bb 100644 (file)
--- a/src/master/service-process.c
+++ b/src/master/service-process.c
@@ -410,7 +410,7 @@ void service_process_destroy(struct service_process *process)
         i_assert(service->process_avail <= service->process_count);
  
         timeout_remove(&process->to_status);
-       timeout_remove(&process->to_idle);
+       timeout_remove(&process->to_idle_kill);
         if (service->list->log_byes != NULL)
                 service_process_notify_add(service->list->log_byes, process);
  
diff --git a/src/master/service-process.h b/src/master/service-process.h

index 797f63db4619c859c9fccf540a29e3f47ec22f10..f3e47bb245947f48f1d3a96e99f68aa958a0d08d 100644 (file)
--- a/src/master/service-process.h
+++ b/src/master/service-process.h
@@ -26,8 +26,9 @@ struct service_process {
            This field also determines whether the process is in the service's
            "busy" or "idle" processes linked list. */
         time_t idle_start;
-       /* kill process if it hits idle timeout */
-       struct timeout *to_idle;
+       /* Timeout for processing idle-kill SIGINT. Either the process will die
+          or it sends a notification about not being idling anymore */
+       struct timeout *to_idle_kill;
  
         /* time when we last received a status update */
         time_t last_status_update;
diff --git a/src/master/service.h b/src/master/service.h

index 8234d81af447e05d73f55d3012140b916a7e92ba..8287665d48f42de66b91df9fb7bab3824734fd42 100644 (file)
--- a/src/master/service.h
+++ b/src/master/service.h
@@ -113,6 +113,8 @@ struct service {
         struct timeout *to_drop;
         /* delayed process_limit reached warning with SERVICE_TYPE_WORKER */
         struct timeout *to_drop_warning;
+       /* next time to try to kill idling processes */
+       struct timeout *to_idle;
  
         /* prefork processes up to process_min_avail if there's time */
         struct timeout *to_prefork;
author	Timo Sirainen <timo.sirainen@open-xchange.com>
	Thu, 23 Mar 2023 14:07:26 +0000 (16:07 +0200)
committer	Timo Sirainen <timo.sirainen@open-xchange.com>
	Thu, 11 May 2023 09:15:05 +0000 (12:15 +0300)
src/master/service-monitor.c		patch \| blob \| blame \| history
src/master/service-process.c		patch \| blob \| blame \| history
src/master/service-process.h		patch \| blob \| blame \| history
src/master/service.h		patch \| blob \| blame \| history