core: rework how we track service and scope PIDs
author    Lennart Poettering <lennart@poettering.net>
          Thu, 31 May 2018 13:41:59 +0000 (15:41 +0200)
committer Lennart Poettering <lennart@poettering.net>
          Tue, 5 Jun 2018 20:06:48 +0000 (22:06 +0200)
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change, instead of scanning for
new/removed processes right away, we do this work in a per-unit deferred
event loop job. This event source is scheduled at a very low priority, so
that it is executed when we have time but does not starve other event
sources. This has two benefits: the expensive work is coalesced if events
happen in quick succession, and SIGCHLD handling is not delayed for long.
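
As an illustration of the pattern (a minimal sketch against the public
sd-event API, not code from this commit; on_rescan() and enqueue_rescan()
are invented names): a deferred source at idle priority, armed as ONESHOT,
is dispatched once per burst of requests, no matter how often it is
re-armed in between.

/* Build with: cc demo.c $(pkg-config --cflags --libs libsystemd) */
#include <stdint.h>
#include <stdio.h>
#include <systemd/sd-event.h>

static int on_rescan(sd_event_source *s, void *userdata) {
        /* The expensive work (e.g. rescanning a cgroup for PIDs) runs here,
         * once per burst, however often it was requested in the meantime. */
        puts("rescanning");
        return 0;
}

static int enqueue_rescan(sd_event *e, sd_event_source **s) {
        int r;

        if (!*s) {
                r = sd_event_add_defer(e, s, on_rescan, NULL);
                if (r < 0)
                        return r;

                /* Idle priority: dispatched only when nothing more urgent is
                 * pending, so SIGCHLD and other sources are not starved. */
                r = sd_event_source_set_priority(*s, SD_EVENT_PRIORITY_IDLE);
                if (r < 0)
                        return r;
        }

        /* ONESHOT: fire once, then disable itself. Re-arming an already
         * armed source is a no-op, which is what coalesces the requests. */
        return sd_event_source_set_enabled(*s, SD_EVENT_ONESHOT);
}

int main(void) {
        sd_event *e = NULL;
        sd_event_source *s = NULL;

        if (sd_event_default(&e) < 0)
                return 1;

        /* Three requests in quick succession... */
        (void) enqueue_rescan(e, &s);
        (void) enqueue_rescan(e, &s);
        (void) enqueue_rescan(e, &s);

        /* ...result in a single dispatch of on_rescan(). */
        (void) sd_event_run(e, UINT64_MAX);

        sd_event_source_unref(s);
        sd_event_unref(e);
        return 0;
}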

This patch basically replaces all direct invocations of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call, which just enqueues a request to
watch/tidy up the PID sets (with one exception: in scope_enter_signal()
and service_enter_signal() we'll still do unit_watch_all_pids()
synchronously first, since we really want to know all processes we are
about to kill so that we can track them properly).

Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed wherever
unit_enqueue_rewatch_pids() is now invoked, as the queued job runs those
operations too.

All of this is done on cgroupv1 systems only; it is disabled on cgroupv2
systems, as cgroup-empty notifications are reliable there and we do not
need SIGCHLD events to track processes.
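
(For context: cg_unified_controller(), used in the unit.c hunk below, is a
systemd-internal helper. Outside the codebase, a common approximation of
the same v1-vs-v2 check is a statfs() on /sys/fs/cgroup, as in this hedged
sketch, which is not code from the commit:)

#include <errno.h>
#include <linux/magic.h>   /* CGROUP2_SUPER_MAGIC */
#include <sys/statfs.h>

static int cgroup_is_unified(void) {
        struct statfs fs;

        if (statfs("/sys/fs/cgroup", &fs) < 0)
                return -errno;

        /* cgroup2 mounted directly on /sys/fs/cgroup means the unified
         * hierarchy is in use, where cgroup-empty notifications are
         * reliable; on a v1 (or hybrid) setup this check returns 0. */
        return fs.f_type == CGROUP2_SUPER_MAGIC;
}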

Fixes: #9138
src/core/scope.c
src/core/service.c
src/core/unit.c
src/core/unit.h

src/core/scope.c
index 7410f154e6ccd959efeb42a9ca1fb56ae17ecb62..0d22c7525b309d72f8e608d7f52f33ce51dfdcc0 100644
@@ -92,8 +92,10 @@ static void scope_set_state(Scope *s, ScopeState state) {
         if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
                 s->timer_event_source = sd_event_source_unref(s->timer_event_source);
 
-        if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED))
+        if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
                 unit_unwatch_all_pids(UNIT(s));
+                unit_dequeue_rewatch_pids(UNIT(s));
+        }
 
         if (state != old_state)
                 log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
@@ -212,7 +214,7 @@ static int scope_coldplug(Unit *u) {
         }
 
         if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED))
-                unit_watch_all_pids(UNIT(s));
+                (void) unit_enqueue_rewatch_pids(u);
 
         bus_scope_track_controller(s);
 
@@ -257,7 +259,12 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
         if (s->result == SCOPE_SUCCESS)
                 s->result = f;
 
-        unit_watch_all_pids(UNIT(s));
+        /* Before sending any signal, make sure we track all members of this cgroup */
+        (void) unit_watch_all_pids(UNIT(s));
+
+        /* Also, enqueue a job so that we recheck all our PIDs a bit later, given that it's likely some processes will
+         * have died by then */
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
 
         /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going
          * directly into SIGTERM berserk mode */
@@ -455,17 +462,13 @@ static void scope_notify_cgroup_empty_event(Unit *u) {
 }
 
 static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
-
         assert(u);
 
         /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
          * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
          * process we watched was probably the parent of them, and they are hence now our children. */
-        unit_tidy_watch_pids(u, 0, 0);
-        unit_watch_all_pids(u);
 
-        /* If the PID set is empty now, then let's finish this off. */
-        unit_synthesize_cgroup_empty_event(u);
+        (void) unit_enqueue_rewatch_pids(u);
 }
 
 static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
@@ -515,13 +518,9 @@ int scope_abandon(Scope *s) {
 
         scope_set_state(s, SCOPE_ABANDONED);
 
-        /* The client is no longer watching the remaining processes,
-         * so let's step in here, under the assumption that the
-         * remaining processes will be sooner or later reassigned to
-         * us as parent. */
-
-        unit_tidy_watch_pids(UNIT(s), 0, 0);
-        unit_watch_all_pids(UNIT(s));
+        /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that
+         * the remaining processes will be sooner or later reassigned to us as parent. */
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
 
         return 0;
 }
src/core/service.c
index bd62fbce3a0e2a14c1cc2ce5d0cd1830326ddf0e..db0c1dc397a70fa3ff6040e8d593d1af164d7e2d 100644
@@ -1058,8 +1058,10 @@ static void service_set_state(Service *s, ServiceState state) {
                 s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
         }
 
-        if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART))
+        if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) {
                 unit_unwatch_all_pids(UNIT(s));
+                unit_dequeue_rewatch_pids(UNIT(s));
+        }
 
         if (!IN_SET(state,
                     SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
@@ -1154,17 +1156,15 @@ static int service_coldplug(Unit *u) {
                         return r;
         }
 
-        if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART))
-                unit_watch_all_pids(UNIT(s));
-
-        if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
-                service_start_watchdog(s);
-
         if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) {
+                (void) unit_enqueue_rewatch_pids(u);
                 (void) unit_setup_dynamic_creds(u);
                 (void) unit_setup_exec_runtime(u);
         }
 
+        if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+                service_start_watchdog(s);
+
         if (UNIT_ISSET(s->accept_socket)) {
                 Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket));
 
@@ -1679,7 +1679,8 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
                 s->result = f;
 
         service_unwatch_control_pid(s);
-        unit_watch_all_pids(UNIT(s));
+
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
 
         s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
         if (s->control_command) {
@@ -1731,7 +1732,12 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
         if (s->result == SERVICE_SUCCESS)
                 s->result = f;
 
-        unit_watch_all_pids(UNIT(s));
+        /* Before sending any signal, make sure we track all members of this cgroup */
+        (void) unit_watch_all_pids(UNIT(s));
+
+        /* Also, enqueue a job so that we recheck all our PIDs a bit later, given that it's likely some processes will
+         * have died by then */
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
 
         r = unit_kill_context(
                         UNIT(s),
@@ -1772,7 +1778,7 @@ fail:
 static void service_enter_stop_by_notify(Service *s) {
         assert(s);
 
-        unit_watch_all_pids(UNIT(s));
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
 
         service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec));
 
@@ -1789,7 +1795,7 @@ static void service_enter_stop(Service *s, ServiceResult f) {
                 s->result = f;
 
         service_unwatch_control_pid(s);
-        unit_watch_all_pids(UNIT(s));
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
 
         s->control_command = s->exec_command[SERVICE_EXEC_STOP];
         if (s->control_command) {
@@ -3290,11 +3296,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
         /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to watch,
          * under the assumption that we'll sooner or later get a SIGCHLD for them, as the original process we watched
          * was probably the parent of them, and they are hence now our children. */
-        unit_tidy_watch_pids(u, s->main_pid, s->control_pid);
-        unit_watch_all_pids(u);
-
-        /* If the PID set is empty now, then let's check if the cgroup is empty too and finish off the unit. */
-        unit_synthesize_cgroup_empty_event(u);
+        (void) unit_enqueue_rewatch_pids(u);
 }
 
 static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
src/core/unit.c
index bfe63365dd737797832691b2bf044e6fcfdf5a75..7265f95c9532ea71527d5611028fb52b188e3deb 100644
@@ -567,8 +567,9 @@ void unit_free(Unit *u) {
 
         unit_done(u);
 
-        sd_bus_slot_unref(u->match_bus_slot);
+        unit_dequeue_rewatch_pids(u);
 
+        sd_bus_slot_unref(u->match_bus_slot);
         sd_bus_track_unref(u->bus_track);
         u->deserialized_refs = strv_free(u->deserialized_refs);
 
@@ -2605,7 +2606,8 @@ void unit_unwatch_all_pids(Unit *u) {
         u->pids = set_free(u->pids);
 }
 
-void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
+static void unit_tidy_watch_pids(Unit *u) {
+        pid_t except1, except2;
         Iterator i;
         void *e;
 
@@ -2613,6 +2615,9 @@ void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
 
         /* Cleans dead PIDs from our list */
 
+        except1 = unit_main_pid(u);
+        except2 = unit_control_pid(u);
+
         SET_FOREACH(e, u->pids, i) {
                 pid_t pid = PTR_TO_PID(e);
 
@@ -2624,6 +2629,76 @@ void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
         }
 }
 
+static int on_rewatch_pids_event(sd_event_source *s, void *userdata) {
+        Unit *u = userdata;
+
+        assert(s);
+        assert(u);
+
+        unit_tidy_watch_pids(u);
+        unit_watch_all_pids(u);
+
+        /* If the PID set is empty now, then let's finish this off. */
+        unit_synthesize_cgroup_empty_event(u);
+
+        return 0;
+}
+
+int unit_enqueue_rewatch_pids(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return r;
+        if (r > 0) /* On unified we can use proper notifications */
+                return 0;
+
+        /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new
+         * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it
+         * involves issuing kill(pid, 0) on all processes we watch. */
+
+        if (!u->rewatch_pids_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+                r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m");
+
+                r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: m");
+
+                (void) sd_event_source_set_description(s, "tidy-watch-pids");
+
+                u->rewatch_pids_event_source = TAKE_PTR(s);
+        }
+
+        r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m");
+
+        return 0;
+}
+
+void unit_dequeue_rewatch_pids(Unit *u) {
+        int r;
+        assert(u);
+
+        if (!u->rewatch_pids_event_source)
+                return;
+
+        r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF);
+        if (r < 0)
+                log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m");
+
+        u->rewatch_pids_event_source = sd_event_source_unref(u->rewatch_pids_event_source);
+}
+
 bool unit_job_is_applicable(Unit *u, JobType j) {
         assert(u);
         assert(j >= 0 && j < _JOB_TYPE_MAX);
src/core/unit.h
index f1b5d61af70c49127cc1664b43d3ee265afea9bb..76270a0a7c729023f496d3b9861a50bd9f9a1bf3 100644
@@ -279,6 +279,10 @@ typedef struct Unit {
 
         uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
 
+        /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new
+         * ones which might have appeared. */
+        sd_event_source *rewatch_pids_event_source;
+
         /* How to start OnFailure units */
         JobMode on_failure_job_mode;
 
@@ -650,7 +654,8 @@ int unit_watch_pid(Unit *u, pid_t pid);
 void unit_unwatch_pid(Unit *u, pid_t pid);
 void unit_unwatch_all_pids(Unit *u);
 
-void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
+int unit_enqueue_rewatch_pids(Unit *u);
+void unit_dequeue_rewatch_pids(Unit *u);
 
 int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name);
 int unit_watch_bus_name(Unit *u, const char *name);