]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
fix race between daemon-reload and other commands
authorDavid Tardon <dtardon@redhat.com>
Tue, 24 Apr 2018 13:19:38 +0000 (15:19 +0200)
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Sat, 19 May 2018 09:37:00 +0000 (11:37 +0200)
When "systemctl daemon-reload" is run at the same time as "systemctl
start foo", the latter might hang. That's because commands like start
wait for JobRemoved signal to know when the job is finished. But if the
job is finished during reloading, the signal is never sent.

The hang can be easily reproduced by running

    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl daemon-reload ; done
    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl start systemd-coredump.socket ; done

in two different terminals. The start command will hang after 1-2
iterations.

This keeps track of jobs that were started before reload and finished
during it and sends JobRemoved after the reload has finished.

src/core/job.c
src/core/job.h
src/core/manager.c
src/core/manager.h

index 32f6fc1a50f243583c218800631d31ebf3c38e3a..a0f7685c4e10df55700bdce2897f333e91929bae 100644 (file)
@@ -43,6 +43,7 @@ Job* job_new_raw(Unit *unit) {
         j->manager = unit->manager;
         j->unit = unit;
         j->type = _JOB_TYPE_INVALID;
+        j->reloaded = false;
 
         return j;
 }
@@ -64,7 +65,7 @@ Job* job_new(Unit *unit, JobType type) {
         return j;
 }
 
-void job_free(Job *j) {
+void job_unlink(Job *j) {
         assert(j);
         assert(!j->installed);
         assert(!j->transaction_prev);
@@ -72,16 +73,33 @@ void job_free(Job *j) {
         assert(!j->subject_list);
         assert(!j->object_list);
 
-        if (j->in_run_queue)
+        if (j->in_run_queue) {
                 LIST_REMOVE(run_queue, j->manager->run_queue, j);
+                j->in_run_queue = false;
+        }
 
-        if (j->in_dbus_queue)
+        if (j->in_dbus_queue) {
                 LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
+                j->in_dbus_queue = false;
+        }
 
-        if (j->in_gc_queue)
+        if (j->in_gc_queue) {
                 LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j);
+                j->in_gc_queue = false;
+        }
 
-        sd_event_source_unref(j->timer_event_source);
+        j->timer_event_source = sd_event_source_unref(j->timer_event_source);
+}
+
+void job_free(Job *j) {
+        assert(j);
+        assert(!j->installed);
+        assert(!j->transaction_prev);
+        assert(!j->transaction_next);
+        assert(!j->subject_list);
+        assert(!j->object_list);
+
+        job_unlink(j);
 
         sd_bus_track_unref(j->bus_track);
         strv_free(j->deserialized_clients);
@@ -241,6 +259,7 @@ int job_install_deserialized(Job *j) {
 
         *pj = j;
         j->installed = true;
+        j->reloaded = true;
 
         if (j->state == JOB_RUNNING)
                 j->unit->manager->n_running_jobs++;
@@ -844,6 +863,19 @@ static void job_fail_dependencies(Unit *u, UnitDependency d) {
         }
 }
 
+static int job_save_pending_finished_job(Job *j) {
+        int r;
+
+        assert(j);
+
+        r = set_ensure_allocated(&j->manager->pending_finished_jobs, NULL);
+        if (r < 0)
+                return r;
+
+        job_unlink(j);
+        return set_put(j->manager->pending_finished_jobs, j);
+}
+
 int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
         Unit *u;
         Unit *other;
@@ -883,7 +915,12 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr
                 j->manager->n_failed_jobs++;
 
         job_uninstall(j);
-        job_free(j);
+        /* Remember jobs started before the reload */
+        if (MANAGER_IS_RELOADING(j->manager) && j->reloaded) {
+                if (job_save_pending_finished_job(j) < 0)
+                        job_free(j);
+        } else
+                job_free(j);
 
         /* Fail depending jobs on failure */
         if (result != JOB_DONE && recursive) {
index ccb8e1b67415c7daeb944cd27c466a67f1f633a9..338670e39351f6a44690a6ca1d2ec56254f90680 100644 (file)
@@ -162,10 +162,12 @@ struct Job {
         bool irreversible:1;
         bool in_gc_queue:1;
         bool ref_by_private_bus:1;
+        bool reloaded:1;
 };
 
 Job* job_new(Unit *unit, JobType type);
 Job* job_new_raw(Unit *unit);
+void job_unlink(Job *job);
 void job_free(Job *job);
 Job* job_install(Job *j);
 int job_install_deserialized(Job *j);
index 204fc8b819d9c9a00a11d887f9da45db4f5209fe..b4f197c20498bb126a79694a476975ec24800a1c 100644 (file)
@@ -3186,6 +3186,17 @@ finish:
         return r;
 }
 
+static void manager_flush_finished_jobs(Manager *m) {
+        Job *j;
+
+        while ((j = set_steal_first(m->pending_finished_jobs))) {
+                bus_job_send_removed_signal(j);
+                job_free(j);
+        }
+
+        m->pending_finished_jobs = set_free(m->pending_finished_jobs);
+}
+
 int manager_reload(Manager *m) {
         int r, q;
         _cleanup_fclose_ FILE *f = NULL;
@@ -3294,6 +3305,9 @@ int manager_reload(Manager *m) {
         if (q < 0 && r >= 0)
                 r = q;
 
+        if (!MANAGER_IS_RELOADING(m))
+                manager_flush_finished_jobs(m);
+
         m->send_reloading_done = true;
 
         return r;
index f41cce1c09e99e6583f388431da61962a03f0cf9..1f97c15365c83e23edfa9dd5063d6cfb2d93cb52 100644 (file)
@@ -300,6 +300,9 @@ struct Manager {
 
         /* non-zero if we are reloading or reexecuting, */
         int n_reloading;
+        /* A set which contains all jobs that started before reload and finished
+         * during it */
+        Set *pending_finished_jobs;
 
         unsigned n_installed_jobs;
         unsigned n_failed_jobs;