]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: Fail to start/stop/reload unit if frozen 31039/head
authorAdrian Vovk <adrianvovk@gmail.com>
Wed, 24 Jan 2024 00:50:21 +0000 (19:50 -0500)
committerAdrian Vovk <adrianvovk@gmail.com>
Tue, 30 Jan 2024 16:18:16 +0000 (11:18 -0500)
Previously, unit_{start,stop,reload} would call the low-level cgroup
unfreeze function whenever a unit was started, stopped, or reloaded. It
did so with no error checking. This call would ultimately recurse up the
cgroup tree, and unfreeze all the parent cgroups of the unit, unless an
error occurred (in which case I have no idea what would happen...)

After the freeze/thaw rework in a previous commit, this can no longer
work. If we recursively thaw the parent cgroups of the unit, there may
be sibling units marked as PARENT_FROZEN which will no longer actually
have frozen parents. Fixing this is a lot more complicated than simply
disallowing start/stop/reload on a frozen unit

Fixes https://github.com/systemd/systemd/issues/15849

src/core/job.c
src/core/job.h
src/core/unit.c
src/shared/bus-wait-for-jobs.c
test/units/testsuite-38.sh

index e78c2a70db6015c15fc0f4823cfab4c96fa40464..595fbce93ae024fb8dd52cfa20785515b19b30d8 100644 (file)
@@ -633,16 +633,19 @@ static const char* job_done_message_format(Unit *u, JobType t, JobResult result)
                 [JOB_UNSUPPORTED] = "Starting of %s unsupported.",
                 [JOB_COLLECTED]   = "Unnecessary job was removed for %s.",
                 [JOB_ONCE]        = "Unit %s has been started before and cannot be started again.",
+                [JOB_FROZEN]      = "Cannot start frozen unit %s.",
         };
         static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = {
                 [JOB_DONE]        = "Stopped %s.",
                 [JOB_FAILED]      = "Stopped %s with error.",
                 [JOB_TIMEOUT]     = "Timed out stopping %s.",
+                [JOB_FROZEN]      = "Cannot stop frozen unit %s.",
         };
         static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = {
                 [JOB_DONE]        = "Reloaded %s.",
                 [JOB_FAILED]      = "Reload failed for %s.",
                 [JOB_TIMEOUT]     = "Timed out reloading %s.",
+                [JOB_FROZEN]      = "Cannot reload frozen unit %s.",
         };
         /* When verify-active detects the unit is inactive, report it.
          * Most likely a DEPEND warning from a requisiting unit will
@@ -704,6 +707,7 @@ static const struct {
         [JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" },
         [JOB_COLLECTED]   = { LOG_INFO,                                    },
         [JOB_ONCE]        = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    " ONCE " },
+        [JOB_FROZEN]      = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    "FROZEN" },
 };
 
 static const char* job_done_mid(JobType type, JobResult result) {
@@ -954,6 +958,8 @@ int job_run_and_invalidate(Job *j) {
                         r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
                 else if (r == -ESTALE)
                         r = job_finish_and_invalidate(j, JOB_ONCE, true, false);
+                else if (r == -EDEADLK)
+                        r = job_finish_and_invalidate(j, JOB_FROZEN, true, false);
                 else if (r < 0)
                         r = job_finish_and_invalidate(j, JOB_FAILED, true, false);
         }
@@ -1011,7 +1017,7 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr
                 goto finish;
         }
 
-        if (IN_SET(result, JOB_FAILED, JOB_INVALID))
+        if (IN_SET(result, JOB_FAILED, JOB_INVALID, JOB_FROZEN))
                 j->manager->n_failed_jobs++;
 
         job_uninstall(j);
@@ -1645,6 +1651,7 @@ static const char* const job_result_table[_JOB_RESULT_MAX] = {
         [JOB_UNSUPPORTED] = "unsupported",
         [JOB_COLLECTED]   = "collected",
         [JOB_ONCE]        = "once",
+        [JOB_FROZEN]      = "frozen",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult);
index 891d87a79cd558426b7eabf1d74754579b9e0372..8318b524b4401d7f01cfa594f9d6cf9de43b585d 100644 (file)
@@ -96,6 +96,7 @@ enum JobResult {
         JOB_UNSUPPORTED,         /* Couldn't start a unit, because the unit type is not supported on the system */
         JOB_COLLECTED,           /* Job was garbage collected, since nothing needed it anymore */
         JOB_ONCE,                /* Unit was started before, and hence can't be started again */
+        JOB_FROZEN,              /* Unit is currently frozen, so we can't safely operate on it */
         _JOB_RESULT_MAX,
         _JOB_RESULT_INVALID = -EINVAL,
 };
index b4634e0a86a854a9c48ada146b4bce609c23d7f6..6f61a2285e3f8b302eeb518e9c254c1e38715325 100644 (file)
@@ -1956,6 +1956,10 @@ int unit_start(Unit *u, ActivationDetails *details) {
                 return unit_start(following, details);
         }
 
+        /* Check to make sure the unit isn't frozen */
+        if (u->freezer_state != FREEZER_RUNNING)
+                return -EDEADLK;
+
         /* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */
         if (UNIT_VTABLE(u)->can_start) {
                 r = UNIT_VTABLE(u)->can_start(u);
@@ -1972,7 +1976,6 @@ int unit_start(Unit *u, ActivationDetails *details) {
          * waits for a holdoff timer to elapse before it will start again. */
 
         unit_add_to_dbus_queue(u);
-        unit_cgroup_freezer_action(u, FREEZER_THAW);
 
         if (!u->activation_details) /* Older details object wins */
                 u->activation_details = activation_details_ref(details);
@@ -2007,6 +2010,7 @@ bool unit_can_isolate(Unit *u) {
  *         -EBADR:    This unit type does not support stopping.
  *         -EALREADY: Unit is already stopped.
  *         -EAGAIN:   An operation is already in progress. Retry later.
+ *         -EDEADLK:  Unit is frozen
  */
 int unit_stop(Unit *u) {
         UnitActiveState state;
@@ -2024,11 +2028,14 @@ int unit_stop(Unit *u) {
                 return unit_stop(following);
         }
 
+        /* Check to make sure the unit isn't frozen */
+        if (u->freezer_state != FREEZER_RUNNING)
+                return -EDEADLK;
+
         if (!UNIT_VTABLE(u)->stop)
                 return -EBADR;
 
         unit_add_to_dbus_queue(u);
-        unit_cgroup_freezer_action(u, FREEZER_THAW);
 
         return UNIT_VTABLE(u)->stop(u);
 }
@@ -2053,6 +2060,7 @@ bool unit_can_stop(Unit *u) {
  *         -EBADR:    This unit type does not support reloading.
  *         -ENOEXEC:  Unit is not started.
  *         -EAGAIN:   An operation is already in progress. Retry later.
+ *         -EDEADLK:  Unit is frozen.
  */
 int unit_reload(Unit *u) {
         UnitActiveState state;
@@ -2079,6 +2087,10 @@ int unit_reload(Unit *u) {
                 return unit_reload(following);
         }
 
+        /* Check to make sure the unit isn't frozen */
+        if (u->freezer_state != FREEZER_RUNNING)
+                return -EDEADLK;
+
         unit_add_to_dbus_queue(u);
 
         if (!UNIT_VTABLE(u)->reload) {
@@ -2087,8 +2099,6 @@ int unit_reload(Unit *u) {
                 return 0;
         }
 
-        unit_cgroup_freezer_action(u, FREEZER_THAW);
-
         return UNIT_VTABLE(u)->reload(u);
 }
 
index 70f2043347f1f6fb7295b2075f60f68c271792f4..2930552be486bb84afa84a3f53a417a6ae921a30 100644 (file)
@@ -248,6 +248,8 @@ static int check_wait_response(BusWaitForJobs *d, WaitJobsFlags flags, const cha
                         log_error("Queued job for %s was garbage collected.", d->name);
                 else if (streq(d->result, "once"))
                         log_error("Unit %s was started already once and can't be started again.", d->name);
+                else if (streq(d->result, "frozen"))
+                        log_error("Cannot perform operation on frozen unit %s.", d->name);
                 else if (endswith(d->name, ".service")) {
                         /* Job result is unknown. For services, let's also try Result property. */
                         _cleanup_free_ char *result = NULL;
@@ -276,6 +278,8 @@ static int check_wait_response(BusWaitForJobs *d, WaitJobsFlags flags, const cha
                 return -EOPNOTSUPP;
         else if (streq(d->result, "once"))
                 return -ESTALE;
+        else if (streq(d->result, "frozen"))
+                return -EDEADLK;
 
         return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
                                "Unexpected job result '%s' for unit '%s', assuming server side newer than us.",
index 0737b9c199b1399d0c731adbcd98229a8a93d750..e557fa1dbf960f4f3135d3c01f33cef2619832cc 100755 (executable)
@@ -149,31 +149,6 @@ testcase_dbus_api() {
     echo
 }
 
-testcase_jobs() {
-    local pid_before=
-    local pid_after=
-    echo "Test that it is possible to apply jobs on frozen units:"
-
-    systemctl start "${unit}"
-    dbus_freeze "${unit}"
-    check_freezer_state "${unit}" "frozen"
-
-    echo -n "  - restart: "
-    pid_before=$(systemctl show -p MainPID "${unit}" --value)
-    systemctl restart "${unit}"
-    pid_after=$(systemctl show -p MainPID "${unit}" --value)
-    [ "$pid_before" != "$pid_after" ] && echo "[ OK ]"
-
-    dbus_freeze "${unit}"
-    check_freezer_state "${unit}" "frozen"
-
-    echo -n "  - stop: "
-    timeout 5s systemctl stop "${unit}"
-    echo "[ OK ]"
-
-    echo
-}
-
 testcase_systemctl() {
     echo "Test that systemctl freeze/thaw verbs:"
 
@@ -321,6 +296,11 @@ testcase_recursive() {
     check_cgroup_state "$slice/$unit" 1
     echo "[ OK ]"
 
+    echo -n "  - can't stop a frozen unit: "
+    (! systemctl -q stop "$unit" )
+    echo "[ OK ]"
+    systemctl thaw "$unit"
+
     systemctl stop "$unit"
     systemctl stop "$slice"
 
@@ -368,6 +348,7 @@ testcase_preserve_state() {
     check_freezer_state "$unit" "frozen"
     echo "[ OK ]"
 
+    systemctl thaw "$unit"
     systemctl stop "$unit"
     systemctl stop "$slice"