git.ipfire.org Git - thirdparty/systemd.git/commitdiff
pid1: add SurviveFinalKillSignal= to skip units on final sigterm/sigkill spree
author Luca Boccassi <bluca@debian.org>
Thu, 27 Jul 2023 22:14:31 +0000 (23:14 +0100)
committer Luca Boccassi <bluca@debian.org>
Thu, 28 Sep 2023 12:48:14 +0000 (13:48 +0100)
Add a new boolean for units, SurviveFinalKillSignal=yes/no. Units that
set it will not have their processes receive the final SIGTERM/SIGKILL in
the shutdown phase.

This is implemented by checking if a process is part of a cgroup marked
with a user.survive_final_kill_signal xattr (or a
trusted.survive_final_kill_signal xattr if we can't set a user one: user
xattr support was only added in kernel v5.7 and is not available in
CentOS 8).

12 files changed:
man/org.freedesktop.systemd1.xml
man/systemd-soft-reboot.service.xml
man/systemd.unit.xml
src/core/cgroup.c
src/core/dbus-unit.c
src/core/load-fragment-gperf.gperf.in
src/core/main.c
src/core/unit-serialize.c
src/core/unit.h
src/shared/bus-unit-util.c
src/shared/killall.c
test/units/testsuite-82.sh

index f9aa67739f7049529ef6034e3a6df0b0841902d5..01d8f659d501054a4790ed181d9aadbca3df23b3 100644 (file)
@@ -2028,6 +2028,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b DefaultDependencies = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly b SurviveFinalKillSignal = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s OnSuccessJobMode = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s OnFailureJobMode = '...';
@@ -2142,6 +2144,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property CanFreeze is not documented!-->
 
+    <!--property SurviveFinalKillSignal is not documented!-->
+
     <!--property OnSuccessJobMode is not documented!-->
 
     <!--property OnFailureJobMode is not documented!-->
@@ -2354,6 +2358,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DefaultDependencies"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="SurviveFinalKillSignal"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="OnSuccessJobMode"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="OnFailureJobMode"/>
@@ -11613,6 +11619,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>AccessSELinuxContext</varname> and
       <varname>ActivationDetails</varname> were added in version 252.</para>
       <para><function>QueueSignal()</function> was added in version 254.</para>
+      <para><varname>SurviveFinalKillSignal</varname> was added in version 255.</para>
     </refsect2>
     <refsect2>
       <title>Service Unit Objects</title>
index f7908d3f032f05b5d17c8e3e3e32c81cf31db00e..1de2fbc5f376c0ff001975076c4f7e8c3cc4c05e 100644 (file)
       <listitem><para>The <filename>/run/</filename> file system remains mounted and populated and may be
       used to pass state information between such userspace reboot cycles.</para></listitem>
 
-      <listitem><para>Service processes may continue to run over the transition, if they are placed in
-      services that remain active until the very end of shutdown (which again is achieved via
-      <varname>DefaultDependencies=no</varname>). They must also be set up to avoid being killed by the
-      aforementioned <constant>SIGTERM</constant> spree (as per <ulink
-      url="https://systemd.io/ROOT_STORAGE_DAEMONS">systemd and Storage Daemons for the Root File
-      System</ulink>).</para></listitem>
+      <listitem><para>Service processes may continue to run over the transition, past soft-reboot and into
+      the next session, if they are placed in services that remain active until the very end of shutdown
+      (which again is achieved via <varname>DefaultDependencies=no</varname>). They must also be set up to
+      avoid being killed by the aforementioned <constant>SIGTERM</constant> and <constant>SIGKILL</constant>
+      via <varname>SurviveFinalKillSignal=yes</varname>, and also be configured to avoid being stopped on
+      isolate via <varname>IgnoreOnIsolate=yes</varname>. They also have to be configured to be stopped on
+      normal shutdown, reboot and maintenance mode. Finally, they have to be ordered after
+      <constant>basic.target</constant> to ensure correct ordering on boot. Note that in case any new or
+      custom units are used to isolate to, or that implement an equivalent shutdown functionality, they will
+      also have to be configured manually for correct ordering and conflicting. For example:</para>
+
+      <programlisting>[Unit]
+Description=My surviving service
+SurviveFinalKillSignal=yes
+IgnoreOnIsolate=yes
+DefaultDependencies=no
+After=basic.target
+Conflicts=reboot.target
+Before=reboot.target
+Conflicts=kexec.target
+Before=kexec.target
+Conflicts=poweroff.target
+Before=poweroff.target
+Conflicts=halt.target
+Before=halt.target
+Conflicts=rescue.target
+Before=rescue.target
+Conflicts=emergency.target
+Before=emergency.target
+
+[Service]
+Type=oneshot
+ExecStart=sleep infinity
+      </programlisting>
+      </listitem>
 
       <listitem><para>File system mounts may remain mounted during the transition, and complex storage
       attached, if configured to remain until the very end of the shutdown process. (Also achieved via
index fa867dba1acecadea1f0f227fac4bcf1d7ecc667..3447391c9e92a8d0f851b0c9cc5be9c7cb213e8f 100644 (file)
         <xi:include href="version-info.xml" xpointer="v201"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>SurviveFinalKillSignal=</varname></term>
+
+        <listitem><para>Takes a boolean argument. Defaults to <option>no</option>. If <option>yes</option>,
+        processes belonging to this unit will not be sent the final <literal>SIGTERM</literal> and
+        <literal>SIGKILL</literal> signals during the final phase of the system shutdown process.
+        This functionality replaces the older mechanism that allowed a program to set
+        <literal>argv[0][0] = '@'</literal> as described at
+        <ulink url="https://systemd.io/ROOT_STORAGE_DAEMONS">systemd and Storage Daemons for the Root File
+        System</ulink>, which however continues to be supported.</para>
+
+        <xi:include href="version-info.xml" xpointer="v255"/></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>CollectMode=</varname></term>
 
index 32c78a449b99c29d09d70e8b6b39d6409c9229c0..e8f8ddc2445fe87d7de5637bbdadd181c47c8bd9 100644 (file)
@@ -888,6 +888,7 @@ int cgroup_log_xattr_apply(Unit *u, const char *cgroup_path) {
 
 static void cgroup_xattr_apply(Unit *u) {
         bool b;
+        int r;
 
         assert(u);
 
@@ -921,6 +922,32 @@ static void cgroup_xattr_apply(Unit *u) {
                 else
                         unit_remove_xattr_graceful(u, NULL, xn);
         }
+
+        if (u->survive_final_kill_signal) {
+                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
+                                 u->cgroup_path,
+                                 "user.survive_final_kill_signal",
+                                 "1",
+                                 1,
+                                 /* flags= */ 0);
+                /* user xattr support was added in kernel v5.7 */
+                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
+                                        u->cgroup_path,
+                                        "trusted.survive_final_kill_signal",
+                                        "1",
+                                        1,
+                                        /* flags= */ 0);
+                if (r < 0)
+                        log_unit_debug_errno(u,
+                                             r,
+                                             "Failed to set 'survive_final_kill_signal' xattr on control "
+                                             "group %s, ignoring: %m",
+                                             empty_to_root(u->cgroup_path));
+        } else {
+                unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "user.survive_final_kill_signal");
+                unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "trusted.survive_final_kill_signal");
+        }
 }
 
 static int lookup_block_device(const char *p, dev_t *ret) {
index e9b446945aaa60846b043973c8b3292bf6fac9b6..05b80cbf331848bcd05bf2bd5d94c8965a145037 100644 (file)
@@ -921,6 +921,7 @@ const sd_bus_vtable bus_unit_vtable[] = {
         SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
         SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -2174,6 +2175,9 @@ static int bus_unit_set_transient_property(
         if (streq(name, "DefaultDependencies"))
                 return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
 
+        if (streq(name, "SurviveFinalKillSignal"))
+                return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error);
+
         if (streq(name, "OnSuccessJobMode"))
                 return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
 
index 6cdf13197517b5f030bce56ecef84a12ca85028a..77a0dce529f7311c1c7aad3e41ffbc3455b27616 100644 (file)
@@ -313,6 +313,7 @@ Unit.RefuseManualStart,                  config_parse_bool,
 Unit.RefuseManualStop,                   config_parse_bool,                           0,                                  offsetof(Unit, refuse_manual_stop)
 Unit.AllowIsolate,                       config_parse_bool,                           0,                                  offsetof(Unit, allow_isolate)
 Unit.DefaultDependencies,                config_parse_bool,                           0,                                  offsetof(Unit, default_dependencies)
+Unit.SurviveFinalKillSignal,             config_parse_bool,                           0,                                  offsetof(Unit, survive_final_kill_signal)
 Unit.OnSuccessJobMode,                   config_parse_job_mode,                       0,                                  offsetof(Unit, on_success_job_mode)
 Unit.OnFailureJobMode,                   config_parse_job_mode,                       0,                                  offsetof(Unit, on_failure_job_mode)
 {# The following is a legacy alias name for compatibility #}
index 4ee5e8735f42b19dff1487097c93e396da71f2cd..cbcf3ddeeae2ef5ef504b185921b0b116bfb92c1 100644 (file)
@@ -1766,7 +1766,8 @@ static void finish_remaining_processes(ManagerObjective objective) {
         if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
                 broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
 
-        /* On soft reboot really make sure nothing is left */
+        /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
+         * of units that were configured with SurviveFinalKillSignal=yes. */
         if (objective == MANAGER_SOFT_REBOOT)
                 broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
 }
index f0030d3211bcf3f2e2c4ec233dbfe862c578bc77..28510026159066cc24eed8dea607b2fe626ad2e5 100644 (file)
@@ -826,6 +826,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
                         "%s\tRefuseManualStart: %s\n"
                         "%s\tRefuseManualStop: %s\n"
                         "%s\tDefaultDependencies: %s\n"
+                        "%s\tSurviveFinalKillSignal: %s\n"
                         "%s\tOnSuccessJobMode: %s\n"
                         "%s\tOnFailureJobMode: %s\n"
                         "%s\tIgnoreOnIsolate: %s\n",
@@ -833,6 +834,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
                         prefix, yes_no(u->refuse_manual_start),
                         prefix, yes_no(u->refuse_manual_stop),
                         prefix, yes_no(u->default_dependencies),
+                        prefix, yes_no(u->survive_final_kill_signal),
                         prefix, job_mode_to_string(u->on_success_job_mode),
                         prefix, job_mode_to_string(u->on_failure_job_mode),
                         prefix, yes_no(u->ignore_on_isolate));
index f1a80cc8912201ff896b0173b231286a8efc8fd0..ee466f351a3aecbc0d729b63c3c94dd5f8a272a5 100644 (file)
@@ -451,6 +451,9 @@ typedef struct Unit {
         /* Create default dependencies */
         bool default_dependencies;
 
+        /* Configure so that the unit survives a system transition without stopping/starting. */
+        bool survive_final_kill_signal;
+
         /* Refuse manual starting, allow starting only indirectly via dependency. */
         bool refuse_manual_start;
 
index f7003df9f0823f02571b2babea635e4193aff543..f88a4f5aab085d32590bc30dea68010344b8dbdd 100644 (file)
@@ -2607,6 +2607,7 @@ static int bus_append_unit_property(sd_bus_message *m, const char *field, const
                               "RefuseManualStop",
                               "AllowIsolate",
                               "IgnoreOnIsolate",
+                              "SurviveFinalKillSignal",
                               "DefaultDependencies"))
                 return bus_append_parse_boolean(m, field, eq);
 
index 0b5a6642ec23e51f863553130f1161c314c18cc9..66acba5b092e5619eedfe26f37b3833db2eec4c6 100644 (file)
@@ -11,6 +11,7 @@
 #include "alloc-util.h"
 #include "constants.h"
 #include "dirent-util.h"
+#include "errno-util.h"
 #include "fd-util.h"
 #include "format-util.h"
 #include "initrd-util.h"
 #include "string-util.h"
 #include "terminal-util.h"
 
-static bool ignore_proc(pid_t pid, bool warn_rootfs) {
+static bool argv_has_at(pid_t pid) {
         _cleanup_fclose_ FILE *f = NULL;
         const char *p;
         char c = 0;
+
+        p = procfs_file_alloca(pid, "cmdline");
+        f = fopen(p, "re");
+        if (!f) {
+                log_debug_errno(errno, "Failed to open %s, ignoring: %m", p);
+                return true; /* not really, but has the desired effect */
+        }
+
+        /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
+         * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
+         * actual kernel threads are already filtered out above. */
+        (void) fread(&c, 1, 1, f);
+
+        /* Processes with argv[0][0] = '@' we ignore from the killing spree.
+         *
+         * https://systemd.io/ROOT_STORAGE_DAEMONS */
+        return c == '@';
+}
+
+static bool is_survivor_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup_path = NULL;
+        int r;
+
+        r = cg_pid_get_path(/* root= */ NULL, pid, &cgroup_path);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid);
+                return false;
+        }
+
+        r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.survive_final_kill_signal");
+        /* user xattr support was added to kernel v5.7, try with the trusted namespace as a fallback */
+        if (ERRNO_IS_NEG_XATTR_ABSENT(r))
+                r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER,
+                                      cgroup_path,
+                                      "trusted.survive_final_kill_signal");
+        if (r < 0)
+                log_debug_errno(r,
+                                "Failed to get survive_final_kill_signal xattr of %s, ignoring: %m",
+                                cgroup_path);
+
+        return r > 0;
+}
+
+static bool ignore_proc(pid_t pid, bool warn_rootfs) {
         uid_t uid;
         int r;
 
@@ -38,6 +83,10 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) {
         if (r != 0)
                 return true; /* also ignore processes where we can't determine this */
 
+        /* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr */
+        if (is_survivor_cgroup(pid))
+                return true;
+
         r = get_process_uid(pid, &uid);
         if (r < 0)
                 return true; /* not really, but better safe than sorry */
@@ -46,20 +95,7 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) {
         if (uid != 0)
                 return false;
 
-        p = procfs_file_alloca(pid, "cmdline");
-        f = fopen(p, "re");
-        if (!f)
-                return true; /* not really, but has the desired effect */
-
-        /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
-         * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
-         * actual kernel threads are already filtered out above. */
-        (void) fread(&c, 1, 1, f);
-
-        /* Processes with argv[0][0] = '@' we ignore from the killing spree.
-         *
-         * https://systemd.io/ROOT_STORAGE_DAEMONS */
-        if (c != '@')
+        if (!argv_has_at(pid))
                 return false;
 
         if (warn_rootfs &&
index d13fe1b76f7c47862235b67ea14271708e880a1b..24bd976b8735ae9c9d061acd20d7f242ed285875 100755 (executable)
@@ -20,8 +20,8 @@ if [ -f /run/testsuite82.touch3 ]; then
     read -r x <&5
     test "$x" = "oinkoink"
 
-    # Check that no service is still around
-    test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+    # Check that the surviving service is still around
+    test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
     test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
 
     # All succeeded, exit cleanly now
@@ -43,8 +43,8 @@ elif [ -f /run/testsuite82.touch2 ]; then
     systemd-notify --fd=3 --pid=parent 3<"$T"
     rm "$T"
 
-    # Check that no service is still around
-    test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+    # Check that the surviving service is still around
+    test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
     test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
 
     # Test that we really are in the new overlayfs root fs
@@ -57,6 +57,9 @@ elif [ -f /run/testsuite82.touch2 ]; then
     mount --bind /original-root /run/nextroot
     mount
 
+    # Restart the unit that is not supposed to survive
+    systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
+
     # Now issue the soft reboot. We should be right back soon.
     touch /run/testsuite82.touch3
     systemctl --no-block soft-reboot
@@ -85,8 +88,8 @@ elif [ -f /run/testsuite82.touch ]; then
     systemd-notify --fd=3 --pid=parent 3<"$T"
     rm "$T"
 
-    # Check that no service survived, regardless of the configuration
-    test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+    # Check that the surviving service is still around
+    test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
     test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
 
     # This time we test the /run/nextroot/ root switching logic. (We synthesize a new rootfs from the old via overlayfs)
@@ -107,6 +110,9 @@ elif [ -f /run/testsuite82.touch ]; then
     # Bind our current root into the target so that we later can return to it
     mount --bind / /run/nextroot/original-root
 
+    # Restart the unit that is not supposed to survive
+    systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
+
     # Now issue the soft reboot. We should be right back soon.
     touch /run/testsuite82.touch2
     systemctl --no-block soft-reboot
@@ -123,23 +129,17 @@ else
     systemd-notify --fd=3 --pid=parent 3<"$T"
     rm "$T"
 
-    # Create a script that can survive the soft reboot by ignoring SIGTERM (we
-    # do this instead of the argv[0][0] = '@' thing because that's so hard to
-    # do from a shell
-    T="/dev/shm/survive-$RANDOM.sh"
-    cat >$T <<EOF
-#!/bin/bash
-trap "" TERM
-systemd-notify --ready
-rm "$T"
-exec sleep infinity
-EOF
-    chmod +x "$T"
-    # This sets DefaultDependencies=no so that it remains running until the
-    # very end, and IgnoreOnIsolate=yes so that it isn't stopped via the
-    # "testsuite.target" isolation we do on next boot
-    systemd-run -p Type=notify -p DefaultDependencies=no -p IgnoreOnIsolate=yes --unit=testsuite-82-survive.service "$T"
-    systemd-run -p Type=exec -p DefaultDependencies=no -p IgnoreOnIsolate=yes --unit=testsuite-82-nosurvive.service sleep infinity
+    # Configure this transient unit to survive the soft reboot - it will not conflict with shutdown.target
+    # and it will be ignored on the isolate that happens in the next boot.
+    systemd-run -p Type=exec --unit=testsuite-82-survive.service \
+        --property SurviveFinalKillSignal=yes \
+        --property IgnoreOnIsolate=yes \
+        --property DefaultDependencies=no \
+        --property After=basic.target \
+        --property "Conflicts=reboot.target kexec.target poweroff.target halt.target emergency.target rescue.target" \
+        --property "Before=reboot.target kexec.target poweroff.target halt.target emergency.target rescue.target" \
+        sleep infinity
+    systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
 
     # Check that we can set up an inhibitor, and that busctl monitor sees the
     # PrepareForShutdownWithMetadata signal and that it says 'soft-reboot'.