@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b DefaultDependencies = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b SurviveFinalKillSignal = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s OnSuccessJobMode = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s OnFailureJobMode = '...';
<!--property CanFreeze is not documented!-->
+ <!--property SurviveFinalKillSignal is not documented!-->
+
<!--property OnSuccessJobMode is not documented!-->
<!--property OnFailureJobMode is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="DefaultDependencies"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="SurviveFinalKillSignal"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="OnSuccessJobMode"/>
<variablelist class="dbus-property" generated="True" extra-ref="OnFailureJobMode"/>
<para><varname>AccessSELinuxContext</varname> and
<varname>ActivationDetails</varname> were added in version 252.</para>
<para><function>QueueSignal()</function> was added in version 254.</para>
+ <para><varname>SurviveFinalKillSignal</varname> was added in version 255.</para>
</refsect2>
<refsect2>
<title>Service Unit Objects</title>
<listitem><para>The <filename>/run/</filename> file system remains mounted and populated and may be
used to pass state information between such userspace reboot cycles.</para></listitem>
- <listitem><para>Service processes may continue to run over the transition, if they are placed in
- services that remain active until the very end of shutdown (which again is achieved via
- <varname>DefaultDependencies=no</varname>). They must also be set up to avoid being killed by the
- aforementioned <constant>SIGTERM</constant> spree (as per <ulink
- url="https://systemd.io/ROOT_STORAGE_DAEMONS">systemd and Storage Daemons for the Root File
- System</ulink>).</para></listitem>
+ <listitem><para>Service processes may continue to run over the transition, past soft-reboot and into
+ the next session, if they are placed in services that remain active until the very end of shutdown
+ (which again is achieved via <varname>DefaultDependencies=no</varname>). They must also be set up to
+ avoid being killed by the aforementioned <constant>SIGTERM</constant> and <constant>SIGKILL</constant>
+ via <varname>SurviveFinalKillSignal=yes</varname>, and also be configured to avoid being stopped on
+ isolate via <varname>IgnoreOnIsolate=yes</varname>. They also have to be configured to be stopped on
+ normal shutdown, reboot and maintenance mode. Finally, they have to be ordered after
+ <constant>basic.target</constant> to ensure correct ordering on boot. Note that in case any new or
+ custom units are used to isolate to, or that implement an equivalent shutdown functionality, they will
+ also have to be configured manually for correct ordering and conflicting. For example:</para>
+
+ <programlisting>[Unit]
+Description=My surviving service
+SurviveFinalKillSignal=yes
+IgnoreOnIsolate=yes
+DefaultDependencies=no
+After=basic.target
+Conflicts=reboot.target
+Before=reboot.target
+Conflicts=kexec.target
+Before=kexec.target
+Conflicts=poweroff.target
+Before=poweroff.target
+Conflicts=halt.target
+Before=halt.target
+Conflicts=rescue.target
+Before=rescue.target
+Conflicts=emergency.target
+Before=emergency.target
+
+[Service]
+Type=oneshot
+ExecStart=sleep infinity
+ </programlisting>
+ </listitem>
<listitem><para>File system mounts may remain mounted during the transition, and complex storage
attached, if configured to remain until the very end of the shutdown process. (Also achieved via
<xi:include href="version-info.xml" xpointer="v201"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>SurviveFinalKillSignal=</varname></term>
+
+ <listitem><para>Takes a boolean argument. Defaults to <option>no</option>. If <option>yes</option>,
+ processes belonging to this unit will not be sent the final <literal>SIGTERM</literal> and
+ <literal>SIGKILL</literal> signals during the final phase of the system shutdown process.
+ This functionality replaces the older mechanism that allowed a program to set
+ <literal>argv[0][0] = '@'</literal> as described at
+ <ulink url="https://systemd.io/ROOT_STORAGE_DAEMONS">systemd and Storage Daemons for the Root File
+ System</ulink>, which however continues to be supported.</para>
+
+ <xi:include href="version-info.xml" xpointer="v255"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>CollectMode=</varname></term>
static void cgroup_xattr_apply(Unit *u) {
bool b;
+ int r;
assert(u);
else
unit_remove_xattr_graceful(u, NULL, xn);
}
+
+ if (u->survive_final_kill_signal) {
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
+ u->cgroup_path,
+ "user.survive_final_kill_signal",
+ "1",
+ 1,
+ /* flags= */ 0);
+ /* user xattr support was added in kernel v5.7 */
+ if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
+ u->cgroup_path,
+ "trusted.survive_final_kill_signal",
+ "1",
+ 1,
+ /* flags= */ 0);
+ if (r < 0)
+ log_unit_debug_errno(u,
+ r,
+ "Failed to set 'survive_final_kill_signal' xattr on control "
+ "group %s, ignoring: %m",
+ empty_to_root(u->cgroup_path));
+ } else {
+ unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "user.survive_final_kill_signal");
+ unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "trusted.survive_final_kill_signal");
+ }
}
static int lookup_block_device(const char *p, dev_t *ret) {
SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
if (streq(name, "DefaultDependencies"))
return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
+ if (streq(name, "SurviveFinalKillSignal"))
+ return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error);
+
if (streq(name, "OnSuccessJobMode"))
return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop)
Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate)
Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies)
+Unit.SurviveFinalKillSignal, config_parse_bool, 0, offsetof(Unit, survive_final_kill_signal)
Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode)
Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode)
{# The following is a legacy alias name for compatibility #}
if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
- /* On soft reboot really make sure nothing is left */
+ /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
+ * of units that were configured with SurviveFinalKillSignal=yes. */
if (objective == MANAGER_SOFT_REBOOT)
broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
}
"%s\tRefuseManualStart: %s\n"
"%s\tRefuseManualStop: %s\n"
"%s\tDefaultDependencies: %s\n"
+ "%s\tSurviveFinalKillSignal: %s\n"
"%s\tOnSuccessJobMode: %s\n"
"%s\tOnFailureJobMode: %s\n"
"%s\tIgnoreOnIsolate: %s\n",
prefix, yes_no(u->refuse_manual_start),
prefix, yes_no(u->refuse_manual_stop),
prefix, yes_no(u->default_dependencies),
+ prefix, yes_no(u->survive_final_kill_signal),
prefix, job_mode_to_string(u->on_success_job_mode),
prefix, job_mode_to_string(u->on_failure_job_mode),
prefix, yes_no(u->ignore_on_isolate));
/* Create default dependencies */
bool default_dependencies;
+ /* Configure so that the unit survives a system transition without stopping/starting. */
+ bool survive_final_kill_signal;
+
/* Refuse manual starting, allow starting only indirectly via dependency. */
bool refuse_manual_start;
"RefuseManualStop",
"AllowIsolate",
"IgnoreOnIsolate",
+ "SurviveFinalKillSignal",
"DefaultDependencies"))
return bus_append_parse_boolean(m, field, eq);
#include "alloc-util.h"
#include "constants.h"
#include "dirent-util.h"
+#include "errno-util.h"
#include "fd-util.h"
#include "format-util.h"
#include "initrd-util.h"
#include "string-util.h"
#include "terminal-util.h"
-static bool ignore_proc(pid_t pid, bool warn_rootfs) {
+static bool argv_has_at(pid_t pid) {
_cleanup_fclose_ FILE *f = NULL;
const char *p;
char c = 0;
+ /* Returns true if the first byte read from the "cmdline" procfs file of process 'pid' is '@',
+ * i.e. the process follows the argv[0][0] = '@' convention for opting out of the final killing
+ * spree. Also returns true if that file cannot be opened (see below). The path is built by
+ * procfs_file_alloca(); presumably it resolves to the per-PID cmdline file under /proc. */
+ p = procfs_file_alloca(pid, "cmdline");
+ f = fopen(p, "re");
+ if (!f) {
+ log_debug_errno(errno, "Failed to open %s, ignoring: %m", p);
+ return true; /* not really, but has the desired effect: the caller (ignore_proc) then does not take its early 'return false' branch */
+ }
+
+ /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
+ * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
+ * actual kernel threads are expected to be filtered out by the caller before this helper is invoked.
+ * NOTE(review): that filtering is not visible in this helper, confirm against ignore_proc().
+ * If nothing is read, 'c' keeps its initial value 0 and the function returns false. */
+ (void) fread(&c, 1, 1, f);
+
+ /* Processes with argv[0][0] = '@' we ignore from the killing spree.
+ *
+ * https://systemd.io/ROOT_STORAGE_DAEMONS */
+ return c == '@';
+}
+
+/* Checks whether the cgroup that process 'pid' belongs to carries the "survive_final_kill_signal"
+ * marker xattr. The "user." namespace is consulted first; if the xattr is absent there, the
+ * "trusted." namespace is tried as a fallback.
+ *
+ * Returns true only if a lookup yields a positive result. Any failure, including failure to
+ * determine the cgroup path of the process, is treated as "not a survivor" and returns false. */
+static bool is_survivor_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup_path = NULL;
+        int r;
+
+        r = cg_pid_get_path(/* root= */ NULL, pid, &cgroup_path);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid);
+                return false;
+        }
+
+        r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.survive_final_kill_signal");
+        /* user xattr support was added to kernel v5.7, try with the trusted namespace as a fallback */
+        if (ERRNO_IS_NEG_XATTR_ABSENT(r))
+                r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER,
+                                      cgroup_path,
+                                      "trusted.survive_final_kill_signal");
+        /* The xattr being absent is the normal case for every cgroup whose unit did not request
+         * SurviveFinalKillSignal=yes, so do not log about it. Only report genuine lookup failures. */
+        if (r < 0 && !ERRNO_IS_NEG_XATTR_ABSENT(r))
+                log_debug_errno(r,
+                                "Failed to get survive_final_kill_signal xattr of %s, ignoring: %m",
+                                cgroup_path);
+
+        return r > 0;
+}
+
+static bool ignore_proc(pid_t pid, bool warn_rootfs) {
uid_t uid;
int r;
if (r != 0)
return true; /* also ignore processes where we can't determine this */
+ /* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr */
+ if (is_survivor_cgroup(pid))
+ return true;
+
r = get_process_uid(pid, &uid);
if (r < 0)
return true; /* not really, but better safe than sorry */
if (uid != 0)
return false;
- p = procfs_file_alloca(pid, "cmdline");
- f = fopen(p, "re");
- if (!f)
- return true; /* not really, but has the desired effect */
-
- /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
- * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
- * actual kernel threads are already filtered out above. */
- (void) fread(&c, 1, 1, f);
-
- /* Processes with argv[0][0] = '@' we ignore from the killing spree.
- *
- * https://systemd.io/ROOT_STORAGE_DAEMONS */
- if (c != '@')
+ if (!argv_has_at(pid))
return false;
if (warn_rootfs &&
read -r x <&5
test "$x" = "oinkoink"
- # Check that no service is still around
- test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+ # Check that the surviving service is still around
+ test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
# All succeeded, exit cleanly now
systemd-notify --fd=3 --pid=parent 3<"$T"
rm "$T"
- # Check that no service is still around
- test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+ # Check that the surviving service is still around
+ test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
# Test that we really are in the new overlayfs root fs
mount --bind /original-root /run/nextroot
mount
+ # Restart the unit that is not supposed to survive
+ systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
+
# Now issue the soft reboot. We should be right back soon.
touch /run/testsuite82.touch3
systemctl --no-block soft-reboot
systemd-notify --fd=3 --pid=parent 3<"$T"
rm "$T"
- # Check that no service survived, regardless of the configuration
- test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
+ # Check that the surviving service is still around
+ test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
# This time we test the /run/nextroot/ root switching logic. (We synthesize a new rootfs from the old via overlayfs)
# Bind our current root into the target so that we later can return to it
mount --bind / /run/nextroot/original-root
+ # Restart the unit that is not supposed to survive
+ systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
+
# Now issue the soft reboot. We should be right back soon.
touch /run/testsuite82.touch2
systemctl --no-block soft-reboot
systemd-notify --fd=3 --pid=parent 3<"$T"
rm "$T"
- # Create a script that can survive the soft reboot by ignoring SIGTERM (we
- # do this instead of the argv[0][0] = '@' thing because that's so hard to
- # do from a shell
- T="/dev/shm/survive-$RANDOM.sh"
- cat >$T <<EOF
-#!/bin/bash
-trap "" TERM
-systemd-notify --ready
-rm "$T"
-exec sleep infinity
-EOF
- chmod +x "$T"
- # This sets DefaultDependencies=no so that it remains running until the
- # very end, and IgnoreOnIsolate=yes so that it isn't stopped via the
- # "testsuite.target" isolation we do on next boot
- systemd-run -p Type=notify -p DefaultDependencies=no -p IgnoreOnIsolate=yes --unit=testsuite-82-survive.service "$T"
- systemd-run -p Type=exec -p DefaultDependencies=no -p IgnoreOnIsolate=yes --unit=testsuite-82-nosurvive.service sleep infinity
+ # Configure this transient unit to survive the soft reboot - it will not conflict with shutdown.target
+ # and it will be ignored on the isolate that happens in the next boot.
+ systemd-run -p Type=exec --unit=testsuite-82-survive.service \
+ --property SurviveFinalKillSignal=yes \
+ --property IgnoreOnIsolate=yes \
+ --property DefaultDependencies=no \
+ --property After=basic.target \
+ --property "Conflicts=reboot.target kexec.target poweroff.target halt.target emergency.target rescue.target" \
+ --property "Before=reboot.target kexec.target poweroff.target halt.target emergency.target rescue.target" \
+ sleep infinity
+ systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
# Check that we can set up an inhibitor, and that busctl monitor sees the
# PrepareForShutdownWithMetadata signal and that it says 'soft-reboot'.