From: Luca Boccassi Date: Thu, 27 Jul 2023 22:14:31 +0000 (+0100) Subject: pid1: add SurviveFinalKillSignal= to skip units on final sigterm/sigkill spree X-Git-Tag: v255-rc1~396^2~2 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=559214cbbd71d0f9a5b95edf7659610e25cd29cc;p=thirdparty%2Fsystemd.git pid1: add SurviveFinalKillSignal= to skip units on final sigterm/sigkill spree Add a new boolean for units, SurviveFinalKillSignal=yes/no. Units that set it will not have their process receive the final sigterm/sigkill in the shutdown phase. This is implemented by checking if a process is part of a cgroup marked with a user.survive_final_kill_signal xattr (or a trusted xattr if we can't set a user one, which were added only in kernel v5.7 and are not supported in CentOS 8). --- diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index f9aa67739f7..01d8f659d50 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2028,6 +2028,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b DefaultDependencies = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b SurviveFinalKillSignal = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s OnSuccessJobMode = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s OnFailureJobMode = '...'; @@ -2142,6 +2144,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -2354,6 +2358,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -11613,6 +11619,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ AccessSELinuxContext and ActivationDetails were added in version 252. QueueSignal() was added in version 254. + SurviveFinalKillSignal was added in version 255. 
Service Unit Objects diff --git a/man/systemd-soft-reboot.service.xml b/man/systemd-soft-reboot.service.xml index f7908d3f032..1de2fbc5f37 100644 --- a/man/systemd-soft-reboot.service.xml +++ b/man/systemd-soft-reboot.service.xml @@ -96,12 +96,41 @@ The /run/ file system remains mounted and populated and may be used to pass state information between such userspace reboot cycles. - Service processes may continue to run over the transition, if they are placed in - services that remain active until the very end of shutdown (which again is achieved via - DefaultDependencies=no). They must also be set up to avoid being killed by the - aforementioned SIGTERM spree (as per systemd and Storage Daemons for the Root File - System). + Service processes may continue to run over the transition, past soft-reboot and into + the next session, if they are placed in services that remain active until the very end of shutdown + (which again is achieved via DefaultDependencies=no). They must also be set up to + avoid being killed by the aforementioned SIGTERM and SIGKILL + via SurviveFinalKillSignal=yes, and also be configured to avoid being stopped on + isolate via IgnoreOnIsolate=yes. They also have to be configured to be stopped on + normal shutdown, reboot and maintenance mode. Finally, they have to be ordered after + basic.target to ensure correct ordering on boot. Note that in case any new or + custom units are used to isolate to, or that implement an equivalent shutdown functionality, they will + also have to be configured manually for correct ordering and conflicting. 
For example: + + [Unit] +Description=My surviving service +SurviveFinalKillSignal=yes +IgnoreOnIsolate=yes +DefaultDependencies=no +After=basic.target +Conflicts=reboot.target +Before=reboot.target +Conflicts=kexec.target +Before=kexec.target +Conflicts=poweroff.target +Before=poweroff.target +Conflicts=halt.target +Before=halt.target +Conflicts=rescue.target +Before=rescue.target +Conflicts=emergency.target +Before=emergency.target + +[Service] +Type=oneshot +ExecStart=sleep infinity + + File system mounts may remain mounted during the transition, and complex storage attached, if configured to remain until the very end of the shutdown process. (Also achieved via diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml index fa867dba1ac..3447391c9e9 100644 --- a/man/systemd.unit.xml +++ b/man/systemd.unit.xml @@ -1023,6 +1023,20 @@ + + SurviveFinalKillSignal= + + Takes a boolean argument. Defaults to no. If yes, + processes belonging to this unit will not be sent the final SIGTERM and + SIGKILL signals during the final phase of the system shutdown process. + This functionality replaces the older mechanism that allowed a program to set + argv[0][0] = '@' as described at + systemd and Storage Daemons for the Root File + System, which however continues to be supported. 
+ + + + CollectMode= diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 32c78a449b9..e8f8ddc2445 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -888,6 +888,7 @@ int cgroup_log_xattr_apply(Unit *u, const char *cgroup_path) { static void cgroup_xattr_apply(Unit *u) { bool b; + int r; assert(u); @@ -921,6 +922,32 @@ static void cgroup_xattr_apply(Unit *u) { else unit_remove_xattr_graceful(u, NULL, xn); } + + if (u->survive_final_kill_signal) { + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, + u->cgroup_path, + "user.survive_final_kill_signal", + "1", + 1, + /* flags= */ 0); + /* user xattr support was added in kernel v5.7 */ + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, + u->cgroup_path, + "trusted.survive_final_kill_signal", + "1", + 1, + /* flags= */ 0); + if (r < 0) + log_unit_debug_errno(u, + r, + "Failed to set 'survive_final_kill_signal' xattr on control " + "group %s, ignoring: %m", + empty_to_root(u->cgroup_path)); + } else { + unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "user.survive_final_kill_signal"); + unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "trusted.survive_final_kill_signal"); + } } static int lookup_block_device(const char *p, dev_t *ret) { diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index e9b446945aa..05b80cbf331 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -921,6 +921,7 @@ const sd_bus_vtable bus_unit_vtable[] = { SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), 
SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */ SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST), @@ -2174,6 +2175,9 @@ static int bus_unit_set_transient_property( if (streq(name, "DefaultDependencies")) return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error); + if (streq(name, "SurviveFinalKillSignal")) + return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error); + if (streq(name, "OnSuccessJobMode")) return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error); diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 6cdf1319751..77a0dce529f 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -313,6 +313,7 @@ Unit.RefuseManualStart, config_parse_bool, Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop) Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate) Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies) +Unit.SurviveFinalKillSignal, config_parse_bool, 0, offsetof(Unit, survive_final_kill_signal) Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode) Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode) {# The following is a legacy alias name for compatibility #} diff --git a/src/core/main.c b/src/core/main.c index 4ee5e8735f4..cbcf3ddeeae 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1766,7 +1766,8 @@ static void finish_remaining_processes(ManagerObjective objective) { if (IN_SET(objective, 
MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT)) broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec); - /* On soft reboot really make sure nothing is left */ + /* On soft reboot really make sure nothing is left. Note that this will skip cgroups + * of units that were configured with SurviveFinalKillSignal=yes. */ if (objective == MANAGER_SOFT_REBOOT) broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec); } diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c index f0030d3211b..28510026159 100644 --- a/src/core/unit-serialize.c +++ b/src/core/unit-serialize.c @@ -826,6 +826,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { "%s\tRefuseManualStart: %s\n" "%s\tRefuseManualStop: %s\n" "%s\tDefaultDependencies: %s\n" + "%s\tSurviveFinalKillSignal: %s\n" "%s\tOnSuccessJobMode: %s\n" "%s\tOnFailureJobMode: %s\n" "%s\tIgnoreOnIsolate: %s\n", @@ -833,6 +834,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { prefix, yes_no(u->refuse_manual_start), prefix, yes_no(u->refuse_manual_stop), prefix, yes_no(u->default_dependencies), + prefix, yes_no(u->survive_final_kill_signal), prefix, job_mode_to_string(u->on_success_job_mode), prefix, job_mode_to_string(u->on_failure_job_mode), prefix, yes_no(u->ignore_on_isolate)); diff --git a/src/core/unit.h b/src/core/unit.h index f1a80cc8912..ee466f351a3 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -451,6 +451,9 @@ typedef struct Unit { /* Create default dependencies */ bool default_dependencies; + /* Configure so that the unit survives a system transition without stopping/starting. */ + bool survive_final_kill_signal; + /* Refuse manual starting, allow starting only indirectly via dependency. 
*/ bool refuse_manual_start; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index f7003df9f08..f88a4f5aab0 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2607,6 +2607,7 @@ static int bus_append_unit_property(sd_bus_message *m, const char *field, const "RefuseManualStop", "AllowIsolate", "IgnoreOnIsolate", + "SurviveFinalKillSignal", "DefaultDependencies")) return bus_append_parse_boolean(m, field, eq); diff --git a/src/shared/killall.c b/src/shared/killall.c index 0b5a6642ec2..66acba5b092 100644 --- a/src/shared/killall.c +++ b/src/shared/killall.c @@ -11,6 +11,7 @@ #include "alloc-util.h" #include "constants.h" #include "dirent-util.h" +#include "errno-util.h" #include "fd-util.h" #include "format-util.h" #include "initrd-util.h" @@ -22,10 +23,54 @@ #include "string-util.h" #include "terminal-util.h" -static bool ignore_proc(pid_t pid, bool warn_rootfs) { +static bool argv_has_at(pid_t pid) { _cleanup_fclose_ FILE *f = NULL; const char *p; char c = 0; + + p = procfs_file_alloca(pid, "cmdline"); + f = fopen(p, "re"); + if (!f) { + log_debug_errno(errno, "Failed to open %s, ignoring: %m", p); + return true; /* not really, but has the desired effect */ + } + + /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for + * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as + * actual kernel threads are already filtered out above. */ + (void) fread(&c, 1, 1, f); + + /* Processes with argv[0][0] = '@' we ignore from the killing spree. 
+ * + * https://systemd.io/ROOT_STORAGE_DAEMONS */ + return c == '@'; +} + +static bool is_survivor_cgroup(pid_t pid) { + _cleanup_free_ char *cgroup_path = NULL; + int r; + + r = cg_pid_get_path(/* root= */ NULL, pid, &cgroup_path); + if (r < 0) { + log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid); + return false; + } + + r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.survive_final_kill_signal"); + /* user xattr support was added to kernel v5.7, try with the trusted namespace as a fallback */ + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) + r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, + cgroup_path, + "trusted.survive_final_kill_signal"); + if (r < 0) + log_debug_errno(r, + "Failed to get survive_final_kill_signal xattr of %s, ignoring: %m", + cgroup_path); + + return r > 0; +} + +static bool ignore_proc(pid_t pid, bool warn_rootfs) { uid_t uid; int r; @@ -38,6 +83,10 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) { if (r != 0) return true; /* also ignore processes where we can't determine this */ + /* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr */ + if (is_survivor_cgroup(pid)) + return true; + r = get_process_uid(pid, &uid); if (r < 0) return true; /* not really, but better safe than sorry */ @@ -46,20 +95,7 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) { if (uid != 0) return false; - p = procfs_file_alloca(pid, "cmdline"); - f = fopen(p, "re"); - if (!f) - return true; /* not really, but has the desired effect */ - - /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for - * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as - * actual kernel threads are already filtered out above. */ - (void) fread(&c, 1, 1, f); - - /* Processes with argv[0][0] = '@' we ignore from the killing spree. 
- * - * https://systemd.io/ROOT_STORAGE_DAEMONS */ - if (c != '@') + if (!argv_has_at(pid)) return false; if (warn_rootfs && diff --git a/test/units/testsuite-82.sh b/test/units/testsuite-82.sh index d13fe1b76f7..24bd976b873 100755 --- a/test/units/testsuite-82.sh +++ b/test/units/testsuite-82.sh @@ -20,8 +20,8 @@ if [ -f /run/testsuite82.touch3 ]; then read -r x <&5 test "$x" = "oinkoink" - # Check that no service is still around - test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active" + # Check that the surviving service is still around + test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active" test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active" # All succeeded, exit cleanly now @@ -43,8 +43,8 @@ elif [ -f /run/testsuite82.touch2 ]; then systemd-notify --fd=3 --pid=parent 3<"$T" rm "$T" - # Check that no service is still around - test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active" + # Check that the surviving service is still around + test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active" test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active" # Test that we really are in the new overlayfs root fs @@ -57,6 +57,9 @@ elif [ -f /run/testsuite82.touch2 ]; then mount --bind /original-root /run/nextroot mount + # Restart the unit that is not supposed to survive + systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity + # Now issue the soft reboot. We should be right back soon. 
touch /run/testsuite82.touch3 systemctl --no-block soft-reboot @@ -85,8 +88,8 @@ elif [ -f /run/testsuite82.touch ]; then systemd-notify --fd=3 --pid=parent 3<"$T" rm "$T" - # Check that no service survived, regardless of the configuration - test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active" + # Check that the surviving service is still around + test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active" test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active" # This time we test the /run/nextroot/ root switching logic. (We synthesize a new rootfs from the old via overlayfs) @@ -107,6 +110,9 @@ elif [ -f /run/testsuite82.touch ]; then # Bind our current root into the target so that we later can return to it mount --bind / /run/nextroot/original-root + # Restart the unit that is not supposed to survive + systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity + # Now issue the soft reboot. We should be right back soon. touch /run/testsuite82.touch2 systemctl --no-block soft-reboot @@ -123,23 +129,17 @@ else systemd-notify --fd=3 --pid=parent 3<"$T" rm "$T" - # Create a script that can survive the soft reboot by ignoring SIGTERM (we - # do this instead of the argv[0][0] = '@' thing because that's so hard to - # do from a shell - T="/dev/shm/survive-$RANDOM.sh" - cat >$T <