readonly u RestartSteps = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestartMaxDelayUSec = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly t RestartRandomizedDelayUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t RestartUSecNext = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
<!--property RestartMaxDelayUSec is not documented!-->
+ <!--property RestartRandomizedDelayUSec is not documented!-->
+
<!--property RestartUSecNext is not documented!-->
<!--property TimeoutStartFailureMode is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="RestartMaxDelayUSec"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="RestartRandomizedDelayUSec"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="RestartUSecNext"/>
<variablelist class="dbus-property" generated="True" extra-ref="TimeoutStartUSec"/>
<varname>CPUSetPartition</varname>, and
<varname>OOMRules</varname> were added in version 261.</para>
<para><varname>LUOSession</varname> was added in version 262.</para>
+ <para><varname>RestartRandomizedDelayUSec</varname> was added in version 262.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
<xi:include href="version-info.xml" xpointer="v254"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>RestartRandomizedDelaySec=</varname></term>
+ <listitem><para>Delay automatic restarts by a randomly selected, evenly distributed amount of time
+ between 0 and the specified time value, added on top of the delay otherwise configured via
+ <varname>RestartSec=</varname> (and <varname>RestartSteps=</varname>/<varname>RestartMaxDelaySec=</varname>,
+ if used). Takes a value in the same format as <varname>RestartSec=</varname>. Defaults to 0, indicating
+ that no randomized delay shall be applied.</para>
+
+ <para>This setting is useful to spread out the restarts of similarly configured service instances that
+ fail at the same time, preventing them from restarting simultaneously, which could otherwise result in
+ resource congestion. It is the restart-side analogue of <varname>RandomizedDelaySec=</varname> in
+ <citerefentry><refentrytitle>systemd.timer</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para>
+
+ <xi:include href="version-info.xml" xpointer="v262"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>TimeoutStartSec=</varname></term>
<listitem><para>Configures the time to wait for start-up. If a daemon service does not signal
SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestartSteps", "u", bus_property_get_unsigned, offsetof(Service, restart_steps), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestartMaxDelayUSec", "t", bus_property_get_usec, offsetof(Service, restart_max_delay_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartRandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Service, restart_randomized_delay_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestartUSecNext", "t", property_get_restart_usec_next, 0, 0),
SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
if (streq(name, "RestartMaxDelayUSec"))
return bus_set_transient_usec(u, name, &s->restart_max_delay_usec, message, flags, reterr_error);
+ if (streq(name, "RestartRandomizedDelayUSec"))
+ return bus_set_transient_usec(u, name, &s->restart_randomized_delay_usec, message, flags, reterr_error);
+
if (streq(name, "TimeoutStartUSec")) {
r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, reterr_error);
if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
Service.RestartSec, config_parse_sec, 0, offsetof(Service, restart_usec)
Service.RestartSteps, config_parse_unsigned, 0, offsetof(Service, restart_steps)
Service.RestartMaxDelaySec, config_parse_sec, 0, offsetof(Service, restart_max_delay_usec)
+Service.RestartRandomizedDelaySec, config_parse_sec, 0, offsetof(Service, restart_randomized_delay_usec)
Service.TimeoutSec, config_parse_service_timeout, 0, 0
Service.TimeoutStartSec, config_parse_service_timeout, 0, 0
Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec)
(long double) (n_restarts_next - 1) / s->restart_steps));
}
+static usec_t service_restart_usec_next_jittered(const Service *s) {
+        usec_t base_usec;
+
+        assert(s);
+
+        /* Both the restart timer and the deadline reconstructed at coldplug go through this helper, so
+         * the two can never disagree about the effective delay: the regular restart interval plus the
+         * randomized delay that was picked for the pending auto-restart. */
+        base_usec = service_restart_usec_next(s);
+
+        return usec_add(base_usec, s->restart_randomized_delay_chosen_usec);
+}
+
static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) {
usec_t current;
int r;
s->restart_usec = s->restart_max_delay_usec;
}
+ if (s->restart_randomized_delay_usec == USEC_INFINITY) {
+ log_unit_warning(UNIT(s), "RestartRandomizedDelaySec= cannot be infinity, ignoring.");
+ s->restart_randomized_delay_usec = 0;
+ }
+
if (s->refresh_on_reload_set && s->refresh_on_reload_flags != _SERVICE_REFRESH_ON_RELOAD_ALL) {
if (FLAGS_SET(s->refresh_on_reload_flags, SERVICE_RELOAD_EXTENSIONS))
service_can_reload_extensions(s, /* warn = */ true);
"%sRestartSec: %s\n"
"%sRestartSteps: %u\n"
"%sRestartMaxDelaySec: %s\n"
+ "%sRestartRandomizedDelaySec: %s\n"
"%sTimeoutStartSec: %s\n"
"%sTimeoutStopSec: %s\n"
"%sTimeoutStartFailureMode: %s\n"
prefix, FORMAT_TIMESPAN(s->restart_usec, USEC_PER_SEC),
prefix, s->restart_steps,
prefix, FORMAT_TIMESPAN(s->restart_max_delay_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->restart_randomized_delay_usec, USEC_PER_SEC),
prefix, FORMAT_TIMESPAN(s->timeout_start_usec, USEC_PER_SEC),
prefix, FORMAT_TIMESPAN(s->timeout_stop_usec, USEC_PER_SEC),
prefix, service_timeout_failure_mode_to_string(s->timeout_start_failure_mode),
return usec_add(UNIT(s)->state_change_timestamp.monotonic, service_timeout_abort_usec(s));
case SERVICE_AUTO_RESTART:
- return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, service_restart_usec_next(s));
+ return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic,
+ service_restart_usec_next_jittered(s));
case SERVICE_CLEANING:
return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->exec_context.timeout_clean_usec);
if (s->restart_mode != SERVICE_RESTART_MODE_DIRECT)
service_set_state(s, restart_state);
- restart_usec_next = service_restart_usec_next(s);
+ /* Do the randomized restart delay once and remember it so that it's stable across daemon-reload */
+ s->restart_randomized_delay_chosen_usec = s->restart_randomized_delay_usec > 0 ?
+ random_u64_range(s->restart_randomized_delay_usec) : 0;
+
+ restart_usec_next = service_restart_usec_next_jittered(s);
r = service_arm_timer(s, /* relative= */ true, restart_usec_next);
if (r < 0) {
log_unit_notice(UNIT(s), "Service dead, subsequent restarts will be executed with debug level logging.");
}
- log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0));
+ log_unit_debug(UNIT(s), "Next restart interval calculated as: %s (randomized delay: %s)",
+ FORMAT_TIMESPAN(restart_usec_next, 0),
+ FORMAT_TIMESPAN(s->restart_randomized_delay_chosen_usec, 0));
service_set_state(s, SERVICE_AUTO_RESTART);
} else {
(void) serialize_bool(f, "bus-name-good", s->bus_name_good);
(void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts);
+ (void) serialize_usec(f, "restart-randomized-delay-chosen-usec", s->restart_randomized_delay_chosen_usec);
(void) serialize_bool(f, "forbid-restart", s->forbid_restart);
service_serialize_exec_command(u, f, s->control_command);
if (r < 0)
log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value);
+ } else if (streq(key, "restart-randomized-delay-chosen-usec")) {
+ (void) deserialize_usec(value, &s->restart_randomized_delay_chosen_usec);
+
} else if (streq(key, "forbid-restart")) {
r = parse_boolean(value);
if (r < 0)
unsigned restart_steps;
usec_t restart_usec;
usec_t restart_max_delay_usec;
+ usec_t restart_randomized_delay_usec; /* configured upper bound for the randomized restart delay */
+ usec_t restart_randomized_delay_chosen_usec; /* the value actually picked for the pending auto-restart */
usec_t timeout_start_usec;
usec_t timeout_stop_usec;
usec_t timeout_abort_usec;
JSON_BUILD_PAIR_FINITE_USEC("RestartUSec", s->restart_usec),
JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("RestartSteps", s->restart_steps),
JSON_BUILD_PAIR_FINITE_USEC_NON_ZERO("RestartMaxDelayUSec", s->restart_max_delay_usec),
+ JSON_BUILD_PAIR_FINITE_USEC_NON_ZERO("RestartRandomizedDelayUSec", s->restart_randomized_delay_usec),
JSON_BUILD_PAIR_FINITE_USEC("TimeoutStartUSec", s->timeout_start_usec),
JSON_BUILD_PAIR_FINITE_USEC("TimeoutStopUSec", s->timeout_stop_usec),
JSON_BUILD_PAIR_ENUM("TimeoutStartFailureMode", service_timeout_failure_mode_to_string(s->timeout_start_failure_mode)),
{ "GuessMainPID", bus_append_parse_boolean },
{ "RestartSec", bus_append_parse_sec_rename },
{ "RestartMaxDelaySec", bus_append_parse_sec_rename },
+ { "RestartRandomizedDelaySec", bus_append_parse_sec_rename },
{ "TimeoutStartSec", bus_append_parse_sec_rename },
{ "TimeoutStopSec", bus_append_parse_sec_rename },
{ "TimeoutAbortSec", bus_append_parse_sec_rename },
SD_VARLINK_DEFINE_FIELD(RestartSteps, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.service.html#RestartMaxDelaySec="),
SD_VARLINK_DEFINE_FIELD(RestartMaxDelayUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.service.html#RestartRandomizedDelaySec="),
+ SD_VARLINK_DEFINE_FIELD(RestartRandomizedDelayUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.service.html#TimeoutStartSec="),
SD_VARLINK_DEFINE_FIELD(TimeoutStartUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.service.html#TimeoutStopSec="),
rm /run/systemd/system/"$UNIT_NAME"
+# Test RestartRandomizedDelaySec=
+
+export UNIT_NAME="TEST-03-JOBS-restart-randomized-delay.service"
+
+cat >"/run/systemd/system/$UNIT_NAME" <<EOF
+[Service]
+Type=simple
+ExecStart=false
+Restart=on-failure
+RestartSec=1
+RestartRandomizedDelaySec=1
+StartLimitIntervalSec=0
+EOF
+
+systemctl daemon-reload
+
+# The option should be parsed and exposed on the bus as RestartRandomizedDelayUSec (rendered by
+# "systemctl show" as a time span, hence "1s").
+assert_eq "$(systemctl show "$UNIT_NAME" -P RestartRandomizedDelayUSec)" "1s"
+
+# The chosen delay is logged at debug level when the unit enters auto-restart, so we can read it without
+# waiting for the delay to elapse.
+PREV_LOG_LEVEL="$(systemctl log-level)"
+
+restart_randomized_delay_cleanup() { # EXIT handler (installed via trap): undo what this test changed
+ set +e # best effort: keep going even if one of the steps below fails
+ systemctl log-level "$PREV_LOG_LEVEL" # restore the log level saved before switching to debug
+ systemctl stop "$UNIT_NAME"
+ rm -f /run/systemd/system/"$UNIT_NAME"
+ systemctl daemon-reload # reload after removing the unit file
+}
+trap restart_randomized_delay_cleanup EXIT
+
+systemctl log-level debug
+
+get_restart_interval() {
+ # Enter auto-restart once, read the logged "<total>|<delay>", then stop again so it never has to elapse.
+ systemctl start --no-block "$UNIT_NAME"
+ timeout 10 bash -c 'while [[ "$(systemctl show "'"$UNIT_NAME"'" -P SubState)" != "auto-restart" ]]; do sleep .2; done'
+ systemctl stop "$UNIT_NAME"
+ journalctl --sync
+ # needed because of -o pipefail
+ { journalctl -q --no-pager -o cat -b -u "$UNIT_NAME" --grep="Next restart interval calculated as" || true; } |
+ sed -n 's/.*calculated as: \(.*\) (randomized delay: \(.*\))$/\1|\2/p' | tail -n1
+}
+
+# Several samples + "not all equal": two draws could rarely render identically (~1e-6) and falsely fail.
+DELAYS=()
+TOTALS=()
+for _ in {1..4}; do
+ IFS='|' read -r total delay <<<"$(get_restart_interval)"
+ TOTALS+=("$total")
+ DELAYS+=("$delay")
+done
+
+systemctl log-level "$PREV_LOG_LEVEL"
+
+: "Chosen randomized restart delays: ${DELAYS[*]} (totals: ${TOTALS[*]})"
+for delay in "${DELAYS[@]}"; do
+ assert_neq "$delay" ""
+ # Within bound: a value below 1s never renders a bare "<digit>s" token (only ms/us).
+ if [[ "$delay" =~ [0-9]s ]]; then
+ echo "FAIL: randomized restart delay '$delay' exceeds the configured 1s bound" >&2
+ exit 1
+ fi
+done
+# Total must vary, proving the jitter is folded into the armed timer (not merely logged).
+all_equal=1
+for total in "${TOTALS[@]}"; do
+ [[ "$total" == "${TOTALS[0]}" ]] || all_equal=0
+done
+assert_eq "$all_equal" "0"
+
touch /testok