]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: add ExecMainHandoverTimestamp property recording time-of-execve
authorLuca Boccassi <bluca@debian.org>
Fri, 19 Apr 2024 23:50:16 +0000 (00:50 +0100)
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Mon, 22 Apr 2024 13:16:05 +0000 (15:16 +0200)
Enable the exec_fd logic for Type=notify* services too, and change it
to send a timestamp instead of a '1' byte. Record the timestamp in a
new ExecMainHandoverTimestamp property so that users can track accurately
when control is handed over from systemd to the service payload, so
that latency and startup performance can be trivially and accurately
tracked and attributed.

man/org.freedesktop.systemd1.xml
src/core/dbus-execute.h
src/core/exec-invoke.c
src/core/execute.c
src/core/execute.h
src/core/service.c
test/units/testsuite-07.exec-timestamps.sh [new file with mode: 0755]

index f60b3ba702852ddd63e6dc7fc1c87c1aac3acc8a..1870ed02bb614ffbee17947add216725e5cf4024 100644 (file)
@@ -2768,6 +2768,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       readonly t ExecMainStartTimestampMonotonic = ...;
       readonly t ExecMainExitTimestamp = ...;
       readonly t ExecMainExitTimestampMonotonic = ...;
+      readonly t ExecMainHandoverTimestamp = ...;
+      readonly t ExecMainHandoverTimestampMonotonic = ...;
       readonly u ExecMainPID = ...;
       readonly i ExecMainCode = ...;
       readonly i ExecMainStatus = ...;
@@ -4057,6 +4059,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExecMainExitTimestampMonotonic"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoverTimestamp"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoverTimestampMonotonic"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="ExecMainPID"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExecMainCode"/>
@@ -4706,12 +4712,17 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
       <para><varname>ExecMainStartTimestamp</varname>, <varname>ExecMainStartTimestampMonotonic</varname>,
       <varname>ExecMainExitTimestamp</varname>, <varname>ExecMainExitTimestampMonotonic</varname>,
+      <varname>ExecMainHandoverTimestamp</varname>, <varname>ExecMainHandoverTimestampMonotonic</varname>,
       <varname>ExecMainPID</varname>, <varname>ExecMainCode</varname>, <varname>ExecMainStatus</varname>
-      contain information about the main process of the service as far as it is known. This is often the same
-      runtime information that is stored in <varname>ExecStart</varname>. However, it deviates for
-      <varname>Type=forking</varname> services where the main process of the service is not forked off
-      systemd directly. These fields either contain information of the last run of the process or of the
-      current running process.</para>
+      contain information about the main process of the service as far as it is known. The
+      <varname>ExecMainStartTimestamp</varname> timestamps record when the main child process is spawned by
+      the service manager. <varname>ExecMainExitTimestamp</varname> timestamps record when the main child
+      process exit has been detected by the service manager. <varname>ExecMainHandoverTimestamp</varname>
+      timestamps record when the service executable is executed by <command>systemd-executor</command>.
+      This is often the same runtime information that is stored in <varname>ExecStart=</varname>. However,
+      it deviates for <varname>Type=forking</varname> services where the main process of the service is not
+      forked off systemd directly. These fields either contain information of the last run of the process or
+      of the current running process.</para>
 
       <para><varname>MainPID</varname> and <varname>ControlPID</varname> contain the main and control PID of
       the service. The main PID is the current main PID of the service and is 0 when the service currently
@@ -12055,8 +12066,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>MemoryZSwapCurrent</varname> were added in version 255.</para>
       <para><varname>EffectiveMemoryHigh</varname>,
       <varname>EffectiveMemoryMax</varname>,
-      <varname>EffectiveTasksMax</varname>, and
-      <varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
+      <varname>EffectiveTasksMax</varname>,
+      <varname>MemoryZSwapWriteback</varname>,
+      <varname>ExecMainHandoverTimestampMonotonic</varname>, and
+      <varname>ExecMainHandoverTimestamp</varname> were added in version 256.</para>
     </refsect2>
     <refsect2>
       <title>Socket Unit Objects</title>
index 5926bdb4b139a99e52dad887292bddcf0348181a..a1205b0997127af06b2c7411de1e315714447dda 100644 (file)
@@ -9,6 +9,7 @@
 #define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags)                   \
         BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
         BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
+        BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoverTimestamp", (offset) + offsetof(ExecStatus, handover_timestamp), flags), \
         SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
         SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
         SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)
index 32b48ececc0d1f267fb260889e1e7d13f972f28d..ff0fdbec5ff8f719af7b489af620c240c9067596 100644 (file)
@@ -5265,12 +5265,14 @@ int exec_invoke(
         log_command_line(context, params, "Executing", executable, final_argv);
 
         if (params->exec_fd >= 0) {
-                uint8_t hot = 1;
+                usec_t t = now(CLOCK_MONOTONIC);
 
                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
-                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a
+                 * timestamp so that the service manager and users can track the precise moment we handed
+                 * over execution of the service to the kernel. */
 
-                if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
+                if (write(params->exec_fd, &t, sizeof(t)) < 0) {
                         *exit_status = EXIT_EXEC;
                         return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
                 }
@@ -5279,7 +5281,7 @@ int exec_invoke(
         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
 
         if (params->exec_fd >= 0) {
-                uint8_t hot = 0;
+                uint64_t hot = 0;
 
                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
                  * that POLLHUP on it no longer means execve() succeeded. */
index 5a4acd0775621e7c1bc91d59cc2d8280c7369675..cab74fdcc1046c66efee18fd06626e198fffe49d 100644 (file)
@@ -1866,6 +1866,11 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
                         "%sStart Timestamp: %s\n",
                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
 
+        if (dual_timestamp_is_set(&s->handover_timestamp))
+                fprintf(f,
+                        "%sHandover Timestamp: %s\n",
+                        prefix, FORMAT_TIMESTAMP(s->handover_timestamp.realtime));
+
         if (dual_timestamp_is_set(&s->exit_timestamp))
                 fprintf(f,
                         "%sExit Timestamp: %s\n"
index c41f9cbec2fc89a483eee9e6c3408ca05ba54b58..acca6e3152d40968284896608305022786e371a6 100644 (file)
@@ -91,6 +91,7 @@ typedef enum ExecKeyringMode {
 struct ExecStatus {
         dual_timestamp start_timestamp;
         dual_timestamp exit_timestamp;
+        dual_timestamp handover_timestamp;
         pid_t pid;
         int code;     /* as in siginfo_t::si_code */
         int status;   /* as in siginfo_t::si_status */
@@ -443,7 +444,9 @@ struct ExecParameters {
         int stdout_fd;
         int stderr_fd;
 
-        /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
+        /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. It
+         * will also be used to send a timestamp taken as the very last operation before execve, for
+         * tracking purposes. */
         int exec_fd;
 
         char *notify_socket;
index f6800c317032f6c62ad4d2e0cfec85199a32f06e..366f52f3e0c5af4e3c9b22f552a06e6bba562924 100644 (file)
@@ -24,6 +24,7 @@
 #include "fd-util.h"
 #include "fileio.h"
 #include "format-util.h"
+#include "io-util.h"
 #include "load-dropin.h"
 #include "load-fragment.h"
 #include "log.h"
@@ -1675,7 +1676,8 @@ static int service_spawn_internal(
                 log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
         }
 
-        if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
+        if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) &&
+            IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) {
                 r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
                 if (r < 0)
                         return r;
@@ -3008,6 +3010,7 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
                 (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
                 (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
                 (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
+                (void) serialize_dual_timestamp(f, "main-exec-status-handover", &s->main_exec_status.handover_timestamp);
 
                 if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
                         (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
@@ -3292,6 +3295,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
                 deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
         else if (streq(key, "main-exec-status-exit"))
                 deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
+        else if (streq(key, "main-exec-status-handover"))
+                deserialize_dual_timestamp(value, &s->main_exec_status.handover_timestamp);
         else if (streq(key, "notify-access-override")) {
                 NotifyAccess notify_access;
 
@@ -3520,21 +3525,22 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
          * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
          * the child signalled us first that it is about to call the execve(). It does so by sending us a simple
          * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
-         * sends a zero byte we'll ignore POLLHUP on the fd again. */
+         * sends a zero byte we'll ignore POLLHUP on the fd again.
+         * For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately
+         * before the target executable is execve'd, so that we can accurately track when control is handed over to the
+         * payload. */
 
         for (;;) {
-                uint8_t x;
+                uint64_t x = 0;
                 ssize_t n;
 
-                n = read(fd, &x, sizeof(x));
-                if (n < 0) {
-                        if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
-                                return 0;
-
-                        return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
-                }
-                if (n == 0) { /* EOF → the event we are waiting for */
-
+                n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false);
+                if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
+                        return 0;
+                if (n < 0)
+                        return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m");
+                if (n == 0) {
+                        /* EOF → the event we are waiting for in case of Type=exec */
                         s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
 
                         if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
@@ -3550,13 +3556,37 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
 
                         return 0;
                 }
+                if (n == 1) {
+                        /* Backward compatibility block: a single byte was read, which means somehow an old
+                         * executor before this logic was introduced sent the notification, so there is no
+                         * timestamp (reset it). */
 
-                /* A byte was read → this turns on/off the exec fd logic */
-                assert(n == sizeof(x));
-                s->exec_fd_hot = x;
-        }
+                        s->exec_fd_hot = x;
+                        if (s->state == SERVICE_START)
+                                s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
 
-        return 0;
+                        continue;
+                }
+
+                if (x == 0) {
+                        /* If we get x=0 then the execve actually failed, which means the exec fd logic is
+                         * now off. Also reset the exec timestamp, given it has no meaning anymore. */
+
+                        s->exec_fd_hot = false;
+                        if (s->state == SERVICE_START)
+                                s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
+                } else {
+                        /* A non-zero value was read, which means the exec fd logic is now enabled. Record
+                         * the received timestamp so that users can track when control is handed over to the
+                         * service payload. */
+
+                        s->exec_fd_hot = true;
+                        if (s->state == SERVICE_START) {
+                                assert_cc(sizeof(uint64_t) == sizeof(usec_t));
+                                dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x);
+                        }
+                }
+        }
 }
 
 static void service_notify_cgroup_empty_event(Unit *u) {
diff --git a/test/units/testsuite-07.exec-timestamps.sh b/test/units/testsuite-07.exec-timestamps.sh
new file mode 100755 (executable)
index 0000000..799bc89
--- /dev/null
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -eux
+set -o pipefail
+
+# Check that timestamps of a Type=notify service are consistent
+
+systemd-run --service-type notify --property NotifyAccess=all --unit notify.service --wait sh -c 'systemd-notify --ready; exit 1' || :
+
+start=$(systemctl show --property=ExecMainStartTimestampMonotonic --value notify.service)
+handover=$(systemctl show --property=ExecMainHandoverTimestampMonotonic --value notify.service)
+active=$(systemctl show --property=ActiveEnterTimestampMonotonic --value notify.service)
+exit=$(systemctl show --property=ExecMainExitTimestampMonotonic --value notify.service)
+
+[[ $start -le $handover ]]
+[[ $handover -le $active ]]
+[[ $active -le $exit ]]