]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
sd-event: always operate on child source via pidfd 36480/head
authorMike Yuan <me@yhndnzj.com>
Tue, 18 Feb 2025 15:49:05 +0000 (16:49 +0100)
committerMike Yuan <me@yhndnzj.com>
Fri, 21 Feb 2025 17:08:55 +0000 (18:08 +0100)
Follow-up for 6e14c46bac760d01868b0bf48461f6ac44c86be3

Nowadays a pidfd is guarenteed to be around for child
event sources, hence drop the effectively unused pid-based
branches.

Addresses https://github.com/systemd/systemd/pull/36410#discussion_r1959930716

man/sd_event_add_child.xml
src/libsystemd/sd-event/sd-event.c

index e3b9cdd74608f7d4170b9a7926fbde252e2ed4d0..4bf07baf59209aa3f71013e77684034139e3f644 100644 (file)
     </para>
 
     <para><function>sd_event_source_get_child_pidfd()</function> retrieves the file descriptor referencing
-    the watched process ("pidfd") if this functionality is available. On kernels that support the concept the
-    event loop will make use of pidfds to watch child processes, regardless of whether the individual event sources
-    are allocated via <function>sd_event_add_child()</function> or
-    <function>sd_event_add_child_pidfd()</function>. If the latter call was used to allocate the event
-    source, this function returns the file descriptor used for allocation. On kernels that do not support the
-    pidfd concept this function will fail with <constant>EOPNOTSUPP</constant>. This call takes the event
-    source object as the <parameter>source</parameter> parameter and returns the numeric file descriptor.
+    the watched process ("pidfd"). The event loop internally makes use of pidfds to watch child processes,
+    regardless of whether the individual event sources are allocated via <function>sd_event_add_child()</function>
+    or <function>sd_event_add_child_pidfd()</function>. If the latter call was used to allocate the event
+    source, this function returns the original file descriptor used for allocation. This call takes
+    the event source object as the <parameter>source</parameter> parameter and returns the numeric file descriptor.
     </para>
 
     <para><function>sd_event_source_get_child_pidfd_own()</function> may be used to query whether the pidfd
     exist. This behaviour might change eventually.</para>
 
     <para><function>sd_event_source_send_child_signal()</function> may be used to send a UNIX signal to the
-    watched process. If the pidfd concept is supported in the kernel, this is implemented via <citerefentry
-    project='man-pages'><refentrytitle>pidfd_send_signal</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-    and otherwise via <citerefentry
-    project='man-pages'><refentrytitle>rt_sigqueueinfo</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-    (or via <citerefentry
-    project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry> in case
-    <parameter>info</parameter> is <constant>NULL</constant>). The specified parameters match those of these
-    underlying system calls, except that the <parameter>info</parameter> is never modified (and is thus
-    declared constant). Like for the underlying system calls, the <parameter>flags</parameter> parameter
+    watched process via <citerefentry
+    project='man-pages'><refentrytitle>pidfd_send_signal</refentrytitle><manvolnum>2</manvolnum></citerefentry>.
+    The specified parameters match those of the underlying system call, except that the <parameter>info</parameter>
+    is never modified (and is thus declared constant). Like for the underlying system call, the <parameter>flags</parameter> parameter
     currently must be zero.</para>
   </refsect1>
 
           <listitem><para>The passed event source is not a child process event source.</para></listitem>
         </varlistentry>
 
-        <varlistentry>
-          <term><constant>-EOPNOTSUPP</constant></term>
-
-          <listitem><para>A pidfd was requested but the kernel does not support this concept.</para>
-
-          <xi:include href="version-info.xml" xpointer="v245"/></listitem>
-        </varlistentry>
-
       </variablelist>
     </refsect2>
   </refsect1>
index 1fea7d5aa7897fb7c438e461e2a6bc5c0270f286..d2ef4306e20e3a24e01b10a67d67cafb495a7129 100644 (file)
 
 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
 
-static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
+static bool EVENT_SOURCE_WATCH_PIDFD(const sd_event_source *s) {
         /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
         return s &&
                 s->type == SOURCE_CHILD &&
-                s->child.pidfd >= 0 &&
                 s->child.options == WEXITED;
 }
 
@@ -1088,12 +1087,11 @@ static sd_event_source* source_free(sd_event_source *s) {
                 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
 
                 if (s->child.process_owned) {
+                        assert(s->child.pid > 0);
+                        assert(s->child.pidfd >= 0);
 
                         if (!s->child.exited) {
-                                if (s->child.pidfd >= 0)
-                                        r = RET_NERRNO(pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0));
-                                else
-                                        r = RET_NERRNO(kill(s->child.pid, SIGKILL));
+                                r = RET_NERRNO(pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0));
                                 if (r < 0 && r != -ESRCH)
                                         log_debug_errno(r, "Failed to kill process " PID_FMT ", ignoring: %m",
                                                         s->child.pid);
@@ -1103,10 +1101,7 @@ static sd_event_source* source_free(sd_event_source *s) {
                                 siginfo_t si = {};
 
                                 /* Reap the child if we can */
-                                if (s->child.pidfd >= 0)
-                                        (void) waitid(P_PIDFD, s->child.pidfd, &si, WEXITED);
-                                else
-                                        (void) waitid(P_PID, s->child.pid, &si, WEXITED);
+                                (void) waitid(P_PIDFD, s->child.pidfd, &si, WEXITED);
                         }
                 }
 
@@ -2999,13 +2994,13 @@ static int event_source_online(
 
         case SOURCE_CHILD:
                 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
-                        /* yes, we have pidfd */
+                        /* yes, we can rely on pidfd */
 
                         r = source_child_pidfd_register(s, enabled);
                         if (r < 0)
                                 return r;
                 } else {
-                        /* no pidfd, or something other to watch for than WEXITED */
+                        /* something other to watch for than WEXITED */
 
                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
                         if (r < 0) {
@@ -3198,9 +3193,6 @@ _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
         assert_return(s->type == SOURCE_CHILD, -EDOM);
         assert_return(!event_origin_changed(s->event), -ECHILD);
 
-        if (s->child.pidfd < 0)
-                return -EOPNOTSUPP;
-
         return s->child.pidfd;
 }
 
@@ -3251,9 +3243,7 @@ _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
         assert_return(s, -EINVAL);
         assert_return(s->type == SOURCE_CHILD, -EDOM);
         assert_return(!event_origin_changed(s->event), -ECHILD);
-
-        if (s->child.pidfd < 0)
-                return -EOPNOTSUPP;
+        assert(s->child.pidfd >= 0);
 
         return s->child.pidfd_owned;
 }
@@ -3262,9 +3252,7 @@ _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
         assert_return(s, -EINVAL);
         assert_return(s->type == SOURCE_CHILD, -EDOM);
         assert_return(!event_origin_changed(s->event), -ECHILD);
-
-        if (s->child.pidfd < 0)
-                return -EOPNOTSUPP;
+        assert(s->child.pidfd >= 0);
 
         s->child.pidfd_owned = own;
         return 0;
@@ -3723,9 +3711,9 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori
 
         e->need_process_child = false;
 
-        /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
-         * for, instead of using P_ALL. This is because we only want to get child information of very
-         * specific child processes, and not all of them. We might not have processed the SIGCHLD event
+        /* So, this is ugly. We iteratively invoke waitid() + WNOHANG with each child process we shall wait for,
+         * instead of using P_ALL. This is because we only want to get child information of very specific
+         * child processes, and not all of them. We might not have processed the SIGCHLD event
          * of a previous invocation and we don't want to maintain a unbounded *per-child* event queue,
          * hence we really don't want anything flushed out of the kernel's queue that we don't care
          * about. Since this is O(n) this means that if you have a lot of processes you probably want
@@ -3736,6 +3724,7 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori
 
         HASHMAP_FOREACH(s, e->child_sources) {
                 assert(s->type == SOURCE_CHILD);
+                assert(s->child.pidfd >= 0);
 
                 if (s->priority > threshold)
                         continue;
@@ -3755,7 +3744,7 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori
                         continue;
 
                 zero(s->child.siginfo);
-                if (waitid(P_PID, s->child.pid, &s->child.siginfo,
+                if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo,
                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
                         return negative_errno();
 
@@ -3764,14 +3753,12 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori
 
                         if (zombie)
                                 s->child.exited = true;
-
-                        if (!zombie && (s->child.options & WEXITED)) {
-                                /* If the child isn't dead then let's immediately remove the state
-                                 * change from the queue, since there's no benefit in leaving it
-                                 * queued. */
+                        else if (s->child.options & WEXITED) {
+                                /* If the child isn't dead then let's immediately remove the state change
+                                 * from the queue, since there's no benefit in leaving it queued. */
 
                                 assert(s->child.options & (WSTOPPED|WCONTINUED));
-                                (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
+                                (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                         }
 
                         r = source_set_pending(s, true);
@@ -3792,6 +3779,7 @@ static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
         assert(e);
         assert(s);
         assert(s->type == SOURCE_CHILD);
+        assert(s->child.pidfd >= 0);
 
         if (s->pending)
                 return 0;
@@ -3802,8 +3790,13 @@ static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
         if (!EVENT_SOURCE_WATCH_PIDFD(s))
                 return 0;
 
+        /* Note that pidfd would also generate EPOLLHUP when the process gets reaped. But at this point we
+         * only permit EPOLLIN, under the assumption that upon EPOLLHUP the child source should already
+         * be set to pending, and we would have returned early above. */
+        assert(!s->child.exited);
+
         zero(s->child.siginfo);
-        if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
+        if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
                 return -errno;
 
         if (s->child.siginfo.si_pid == 0)
@@ -4228,7 +4221,7 @@ static int source_dispatch(sd_event_source *s) {
 
                 /* Now, reap the PID for good. */
                 if (zombie) {
-                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
+                        (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|WEXITED);
                         s->child.waited = true;
                 }