]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/libsystemd/sd-event/sd-event.c
sd-event: add relative timer calls
[thirdparty/systemd.git] / src / libsystemd / sd-event / sd-event.c
index d93d2f40469d5dce512361f0182c7ee4491bc580..8b338f5db6c910d21aefdb0da16af5e560b1946b 100644 (file)
@@ -9,6 +9,7 @@
 #include "sd-id128.h"
 
 #include "alloc-util.h"
+#include "env-util.h"
 #include "event-source.h"
 #include "fd-util.h"
 #include "fs-util.h"
@@ -16,7 +17,7 @@
 #include "list.h"
 #include "macro.h"
 #include "memory-util.h"
-#include "missing.h"
+#include "missing_syscall.h"
 #include "prioq.h"
 #include "process-util.h"
 #include "set.h"
 
 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
 
+static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
+        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
+        return s &&
+                s->type == SOURCE_CHILD &&
+                s->child.pidfd >= 0 &&
+                s->child.options == WEXITED;
+}
+
 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
         [SOURCE_IO] = "io",
         [SOURCE_TIME_REALTIME] = "realtime",
@@ -106,6 +115,9 @@ struct sd_event {
 
         unsigned n_sources;
 
+        struct epoll_event *event_queue;
+        size_t event_queue_allocated;
+
         LIST_HEAD(sd_event_source, sources);
 
         usec_t last_run, last_log;
@@ -277,6 +289,8 @@ static sd_event *event_free(sd_event *e) {
         hashmap_free(e->child_sources);
         set_free(e->post_sources);
 
+        free(e->event_queue);
+
         return mfree(e);
 }
 
@@ -356,8 +370,6 @@ static bool event_pid_changed(sd_event *e) {
 }
 
 static void source_io_unregister(sd_event_source *s) {
-        int r;
-
         assert(s);
         assert(s->type == SOURCE_IO);
 
@@ -367,8 +379,7 @@ static void source_io_unregister(sd_event_source *s) {
         if (!s->io.registered)
                 return;
 
-        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
-        if (r < 0)
+        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                 strna(s->description), event_source_type_to_string(s->type));
 
@@ -380,22 +391,20 @@ static int source_io_register(
                 int enabled,
                 uint32_t events) {
 
-        struct epoll_event ev;
-        int r;
-
         assert(s);
         assert(s->type == SOURCE_IO);
         assert(enabled != SD_EVENT_OFF);
 
-        ev = (struct epoll_event) {
+        struct epoll_event ev = {
                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                 .data.ptr = s,
         };
+        int r;
 
-        if (s->io.registered)
-                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
-        else
-                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
+        r = epoll_ctl(s->event->epoll_fd,
+                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
+                      s->io.fd,
+                      &ev);
         if (r < 0)
                 return -errno;
 
@@ -404,6 +413,49 @@ static int source_io_register(
         return 0;
 }
 
+static void source_child_pidfd_unregister(sd_event_source *s) {
+        assert(s);
+        assert(s->type == SOURCE_CHILD);
+
+        if (event_pid_changed(s->event))
+                return;
+
+        if (!s->child.registered)
+                return;
+
+        if (EVENT_SOURCE_WATCH_PIDFD(s))
+                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
+                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
+                                        strna(s->description), event_source_type_to_string(s->type));
+
+        s->child.registered = false;
+}
+
+static int source_child_pidfd_register(sd_event_source *s, int enabled) {
+        int r;
+
+        assert(s);
+        assert(s->type == SOURCE_CHILD);
+        assert(enabled != SD_EVENT_OFF);
+
+        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+                struct epoll_event ev = {
+                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
+                        .data.ptr = s,
+                };
+
+                if (s->child.registered)
+                        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->child.pidfd, &ev);
+                else
+                        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->child.pidfd, &ev);
+                if (r < 0)
+                        return -errno;
+        }
+
+        s->child.registered = true;
+        return 0;
+}
+
 static clockid_t event_source_type_to_clock(EventSourceType t) {
 
         switch (t) {
@@ -493,7 +545,6 @@ static int event_make_signal_data(
                 int sig,
                 struct signal_data **ret) {
 
-        struct epoll_event ev;
         struct signal_data *d;
         bool added = false;
         sigset_t ss_copy;
@@ -560,7 +611,7 @@ static int event_make_signal_data(
 
         d->fd = fd_move_above_stdio(r);
 
-        ev = (struct epoll_event) {
+        struct epoll_event ev = {
                 .events = EPOLLIN,
                 .data.ptr = d,
         };
@@ -614,9 +665,8 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig)
 
         assert(e);
 
-        /* Rechecks if the specified signal is still something we are
-         * interested in. If not, we'll unmask it, and possibly drop
-         * the signalfd for it. */
+        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
+         * and possibly drop the signalfd for it. */
 
         if (sig == SIGCHLD &&
             e->n_enabled_child_sources > 0)
@@ -707,9 +757,13 @@ static void source_disconnect(sd_event_source *s) {
                         }
 
                         (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
-                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                 }
 
+                if (EVENT_SOURCE_WATCH_PIDFD(s))
+                        source_child_pidfd_unregister(s);
+                else
+                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+
                 break;
 
         case SOURCE_DEFER:
@@ -769,9 +823,7 @@ static void source_disconnect(sd_event_source *s) {
         if (s->prepare)
                 prioq_remove(s->event->prepare, s, &s->prepare_index);
 
-        event = s->event;
-
-        s->event = NULL;
+        event = TAKE_PTR(s->event);
         LIST_REMOVE(sources, event->sources, s);
         event->n_sources--;
 
@@ -790,6 +842,44 @@ static void source_free(sd_event_source *s) {
         if (s->type == SOURCE_IO && s->io.owned)
                 s->io.fd = safe_close(s->io.fd);
 
+        if (s->type == SOURCE_CHILD) {
+                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
+
+                if (s->child.process_owned) {
+
+                        if (!s->child.exited) {
+                                bool sent = false;
+
+                                if (s->child.pidfd >= 0) {
+                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
+                                                if (errno == ESRCH) /* Already dead */
+                                                        sent = true;
+                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
+                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
+                                                                        s->child.pid);
+                                        } else
+                                                sent = true;
+                                }
+
+                                if (!sent)
+                                        if (kill(s->child.pid, SIGKILL) < 0)
+                                                if (errno != ESRCH) /* Already dead */
+                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
+                                                                        s->child.pid);
+                        }
+
+                        if (!s->child.waited) {
+                                siginfo_t si = {};
+
+                                /* Reap the child if we can */
+                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
+                        }
+                }
+
+                if (s->child.pidfd_owned)
+                        s->child.pidfd = safe_close(s->child.pidfd);
+        }
+
         if (s->destroy_callback)
                 s->destroy_callback(s->userdata);
 
@@ -947,33 +1037,31 @@ static int event_setup_timer_fd(
                 struct clock_data *d,
                 clockid_t clock) {
 
-        struct epoll_event ev;
-        int r, fd;
-
         assert(e);
         assert(d);
 
         if (_likely_(d->fd >= 0))
                 return 0;
 
+        _cleanup_close_ int fd = -1;
+        int r;
+
         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
         if (fd < 0)
                 return -errno;
 
         fd = fd_move_above_stdio(fd);
 
-        ev = (struct epoll_event) {
+        struct epoll_event ev = {
                 .events = EPOLLIN,
                 .data.ptr = d,
         };
 
         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
-        if (r < 0) {
-                safe_close(fd);
+        if (r < 0)
                 return -errno;
-        }
 
-        d->fd = fd;
+        d->fd = TAKE_FD(fd);
         return 0;
 }
 
@@ -1058,6 +1146,31 @@ _public_ int sd_event_add_time(
         return 0;
 }
 
+_public_ int sd_event_add_time_relative(
+                sd_event *e,
+                sd_event_source **ret,
+                clockid_t clock,
+                uint64_t usec,
+                uint64_t accuracy,
+                sd_event_time_handler_t callback,
+                void *userdata) {
+
+        usec_t t;
+        int r;
+
+        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
+         * checks for overflow. */
+
+        r = sd_event_now(e, clock, &t);
+        if (r < 0)
+                return r;
+
+        if (usec >= USEC_INFINITY - t)
+                return -EOVERFLOW;
+
+        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
+}
+
 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
         assert(s);
 
@@ -1073,7 +1186,6 @@ _public_ int sd_event_add_signal(
 
         _cleanup_(source_freep) sd_event_source *s = NULL;
         struct signal_data *d;
-        sigset_t ss;
         int r;
 
         assert_return(e, -EINVAL);
@@ -1085,11 +1197,10 @@ _public_ int sd_event_add_signal(
         if (!callback)
                 callback = signal_exit_callback;
 
-        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
-        if (r != 0)
-                return -r;
-
-        if (!sigismember(&ss, sig))
+        r = signal_is_blocked(sig);
+        if (r < 0)
+                return r;
+        if (r == 0)
                 return -EBUSY;
 
         if (!e->signal_sources) {
@@ -1124,6 +1235,11 @@ _public_ int sd_event_add_signal(
         return 0;
 }
 
+static bool shall_use_pidfd(void) {
+        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
+        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
+}
+
 _public_ int sd_event_add_child(
                 sd_event *e,
                 sd_event_source **ret,
@@ -1144,6 +1260,20 @@ _public_ int sd_event_add_child(
         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(!event_pid_changed(e), -ECHILD);
 
+        if (e->n_enabled_child_sources == 0) {
+                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
+                 * for compatibility with pre-pidfd and because we don't want the reap the child processes
+                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
+                 * take effect.
+                 *
+                 * (As an optimization we only do this check on the first child event source created.) */
+                r = signal_is_blocked(SIGCHLD);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -EBUSY;
+        }
+
         r = hashmap_ensure_allocated(&e->child_sources, NULL);
         if (r < 0)
                 return r;
@@ -1155,30 +1285,144 @@ _public_ int sd_event_add_child(
         if (!s)
                 return -ENOMEM;
 
+        s->wakeup = WAKEUP_EVENT_SOURCE;
         s->child.pid = pid;
         s->child.options = options;
         s->child.callback = callback;
         s->userdata = userdata;
         s->enabled = SD_EVENT_ONESHOT;
 
+        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
+         * pin the PID, and make regular waitid() handling race-free. */
+
+        if (shall_use_pidfd()) {
+                s->child.pidfd = pidfd_open(s->child.pid, 0);
+                if (s->child.pidfd < 0) {
+                        /* Propagate errors unless the syscall is not supported or blocked */
+                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                return -errno;
+                } else
+                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
+        } else
+                s->child.pidfd = -1;
+
         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
         if (r < 0)
                 return r;
 
         e->n_enabled_child_sources++;
 
-        r = event_make_signal_data(e, SIGCHLD, NULL);
-        if (r < 0) {
-                e->n_enabled_child_sources--;
-                return r;
-        }
+        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+                /* We have a pidfd and we only want to watch for exit */
+
+                r = source_child_pidfd_register(s, s->enabled);
+                if (r < 0) {
+                        e->n_enabled_child_sources--;
+                        return r;
+                }
+        } else {
+                /* We have no pidfd or we shall wait for some other event than WEXITED */
+
+                r = event_make_signal_data(e, SIGCHLD, NULL);
+                if (r < 0) {
+                        e->n_enabled_child_sources--;
+                        return r;
+                }
 
-        e->need_process_child = true;
+                e->need_process_child = true;
+        }
 
         if (ret)
                 *ret = s;
+
         TAKE_PTR(s);
+        return 0;
+}
 
+_public_ int sd_event_add_child_pidfd(
+                sd_event *e,
+                sd_event_source **ret,
+                int pidfd,
+                int options,
+                sd_event_child_handler_t callback,
+                void *userdata) {
+
+
+        _cleanup_(source_freep) sd_event_source *s = NULL;
+        pid_t pid;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(e = event_resolve(e), -ENOPKG);
+        assert_return(pidfd >= 0, -EBADF);
+        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
+        assert_return(options != 0, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (e->n_enabled_child_sources == 0) {
+                r = signal_is_blocked(SIGCHLD);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -EBUSY;
+        }
+
+        r = hashmap_ensure_allocated(&e->child_sources, NULL);
+        if (r < 0)
+                return r;
+
+        r = pidfd_get_pid(pidfd, &pid);
+        if (r < 0)
+                return r;
+
+        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
+                return -EBUSY;
+
+        s = source_new(e, !ret, SOURCE_CHILD);
+        if (!s)
+                return -ENOMEM;
+
+        s->wakeup = WAKEUP_EVENT_SOURCE;
+        s->child.pidfd = pidfd;
+        s->child.pid = pid;
+        s->child.options = options;
+        s->child.callback = callback;
+        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
+        s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
+
+        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
+        if (r < 0)
+                return r;
+
+        e->n_enabled_child_sources++;
+
+        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+                /* We only want to watch for WEXITED */
+
+                r = source_child_pidfd_register(s, s->enabled);
+                if (r < 0) {
+                        e->n_enabled_child_sources--;
+                        return r;
+                }
+        } else {
+                /* We shall wait for some other event than WEXITED */
+
+                r = event_make_signal_data(e, SIGCHLD, NULL);
+                if (r < 0) {
+                        e->n_enabled_child_sources--;
+                        return r;
+                }
+
+                e->need_process_child = true;
+        }
+
+        if (ret)
+                *ret = s;
+
+        TAKE_PTR(s);
         return 0;
 }
 
@@ -1231,10 +1475,6 @@ _public_ int sd_event_add_post(
         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(!event_pid_changed(e), -ECHILD);
 
-        r = set_ensure_allocated(&e->post_sources, NULL);
-        if (r < 0)
-                return r;
-
         s = source_new(e, !ret, SOURCE_POST);
         if (!s)
                 return -ENOMEM;
@@ -1243,9 +1483,10 @@ _public_ int sd_event_add_post(
         s->userdata = userdata;
         s->enabled = SD_EVENT_ON;
 
-        r = set_put(e->post_sources, s);
+        r = set_ensure_put(&e->post_sources, NULL, s);
         if (r < 0)
                 return r;
+        assert(r > 0);
 
         if (ret)
                 *ret = s;
@@ -1326,7 +1567,6 @@ static int event_make_inotify_data(
 
         _cleanup_close_ int fd = -1;
         struct inotify_data *d;
-        struct epoll_event ev;
         int r;
 
         assert(e);
@@ -1365,7 +1605,7 @@ static int event_make_inotify_data(
                 return r;
         }
 
-        ev = (struct epoll_event) {
+        struct epoll_event ev = {
                 .events = EPOLLIN,
                 .data.ptr = d,
         };
@@ -1765,7 +2005,7 @@ _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
                         return r;
                 }
 
-                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
+                (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
         }
 
         return 0;
@@ -2026,7 +2266,11 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
                         assert(s->event->n_enabled_child_sources > 0);
                         s->event->n_enabled_child_sources--;
 
-                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+                        if (EVENT_SOURCE_WATCH_PIDFD(s))
+                                source_child_pidfd_unregister(s);
+                        else
+                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+
                         break;
 
                 case SOURCE_EXIT:
@@ -2100,12 +2344,25 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
 
                         s->enabled = m;
 
-                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
-                        if (r < 0) {
-                                s->enabled = SD_EVENT_OFF;
-                                s->event->n_enabled_child_sources--;
-                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
-                                return r;
+                        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+                                /* yes, we have pidfd */
+
+                                r = source_child_pidfd_register(s, s->enabled);
+                                if (r < 0) {
+                                        s->enabled = SD_EVENT_OFF;
+                                        s->event->n_enabled_child_sources--;
+                                        return r;
+                                }
+                        } else {
+                                /* no pidfd, or something other to watch for than WEXITED */
+
+                                r = event_make_signal_data(s->event, SIGCHLD, NULL);
+                                if (r < 0) {
+                                        s->enabled = SD_EVENT_OFF;
+                                        s->event->n_enabled_child_sources--;
+                                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+                                        return r;
+                                }
                         }
 
                         break;
@@ -2170,6 +2427,23 @@ _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
         return 0;
 }
 
+_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
+        usec_t t;
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+
+        r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
+        if (r < 0)
+                return r;
+
+        if (usec >= USEC_INFINITY - t)
+                return -EOVERFLOW;
+
+        return sd_event_source_set_time(s, t + usec);
+}
+
 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
         assert_return(s, -EINVAL);
         assert_return(usec, -EINVAL);
@@ -2228,6 +2502,98 @@ _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
         return 0;
 }
 
+_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (s->child.pidfd < 0)
+                return -EOPNOTSUPP;
+
+        return s->child.pidfd;
+}
+
+_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+        assert_return(SIGNAL_VALID(sig), -EINVAL);
+
+        /* If we already have seen indication the process exited refuse sending a signal early. This way we
+         * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
+         * available. */
+        if (s->child.exited)
+                return -ESRCH;
+
+        if (s->child.pidfd >= 0) {
+                siginfo_t copy;
+
+                /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
+                 * structure here */
+                if (si)
+                        copy = *si;
+
+                if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
+                        /* Let's propagate the error only if the system call is not implemented or prohibited */
+                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                return -errno;
+                } else
+                        return 0;
+        }
+
+        /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
+         * this here. */
+        if (flags != 0)
+                return -EOPNOTSUPP;
+
+        if (si) {
+                /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
+                siginfo_t copy = *si;
+
+                if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
+                        return -errno;
+        } else if (kill(s->child.pid, sig) < 0)
+                return -errno;
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+        if (s->child.pidfd < 0)
+                return -EOPNOTSUPP;
+
+        return s->child.pidfd_owned;
+}
+
+_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+        if (s->child.pidfd < 0)
+                return -EOPNOTSUPP;
+
+        s->child.pidfd_owned = own;
+        return 0;
+}
+
+_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+        return s->child.process_owned;
+}
+
+_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+        s->child.process_owned = own;
+        return 0;
+}
+
 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
         assert_return(s, -EINVAL);
         assert_return(mask, -EINVAL);
@@ -2538,6 +2904,12 @@ static int process_child(sd_event *e) {
                 if (s->enabled == SD_EVENT_OFF)
                         continue;
 
+                if (s->child.exited)
+                        continue;
+
+                if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */
+                        continue;
+
                 zero(s->child.siginfo);
                 r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
@@ -2547,6 +2919,9 @@ static int process_child(sd_event *e) {
                 if (s->child.siginfo.si_pid != 0) {
                         bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
 
+                        if (zombie)
+                                s->child.exited = true;
+
                         if (!zombie && (s->child.options & WEXITED)) {
                                 /* If the child isn't dead then let's
                                  * immediately remove the state change
@@ -2566,6 +2941,33 @@ static int process_child(sd_event *e) {
         return 0;
 }
 
+static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
+        assert(e);
+        assert(s);
+        assert(s->type == SOURCE_CHILD);
+
+        if (s->pending)
+                return 0;
+
+        if (s->enabled == SD_EVENT_OFF)
+                return 0;
+
+        if (!EVENT_SOURCE_WATCH_PIDFD(s))
+                return 0;
+
+        zero(s->child.siginfo);
+        if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
+                return -errno;
+
+        if (s->child.siginfo.si_pid == 0)
+                return 0;
+
+        if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
+                s->child.exited = true;
+
+        return source_set_pending(s, true);
+}
+
 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
         bool read_one = false;
         int r;
@@ -2850,8 +3252,10 @@ static int source_dispatch(sd_event_source *s) {
                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
 
                 /* Now, reap the PID for good. */
-                if (zombie)
+                if (zombie) {
                         (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
+                        s->child.waited = true;
+                }
 
                 break;
         }
@@ -3052,6 +3456,11 @@ _public_ int sd_event_prepare(sd_event *e) {
         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
 
+        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
+         * this check here once, since gettid() is typically not cached, and thus want to minimize
+         * syscalls */
+        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
+
         if (e->exit_requested)
                 goto pending;
 
@@ -3102,8 +3511,7 @@ pending:
 }
 
 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
-        struct epoll_event *ev_queue;
-        unsigned ev_queue_max;
+        size_t event_queue_max;
         int r, m, i;
 
         assert_return(e, -EINVAL);
@@ -3117,14 +3525,15 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
                 return 1;
         }
 
-        ev_queue_max = MAX(e->n_sources, 1u);
-        ev_queue = newa(struct epoll_event, ev_queue_max);
+        event_queue_max = MAX(e->n_sources, 1u);
+        if (!GREEDY_REALLOC(e->event_queue, e->event_queue_allocated, event_queue_max))
+                return -ENOMEM;
 
         /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
         if (e->inotify_data_buffered)
                 timeout = 0;
 
-        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
+        m = epoll_wait(e->epoll_fd, e->event_queue, event_queue_max,
                        timeout == (uint64_t) -1 ? -1 : (int) DIV_ROUND_UP(timeout, USEC_PER_MSEC));
         if (m < 0) {
                 if (errno == EINTR) {
@@ -3140,29 +3549,50 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
 
         for (i = 0; i < m; i++) {
 
-                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
-                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
+                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
+                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                 else {
-                        WakeupType *t = ev_queue[i].data.ptr;
+                        WakeupType *t = e->event_queue[i].data.ptr;
 
                         switch (*t) {
 
-                        case WAKEUP_EVENT_SOURCE:
-                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
+                        case WAKEUP_EVENT_SOURCE: {
+                                sd_event_source *s = e->event_queue[i].data.ptr;
+
+                                assert(s);
+
+                                switch (s->type) {
+
+                                case SOURCE_IO:
+                                        r = process_io(e, s, e->event_queue[i].events);
+                                        break;
+
+                                case SOURCE_CHILD:
+                                        r = process_pidfd(e, s, e->event_queue[i].events);
+                                        break;
+
+                                default:
+                                        assert_not_reached("Unexpected event source type");
+                                }
+
                                 break;
+                        }
 
                         case WAKEUP_CLOCK_DATA: {
-                                struct clock_data *d = ev_queue[i].data.ptr;
-                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
+                                struct clock_data *d = e->event_queue[i].data.ptr;
+
+                                assert(d);
+
+                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                 break;
                         }
 
                         case WAKEUP_SIGNAL_DATA:
-                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
+                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events);
                                 break;
 
                         case WAKEUP_INOTIFY_DATA:
-                                r = event_inotify_data_read(e, ev_queue[i].data.ptr, ev_queue[i].events);
+                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events);
                                 break;
 
                         default:
@@ -3445,8 +3875,6 @@ _public_ int sd_event_set_watchdog(sd_event *e, int b) {
                 return e->watchdog;
 
         if (b) {
-                struct epoll_event ev;
-
                 r = sd_watchdog_enabled(false, &e->watchdog_period);
                 if (r <= 0)
                         return r;
@@ -3463,7 +3891,7 @@ _public_ int sd_event_set_watchdog(sd_event *e, int b) {
                 if (r < 0)
                         goto fail;
 
-                ev = (struct epoll_event) {
+                struct epoll_event ev = {
                         .events = EPOLLIN,
                         .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                 };
@@ -3476,7 +3904,7 @@ _public_ int sd_event_set_watchdog(sd_event *e, int b) {
 
         } else {
                 if (e->watchdog_fd >= 0) {
-                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
+                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                         e->watchdog_fd = safe_close(e->watchdog_fd);
                 }
         }