1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
4 #include <sys/timerfd.h>
11 #include "alloc-util.h"
13 #include "event-source.h"
19 #include "memory-util.h"
20 #include "missing_syscall.h"
22 #include "process-util.h"
24 #include "signal-util.h"
25 #include "string-table.h"
26 #include "string-util.h"
28 #include "time-util.h"
30 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
32 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source
*s
) {
33 /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
35 s
->type
== SOURCE_CHILD
&&
36 s
->child
.pidfd
>= 0 &&
37 s
->child
.options
== WEXITED
;
40 static bool event_source_is_online(sd_event_source
*s
) {
42 return s
->enabled
!= SD_EVENT_OFF
&& !s
->ratelimited
;
45 static bool event_source_is_offline(sd_event_source
*s
) {
47 return s
->enabled
== SD_EVENT_OFF
|| s
->ratelimited
;
50 static const char* const event_source_type_table
[_SOURCE_EVENT_SOURCE_TYPE_MAX
] = {
52 [SOURCE_TIME_REALTIME
] = "realtime",
53 [SOURCE_TIME_BOOTTIME
] = "bootime",
54 [SOURCE_TIME_MONOTONIC
] = "monotonic",
55 [SOURCE_TIME_REALTIME_ALARM
] = "realtime-alarm",
56 [SOURCE_TIME_BOOTTIME_ALARM
] = "boottime-alarm",
57 [SOURCE_SIGNAL
] = "signal",
58 [SOURCE_CHILD
] = "child",
59 [SOURCE_DEFER
] = "defer",
60 [SOURCE_POST
] = "post",
61 [SOURCE_EXIT
] = "exit",
62 [SOURCE_WATCHDOG
] = "watchdog",
63 [SOURCE_INOTIFY
] = "inotify",
66 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type
, int);
68 #define EVENT_SOURCE_IS_TIME(t) \
70 SOURCE_TIME_REALTIME, \
71 SOURCE_TIME_BOOTTIME, \
72 SOURCE_TIME_MONOTONIC, \
73 SOURCE_TIME_REALTIME_ALARM, \
74 SOURCE_TIME_BOOTTIME_ALARM)
76 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
79 SOURCE_TIME_REALTIME, \
80 SOURCE_TIME_BOOTTIME, \
81 SOURCE_TIME_MONOTONIC, \
82 SOURCE_TIME_REALTIME_ALARM, \
83 SOURCE_TIME_BOOTTIME_ALARM, \
97 /* timerfd_create() only supports these five clocks so far. We
98 * can add support for more clocks when the kernel learns to
99 * deal with them, too. */
100 struct clock_data realtime
;
101 struct clock_data boottime
;
102 struct clock_data monotonic
;
103 struct clock_data realtime_alarm
;
104 struct clock_data boottime_alarm
;
108 sd_event_source
**signal_sources
; /* indexed by signal number */
109 Hashmap
*signal_data
; /* indexed by priority */
111 Hashmap
*child_sources
;
112 unsigned n_online_child_sources
;
118 Hashmap
*inotify_data
; /* indexed by priority */
120 /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
121 LIST_HEAD(struct inode_data
, inode_data_to_close
);
123 /* A list of inotify objects that already have events buffered which aren't processed yet */
124 LIST_HEAD(struct inotify_data
, inotify_data_buffered
);
129 triple_timestamp timestamp
;
132 bool exit_requested
:1;
133 bool need_process_child
:1;
135 bool profile_delays
:1;
140 sd_event
**default_event_ptr
;
142 usec_t watchdog_last
, watchdog_period
;
146 struct epoll_event
*event_queue
;
147 size_t event_queue_allocated
;
149 LIST_HEAD(sd_event_source
, sources
);
151 usec_t last_run_usec
, last_log_usec
;
152 unsigned delays
[sizeof(usec_t
) * 8];
155 static thread_local sd_event
*default_event
= NULL
;
157 static void source_disconnect(sd_event_source
*s
);
158 static void event_gc_inode_data(sd_event
*e
, struct inode_data
*d
);
160 static sd_event
*event_resolve(sd_event
*e
) {
161 return e
== SD_EVENT_DEFAULT
? default_event
: e
;
164 static int pending_prioq_compare(const void *a
, const void *b
) {
165 const sd_event_source
*x
= a
, *y
= b
;
171 /* Enabled ones first */
172 if (x
->enabled
!= SD_EVENT_OFF
&& y
->enabled
== SD_EVENT_OFF
)
174 if (x
->enabled
== SD_EVENT_OFF
&& y
->enabled
!= SD_EVENT_OFF
)
177 /* Non rate-limited ones first. */
178 r
= CMP(!!x
->ratelimited
, !!y
->ratelimited
);
182 /* Lower priority values first */
183 r
= CMP(x
->priority
, y
->priority
);
187 /* Older entries first */
188 return CMP(x
->pending_iteration
, y
->pending_iteration
);
191 static int prepare_prioq_compare(const void *a
, const void *b
) {
192 const sd_event_source
*x
= a
, *y
= b
;
198 /* Enabled ones first */
199 if (x
->enabled
!= SD_EVENT_OFF
&& y
->enabled
== SD_EVENT_OFF
)
201 if (x
->enabled
== SD_EVENT_OFF
&& y
->enabled
!= SD_EVENT_OFF
)
204 /* Non rate-limited ones first. */
205 r
= CMP(!!x
->ratelimited
, !!y
->ratelimited
);
209 /* Move most recently prepared ones last, so that we can stop
210 * preparing as soon as we hit one that has already been
211 * prepared in the current iteration */
212 r
= CMP(x
->prepare_iteration
, y
->prepare_iteration
);
216 /* Lower priority values first */
217 return CMP(x
->priority
, y
->priority
);
220 static usec_t
time_event_source_next(const sd_event_source
*s
) {
223 /* We have two kinds of event sources that have elapsation times associated with them: the actual
224 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
225 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
226 * looking at here. */
228 if (s
->ratelimited
) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
229 assert(s
->rate_limit
.begin
!= 0);
230 assert(s
->rate_limit
.interval
!= 0);
231 return usec_add(s
->rate_limit
.begin
, s
->rate_limit
.interval
);
234 /* Otherwise this must be a time event source, if not ratelimited */
235 if (EVENT_SOURCE_IS_TIME(s
->type
))
238 return USEC_INFINITY
;
241 static int earliest_time_prioq_compare(const void *a
, const void *b
) {
242 const sd_event_source
*x
= a
, *y
= b
;
244 /* Enabled ones first */
245 if (x
->enabled
!= SD_EVENT_OFF
&& y
->enabled
== SD_EVENT_OFF
)
247 if (x
->enabled
== SD_EVENT_OFF
&& y
->enabled
!= SD_EVENT_OFF
)
250 /* Move the pending ones to the end */
251 if (!x
->pending
&& y
->pending
)
253 if (x
->pending
&& !y
->pending
)
257 return CMP(time_event_source_next(x
), time_event_source_next(y
));
260 static usec_t
time_event_source_latest(const sd_event_source
*s
) {
263 if (s
->ratelimited
) { /* For ratelimited stuff the earliest and the latest time shall actually be the
264 * same, as we should avoid adding additional inaccuracy on an inaccuracy time
266 assert(s
->rate_limit
.begin
!= 0);
267 assert(s
->rate_limit
.interval
!= 0);
268 return usec_add(s
->rate_limit
.begin
, s
->rate_limit
.interval
);
271 /* Must be a time event source, if not ratelimited */
272 if (EVENT_SOURCE_IS_TIME(s
->type
))
273 return usec_add(s
->time
.next
, s
->time
.accuracy
);
275 return USEC_INFINITY
;
278 static int latest_time_prioq_compare(const void *a
, const void *b
) {
279 const sd_event_source
*x
= a
, *y
= b
;
281 /* Enabled ones first */
282 if (x
->enabled
!= SD_EVENT_OFF
&& y
->enabled
== SD_EVENT_OFF
)
284 if (x
->enabled
== SD_EVENT_OFF
&& y
->enabled
!= SD_EVENT_OFF
)
287 /* Move the pending ones to the end */
288 if (!x
->pending
&& y
->pending
)
290 if (x
->pending
&& !y
->pending
)
294 return CMP(time_event_source_latest(x
), time_event_source_latest(y
));
297 static int exit_prioq_compare(const void *a
, const void *b
) {
298 const sd_event_source
*x
= a
, *y
= b
;
300 assert(x
->type
== SOURCE_EXIT
);
301 assert(y
->type
== SOURCE_EXIT
);
303 /* Enabled ones first */
304 if (x
->enabled
!= SD_EVENT_OFF
&& y
->enabled
== SD_EVENT_OFF
)
306 if (x
->enabled
== SD_EVENT_OFF
&& y
->enabled
!= SD_EVENT_OFF
)
309 /* Lower priority values first */
310 return CMP(x
->priority
, y
->priority
);
313 static void free_clock_data(struct clock_data
*d
) {
315 assert(d
->wakeup
== WAKEUP_CLOCK_DATA
);
318 prioq_free(d
->earliest
);
319 prioq_free(d
->latest
);
322 static sd_event
*event_free(sd_event
*e
) {
327 while ((s
= e
->sources
)) {
329 source_disconnect(s
);
330 sd_event_source_unref(s
);
333 assert(e
->n_sources
== 0);
335 if (e
->default_event_ptr
)
336 *(e
->default_event_ptr
) = NULL
;
338 safe_close(e
->epoll_fd
);
339 safe_close(e
->watchdog_fd
);
341 free_clock_data(&e
->realtime
);
342 free_clock_data(&e
->boottime
);
343 free_clock_data(&e
->monotonic
);
344 free_clock_data(&e
->realtime_alarm
);
345 free_clock_data(&e
->boottime_alarm
);
347 prioq_free(e
->pending
);
348 prioq_free(e
->prepare
);
351 free(e
->signal_sources
);
352 hashmap_free(e
->signal_data
);
354 hashmap_free(e
->inotify_data
);
356 hashmap_free(e
->child_sources
);
357 set_free(e
->post_sources
);
359 free(e
->event_queue
);
364 _public_
int sd_event_new(sd_event
** ret
) {
368 assert_return(ret
, -EINVAL
);
370 e
= new(sd_event
, 1);
378 .realtime
.wakeup
= WAKEUP_CLOCK_DATA
,
380 .realtime
.next
= USEC_INFINITY
,
381 .boottime
.wakeup
= WAKEUP_CLOCK_DATA
,
383 .boottime
.next
= USEC_INFINITY
,
384 .monotonic
.wakeup
= WAKEUP_CLOCK_DATA
,
386 .monotonic
.next
= USEC_INFINITY
,
387 .realtime_alarm
.wakeup
= WAKEUP_CLOCK_DATA
,
388 .realtime_alarm
.fd
= -1,
389 .realtime_alarm
.next
= USEC_INFINITY
,
390 .boottime_alarm
.wakeup
= WAKEUP_CLOCK_DATA
,
391 .boottime_alarm
.fd
= -1,
392 .boottime_alarm
.next
= USEC_INFINITY
,
393 .perturb
= USEC_INFINITY
,
394 .original_pid
= getpid_cached(),
397 r
= prioq_ensure_allocated(&e
->pending
, pending_prioq_compare
);
401 e
->epoll_fd
= epoll_create1(EPOLL_CLOEXEC
);
402 if (e
->epoll_fd
< 0) {
407 e
->epoll_fd
= fd_move_above_stdio(e
->epoll_fd
);
409 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
410 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 … 2^63 us will be logged every 5s.");
411 e
->profile_delays
= true;
422 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event
, sd_event
, event_free
);
424 _public_ sd_event_source
* sd_event_source_disable_unref(sd_event_source
*s
) {
426 (void) sd_event_source_set_enabled(s
, SD_EVENT_OFF
);
427 return sd_event_source_unref(s
);
430 static bool event_pid_changed(sd_event
*e
) {
433 /* We don't support people creating an event loop and keeping
434 * it around over a fork(). Let's complain. */
436 return e
->original_pid
!= getpid_cached();
439 static void source_io_unregister(sd_event_source
*s
) {
441 assert(s
->type
== SOURCE_IO
);
443 if (event_pid_changed(s
->event
))
446 if (!s
->io
.registered
)
449 if (epoll_ctl(s
->event
->epoll_fd
, EPOLL_CTL_DEL
, s
->io
.fd
, NULL
) < 0)
450 log_debug_errno(errno
, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
451 strna(s
->description
), event_source_type_to_string(s
->type
));
453 s
->io
.registered
= false;
456 static int source_io_register(
462 assert(s
->type
== SOURCE_IO
);
463 assert(enabled
!= SD_EVENT_OFF
);
465 struct epoll_event ev
= {
466 .events
= events
| (enabled
== SD_EVENT_ONESHOT
? EPOLLONESHOT
: 0),
470 if (epoll_ctl(s
->event
->epoll_fd
,
471 s
->io
.registered
? EPOLL_CTL_MOD
: EPOLL_CTL_ADD
,
475 s
->io
.registered
= true;
480 static void source_child_pidfd_unregister(sd_event_source
*s
) {
482 assert(s
->type
== SOURCE_CHILD
);
484 if (event_pid_changed(s
->event
))
487 if (!s
->child
.registered
)
490 if (EVENT_SOURCE_WATCH_PIDFD(s
))
491 if (epoll_ctl(s
->event
->epoll_fd
, EPOLL_CTL_DEL
, s
->child
.pidfd
, NULL
) < 0)
492 log_debug_errno(errno
, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
493 strna(s
->description
), event_source_type_to_string(s
->type
));
495 s
->child
.registered
= false;
498 static int source_child_pidfd_register(sd_event_source
*s
, int enabled
) {
500 assert(s
->type
== SOURCE_CHILD
);
501 assert(enabled
!= SD_EVENT_OFF
);
503 if (EVENT_SOURCE_WATCH_PIDFD(s
)) {
504 struct epoll_event ev
= {
505 .events
= EPOLLIN
| (enabled
== SD_EVENT_ONESHOT
? EPOLLONESHOT
: 0),
509 if (epoll_ctl(s
->event
->epoll_fd
,
510 s
->child
.registered
? EPOLL_CTL_MOD
: EPOLL_CTL_ADD
,
511 s
->child
.pidfd
, &ev
) < 0)
515 s
->child
.registered
= true;
519 static clockid_t
event_source_type_to_clock(EventSourceType t
) {
523 case SOURCE_TIME_REALTIME
:
524 return CLOCK_REALTIME
;
526 case SOURCE_TIME_BOOTTIME
:
527 return CLOCK_BOOTTIME
;
529 case SOURCE_TIME_MONOTONIC
:
530 return CLOCK_MONOTONIC
;
532 case SOURCE_TIME_REALTIME_ALARM
:
533 return CLOCK_REALTIME_ALARM
;
535 case SOURCE_TIME_BOOTTIME_ALARM
:
536 return CLOCK_BOOTTIME_ALARM
;
539 return (clockid_t
) -1;
543 static EventSourceType
clock_to_event_source_type(clockid_t clock
) {
548 return SOURCE_TIME_REALTIME
;
551 return SOURCE_TIME_BOOTTIME
;
553 case CLOCK_MONOTONIC
:
554 return SOURCE_TIME_MONOTONIC
;
556 case CLOCK_REALTIME_ALARM
:
557 return SOURCE_TIME_REALTIME_ALARM
;
559 case CLOCK_BOOTTIME_ALARM
:
560 return SOURCE_TIME_BOOTTIME_ALARM
;
563 return _SOURCE_EVENT_SOURCE_TYPE_INVALID
;
567 static struct clock_data
* event_get_clock_data(sd_event
*e
, EventSourceType t
) {
572 case SOURCE_TIME_REALTIME
:
575 case SOURCE_TIME_BOOTTIME
:
578 case SOURCE_TIME_MONOTONIC
:
579 return &e
->monotonic
;
581 case SOURCE_TIME_REALTIME_ALARM
:
582 return &e
->realtime_alarm
;
584 case SOURCE_TIME_BOOTTIME_ALARM
:
585 return &e
->boottime_alarm
;
592 static void event_free_signal_data(sd_event
*e
, struct signal_data
*d
) {
598 hashmap_remove(e
->signal_data
, &d
->priority
);
603 static int event_make_signal_data(
606 struct signal_data
**ret
) {
608 struct signal_data
*d
;
616 if (event_pid_changed(e
))
619 if (e
->signal_sources
&& e
->signal_sources
[sig
])
620 priority
= e
->signal_sources
[sig
]->priority
;
622 priority
= SD_EVENT_PRIORITY_NORMAL
;
624 d
= hashmap_get(e
->signal_data
, &priority
);
626 if (sigismember(&d
->sigset
, sig
) > 0) {
632 d
= new(struct signal_data
, 1);
636 *d
= (struct signal_data
) {
637 .wakeup
= WAKEUP_SIGNAL_DATA
,
639 .priority
= priority
,
642 r
= hashmap_ensure_put(&e
->signal_data
, &uint64_hash_ops
, &d
->priority
, d
);
652 assert_se(sigaddset(&ss_copy
, sig
) >= 0);
654 r
= signalfd(d
->fd
, &ss_copy
, SFD_NONBLOCK
|SFD_CLOEXEC
);
668 d
->fd
= fd_move_above_stdio(r
);
670 struct epoll_event ev
= {
675 if (epoll_ctl(e
->epoll_fd
, EPOLL_CTL_ADD
, d
->fd
, &ev
) < 0) {
687 event_free_signal_data(e
, d
);
692 static void event_unmask_signal_data(sd_event
*e
, struct signal_data
*d
, int sig
) {
696 /* Turns off the specified signal in the signal data
697 * object. If the signal mask of the object becomes empty that
700 if (sigismember(&d
->sigset
, sig
) == 0)
703 assert_se(sigdelset(&d
->sigset
, sig
) >= 0);
705 if (sigisemptyset(&d
->sigset
)) {
706 /* If the mask is all-zero we can get rid of the structure */
707 event_free_signal_data(e
, d
);
713 if (signalfd(d
->fd
, &d
->sigset
, SFD_NONBLOCK
|SFD_CLOEXEC
) < 0)
714 log_debug_errno(errno
, "Failed to unset signal bit, ignoring: %m");
717 static void event_gc_signal_data(sd_event
*e
, const int64_t *priority
, int sig
) {
718 struct signal_data
*d
;
719 static const int64_t zero_priority
= 0;
723 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
724 * and possibly drop the signalfd for it. */
726 if (sig
== SIGCHLD
&&
727 e
->n_online_child_sources
> 0)
730 if (e
->signal_sources
&&
731 e
->signal_sources
[sig
] &&
732 event_source_is_online(e
->signal_sources
[sig
]))
736 * The specified signal might be enabled in three different queues:
738 * 1) the one that belongs to the priority passed (if it is non-NULL)
739 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
740 * 3) the 0 priority (to cover the SIGCHLD case)
742 * Hence, let's remove it from all three here.
746 d
= hashmap_get(e
->signal_data
, priority
);
748 event_unmask_signal_data(e
, d
, sig
);
751 if (e
->signal_sources
&& e
->signal_sources
[sig
]) {
752 d
= hashmap_get(e
->signal_data
, &e
->signal_sources
[sig
]->priority
);
754 event_unmask_signal_data(e
, d
, sig
);
757 d
= hashmap_get(e
->signal_data
, &zero_priority
);
759 event_unmask_signal_data(e
, d
, sig
);
762 static void event_source_pp_prioq_reshuffle(sd_event_source
*s
) {
765 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
766 * they are enabled/disabled or marked pending and such. */
769 prioq_reshuffle(s
->event
->pending
, s
, &s
->pending_index
);
772 prioq_reshuffle(s
->event
->prepare
, s
, &s
->prepare_index
);
775 static void event_source_time_prioq_reshuffle(sd_event_source
*s
) {
776 struct clock_data
*d
;
780 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
781 * pending, enable state. Makes sure the two prioq's are ordered properly again. */
784 d
= &s
->event
->monotonic
;
786 assert(EVENT_SOURCE_IS_TIME(s
->type
));
787 assert_se(d
= event_get_clock_data(s
->event
, s
->type
));
790 prioq_reshuffle(d
->earliest
, s
, &s
->earliest_index
);
791 prioq_reshuffle(d
->latest
, s
, &s
->latest_index
);
792 d
->needs_rearm
= true;
795 static void event_source_time_prioq_remove(
797 struct clock_data
*d
) {
802 prioq_remove(d
->earliest
, s
, &s
->earliest_index
);
803 prioq_remove(d
->latest
, s
, &s
->latest_index
);
804 s
->earliest_index
= s
->latest_index
= PRIOQ_IDX_NULL
;
805 d
->needs_rearm
= true;
808 static void source_disconnect(sd_event_source
*s
) {
816 assert(s
->event
->n_sources
> 0);
822 source_io_unregister(s
);
826 case SOURCE_TIME_REALTIME
:
827 case SOURCE_TIME_BOOTTIME
:
828 case SOURCE_TIME_MONOTONIC
:
829 case SOURCE_TIME_REALTIME_ALARM
:
830 case SOURCE_TIME_BOOTTIME_ALARM
:
831 /* Only remove this event source from the time event source here if it is not ratelimited. If
832 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
833 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
835 if (!s
->ratelimited
) {
836 struct clock_data
*d
;
837 assert_se(d
= event_get_clock_data(s
->event
, s
->type
));
838 event_source_time_prioq_remove(s
, d
);
844 if (s
->signal
.sig
> 0) {
846 if (s
->event
->signal_sources
)
847 s
->event
->signal_sources
[s
->signal
.sig
] = NULL
;
849 event_gc_signal_data(s
->event
, &s
->priority
, s
->signal
.sig
);
855 if (s
->child
.pid
> 0) {
856 if (event_source_is_online(s
)) {
857 assert(s
->event
->n_online_child_sources
> 0);
858 s
->event
->n_online_child_sources
--;
861 (void) hashmap_remove(s
->event
->child_sources
, PID_TO_PTR(s
->child
.pid
));
864 if (EVENT_SOURCE_WATCH_PIDFD(s
))
865 source_child_pidfd_unregister(s
);
867 event_gc_signal_data(s
->event
, &s
->priority
, SIGCHLD
);
876 set_remove(s
->event
->post_sources
, s
);
880 prioq_remove(s
->event
->exit
, s
, &s
->exit
.prioq_index
);
883 case SOURCE_INOTIFY
: {
884 struct inode_data
*inode_data
;
886 inode_data
= s
->inotify
.inode_data
;
888 struct inotify_data
*inotify_data
;
889 assert_se(inotify_data
= inode_data
->inotify_data
);
891 /* Detach this event source from the inode object */
892 LIST_REMOVE(inotify
.by_inode_data
, inode_data
->event_sources
, s
);
893 s
->inotify
.inode_data
= NULL
;
896 assert(inotify_data
->n_pending
> 0);
897 inotify_data
->n_pending
--;
900 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
901 * still being watched. That's because inotify doesn't really have an API for that: we
902 * can only change watch masks with access to the original inode either by fd or by path. But
903 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
904 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
905 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
906 * there), but given the need for open_by_handle_at() which is privileged and not universally
907 * available this would be quite an incomplete solution. Hence we go the other way, leave the
908 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
909 * anymore after reception. Yes, this sucks, but … Linux … */
911 /* Maybe release the inode data (and its inotify) */
912 event_gc_inode_data(s
->event
, inode_data
);
919 assert_not_reached("Wut? I shouldn't exist.");
923 prioq_remove(s
->event
->pending
, s
, &s
->pending_index
);
926 prioq_remove(s
->event
->prepare
, s
, &s
->prepare_index
);
929 event_source_time_prioq_remove(s
, &s
->event
->monotonic
);
931 event
= TAKE_PTR(s
->event
);
932 LIST_REMOVE(sources
, event
->sources
, s
);
935 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
936 * pidfd associated with this event source, which we'll do only on source_free(). */
939 sd_event_unref(event
);
942 static sd_event_source
* source_free(sd_event_source
*s
) {
945 source_disconnect(s
);
947 if (s
->type
== SOURCE_IO
&& s
->io
.owned
)
948 s
->io
.fd
= safe_close(s
->io
.fd
);
950 if (s
->type
== SOURCE_CHILD
) {
951 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
953 if (s
->child
.process_owned
) {
955 if (!s
->child
.exited
) {
958 if (s
->child
.pidfd
>= 0) {
959 if (pidfd_send_signal(s
->child
.pidfd
, SIGKILL
, NULL
, 0) < 0) {
960 if (errno
== ESRCH
) /* Already dead */
962 else if (!ERRNO_IS_NOT_SUPPORTED(errno
))
963 log_debug_errno(errno
, "Failed to kill process " PID_FMT
" via pidfd_send_signal(), re-trying via kill(): %m",
970 if (kill(s
->child
.pid
, SIGKILL
) < 0)
971 if (errno
!= ESRCH
) /* Already dead */
972 log_debug_errno(errno
, "Failed to kill process " PID_FMT
" via kill(), ignoring: %m",
976 if (!s
->child
.waited
) {
979 /* Reap the child if we can */
980 (void) waitid(P_PID
, s
->child
.pid
, &si
, WEXITED
);
984 if (s
->child
.pidfd_owned
)
985 s
->child
.pidfd
= safe_close(s
->child
.pidfd
);
988 if (s
->destroy_callback
)
989 s
->destroy_callback(s
->userdata
);
991 free(s
->description
);
994 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source
*, source_free
);
996 static int source_set_pending(sd_event_source
*s
, bool b
) {
1000 assert(s
->type
!= SOURCE_EXIT
);
1002 if (s
->pending
== b
)
1008 s
->pending_iteration
= s
->event
->iteration
;
1010 r
= prioq_put(s
->event
->pending
, s
, &s
->pending_index
);
1016 assert_se(prioq_remove(s
->event
->pending
, s
, &s
->pending_index
));
1018 if (EVENT_SOURCE_IS_TIME(s
->type
))
1019 event_source_time_prioq_reshuffle(s
);
1021 if (s
->type
== SOURCE_SIGNAL
&& !b
) {
1022 struct signal_data
*d
;
1024 d
= hashmap_get(s
->event
->signal_data
, &s
->priority
);
1025 if (d
&& d
->current
== s
)
1029 if (s
->type
== SOURCE_INOTIFY
) {
1031 assert(s
->inotify
.inode_data
);
1032 assert(s
->inotify
.inode_data
->inotify_data
);
1035 s
->inotify
.inode_data
->inotify_data
->n_pending
++;
1037 assert(s
->inotify
.inode_data
->inotify_data
->n_pending
> 0);
1038 s
->inotify
.inode_data
->inotify_data
->n_pending
--;
1045 static sd_event_source
*source_new(sd_event
*e
, bool floating
, EventSourceType type
) {
1050 s
= new(sd_event_source
, 1);
1054 *s
= (struct sd_event_source
) {
1057 .floating
= floating
,
1059 .pending_index
= PRIOQ_IDX_NULL
,
1060 .prepare_index
= PRIOQ_IDX_NULL
,
1066 LIST_PREPEND(sources
, e
->sources
, s
);
1072 static int io_exit_callback(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
1075 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
1078 _public_
int sd_event_add_io(
1080 sd_event_source
**ret
,
1083 sd_event_io_handler_t callback
,
1086 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1089 assert_return(e
, -EINVAL
);
1090 assert_return(e
= event_resolve(e
), -ENOPKG
);
1091 assert_return(fd
>= 0, -EBADF
);
1092 assert_return(!(events
& ~(EPOLLIN
|EPOLLOUT
|EPOLLRDHUP
|EPOLLPRI
|EPOLLERR
|EPOLLHUP
|EPOLLET
)), -EINVAL
);
1093 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1094 assert_return(!event_pid_changed(e
), -ECHILD
);
1097 callback
= io_exit_callback
;
1099 s
= source_new(e
, !ret
, SOURCE_IO
);
1103 s
->wakeup
= WAKEUP_EVENT_SOURCE
;
1105 s
->io
.events
= events
;
1106 s
->io
.callback
= callback
;
1107 s
->userdata
= userdata
;
1108 s
->enabled
= SD_EVENT_ON
;
1110 r
= source_io_register(s
, s
->enabled
, events
);
1121 static void initialize_perturb(sd_event
*e
) {
1122 sd_id128_t bootid
= {};
1124 /* When we sleep for longer, we try to realign the wakeup to
1125 the same time within each minute/second/250ms, so that
1126 events all across the system can be coalesced into a single
1127 CPU wakeup. However, let's take some system-specific
1128 randomness for this value, so that in a network of systems
1129 with synced clocks timer events are distributed a
1130 bit. Here, we calculate a perturbation usec offset from the
1133 if (_likely_(e
->perturb
!= USEC_INFINITY
))
1136 if (sd_id128_get_boot(&bootid
) >= 0)
1137 e
->perturb
= (bootid
.qwords
[0] ^ bootid
.qwords
[1]) % USEC_PER_MINUTE
;
1140 static int event_setup_timer_fd(
1142 struct clock_data
*d
,
1148 if (_likely_(d
->fd
>= 0))
1151 _cleanup_close_
int fd
= -1;
1153 fd
= timerfd_create(clock
, TFD_NONBLOCK
|TFD_CLOEXEC
);
1157 fd
= fd_move_above_stdio(fd
);
1159 struct epoll_event ev
= {
1164 if (epoll_ctl(e
->epoll_fd
, EPOLL_CTL_ADD
, fd
, &ev
) < 0)
1167 d
->fd
= TAKE_FD(fd
);
1171 static int time_exit_callback(sd_event_source
*s
, uint64_t usec
, void *userdata
) {
1174 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
1177 static int setup_clock_data(sd_event
*e
, struct clock_data
*d
, clockid_t clock
) {
1183 r
= event_setup_timer_fd(e
, d
, clock
);
1188 r
= prioq_ensure_allocated(&d
->earliest
, earliest_time_prioq_compare
);
1192 r
= prioq_ensure_allocated(&d
->latest
, latest_time_prioq_compare
);
1199 static int event_source_time_prioq_put(
1201 struct clock_data
*d
) {
1208 r
= prioq_put(d
->earliest
, s
, &s
->earliest_index
);
1212 r
= prioq_put(d
->latest
, s
, &s
->latest_index
);
1214 assert_se(prioq_remove(d
->earliest
, s
, &s
->earliest_index
) > 0);
1215 s
->earliest_index
= PRIOQ_IDX_NULL
;
1219 d
->needs_rearm
= true;
1223 _public_
int sd_event_add_time(
1225 sd_event_source
**ret
,
1229 sd_event_time_handler_t callback
,
1232 EventSourceType type
;
1233 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1234 struct clock_data
*d
;
1237 assert_return(e
, -EINVAL
);
1238 assert_return(e
= event_resolve(e
), -ENOPKG
);
1239 assert_return(accuracy
!= UINT64_MAX
, -EINVAL
);
1240 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1241 assert_return(!event_pid_changed(e
), -ECHILD
);
1243 if (!clock_supported(clock
)) /* Checks whether the kernel supports the clock */
1246 type
= clock_to_event_source_type(clock
); /* checks whether sd-event supports this clock */
1251 callback
= time_exit_callback
;
1253 assert_se(d
= event_get_clock_data(e
, type
));
1255 r
= setup_clock_data(e
, d
, clock
);
1259 s
= source_new(e
, !ret
, type
);
1263 s
->time
.next
= usec
;
1264 s
->time
.accuracy
= accuracy
== 0 ? DEFAULT_ACCURACY_USEC
: accuracy
;
1265 s
->time
.callback
= callback
;
1266 s
->earliest_index
= s
->latest_index
= PRIOQ_IDX_NULL
;
1267 s
->userdata
= userdata
;
1268 s
->enabled
= SD_EVENT_ONESHOT
;
1270 r
= event_source_time_prioq_put(s
, d
);
1281 _public_
int sd_event_add_time_relative(
1283 sd_event_source
**ret
,
1287 sd_event_time_handler_t callback
,
1293 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1294 * checks for overflow. */
1296 r
= sd_event_now(e
, clock
, &t
);
1300 if (usec
>= USEC_INFINITY
- t
)
1303 return sd_event_add_time(e
, ret
, clock
, t
+ usec
, accuracy
, callback
, userdata
);
1306 static int signal_exit_callback(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
1309 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
1312 _public_
int sd_event_add_signal(
1314 sd_event_source
**ret
,
1316 sd_event_signal_handler_t callback
,
1319 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1320 struct signal_data
*d
;
1323 assert_return(e
, -EINVAL
);
1324 assert_return(e
= event_resolve(e
), -ENOPKG
);
1325 assert_return(SIGNAL_VALID(sig
), -EINVAL
);
1326 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1327 assert_return(!event_pid_changed(e
), -ECHILD
);
1330 callback
= signal_exit_callback
;
1332 r
= signal_is_blocked(sig
);
1338 if (!e
->signal_sources
) {
1339 e
->signal_sources
= new0(sd_event_source
*, _NSIG
);
1340 if (!e
->signal_sources
)
1342 } else if (e
->signal_sources
[sig
])
1345 s
= source_new(e
, !ret
, SOURCE_SIGNAL
);
1349 s
->signal
.sig
= sig
;
1350 s
->signal
.callback
= callback
;
1351 s
->userdata
= userdata
;
1352 s
->enabled
= SD_EVENT_ON
;
1354 e
->signal_sources
[sig
] = s
;
1356 r
= event_make_signal_data(e
, sig
, &d
);
1360 /* Use the signal name as description for the event source by default */
1361 (void) sd_event_source_set_description(s
, signal_to_string(sig
));
1370 static int child_exit_callback(sd_event_source
*s
, const siginfo_t
*si
, void *userdata
) {
1373 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
/* Decides whether child event sources should try to use pidfds, as controlled by the $SYSTEMD_PIDFD
 * environment variable. Mostly relevant for debugging, i.e. this is used in test-event.c to test the
 * event loop once with and once without pidfd.
 *
 * Only a lookup result of exactly 0 turns pidfds off; every other result enables them.
 * NOTE(review): presumably that includes a negative result for an unset or unparsable variable —
 * confirm against getenv_bool_secure(). */
static bool shall_use_pidfd(void) {
        int r = getenv_bool_secure("SYSTEMD_PIDFD");

        return r != 0;
}
1381 _public_
int sd_event_add_child(
1383 sd_event_source
**ret
,
1386 sd_event_child_handler_t callback
,
1389 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1392 assert_return(e
, -EINVAL
);
1393 assert_return(e
= event_resolve(e
), -ENOPKG
);
1394 assert_return(pid
> 1, -EINVAL
);
1395 assert_return(!(options
& ~(WEXITED
|WSTOPPED
|WCONTINUED
)), -EINVAL
);
1396 assert_return(options
!= 0, -EINVAL
);
1397 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1398 assert_return(!event_pid_changed(e
), -ECHILD
);
1401 callback
= child_exit_callback
;
1403 if (e
->n_online_child_sources
== 0) {
1404 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1405 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1406 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1409 * (As an optimization we only do this check on the first child event source created.) */
1410 r
= signal_is_blocked(SIGCHLD
);
1417 r
= hashmap_ensure_allocated(&e
->child_sources
, NULL
);
1421 if (hashmap_contains(e
->child_sources
, PID_TO_PTR(pid
)))
1424 s
= source_new(e
, !ret
, SOURCE_CHILD
);
1428 s
->wakeup
= WAKEUP_EVENT_SOURCE
;
1430 s
->child
.options
= options
;
1431 s
->child
.callback
= callback
;
1432 s
->userdata
= userdata
;
1433 s
->enabled
= SD_EVENT_ONESHOT
;
1435 /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1436 * pin the PID, and make regular waitid() handling race-free. */
1438 if (shall_use_pidfd()) {
1439 s
->child
.pidfd
= pidfd_open(s
->child
.pid
, 0);
1440 if (s
->child
.pidfd
< 0) {
1441 /* Propagate errors unless the syscall is not supported or blocked */
1442 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
))
1445 s
->child
.pidfd_owned
= true; /* If we allocate the pidfd we own it by default */
1447 s
->child
.pidfd
= -1;
1449 r
= hashmap_put(e
->child_sources
, PID_TO_PTR(pid
), s
);
1453 if (EVENT_SOURCE_WATCH_PIDFD(s
)) {
1454 /* We have a pidfd and we only want to watch for exit */
1455 r
= source_child_pidfd_register(s
, s
->enabled
);
1460 /* We have no pidfd or we shall wait for some other event than WEXITED */
1461 r
= event_make_signal_data(e
, SIGCHLD
, NULL
);
1465 e
->need_process_child
= true;
1468 e
->n_online_child_sources
++;
1476 _public_
int sd_event_add_child_pidfd(
1478 sd_event_source
**ret
,
1481 sd_event_child_handler_t callback
,
1485 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1489 assert_return(e
, -EINVAL
);
1490 assert_return(e
= event_resolve(e
), -ENOPKG
);
1491 assert_return(pidfd
>= 0, -EBADF
);
1492 assert_return(!(options
& ~(WEXITED
|WSTOPPED
|WCONTINUED
)), -EINVAL
);
1493 assert_return(options
!= 0, -EINVAL
);
1494 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1495 assert_return(!event_pid_changed(e
), -ECHILD
);
1498 callback
= child_exit_callback
;
1500 if (e
->n_online_child_sources
== 0) {
1501 r
= signal_is_blocked(SIGCHLD
);
1508 r
= hashmap_ensure_allocated(&e
->child_sources
, NULL
);
1512 r
= pidfd_get_pid(pidfd
, &pid
);
1516 if (hashmap_contains(e
->child_sources
, PID_TO_PTR(pid
)))
1519 s
= source_new(e
, !ret
, SOURCE_CHILD
);
1523 s
->wakeup
= WAKEUP_EVENT_SOURCE
;
1524 s
->child
.pidfd
= pidfd
;
1526 s
->child
.options
= options
;
1527 s
->child
.callback
= callback
;
1528 s
->child
.pidfd_owned
= false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1529 s
->userdata
= userdata
;
1530 s
->enabled
= SD_EVENT_ONESHOT
;
1532 r
= hashmap_put(e
->child_sources
, PID_TO_PTR(pid
), s
);
1536 if (EVENT_SOURCE_WATCH_PIDFD(s
)) {
1537 /* We only want to watch for WEXITED */
1538 r
= source_child_pidfd_register(s
, s
->enabled
);
1542 /* We shall wait for some other event than WEXITED */
1543 r
= event_make_signal_data(e
, SIGCHLD
, NULL
);
1547 e
->need_process_child
= true;
1550 e
->n_online_child_sources
++;
1558 static int generic_exit_callback(sd_event_source
*s
, void *userdata
) {
1561 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
1564 _public_
int sd_event_add_defer(
1566 sd_event_source
**ret
,
1567 sd_event_handler_t callback
,
1570 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1573 assert_return(e
, -EINVAL
);
1574 assert_return(e
= event_resolve(e
), -ENOPKG
);
1575 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1576 assert_return(!event_pid_changed(e
), -ECHILD
);
1579 callback
= generic_exit_callback
;
1581 s
= source_new(e
, !ret
, SOURCE_DEFER
);
1585 s
->defer
.callback
= callback
;
1586 s
->userdata
= userdata
;
1587 s
->enabled
= SD_EVENT_ONESHOT
;
1589 r
= source_set_pending(s
, true);
1600 _public_
int sd_event_add_post(
1602 sd_event_source
**ret
,
1603 sd_event_handler_t callback
,
1606 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1609 assert_return(e
, -EINVAL
);
1610 assert_return(e
= event_resolve(e
), -ENOPKG
);
1611 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1612 assert_return(!event_pid_changed(e
), -ECHILD
);
1615 callback
= generic_exit_callback
;
1617 s
= source_new(e
, !ret
, SOURCE_POST
);
1621 s
->post
.callback
= callback
;
1622 s
->userdata
= userdata
;
1623 s
->enabled
= SD_EVENT_ON
;
1625 r
= set_ensure_put(&e
->post_sources
, NULL
, s
);
1637 _public_
int sd_event_add_exit(
1639 sd_event_source
**ret
,
1640 sd_event_handler_t callback
,
1643 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1646 assert_return(e
, -EINVAL
);
1647 assert_return(e
= event_resolve(e
), -ENOPKG
);
1648 assert_return(callback
, -EINVAL
);
1649 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1650 assert_return(!event_pid_changed(e
), -ECHILD
);
1652 r
= prioq_ensure_allocated(&e
->exit
, exit_prioq_compare
);
1656 s
= source_new(e
, !ret
, SOURCE_EXIT
);
1660 s
->exit
.callback
= callback
;
1661 s
->userdata
= userdata
;
1662 s
->exit
.prioq_index
= PRIOQ_IDX_NULL
;
1663 s
->enabled
= SD_EVENT_ONESHOT
;
1665 r
= prioq_put(s
->event
->exit
, s
, &s
->exit
.prioq_index
);
1676 static void event_free_inotify_data(sd_event
*e
, struct inotify_data
*d
) {
1682 assert(hashmap_isempty(d
->inodes
));
1683 assert(hashmap_isempty(d
->wd
));
1685 if (d
->buffer_filled
> 0)
1686 LIST_REMOVE(buffered
, e
->inotify_data_buffered
, d
);
1688 hashmap_free(d
->inodes
);
1689 hashmap_free(d
->wd
);
1691 assert_se(hashmap_remove(e
->inotify_data
, &d
->priority
) == d
);
1694 if (epoll_ctl(e
->epoll_fd
, EPOLL_CTL_DEL
, d
->fd
, NULL
) < 0)
1695 log_debug_errno(errno
, "Failed to remove inotify fd from epoll, ignoring: %m");
1702 static int event_make_inotify_data(
1705 struct inotify_data
**ret
) {
1707 _cleanup_close_
int fd
= -1;
1708 struct inotify_data
*d
;
1713 d
= hashmap_get(e
->inotify_data
, &priority
);
1720 fd
= inotify_init1(IN_NONBLOCK
|O_CLOEXEC
);
1724 fd
= fd_move_above_stdio(fd
);
1726 d
= new(struct inotify_data
, 1);
1730 *d
= (struct inotify_data
) {
1731 .wakeup
= WAKEUP_INOTIFY_DATA
,
1733 .priority
= priority
,
1736 r
= hashmap_ensure_put(&e
->inotify_data
, &uint64_hash_ops
, &d
->priority
, d
);
1738 d
->fd
= safe_close(d
->fd
);
1743 struct epoll_event ev
= {
1748 if (epoll_ctl(e
->epoll_fd
, EPOLL_CTL_ADD
, d
->fd
, &ev
) < 0) {
1750 d
->fd
= safe_close(d
->fd
); /* let's close this ourselves, as event_free_inotify_data() would otherwise
1751 * remove the fd from the epoll first, which we don't want as we couldn't
1752 * add it in the first place. */
1753 event_free_inotify_data(e
, d
);
1763 static int inode_data_compare(const struct inode_data
*x
, const struct inode_data
*y
) {
1769 r
= CMP(x
->dev
, y
->dev
);
1773 return CMP(x
->ino
, y
->ino
);
1776 static void inode_data_hash_func(const struct inode_data
*d
, struct siphash
*state
) {
1779 siphash24_compress(&d
->dev
, sizeof(d
->dev
), state
);
1780 siphash24_compress(&d
->ino
, sizeof(d
->ino
), state
);
1783 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops
, struct inode_data
, inode_data_hash_func
, inode_data_compare
);
1785 static void event_free_inode_data(
1787 struct inode_data
*d
) {
1794 assert(!d
->event_sources
);
1797 LIST_REMOVE(to_close
, e
->inode_data_to_close
, d
);
1801 if (d
->inotify_data
) {
1804 if (d
->inotify_data
->fd
>= 0) {
1805 /* So here's a problem. At the time this runs the watch descriptor might already be
1806 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
1807 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
1808 * likely case to happen. */
1810 if (inotify_rm_watch(d
->inotify_data
->fd
, d
->wd
) < 0 && errno
!= EINVAL
)
1811 log_debug_errno(errno
, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d
->wd
);
1814 assert_se(hashmap_remove(d
->inotify_data
->wd
, INT_TO_PTR(d
->wd
)) == d
);
1817 assert_se(hashmap_remove(d
->inotify_data
->inodes
, d
) == d
);
1823 static void event_gc_inode_data(
1825 struct inode_data
*d
) {
1827 struct inotify_data
*inotify_data
;
1834 if (d
->event_sources
)
1837 inotify_data
= d
->inotify_data
;
1838 event_free_inode_data(e
, d
);
1840 if (inotify_data
&& hashmap_isempty(inotify_data
->inodes
))
1841 event_free_inotify_data(e
, inotify_data
);
1844 static int event_make_inode_data(
1846 struct inotify_data
*inotify_data
,
1849 struct inode_data
**ret
) {
1851 struct inode_data
*d
, key
;
1855 assert(inotify_data
);
1857 key
= (struct inode_data
) {
1862 d
= hashmap_get(inotify_data
->inodes
, &key
);
1870 r
= hashmap_ensure_allocated(&inotify_data
->inodes
, &inode_data_hash_ops
);
1874 d
= new(struct inode_data
, 1);
1878 *d
= (struct inode_data
) {
1883 .inotify_data
= inotify_data
,
1886 r
= hashmap_put(inotify_data
->inodes
, d
, d
);
1898 static uint32_t inode_data_determine_mask(struct inode_data
*d
) {
1899 bool excl_unlink
= true;
1900 uint32_t combined
= 0;
1905 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
1906 * the IN_EXCL_UNLINK flag is ANDed instead.
1908 * Note that we add all sources to the mask here, regardless of whether enabled, disabled or oneshot. That's
1909 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
1910 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
1911 * events we don't care for client-side. */
1913 LIST_FOREACH(inotify
.by_inode_data
, s
, d
->event_sources
) {
1915 if ((s
->inotify
.mask
& IN_EXCL_UNLINK
) == 0)
1916 excl_unlink
= false;
1918 combined
|= s
->inotify
.mask
;
1921 return (combined
& ~(IN_ONESHOT
|IN_DONT_FOLLOW
|IN_ONLYDIR
|IN_EXCL_UNLINK
)) | (excl_unlink
? IN_EXCL_UNLINK
: 0);
1924 static int inode_data_realize_watch(sd_event
*e
, struct inode_data
*d
) {
1925 uint32_t combined_mask
;
1931 combined_mask
= inode_data_determine_mask(d
);
1933 if (d
->wd
>= 0 && combined_mask
== d
->combined_mask
)
1936 r
= hashmap_ensure_allocated(&d
->inotify_data
->wd
, NULL
);
1940 wd
= inotify_add_watch_fd(d
->inotify_data
->fd
, d
->fd
, combined_mask
);
1945 r
= hashmap_put(d
->inotify_data
->wd
, INT_TO_PTR(wd
), d
);
1947 (void) inotify_rm_watch(d
->inotify_data
->fd
, wd
);
1953 } else if (d
->wd
!= wd
) {
1955 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
1956 (void) inotify_rm_watch(d
->fd
, wd
);
1960 d
->combined_mask
= combined_mask
;
1964 static int inotify_exit_callback(sd_event_source
*s
, const struct inotify_event
*event
, void *userdata
) {
1967 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
1970 _public_
int sd_event_add_inotify(
1972 sd_event_source
**ret
,
1975 sd_event_inotify_handler_t callback
,
1978 struct inotify_data
*inotify_data
= NULL
;
1979 struct inode_data
*inode_data
= NULL
;
1980 _cleanup_close_
int fd
= -1;
1981 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1985 assert_return(e
, -EINVAL
);
1986 assert_return(e
= event_resolve(e
), -ENOPKG
);
1987 assert_return(path
, -EINVAL
);
1988 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1989 assert_return(!event_pid_changed(e
), -ECHILD
);
1992 callback
= inotify_exit_callback
;
1994 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
1995 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
1996 * the user can't use them for us. */
1997 if (mask
& IN_MASK_ADD
)
2000 fd
= open(path
, O_PATH
|O_CLOEXEC
|
2001 (mask
& IN_ONLYDIR
? O_DIRECTORY
: 0)|
2002 (mask
& IN_DONT_FOLLOW
? O_NOFOLLOW
: 0));
2006 if (fstat(fd
, &st
) < 0)
2009 s
= source_new(e
, !ret
, SOURCE_INOTIFY
);
2013 s
->enabled
= mask
& IN_ONESHOT
? SD_EVENT_ONESHOT
: SD_EVENT_ON
;
2014 s
->inotify
.mask
= mask
;
2015 s
->inotify
.callback
= callback
;
2016 s
->userdata
= userdata
;
2018 /* Allocate an inotify object for this priority, and an inode object within it */
2019 r
= event_make_inotify_data(e
, SD_EVENT_PRIORITY_NORMAL
, &inotify_data
);
2023 r
= event_make_inode_data(e
, inotify_data
, st
.st_dev
, st
.st_ino
, &inode_data
);
2025 event_free_inotify_data(e
, inotify_data
);
2029 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2030 * the event source, until then, for which we need the original inode. */
2031 if (inode_data
->fd
< 0) {
2032 inode_data
->fd
= TAKE_FD(fd
);
2033 LIST_PREPEND(to_close
, e
->inode_data_to_close
, inode_data
);
2036 /* Link our event source to the inode data object */
2037 LIST_PREPEND(inotify
.by_inode_data
, inode_data
->event_sources
, s
);
2038 s
->inotify
.inode_data
= inode_data
;
2040 /* Actually realize the watch now */
2041 r
= inode_data_realize_watch(e
, inode_data
);
2045 (void) sd_event_source_set_description(s
, path
);
2054 static sd_event_source
* event_source_free(sd_event_source
*s
) {
2058 /* Here's a special hack: when we are called from a
2059 * dispatch handler we won't free the event source
2060 * immediately, but we will detach the fd from the
2061 * epoll. This way it is safe for the caller to unref
2062 * the event source and immediately close the fd, but
2063 * we still retain a valid event source object after
2066 if (s
->dispatching
) {
2067 if (s
->type
== SOURCE_IO
)
2068 source_io_unregister(s
);
2070 source_disconnect(s
);
2077 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source
, sd_event_source
, event_source_free
);
2079 _public_
int sd_event_source_set_description(sd_event_source
*s
, const char *description
) {
2080 assert_return(s
, -EINVAL
);
2081 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2083 return free_and_strdup(&s
->description
, description
);
2086 _public_
int sd_event_source_get_description(sd_event_source
*s
, const char **description
) {
2087 assert_return(s
, -EINVAL
);
2088 assert_return(description
, -EINVAL
);
2089 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2091 if (!s
->description
)
2094 *description
= s
->description
;
2098 _public_ sd_event
*sd_event_source_get_event(sd_event_source
*s
) {
2099 assert_return(s
, NULL
);
2104 _public_
int sd_event_source_get_pending(sd_event_source
*s
) {
2105 assert_return(s
, -EINVAL
);
2106 assert_return(s
->type
!= SOURCE_EXIT
, -EDOM
);
2107 assert_return(s
->event
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
2108 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2113 _public_
int sd_event_source_get_io_fd(sd_event_source
*s
) {
2114 assert_return(s
, -EINVAL
);
2115 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2116 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2121 _public_
int sd_event_source_set_io_fd(sd_event_source
*s
, int fd
) {
2124 assert_return(s
, -EINVAL
);
2125 assert_return(fd
>= 0, -EBADF
);
2126 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2127 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2132 if (event_source_is_offline(s
)) {
2134 s
->io
.registered
= false;
2138 saved_fd
= s
->io
.fd
;
2139 assert(s
->io
.registered
);
2142 s
->io
.registered
= false;
2144 r
= source_io_register(s
, s
->enabled
, s
->io
.events
);
2146 s
->io
.fd
= saved_fd
;
2147 s
->io
.registered
= true;
2151 (void) epoll_ctl(s
->event
->epoll_fd
, EPOLL_CTL_DEL
, saved_fd
, NULL
);
2157 _public_
int sd_event_source_get_io_fd_own(sd_event_source
*s
) {
2158 assert_return(s
, -EINVAL
);
2159 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2164 _public_
int sd_event_source_set_io_fd_own(sd_event_source
*s
, int own
) {
2165 assert_return(s
, -EINVAL
);
2166 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2172 _public_
int sd_event_source_get_io_events(sd_event_source
*s
, uint32_t* events
) {
2173 assert_return(s
, -EINVAL
);
2174 assert_return(events
, -EINVAL
);
2175 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2176 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2178 *events
= s
->io
.events
;
2182 _public_
int sd_event_source_set_io_events(sd_event_source
*s
, uint32_t events
) {
2185 assert_return(s
, -EINVAL
);
2186 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2187 assert_return(!(events
& ~(EPOLLIN
|EPOLLOUT
|EPOLLRDHUP
|EPOLLPRI
|EPOLLERR
|EPOLLHUP
|EPOLLET
)), -EINVAL
);
2188 assert_return(s
->event
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
2189 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2191 /* edge-triggered updates are never skipped, so we can reset edges */
2192 if (s
->io
.events
== events
&& !(events
& EPOLLET
))
2195 r
= source_set_pending(s
, false);
2199 if (event_source_is_online(s
)) {
2200 r
= source_io_register(s
, s
->enabled
, events
);
2205 s
->io
.events
= events
;
2210 _public_
int sd_event_source_get_io_revents(sd_event_source
*s
, uint32_t* revents
) {
2211 assert_return(s
, -EINVAL
);
2212 assert_return(revents
, -EINVAL
);
2213 assert_return(s
->type
== SOURCE_IO
, -EDOM
);
2214 assert_return(s
->pending
, -ENODATA
);
2215 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2217 *revents
= s
->io
.revents
;
2221 _public_
int sd_event_source_get_signal(sd_event_source
*s
) {
2222 assert_return(s
, -EINVAL
);
2223 assert_return(s
->type
== SOURCE_SIGNAL
, -EDOM
);
2224 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2226 return s
->signal
.sig
;
2229 _public_
int sd_event_source_get_priority(sd_event_source
*s
, int64_t *priority
) {
2230 assert_return(s
, -EINVAL
);
2231 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2233 *priority
= s
->priority
;
2237 _public_
int sd_event_source_set_priority(sd_event_source
*s
, int64_t priority
) {
2238 bool rm_inotify
= false, rm_inode
= false;
2239 struct inotify_data
*new_inotify_data
= NULL
;
2240 struct inode_data
*new_inode_data
= NULL
;
2243 assert_return(s
, -EINVAL
);
2244 assert_return(s
->event
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
2245 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2247 if (s
->priority
== priority
)
2250 if (s
->type
== SOURCE_INOTIFY
) {
2251 struct inode_data
*old_inode_data
;
2253 assert(s
->inotify
.inode_data
);
2254 old_inode_data
= s
->inotify
.inode_data
;
2256 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2257 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2258 * events we allow priority changes only until the first following iteration. */
2259 if (old_inode_data
->fd
< 0)
2262 r
= event_make_inotify_data(s
->event
, priority
, &new_inotify_data
);
2267 r
= event_make_inode_data(s
->event
, new_inotify_data
, old_inode_data
->dev
, old_inode_data
->ino
, &new_inode_data
);
2272 if (new_inode_data
->fd
< 0) {
2273 /* Duplicate the fd for the new inode object if we don't have any yet */
2274 new_inode_data
->fd
= fcntl(old_inode_data
->fd
, F_DUPFD_CLOEXEC
, 3);
2275 if (new_inode_data
->fd
< 0) {
2280 LIST_PREPEND(to_close
, s
->event
->inode_data_to_close
, new_inode_data
);
2283 /* Move the event source to the new inode data structure */
2284 LIST_REMOVE(inotify
.by_inode_data
, old_inode_data
->event_sources
, s
);
2285 LIST_PREPEND(inotify
.by_inode_data
, new_inode_data
->event_sources
, s
);
2286 s
->inotify
.inode_data
= new_inode_data
;
2288 /* Now create the new watch */
2289 r
= inode_data_realize_watch(s
->event
, new_inode_data
);
2292 LIST_REMOVE(inotify
.by_inode_data
, new_inode_data
->event_sources
, s
);
2293 LIST_PREPEND(inotify
.by_inode_data
, old_inode_data
->event_sources
, s
);
2294 s
->inotify
.inode_data
= old_inode_data
;
2298 s
->priority
= priority
;
2300 event_gc_inode_data(s
->event
, old_inode_data
);
2302 } else if (s
->type
== SOURCE_SIGNAL
&& event_source_is_online(s
)) {
2303 struct signal_data
*old
, *d
;
2305 /* Move us from the signalfd belonging to the old
2306 * priority to the signalfd of the new priority */
2308 assert_se(old
= hashmap_get(s
->event
->signal_data
, &s
->priority
));
2310 s
->priority
= priority
;
2312 r
= event_make_signal_data(s
->event
, s
->signal
.sig
, &d
);
2314 s
->priority
= old
->priority
;
2318 event_unmask_signal_data(s
->event
, old
, s
->signal
.sig
);
2320 s
->priority
= priority
;
2322 event_source_pp_prioq_reshuffle(s
);
2324 if (s
->type
== SOURCE_EXIT
)
2325 prioq_reshuffle(s
->event
->exit
, s
, &s
->exit
.prioq_index
);
2331 event_free_inode_data(s
->event
, new_inode_data
);
2334 event_free_inotify_data(s
->event
, new_inotify_data
);
2339 _public_
int sd_event_source_get_enabled(sd_event_source
*s
, int *ret
) {
2340 assert_return(s
, -EINVAL
);
2341 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2346 return s
->enabled
!= SD_EVENT_OFF
;
2349 static int event_source_offline(
2358 assert(enabled
== SD_EVENT_OFF
|| ratelimited
);
2360 /* Unset the pending flag when this event source is disabled */
2361 if (s
->enabled
!= SD_EVENT_OFF
&&
2362 enabled
== SD_EVENT_OFF
&&
2363 !IN_SET(s
->type
, SOURCE_DEFER
, SOURCE_EXIT
)) {
2364 r
= source_set_pending(s
, false);
2369 was_offline
= event_source_is_offline(s
);
2370 s
->enabled
= enabled
;
2371 s
->ratelimited
= ratelimited
;
2376 source_io_unregister(s
);
2379 case SOURCE_TIME_REALTIME
:
2380 case SOURCE_TIME_BOOTTIME
:
2381 case SOURCE_TIME_MONOTONIC
:
2382 case SOURCE_TIME_REALTIME_ALARM
:
2383 case SOURCE_TIME_BOOTTIME_ALARM
:
2384 event_source_time_prioq_reshuffle(s
);
2388 event_gc_signal_data(s
->event
, &s
->priority
, s
->signal
.sig
);
2393 assert(s
->event
->n_online_child_sources
> 0);
2394 s
->event
->n_online_child_sources
--;
2397 if (EVENT_SOURCE_WATCH_PIDFD(s
))
2398 source_child_pidfd_unregister(s
);
2400 event_gc_signal_data(s
->event
, &s
->priority
, SIGCHLD
);
2404 prioq_reshuffle(s
->event
->exit
, s
, &s
->exit
.prioq_index
);
2409 case SOURCE_INOTIFY
:
2413 assert_not_reached("Wut? I shouldn't exist.");
2419 static int event_source_online(
2428 assert(enabled
!= SD_EVENT_OFF
|| !ratelimited
);
2430 /* Unset the pending flag when this event source is enabled */
2431 if (s
->enabled
== SD_EVENT_OFF
&&
2432 enabled
!= SD_EVENT_OFF
&&
2433 !IN_SET(s
->type
, SOURCE_DEFER
, SOURCE_EXIT
)) {
2434 r
= source_set_pending(s
, false);
2439 /* Are we really ready for onlining? */
2440 if (enabled
== SD_EVENT_OFF
|| ratelimited
) {
2441 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2442 s
->enabled
= enabled
;
2443 s
->ratelimited
= ratelimited
;
2447 was_online
= event_source_is_online(s
);
2451 r
= source_io_register(s
, enabled
, s
->io
.events
);
2457 r
= event_make_signal_data(s
->event
, s
->signal
.sig
, NULL
);
2459 event_gc_signal_data(s
->event
, &s
->priority
, s
->signal
.sig
);
2466 if (EVENT_SOURCE_WATCH_PIDFD(s
)) {
2467 /* yes, we have pidfd */
2469 r
= source_child_pidfd_register(s
, enabled
);
2473 /* no pidfd, or something other to watch for than WEXITED */
2475 r
= event_make_signal_data(s
->event
, SIGCHLD
, NULL
);
2477 event_gc_signal_data(s
->event
, &s
->priority
, SIGCHLD
);
2483 s
->event
->n_online_child_sources
++;
2486 case SOURCE_TIME_REALTIME
:
2487 case SOURCE_TIME_BOOTTIME
:
2488 case SOURCE_TIME_MONOTONIC
:
2489 case SOURCE_TIME_REALTIME_ALARM
:
2490 case SOURCE_TIME_BOOTTIME_ALARM
:
2494 case SOURCE_INOTIFY
:
2498 assert_not_reached("Wut? I shouldn't exist.");
2501 s
->enabled
= enabled
;
2502 s
->ratelimited
= ratelimited
;
2504 /* Non-failing operations below */
2506 case SOURCE_TIME_REALTIME
:
2507 case SOURCE_TIME_BOOTTIME
:
2508 case SOURCE_TIME_MONOTONIC
:
2509 case SOURCE_TIME_REALTIME_ALARM
:
2510 case SOURCE_TIME_BOOTTIME_ALARM
:
2511 event_source_time_prioq_reshuffle(s
);
2515 prioq_reshuffle(s
->event
->exit
, s
, &s
->exit
.prioq_index
);
2525 _public_
int sd_event_source_set_enabled(sd_event_source
*s
, int m
) {
2528 assert_return(s
, -EINVAL
);
2529 assert_return(IN_SET(m
, SD_EVENT_OFF
, SD_EVENT_ON
, SD_EVENT_ONESHOT
), -EINVAL
);
2530 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2532 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
2533 if (s
->event
->state
== SD_EVENT_FINISHED
)
2534 return m
== SD_EVENT_OFF
? 0 : -ESTALE
;
2536 if (s
->enabled
== m
) /* No change? */
2539 if (m
== SD_EVENT_OFF
)
2540 r
= event_source_offline(s
, m
, s
->ratelimited
);
2542 if (s
->enabled
!= SD_EVENT_OFF
) {
2543 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
2544 * event source is already enabled after all. */
2549 r
= event_source_online(s
, m
, s
->ratelimited
);
2554 event_source_pp_prioq_reshuffle(s
);
2558 _public_
int sd_event_source_get_time(sd_event_source
*s
, uint64_t *usec
) {
2559 assert_return(s
, -EINVAL
);
2560 assert_return(usec
, -EINVAL
);
2561 assert_return(EVENT_SOURCE_IS_TIME(s
->type
), -EDOM
);
2562 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2564 *usec
= s
->time
.next
;
2568 _public_
int sd_event_source_set_time(sd_event_source
*s
, uint64_t usec
) {
2571 assert_return(s
, -EINVAL
);
2572 assert_return(EVENT_SOURCE_IS_TIME(s
->type
), -EDOM
);
2573 assert_return(s
->event
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
2574 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2576 r
= source_set_pending(s
, false);
2580 s
->time
.next
= usec
;
2582 event_source_time_prioq_reshuffle(s
);
2586 _public_
int sd_event_source_set_time_relative(sd_event_source
*s
, uint64_t usec
) {
2590 assert_return(s
, -EINVAL
);
2591 assert_return(EVENT_SOURCE_IS_TIME(s
->type
), -EDOM
);
2593 r
= sd_event_now(s
->event
, event_source_type_to_clock(s
->type
), &t
);
2597 usec
= usec_add(t
, usec
);
2598 if (usec
== USEC_INFINITY
)
2601 return sd_event_source_set_time(s
, usec
);
2604 _public_
int sd_event_source_get_time_accuracy(sd_event_source
*s
, uint64_t *usec
) {
2605 assert_return(s
, -EINVAL
);
2606 assert_return(usec
, -EINVAL
);
2607 assert_return(EVENT_SOURCE_IS_TIME(s
->type
), -EDOM
);
2608 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2610 *usec
= s
->time
.accuracy
;
2614 _public_
int sd_event_source_set_time_accuracy(sd_event_source
*s
, uint64_t usec
) {
2617 assert_return(s
, -EINVAL
);
2618 assert_return(usec
!= UINT64_MAX
, -EINVAL
);
2619 assert_return(EVENT_SOURCE_IS_TIME(s
->type
), -EDOM
);
2620 assert_return(s
->event
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
2621 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2623 r
= source_set_pending(s
, false);
2628 usec
= DEFAULT_ACCURACY_USEC
;
2630 s
->time
.accuracy
= usec
;
2632 event_source_time_prioq_reshuffle(s
);
2636 _public_
int sd_event_source_get_time_clock(sd_event_source
*s
, clockid_t
*clock
) {
2637 assert_return(s
, -EINVAL
);
2638 assert_return(clock
, -EINVAL
);
2639 assert_return(EVENT_SOURCE_IS_TIME(s
->type
), -EDOM
);
2640 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2642 *clock
= event_source_type_to_clock(s
->type
);
2646 _public_
int sd_event_source_get_child_pid(sd_event_source
*s
, pid_t
*pid
) {
2647 assert_return(s
, -EINVAL
);
2648 assert_return(pid
, -EINVAL
);
2649 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2650 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2652 *pid
= s
->child
.pid
;
2656 _public_
int sd_event_source_get_child_pidfd(sd_event_source
*s
) {
2657 assert_return(s
, -EINVAL
);
2658 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2659 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2661 if (s
->child
.pidfd
< 0)
2664 return s
->child
.pidfd
;
2667 _public_
int sd_event_source_send_child_signal(sd_event_source
*s
, int sig
, const siginfo_t
*si
, unsigned flags
) {
2668 assert_return(s
, -EINVAL
);
2669 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2670 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2671 assert_return(SIGNAL_VALID(sig
), -EINVAL
);
2673 /* If we have already seen an indication that the process exited, refuse sending a signal early. This way we
2674 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
2676 if (s
->child
.exited
)
2679 if (s
->child
.pidfd
>= 0) {
2682 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
2687 if (pidfd_send_signal(s
->child
.pidfd
, sig
, si
? &copy
: NULL
, 0) < 0) {
2688 /* Let's propagate the error only if the system call is not implemented or prohibited */
2689 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
))
2695 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
2701 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
2702 siginfo_t copy
= *si
;
2704 if (rt_sigqueueinfo(s
->child
.pid
, sig
, &copy
) < 0)
2706 } else if (kill(s
->child
.pid
, sig
) < 0)
2712 _public_
int sd_event_source_get_child_pidfd_own(sd_event_source
*s
) {
2713 assert_return(s
, -EINVAL
);
2714 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2716 if (s
->child
.pidfd
< 0)
2719 return s
->child
.pidfd_owned
;
2722 _public_
int sd_event_source_set_child_pidfd_own(sd_event_source
*s
, int own
) {
2723 assert_return(s
, -EINVAL
);
2724 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2726 if (s
->child
.pidfd
< 0)
2729 s
->child
.pidfd_owned
= own
;
2733 _public_
int sd_event_source_get_child_process_own(sd_event_source
*s
) {
2734 assert_return(s
, -EINVAL
);
2735 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2737 return s
->child
.process_owned
;
2740 _public_
int sd_event_source_set_child_process_own(sd_event_source
*s
, int own
) {
2741 assert_return(s
, -EINVAL
);
2742 assert_return(s
->type
== SOURCE_CHILD
, -EDOM
);
2744 s
->child
.process_owned
= own
;
2748 _public_
int sd_event_source_get_inotify_mask(sd_event_source
*s
, uint32_t *mask
) {
2749 assert_return(s
, -EINVAL
);
2750 assert_return(mask
, -EINVAL
);
2751 assert_return(s
->type
== SOURCE_INOTIFY
, -EDOM
);
2752 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2754 *mask
= s
->inotify
.mask
;
2758 _public_
int sd_event_source_set_prepare(sd_event_source
*s
, sd_event_handler_t callback
) {
2761 assert_return(s
, -EINVAL
);
2762 assert_return(s
->type
!= SOURCE_EXIT
, -EDOM
);
2763 assert_return(s
->event
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
2764 assert_return(!event_pid_changed(s
->event
), -ECHILD
);
2766 if (s
->prepare
== callback
)
2769 if (callback
&& s
->prepare
) {
2770 s
->prepare
= callback
;
2774 r
= prioq_ensure_allocated(&s
->event
->prepare
, prepare_prioq_compare
);
2778 s
->prepare
= callback
;
2781 r
= prioq_put(s
->event
->prepare
, s
, &s
->prepare_index
);
2785 prioq_remove(s
->event
->prepare
, s
, &s
->prepare_index
);
2790 _public_
void* sd_event_source_get_userdata(sd_event_source
*s
) {
2791 assert_return(s
, NULL
);
/* Replace the userdata pointer stored on an event source with `userdata`.
 *
 * Returns NULL when s is NULL.
 * NOTE(review): the success return statement is not visible in this listing; presumably the
 * previous userdata pointer is handed back to the caller - confirm against the full source. */
2796 _public_
void *sd_event_source_set_userdata(sd_event_source
*s
, void *userdata
) {
2799 assert_return(s
, NULL
);
2802 s
->userdata
= userdata
;
2807 static int event_source_enter_ratelimited(sd_event_source
*s
) {
2812 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
2813 * the end of the rate limit time window, much as if it was a timer event source. */
2816 return 0; /* Already ratelimited, this is a NOP hence */
2818 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
2819 r
= setup_clock_data(s
->event
, &s
->event
->monotonic
, CLOCK_MONOTONIC
);
2823 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
2824 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
2825 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
2826 if (EVENT_SOURCE_IS_TIME(s
->type
))
2827 event_source_time_prioq_remove(s
, event_get_clock_data(s
->event
, s
->type
));
2829 /* Now, let's add the event source to the monotonic clock instead */
2830 r
= event_source_time_prioq_put(s
, &s
->event
->monotonic
);
2834 /* And let's take the event source officially offline */
2835 r
= event_source_offline(s
, s
->enabled
, /* ratelimited= */ true);
2837 event_source_time_prioq_remove(s
, &s
->event
->monotonic
);
2841 event_source_pp_prioq_reshuffle(s
);
2843 log_debug("Event source %p (%s) entered rate limit state.", s
, strna(s
->description
));
2847 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
2848 * space for it should already be allocated. */
2849 if (EVENT_SOURCE_IS_TIME(s
->type
))
2850 assert_se(event_source_time_prioq_put(s
, event_get_clock_data(s
->event
, s
->type
)) >= 0);
2855 static int event_source_leave_ratelimit(sd_event_source
*s
) {
2860 if (!s
->ratelimited
)
2863 /* Let's take the event source out of the monotonic prioq first. */
2864 event_source_time_prioq_remove(s
, &s
->event
->monotonic
);
2866 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
2867 if (EVENT_SOURCE_IS_TIME(s
->type
)) {
2868 r
= event_source_time_prioq_put(s
, event_get_clock_data(s
->event
, s
->type
));
2873 /* Let's try to take it online again. */
2874 r
= event_source_online(s
, s
->enabled
, /* ratelimited= */ false);
2876 /* Do something roughly sensible when this failed: undo the two prioq ops above */
2877 if (EVENT_SOURCE_IS_TIME(s
->type
))
2878 event_source_time_prioq_remove(s
, event_get_clock_data(s
->event
, s
->type
));
2883 event_source_pp_prioq_reshuffle(s
);
2884 ratelimit_reset(&s
->rate_limit
);
2886 log_debug("Event source %p (%s) left rate limit state.", s
, strna(s
->description
));
2890 /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
2891 * simply put it back in it, maybe we can then process it more successfully next iteration. */
2892 assert_se(event_source_time_prioq_put(s
, &s
->event
->monotonic
) >= 0);
2897 static usec_t
sleep_between(sd_event
*e
, usec_t a
, usec_t b
) {
2904 if (a
>= USEC_INFINITY
)
2905 return USEC_INFINITY
;
2910 initialize_perturb(e
);
2913 Find a good time to wake up again between times a and b. We
2914 have two goals here:
2916 a) We want to wake up as seldom as possible, hence prefer
2917 later times over earlier times.
2919 b) But if we have to wake up, then let's make sure to
2920 dispatch as much as possible on the entire system.
2922 We implement this by waking up everywhere at the same time
2923 within any given minute if we can, synchronised via the
2924 perturbation value determined from the boot ID. If we can't,
2925 then we try to find the same spot in every 10s, then 1s and
2926 then 250ms step. Otherwise, we pick the last possible time
2930 c
= (b
/ USEC_PER_MINUTE
) * USEC_PER_MINUTE
+ e
->perturb
;
2932 if (_unlikely_(c
< USEC_PER_MINUTE
))
2935 c
-= USEC_PER_MINUTE
;
2941 c
= (b
/ (USEC_PER_SEC
*10)) * (USEC_PER_SEC
*10) + (e
->perturb
% (USEC_PER_SEC
*10));
2943 if (_unlikely_(c
< USEC_PER_SEC
*10))
2946 c
-= USEC_PER_SEC
*10;
2952 c
= (b
/ USEC_PER_SEC
) * USEC_PER_SEC
+ (e
->perturb
% USEC_PER_SEC
);
2954 if (_unlikely_(c
< USEC_PER_SEC
))
2963 c
= (b
/ (USEC_PER_MSEC
*250)) * (USEC_PER_MSEC
*250) + (e
->perturb
% (USEC_PER_MSEC
*250));
2965 if (_unlikely_(c
< USEC_PER_MSEC
*250))
2968 c
-= USEC_PER_MSEC
*250;
2977 static int event_arm_timer(
2979 struct clock_data
*d
) {
2981 struct itimerspec its
= {};
2982 sd_event_source
*a
, *b
;
2988 if (!d
->needs_rearm
)
2991 d
->needs_rearm
= false;
2993 a
= prioq_peek(d
->earliest
);
2994 if (!a
|| a
->enabled
== SD_EVENT_OFF
|| time_event_source_next(a
) == USEC_INFINITY
) {
2999 if (d
->next
== USEC_INFINITY
)
3003 if (timerfd_settime(d
->fd
, TFD_TIMER_ABSTIME
, &its
, NULL
) < 0)
3006 d
->next
= USEC_INFINITY
;
3010 b
= prioq_peek(d
->latest
);
3011 assert_se(b
&& b
->enabled
!= SD_EVENT_OFF
);
3013 t
= sleep_between(e
, time_event_source_next(a
), time_event_source_latest(b
));
3017 assert_se(d
->fd
>= 0);
3020 /* We don't want to disarm here, just mean some time looooong ago. */
3021 its
.it_value
.tv_sec
= 0;
3022 its
.it_value
.tv_nsec
= 1;
3024 timespec_store(&its
.it_value
, t
);
3026 if (timerfd_settime(d
->fd
, TFD_TIMER_ABSTIME
, &its
, NULL
) < 0)
3033 static int process_io(sd_event
*e
, sd_event_source
*s
, uint32_t revents
) {
3036 assert(s
->type
== SOURCE_IO
);
3038 /* If the event source was already pending, we just OR in the
3039 * new revents, otherwise we reset the value. The ORing is
3040 * necessary to handle EPOLLONESHOT events properly where
3041 * readability might happen independently of writability, and
3042 * we need to keep track of both */
3045 s
->io
.revents
|= revents
;
3047 s
->io
.revents
= revents
;
3049 return source_set_pending(s
, true);
3052 static int flush_timer(sd_event
*e
, int fd
, uint32_t events
, usec_t
*next
) {
3059 assert_return(events
== EPOLLIN
, -EIO
);
3061 ss
= read(fd
, &x
, sizeof(x
));
3063 if (IN_SET(errno
, EAGAIN
, EINTR
))
3069 if (_unlikely_(ss
!= sizeof(x
)))
3073 *next
= USEC_INFINITY
;
3078 static int process_timer(
3081 struct clock_data
*d
) {
3090 s
= prioq_peek(d
->earliest
);
3091 if (!s
|| time_event_source_next(s
) > n
)
3094 if (s
->ratelimited
) {
3095 /* This is an event source whose ratelimit window has ended. Let's turn it on
3097 assert(s
->ratelimited
);
3099 r
= event_source_leave_ratelimit(s
);
3106 if (s
->enabled
== SD_EVENT_OFF
|| s
->pending
)
3109 r
= source_set_pending(s
, true);
3113 event_source_time_prioq_reshuffle(s
);
3119 static int process_child(sd_event
*e
, int64_t threshold
, int64_t *ret_min_priority
) {
3120 int64_t min_priority
= threshold
;
3121 bool something_new
= false;
3126 assert(ret_min_priority
);
3128 if (!e
->need_process_child
) {
3129 *ret_min_priority
= min_priority
;
3133 e
->need_process_child
= false;
3136 So, this is ugly. We iteratively invoke waitid() with P_PID
3137 + WNOHANG for each PID we wait for, instead of using
3138 P_ALL. This is because we only want to get child
3139 information of very specific child processes, and not all
3140 of them. We might not have processed the SIGCHLD event of a
3141 previous invocation and we don't want to maintain a
3142 unbounded *per-child* event queue, hence we really don't
3143 want anything flushed out of the kernel's queue that we
3144 don't care about. Since this is O(n) this means that if you
3145 have a lot of processes you probably want to handle SIGCHLD
3148 We do not reap the children here (by using WNOWAIT), this
3149 is only done after the event source is dispatched so that
3150 the callback still sees the process as a zombie.
3153 HASHMAP_FOREACH(s
, e
->child_sources
) {
3154 assert(s
->type
== SOURCE_CHILD
);
3156 if (s
->priority
> threshold
)
3162 if (event_source_is_offline(s
))
3165 if (s
->child
.exited
)
3168 if (EVENT_SOURCE_WATCH_PIDFD(s
)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */
3171 zero(s
->child
.siginfo
);
3172 if (waitid(P_PID
, s
->child
.pid
, &s
->child
.siginfo
,
3173 WNOHANG
| (s
->child
.options
& WEXITED
? WNOWAIT
: 0) | s
->child
.options
) < 0)
3174 return negative_errno();
3176 if (s
->child
.siginfo
.si_pid
!= 0) {
3177 bool zombie
= IN_SET(s
->child
.siginfo
.si_code
, CLD_EXITED
, CLD_KILLED
, CLD_DUMPED
);
3180 s
->child
.exited
= true;
3182 if (!zombie
&& (s
->child
.options
& WEXITED
)) {
3183 /* If the child isn't dead then let's
3184 * immediately remove the state change
3185 * from the queue, since there's no
3186 * benefit in leaving it queued */
3188 assert(s
->child
.options
& (WSTOPPED
|WCONTINUED
));
3189 (void) waitid(P_PID
, s
->child
.pid
, &s
->child
.siginfo
, WNOHANG
|(s
->child
.options
& (WSTOPPED
|WCONTINUED
)));
3192 r
= source_set_pending(s
, true);
3196 something_new
= true;
3197 min_priority
= MIN(min_priority
, s
->priority
);
3202 *ret_min_priority
= min_priority
;
3203 return something_new
;
3206 static int process_pidfd(sd_event
*e
, sd_event_source
*s
, uint32_t revents
) {
3209 assert(s
->type
== SOURCE_CHILD
);
3214 if (event_source_is_offline(s
))
3217 if (!EVENT_SOURCE_WATCH_PIDFD(s
))
3220 zero(s
->child
.siginfo
);
3221 if (waitid(P_PID
, s
->child
.pid
, &s
->child
.siginfo
, WNOHANG
| WNOWAIT
| s
->child
.options
) < 0)
3224 if (s
->child
.siginfo
.si_pid
== 0)
3227 if (IN_SET(s
->child
.siginfo
.si_code
, CLD_EXITED
, CLD_KILLED
, CLD_DUMPED
))
3228 s
->child
.exited
= true;
3230 return source_set_pending(s
, true);
3233 static int process_signal(sd_event
*e
, struct signal_data
*d
, uint32_t events
, int64_t *min_priority
) {
3238 assert_return(events
== EPOLLIN
, -EIO
);
3239 assert(min_priority
);
3241 /* If there's a signal queued on this priority and SIGCHLD is
3242 on this priority too, then make sure to recheck the
3243 children we watch. This is because we only ever dequeue
3244 the first signal per priority, and if we dequeue one, and
3245 SIGCHLD might be enqueued later we wouldn't know, but we
3246 might have higher priority children we care about hence we
3247 need to check that explicitly. */
3249 if (sigismember(&d
->sigset
, SIGCHLD
))
3250 e
->need_process_child
= true;
3252 /* If there's already an event source pending for this
3253 * priority we don't read another */
3258 struct signalfd_siginfo si
;
3260 sd_event_source
*s
= NULL
;
3262 n
= read(d
->fd
, &si
, sizeof(si
));
3264 if (IN_SET(errno
, EAGAIN
, EINTR
))
3270 if (_unlikely_(n
!= sizeof(si
)))
3273 assert(SIGNAL_VALID(si
.ssi_signo
));
3275 if (e
->signal_sources
)
3276 s
= e
->signal_sources
[si
.ssi_signo
];
3282 s
->signal
.siginfo
= si
;
3285 r
= source_set_pending(s
, true);
3288 if (r
> 0 && *min_priority
>= s
->priority
) {
3289 *min_priority
= s
->priority
;
3290 return 1; /* an event source with smaller priority is queued. */
3297 static int event_inotify_data_read(sd_event
*e
, struct inotify_data
*d
, uint32_t revents
, int64_t threshold
) {
3303 assert_return(revents
== EPOLLIN
, -EIO
);
3305 /* If there's already an event source pending for this priority, don't read another */
3306 if (d
->n_pending
> 0)
3309 /* Is the read buffer non-empty? If so, let's not read more */
3310 if (d
->buffer_filled
> 0)
3313 if (d
->priority
> threshold
)
3316 n
= read(d
->fd
, &d
->buffer
, sizeof(d
->buffer
));
3318 if (IN_SET(errno
, EAGAIN
, EINTR
))
3325 d
->buffer_filled
= (size_t) n
;
3326 LIST_PREPEND(buffered
, e
->inotify_data_buffered
, d
);
3331 static void event_inotify_data_drop(sd_event
*e
, struct inotify_data
*d
, size_t sz
) {
3334 assert(sz
<= d
->buffer_filled
);
3339 /* Move the rest to the buffer to the front, in order to get things properly aligned again */
3340 memmove(d
->buffer
.raw
, d
->buffer
.raw
+ sz
, d
->buffer_filled
- sz
);
3341 d
->buffer_filled
-= sz
;
3343 if (d
->buffer_filled
== 0)
3344 LIST_REMOVE(buffered
, e
->inotify_data_buffered
, d
);
3347 static int event_inotify_data_process(sd_event
*e
, struct inotify_data
*d
) {
3353 /* If there's already an event source pending for this priority, don't read another */
3354 if (d
->n_pending
> 0)
3357 while (d
->buffer_filled
> 0) {
3360 /* Let's validate that the event structures are complete */
3361 if (d
->buffer_filled
< offsetof(struct inotify_event
, name
))
3364 sz
= offsetof(struct inotify_event
, name
) + d
->buffer
.ev
.len
;
3365 if (d
->buffer_filled
< sz
)
3368 if (d
->buffer
.ev
.mask
& IN_Q_OVERFLOW
) {
3369 struct inode_data
*inode_data
;
3371 /* The queue overran, let's pass this event to all event sources connected to this inotify
3374 HASHMAP_FOREACH(inode_data
, d
->inodes
) {
3377 LIST_FOREACH(inotify
.by_inode_data
, s
, inode_data
->event_sources
) {
3379 if (event_source_is_offline(s
))
3382 r
= source_set_pending(s
, true);
3388 struct inode_data
*inode_data
;
3391 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3392 * our watch descriptor table. */
3393 if (d
->buffer
.ev
.mask
& IN_IGNORED
) {
3395 inode_data
= hashmap_remove(d
->wd
, INT_TO_PTR(d
->buffer
.ev
.wd
));
3397 event_inotify_data_drop(e
, d
, sz
);
3401 /* The watch descriptor was removed by the kernel, let's drop it here too */
3402 inode_data
->wd
= -1;
3404 inode_data
= hashmap_get(d
->wd
, INT_TO_PTR(d
->buffer
.ev
.wd
));
3406 event_inotify_data_drop(e
, d
, sz
);
3411 /* Trigger all event sources that are interested in these events. Also trigger all event
3412 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3413 LIST_FOREACH(inotify
.by_inode_data
, s
, inode_data
->event_sources
) {
3415 if (event_source_is_offline(s
))
3418 if ((d
->buffer
.ev
.mask
& (IN_IGNORED
|IN_UNMOUNT
)) == 0 &&
3419 (s
->inotify
.mask
& d
->buffer
.ev
.mask
& IN_ALL_EVENTS
) == 0)
3422 r
= source_set_pending(s
, true);
3428 /* Something pending now? If so, let's finish, otherwise let's read more. */
3429 if (d
->n_pending
> 0)
3436 static int process_inotify(sd_event
*e
) {
3437 struct inotify_data
*d
;
3442 LIST_FOREACH(buffered
, d
, e
->inotify_data_buffered
) {
3443 r
= event_inotify_data_process(e
, d
);
3453 static int source_dispatch(sd_event_source
*s
) {
3454 _cleanup_(sd_event_unrefp
) sd_event
*saved_event
= NULL
;
3455 EventSourceType saved_type
;
3459 assert(s
->pending
|| s
->type
== SOURCE_EXIT
);
3461 /* Save the event source type, here, so that we still know it after the event callback which might
3462 * invalidate the event. */
3463 saved_type
= s
->type
;
3465 /* Similar, store a reference to the event loop object, so that we can still access it after the
3466 * callback might have invalidated/disconnected the event source. */
3467 saved_event
= sd_event_ref(s
->event
);
3469 /* Check if we hit the ratelimit for this event source, if so, let's disable it. */
3470 assert(!s
->ratelimited
);
3471 if (!ratelimit_below(&s
->rate_limit
)) {
3472 r
= event_source_enter_ratelimited(s
);
3479 if (!IN_SET(s
->type
, SOURCE_DEFER
, SOURCE_EXIT
)) {
3480 r
= source_set_pending(s
, false);
3485 if (s
->type
!= SOURCE_POST
) {
3488 /* If we execute a non-post source, let's mark all
3489 * post sources as pending */
3491 SET_FOREACH(z
, s
->event
->post_sources
) {
3492 if (event_source_is_offline(z
))
3495 r
= source_set_pending(z
, true);
3501 if (s
->enabled
== SD_EVENT_ONESHOT
) {
3502 r
= sd_event_source_set_enabled(s
, SD_EVENT_OFF
);
3507 s
->dispatching
= true;
3512 r
= s
->io
.callback(s
, s
->io
.fd
, s
->io
.revents
, s
->userdata
);
3515 case SOURCE_TIME_REALTIME
:
3516 case SOURCE_TIME_BOOTTIME
:
3517 case SOURCE_TIME_MONOTONIC
:
3518 case SOURCE_TIME_REALTIME_ALARM
:
3519 case SOURCE_TIME_BOOTTIME_ALARM
:
3520 r
= s
->time
.callback(s
, s
->time
.next
, s
->userdata
);
3524 r
= s
->signal
.callback(s
, &s
->signal
.siginfo
, s
->userdata
);
3527 case SOURCE_CHILD
: {
3530 zombie
= IN_SET(s
->child
.siginfo
.si_code
, CLD_EXITED
, CLD_KILLED
, CLD_DUMPED
);
3532 r
= s
->child
.callback(s
, &s
->child
.siginfo
, s
->userdata
);
3534 /* Now, reap the PID for good. */
3536 (void) waitid(P_PID
, s
->child
.pid
, &s
->child
.siginfo
, WNOHANG
|WEXITED
);
3537 s
->child
.waited
= true;
3544 r
= s
->defer
.callback(s
, s
->userdata
);
3548 r
= s
->post
.callback(s
, s
->userdata
);
3552 r
= s
->exit
.callback(s
, s
->userdata
);
3555 case SOURCE_INOTIFY
: {
3556 struct sd_event
*e
= s
->event
;
3557 struct inotify_data
*d
;
3560 assert(s
->inotify
.inode_data
);
3561 assert_se(d
= s
->inotify
.inode_data
->inotify_data
);
3563 assert(d
->buffer_filled
>= offsetof(struct inotify_event
, name
));
3564 sz
= offsetof(struct inotify_event
, name
) + d
->buffer
.ev
.len
;
3565 assert(d
->buffer_filled
>= sz
);
3567 r
= s
->inotify
.callback(s
, &d
->buffer
.ev
, s
->userdata
);
3569 /* When no event is pending anymore on this inotify object, then let's drop the event from the
3571 if (d
->n_pending
== 0)
3572 event_inotify_data_drop(e
, d
, sz
);
3577 case SOURCE_WATCHDOG
:
3578 case _SOURCE_EVENT_SOURCE_TYPE_MAX
:
3579 case _SOURCE_EVENT_SOURCE_TYPE_INVALID
:
3580 assert_not_reached("Wut? I shouldn't exist.");
3583 s
->dispatching
= false;
3586 log_debug_errno(r
, "Event source %s (type %s) returned error, %s: %m",
3587 strna(s
->description
),
3588 event_source_type_to_string(saved_type
),
3589 s
->exit_on_failure
? "exiting" : "disabling");
3591 if (s
->exit_on_failure
)
3592 (void) sd_event_exit(saved_event
, r
);
3598 sd_event_source_set_enabled(s
, SD_EVENT_OFF
);
3603 static int event_prepare(sd_event
*e
) {
3611 s
= prioq_peek(e
->prepare
);
3612 if (!s
|| s
->prepare_iteration
== e
->iteration
|| event_source_is_offline(s
))
3615 s
->prepare_iteration
= e
->iteration
;
3616 r
= prioq_reshuffle(e
->prepare
, s
, &s
->prepare_index
);
3622 s
->dispatching
= true;
3623 r
= s
->prepare(s
, s
->userdata
);
3624 s
->dispatching
= false;
3627 log_debug_errno(r
, "Prepare callback of event source %s (type %s) returned error, %s: %m",
3628 strna(s
->description
),
3629 event_source_type_to_string(s
->type
),
3630 s
->exit_on_failure
? "exiting" : "disabling");
3632 if (s
->exit_on_failure
)
3633 (void) sd_event_exit(e
, r
);
3639 sd_event_source_set_enabled(s
, SD_EVENT_OFF
);
3645 static int dispatch_exit(sd_event
*e
) {
3651 p
= prioq_peek(e
->exit
);
3652 if (!p
|| event_source_is_offline(p
)) {
3653 e
->state
= SD_EVENT_FINISHED
;
3657 _unused_
_cleanup_(sd_event_unrefp
) sd_event
*ref
= sd_event_ref(e
);
3659 e
->state
= SD_EVENT_EXITING
;
3660 r
= source_dispatch(p
);
3661 e
->state
= SD_EVENT_INITIAL
;
3665 static sd_event_source
* event_next_pending(sd_event
*e
) {
3670 p
= prioq_peek(e
->pending
);
3674 if (event_source_is_offline(p
))
3680 static int arm_watchdog(sd_event
*e
) {
3681 struct itimerspec its
= {};
3685 assert(e
->watchdog_fd
>= 0);
3687 t
= sleep_between(e
,
3688 e
->watchdog_last
+ (e
->watchdog_period
/ 2),
3689 e
->watchdog_last
+ (e
->watchdog_period
* 3 / 4));
3691 timespec_store(&its
.it_value
, t
);
3693 /* Make sure we never set the watchdog to 0, which tells the
3694 * kernel to disable it. */
3695 if (its
.it_value
.tv_sec
== 0 && its
.it_value
.tv_nsec
== 0)
3696 its
.it_value
.tv_nsec
= 1;
3698 if (timerfd_settime(e
->watchdog_fd
, TFD_TIMER_ABSTIME
, &its
, NULL
) < 0)
/* Ping the service watchdog and re-arm the watchdog timer.
 *
 * Sends "WATCHDOG=1" via sd_notify(), records the loop's cached monotonic timestamp as the time
 * of the last ping, and returns the result of arm_watchdog(). The ping is rate-limited: the check
 * below compares the cached timestamp against watchdog_last plus a quarter of watchdog_period.
 * NOTE(review): the statements guarded by that check, and anything preceding it, are not visible
 * in this listing; presumably the function returns early without pinging - confirm. */
3704 static int process_watchdog(sd_event
*e
) {
3710 /* Don't notify watchdog too often */
3711 if (e
->watchdog_last
+ e
->watchdog_period
/ 4 > e
->timestamp
.monotonic
)
3714 sd_notify(false, "WATCHDOG=1");
3715 e
->watchdog_last
= e
->timestamp
.monotonic
;
3717 return arm_watchdog(e
);
3720 static void event_close_inode_data_fds(sd_event
*e
) {
3721 struct inode_data
*d
;
3725 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
3726 * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
3727 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
3728 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
3731 while ((d
= e
->inode_data_to_close
)) {
3733 d
->fd
= safe_close(d
->fd
);
3735 LIST_REMOVE(to_close
, e
->inode_data_to_close
, d
);
3739 _public_
int sd_event_prepare(sd_event
*e
) {
3742 assert_return(e
, -EINVAL
);
3743 assert_return(e
= event_resolve(e
), -ENOPKG
);
3744 assert_return(!event_pid_changed(e
), -ECHILD
);
3745 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
3746 assert_return(e
->state
== SD_EVENT_INITIAL
, -EBUSY
);
3748 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
3749 * this check here once, since gettid() is typically not cached, and thus want to minimize
3751 assert_return(!e
->default_event_ptr
|| e
->tid
== gettid(), -EREMOTEIO
);
3753 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
3754 _unused_
_cleanup_(sd_event_unrefp
) sd_event
*ref
= sd_event_ref(e
);
3756 if (e
->exit_requested
)
3761 e
->state
= SD_EVENT_PREPARING
;
3762 r
= event_prepare(e
);
3763 e
->state
= SD_EVENT_INITIAL
;
3767 r
= event_arm_timer(e
, &e
->realtime
);
3771 r
= event_arm_timer(e
, &e
->boottime
);
3775 r
= event_arm_timer(e
, &e
->monotonic
);
3779 r
= event_arm_timer(e
, &e
->realtime_alarm
);
3783 r
= event_arm_timer(e
, &e
->boottime_alarm
);
3787 event_close_inode_data_fds(e
);
3789 if (event_next_pending(e
) || e
->need_process_child
)
3792 e
->state
= SD_EVENT_ARMED
;
3797 e
->state
= SD_EVENT_ARMED
;
3798 r
= sd_event_wait(e
, 0);
3800 e
->state
= SD_EVENT_ARMED
;
3805 static int epoll_wait_usec(
3807 struct epoll_event
*events
,
3813 static bool epoll_pwait2_absent
= false;
3815 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not.
3817 * FIXME: this is temporarily disabled until epoll_pwait2() becomes more widely available.
3818 * See https://github.com/systemd/systemd/pull/18973 and
3819 * https://github.com/systemd/systemd/issues/19052. */
3821 if (!epoll_pwait2_absent
&& timeout
!= USEC_INFINITY
) {
3824 r
= epoll_pwait2(fd
,
3827 timespec_store(&ts
, timeout
),
3831 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
))
3832 return -errno
; /* Only fallback to old epoll_wait() if the syscall is masked or not
3835 epoll_pwait2_absent
= true;
3839 if (timeout
== USEC_INFINITY
)
3844 k
= DIV_ROUND_UP(timeout
, USEC_PER_MSEC
);
3846 msec
= INT_MAX
; /* Saturate */
3861 static int process_epoll(sd_event
*e
, usec_t timeout
, int64_t threshold
, int64_t *ret_min_priority
) {
3862 int64_t min_priority
= threshold
;
3863 bool something_new
= false;
3864 size_t n_event_queue
, m
;
3868 assert(ret_min_priority
);
3870 n_event_queue
= MAX(e
->n_sources
, 1u);
3871 if (!GREEDY_REALLOC(e
->event_queue
, e
->event_queue_allocated
, n_event_queue
))
3874 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
3875 if (e
->inotify_data_buffered
)
3879 r
= epoll_wait_usec(e
->epoll_fd
, e
->event_queue
, e
->event_queue_allocated
, timeout
);
3885 if (m
< e
->event_queue_allocated
)
3888 if (e
->event_queue_allocated
>= n_event_queue
* 10)
3891 if (!GREEDY_REALLOC(e
->event_queue
, e
->event_queue_allocated
, e
->event_queue_allocated
+ n_event_queue
))
3897 /* Set timestamp only when this is called first time. */
3898 if (threshold
== INT64_MAX
)
3899 triple_timestamp_get(&e
->timestamp
);
3901 for (size_t i
= 0; i
< m
; i
++) {
3903 if (e
->event_queue
[i
].data
.ptr
== INT_TO_PTR(SOURCE_WATCHDOG
))
3904 r
= flush_timer(e
, e
->watchdog_fd
, e
->event_queue
[i
].events
, NULL
);
3906 WakeupType
*t
= e
->event_queue
[i
].data
.ptr
;
3910 case WAKEUP_EVENT_SOURCE
: {
3911 sd_event_source
*s
= e
->event_queue
[i
].data
.ptr
;
3915 if (s
->priority
> threshold
)
3918 min_priority
= MIN(min_priority
, s
->priority
);
3923 r
= process_io(e
, s
, e
->event_queue
[i
].events
);
3927 r
= process_pidfd(e
, s
, e
->event_queue
[i
].events
);
3931 assert_not_reached("Unexpected event source type");
3937 case WAKEUP_CLOCK_DATA
: {
3938 struct clock_data
*d
= e
->event_queue
[i
].data
.ptr
;
3942 r
= flush_timer(e
, d
->fd
, e
->event_queue
[i
].events
, &d
->next
);
3946 case WAKEUP_SIGNAL_DATA
:
3947 r
= process_signal(e
, e
->event_queue
[i
].data
.ptr
, e
->event_queue
[i
].events
, &min_priority
);
3950 case WAKEUP_INOTIFY_DATA
:
3951 r
= event_inotify_data_read(e
, e
->event_queue
[i
].data
.ptr
, e
->event_queue
[i
].events
, threshold
);
3955 assert_not_reached("Invalid wake-up pointer");
3961 something_new
= true;
3964 *ret_min_priority
= min_priority
;
3965 return something_new
;
3968 _public_
int sd_event_wait(sd_event
*e
, uint64_t timeout
) {
3971 assert_return(e
, -EINVAL
);
3972 assert_return(e
= event_resolve(e
), -ENOPKG
);
3973 assert_return(!event_pid_changed(e
), -ECHILD
);
3974 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
3975 assert_return(e
->state
== SD_EVENT_ARMED
, -EBUSY
);
3977 if (e
->exit_requested
) {
3978 e
->state
= SD_EVENT_PENDING
;
3982 for (int64_t threshold
= INT64_MAX
; ; threshold
--) {
3983 int64_t epoll_min_priority
, child_min_priority
;
3985 /* There may be a possibility that new epoll (especially IO) and child events are
3986 * triggered just after process_epoll() call but before process_child(), and the new IO
3987 * events may have higher priority than the child events. To salvage these events,
3988 * let's call epoll_wait() again, but accepts only events with higher priority than the
3989 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
3990 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
3991 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
3993 r
= process_epoll(e
, timeout
, threshold
, &epoll_min_priority
);
3995 e
->state
= SD_EVENT_PENDING
;
4000 if (r
== 0 && threshold
< INT64_MAX
)
4001 /* No new epoll event. */
4004 r
= process_child(e
, threshold
, &child_min_priority
);
4008 /* No new child event. */
4011 threshold
= MIN(epoll_min_priority
, child_min_priority
);
4012 if (threshold
== INT64_MIN
)
4018 r
= process_watchdog(e
);
4022 r
= process_timer(e
, e
->timestamp
.realtime
, &e
->realtime
);
4026 r
= process_timer(e
, e
->timestamp
.boottime
, &e
->boottime
);
4030 r
= process_timer(e
, e
->timestamp
.monotonic
, &e
->monotonic
);
4034 r
= process_timer(e
, e
->timestamp
.realtime
, &e
->realtime_alarm
);
4038 r
= process_timer(e
, e
->timestamp
.boottime
, &e
->boottime_alarm
);
4042 r
= process_inotify(e
);
4046 if (event_next_pending(e
)) {
4047 e
->state
= SD_EVENT_PENDING
;
4054 e
->state
= SD_EVENT_INITIAL
;
4059 _public_
int sd_event_dispatch(sd_event
*e
) {
4063 assert_return(e
, -EINVAL
);
4064 assert_return(e
= event_resolve(e
), -ENOPKG
);
4065 assert_return(!event_pid_changed(e
), -ECHILD
);
4066 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
4067 assert_return(e
->state
== SD_EVENT_PENDING
, -EBUSY
);
4069 if (e
->exit_requested
)
4070 return dispatch_exit(e
);
4072 p
= event_next_pending(e
);
4074 _unused_
_cleanup_(sd_event_unrefp
) sd_event
*ref
= sd_event_ref(e
);
4076 e
->state
= SD_EVENT_RUNNING
;
4077 r
= source_dispatch(p
);
4078 e
->state
= SD_EVENT_INITIAL
;
4082 e
->state
= SD_EVENT_INITIAL
;
4087 static void event_log_delays(sd_event
*e
) {
4088 char b
[ELEMENTSOF(e
->delays
) * DECIMAL_STR_MAX(unsigned) + 1], *p
;
4093 for (i
= 0; i
< ELEMENTSOF(e
->delays
); i
++) {
4094 l
= strpcpyf(&p
, l
, "%u ", e
->delays
[i
]);
4097 log_debug("Event loop iterations: %s", b
);
4100 _public_
int sd_event_run(sd_event
*e
, uint64_t timeout
) {
4103 assert_return(e
, -EINVAL
);
4104 assert_return(e
= event_resolve(e
), -ENOPKG
);
4105 assert_return(!event_pid_changed(e
), -ECHILD
);
4106 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
4107 assert_return(e
->state
== SD_EVENT_INITIAL
, -EBUSY
);
4109 if (e
->profile_delays
&& e
->last_run_usec
!= 0) {
4113 this_run
= now(CLOCK_MONOTONIC
);
4115 l
= u64log2(this_run
- e
->last_run_usec
);
4116 assert(l
< ELEMENTSOF(e
->delays
));
4119 if (this_run
- e
->last_log_usec
>= 5*USEC_PER_SEC
) {
4120 event_log_delays(e
);
4121 e
->last_log_usec
= this_run
;
4125 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4126 _unused_
_cleanup_(sd_event_unrefp
) sd_event
*ref
= sd_event_ref(e
);
4128 r
= sd_event_prepare(e
);
4130 /* There was nothing? Then wait... */
4131 r
= sd_event_wait(e
, timeout
);
4133 if (e
->profile_delays
)
4134 e
->last_run_usec
= now(CLOCK_MONOTONIC
);
4137 /* There's something now, then let's dispatch it */
4138 r
= sd_event_dispatch(e
);
4148 _public_
int sd_event_loop(sd_event
*e
) {
4151 assert_return(e
, -EINVAL
);
4152 assert_return(e
= event_resolve(e
), -ENOPKG
);
4153 assert_return(!event_pid_changed(e
), -ECHILD
);
4154 assert_return(e
->state
== SD_EVENT_INITIAL
, -EBUSY
);
4156 _unused_
_cleanup_(sd_event_unrefp
) sd_event
*ref
= NULL
;
4158 while (e
->state
!= SD_EVENT_FINISHED
) {
4159 r
= sd_event_run(e
, UINT64_MAX
);
4164 return e
->exit_code
;
4167 _public_
int sd_event_get_fd(sd_event
*e
) {
4168 assert_return(e
, -EINVAL
);
4169 assert_return(e
= event_resolve(e
), -ENOPKG
);
4170 assert_return(!event_pid_changed(e
), -ECHILD
);
4175 _public_
int sd_event_get_state(sd_event
*e
) {
4176 assert_return(e
, -EINVAL
);
4177 assert_return(e
= event_resolve(e
), -ENOPKG
);
4178 assert_return(!event_pid_changed(e
), -ECHILD
);
/* Retrieve the exit code stored on the event loop into *code.
 *
 * Fails with -EINVAL if e or code is NULL, -ENOPKG if event_resolve() cannot resolve e, and
 * -ECHILD if event_pid_changed() reports a PID change. A loop for which no exit has been
 * requested is handled separately before *code is written.
 * NOTE(review): the statement guarded by the exit_requested check and the final return are not
 * visible in this listing; confirm the returned values against the full source. */
4183 _public_
int sd_event_get_exit_code(sd_event
*e
, int *code
) {
4184 assert_return(e
, -EINVAL
);
4185 assert_return(e
= event_resolve(e
), -ENOPKG
);
4186 assert_return(code
, -EINVAL
);
4187 assert_return(!event_pid_changed(e
), -ECHILD
);
4189 if (!e
->exit_requested
)
4192 *code
= e
->exit_code
;
4196 _public_
int sd_event_exit(sd_event
*e
, int code
) {
4197 assert_return(e
, -EINVAL
);
4198 assert_return(e
= event_resolve(e
), -ENOPKG
);
4199 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
4200 assert_return(!event_pid_changed(e
), -ECHILD
);
4202 e
->exit_requested
= true;
4203 e
->exit_code
= code
;
4208 _public_
int sd_event_now(sd_event
*e
, clockid_t clock
, uint64_t *usec
) {
4209 assert_return(e
, -EINVAL
);
4210 assert_return(e
= event_resolve(e
), -ENOPKG
);
4211 assert_return(usec
, -EINVAL
);
4212 assert_return(!event_pid_changed(e
), -ECHILD
);
4214 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock
))
4217 /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that we don't use clock_supported() here,
4218 * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
4219 * the purpose of getting the time this doesn't matter. */
4220 if (IN_SET(clock
, CLOCK_BOOTTIME
, CLOCK_BOOTTIME_ALARM
) && !clock_boottime_supported())
4223 if (!triple_timestamp_is_set(&e
->timestamp
)) {
4224 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4229 *usec
= triple_timestamp_by_clock(&e
->timestamp
, clock
);
4233 _public_
int sd_event_default(sd_event
**ret
) {
4238 return !!default_event
;
4240 if (default_event
) {
4241 *ret
= sd_event_ref(default_event
);
4245 r
= sd_event_new(&e
);
4249 e
->default_event_ptr
= &default_event
;
4257 _public_
int sd_event_get_tid(sd_event
*e
, pid_t
*tid
) {
4258 assert_return(e
, -EINVAL
);
4259 assert_return(e
= event_resolve(e
), -ENOPKG
);
4260 assert_return(tid
, -EINVAL
);
4261 assert_return(!event_pid_changed(e
), -ECHILD
);
/* Turns the loop's watchdog support on or off according to b.
 *
 * Preconditions checked with assert_return(): e non-NULL (-EINVAL), event_resolve(e) non-NULL
 * (-ENOPKG), event_pid_changed(e) false (-ECHILD).
 *
 * Visible steps:
 *  - if e->watchdog already equals !!b, the function takes an early-out branch;
 *  - set-up sequence: sd_watchdog_enabled() fills e->watchdog_period, a first "WATCHDOG=1"
 *    notification is sent and timestamped in e->watchdog_last, a CLOCK_MONOTONIC timerfd is created
 *    in e->watchdog_fd, armed through arm_watchdog(), and added to e->epoll_fd tagged with
 *    SOURCE_WATCHDOG;
 *  - tear-down sequence: if e->watchdog_fd is valid it is removed from e->epoll_fd and closed.
 * NOTE(review): the branch headers selecting set-up vs. tear-down, the error checks after each call,
 * the remaining initializer of ev, and the return values are not visible here. The last
 * safe_close() presumably belongs to an error-cleanup path — confirm against the full file. */
4271 _public_
int sd_event_set_watchdog(sd_event
*e
, int b
) {
4274 assert_return(e
, -EINVAL
);
4275 assert_return(e
= event_resolve(e
), -ENOPKG
);
4276 assert_return(!event_pid_changed(e
), -ECHILD
);
/* Nothing to change if the requested state (normalized to 0/1) is already in effect. */
4278 if (e
->watchdog
== !!b
)
/* Query the watchdog configuration; the period is stored in e->watchdog_period. */
4282 r
= sd_watchdog_enabled(false, &e
->watchdog_period
);
4286 /* Issue first ping immediately */
4287 sd_notify(false, "WATCHDOG=1");
4288 e
->watchdog_last
= now(CLOCK_MONOTONIC
);
/* Timer that drives the pings: non-blocking, close-on-exec, on the monotonic clock. */
4290 e
->watchdog_fd
= timerfd_create(CLOCK_MONOTONIC
, TFD_NONBLOCK
|TFD_CLOEXEC
);
4291 if (e
->watchdog_fd
< 0)
4294 r
= arm_watchdog(e
);
/* The epoll user data carries the SOURCE_WATCHDOG tag rather than a pointer to an event source. */
4298 struct epoll_event ev
= {
4300 .data
.ptr
= INT_TO_PTR(SOURCE_WATCHDOG
),
4303 if (epoll_ctl(e
->epoll_fd
, EPOLL_CTL_ADD
, e
->watchdog_fd
, &ev
) < 0) {
/* Tear-down: deregister the timer (result deliberately ignored) and close it. safe_close()'s return
 * value is stored back into the field. */
4309 if (e
->watchdog_fd
>= 0) {
4310 (void) epoll_ctl(e
->epoll_fd
, EPOLL_CTL_DEL
, e
->watchdog_fd
, NULL
);
4311 e
->watchdog_fd
= safe_close(e
->watchdog_fd
);
/* Closes the timer fd again; presumably the failure path of the set-up sequence — TODO confirm. */
4319 e
->watchdog_fd
= safe_close(e
->watchdog_fd
);
/* Watchdog-state getter for an event loop.
 *
 * Preconditions checked with assert_return(): e non-NULL (-EINVAL), event_resolve(e) non-NULL
 * (-ENOPKG), event_pid_changed(e) false (-ECHILD).
 * NOTE(review): only the precondition checks are visible here; the returned value (presumably
 * e->watchdog) must be confirmed against the full file. */
4323 _public_
int sd_event_get_watchdog(sd_event
*e
) {
4324 assert_return(e
, -EINVAL
);
4325 assert_return(e
= event_resolve(e
), -ENOPKG
);
4326 assert_return(!event_pid_changed(e
), -ECHILD
);
4331 _public_
int sd_event_get_iteration(sd_event
*e
, uint64_t *ret
) {
4332 assert_return(e
, -EINVAL
);
4333 assert_return(e
= event_resolve(e
), -ENOPKG
);
4334 assert_return(!event_pid_changed(e
), -ECHILD
);
4336 *ret
= e
->iteration
;
/* Stores callback in s->destroy_callback, replacing whatever was set before. Passing a NULL callback
 * is not rejected here.
 *
 * Precondition checked with assert_return(): s non-NULL (-EINVAL). */
4340 _public_
int sd_event_source_set_destroy_callback(sd_event_source
*s
, sd_event_destroy_t callback
) {
4341 assert_return(s
, -EINVAL
);
4343 s
->destroy_callback
= callback
;
/* Reads back the destroy callback stored on an event source.
 *
 * Copies s->destroy_callback to *ret and returns 1 if a callback is set, 0 if it is NULL (the
 * pointer is collapsed to 0/1 with !!).
 *
 * Precondition checked with assert_return(): s non-NULL (-EINVAL).
 * NOTE(review): whether the store to *ret is guarded by a NULL check on ret is not visible here —
 * confirm against the full file. */
4347 _public_
int sd_event_source_get_destroy_callback(sd_event_source
*s
, sd_event_destroy_t
*ret
) {
4348 assert_return(s
, -EINVAL
);
4351 *ret
= s
->destroy_callback
;
4353 return !!s
->destroy_callback
;
/* Getter for an event source's "floating" state.
 *
 * Precondition checked with assert_return(): s non-NULL (-EINVAL).
 * NOTE(review): only the precondition check is visible here; the returned value (presumably
 * s->floating) must be confirmed against the full file. */
4356 _public_
int sd_event_source_get_floating(sd_event_source
*s
) {
4357 assert_return(s
, -EINVAL
);
/* Changes an event source's "floating" state to b.
 *
 * Precondition checked with assert_return(): s non-NULL (-EINVAL).
 *
 * Visible behaviour:
 *  - early-out if s->floating already equals !!b;
 *  - a separate branch for sources that no longer have an event loop (s->event is NULL);
 *  - two mirrored reference hand-overs: "take a source ref, drop an event ref" and "take an event
 *    ref, drop a source ref". In both, the new reference is acquired before the old one is released.
 * NOTE(review): the update of s->floating, the condition choosing between the two hand-overs
 * (presumably b) and all return values are not visible here — confirm against the full file. */
4362 _public_
int sd_event_source_set_floating(sd_event_source
*s
, int b
) {
4363 assert_return(s
, -EINVAL
);
/* Requested state (normalized to 0/1) already in effect. */
4365 if (s
->floating
== !!b
)
4368 if (!s
->event
) /* Already disconnected */
/* First hand-over: pin the source, then release a reference on its event loop. */
4374 sd_event_source_ref(s
);
4375 sd_event_unref(s
->event
);
/* Second hand-over: pin the event loop, then release a reference on the source. */
4377 sd_event_ref(s
->event
);
4378 sd_event_source_unref(s
);
/* Returns the exit_on_failure setting stored on the event source.
 *
 * Preconditions checked with assert_return(): s non-NULL (-EINVAL), and the source is not of type
 * SOURCE_EXIT (-EDOM). */
4384 _public_
int sd_event_source_get_exit_on_failure(sd_event_source
*s
) {
4385 assert_return(s
, -EINVAL
);
4386 assert_return(s
->type
!= SOURCE_EXIT
, -EDOM
);
4388 return s
->exit_on_failure
;
/* Sets the exit_on_failure setting of an event source to b.
 *
 * Preconditions checked with assert_return(): s non-NULL (-EINVAL), and the source is not of type
 * SOURCE_EXIT (-EDOM).
 * NOTE(review): the return values of the early-out and of the normal path are not visible here. */
4391 _public_
int sd_event_source_set_exit_on_failure(sd_event_source
*s
, int b
) {
4392 assert_return(s
, -EINVAL
);
4393 assert_return(s
->type
!= SOURCE_EXIT
, -EDOM
);
/* Requested state (normalized to 0/1) already in effect. */
4395 if (s
->exit_on_failure
== !!b
)
/* NOTE(review): b is stored as passed while the comparison above uses !!b; this is only equivalent
 * if the field is bool-typed — confirm against the struct definition. */
4398 s
->exit_on_failure
= b
;
/* Configures rate limiting on an event source: leaves any active rate-limited state first, then
 * overwrites s->rate_limit with a fresh RateLimit built from interval and burst.
 *
 * Preconditions checked with assert_return(): s non-NULL (-EINVAL), and the source type is accepted
 * by EVENT_SOURCE_CAN_RATE_LIMIT() (-EDOM).
 * NOTE(review): the handling of event_source_leave_ratelimit()'s result and the final return value
 * are not visible here. */
4402 _public_
int sd_event_source_set_ratelimit(sd_event_source
*s
, uint64_t interval
, unsigned burst
) {
4405 assert_return(s
, -EINVAL
);
4407 /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
4408 * so is a programming error. */
4409 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s
->type
), -EDOM
);
4411 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
4412 * non-ratelimited. */
4413 r
= event_source_leave_ratelimit(s
);
/* Positional initializer: interval first, burst second; any remaining RateLimit fields are
 * zero-initialized by the compound literal. */
4417 s
->rate_limit
= (RateLimit
) { interval
, burst
};
/* Reads back the rate limit configured on an event source: s->rate_limit.interval goes to
 * *ret_interval and s->rate_limit.burst to *ret_burst.
 *
 * Precondition checked with assert_return(): s non-NULL (-EINVAL).
 *
 * Source types rejected by EVENT_SOURCE_CAN_RATE_LIMIT() and sources without a configured rate limit
 * (per ratelimit_configured()) take separate branches before anything is copied out.
 * NOTE(review): the return values of those branches, and whether the two stores are guarded by NULL
 * checks on the output pointers, are not visible here — confirm against the full file. */
4421 _public_
int sd_event_source_get_ratelimit(sd_event_source
*s
, uint64_t *ret_interval
, unsigned *ret_burst
) {
4422 assert_return(s
, -EINVAL
);
4424 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
4425 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error */
4426 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s
->type
))
4429 if (!ratelimit_configured(&s
->rate_limit
))
4433 *ret_interval
= s
->rate_limit
.interval
;
4435 *ret_burst
= s
->rate_limit
.burst
;
/* Reports whether an event source is currently in the rate-limited state (s->ratelimited).
 *
 * Precondition checked with assert_return(): s non-NULL (-EINVAL).
 *
 * Source types rejected by EVENT_SOURCE_CAN_RATE_LIMIT() and sources without a configured rate limit
 * (per ratelimit_configured()) are handled by separate branches before the flag is returned; as in
 * the getter for the rate-limit parameters, plain ifs are used rather than assert_return().
 * NOTE(review): the return values of those two branches are not visible here — confirm against the
 * full file. */
4440 _public_
int sd_event_source_is_ratelimited(sd_event_source
*s
) {
4441 assert_return(s
, -EINVAL
);
4443 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s
->type
))
4446 if (!ratelimit_configured(&s
->rate_limit
))
4449 return s
->ratelimited
;